-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstreamlit_app.py
109 lines (88 loc) · 4.16 KB
/
streamlit_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import re
import io
# Function to normalize text
def normalize(text):
text = str(text).lower()
text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
text = ' '.join(text.split()) # Remove extra whitespace
return text
# Streamlit app
def main():
st.title("Product Matching App")
# File upload
uploaded_file1 = st.file_uploader("Choose the first Excel file", type="xlsx")
uploaded_file2 = st.file_uploader("Choose the second Excel file", type="xlsx")
if uploaded_file1 and uploaded_file2:
df1 = pd.read_excel(uploaded_file1)
df2 = pd.read_excel(uploaded_file2)
st.write("DF1 Columns:", df1.columns.tolist())
st.write("DF2 Columns:", df2.columns.tolist())
# Preprocess product names
df1['product_name_normalized'] = df1['product_name'].apply(normalize)
df2['product_name_normalized'] = df2['product_name'].apply(normalize)
# Compute embeddings for product names
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings1 = model.encode(df1['product_name_normalized'].tolist(), convert_to_tensor=False)
embeddings2 = model.encode(df2['product_name_normalized'].tolist(), convert_to_tensor=False)
# Fit NearestNeighbors model on embeddings from df2
nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric='cosine').fit(embeddings2)
# Find nearest neighbors in df2 for entries in df1
distances, indices = nbrs.kneighbors(embeddings1)
# Set a similarity threshold
threshold = st.slider("Set similarity threshold", 0.0, 1.0, 0.2)
matches = []
unmatched_indices = []
for idx, (distance_array, index_array) in enumerate(zip(distances, indices)):
index = index_array[0]
distance = distance_array[0]
similarity = 1 - distance # Cosine similarity
if distance <= threshold:
matches.append({
'product_name_1': df1.iloc[idx]['product_name'],
'product_price_1': df1.iloc[idx]['product_price'],
'product_name_2': df2.iloc[index]['product_name'],
'product_price_2': df2.iloc[index]['product_price'],
'similarity': similarity
})
else:
unmatched_indices.append(idx)
# Compile matched and unmatched products
matched_df = pd.DataFrame(matches)
unmatched_df = df1.iloc[unmatched_indices]
st.write("Matched Products:")
st.dataframe(matched_df)
st.write("Unmatched Products:")
st.dataframe(unmatched_df)
# Ensure product prices are numeric
matched_df['product_price_1'] = pd.to_numeric(matched_df['product_price_1'], errors='coerce')
matched_df['product_price_2'] = pd.to_numeric(matched_df['product_price_2'], errors='coerce')
# Price comparison
matched_df['price_difference'] = matched_df['product_price_1'] - matched_df['product_price_2']
# Convert DataFrames to Excel files in memory
matched_buffer = io.BytesIO()
unmatched_buffer = io.BytesIO()
# Write DataFrames to the in-memory buffers
with pd.ExcelWriter(matched_buffer, engine='xlsxwriter') as writer:
matched_df.to_excel(writer, index=False)
with pd.ExcelWriter(unmatched_buffer, engine='xlsxwriter') as writer:
unmatched_df.to_excel(writer, index=False)
# Download buttons
st.download_button(
label="Download Matched Products",
data=matched_buffer.getvalue(),
file_name='matched_products.xlsx',
mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
)
st.download_button(
label="Download Unmatched Products",
data=unmatched_buffer.getvalue(),
file_name='unmatched_products.xlsx',
mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
)
if __name__ == "__main__":
main()