Update product_recommender.py
product_recommender.py CHANGED (+75 -73)
@@ -2,9 +2,9 @@ from typing import Dict, List
 import aiohttp
 import asyncio
 import re
+import torch
+from sentence_transformers import SentenceTransformer, util
 from bs4 import BeautifulSoup
-from sentence_transformers import SentenceTransformer
-import numpy as np
 
 class DynamicRecommender:
     def __init__(self):
@@ -15,17 +15,41 @@ class DynamicRecommender:
                 'Chrome/100.0.4896.75 Safari/537.36'
             )
         }
-        # Load your model
+        # Load your model
         self.model = SentenceTransformer('all-mpnet-base-v2')
 
+        # Pre-define some candidate categories you might want to search for.
+        # Adjust these to suit your domain. The more you add, the more "general"
+        # your coverage becomes. They can be as broad or as niche as you like.
+        self.candidate_categories = [
+            "tech gadgets",
+            "programming books",
+            "self help books",
+            "business books",
+            "leadership novels",
+            "fashion accessories",
+            "beauty products",
+            "board games",
+            "music instruments",
+            "cooking utensils",
+            "cookbooks",
+            "art and painting supplies",
+            "home decor",
+            "pet supplies",
+            "novels",
+            "gaming consoles",
+            "smartphones",
+            "camera gear",
+            "toys",
+            "gift hamper"
+        ]
+        # Pre-encode those categories for faster scoring.
+        self.category_embeddings = self.model.encode(self.candidate_categories, convert_to_tensor=True)
+
     # ------------------------------------------------------------------
     # Amazon search
     # ------------------------------------------------------------------
     async def search_amazon(self, query: str) -> List[Dict]:
-        """
-        Search Amazon for products by building the search URL
-        and parsing the resulting HTML.
-        """
         print(f"Searching Amazon for: {query}")
         search_url = f"https://www.amazon.in/s?k={query}"
         async with aiohttp.ClientSession() as session:
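A note on the hunk above: the candidate list is embedded once in `__init__` with `convert_to_tensor=True`, so each later request only pays to embed its own text before scoring. A minimal standalone sketch of that trade-off (the three category strings here are placeholders, not the committed list):

    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer('all-mpnet-base-v2')
    categories = ["programming books", "board games", "pet supplies"]  # placeholder list

    # One-time cost: embed the fixed category list up front.
    category_embeddings = model.encode(categories, convert_to_tensor=True)

    # Per-request cost: embed only the incoming text, then score it.
    user_emb = model.encode("gift for a coder who loves puzzles", convert_to_tensor=True)
    scores = util.cos_sim(user_emb, category_embeddings)[0]
    print(sorted(zip(scores.tolist(), categories), reverse=True))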
@@ -39,7 +63,7 @@ class DynamicRecommender:
         soup = BeautifulSoup(html, 'html.parser')
         products = []
 
-        # These selectors may need updating if Amazon changes
+        # These selectors may need updating if Amazon changes HTML
         search_items = soup.select('.s-result-item')
 
         for item in search_items:
@@ -47,32 +71,27 @@ class DynamicRecommender:
                 name_elem = item.select_one('.a-text-normal')
                 price_elem = item.select_one('.a-price-whole')
                 link_elem = item.select_one('a.a-link-normal')
-
                 if name_elem and price_elem and link_elem:
                     product_name = name_elem.get_text(strip=True)
                     product_price = price_elem.get_text(strip=True)
                     product_url = link_elem.get('href')
-
+
                     products.append({
                         'name': product_name,
                         'price': product_price,
                         'source': 'Amazon',
                         'url': 'https://www.amazon.in' + product_url,
-                        'description':
+                        'description': f"This item is from Amazon related to '{product_name}'."
                     })
             except Exception:
                 continue
 
-        print(f"Found {len(products)} Amazon products.")
         return products[:5]
 
     # ------------------------------------------------------------------
     # Flipkart search
     # ------------------------------------------------------------------
     async def search_flipkart(self, query: str) -> List[Dict]:
-        """
-        Search Flipkart for products.
-        """
         print(f"Searching Flipkart for: {query}")
         search_url = f"https://www.flipkart.com/search?q={query}"
         async with aiohttp.ClientSession() as session:
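The CSS selectors are the fragile part of these scrapers. Here is a self-contained sketch that exercises the same extraction logic against canned HTML, so the parsing can be tested offline; the markup is invented to match the selectors and will not mirror Amazon's real pages:

    from bs4 import BeautifulSoup

    html = """
    <div class="s-result-item">
      <a class="a-link-normal" href="/dp/B000000000">
        <span class="a-text-normal">Example Product</span>
      </a>
      <span class="a-price-whole">499</span>
    </div>
    """

    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.select('.s-result-item'):
        name_elem = item.select_one('.a-text-normal')
        price_elem = item.select_one('.a-price-whole')
        link_elem = item.select_one('a.a-link-normal')
        if name_elem and price_elem and link_elem:
            print(name_elem.get_text(strip=True),
                  price_elem.get_text(strip=True),
                  link_elem.get('href'))
    # -> Example Product 499 /dp/B000000000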
@@ -86,7 +105,7 @@ class DynamicRecommender:
         soup = BeautifulSoup(html, 'html.parser')
         products = []
 
-        # These selectors may need updating if Flipkart changes
+        # These selectors may need updating if Flipkart changes HTML
         item_cards = soup.select('._1AtVbE')
 
         for item in item_cards:
@@ -94,33 +113,27 @@ class DynamicRecommender:
                 name_elem = item.select_one('._4rR01T')
                 price_elem = item.select_one('._30jeq3')
                 link_elem = item.select_one('a')
-
                 if name_elem and price_elem and link_elem:
                     product_name = name_elem.get_text(strip=True)
                     product_price = price_elem.get_text(strip=True)
                     product_url = link_elem.get('href')
-
+
                     products.append({
                         'name': product_name,
                         'price': product_price,
                         'source': 'Flipkart',
                         'url': 'https://www.flipkart.com' + product_url,
-                        'description':
+                        'description': f"This item is from Flipkart related to '{product_name}'."
                     })
             except Exception:
                 continue
 
-        print(f"Found {len(products)} Flipkart products.")
         return products[:5]
 
     # ------------------------------------------------------------------
-    # IGP search
+    # IGP search
     # ------------------------------------------------------------------
     async def search_igp(self, query: str) -> List[Dict]:
-        """
-        Search IGP for products (gift store).
-        Adjust the selectors or approach as needed.
-        """
         print(f"Searching IGP for: {query}")
         search_url = f"https://www.igp.com/search/{query}"
         async with aiohttp.ClientSession() as session:
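One caveat that applies to all three search URLs: the queries built by `_extract_keywords` contain spaces, and they are interpolated into the URL unescaped. aiohttp's URL handling will usually percent-encode them, but quoting explicitly is safer; a small sketch:

    from urllib.parse import quote_plus

    query = "business books for 25 year old"
    search_url = f"https://www.amazon.in/s?k={quote_plus(query)}"
    print(search_url)  # https://www.amazon.in/s?k=business+books+for+25+year+old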
@@ -134,8 +147,7 @@ class DynamicRecommender:
         soup = BeautifulSoup(html, 'html.parser')
         products = []
 
-        #
-        # This is just an *example*; may not match actual IGP HTML
+        # Likely need to update based on actual IGP HTML
         item_cards = soup.select('.product-item')
 
         for item in item_cards:
@@ -143,86 +155,80 @@ class DynamicRecommender:
                 name_elem = item.select_one('.product-title')
                 price_elem = item.select_one('.product-price')
                 link_elem = item.select_one('a')
-
                 if name_elem and price_elem and link_elem:
                     product_name = name_elem.get_text(strip=True)
                     product_price = price_elem.get_text(strip=True)
                     product_url = link_elem.get('href')
-
+
                     products.append({
                         'name': product_name,
                         'price': product_price,
                         'source': 'IGP',
                         'url': 'https://www.igp.com' + product_url,
-                        'description':
+                        'description': f"This item is from IGP related to '{product_name}'."
                     })
             except Exception:
                 continue
 
-        print(f"Found {len(products)} IGP products.")
         return products[:5]
 
     # ------------------------------------------------------------------
-    # Extract
+    # Extract categories from user text using embeddings
     # ------------------------------------------------------------------
     def _extract_keywords(self, text: str) -> List[str]:
         """
+        1. Parse out age if present
+        2. Use embeddings to find top 2-3 matching categories
+           from self.candidate_categories.
+        3. Combine them with the age if found.
         """
-        # Try to find age
-        age_match = re.search(r'age\s+(\d+)', text_lower)
+        # 1) Check for age with a regex
+        age_match = re.search(r'age\s+(\d+)', text.lower())
         age = age_match.group(1) if age_match else None
 
-        #
-        if 'novel' in text_lower or 'leader' in text_lower or 'leadership' in text_lower:
-            interests.append('leadership novels')
-        if 'successful' in text_lower:
-            interests.extend(['self help books', 'business books'])
-
-        # You can decide how exactly you want to incorporate age
-        interests = [f"{interest} for {age} year old" for interest in interests]
-
-        print("Extracted keywords:", interests)
-        return interests
+        # 2) Use the entire user text as an embedding
+        user_emb = self.model.encode(text, convert_to_tensor=True)
+
+        # Compute similarity with each candidate category
+        sims = util.cos_sim(user_emb, self.category_embeddings)[0]  # shape: [num_categories]
+        # Grab top 3 indices
+        top_k = min(3, len(self.candidate_categories))
+        top_results = torch.topk(sims, k=top_k)
+
+        best_categories = []
+        for idx in top_results.indices:
+            cat_text = self.candidate_categories[idx]
+            if age:
+                cat_text = f"{cat_text} for {age} year old"
+            best_categories.append(cat_text)
+
+        print("Embedding-based categories:", best_categories)
+        return best_categories
 
     # ------------------------------------------------------------------
     # Main recommendations
     # ------------------------------------------------------------------
     async def get_recommendations(self, text: str) -> List[Dict]:
         """
+        Search across Amazon, Flipkart, and IGP based on the top category matches.
         """
         try:
-            #
+            # 1) Figure out best categories (queries) from user text
+            queries = self._extract_keywords(text)
 
-            #
+            # 2) Search each site for each query
             all_products = []
-            for
+            for query in queries:
+                # For each query, hit Amazon, Flipkart, IGP
+                amazon_products = await self.search_amazon(query)
+                flipkart_products = await self.search_flipkart(query)
+                igp_products = await self.search_igp(query)
 
                 all_products.extend(amazon_products)
                 all_products.extend(flipkart_products)
                 all_products.extend(igp_products)
 
-            #
+            # 3) De-duplicate by product name
             seen = set()
             unique_products = []
             for product in all_products:
@@ -230,12 +236,8 @@ class DynamicRecommender:
                     seen.add(product['name'])
                     unique_products.append(product)
 
-            #
-            final_results = unique_products[:5]
-
-            print(f"Returning {len(final_results)} products.")
-            return final_results
+            # 4) Optionally slice or sort further
+            return unique_products[:5]
 
         except Exception as e:
             print(f"Error in recommendations: {str(e)}")
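Finally, note that the committed loop awaits the three per-site searches one at a time. A hypothetical driver (invented names, not part of this commit) could run them concurrently with `asyncio.gather`, assuming product_recommender.py is importable:

    import asyncio
    from product_recommender import DynamicRecommender

    async def main():
        rec = DynamicRecommender()

        # The public entry point, as committed (sequential awaits per query):
        print(await rec.get_recommendations("gift for age 25, wants to be successful"))

        # A possible follow-up: overlap the three per-site searches per query.
        for query in rec._extract_keywords("gift for age 25, wants to be successful"):
            amazon, flipkart, igp = await asyncio.gather(
                rec.search_amazon(query),
                rec.search_flipkart(query),
                rec.search_igp(query),
            )
            print(query, len(amazon), len(flipkart), len(igp))

    asyncio.run(main())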