RSHVR committed
Commit 7ef24bf (verified) · Parent: a260438

Create app.py

Files changed (1)
  1. app.py +404 -0
app.py ADDED
@@ -0,0 +1,404 @@
import gradio as gr
import uuid
import re
import os
import logging
from typing import Dict, Any, Optional, List, Tuple
from dataclasses import dataclass, field

import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

@dataclass
class ImageInfo:
    """Class for storing image information"""
    url: str
    alt: str = ""
    type: str = "unknown"
    path: Optional[str] = None
    id: Optional[str] = None


@dataclass
class ExtractionResult:
    """Class for storing the results of a webpage extraction"""
    request_id: str
    images: Dict[str, ImageInfo] = field(default_factory=dict)
    measurements: Dict[str, str] = field(default_factory=dict)
    materials: Dict[str, str] = field(default_factory=dict)
    output_dir: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert the extraction result to a dictionary"""
        images_dict = {
            img_id: {
                "id": img_id,
                "url": img_info.url,
                "alt": img_info.alt,
                "type": img_info.type,
                "path": img_info.path
            } for img_id, img_info in self.images.items()
        }

        return {
            "request_id": self.request_id,
            "images": images_dict,
            "measurements": self.measurements,
            "materials": self.materials,
            "output_dir": self.output_dir
        }

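# Illustrative sketch (not part of the original commit): how these dataclasses are
# intended to be filled and serialized. The request ID and URL below are made-up
# placeholders.
#
#   result = ExtractionResult(request_id="abc")
#   result.images["abc-main"] = ImageInfo(
#       id="abc-main", url="https://example.com/product.jpg", alt="Chair", type="main"
#   )
#   result.to_dict()["images"]["abc-main"]["type"]   # -> "main"
#   result.to_dict()["measurements"]                 # -> {}
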
class SrcsetParser:
    """Helper class for parsing srcset attributes from HTML img tags"""

    @staticmethod
    def parse_srcset(srcset: str) -> List[Dict[str, Any]]:
        """
        Parse a srcset attribute into a structured list of image URLs and descriptors.

        Args:
            srcset: The srcset attribute from an img tag

        Returns:
            List of dictionaries containing parsed srcset components
        """
        if not srcset:
            return []

        results = []
        srcset_parts = [part.strip() for part in srcset.split(',')]

        for part in srcset_parts:
            parts = part.split()
            if len(parts) < 2:
                continue

            url = parts[0]
            descriptor = parts[1]

            try:
                width = int(re.search(r'\d+', descriptor).group(0)) if re.search(r'\d+', descriptor) else 0
                results.append({"url": url, "descriptor": descriptor, "width": width})
            except (AttributeError, ValueError):
                continue

        return results

    @classmethod
    def extract_f_xl_image(cls, srcset: str) -> Optional[str]:
        """
        Extract specifically the image URL with f=xl 900w from a srcset attribute.

        Args:
            srcset: The srcset attribute from an img tag

        Returns:
            The URL with f=xl 900w descriptor or None if not found
        """
        if not srcset:
            return None

        srcset_entries = cls.parse_srcset(srcset)

        # First, look for f=xl with 900w
        for entry in srcset_entries:
            if "f=xl" in entry["url"] and entry["descriptor"] == "900w":
                return entry["url"]

        # If not found, try any 900w image
        for entry in srcset_entries:
            if entry["descriptor"] == "900w":
                return entry["url"]

        # Finally, fall back to highest resolution
        if srcset_entries:
            srcset_entries.sort(key=lambda x: x["width"], reverse=True)
            return srcset_entries[0]["url"]

        return None

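# Illustrative sketch (not part of the original commit): expected behaviour of the
# parser on a made-up srcset string.
#
#   srcset = "img.jpg?f=s 300w, img.jpg?f=xl 900w"
#   SrcsetParser.parse_srcset(srcset)
#   # -> [{"url": "img.jpg?f=s", "descriptor": "300w", "width": 300},
#   #     {"url": "img.jpg?f=xl", "descriptor": "900w", "width": 900}]
#   SrcsetParser.extract_f_xl_image(srcset)
#   # -> "img.jpg?f=xl"  (the f=xl 900w candidate wins)
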
class WebPageFetcher:
    """Helper class for fetching web pages"""

    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    @classmethod
    def fetch_page(cls, url: str) -> Tuple[str, BeautifulSoup]:
        """
        Fetch a web page and return its content as text and parsed BeautifulSoup.

        Args:
            url: The URL to fetch

        Returns:
            Tuple containing (raw_html, parsed_soup)

        Raises:
            requests.exceptions.RequestException: If the request fails
        """
        logger.info(f"Fetching page: {url}")
        response = requests.get(url, headers=cls.DEFAULT_HEADERS, timeout=30)
        response.raise_for_status()
        html = response.text

        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        return html, soup

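# Illustrative sketch (not part of the original commit): typical use of the fetcher.
# Requires network access; the URL is a placeholder.
#
#   html, soup = WebPageFetcher.fetch_page("https://www.ikea.com/ca/en/p/...")
#   soup.select_one('div[data-type="MAIN_PRODUCT_IMAGE"] img.pip-image')
#   # -> the main product img element, if the page uses IKEA's pip-image markup
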
class ProductExtractor:
    """Main class for extracting product information"""

    def __init__(self):
        self.srcset_parser = SrcsetParser()

    def extract_images_from_url(self, url: str) -> ExtractionResult:
        """
        Extract images with preference for f=xl 900w versions from a URL.

        Args:
            url: The URL to extract images from

        Returns:
            ExtractionResult object with extracted image information

        Raises:
            requests.exceptions.RequestException: If the request fails
            ValueError: If the HTML cannot be parsed correctly
        """
        try:
            logger.info(f"Extracting images from: {url}")

            # Fetch the HTML content
            _, soup = WebPageFetcher.fetch_page(url)

            # Generate a UUID for this request
            request_uuid = str(uuid.uuid4())
            logger.info(f"Generated request ID: {request_uuid}")

            # Initialize result
            result = ExtractionResult(request_id=request_uuid)

            # Extract images
            self._extract_main_product_image(soup, result, request_uuid)
            self._extract_measurement_image(soup, result, request_uuid)

            # If no specific images found, try general approach
            if not result.images:
                self._extract_images_general_approach(soup, result, request_uuid)

            # Extract measurements
            self._extract_measurements(soup, result)

            # Extract materials (IKEA often has materials in specifications)
            self._extract_materials(soup, result)

            logger.info(f"Total images found: {len(result.images)}")
            logger.info(f"Measurements extracted: {result.measurements}")
            return result

        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching URL: {e}")
            raise
        except Exception as e:
            logger.error(f"Error extracting images: {e}")
            raise

    def _extract_main_product_image(self, soup: BeautifulSoup, result: ExtractionResult, request_uuid: str) -> None:
        """Extract the main product image"""
        main_image_element = soup.select_one('div[data-type="MAIN_PRODUCT_IMAGE"] img.pip-image')
        if main_image_element and main_image_element.get('srcset'):
            srcset = main_image_element.get('srcset')
            target_url = self.srcset_parser.extract_f_xl_image(srcset)
            if target_url:
                logger.info(f"Found main product image: {target_url}")
                image_id = f"{request_uuid}-main"
                result.images[image_id] = ImageInfo(
                    id=image_id,
                    url=target_url,
                    alt=main_image_element.get('alt', ''),
                    type="main"
                )

    def _extract_measurement_image(self, soup: BeautifulSoup, result: ExtractionResult, request_uuid: str) -> None:
        """Extract the measurement illustration image"""
        measurement_image_element = soup.select_one('div[data-type="MEASUREMENT_ILLUSTRATION"] img.pip-image')
        if measurement_image_element and measurement_image_element.get('srcset'):
            srcset = measurement_image_element.get('srcset')
            target_url = self.srcset_parser.extract_f_xl_image(srcset)
            if target_url:
                logger.info(f"Found measurement image: {target_url}")
                image_id = f"{request_uuid}-measurement"
                result.images[image_id] = ImageInfo(
                    id=image_id,
                    url=target_url,
                    alt=measurement_image_element.get('alt', ''),
                    type="measurement"
                )

    def _extract_images_general_approach(self, soup: BeautifulSoup, result: ExtractionResult, request_uuid: str) -> None:
        """Extract images using a more general approach"""
        logger.info("No specific images found, trying general approach...")
        for i, img in enumerate(soup.select('img[srcset]')):
            srcset = img.get('srcset')
            target_url = self.srcset_parser.extract_f_xl_image(srcset)
            if target_url:
                img_type = self._determine_image_type(img)
                logger.info(f"Found {img_type} image: {target_url}")
                image_id = f"{request_uuid}-{img_type}-{i}"
                result.images[image_id] = ImageInfo(
                    id=image_id,
                    url=target_url,
                    alt=img.get('alt', ''),
                    type=img_type
                )

    def _determine_image_type(self, img_element: BeautifulSoup) -> str:
        """Determine the type of image based on its context"""
        parent_html = str(img_element.parent.parent)
        if "MAIN_PRODUCT_IMAGE" in parent_html or "main" in parent_html.lower():
            return "main"
        elif "MEASUREMENT" in parent_html or "measurement" in parent_html.lower():
            return "measurement"
        return "unknown"

    def _extract_measurements(self, soup: BeautifulSoup, result: ExtractionResult) -> None:
        """Extract product measurements"""
        dimensions_ul = soup.select_one('ul.pip-product-dimensions__dimensions-container')
        if dimensions_ul:
            for li in dimensions_ul.select('li.pip-product-dimensions__measurement-wrapper'):
                label_span = li.select_one('span.pip-product-dimensions__measurement-name')
                if label_span:
                    label = label_span.get_text(strip=True).replace(":", "")
                    full_text = li.get_text(strip=True)
                    value = full_text.replace(label_span.get_text(), '').strip()
                    result.measurements[label.lower()] = value

    def _extract_materials(self, soup: BeautifulSoup, result: ExtractionResult) -> None:
        """Extract product materials information"""
        # Look for materials in product details
        materials_section = soup.select_one('div.pip-product-details__container')
        if materials_section:
            material_headers = materials_section.select('h3, h4')
            for header in material_headers:
                if 'material' in header.get_text().lower():
                    # Get the next paragraphs after this header
                    materials_content = []
                    next_element = header.next_sibling
                    while next_element and not (next_element.name == 'h3' or next_element.name == 'h4'):
                        if next_element.name == 'p':
                            materials_content.append(next_element.get_text(strip=True))
                        next_element = next_element.next_sibling

                    if materials_content:
                        result.materials['materials'] = ' '.join(materials_content)


# Create a singleton instance
extractor = ProductExtractor()

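# Illustrative sketch (not part of the original commit): calling the extractor
# directly. Requires network access; the URL and values are placeholders.
#
#   res = extractor.extract_images_from_url("https://www.ikea.com/ca/en/p/...")
#   res.to_dict()["measurements"]               # e.g. {"width": "75 cm", "height": "102 cm"}
#   [img.url for img in res.images.values()]    # preferred f=xl 900w image URLs
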
def get_product_data_from_url(url):
    """
    Retrieve product data (images, measurements, materials) from a URL directly.

    Args:
        url: Product URL to extract data from

    Returns:
        Tuple of (image_list, measurements_str, materials_str)
    """
    try:
        # Extract data directly instead of using an API
        extraction_result = extractor.extract_images_from_url(url)
        data = extraction_result.to_dict()

        # Extract images
        images = [img["url"] for img in data.get("images", {}).values()]

        # Format measurements into markdown
        measurements = data.get("measurements", {})
        if measurements:
            measurements_str = "\n".join([f"- **{k.title()}**: {v}" for k, v in measurements.items()])
        else:
            measurements_str = "No measurements found."

        # Format materials into markdown
        materials = data.get("materials", {})
        if materials:
            materials_str = "\n".join([f"- **{k.title()}**: {v}" for k, v in materials.items()])
        else:
            materials_str = "No materials information found."

        return images, measurements_str, materials_str

    except Exception as e:
        error_message = f"Error: {str(e)}"
        return [], error_message, error_message

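# Illustrative sketch (not part of the original commit): shape of the values this
# helper returns to the Gradio components (placeholder data).
#
#   images, measurements_md, materials_md = get_product_data_from_url(
#       "https://www.ikea.com/ca/en/p/..."
#   )
#   # images          -> ["https://.../image.jpg?f=xl", ...]          (fed to gr.Gallery)
#   # measurements_md -> "- **Width**: 75 cm\n- **Height**: 102 cm"   (Markdown)
#   # materials_md    -> "- **Materials**: ..."                       (Markdown)
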
def create_interface():
    """Create and configure the Gradio interface"""
    with gr.Blocks(title="IKEA Product Image + Measurement Extractor") as demo:
        gr.Markdown("## IKEA Product Image + Measurement Extractor")
        gr.Markdown("Enter an IKEA product URL to extract images, measurements, and materials information.")

        with gr.Row():
            with gr.Column(scale=1):
                # Input section
                url_input = gr.Textbox(
                    label="Product URL",
                    placeholder="https://www.ikea.com/product/...",
                    info="Paste IKEA product URL here"
                )
                submit_btn = gr.Button("Extract Product Data", variant="primary")

                # Results section - Measurements and Materials
                with gr.Accordion("Product Information", open=True):
                    measurements_display = gr.Markdown(label="Measurements")
                    materials_display = gr.Markdown(label="Materials")

            with gr.Column(scale=2):
                # Gallery component for displaying images
                image_gallery = gr.Gallery(
                    label="Product Images",
                    show_label=True,
                    columns=2,
                    height=500,
                    object_fit="contain"
                )

        # Add example URLs
        gr.Examples(
            examples=[
                ["https://www.ikea.com/ca/en/p/hammaroen-pergola-gray-beige-dark-gray-beige-20549239/"],
                ["https://www.ikea.com/ca/en/p/fniss-trash-can-white-40295439/"],
                ["https://www.ikea.com/ca/en/p/klimpfjaell-norraryd-table-and-6-chairs-gray-brown-black-s99418424/"]
            ],
            inputs=url_input
        )

        # Set up the click event
        submit_btn.click(
            fn=get_product_data_from_url,
            inputs=url_input,
            outputs=[image_gallery, measurements_display, materials_display]
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
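# Illustrative note (not part of the original commit): the default demo.launch()
# is sufficient for a hosted Space; for local testing, standard Gradio launch
# options such as demo.launch(share=True) or demo.launch(server_port=7860) can be used.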