RSHVR committed on
Commit
8474ed7
·
verified ·
1 Parent(s): 6f555e5

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +91 -0
  2. custom-api.py +209 -0
  3. image_extractor.py +390 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+
4
+ API_URL = "http://localhost:8000/extract"
5
+
6
def get_product_data_from_url(url):
    """
    Retrieve product data (images, measurements, materials) from the API.

    Args:
        url: Product URL to extract data from

    Returns:
        Tuple of (image_list, measurements_str, materials_str). On any
        failure the image list is empty and both strings carry the error
        message, so the UI always has something to render.
    """

    def _as_markdown(section, empty_message):
        # Render a {label: value} mapping as a markdown bullet list.
        if not section:
            return empty_message
        return "\n".join(f"- **{k.title()}**: {v}" for k, v in section.items())

    try:
        payload = {
            "url": url,
            "download_images": False,
        }

        # Bounded wait: without a timeout a stalled API would block this
        # callback (and the button that triggered it) indefinitely.
        response = requests.post(API_URL, json=payload, timeout=60)
        response.raise_for_status()
        data = response.json()

        # `or {}` covers an explicit JSON null as well as a missing key.
        images = [img["url"] for img in (data.get("images") or {}).values()]

        measurements_str = _as_markdown(
            data.get("measurements"), "No measurements found."
        )
        materials_str = _as_markdown(
            data.get("materials"), "No materials information found."
        )

        return images, measurements_str, materials_str

    except Exception as e:
        # Top-level UI boundary: surface the problem in both text panels
        # instead of letting the callback raise.
        error_message = f"Error: {str(e)}"
        return [], error_message, error_message
48
+
49
def create_interface():
    """Build the Gradio Blocks UI and wire the extract button to the API call."""
    page_title = "IKEA Product Image + Measurement Extractor"

    with gr.Blocks(title=page_title) as demo:
        gr.Markdown(f"## {page_title}")
        gr.Markdown("Enter an IKEA product URL to extract images, measurements, and materials information.")

        with gr.Row():
            # Left column: URL entry, trigger button and the text results.
            with gr.Column(scale=1):
                url_input = gr.Textbox(
                    label="Product URL",
                    placeholder="https://www.ikea.com/product/...",
                    info="Paste IKEA product URL here",
                )
                submit_btn = gr.Button("Extract Product Data", variant="primary")

                with gr.Accordion("Product Information", open=True):
                    measurements_display = gr.Markdown(label="Measurements")
                    materials_display = gr.Markdown(label="Materials")

            # Right column (twice as wide): the image gallery.
            with gr.Column(scale=2):
                image_gallery = gr.Gallery(
                    label="Product Images",
                    show_label=True,
                    columns=2,
                    height=500,
                    object_fit="contain",
                )

        # Output order matches the tuple returned by the callback.
        result_components = [image_gallery, measurements_display, materials_display]
        submit_btn.click(
            fn=get_product_data_from_url,
            inputs=url_input,
            outputs=result_components,
        )

    return demo
88
+
89
if __name__ == "__main__":
    # Script entry point: build the UI and serve it without a public share link.
    demo = create_interface()
    demo.launch(share=False)
custom-api.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Image Extractor API
3
+
4
+ A FastAPI application for extracting high-resolution product images from web pages.
5
+ """
6
+
7
+ from fastapi import FastAPI, HTTPException, Depends, Request
8
+ from fastapi.responses import JSONResponse
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from fastapi.openapi.utils import get_openapi
11
+ from pydantic import BaseModel, HttpUrl, Field
12
+ import os
13
+ import uuid
14
+ from typing import Dict, Any, Optional, List, Union
15
+ import logging
16
+ import uvicorn
17
+ import time
18
+
19
+ # Import from our refactored image_extractor module
20
+ from image_extractor import extract_images_from_url, download_image, process_product_page
21
+
22
+ # Configure logging
23
# Root logging setup: timestamped records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
28
+
29
+ # Define API Models
30
class ExtractImageRequest(BaseModel):
    """Request model for image extraction"""
    # Product page to scrape.
    url: HttpUrl = Field(..., description="URL of the product page to extract images from")
    # True: save images to disk; False: only report their URLs.
    download_images: bool = Field(True, description="Whether to download the images or just return URLs")
    # Destination directory override, used only when downloading.
    custom_output_dir: Optional[str] = Field(None, description="Optional custom directory to save images to")

    class Config:
        # Example payload for the generated API docs.
        # NOTE(review): `schema_extra` is the Pydantic v1 key; v2 renamed it
        # to `json_schema_extra` -- confirm against the installed version.
        schema_extra = {
            "example": {
                "url": "https://www.ikea.com/us/en/p/poaeng-armchair-birch-veneer-knisa-light-beige-s49388439/",
                "download_images": True,
                "custom_output_dir": "my_images/poaeng"
            }
        }
44
+
45
class ImageInfo(BaseModel):
    """Image information model"""
    # Only `path` is optional; it stays None unless the image was downloaded.
    id: str = Field(..., description="Unique identifier for the image")
    url: str = Field(..., description="URL of the image")
    alt: str = Field(..., description="Alt text of the image")
    type: str = Field(..., description="Type of image (main, measurement, etc.)")
    path: Optional[str] = Field(None, description="Local path where image is saved (if downloaded)")
52
+
53
class ExtractImageResponse(BaseModel):
    """Response model for image extraction"""
    request_id: str = Field(..., description="Unique identifier for this request")
    # Keyed by image id; each value is a plain dict of that image's details.
    images: Dict[str, Dict[str, Any]] = Field(..., description="Dictionary of extracted images")
    # The remaining fields are optional and default to None.
    output_dir: Optional[str] = Field(None, description="Directory where images were saved (if downloaded)")
    measurements: Optional[Dict[str, str]] = Field(None, description="Product measurements extracted from the page")
    materials: Optional[Dict[str, str]] = Field(None, description="Product materials extracted from the page")
60
+
61
class ErrorResponse(BaseModel):
    """Error response model"""
    # Human-readable explanation of what went wrong.
    detail: str
64
+
65
+ # Create API application
66
# The application object; docs are served at /docs and /redoc, and every
# route advertises ErrorResponse as its 500 payload.
app = FastAPI(
    title="Image Extractor API",
    description="API for extracting high-resolution product images from web pages",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    openapi_url="/openapi.json",
    responses={
        500: {"model": ErrorResponse}
    }
)

# Add CORS middleware.
# Credentials are disabled because they cannot be combined with wildcard
# origins/methods/headers; the API uses no cookies or auth headers. Turn them
# back on only together with an explicit origin list.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
86
+
87
+ # Custom OpenAPI schema
88
def custom_openapi():
    """Return the OpenAPI schema, generating and caching it on first use."""
    if not app.openapi_schema:
        # Custom schema customizations can be applied to the generated
        # schema here, before it is cached on the app.
        app.openapi_schema = get_openapi(
            title=app.title,
            version=app.version,
            description=app.description,
            routes=app.routes,
        )
    return app.openapi_schema


# Make FastAPI use the caching generator above.
app.openapi = custom_openapi
105
+
106
+ # Middleware for request timing and logging
107
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """
    Log each request's method, path, status code and handling time.

    Args:
        request: The incoming HTTP request.
        call_next: Callable that passes the request on and returns the response.

    Returns:
        The response produced downstream, unmodified.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    response = await call_next(request)

    duration = time.perf_counter() - start_time

    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logger.info(
        "Request %s %s completed in %.3fs with status %s",
        request.method,
        request.url.path,
        duration,
        response.status_code,
    )

    return response
125
+
126
+ # API Routes
127
@app.get("/", summary="Welcome endpoint", tags=["General"])
def read_root():
    """Welcome endpoint for the API"""
    # Static greeting; handy as a quick reachability check.
    greeting = "Welcome to the Image Extractor API"
    return {"message": greeting}
131
+
132
@app.post(
    "/extract",
    response_model=ExtractImageResponse,
    responses={
        200: {"description": "Successfully extracted images"},
        500: {"description": "Server error", "model": ErrorResponse}
    },
    summary="Extract images from a URL",
    tags=["Extraction"]
)
async def extract_images(request: ExtractImageRequest):
    """
    Extract high-resolution images from a product URL.

    - **url**: URL of the product page to extract images from
    - **download_images**: Whether to download the images or just return URLs
    - **custom_output_dir**: Optional custom directory to save images to

    Returns information about extracted images and product measurements.
    """
    # Function-scope import keeps this edit self-contained.
    from fastapi.concurrency import run_in_threadpool

    def _field(obj, name, default=None):
        # Read `name` from either a mapping or an attribute-style object, so
        # both dict and dataclass results from the extractor are accepted.
        if isinstance(obj, dict):
            return obj.get(name, default)
        return getattr(obj, name, default)

    try:
        logger.info(f"Processing extraction request for URL: {request.url}")
        url = str(request.url)  # Convert from Pydantic HttpUrl to string

        # The extractor does blocking HTTP and disk I/O. Running it in the
        # thread pool keeps the event loop free for other requests.
        if request.download_images:
            logger.info(f"Downloading images to {'custom directory' if request.custom_output_dir else 'default directory'}")
            return await run_in_threadpool(
                process_product_page, url, request.custom_output_dir
            )

        logger.info("Extracting image URLs without downloading")
        extraction_result = await run_in_threadpool(extract_images_from_url, url)

        # Accept either key spelling for dict-shaped results.
        request_id = _field(extraction_result, "request_id")
        if request_id is None:
            request_id = _field(extraction_result, "requestId")

        # Convert the result to match our response model
        images = _field(extraction_result, "images", {}) or {}
        return {
            "request_id": request_id,
            "images": {
                img_id: {
                    "id": img_id,
                    "url": _field(img_info, "url"),
                    "alt": _field(img_info, "alt"),
                    "type": _field(img_info, "type"),
                }
                for img_id, img_info in images.items()
            },
            "measurements": _field(extraction_result, "measurements", {}),
            "materials": _field(extraction_result, "materials", {}),
        }

    except Exception as e:
        # Request boundary: log the traceback and report a 500 to the client.
        logger.error(f"Error processing URL: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Error processing URL: {str(e)}"
        ) from e
196
+
197
@app.get("/health", summary="Health check endpoint", tags=["Monitoring"])
def health_check():
    """
    Health check endpoint for monitoring the API status.

    Returns a simple status message indicating the API is healthy.
    """
    # Timestamp is the current time in seconds since the epoch.
    now = time.time()
    return {"status": "healthy", "timestamp": now}
205
+
206
+ # Run the server directly if the file is executed
207
if __name__ == "__main__":
    logger.info("Starting Image Extractor API server")
    # Serve the application object defined in this file. The previous
    # "app:app" import string pointed at app.py, which is the Gradio UI and
    # defines no `app`; this file's hyphenated name cannot be an import string
    # either. Auto-reload needs an import string, so it is not enabled here.
    uvicorn.run(app, host="0.0.0.0", port=8000)
image_extractor.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Image Extractor Module
3
+
4
+ This module extracts high-resolution product images and measurements from web pages.
5
+ Designed primarily for IKEA product pages but can be extended for other sites.
6
+ """
7
+
8
+ import uuid
9
+ import re
10
+ import os
11
+ import logging
12
+ from typing import Dict, Any, Optional, List, Tuple
13
+ from dataclasses import dataclass, field
14
+
15
+ import requests
16
+ from bs4 import BeautifulSoup
17
+ from PIL import Image
18
+ from io import BytesIO
19
+
20
+
21
+ # Configure logging
22
# Root logging setup: timestamped records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
27
+
28
+
29
@dataclass
class ImageInfo:
    """Record describing one extracted image."""
    # Only the source URL is required; everything else has a default.
    url: str
    alt: str = ""
    type: str = "unknown"
    # Local file path; None until the image has been saved to disk.
    path: Optional[str] = None
    id: Optional[str] = None
37
+
38
+
39
@dataclass
class ExtractionResult:
    """Everything collected from one page: images, measurements, materials."""
    request_id: str
    # Each instance gets its own fresh containers via default_factory.
    images: Dict[str, "ImageInfo"] = field(default_factory=dict)
    measurements: Dict[str, str] = field(default_factory=dict)
    materials: Dict[str, str] = field(default_factory=dict)
    output_dir: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of this result, with images flattened to dicts."""
        serialized_images = {}
        for key, info in self.images.items():
            # The mapping key is reported as the image's id.
            serialized_images[key] = {
                "id": key,
                "url": info.url,
                "alt": info.alt,
                "type": info.type,
                "path": info.path,
            }

        return {
            "request_id": self.request_id,
            "images": serialized_images,
            "measurements": self.measurements,
            "materials": self.materials,
            "output_dir": self.output_dir,
        }
67
+
68
+
69
class SrcsetParser:
    """Helper class for parsing srcset attributes from HTML img tags"""

    # First run of digits in a descriptor ("900w" -> 900, "2x" -> 2).
    _DIGITS = re.compile(r'\d+')

    @staticmethod
    def parse_srcset(srcset: str) -> List[Dict[str, Any]]:
        """
        Parse a srcset attribute into a structured list of image candidates.

        Candidates are split on commas. A candidate with no descriptor is
        kept with the implicit "1x" descriptor instead of being dropped, so
        single-URL srcsets still produce a result.

        Args:
            srcset: The srcset attribute from an img tag

        Returns:
            List of dicts with keys "url", "descriptor" and "width", where
            "width" is the first integer found in the descriptor (0 if none).
        """
        if not srcset:
            return []

        results = []
        for candidate in srcset.split(','):
            tokens = candidate.split()
            if not tokens:
                # Empty piece, e.g. from a trailing comma.
                continue

            url = tokens[0]
            descriptor = tokens[1] if len(tokens) > 1 else "1x"

            match = SrcsetParser._DIGITS.search(descriptor)
            width = int(match.group(0)) if match else 0
            results.append({"url": url, "descriptor": descriptor, "width": width})

        return results

    @classmethod
    def extract_f_xl_image(cls, srcset: str) -> Optional[str]:
        """
        Pick the preferred image URL from a srcset attribute.

        Preference order: a "900w" candidate whose URL contains "f=xl", then
        any "900w" candidate, then the candidate with the largest width.

        Args:
            srcset: The srcset attribute from an img tag

        Returns:
            The chosen URL, or None if the srcset has no candidates
        """
        entries = cls.parse_srcset(srcset)
        if not entries:
            return None

        # First, look for f=xl with 900w
        for entry in entries:
            if "f=xl" in entry["url"] and entry["descriptor"] == "900w":
                return entry["url"]

        # If not found, try any 900w image
        for entry in entries:
            if entry["descriptor"] == "900w":
                return entry["url"]

        # Finally, fall back to highest resolution. max() returns the first
        # of equally wide candidates, the same pick a stable descending
        # sort would make.
        return max(entries, key=lambda entry: entry["width"])["url"]
137
+
138
+
139
class ImageDownloader:
    """Helper class for downloading images"""

    @staticmethod
    def download_image(image_url: str, save_path: str) -> Optional[str]:
        """
        Download an image from URL and save it to disk.

        The response body is decoded with PIL and written back out via
        Image.save. Failures are logged and reported as None; nothing is
        raised to the caller.

        Args:
            image_url: URL of the image to download
            save_path: Path where the image will be saved

        Returns:
            The path to the saved image or None if download failed
        """
        try:
            # Create the target directory if needed. A bare filename has an
            # empty dirname, and os.makedirs("") raises, so skip it then.
            target_dir = os.path.dirname(save_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            # Get the image content
            response = requests.get(image_url, timeout=30)
            response.raise_for_status()

            # Save the image
            with Image.open(BytesIO(response.content)) as img:
                img.save(save_path)

            logger.info(f"Image saved to {save_path}")
            return save_path
        except requests.exceptions.RequestException as e:
            logger.error(f"Error downloading image: {e}")
            return None
        except IOError as e:
            logger.error(f"Error saving image: {e}")
            return None
        except Exception as e:
            # Deliberately best-effort: one bad image must not abort a batch.
            logger.error(f"Unexpected error while downloading image: {e}")
            return None
177
+
178
+
179
class WebPageFetcher:
    """Helper class for fetching web pages"""

    # Sent with every page request: a desktop Chrome User-Agent string.
    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    @classmethod
    def fetch_page(cls, url: str) -> Tuple[str, BeautifulSoup]:
        """
        Download a page and hand back both its raw HTML and a parsed tree.

        Args:
            url: The URL to fetch

        Returns:
            Tuple containing (raw_html, parsed_soup)

        Raises:
            requests.exceptions.RequestException: If the request fails or
                the server answers with an error status
        """
        logger.info(f"Fetching page: {url}")
        page = requests.get(url, headers=cls.DEFAULT_HEADERS, timeout=30)
        page.raise_for_status()

        markup = page.text
        return markup, BeautifulSoup(markup, 'html.parser')
208
+
209
+
210
class ProductExtractor:
    """Main class for extracting product information"""

    def __init__(self):
        self.srcset_parser = SrcsetParser()
        self.image_downloader = ImageDownloader()

    def extract_images_from_url(self, url: str) -> ExtractionResult:
        """
        Extract images (preferring f=xl 900w versions) and measurements from a URL.

        Args:
            url: The URL to extract images from

        Returns:
            ExtractionResult object with extracted image information

        Raises:
            requests.exceptions.RequestException: If the request fails
            Exception: Any other extraction error is logged and re-raised
        """
        try:
            logger.info(f"Extracting images from: {url}")

            # Fetch the HTML content
            _, soup = WebPageFetcher.fetch_page(url)

            # Generate a UUID for this request; it prefixes every image id.
            request_uuid = str(uuid.uuid4())
            logger.info(f"Generated request ID: {request_uuid}")

            result = ExtractionResult(request_id=request_uuid)

            # Targeted extraction first.
            self._extract_main_product_image(soup, result, request_uuid)
            self._extract_measurement_image(soup, result, request_uuid)

            # If no specific images found, try general approach
            if not result.images:
                self._extract_images_general_approach(soup, result, request_uuid)

            self._extract_measurements(soup, result)

            logger.info(f"Total images found: {len(result.images)}")
            logger.info(f"Measurements extracted: {result.measurements}")
            return result

        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching URL: {e}")
            raise
        except Exception as e:
            logger.error(f"Error extracting images: {e}")
            raise

    def _extract_typed_image(
        self,
        soup: BeautifulSoup,
        result: ExtractionResult,
        request_uuid: str,
        data_type: str,
        image_type: str,
        log_label: str,
    ) -> None:
        """Store the preferred srcset image found under div[data-type=<data_type>]."""
        element = soup.select_one(f'div[data-type="{data_type}"] img.pip-image')
        if element is None:
            return

        srcset = element.get('srcset')
        if not srcset:
            return

        target_url = self.srcset_parser.extract_f_xl_image(srcset)
        if not target_url:
            return

        logger.info(f"Found {log_label}: {target_url}")
        image_id = f"{request_uuid}-{image_type}"
        result.images[image_id] = ImageInfo(
            id=image_id,
            url=target_url,
            alt=element.get('alt', ''),
            type=image_type
        )

    def _extract_main_product_image(self, soup: BeautifulSoup, result: ExtractionResult, request_uuid: str) -> None:
        """Extract the main product image"""
        self._extract_typed_image(
            soup, result, request_uuid,
            data_type="MAIN_PRODUCT_IMAGE",
            image_type="main",
            log_label="main product image",
        )

    def _extract_measurement_image(self, soup: BeautifulSoup, result: ExtractionResult, request_uuid: str) -> None:
        """Extract the measurement illustration image"""
        self._extract_typed_image(
            soup, result, request_uuid,
            data_type="MEASUREMENT_ILLUSTRATION",
            image_type="measurement",
            log_label="measurement image",
        )

    def _extract_images_general_approach(self, soup: BeautifulSoup, result: ExtractionResult, request_uuid: str) -> None:
        """Extract images using a more general approach: every img with a srcset"""
        logger.info("No specific images found, trying general approach...")
        for i, img in enumerate(soup.select('img[srcset]')):
            srcset = img.get('srcset')
            target_url = self.srcset_parser.extract_f_xl_image(srcset)
            if target_url:
                img_type = self._determine_image_type(img)
                logger.info(f"Found {img_type} image: {target_url}")
                # The index keeps ids unique when several images share a type.
                image_id = f"{request_uuid}-{img_type}-{i}"
                result.images[image_id] = ImageInfo(
                    id=image_id,
                    url=target_url,
                    alt=img.get('alt', ''),
                    type=img_type
                )

    def _determine_image_type(self, img_element: BeautifulSoup) -> str:
        """Determine the type of image from the markup of its grandparent element"""
        parent_html = str(img_element.parent.parent)
        lowered = parent_html.lower()
        # Heuristic substring match on the surrounding markup.
        if "MAIN_PRODUCT_IMAGE" in parent_html or "main" in lowered:
            return "main"
        elif "MEASUREMENT" in parent_html or "measurement" in lowered:
            return "measurement"
        return "unknown"

    def _extract_measurements(self, soup: BeautifulSoup, result: ExtractionResult) -> None:
        """Extract product measurements into result.measurements (lower-cased label -> value)"""
        dimensions_ul = soup.select_one('ul.pip-product-dimensions__dimensions-container')
        if dimensions_ul is None:
            return

        for li in dimensions_ul.select('li.pip-product-dimensions__measurement-wrapper'):
            label_span = li.select_one('span.pip-product-dimensions__measurement-name')
            if label_span is None:
                continue

            # Use the stripped label on both sides. full_text is built from
            # stripped strings, so the unstripped label might not occur in it
            # and would then be left inside the value.
            raw_label = label_span.get_text(strip=True)
            label = raw_label.replace(":", "")
            full_text = li.get_text(strip=True)
            # Remove only the first occurrence so a value that happens to
            # repeat the label text is not mangled.
            value = full_text.replace(raw_label, '', 1).strip()
            result.measurements[label.lower()] = value

    def process_product_page(self, url: str, output_dir: Optional[str] = None) -> Dict[str, Any]:
        """
        Process a product page to extract and save high-resolution images.

        Files are named after the image type ("main.jpg", "measurement.jpg").
        When several images share a type, later ones get a numeric suffix
        ("unknown-2.jpg") so they do not overwrite each other.

        Args:
            url: The product page URL
            output_dir: Optional custom output directory; defaults to
                "output/<request_id>"

        Returns:
            Dictionary with paths to downloaded images and other product information
        """
        # Extract images and measurements
        extraction_result = self.extract_images_from_url(url)

        # Create a directory for the images using the request ID
        # NOTE(review): output_dir is used exactly as given; if it can come
        # from an untrusted client, restrict it to a safe base directory.
        if not output_dir:
            output_dir = f"output/{extraction_result.request_id}"

        extraction_result.output_dir = output_dir

        used_filenames = set()

        for image_info in extraction_result.images.values():
            # Determine filename based on image type; the extension comes
            # from the URL path (query string removed), defaulting to .jpg.
            image_type = image_info.type
            file_ext = os.path.splitext(image_info.url.split('?')[0])[1] or '.jpg'
            filename = f"{image_type}{file_ext}"

            suffix = 1
            while filename in used_filenames:
                suffix += 1
                filename = f"{image_type}-{suffix}{file_ext}"
            used_filenames.add(filename)

            # Download the image; on failure the path simply stays None.
            save_path = os.path.join(output_dir, filename)
            image_path = self.image_downloader.download_image(image_info.url, save_path)

            if image_path:
                image_info.path = image_path

        logger.info(f"Images downloaded to directory: {output_dir}")

        return extraction_result.to_dict()
382
+
383
+
384
+ # Create a singleton instance for easy import
385
extractor = ProductExtractor()

# Module-level aliases so callers can import plain functions rather than
# reaching through the shared extractor instance.
extract_images_from_url = extractor.extract_images_from_url
process_product_page = extractor.process_product_page
download_image = ImageDownloader.download_image