| """ | |
| Image Downloader for Photo Dataset | |
| -------------------------------- | |
| This script downloads and optimizes images from URLs in a photos.csv file with parallel processing. | |
| Requirements: | |
| pip install pandas pillow requests tqdm concurrent.futures | |
| Usage: | |
| 1. Ensure photos_url.csv is in the same directory as this script | |
| 2. Run the script: python download_images.py | |
| 3. Images will be downloaded to the 'images' folder | |
| Note: | |
| - Default image size is 800x800 pixels (maintains aspect ratio) | |
| - Images are saved as optimized JPEGs | |
| - You can modify num_images parameter to download fewer images | |
| - Approximate size of the dataset is 1.5GB and total images are 25,000 images | |
| - Uses parallel downloading for maximum efficiency | |
| """ | |
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from pathlib import Path

import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm

def download_single_image(args):
    """
    Download a single image with error handling.

    Args:
        args: Tuple of (idx, url, output_path, target_size)

    Returns:
        Tuple of (success, idx, error_message)
    """
    idx, url, output_path, target_size = args
    try:
        # Skip if file already exists
        if os.path.exists(output_path):
            return True, idx, "Already exists"
        response = requests.get(url, timeout=15)
        if response.status_code == 200:
            # Process image
            img = Image.open(BytesIO(response.content))
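            # JPEG cannot store an alpha channel or a palette, so convert
            # transparent/palette modes to RGB before saving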
            if img.mode in ('RGBA', 'LA', 'P'):
                img = img.convert('RGB')
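            # thumbnail() resizes in place, never upscales, and preserves the
            # aspect ratio; LANCZOS is a high-quality downsampling filter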
            img.thumbnail(target_size, Image.Resampling.LANCZOS)
            img.save(output_path, 'JPEG', quality=85, optimize=True)
            return True, idx, None
        else:
            return False, idx, f"HTTP {response.status_code}"
    except Exception as e:
        return False, idx, str(e)

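# A minimal, hypothetical example of calling download_single_image directly
# (the URL below is illustrative, not taken from the dataset):
#
#   success, idx, err = download_single_image(
#       (0, "https://example.com/photo.jpg", "images/0001.jpg", (800, 800)))
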
def check_images_downloaded(output_dir="images", expected_count=None):
    """
    Check if images are already downloaded.

    Args:
        output_dir: Directory to check for images
        expected_count: Expected number of images (optional)

    Returns:
        Tuple of (is_complete, current_count, missing_count)
    """
    images_dir = Path(output_dir)
    if not images_dir.exists():
        return False, 0, expected_count or 0
    # Count existing images
    existing_images = list(images_dir.glob("*.jpg"))
    current_count = len(existing_images)
    if expected_count is None:
        # Try to get the expected count from the CSV
        try:
            df = pd.read_csv("photos_url.csv")
            expected_count = len(df)
        except Exception:
            expected_count = current_count
    missing_count = max(0, expected_count - current_count)
    is_complete = missing_count == 0
    return is_complete, current_count, missing_count

def download_images(num_images=None, output_dir="images", target_size=(800, 800), max_workers=20):
    """
    Download and optimize images from photos_url.csv with parallel processing.

    Args:
        num_images: Number of images to download (default: all images in CSV)
        output_dir: Directory to save images (default: 'images')
        target_size: Max image dimensions (default: (800, 800))
        max_workers: Maximum number of parallel download threads (default: 20)
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    # Read CSV and prepare dataset
    df = pd.read_csv("photos_url.csv")
    if num_images:
        df = df.head(num_images)
    total_images = len(df)
    print(f"Total images to process: {total_images:,}")
    # Check existing images
    is_complete, current_count, missing_count = check_images_downloaded(output_dir, total_images)
    if is_complete:
        print(f"All {current_count:,} images are already downloaded!")
        return True
    print(f"Found {current_count:,} existing images, need to download {missing_count:,} more")
    # Prepare download tasks - only for missing images
    download_tasks = []
    for idx, row in df.iterrows():
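        # Files are named 0001.jpg, 0002.jpg, ... following the CSV row order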
| filename = f"{(idx+1):04d}.jpg" | |
| output_path = os.path.join(output_dir, filename) | |
| # Only add to download tasks if file doesn't exist | |
| if not os.path.exists(output_path): | |
| download_tasks.append((idx, row['photo_image_url'], output_path, target_size)) | |
| if not download_tasks: | |
| print("β All images are already downloaded!") | |
| return True | |
| print(f"π Starting parallel download of {len(download_tasks):,} missing images with {max_workers} workers...") | |
| start_time = time.time() | |
| successful_downloads = 0 | |
| failed_downloads = 0 | |
| skipped_downloads = 0 | |
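    # ThreadPoolExecutor suits this I/O-bound workload: threads spend most of
    # their time waiting on network responses, so the GIL is not a bottleneck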
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_task = {executor.submit(download_single_image, task): task for task in download_tasks}
        # Process completed tasks with progress bar
        with tqdm(total=len(download_tasks), desc="Downloading images") as pbar:
            for future in as_completed(future_to_task):
                success, idx, error = future.result()
                if success:
                    if error == "Already exists":
                        skipped_downloads += 1
                    else:
                        successful_downloads += 1
                else:
                    failed_downloads += 1
                    # Only show errors other than 404s and DNS failures to reduce noise
                    if error and "404" not in str(error) and "NameResolutionError" not in str(error):
                        print(f"Failed to download image {idx+1}: {error}")
                pbar.update(1)
    end_time = time.time()
    duration = end_time - start_time
    print("\nDownload Summary:")
    print(f"  New downloads: {successful_downloads:,}")
    print(f"  Skipped (already exist): {skipped_downloads:,}")
    print(f"  Failed: {failed_downloads:,}")
    print(f"  Duration: {duration:.1f} seconds")
    if duration > 0:
        print(f"  Speed: {successful_downloads/duration:.1f} images/second")
    # Final check
    is_complete, final_count, final_missing = check_images_downloaded(output_dir, total_images)
    if final_count >= total_images * 0.95:  # Consider the run successful at 95% coverage or more
        print(f"Download completed! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    elif final_count > current_count:
        print(f"Download partially successful! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    else:
        print(f"Download had issues. Still have {final_count:,} images ({final_missing:,} missing)")
        return False

| if __name__ == "__main__": | |
| import argparse | |
| parser = argparse.ArgumentParser(description="Download images with parallel processing") | |
| parser.add_argument("--num-images", type=int, default=None, help="Number of images to download (default: all)") | |
| parser.add_argument("--output-dir", type=str, default="images", help="Output directory (default: images)") | |
| parser.add_argument("--max-workers", type=int, default=20, help="Maximum parallel workers (default: 20)") | |
| parser.add_argument("--check-only", action="store_true", help="Only check if images are downloaded") | |
| args = parser.parse_args() | |
| if args.check_only: | |
| # Just check the status | |
| is_complete, current_count, missing_count = check_images_downloaded(args.output_dir) | |
| if is_complete: | |
| print(f"β All {current_count:,} images are downloaded!") | |
| else: | |
| print(f"π Status: {current_count:,} downloaded, {missing_count:,} missing") | |
| else: | |
| # Download images | |
| success = download_images( | |
| num_images=args.num_images, | |
| output_dir=args.output_dir, | |
| max_workers=args.max_workers | |
| ) | |
| exit(0 if success else 1) |
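
# Example invocations (assuming photos_url.csv sits next to this script):
#   python download_images.py                    # download every image in the CSV
#   python download_images.py --num-images 100   # download only the first 100
#   python download_images.py --check-only       # report status without downloading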