""" | |
Image Downloader for Photo Dataset | |
-------------------------------- | |
This script downloads and optimizes images from URLs in a photos.csv file with parallel processing. | |
Requirements: | |
pip install pandas pillow requests tqdm concurrent.futures | |
Usage: | |
1. Ensure photos_url.csv is in the same directory as this script | |
2. Run the script: python download_images.py | |
3. Images will be downloaded to the 'images' folder | |
Note: | |
- Default image size is 800x800 pixels (maintains aspect ratio) | |
- Images are saved as optimized JPEGs | |
- You can modify num_images parameter to download fewer images | |
- Approximate size of the dataset is 1.5GB and total images are 25,000 images | |
- Uses parallel downloading for maximum efficiency | |
""" | |
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from pathlib import Path

import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm


def download_single_image(args):
    """
    Download a single image with error handling.

    Args:
        args: Tuple of (idx, url, output_path, target_size)

    Returns:
        Tuple of (success, idx, error_message)
    """
    idx, url, output_path, target_size = args
    try:
        # Skip if the file already exists
        if os.path.exists(output_path):
            return True, idx, "Already exists"
        response = requests.get(url, timeout=15)
        if response.status_code == 200:
            # Decode, downscale (thumbnail preserves aspect ratio), re-encode
            img = Image.open(BytesIO(response.content))
            if img.mode not in ('RGB', 'L'):
                # JPEG cannot store alpha or palette data, so normalize to RGB
                img = img.convert('RGB')
            img.thumbnail(target_size, Image.Resampling.LANCZOS)
            img.save(output_path, 'JPEG', quality=85, optimize=True)
            return True, idx, None
        else:
            return False, idx, f"HTTP {response.status_code}"
    except Exception as e:
        return False, idx, str(e)
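
# A direct call outside the thread pool looks like this; the URL and output
# path below are placeholders, not rows from photos_url.csv:
#   ok, i, err = download_single_image(
#       (0, "https://example.com/photo.jpg", "images/0001.jpg", (800, 800)))
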
def check_images_downloaded(output_dir="images", expected_count=None):
    """
    Check whether the images are already downloaded.

    Args:
        output_dir: Directory to check for images
        expected_count: Expected number of images (optional)

    Returns:
        Tuple of (is_complete, current_count, missing_count)
    """
    images_dir = Path(output_dir)
    if not images_dir.exists():
        return False, 0, expected_count or 0
    # Count existing images
    existing_images = list(images_dir.glob("*.jpg"))
    current_count = len(existing_images)
    if expected_count is None:
        # Fall back to the CSV row count when no expectation is given
        try:
            df = pd.read_csv("photos_url.csv")
            expected_count = len(df)
        except Exception:
            expected_count = current_count
    missing_count = max(0, expected_count - current_count)
    is_complete = missing_count == 0
    return is_complete, current_count, missing_count
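
# Standalone progress check; expected_count falls back to the CSV row count:
#   done, have, missing = check_images_downloaded("images")
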
def download_images(num_images=None, output_dir="images", target_size=(800, 800), max_workers=20):
    """
    Download and optimize images from photos_url.csv with parallel processing.

    Args:
        num_images: Number of images to download (default: all images in CSV)
        output_dir: Directory to save images (default: 'images')
        target_size: Max image dimensions (default: (800, 800))
        max_workers: Maximum number of parallel download threads (default: 20)
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    # Read CSV and prepare dataset
    df = pd.read_csv("photos_url.csv")
    if num_images:
        df = df.head(num_images)
    total_images = len(df)
    print(f"📊 Total images to process: {total_images:,}")
    # Check existing images
    is_complete, current_count, missing_count = check_images_downloaded(output_dir, total_images)
    if is_complete:
        print(f"✅ All {current_count:,} images are already downloaded!")
        return True
    print(f"📥 Found {current_count:,} existing images, need to download {missing_count:,} more")
    # Prepare download tasks - only for missing images
    download_tasks = []
    for idx, row in df.iterrows():
        # Zero-padded sequential name; the width grows to 5 digits past 9,999
        filename = f"{(idx + 1):04d}.jpg"
        output_path = os.path.join(output_dir, filename)
        # Only queue the download if the file doesn't exist yet
        if not os.path.exists(output_path):
            download_tasks.append((idx, row['photo_image_url'], output_path, target_size))
    if not download_tasks:
        print("✅ All images are already downloaded!")
        return True
    print(f"🚀 Starting parallel download of {len(download_tasks):,} missing images with {max_workers} workers...")
    start_time = time.time()
    successful_downloads = 0
    failed_downloads = 0
    skipped_downloads = 0
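    # ThreadPoolExecutor suits this I/O-bound work: worker threads spend most
    # of their time blocked on the network, and as_completed() hands results
    # back in whatever order the downloads finish.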
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_task = {executor.submit(download_single_image, task): task for task in download_tasks}
        # Process completed tasks with a progress bar
        with tqdm(total=len(download_tasks), desc="Downloading images") as pbar:
            for future in as_completed(future_to_task):
                success, idx, error = future.result()
                if success:
                    if error == "Already exists":
                        skipped_downloads += 1
                    else:
                        successful_downloads += 1
                else:
                    failed_downloads += 1
                    # Suppress 404s and DNS failures to reduce noise
                    if error and "404" not in str(error) and "NameResolutionError" not in str(error):
                        print(f"❌ Failed to download image {idx + 1}: {error}")
                pbar.update(1)
    end_time = time.time()
    duration = end_time - start_time
    print("\n📊 Download Summary:")
    print(f"   ✅ New downloads: {successful_downloads:,}")
    print(f"   ⏭️ Skipped (already exist): {skipped_downloads:,}")
    print(f"   ❌ Failed: {failed_downloads:,}")
    print(f"   ⏱️ Duration: {duration:.1f} seconds")
    if duration > 0:
        print(f"   ⚡ Speed: {successful_downloads / duration:.1f} images/second")
    # Final check
    is_complete, final_count, final_missing = check_images_downloaded(output_dir, total_images)
    if final_count >= total_images * 0.95:  # Treat 95% coverage or better as success
        print(f"🎉 Download completed! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    elif final_count > current_count:
        print(f"✅ Download partially successful! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    else:
        print(f"⚠️ Download had issues. Still have {final_count:,} images ({final_missing:,} missing)")
        return False
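
# The downloader can also be driven from another script; a small sketch, with
# num_images=100 as an arbitrary test value:
#   from download_images import download_images
#   download_images(num_images=100, max_workers=10)
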
if __name__ == "__main__": | |
import argparse | |
parser = argparse.ArgumentParser(description="Download images with parallel processing") | |
parser.add_argument("--num-images", type=int, default=None, help="Number of images to download (default: all)") | |
parser.add_argument("--output-dir", type=str, default="images", help="Output directory (default: images)") | |
parser.add_argument("--max-workers", type=int, default=20, help="Maximum parallel workers (default: 20)") | |
parser.add_argument("--check-only", action="store_true", help="Only check if images are downloaded") | |
args = parser.parse_args() | |
if args.check_only: | |
# Just check the status | |
is_complete, current_count, missing_count = check_images_downloaded(args.output_dir) | |
if is_complete: | |
print(f"β All {current_count:,} images are downloaded!") | |
else: | |
print(f"π Status: {current_count:,} downloaded, {missing_count:,} missing") | |
else: | |
# Download images | |
success = download_images( | |
num_images=args.num_images, | |
output_dir=args.output_dir, | |
max_workers=args.max_workers | |
) | |
exit(0 if success else 1) |