"""
Image Downloader for Photo Dataset
--------------------------------
This script downloads and optimizes images from URLs in a photos.csv file with parallel processing.
Requirements:
pip install pandas pillow requests tqdm concurrent.futures
Usage:
1. Ensure photos_url.csv is in the same directory as this script
2. Run the script: python download_images.py
3. Images will be downloaded to the 'images' folder
Note:
- Default image size is 800x800 pixels (maintains aspect ratio)
- Images are saved as optimized JPEGs
- You can modify num_images parameter to download fewer images
- Approximate size of the dataset is 1.5GB and total images are 25,000 images
- Uses parallel downloading for maximum efficiency
"""
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from pathlib import Path

import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm

def download_single_image(args):
    """
    Download a single image with error handling.

    Args:
        args: Tuple of (idx, url, output_path, target_size)

    Returns:
        Tuple of (success, idx, error_message)
    """
    idx, url, output_path, target_size = args
    try:
        # Skip if the file already exists
        if os.path.exists(output_path):
            return True, idx, "Already exists"

        response = requests.get(url, timeout=15, stream=True)
        if response.status_code == 200:
            # Decode, normalize to RGB, downscale, and save as an optimized JPEG
            img = Image.open(BytesIO(response.content))
            if img.mode in ('RGBA', 'P'):
                img = img.convert('RGB')
            img.thumbnail(target_size, Image.Resampling.LANCZOS)
            img.save(output_path, 'JPEG', quality=85, optimize=True)
            return True, idx, None
        else:
            return False, idx, f"HTTP {response.status_code}"
    except Exception as e:
        return False, idx, str(e)
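
# Example: fetch a single image outside the thread pool (the URL is a
# hypothetical placeholder; real URLs come from photos_url.csv):
#     ok, idx, err = download_single_image(
#         (0, "https://example.com/photo.jpg", "images/0001.jpg", (800, 800))
#     )
#     print(ok, err)  # e.g. "True None" on success, "False HTTP 404" on a miss
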
def check_images_downloaded(output_dir="images", expected_count=None):
    """
    Check whether images are already downloaded.

    Args:
        output_dir: Directory to check for images
        expected_count: Expected number of images (optional)

    Returns:
        Tuple of (is_complete, current_count, missing_count)
    """
    images_dir = Path(output_dir)
    if not images_dir.exists():
        return False, 0, expected_count or 0

    # Count existing images
    existing_images = list(images_dir.glob("*.jpg"))
    current_count = len(existing_images)

    if expected_count is None:
        # Fall back to the row count of the CSV for the expected total
        try:
            df = pd.read_csv("photos_url.csv")
            expected_count = len(df)
        except Exception:
            expected_count = current_count

    missing_count = max(0, expected_count - current_count)
    is_complete = missing_count == 0
    return is_complete, current_count, missing_count
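
# Example: gauge progress programmatically (a sketch; 25,000 is the dataset
# size noted in the module docstring):
#     done, have, missing = check_images_downloaded("images", expected_count=25000)
#     if not done:
#         print(f"{have:,} downloaded, {missing:,} still missing")
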
def download_images(num_images=None, output_dir="images", target_size=(800, 800), max_workers=20):
    """
    Download and optimize images from photos_url.csv with parallel processing.

    Args:
        num_images: Number of images to download (default: all images in the CSV)
        output_dir: Directory to save images (default: 'images')
        target_size: Max image dimensions (default: (800, 800))
        max_workers: Maximum number of parallel download threads (default: 20)
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Read CSV and prepare dataset
    df = pd.read_csv("photos_url.csv")
    if num_images:
        df = df.head(num_images)
    total_images = len(df)
    print(f"📊 Total images to process: {total_images:,}")

    # Check existing images
    is_complete, current_count, missing_count = check_images_downloaded(output_dir, total_images)
    if is_complete:
        print(f"✅ All {current_count:,} images are already downloaded!")
        return True

    print(f"📥 Found {current_count:,} existing images, need to download {missing_count:,} more")

    # Prepare download tasks, but only for missing images
    download_tasks = []
    for idx, row in df.iterrows():
        filename = f"{(idx + 1):04d}.jpg"
        output_path = os.path.join(output_dir, filename)
        # Only add a task if the file doesn't exist yet
        if not os.path.exists(output_path):
            download_tasks.append((idx, row['photo_image_url'], output_path, target_size))

    if not download_tasks:
        print("✅ All images are already downloaded!")
        return True

    print(f"🚀 Starting parallel download of {len(download_tasks):,} missing images with {max_workers} workers...")
    start_time = time.time()

    successful_downloads = 0
    failed_downloads = 0
    skipped_downloads = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_task = {executor.submit(download_single_image, task): task for task in download_tasks}

        # Process completed tasks with a progress bar
        with tqdm(total=len(download_tasks), desc="Downloading images") as pbar:
            for future in as_completed(future_to_task):
                success, idx, error = future.result()
                if success:
                    if error == "Already exists":
                        skipped_downloads += 1
                    else:
                        successful_downloads += 1
                else:
                    failed_downloads += 1
                    if error and error != "Already exists":
                        # Suppress 404s and DNS failures to reduce noise
                        if "404" not in str(error) and "NameResolutionError" not in str(error):
                            print(f"❌ Failed to download image {idx + 1}: {error}")
                pbar.update(1)

    end_time = time.time()
    duration = end_time - start_time

    print("\n📊 Download Summary:")
    print(f"  ✅ New downloads: {successful_downloads:,}")
    print(f"  ⏭️ Skipped (already exist): {skipped_downloads:,}")
    print(f"  ❌ Failed: {failed_downloads:,}")
    print(f"  ⏱️ Duration: {duration:.1f} seconds")
    if duration > 0:
        print(f"  📈 Speed: {successful_downloads / duration:.1f} images/second")

    # Final check: treat 95% coverage or better as success
    is_complete, final_count, final_missing = check_images_downloaded(output_dir, total_images)
    if final_count >= total_images * 0.95:
        print(f"🎉 Download completed! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    elif final_count > current_count:
        print(f"✅ Download partially successful! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    else:
        print(f"⚠️ Download had issues. Still have {final_count:,} images ({final_missing:,} missing)")
        return False
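
# Programmatic use (a minimal sketch, assuming this file is saved as
# download_images.py and photos_url.csv provides a 'photo_image_url' column):
#     from download_images import download_images
#     download_images(num_images=100, output_dir="images", max_workers=10)
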
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Download images with parallel processing")
    parser.add_argument("--num-images", type=int, default=None, help="Number of images to download (default: all)")
    parser.add_argument("--output-dir", type=str, default="images", help="Output directory (default: images)")
    parser.add_argument("--max-workers", type=int, default=20, help="Maximum parallel workers (default: 20)")
    parser.add_argument("--check-only", action="store_true", help="Only check if images are downloaded")
    args = parser.parse_args()

    if args.check_only:
        # Just report the status
        is_complete, current_count, missing_count = check_images_downloaded(args.output_dir)
        if is_complete:
            print(f"✅ All {current_count:,} images are downloaded!")
        else:
            print(f"📊 Status: {current_count:,} downloaded, {missing_count:,} missing")
    else:
        # Download images
        success = download_images(
            num_images=args.num_images,
            output_dir=args.output_dir,
            max_workers=args.max_workers
        )
        exit(0 if success else 1)