""" | |
Image Downloader for Photo Dataset | |
-------------------------------- | |
This script downloads and optimizes images from URLs in a photos.csv file with parallel processing. | |
Requirements: | |
pip install pandas pillow requests tqdm concurrent.futures | |
Usage: | |
1. Ensure photos_url.csv is in the same directory as this script | |
2. Run the script: python download_images.py | |
3. Images will be downloaded to the 'images' folder | |
Note: | |
- Default image size is 800x800 pixels (maintains aspect ratio) | |
- Images are saved as optimized JPEGs | |
- You can modify num_images parameter to download fewer images | |
- Approximate size of the dataset is 1.5GB and total images are 25,000 images | |
- Uses parallel downloading for maximum efficiency | |
""" | |
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from pathlib import Path

import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm


def download_single_image(args):
    """
    Download a single image with error handling.

    Args:
        args: Tuple of (idx, url, output_path, target_size)

    Returns:
        Tuple of (success, idx, error_message)
    """
    idx, url, output_path, target_size = args
    try:
        # Skip if the file already exists
        if os.path.exists(output_path):
            return True, idx, "Already exists"
        response = requests.get(url, timeout=15)
        if response.status_code == 200:
            # Decode, downscale (thumbnail preserves aspect ratio), re-encode
            img = Image.open(BytesIO(response.content))
            if img.mode not in ('RGB', 'L'):
                # JPEG cannot store alpha or palette data, so normalize to RGB
                img = img.convert('RGB')
            img.thumbnail(target_size, Image.Resampling.LANCZOS)
            img.save(output_path, 'JPEG', quality=85, optimize=True)
            return True, idx, None
        else:
            return False, idx, f"HTTP {response.status_code}"
    except Exception as e:
        return False, idx, str(e)
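
# A direct call outside the thread pool looks like this; the URL and output
# path below are placeholders, not rows from photos_url.csv:
#   ok, i, err = download_single_image(
#       (0, "https://example.com/photo.jpg", "images/0001.jpg", (800, 800)))
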
def check_images_downloaded(output_dir="images", expected_count=None):
    """
    Check whether the images are already downloaded.

    Args:
        output_dir: Directory to check for images
        expected_count: Expected number of images (optional)

    Returns:
        Tuple of (is_complete, current_count, missing_count)
    """
    images_dir = Path(output_dir)
    if not images_dir.exists():
        return False, 0, expected_count or 0
    # Count existing images
    existing_images = list(images_dir.glob("*.jpg"))
    current_count = len(existing_images)
    if expected_count is None:
        # Fall back to the CSV row count when no expectation is given
        try:
            df = pd.read_csv("photos_url.csv")
            expected_count = len(df)
        except Exception:
            expected_count = current_count
    missing_count = max(0, expected_count - current_count)
    is_complete = missing_count == 0
    return is_complete, current_count, missing_count
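
# Standalone progress check; expected_count falls back to the CSV row count:
#   done, have, missing = check_images_downloaded("images")
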
def download_images(num_images=None, output_dir="images", target_size=(800, 800), max_workers=20):
    """
    Download and optimize images from photos_url.csv with parallel processing.

    Args:
        num_images: Number of images to download (default: all images in CSV)
        output_dir: Directory to save images (default: 'images')
        target_size: Max image dimensions (default: (800, 800))
        max_workers: Maximum number of parallel download threads (default: 20)
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    # Read CSV and prepare dataset
    df = pd.read_csv("photos_url.csv")
    if num_images:
        df = df.head(num_images)
    total_images = len(df)
    print(f"📊 Total images to process: {total_images:,}")
    # Check existing images
    is_complete, current_count, missing_count = check_images_downloaded(output_dir, total_images)
    if is_complete:
        print(f"✅ All {current_count:,} images are already downloaded!")
        return True
    print(f"📥 Found {current_count:,} existing images, need to download {missing_count:,} more")
    # Prepare download tasks - only for missing images
    download_tasks = []
    for idx, row in df.iterrows():
        # Zero-padded sequential name; the width grows to 5 digits past 9,999
        filename = f"{(idx + 1):04d}.jpg"
        output_path = os.path.join(output_dir, filename)
        # Only queue the download if the file doesn't exist yet
        if not os.path.exists(output_path):
            download_tasks.append((idx, row['photo_image_url'], output_path, target_size))
    if not download_tasks:
        print("✅ All images are already downloaded!")
        return True
    print(f"🚀 Starting parallel download of {len(download_tasks):,} missing images with {max_workers} workers...")
    start_time = time.time()
    successful_downloads = 0
    failed_downloads = 0
    skipped_downloads = 0
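    # ThreadPoolExecutor suits this I/O-bound work: worker threads spend most
    # of their time blocked on the network, and as_completed() hands results
    # back in whatever order the downloads finish.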
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_task = {executor.submit(download_single_image, task): task for task in download_tasks}
        # Process completed tasks with a progress bar
        with tqdm(total=len(download_tasks), desc="Downloading images") as pbar:
            for future in as_completed(future_to_task):
                success, idx, error = future.result()
                if success:
                    if error == "Already exists":
                        skipped_downloads += 1
                    else:
                        successful_downloads += 1
                else:
                    failed_downloads += 1
                    # Suppress 404s and DNS failures to reduce noise
                    if error and "404" not in str(error) and "NameResolutionError" not in str(error):
                        print(f"❌ Failed to download image {idx + 1}: {error}")
                pbar.update(1)
    end_time = time.time()
    duration = end_time - start_time
    print("\n📊 Download Summary:")
    print(f"   ✅ New downloads: {successful_downloads:,}")
    print(f"   ⏭️ Skipped (already exist): {skipped_downloads:,}")
    print(f"   ❌ Failed: {failed_downloads:,}")
    print(f"   ⏱️ Duration: {duration:.1f} seconds")
    if duration > 0:
        print(f"   ⚡ Speed: {successful_downloads / duration:.1f} images/second")
    # Final check
    is_complete, final_count, final_missing = check_images_downloaded(output_dir, total_images)
    if final_count >= total_images * 0.95:  # Treat 95% coverage or better as success
        print(f"🎉 Download completed! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    elif final_count > current_count:
        print(f"✅ Download partially successful! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    else:
        print(f"⚠️ Download had issues. Still have {final_count:,} images ({final_missing:,} missing)")
        return False
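
# The downloader can also be driven from another script; a small sketch, with
# num_images=100 as an arbitrary test value:
#   from download_images import download_images
#   download_images(num_images=100, max_workers=10)
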
if __name__ == "__main__": | |
import argparse | |
parser = argparse.ArgumentParser(description="Download images with parallel processing") | |
parser.add_argument("--num-images", type=int, default=None, help="Number of images to download (default: all)") | |
parser.add_argument("--output-dir", type=str, default="images", help="Output directory (default: images)") | |
parser.add_argument("--max-workers", type=int, default=20, help="Maximum parallel workers (default: 20)") | |
parser.add_argument("--check-only", action="store_true", help="Only check if images are downloaded") | |
args = parser.parse_args() | |
if args.check_only: | |
# Just check the status | |
is_complete, current_count, missing_count = check_images_downloaded(args.output_dir) | |
if is_complete: | |
print(f"β All {current_count:,} images are downloaded!") | |
else: | |
print(f"π Status: {current_count:,} downloaded, {missing_count:,} missing") | |
else: | |
# Download images | |
success = download_images( | |
num_images=args.num_images, | |
output_dir=args.output_dir, | |
max_workers=args.max_workers | |
) | |
exit(0 if success else 1) |