File size: 8,317 Bytes
427f366
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
"""
Image Downloader for Photo Dataset
--------------------------------
This script downloads and optimizes images from URLs in a photos.csv file with parallel processing.

Requirements:
    pip install pandas pillow requests tqdm concurrent.futures

Usage:
    1. Ensure photos_url.csv is in the same directory as this script
    2. Run the script: python download_images.py
    3. Images will be downloaded to the 'images' folder

Note: 
    - Default image size is 800x800 pixels (maintains aspect ratio)
    - Images are saved as optimized JPEGs
    - You can modify num_images parameter to download fewer images
    - Approximate size of the dataset is 1.5GB and total images are 25,000 images
    - Uses parallel downloading for maximum efficiency
"""

import pandas as pd
import requests
import os
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from pathlib import Path

def download_single_image(args):
    """
    Download, resize, and save one image as an optimized JPEG.

    Args:
        args: Tuple of (idx, url, output_path, target_size) where
            idx is the source row index, url is the image URL,
            output_path is the destination .jpg path, and
            target_size is the (max_width, max_height) bound.

    Returns:
        Tuple of (success, idx, error_message). error_message is None on a
        fresh download, "Already exists" when the file was skipped, or a
        short description of the failure.
    """
    idx, url, output_path, target_size = args

    try:
        # Skip if file already exists (lets interrupted runs resume cheaply)
        if os.path.exists(output_path):
            return True, idx, "Already exists"

        # With stream=True the underlying connection is only released when
        # the response is closed, so use it as a context manager.
        with requests.get(url, timeout=15, stream=True) as response:
            if response.status_code != 200:
                return False, idx, f"HTTP {response.status_code}"
            img = Image.open(BytesIO(response.content))

        # JPEG cannot encode alpha or palette data; the original check missed
        # LA/PA (grayscale+alpha) modes, which made img.save() raise.
        if img.mode in ('RGBA', 'P', 'LA', 'PA'):
            img = img.convert('RGB')
        # thumbnail() resizes in place, preserves aspect ratio, never upscales
        img.thumbnail(target_size, Image.Resampling.LANCZOS)
        img.save(output_path, 'JPEG', quality=85, optimize=True)
        return True, idx, None

    except Exception as e:
        # Broad catch is deliberate: one bad URL must not kill the worker
        # pool; the caller aggregates and reports failures per image.
        return False, idx, str(e)

def check_images_downloaded(output_dir="images", expected_count=None):
    """
    Check if images are already downloaded
    
    Args:
        output_dir: Directory to check for images
        expected_count: Expected number of images (optional)
    
    Returns:
        Tuple of (is_complete, current_count, missing_count)
    """
    images_dir = Path(output_dir)
    if not images_dir.exists():
        return False, 0, expected_count or 0
    
    # Count existing images
    existing_images = list(images_dir.glob("*.jpg"))
    current_count = len(existing_images)
    
    if expected_count is None:
        # Try to get expected count from CSV
        try:
            df = pd.read_csv("photos_url.csv")
            expected_count = len(df)
        except:
            expected_count = current_count
    
    missing_count = max(0, expected_count - current_count)
    is_complete = missing_count == 0
    
    return is_complete, current_count, missing_count

def download_images(num_images=None, output_dir="images", target_size=(800, 800), max_workers=20):
    """
    Download and optimize images listed in photos_url.csv with parallel processing.

    Already-downloaded files are skipped, so interrupted runs can be resumed.

    Args:
        num_images: Number of images to download (default: all images in CSV)
        output_dir: Directory to save images (default: 'images')
        target_size: Max image dimensions (default: (800, 800))
        max_workers: Maximum number of parallel download threads (default: 20)

    Returns:
        True when the run is considered successful (>=95% of images present,
        or at least some progress was made), False otherwise.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Read CSV and prepare dataset
    df = pd.read_csv("photos_url.csv")
    # NOTE(review): num_images=0 is falsy, so 0 downloads ALL images — confirm intended
    if num_images:
        df = df.head(num_images)
    
    total_images = len(df)
    print(f"πŸ“Š Total images to process: {total_images:,}")
    
    # Check existing images
    is_complete, current_count, missing_count = check_images_downloaded(output_dir, total_images)
    
    if is_complete:
        print(f"βœ… All {current_count:,} images are already downloaded!")
        return True
    
    print(f"πŸ“₯ Found {current_count:,} existing images, need to download {missing_count:,} more")
    
    # Prepare download tasks - only for missing images
    download_tasks = []
    for idx, row in df.iterrows():
        # Filenames are 1-based, zero-padded to 4 digits (0001.jpg, 0002.jpg, ...)
        filename = f"{(idx+1):04d}.jpg"
        output_path = os.path.join(output_dir, filename)
        
        # Only add to download tasks if file doesn't exist
        if not os.path.exists(output_path):
            download_tasks.append((idx, row['photo_image_url'], output_path, target_size))
    
    if not download_tasks:
        print("βœ… All images are already downloaded!")
        return True
    
    print(f"πŸš€ Starting parallel download of {len(download_tasks):,} missing images with {max_workers} workers...")
    start_time = time.time()
    
    successful_downloads = 0
    failed_downloads = 0
    skipped_downloads = 0  # files that appeared between task-building and download
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_task = {executor.submit(download_single_image, task): task for task in download_tasks}
        
        # Process completed tasks with progress bar
        with tqdm(total=len(download_tasks), desc="Downloading images") as pbar:
            for future in as_completed(future_to_task):
                # download_single_image catches its own exceptions, so
                # future.result() is not expected to raise here
                success, idx, error = future.result()
                
                if success:
                    if error == "Already exists":
                        skipped_downloads += 1
                    else:
                        successful_downloads += 1
                else:
                    failed_downloads += 1
                    # NOTE(review): failures never carry "Already exists", so this
                    # extra comparison looks redundant — confirm before removing
                    if error and error != "Already exists":
                        # Only show error for non-404 errors to reduce noise
                        if "404" not in str(error) and "NameResolutionError" not in str(error):
                            print(f"❌ Failed to download image {idx+1}: {error}")
                
                pbar.update(1)
    
    end_time = time.time()
    duration = end_time - start_time
    
    print(f"\nπŸ“ˆ Download Summary:")
    print(f"   βœ… New downloads: {successful_downloads:,}")
    print(f"   ⏭️  Skipped (already exist): {skipped_downloads:,}")
    print(f"   ❌ Failed: {failed_downloads:,}")
    print(f"   ⏱️  Duration: {duration:.1f} seconds")
    if duration > 0:
        print(f"   πŸš€ Speed: {successful_downloads/duration:.1f} images/second")
    
    # Final check
    is_complete, final_count, final_missing = check_images_downloaded(output_dir, total_images)
    
    if final_count >= total_images * 0.95:  # Consider successful if we have 95% or more
        print(f"πŸŽ‰ Download completed! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    elif final_count > current_count:
        print(f"βœ… Download partially successful! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    else:
        print(f"⚠️  Download had issues. Still have {final_count:,} images ({final_missing:,} missing)")
        return False

if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description="Download images with parallel processing")
    parser.add_argument("--num-images", type=int, default=None, help="Number of images to download (default: all)")
    parser.add_argument("--output-dir", type=str, default="images", help="Output directory (default: images)")
    parser.add_argument("--max-workers", type=int, default=20, help="Maximum parallel workers (default: 20)")
    parser.add_argument("--check-only", action="store_true", help="Only check if images are downloaded")
    
    args = parser.parse_args()
    
    if args.check_only:
        # Just check the status
        is_complete, current_count, missing_count = check_images_downloaded(args.output_dir)
        if is_complete:
            print(f"βœ… All {current_count:,} images are downloaded!")
        else:
            print(f"πŸ“Š Status: {current_count:,} downloaded, {missing_count:,} missing")
    else:
        # Download images
        success = download_images(
            num_images=args.num_images,
            output_dir=args.output_dir,
            max_workers=args.max_workers
        )
        exit(0 if success else 1)