	Merge pull request #4 from khalidsaifullaah/main
data/CC3M_downloader.py    ADDED    +156 -0
# This script expects the train and validation `.tsv` files to be in the current directory.
# Head over to this link to download the `.tsv` files:
# https://ai.google.com/research/ConceptualCaptions/download
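# Each row of those .tsv files is tab-separated as caption<TAB>url, with no header row,
# e.g. (illustrative values only):
#   a man rides his bike on the trail	http://example.com/img_0001.jpg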

'''
This script was adapted from https://github.com/igorbrigadir/DownloadConceptualCaptions
A few changes were made on top of it (excluding the post-processing of the data). We provide
only a csv file with the image urls and captions written in different languages, but not the
images themselves, as we do not own any of the images in the dataset and hence cannot
legally provide them to you.
'''
import pandas as pd
import numpy as np
import requests
import zlib
import os
import shelve
import magic  # python-magic, used to sniff the mimetype of downloaded files
from multiprocessing import Pool
from tqdm import tqdm

headers = {
    'User-Agent': 'Googlebot-Image/1.0',  # Pretend to be googlebot
    'X-Forwarded-For': '64.18.15.200'
}

def _df_split_apply(tup_arg):
    split_ind, subset, func = tup_arg
    r = subset.apply(func, axis=1)
    return (split_ind, r)

def df_multiprocess(df, processes, chunk_size, func, dataset_name):
    print("Generating parts...")
    with shelve.open('%s_%s_%s_results.tmp' % (dataset_name, func.__name__, chunk_size)) as results:

        pbar = tqdm(total=len(df), position=0)
        # Resume:
        finished_chunks = set([int(k) for k in results.keys()])
        pbar.desc = "Resuming"
        for k in results.keys():
            pbar.update(len(results[str(k)][1]))

        pool_data = ((index, df[i:i + chunk_size], func) for index, i in enumerate(range(0, len(df), chunk_size)) if index not in finished_chunks)
        print(int(len(df) / chunk_size), "parts.", chunk_size, "per part.", "Using", processes, "processes")

        pbar.desc = "Downloading"
        with Pool(processes) as pool:
            for result in pool.imap_unordered(_df_split_apply, pool_data, 2):
                results[str(result[0])] = result
                pbar.update(len(result[1]))
        pbar.close()

    print("Finished Downloading.")
    return
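# Note on resuming: each finished chunk is stored in the shelve file keyed by its
# chunk index, so rerunning with the same dataset name, function, and chunk_size
# reopens that file and skips any chunk whose key is already present.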

# Unique name based on url
def _file_name(row):
    return "%s/%s_%s" % (row['folder'], row.name, (zlib.crc32(row['url'].encode('utf-8')) & 0xffffffff))
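# e.g. row 17 of the validation set would map to something like
# "validation/17_2838929381" (crc32 value hypothetical).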

# For checking mimetypes separately without download
def check_mimetype(row):
    if os.path.isfile(str(row['file'])):
        row['mimetype'] = magic.from_file(row['file'], mime=True)
        row['size'] = os.stat(row['file']).st_size
    return row

# Don't download the image, just check with a HEAD request; can't resume.
# Can be used instead of download_image to collect HTTP status codes.
def check_download(row):
    fname = _file_name(row)
    try:
        # not all sites support HEAD
        response = requests.head(row['url'], stream=False, timeout=5, allow_redirects=True, headers=headers)
        row['status'] = response.status_code
        row['headers'] = dict(response.headers)
    except Exception:
        # log errors later; report failures as 408 timeout
        row['status'] = 408
        return row
    if response.ok:
        row['file'] = fname
    return row

def download_image(row):
    fname = _file_name(row)
    # Skip files that are already downloaded; retry others later
    if os.path.isfile(fname):
        row['status'] = 200
        row['file'] = fname
        row['mimetype'] = magic.from_file(row['file'], mime=True)
        row['size'] = os.stat(row['file']).st_size
        return row

    try:
        # a smaller timeout skips slow urls faster, but can cause downloads to fail
        response = requests.get(row['url'], stream=False, timeout=10, allow_redirects=True, headers=headers)
        row['status'] = response.status_code
        #row['headers'] = dict(response.headers)
    except Exception:
        # log errors later; report failures as 408 timeout
        row['status'] = 408
        return row

    if response.ok:
        try:
            with open(fname, 'wb') as out_file:
                # requests transparently decodes gzip/deflate transfer encodings,
                # so response.content is the raw image bytes
                out_file.write(response.content)
            row['mimetype'] = magic.from_file(fname, mime=True)
            row['size'] = os.stat(fname).st_size
        except Exception:
            # covers timeouts or decode errors during the download
            row['status'] = 408
            return row
        row['file'] = fname
    return row
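# Quick single-row smoke test (a sketch, not part of the original flow; the url and
# folder values below are placeholders):
#   os.makedirs('validation', exist_ok=True)
#   sample = pd.Series({'url': 'http://example.com/a.jpg', 'folder': 'validation'}, name=0)
#   print(download_image(sample.copy())['status'])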

def open_tsv(fname, folder):
    print("Opening %s Data File..." % fname)
    # Only the url column (column 1 of the tsv) is needed for downloading
    df = pd.read_csv(fname, sep='\t', names=["caption", "url"], usecols=range(1, 2))
    df['folder'] = folder
    print("Processing", len(df), "Images:")
    return df

def df_from_shelve(chunk_size, func, dataset_name):
    print("Generating Dataframe from results...")
    with shelve.open('%s_%s_%s_results.tmp' % (dataset_name, func.__name__, chunk_size)) as results:
        keylist = sorted([int(k) for k in results.keys()])
        df = pd.concat([results[str(k)][1] for k in keylist], sort=True)
    return df

# number of processes in the pool can be larger than the number of cores
num_processes = 256
# how many images go into each part - changing this resets progress when restarting,
# because the shelve file name depends on it
images_per_part = 200

# NOTE: on platforms that spawn worker processes (e.g. Windows or macOS), the code below
# should be wrapped in an `if __name__ == '__main__':` guard for multiprocessing to work.
'''
A number of urls will fail to download, or will return web pages instead of images; these
need to be cleaned up later. See downloaded_validation_report.tsv.gz after the run for
HTTP errors. Based on validation set results, around 10-11% of the images are gone.
Setting a different user agent might avoid some errors as well, though it is unclear
whether any sites reject requests based on it. (A rough success count is sketched at
the bottom of this file.)
'''
data_name = "validation"
os.makedirs(data_name, exist_ok=True)  # make sure the output folder exists
df = open_tsv("Validation_GCC-1.1.0-Validation.tsv", data_name)
df_multiprocess(df=df, processes=num_processes, chunk_size=images_per_part, func=download_image, dataset_name=data_name)
df = df_from_shelve(chunk_size=images_per_part, func=download_image, dataset_name=data_name)
df.to_csv("downloaded_%s_report.tsv.gz" % data_name, compression='gzip', sep='\t', header=False, index=False)
print("Saved.")

data_name = "training"
os.makedirs(data_name, exist_ok=True)
df = open_tsv("Train-GCC-training.tsv", data_name)
df_multiprocess(df=df, processes=num_processes, chunk_size=images_per_part, func=download_image, dataset_name=data_name)
df = df_from_shelve(chunk_size=images_per_part, func=download_image, dataset_name=data_name)
df.to_csv("downloaded_%s_report.tsv.gz" % data_name, compression='gzip', sep='\t', header=False, index=False)
print("Saved.")
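
# Optional sanity check (a sketch, not part of the original pipeline): download_image
# records a mimetype for every file it managed to save, so counting rows whose mimetype
# looks like an image gives a rough success rate for the run above.
if 'mimetype' in df.columns:
    valid = df['mimetype'].astype(str).str.startswith('image/')
    print("%d of %d rows downloaded as actual images." % (valid.sum(), len(df)))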

