# NOTE: "Spaces: Sleeping" was page-scrape residue above the script, not code.
import json
import os
import subprocess
import sys
import traceback
from multiprocessing import Pool

from tqdm import tqdm
# Script configuration. The 'XXX' placeholders must be replaced before running.
ROOT_FROM = 'XXX'  # the path of laion-ocr-zip (directory holding the *.zip archives)
ROOT_TO = 'XXX'  # the path for saving dataset (archives are extracted here)
MULTIPROCESSING_NUM = 64  # number of worker processes in the extraction pool
DOWNLOAD_IMAGES = False  # whether to download images from urls
def unzip_file(idx):
    """Extract ``{ROOT_FROM}/{idx}.zip`` into ``ROOT_TO`` using the ``unzip`` tool.

    The call is a no-op when the archive does not exist or when
    ``{ROOT_TO}/{idx}`` already exists (presumably left by an earlier
    extraction of the same archive).

    Args:
        idx: Archive name without the ``.zip`` extension.

    Returns:
        None. A failed extraction is reported on stderr rather than raised,
        so a single bad archive does not abort the other pool workers.
    """
    archive = os.path.join(ROOT_FROM, f'{idx}.zip')
    extracted = os.path.join(ROOT_TO, f'{idx}')
    if not os.path.exists(archive) or os.path.exists(extracted):
        return

    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters are handed to unzip verbatim.
    try:
        result = subprocess.run(
            ['unzip', '-q', archive, '-d', ROOT_TO],
            check=False,
        )
    except OSError as err:
        # e.g. the unzip executable is not installed; keep best-effort semantics.
        print(f'unzip_file: could not run unzip for {archive}: {err}',
              file=sys.stderr)
        return

    if result.returncode != 0:
        print(f'unzip_file: unzip exited with code {result.returncode} '
              f'for {archive}', file=sys.stderr)
def multiprocess_unzip_file(idxs):
    """Run ``unzip_file`` over every entry of ``idxs`` in a process pool.

    Creates ``ROOT_TO`` if needed, fans the work out over
    ``MULTIPROCESSING_NUM`` worker processes, and advances a tqdm progress
    bar once per finished item.

    Args:
        idxs: Sized collection of archive names (without ``.zip``) that is
            handed item by item to ``unzip_file``.
    """
    os.makedirs(ROOT_TO, exist_ok=True)
    with Pool(processes=MULTIPROCESSING_NUM) as workers, \
            tqdm(total=len(idxs), desc='total') as progress:
        # Results arrive in completion order; only the count matters here.
        for _ in workers.imap_unordered(unzip_file, idxs):
            progress.update()
    print("multiprocess_unzip_file done!")
if __name__ == '__main__':
    # Build the list of archive indices from the *.zip entries in ROOT_FROM.
    # Non-zip entries are skipped: blindly slicing off the last four
    # characters of an unrelated name could yield a bogus or duplicate index.
    # zfill(5) is kept from the original; archive names are presumably
    # already 5-digit, zero-padded numbers -- TODO confirm.
    idxs = [
        os.path.splitext(name)[0].zfill(5)
        for name in os.listdir(ROOT_FROM)
        if name.endswith('.zip')
    ]
    multiprocess_unzip_file(idxs)
    print("Finished!")