Tzktz's picture
Upload 7664 files
6fc683c verified
raw
history blame contribute delete
983 Bytes
import json
import os
import traceback
from tqdm import tqdm
from multiprocessing import Pool
# Directory holding the source archives; each one is looked up as
# `<ROOT_FROM>/<idx>.zip`. 'XXX' looks like a placeholder to be edited before running.
ROOT_FROM = 'XXX' # the path of laion-ocr-zip
# Directory the archives are extracted into; it is created on demand before unzipping.
ROOT_TO = 'XXX' # the path for saving dataset
# Number of worker processes used by the extraction pool.
MULTIPROCESSING_NUM = 64
# NOTE(review): not referenced anywhere in the visible code — presumably consumed
# by a later stage; confirm before relying on it.
DOWNLOAD_IMAGES = False # whether to download images from urls
def unzip_file(idx):
    """Extract the archive ``{ROOT_FROM}/{idx}.zip`` into ``ROOT_TO``.

    The call is a no-op when the archive does not exist, or when
    ``{ROOT_TO}/{idx}`` already exists (treated as "already extracted").

    Args:
        idx: Archive name without the ``.zip`` extension.

    Returns:
        None. A failed extraction is reported on stderr instead of being
        raised, so a single bad archive does not abort the worker pool.
    """
    # Imported locally so this block does not depend on the file-level imports.
    import subprocess
    import sys

    archive = f'{ROOT_FROM}/{idx}.zip'
    if not os.path.exists(archive) or os.path.exists(f'{ROOT_TO}/{idx}'):
        return

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and prevents them from being interpreted.
    try:
        result = subprocess.run(['unzip', '-q', archive, '-d', ROOT_TO], check=False)
    except OSError as err:
        # Typically raised when the `unzip` executable cannot be started.
        print(f'unzip could not be started for {archive}: {err}', file=sys.stderr)
        return

    if result.returncode != 0:
        print(f'unzip exited with status {result.returncode} for {archive}',
              file=sys.stderr)
def multiprocess_unzip_file(idxs):
    """Run ``unzip_file`` over every entry of ``idxs`` in a process pool.

    ``ROOT_TO`` is created first if it is missing. A progress bar advances
    once per finished archive; results arrive in completion order.

    Args:
        idxs: Sized collection of archive names (without ``.zip``).
    """
    os.makedirs(ROOT_TO, exist_ok=True)

    with Pool(processes=MULTIPROCESSING_NUM) as workers, \
            tqdm(total=len(idxs), desc='total') as progress:
        # The worker's return value is irrelevant; only completion is counted.
        for _ in workers.imap_unordered(unzip_file, idxs):
            progress.update()

    print("multiprocess_unzip_file done!")
if __name__ == '__main__':
    # Queue only real archives: stripping four characters from an arbitrary
    # file name (e.g. `00001.txt` beside `00001.zip`) would produce a duplicate
    # index and let two workers extract the same archive at the same time.
    archive_names = [name for name in os.listdir(ROOT_FROM) if name.endswith('.zip')]

    # Drop the `.zip` suffix and left-pad the stem with zeros to five characters;
    # the set removes duplicates, sorting gives a deterministic order.
    # NOTE(review): presumably archives are already named with 5-digit indices,
    # since the worker looks up `<idx>.zip` using the padded name — confirm.
    idxs = sorted({name[:-4].zfill(5) for name in archive_names})

    multiprocess_unzip_file(idxs)
    print("Finished!")