Tzktz's picture
Upload 7664 files
6fc683c verified
import json
import hashlib
import io
import os
import base64
from PIL import Image
from tqdm import tqdm
def calculate_md5(image):
    """Return the hexadecimal MD5 digest of *image* serialised as JPEG.

    The digest is taken over the bytes produced by
    ``image.save(<buffer>, format='JPEG')``, not over any file on disk.

    Args:
        image: Object exposing ``save(fp, format=...)``; presumably a
            ``PIL.Image.Image`` -- confirm against callers.

    Returns:
        The digest as a 32-character lowercase hex string.
    """
    with io.BytesIO() as jpeg_buffer:
        image.save(jpeg_buffer, format='JPEG')
        # Copy the bytes out before the buffer is closed by the ``with``.
        encoded_bytes = jpeg_buffer.getvalue()
    return hashlib.md5(encoded_bytes).hexdigest()
def process_files(directory):
    """Build one TSV row per ``<name>.json`` / ``<name>.jpg`` pair in *directory*.

    For every entry of *directory* whose name ends in ``.json`` the JSON
    document is loaded, the sibling ``.jpg`` is opened and re-encoded as
    JPEG, and a tab-joined row is produced with these columns, in order:

    1. MD5 hex digest of the re-encoded JPEG bytes
    2. ``data['caption']``
    3. base64 text of the same re-encoded JPEG bytes
    4. ``data['width']``
    5. ``data['height']``
    6. ``str()`` of ``{'phrase': data['noun_chunks'],
       'expression_v1': data['ref_exps']}``

    Entries are visited in sorted name order so the output is reproducible
    across runs and file systems.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of row strings, without trailing newlines.

    Raises:
        FileNotFoundError: If a ``.json`` file has no matching ``.jpg``.
        KeyError: If a JSON document lacks one of the keys read above.
    """
    json_suffix = '.json'
    # Tab / LF / CR inside the caption would split the row into extra
    # columns or lines.  Each is replaced one-for-one by a space, so the
    # caption keeps its length.
    caption_cleanup = {ord('\t'): ' ', ord('\n'): ' ', ord('\r'): ' '}

    tsv_data = []
    for file in tqdm(sorted(os.listdir(directory))):
        if not file.endswith(json_suffix):
            continue

        json_path = os.path.join(directory, file)
        # Swap only the trailing suffix; str.replace would also rewrite a
        # '.json' occurring earlier in the file name.
        jpg_path = os.path.join(directory, file[:-len(json_suffix)] + '.jpg')

        with open(json_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Encode once and reuse the bytes for both the digest and the
        # base64 column, so the MD5 always describes exactly the payload
        # stored in the row.  The context manager releases the image file.
        with Image.open(jpg_path) as image, io.BytesIO() as buffer:
            image.save(buffer, format='JPEG')
            image_data = buffer.getvalue()

        md5 = hashlib.md5(image_data).hexdigest()
        image_base64 = base64.b64encode(image_data).decode("utf-8")

        caption = str(data['caption']).translate(caption_cleanup)
        width = data['width']
        height = data['height']
        # Emitted via str(), i.e. as a Python literal rather than JSON.
        # NOTE(review): presumably the reader parses it accordingly -- confirm.
        combined_data_str = {'phrase': data['noun_chunks'], 'expression_v1': data['ref_exps']}

        tsv_row = [md5, caption, image_base64, width, height, combined_data_str]
        tsv_data.append('\t'.join(map(str, tsv_row)))
    return tsv_data
def write_tsv(tsv_data, output_file):
    """Write *tsv_data* to *output_file*, one row per line.

    Rows are joined with ``'\\n'``; no newline is written after the last
    row.  An existing file at *output_file* is overwritten.

    Args:
        tsv_data: Iterable of already-formatted row strings.
        output_file: Destination path.
    """
    # Explicit UTF-8 so non-ASCII text in the rows is written identically on
    # every platform instead of depending on the locale's default encoding.
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write('\n'.join(tsv_data))
if __name__ == '__main__':
    # Script entry point: convert a directory of .json/.jpg pairs to a TSV.
    # Both paths may be given on the command line; with no arguments the
    # original hard-coded locations are used.
    # argparse is imported here because only the entry point needs it.
    import argparse

    parser = argparse.ArgumentParser(
        description='Pack <name>.json / <name>.jpg pairs into a TSV file.'
    )
    parser.add_argument(
        'directory', nargs='?', default='/tmp/grit',
        help='directory holding the .json/.jpg pairs (default: %(default)s)',
    )
    parser.add_argument(
        'output_file', nargs='?', default='/tmp/output.tsv',
        help='path of the TSV file to write (default: %(default)s)',
    )
    args = parser.parse_args()

    directory = args.directory
    output_file = args.output_file
    tsv_data = process_files(directory)
    write_tsv(tsv_data, output_file)