t5-vae-wiki / convert_files.py
Fraser's picture
add dataset scripts
2095da4
raw
history blame contribute delete
490 Bytes
import json
import os

from tqdm import tqdm
from transformers import AutoTokenizer
# Convert pre-tokenized Wikipedia shards (GPT-2 token ids stored in JSON)
# back into plain-text files, one line of text per row, one .txt per shard.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Ensure the output directory exists so the write below doesn't fail on a
# fresh checkout.
os.makedirs('wikipedia', exist_ok=True)

# 298 input shards — presumably the fixed shard count of the upstream
# segmentation pipeline; TODO confirm against the dataset.
for i in tqdm(range(298)):
    # Shards are JSON lists of row dicts; read as UTF-8 explicitly rather
    # than relying on the platform locale encoding.
    with open(f'wikipedia_json_64_filtered/wikipedia.segmented.nltk.split.seq64.{i}.json',
              'r', encoding='utf-8') as f:
        rows = json.load(f)
    # Each row stores its token-id sequence under the 'gpt2_token' key.
    tokens = [row['gpt2_token'] for row in rows]
    # Decode all sequences in one call; returns a list of strings.
    texts = tokenizer.batch_decode(tokens)
    # Write one stripped line per decoded row, UTF-8, batched in one call.
    with open(f'wikipedia/{i}.txt', 'w', encoding='utf-8') as f:
        f.writelines(txt.strip() + '\n' for txt in texts)