import json
from pathlib import Path

from tqdm import tqdm
from transformers import AutoTokenizer
# Decode pre-tokenized GPT-2 id sequences from filtered Wikipedia JSON shards
# back into plain text, writing one .txt file per input shard
# (wikipedia/<i>.txt, one stripped text per line).
tokenizer = AutoTokenizer.from_pretrained('gpt2')

NUM_SHARDS = 298  # number of input shard files produced upstream
in_dir = Path('wikipedia_json_64_filtered')
out_dir = Path('wikipedia')
# Create the output directory up front; otherwise the first open(..., 'w')
# raises FileNotFoundError on a fresh checkout.
out_dir.mkdir(parents=True, exist_ok=True)

for i in tqdm(range(NUM_SHARDS)):
    shard = in_dir / f'wikipedia.segmented.nltk.split.seq64.{i}.json'
    # Explicit UTF-8: Wikipedia text is not ASCII-safe and the platform
    # default locale encoding must not leak into reads or writes.
    with open(shard, 'r', encoding='utf-8') as f:
        rows = json.load(f)
    # Each row carries its token-id sequence under 'gpt2_token' — TODO confirm
    # against the shard-producing script if the schema ever changes.
    tokens = [row['gpt2_token'] for row in rows]
    texts = tokenizer.batch_decode(tokens)
    with open(out_dir / f'{i}.txt', 'w', encoding='utf-8') as f:
        # One batched write per shard instead of one syscall-ish write per line.
        f.writelines(txt.strip() + '\n' for txt in texts)