Spaces:
Running
Running
File size: 873 Bytes
5446629 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
import os
import json
from datasets import load_dataset
# Load the "20231101.en" configuration of the wikimedia/wikipedia dataset and
# keep a reproducible (seed=42) random sample of 50,000 articles.
full_dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split='train')
dataset = full_dataset.shuffle(seed=42).select(range(50000))

# NOTE: despite its name, script_dir is the current working directory, not the
# directory containing this file; output goes to <cwd>/data/raw_documents.
script_dir = os.getcwd()
data_folder = os.path.join(script_dir, 'data', 'raw_documents')
# exist_ok=True avoids the race between a separate existence check and creation.
os.makedirs(data_folder, exist_ok=True)


def _write_json_atomically(payload, destination):
    """Write *payload* as indented JSON to *destination* without partial files.

    The data is first written to ``<destination>.tmp`` and then moved into
    place with ``os.replace``. If the write is interrupted, *destination* is
    never created, so a later run will not mistake a truncated file for a
    finished one.

    Args:
        payload: JSON-serializable object to store.
        destination: Final path of the JSON file.

    Raises:
        OSError: If the temporary file cannot be written or moved.
        TypeError: If *payload* is not JSON-serializable.
    """
    tmp_path = f"{destination}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as tmp_file:
            json.dump(payload, tmp_file, indent=4)
        os.replace(tmp_path, destination)
    finally:
        # After a successful replace the temp file no longer exists; on any
        # failure (including Ctrl-C) remove the leftover fragment.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# Write one JSON file per sampled article, named after the article id.
# Articles whose file already exists are skipped, so the script can be re-run
# to resume an interrupted export.
for article in dataset:
    file_path = os.path.join(data_folder, f"{article['id']}.json")
    if os.path.exists(file_path):
        continue

    article_data = {
        'id': article['id'],
        'url': article['url'],
        'title': article['title'],
        'text': article['text'],
    }
    print(file_path, 'does not exist. creating file..')
    _write_json_atomically(article_data, file_path)

# The export above runs at import time; this guard is intentionally a no-op.
if __name__ == '__main__':
    pass