import os
import json

from datasets import load_dataset


def main():
    # Download the English Wikipedia dump (2023-11-01 snapshot) from Hugging Face.
    full_dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split='train')

    # Take a reproducible random sample of 50,000 articles.
    dataset = full_dataset.shuffle(seed=42).select(range(50000))

    # Store the raw documents under ./data/raw_documents relative to the current working directory.
    data_folder = os.path.join(os.getcwd(), 'data', 'raw_documents')
    os.makedirs(data_folder, exist_ok=True)

    # Write each article to its own JSON file, named after the article ID.
    for article in dataset:
        article_data = {
            'id': article['id'],
            'url': article['url'],
            'title': article['title'],
            'text': article['text'],
        }
        file_path = os.path.join(data_folder, f"{article['id']}.json")
        # Skip articles that were already written so the script can be re-run safely.
        if not os.path.exists(file_path):
            print(f'{file_path} does not exist, creating file...')
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(article_data, f, indent=4)


if __name__ == '__main__':
    main()