# NOTE(review): the lines "Spaces: / Sleeping / Sleeping" that stood here were
# scrape residue from the Hugging Face Spaces status badge, not program text;
# kept only as this comment so the file parses.
from datasets import load_dataset, concatenate_datasets, load_from_disk | |
import torch | |
import os | |
def create_dataset(root_data_path, save_data_path, cache_data_path, test_size=0.01):
    """Build an English->French parallel dataset from paired ``.en``/``.fr`` files.

    Walks every sub-directory of ``root_data_path``; each sub-directory is
    expected to hold one ``*.en`` and one ``*.fr`` file containing
    line-aligned translations.  Each pair becomes a dataset with columns
    ``english_src`` / ``french_tgt``; all pairs are concatenated, split into
    train/test and saved to ``save_data_path``.

    Args:
        root_data_path: Directory whose sub-directories hold the raw corpora.
        save_data_path: Destination passed to ``save_to_disk``.
        cache_data_path: Cache directory forwarded to ``load_dataset``.
        test_size: Fraction of rows held out as the test split.

    Raises:
        ValueError: If no complete ``.en``/``.fr`` pair is found, or if a
            pair's files have different line counts.
    """
    list_datasets = []
    for directory in os.listdir(root_data_path):
        path_to_dir = os.path.join(root_data_path, directory)
        if not os.path.isdir(path_to_dir):
            continue
        print(f"Processing: {path_to_dir}")
        english_text = None
        french_text = None
        for file_dir in os.listdir(path_to_dir):
            if file_dir.endswith(".en"):
                english_text = os.path.join(path_to_dir, file_dir)
            elif file_dir.endswith(".fr"):
                french_text = os.path.join(path_to_dir, file_dir)
        if english_text is None or french_text is None:
            # Skip directories that do not contain a complete .en/.fr pair.
            continue
        english_dataset = load_dataset("text", data_files=english_text, cache_dir=cache_data_path)["train"]
        french_dataset = load_dataset("text", data_files=french_text, cache_dir=cache_data_path)["train"]
        # The two files must be line-aligned; fail here with a clear message
        # rather than letting add_column raise a less specific error.
        if len(english_dataset) != len(french_dataset):
            raise ValueError(
                f"Line-count mismatch in {path_to_dir}: "
                f"{len(english_dataset)} English vs {len(french_dataset)} French rows"
            )
        english_dataset = english_dataset.rename_column("text", "english_src")
        dataset = english_dataset.add_column("french_tgt", french_dataset["text"])
        list_datasets.append(dataset)
    if not list_datasets:
        # concatenate_datasets([]) would otherwise fail with an opaque error.
        raise ValueError(f"No .en/.fr file pairs found under {root_data_path}")
    hf_dataset = concatenate_datasets(list_datasets)
    hf_dataset = hf_dataset.train_test_split(test_size=test_size)
    hf_dataset.save_to_disk(save_data_path)
    print(f"Dataset successfully saved in: {save_data_path}")
def push_dataset_into_hf_hub(save_data_path, repo_id="ngia/translation-en-fr"):
    """Load a saved dataset, shuffle it, and push it to the Hugging Face Hub.

    Args:
        save_data_path: Directory previously written by ``save_to_disk``.
        repo_id: Target Hub repository (``user/name``).  Defaults to the
            previously hard-coded repository for backward compatibility.
    """
    dataset = load_from_disk(dataset_path=save_data_path)
    # Shuffle so pushed rows are not grouped by their source corpus.
    dataset = dataset.shuffle()
    dataset.push_to_hub(repo_id=repo_id)
    print("Successfully pushed on Hugging Face Hub")
if __name__ == "__main__":
    # Locations of the raw corpora, the processed output, and the HF cache.
    root_data_path = "data/raw_data/"
    save_data_path = "data/saved_data/"
    cache_data_path = "data/cached_data/"

    create_dataset(
        root_data_path=root_data_path,
        save_data_path=save_data_path,
        cache_data_path=cache_data_path,
    )

    # Sanity check: reload the saved dataset and show one training example.
    dataset = load_from_disk(dataset_path=save_data_path)
    print(dataset["train"][10])

    push_dataset_into_hf_hub(save_data_path=save_data_path)