from datasets import load_dataset, concatenate_datasets, load_from_disk
import os


def create_dataset(root_data_path, save_data_path, cache_data_path, test_size=0.01):
    """Build an English-French parallel dataset from pairs of .en/.fr files."""
    list_datasets = []
    for directory in os.listdir(root_data_path):
        path_to_dir = os.path.join(root_data_path, directory)
        if os.path.isdir(path_to_dir):
            print(f"Processing: {path_to_dir}")
            # Locate the English/French file pair inside this corpus directory.
            english_text = None
            french_text = None
            for file_dir in os.listdir(path_to_dir):
                if file_dir.endswith(".en"):
                    english_text = os.path.join(path_to_dir, file_dir)
                if file_dir.endswith(".fr"):
                    french_text = os.path.join(path_to_dir, file_dir)
            if english_text is not None and french_text is not None:
                # Each line is one sentence. The two files are assumed to be
                # line-aligned; otherwise add_column raises a length mismatch.
                english_dataset = load_dataset("text", data_files=english_text, cache_dir=cache_data_path)["train"]
                french_dataset = load_dataset("text", data_files=french_text, cache_dir=cache_data_path)["train"]
                english_dataset = english_dataset.rename_column("text", "english_src")
                dataset = english_dataset.add_column("french_tgt", french_dataset["text"])
                list_datasets.append(dataset)
    # Merge all corpora, carve out a held-out test split, and persist to disk.
    hf_dataset = concatenate_datasets(list_datasets)
    hf_dataset = hf_dataset.train_test_split(test_size=test_size)
    hf_dataset.save_to_disk(save_data_path)
    print(f"Dataset successfully saved in: {save_data_path}")


def push_dataset_into_hf_hub(save_data_path):
    """Shuffle the saved dataset and upload it to the Hugging Face Hub.

    Requires prior authentication (e.g. `huggingface-cli login` or HF_TOKEN).
    """
    dataset = load_from_disk(dataset_path=save_data_path)
    dataset = dataset.shuffle()
    dataset.push_to_hub(repo_id="ngia/translation-en-fr")
    print("Successfully pushed to the Hugging Face Hub")


if __name__ == "__main__":
    root_data_path = "data/raw_data/"
    save_data_path = "data/saved_data/"
    cache_data_path = "data/cached_data/"
    create_dataset(
        root_data_path=root_data_path,
        save_data_path=save_data_path,
        cache_data_path=cache_data_path,
    )
    # Sanity check: inspect one example before pushing to the Hub.
    dataset = load_from_disk(dataset_path=save_data_path)
    print(dataset["train"][10])
    push_dataset_into_hf_hub(save_data_path=save_data_path)
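
# A sketch of the on-disk layout this script expects under data/raw_data/,
# inferred from the pairing logic in create_dataset. The corpus and file
# names below are hypothetical; any subdirectory holding exactly one
# line-aligned .en/.fr pair will work:
#
#   data/raw_data/
#   ├── europarl/
#   │   ├── corpus.en
#   │   └── corpus.fr
#   └── news_commentary/
#       ├── corpus.en
#       └── corpus.fr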