# translation-en-fr / process_raw_data.py
# ngia: deploy on hugging face spaces for inference (d91ea77)
from datasets import load_dataset, concatenate_datasets, load_from_disk
import torch
import os
def _find_parallel_files(path_to_dir):
    """Return the ``(english_path, french_path)`` pair found in *path_to_dir*.

    Files are matched on their ``.en`` / ``.fr`` suffix. Either element is
    ``None`` when no file with that suffix exists directly in the directory.
    """
    english_text = None
    french_text = None
    # Sorted listing: if several files share a suffix, the pick (last one in
    # sorted order) is the same on every run and every filesystem.
    for file_name in sorted(os.listdir(path_to_dir)):
        file_path = os.path.join(path_to_dir, file_name)
        if file_name.endswith(".en"):
            english_text = file_path
        elif file_name.endswith(".fr"):
            french_text = file_path
    return english_text, french_text


def create_dataset(root_data_path, save_data_path, cache_data_path, test_size=0.01):
    """Build an English/French dataset from raw text files and save it to disk.

    Every sub-directory of *root_data_path* that contains both a ``.en`` and a
    ``.fr`` file contributes one dataset with the columns ``english_src`` and
    ``french_tgt``; rows are paired by position. All per-directory datasets are
    concatenated, split into train/test and written to *save_data_path*.

    Args:
        root_data_path: Directory whose sub-directories hold the raw files.
        save_data_path: Destination directory for the split dataset.
        cache_data_path: Cache directory handed to ``load_dataset``.
        test_size: Forwarded to ``train_test_split`` (default ``0.01``).

    Raises:
        ValueError: If an ``.en``/``.fr`` pair has a different number of rows,
            or if no sub-directory provides a complete pair.
    """
    list_datasets = []
    # Sorted so the concatenation order does not depend on the filesystem.
    for directory in sorted(os.listdir(root_data_path)):
        path_to_dir = os.path.join(root_data_path, directory)
        if not os.path.isdir(path_to_dir):
            continue
        print(f"Processing: {path_to_dir}")

        english_text, french_text = _find_parallel_files(path_to_dir)
        if english_text is None or french_text is None:
            print(f"Skipping {path_to_dir}: missing .en or .fr file")
            continue

        english_dataset = load_dataset("text", data_files=english_text, cache_dir=cache_data_path)["train"]
        french_dataset = load_dataset("text", data_files=french_text, cache_dir=cache_data_path)["train"]

        # Rows are paired purely by index, so a length mismatch means the two
        # files are not aligned; fail loudly instead of building bad pairs.
        if len(english_dataset) != len(french_dataset):
            raise ValueError(
                f"Row count mismatch in {path_to_dir}: "
                f"{len(english_dataset)} English vs {len(french_dataset)} French"
            )

        english_dataset = english_dataset.rename_column("text", "english_src")
        dataset = english_dataset.add_column("french_tgt", french_dataset["text"])
        list_datasets.append(dataset)

    # Nothing to concatenate: report the actual cause before touching the
    # output directory.
    if not list_datasets:
        raise ValueError(f"No .en/.fr file pair found under: {root_data_path}")

    hf_dataset = concatenate_datasets(list_datasets)
    hf_dataset = hf_dataset.train_test_split(test_size=test_size)
    hf_dataset.save_to_disk(save_data_path)
    print(f"Dataset successfully saved in: {save_data_path}")
def push_dataset_into_hf_hub(save_data_path, repo_id="ngia/translation-en-fr"):
    """Load the dataset saved at *save_data_path*, shuffle it and push it to the Hub.

    Args:
        save_data_path: Directory previously written with ``save_to_disk``.
        repo_id: Target Hugging Face Hub repository. Defaults to the
            repository this script has always pushed to, so existing callers
            are unaffected.
    """
    dataset = load_from_disk(dataset_path=save_data_path)
    dataset = dataset.shuffle()
    dataset.push_to_hub(repo_id=repo_id)
    print("Successfully pushed on Hugging Face Hub")
if __name__ == "__main__":
    # Locations used by the pipeline: raw input, saved output, loader cache.
    root_data_path, save_data_path, cache_data_path = (
        "data/raw_data/",
        "data/saved_data/",
        "data/cached_data/",
    )

    # Step 1: build the train/test dataset from the raw files.
    create_dataset(
        root_data_path=root_data_path,
        save_data_path=save_data_path,
        cache_data_path=cache_data_path,
    )

    # Step 2: reload what was just written and print one training row.
    dataset = load_from_disk(dataset_path=save_data_path)
    train_split = dataset["train"]
    print(train_split[10])

    # Step 3: publish the saved dataset.
    push_dataset_into_hf_hub(save_data_path=save_data_path)