Spaces:

ngia
/

translation-en-fr

Sleeping

App Files Files Community

translation-en-fr / process_raw_data.py

ngia

deploy on hugging face spaces for inference

d91ea77 5 months ago

raw

history blame contribute delete

2.36 kB

	from datasets import load_dataset, concatenate_datasets, load_from_disk
	import torch

	import os


	def create_dataset(root_data_path, save_data_path, cache_data_path, test_size=0.01):
	    """Build an English/French dataset from raw text files and save it to disk.

	    Every sub-directory of ``root_data_path`` is scanned for a file ending in
	    ``.en`` and a file ending in ``.fr``. Both files are loaded with the
	    ``text`` loader and merged into one dataset with the columns
	    ``english_src`` and ``french_tgt``. The per-directory datasets are then
	    concatenated, split into train/test and written to ``save_data_path``.

	    Args:
	        root_data_path: Directory whose sub-directories hold the raw files.
	        save_data_path: Destination passed to ``save_to_disk``.
	        cache_data_path: Forwarded to ``load_dataset`` as ``cache_dir``.
	        test_size: Forwarded to ``train_test_split``.

	    Raises:
	        ValueError: If no sub-directory contains both a ``.en`` and a ``.fr``
	            file, or if the two files of a pair yield a different number
	            of rows.
	    """
	    list_datasets = []

	    for directory in os.listdir(root_data_path):
	        path_to_dir = os.path.join(root_data_path, directory)

	        # Only sub-directories are expected to hold a language pair.
	        if not os.path.isdir(path_to_dir):
	            continue

	        print(f"Processing: {path_to_dir}")

	        english_text = None
	        french_text = None

	        # If several files share a suffix, the last one listed wins.
	        for file_name in os.listdir(path_to_dir):
	            if file_name.endswith(".en"):
	                english_text = os.path.join(path_to_dir, file_name)
	            elif file_name.endswith(".fr"):
	                french_text = os.path.join(path_to_dir, file_name)

	        # A directory missing either half of the pair is skipped.
	        if english_text is None or french_text is None:
	            continue

	        english_dataset = load_dataset("text", data_files=english_text, cache_dir=cache_data_path)["train"]
	        french_dataset = load_dataset("text", data_files=french_text, cache_dir=cache_data_path)["train"]

	        # Rows are paired by position, so both sides must have the same
	        # length; report the offending files instead of failing later on.
	        if len(english_dataset) != len(french_dataset):
	            raise ValueError(
	                f"Row count mismatch between {english_text} ({len(english_dataset)}) "
	                f"and {french_text} ({len(french_dataset)})"
	            )

	        english_dataset = english_dataset.rename_column("text", "english_src")
	        dataset = english_dataset.add_column("french_tgt", french_dataset["text"])

	        list_datasets.append(dataset)

	    # Fail with an actionable message rather than concatenating nothing.
	    if not list_datasets:
	        raise ValueError(
	            f"No sub-directory of {root_data_path!r} contains both a .en and a .fr file"
	        )

	    hf_dataset = concatenate_datasets(list_datasets)
	    hf_dataset = hf_dataset.train_test_split(test_size=test_size)

	    hf_dataset.save_to_disk(save_data_path)
	    print(f"Dataset successfully saved in: {save_data_path}")


	def push_dataset_into_hf_hub(save_data_path, repo_id="ngia/translation-en-fr"):
	    """Load the saved dataset, shuffle it and push it to the Hugging Face Hub.

	    Args:
	        save_data_path: Path given to ``load_from_disk``.
	        repo_id: Hub repository to push to. Defaults to the repository this
	            script was originally hard-wired to.
	    """
	    dataset = load_from_disk(dataset_path=save_data_path)
	    dataset = dataset.shuffle()
	    dataset.push_to_hub(repo_id=repo_id)
	    print("Successfully pushed on Hugging Face Hub")


	if __name__ == "__main__":
	    # Script entry point: build the dataset, show one sample row, then publish.
	    root_data_path = "data/raw_data/"
	    save_data_path = "data/saved_data/"
	    cache_data_path = "data/cached_data/"

	    create_dataset(root_data_path=root_data_path, save_data_path=save_data_path, cache_data_path=cache_data_path)
	    dataset = load_from_disk(dataset_path=save_data_path)

	    # Sanity check only: show row 10, or the last row of a smaller split, so
	    # a tiny corpus does not abort the run before the push.
	    train_split = dataset["train"]
	    if len(train_split) > 0:
	        print(train_split[min(10, len(train_split) - 1)])

	    push_dataset_into_hf_hub(save_data_path=save_data_path)