# tokenize_dataset.py
from tokenizer import CustomTokenizer
from datasets import load_from_disk
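
# The CustomTokenizer interface assumed by this script (implemented in
# tokenizer.py; the signature sketched here is inferred from its usage below,
# not taken from the module itself):
#   CustomTokenizer(path_to_vocab, truncate=False, max_length=512)
#   .encode(texts) takes a batch of strings and returns one token-id list per string
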
def tokenize_dataset(path_to_dataset,
                     path_to_save,
                     num_workers=24,
                     truncate=False,
                     max_length=512,
                     min_length=3):
    # Load one tokenizer per language from the pre-trained vocabulary files.
    english_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_en.json", truncate=truncate, max_length=max_length)
    french_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_fr.json", truncate=truncate, max_length=max_length)

    dataset = load_from_disk(path_to_dataset)

    def _tokenize_text(examples):
        # `examples` is a batch of rows: encode the English source and the
        # French target with their respective tokenizers.
        english_text = examples["english_src"]
        french_text = examples["french_tgt"]
        src_ids = english_tokenizer.encode(english_text)
        tgt_ids = french_tokenizer.encode(french_text)
        batch = {
            "src_ids": src_ids,
            "tgt_ids": tgt_ids
        }
        return batch

    # Tokenize in parallel, then drop the raw text columns.
    tokenized_dataset = dataset.map(_tokenize_text, batched=True, num_proc=num_workers)
    tokenized_dataset = tokenized_dataset.remove_columns(["english_src", "french_tgt"])

    # Keep only pairs whose target side has at least min_length tokens.
    filter_func = lambda batch: [len(e) >= min_length for e in batch["tgt_ids"]]
    tokenized_dataset = tokenized_dataset.filter(filter_func, batched=True)

    print(tokenized_dataset)
    tokenized_dataset.save_to_disk(path_to_save)
    print("Tokenized dataset successfully saved to disk")


if __name__ == "__main__":
    path_to_dataset = "data/saved_data"
    path_to_save = "data/tokenized_dataset"
    tokenize_dataset(path_to_dataset=path_to_dataset, path_to_save=path_to_save)

    # Push the tokenized dataset to the Hub (requires prior authentication,
    # e.g. via `huggingface-cli login` or an HF_TOKEN environment variable).
    tokenized_dataset = load_from_disk(dataset_path=path_to_save)
    tokenized_dataset.push_to_hub("ngia/tokenized-translation-en-fr")
    print("Tokenized dataset successfully pushed to the Hugging Face Hub")