from tokenizer import CustomTokenizer
from datasets import load_from_disk


def tokenize_dataset(path_to_dataset,
                     path_to_save,
                     num_workers=24,
                     truncate=False,
                     max_length=512,
                     min_length=3):
    # One tokenizer per language, each loaded from its own trained vocabulary.
    english_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_en.json", truncate=truncate, max_length=max_length)
    french_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_fr.json", truncate=truncate, max_length=max_length)

    dataset = load_from_disk(path_to_dataset)

    def _tokenize_text(examples):
        # `examples` is a batch (a dict of lists) because map() is called with batched=True.
        english_text = examples["english_src"]
        french_text = examples["french_tgt"]
        src_ids = english_tokenizer.encode(english_text)
        tgt_ids = french_tokenizer.encode(french_text)
        batch = {
            "src_ids": src_ids,
            "tgt_ids": tgt_ids
        }
        return batch

    # Tokenize in parallel, then drop the raw text columns so only token ids remain.
    tokenized_dataset = dataset.map(_tokenize_text, batched=True, num_proc=num_workers)
    tokenized_dataset = tokenized_dataset.remove_columns(["english_src", "french_tgt"])

    # Discard pairs whose target side is shorter than min_length tokens.
    filter_func = lambda batch: [len(e) >= min_length for e in batch["tgt_ids"]]
    tokenized_dataset = tokenized_dataset.filter(filter_func, batched=True)

    print(tokenized_dataset)
    tokenized_dataset.save_to_disk(path_to_save)
    print("Tokenized dataset was successfully saved to disk")
if __name__ == "__main__":
    path_to_dataset = "data/saved_data"
    path_to_save = "data/tokenized_dataset"
    tokenize_dataset(path_to_dataset=path_to_dataset, path_to_save=path_to_save)

    # Push the tokenized dataset to the Hugging Face Hub:
    tokenized_dataset = load_from_disk(dataset_path=path_to_save)
    tokenized_dataset.push_to_hub("ngia/tokenized-translation-en-fr")
    print("Tokenized dataset was successfully pushed to the Hugging Face Hub")