# dalat5/src/train_tokeniser.py
import json
import sentencepiece as spm
from transformers import T5Tokenizer

# Extract the source/target transliteration pairs into a plain-text corpus for tokeniser training
with open("src/data/tokeniser_corpus.txt", "w", encoding = "utf-8") as f_out:
    with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
        for i, line in enumerate(f_in):
            if i >= 1000000:    # 1,000,000 records are enough for the tokeniser (no need to load the whole corpus)
                break

            item = json.loads(line)
            src = item["transliteration"]["src"]
            tgt = item["transliteration"]["tgt"]

            f_out.write(src + "\n")
            f_out.write(tgt + "\n")
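
# Optional sanity check (illustrative addition, not required for training): confirm that
# the extracted corpus file is non-empty before fitting the tokeniser
with open("src/data/tokeniser_corpus.txt", "r", encoding = "utf-8") as f_check:
    n_lines = sum(1 for _ in f_check)

print(f"Wrote {n_lines} lines to src/data/tokeniser_corpus.txt")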

# Train the SentencePiece model
spm.SentencePieceTrainer.Train(
    input = "src/data/tokeniser_corpus.txt",
    model_prefix = "src/tokeniser/dalat5_sp",
    vocab_size = 40000,
    model_type = "unigram",       # worth testing with "bpe"
    character_coverage = 1.0,     # preserve rare characters such as ä, ñ, etc.
    max_sentence_length = 8384,
    pad_id = 0,
    unk_id = 1,
    bos_id = 2,
    eos_id = 3,
    user_defined_symbols = ["<pad>", "<s>", "</s>"]
)
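
# Optional sanity check (illustrative addition; the sample string is only a placeholder):
# load the raw SentencePiece model, round-trip a sample string and print the reserved IDs
sp = spm.SentencePieceProcessor(model_file = "src/tokeniser/dalat5_sp.model")
sample = "Sälem, älem!"
ids = sp.encode(sample, out_type = int)

print(ids, sp.decode(ids))
print("pad/unk/bos/eos IDs:", sp.pad_id(), sp.unk_id(), sp.bos_id(), sp.eos_id())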

# Convert to a Hugging Face-compatible tokeniser
# (T5Tokenizer wraps the trained SentencePiece model directly via its vocab_file argument)
tokenizer = T5Tokenizer(vocab_file = "src/tokeniser/dalat5_sp.model")
tokenizer.save_pretrained("src/tokeniser/")
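
# Optional usage example (illustrative addition; the sample word is only a placeholder):
# reload the saved tokeniser the way downstream training code would and inspect its output
reloaded = T5Tokenizer.from_pretrained("src/tokeniser/")
sample_ids = reloaded("Qazaqstan")["input_ids"]

print(sample_ids, reloaded.convert_ids_to_tokens(sample_ids))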