import json

import sentencepiece as spm
from transformers import T5Tokenizer

with open("src/data/tokeniser_corpus.txt", "w", encoding = "utf-8") as f_out: |
|
with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in: |
|
for i, line in enumerate(f_in): |
|
if i >= 1000000: |
|
break |
|
|
|
item = json.loads(line) |
|
src = item["transliteration"]["src"] |
|
tgt = item["transliteration"]["tgt"] |
|
|
|
f_out.write(src + "\n") |
|
f_out.write(tgt + "\n") |
|
|
|
|
|
# Train a unigram SentencePiece model on the combined source/target text.
# <pad>, <unk>, <s> and </s> are reserved as control pieces via the *_id
# arguments below; listing them again under user_defined_symbols makes the
# trainer abort with an "already defined" error, so that argument is omitted.
spm.SentencePieceTrainer.Train(
    input="src/data/tokeniser_corpus.txt",
    model_prefix="src/tokeniser/dalat5_sp",
    vocab_size=40000,
    model_type="unigram",
    character_coverage=1.0,
    max_sentence_length=8384,
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)

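# Optional sanity check (a sketch added beyond the original script): reload
# the trained model with the raw SentencePiece API and confirm that the
# reserved special-token ids match the trainer arguments above.
sp_check = spm.SentencePieceProcessor(model_file="src/tokeniser/dalat5_sp.model")
assert (sp_check.pad_id(), sp_check.unk_id(), sp_check.bos_id(), sp_check.eos_id()) == (0, 1, 2, 3)
print(f"Trained vocab size: {sp_check.get_piece_size()}")
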
# Wrap the trained SentencePiece model in a T5Tokenizer; the slow tokenizer
# accepts the .model file directly as vocab_file, which is more robust than
# pointing from_pretrained() at a single file.
tokenizer = T5Tokenizer(vocab_file="src/tokeniser/dalat5_sp.model")

# Save in Hugging Face format (spiece.model, tokenizer config, special
# tokens map) so the tokeniser can be reloaded with from_pretrained().
tokenizer.save_pretrained("src/tokeniser/")
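
# Quick round-trip check (illustrative only; the sample sentence is an
# arbitrary placeholder, not drawn from the corpus): reload the saved
# tokeniser and make sure encode/decode reproduces the input.
reloaded = T5Tokenizer.from_pretrained("src/tokeniser/")
sample = "a short sentence to verify the tokeniser round trip"
ids = reloaded.encode(sample)
print(ids)
print(reloaded.decode(ids, skip_special_tokens=True))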