Spaces:
Runtime error
Runtime error
ml-en-stt-model
/
IndicTrans2
/huggingface_interface
/IndicTransToolkit
/tokenizer_training
/tokenizer.py
# Train a BPE tokenizer on a plain-text corpus and save it as a JSON file.
from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# Both paths are relative, so they are resolved against the directory the
# script is launched from.
CORPUS_FILES = ["tokenizer_corpus.txt"]
OUTPUT_PATH = "IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/tokenizer.json"

# Create the destination directory up front, so that a missing directory
# cannot make the final save fail after the training work is already done.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

# BPE model; "<unk>" is the token used for anything outside the vocabulary.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))

# The pre-tokenizer must be attached before training, because training
# consumes the pre-tokenized pieces of the corpus.
tokenizer.pre_tokenizer = Whitespace()

# Trainer configuration: only the special tokens are set explicitly; every
# other option keeps the library default.
trainer = BpeTrainer(
    special_tokens=["<unk>", "<s>", "</s>", "<pad>"],
)

# Learn the vocabulary and merges from the corpus files.
tokenizer.train(files=CORPUS_FILES, trainer=trainer)

# Serialize the trained tokenizer to a single JSON file.
tokenizer.save(OUTPUT_PATH)