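"""Train a byte-pair-encoding (BPE) tokenizer on a raw text corpus using the
Hugging Face `tokenizers` library.

Example invocation (the script and path names here are illustrative):

    python train_bpe_tokenizer.py --input corpus.txt --output out --vocab_size 8000

The trained tokenizer is saved as <output>/tokenizer.json.
"""
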
import argparse
import os

from tokenizers import Tokenizer, models, trainers, pre_tokenizers


def main():
    parser = argparse.ArgumentParser(description="Train a BPE tokenizer on a text corpus.")
    parser.add_argument("--input", type=str, required=True, help="Path to input text file (raw corpus).")
    parser.add_argument("--output", type=str, required=True, help="Directory to save the trained tokenizer files.")
    parser.add_argument("--vocab_size", type=int, default=8000, help="Vocabulary size for the tokenizer.")
    parser.add_argument("--min_frequency", type=int, default=2, help="Minimum frequency for tokens to be included.")
    args = parser.parse_args()

    # Create the output directory if it does not already exist.
    os.makedirs(args.output, exist_ok=True)

    # BPE model with an explicit unknown token for input that cannot be tokenized.
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

    # Split text into word and punctuation pieces before learning BPE merges.
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # Special tokens are reserved up front so they always receive vocabulary IDs.
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab_size,
        min_frequency=args.min_frequency,
        special_tokens=["[PAD]", "[UNK]"],
    )

    # Train directly from the corpus file; the API accepts a list of file paths.
    tokenizer.train([args.input], trainer)
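    # For corpora that are not plain files, the library can also train from any
    # iterator of strings (sketch; `lines_of_text` is an illustrative name):
    #
    #     tokenizer.train_from_iterator(lines_of_text, trainer)
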
    # Save the model, merges, vocabulary, and pre-tokenizer as one JSON file.
    tokenizer_path = os.path.join(args.output, "tokenizer.json")
    tokenizer.save(tokenizer_path)
    print(f"Tokenizer trained and saved to {tokenizer_path}")
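    # If separate vocab.json / merges.txt files are needed for other tooling,
    # the underlying BPE model can also be exported (sketch):
    #
    #     tokenizer.model.save(args.output)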


if __name__ == "__main__":
    main()
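
# A quick sanity check of the result (sketch; the path assumes --output out):
#
#     from tokenizers import Tokenizer
#     tok = Tokenizer.from_file("out/tokenizer.json")
#     enc = tok.encode("Hello, world!")
#     print(enc.tokens, enc.ids)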