OpenGPT / tokenizer_train.py
VolodymyrPugachov's picture
Upload 17 files
6810eb1 verified
import argparse
import os
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
def main(argv=None):
    """Train a BPE tokenizer on a text corpus and save it as ``tokenizer.json``.

    Command-line options:
        --input          Path to the raw text corpus (required).
        --output         Directory that receives ``tokenizer.json`` (required).
        --vocab_size     Vocabulary size for the tokenizer (default 8000).
        --min_frequency  Minimum frequency for tokens to be included (default 2).

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read the process command line, which is
            the behaviour of a plain ``main()`` call.

    Raises:
        SystemExit: With status 2 when a required option is missing, the
            input file does not exist, or a numeric option is out of range.
    """
    parser = argparse.ArgumentParser(description="Train a BPE tokenizer on a text corpus.")
    parser.add_argument("--input", type=str, required=True, help="Path to input text file (raw corpus).")
    parser.add_argument("--output", type=str, required=True, help="Directory to save the trained tokenizer files.")
    parser.add_argument("--vocab_size", type=int, default=8000, help="Vocabulary size for the tokenizer.")
    parser.add_argument("--min_frequency", type=int, default=2, help="Minimum frequency for tokens to be included.")
    args = parser.parse_args(argv)

    # Validate everything up front so a bad invocation fails with a clear
    # usage error and leaves no stray output directory behind.
    if not os.path.isfile(args.input):
        parser.error(f"input file not found: {args.input}")
    if args.vocab_size < 1:
        parser.error("--vocab_size must be a positive integer")
    if args.min_frequency < 0:
        parser.error("--min_frequency must not be negative")

    # Ensure output directory exists
    os.makedirs(args.output, exist_ok=True)

    # Initialize a Byte-Pair Encoding (BPE) tokenizer
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    # Use whitespace as a basic pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # Trainer for the BPE model; registers the padding token and the unknown
    # token named by the model above as special tokens.
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab_size,
        min_frequency=args.min_frequency,
        special_tokens=["[PAD]", "[UNK]"],
    )

    # Train the tokenizer on the given file
    tokenizer.train([args.input], trainer)

    # Save the tokenizer model to the output directory
    tokenizer_path = os.path.join(args.output, "tokenizer.json")
    tokenizer.save(tokenizer_path)
    print(f"Tokenizer trained and saved to {tokenizer_path}")
# Run the command-line entry point only when this file is executed as a
# script, so importing the module does not start a training run.
if __name__ == "__main__":
    main()