from tokenizers import models, trainers, Tokenizer
from datasets import load_dataset

# Step 1: Download the dataset and save it locally
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")

# Save the dataset locally to a text file
with open("wikipedia_data.txt", "w", encoding="utf-8") as file:
    for example in dataset:
        if "text" in example:  # Ensure the 'text' column exists
            file.write(example["text"] + "\n")

# Step 2: Initialize the tokenizer
tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))

# Special tokens and trainer
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

# Train the tokenizer using the local text file
tokenizer.train(["wikipedia_data.txt"], trainer=trainer)

# Step 3: Test the tokenizer
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print("Token IDs:", encoding.ids)
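
# Optional follow-up: a minimal sketch (not part of the original steps) showing how the
# trained tokenizer can be saved to disk, reloaded, and inspected. The file name
# "wikipedia_wordpiece.json" is an arbitrary choice for illustration.
tokenizer.save("wikipedia_wordpiece.json")
reloaded = Tokenizer.from_file("wikipedia_wordpiece.json")

# The Encoding object also exposes the token strings alongside the IDs
encoding = reloaded.encode("Let's test this tokenizer...", "on a pair of sentences.")
print("Tokens:", encoding.tokens)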