from tokenizers import models, trainers, Tokenizer
from datasets import load_dataset

# Step 1: Download the dataset and save it locally
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")

# Save the dataset locally to a text file
with open("wikipedia_data.txt", "w", encoding="utf-8") as file:
    for example in dataset:
        if "text" in example:  # Ensure the 'text' column exists
            file.write(example["text"] + "\n")

# Step 2: Initialize the tokenizer
tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))

# Special tokens and trainer
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

# Train the tokenizer using the local text file
tokenizer.train(["wikipedia_data.txt"], trainer=trainer)

# Step 3: Test the tokenizer
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print("Token IDs:", encoding.ids)
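
# Optional follow-up: a minimal sketch (not part of the original steps) showing how the
# trained tokenizer can be saved to disk, reloaded, and inspected. The file name
# "wikipedia_wordpiece.json" is an arbitrary choice for illustration.
tokenizer.save("wikipedia_wordpiece.json")
reloaded = Tokenizer.from_file("wikipedia_wordpiece.json")

# The Encoding object also exposes the token strings alongside the IDs
encoding = reloaded.encode("Let's test this tokenizer...", "on a pair of sentences.")
print("Tokens:", encoding.tokens)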