import hashlib
import html
from pathlib import Path
from typing import Dict, List, Tuple

import tiktoken
from transformers import AutoTokenizer


# UZH color palette used for token highlighting: lighter shades first, then stronger variants.
UZH_COLORS = [
    "#BACBFF",
    "#DBF4F9",
    "#ECF6D6",
    "#FFF4DA",
    "#FFDBCC",
    "#FBC6D4",
    "#C2C2C2",
    "#FAFAFA",
    "#7596FF",
    "#B7E9F4",
    "#DBEDAD",
    "#FFE9B5",
    "#FEB799",
    "#F78CAA",
    "#A3A3A3",
    "#EFEFEF",
]


def load_hf_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single Hugging Face tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            use_fast=True,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False,
        )
    except Exception:
        # Fall back to the default (possibly slow) tokenizer if the fast one fails to load.
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False,
        )
    return name, tokenizer
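

# Usage sketch (illustrative, not part of the pipeline): loading a tokenizer
# requires network access to the Hugging Face Hub, and gated repositories such
# as the Llama 4 tokenizer additionally require an authenticated HF token.
#
#     _, swissbert_tok = load_hf_tokenizer("ZurichNLP/swissbert")
#     ids = swissbert_tok.encode("Grüezi", add_special_tokens=False)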


def load_openai_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single OpenAI tokenizer via tiktoken.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    return name, tiktoken.encoding_for_model(name)
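

# Quick sanity-check sketch (assumes a tiktoken version that knows "gpt-4o"):
#
#     _, enc = load_openai_tokenizer("gpt-4o")
#     assert enc.decode(enc.encode("Hello world")) == "Hello world"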


def load_gsw_tokenizer() -> Tuple[str, object]:
    """
    Load a Swiss German (GSW) tokenizer with the SwissBERT GSW vocabulary
    from the Hugging Face Hub (jvamvas/swissbert-gsw-vocab).

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    tokenizer = AutoTokenizer.from_pretrained("jvamvas/swissbert-gsw-vocab")
    return "swissbert-gsw", tokenizer


def load_tokenizers() -> Dict[str, object]:
    """
    Load all tokenizers.

    Returns:
        Dictionary mapping tokenizer names to tokenizer objects
    """
    tokenizers = {}

    # OpenAI tokenizers, loaded via tiktoken.
    openai_names = ["gpt-4o"]
    for name in openai_names:
        tokenizer_name, tokenizer = load_openai_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    # Hugging Face tokenizers, loaded via AutoTokenizer.
    hf_names = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
        "ZurichNLP/swissbert",
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    ]
    for name in hf_names:
        tokenizer_name, tokenizer = load_hf_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    return tokenizers
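

# Usage sketch: load everything once and reuse the dictionary. The keys are the
# model identifiers listed above (e.g. "gpt-4o", "ZurichNLP/swissbert").
#
#     tokenizers = load_tokenizers()
#     print(sorted(tokenizers.keys()))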


MODEL_DISPLAY_NAMES = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
    "deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
    "ZurichNLP/swissbert": "SwissBERT 🇨🇭",
    "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
    "google/gemma-3-27b-it": "Gemma 3",
    "gpt-4o": "ChatGPT (gpt-4o)",
    "CohereLabs/aya-expanse-8b": "Aya Expanse",
}


def tokenize(s: str, tokenizer) -> List[str]:
    """
    Tokenize a string using any tokenizer returned by load_tokenizers().
    For the SwissBERT tokenizer, the string is tokenized with both the standard
    SwissBERT vocabulary and the SwissBERT-GSW vocabulary, and the shorter
    tokenization is returned.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer returned by load_tokenizers()

    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
    """
    if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
        _, gsw_tokenizer = load_gsw_tokenizer()

        swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
        gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)

        # Prefer the tokenization that needs fewer tokens; strip a leading space marker if present.
        shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
            shorter_tokens[0] = shorter_tokens[0][1:]
        return shorter_tokens

    return _tokenize_with_tokenizer(s, tokenizer)
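

# Usage sketch: the exact token strings depend on each model's vocabulary, so no
# expected output is shown here; the sample sentence is an arbitrary choice.
#
#     for name, tok in load_tokenizers().items():
#         print(name, tokenize("Grüezi mitenand!", tok))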


def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
    """
    Internal helper to tokenize a string with a given tokenizer.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer object

    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
    """
    if hasattr(tokenizer, "tokenize"):
        # Hugging Face tokenizer.
        encoded = tokenizer.encode(s, add_special_tokens=False)
        if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
            # For byte-level BPE vocabularies, decoding each id individually yields readable pieces.
            tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
        else:
            tokens = tokenizer.convert_ids_to_tokens(encoded)

        filtered_tokens = []
        for t in tokens:
            if t.startswith("<") or t.startswith("["):
                # Drop special-token-like pieces such as <s> or [CLS].
                continue
            elif "Ġ" in t:
                # GPT-2-style marker for a leading space.
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                # GPT-2-style marker for a newline; rendered as a space here.
                filtered_tokens.append(t.replace("Ċ", " "))
            elif t.startswith("▁"):
                # SentencePiece marker for a leading space.
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)

        # Remove subword continuation markers (WordPiece "##", BPE "@@") from token tails.
        return [t.removesuffix("##").removesuffix("@@") for t in filtered_tokens]

    elif hasattr(tokenizer, "encode"):
        # tiktoken encoding: decode each id back to its string piece.
        token_ids = tokenizer.encode(s)
        return [tokenizer.decode([token_id]) for token_id in token_ids]

    else:
        raise ValueError("Unsupported tokenizer type")


def get_uzh_color(index):
    """Get a color from the UZH color palette based on index."""
    return UZH_COLORS[index % len(UZH_COLORS)]


def visualize_tokens(text: str, tokenizers: Dict[str, object]):
    """
    Tokenize text with each tokenizer and visualize the tokens with colors.
    Colors are consistent across tokenizers for the same tokens and are
    deterministic with respect to token content.

    Args:
        text: The input text to tokenize
        tokenizers: Dictionary of tokenizers

    Returns:
        Dictionary mapping tokenizer names to HTML visualizations
    """
    results = {}

    # Collect every token produced by any tokenizer so that identical tokens
    # receive identical colors across tokenizers.
    all_tokens = set()
    for tokenizer in tokenizers.values():
        tokens = tokenize(text, tokenizer)
        all_tokens.update(tokens)

    # Assign each token a palette color derived from a stable hash of its content
    # (built-in hash() is salted per process and would change colors between runs).
    token_colors = {}
    for token in all_tokens:
        token_hash = int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16)
        index = token_hash % len(UZH_COLORS)
        token_colors[token] = get_uzh_color(index)

    # Render one HTML string per tokenizer, with each token in a colored span.
    for name, tokenizer in tokenizers.items():
        tokens = tokenize(text, tokenizer)

        html_parts = []
        for token in tokens:
            color = token_colors[token]
            html_parts.append(
                f'<span style="background-color: {color}; padding: 2px; margin: 1px; border-radius: 3px;">{html.escape(token)}</span>'
            )

        results[name] = "".join(html_parts)

    return results
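

# Minimal demo sketch: run this module directly to tokenize a sample sentence
# and write the visualizations to an HTML file. The sample text and the output
# path ("token_visualization.html") are illustrative choices, not part of the
# original tooling; loading the tokenizers requires network access to the
# Hugging Face Hub (and credentials for gated repositories such as Llama 4).
if __name__ == "__main__":
    sample_text = "Grüezi mitenand! Wie gaht's?"
    loaded_tokenizers = load_tokenizers()

    # Print a per-model token count for the sample text.
    for name, tok in loaded_tokenizers.items():
        display_name = MODEL_DISPLAY_NAMES.get(name, name)
        print(f"{display_name}: {len(tokenize(sample_text, tok))} tokens")

    # Write the colored token visualizations to a standalone HTML page.
    visualizations = visualize_tokens(sample_text, loaded_tokenizers)
    html_doc = "<html><body>" + "".join(
        f"<h3>{html.escape(MODEL_DISPLAY_NAMES.get(name, name))}</h3><p>{snippet}</p>"
        for name, snippet in visualizations.items()
    ) + "</body></html>"
    Path("token_visualization.html").write_text(html_doc, encoding="utf-8")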