import hashlib
import html
from pathlib import Path
from typing import Dict, List, Tuple

import tiktoken
from transformers import AutoTokenizer


# UZH color palette used for token highlighting: lighter shades first, then stronger variants.
UZH_COLORS = [
    "#BACBFF",
    "#DBF4F9",
    "#ECF6D6",
    "#FFF4DA",
    "#FFDBCC",
    "#FBC6D4",
    "#C2C2C2",
    "#FAFAFA",
    "#7596FF",
    "#B7E9F4",
    "#DBEDAD",
    "#FFE9B5",
    "#FEB799",
    "#F78CAA",
    "#A3A3A3",
    "#EFEFEF",
]


def load_hf_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single Hugging Face tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            use_fast=True,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False,
        )
    except Exception:
        # Fall back to the default (possibly slow) tokenizer if the fast one fails to load.
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False,
        )
    return name, tokenizer
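

# Usage sketch (illustrative, not part of the pipeline): loading a tokenizer
# requires network access to the Hugging Face Hub, and gated repositories such
# as the Llama 4 tokenizer additionally require an authenticated HF token.
#
#     _, swissbert_tok = load_hf_tokenizer("ZurichNLP/swissbert")
#     ids = swissbert_tok.encode("Grüezi", add_special_tokens=False)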


def load_openai_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single OpenAI tokenizer via tiktoken.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    return name, tiktoken.encoding_for_model(name)
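

# Quick sanity-check sketch (assumes a tiktoken version that knows "gpt-4o"):
#
#     _, enc = load_openai_tokenizer("gpt-4o")
#     assert enc.decode(enc.encode("Hello world")) == "Hello world"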


def load_gsw_tokenizer() -> Tuple[str, object]:
    """
    Load a Swiss German (GSW) tokenizer with the SwissBERT GSW vocabulary
    from the Hugging Face Hub (jvamvas/swissbert-gsw-vocab).

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    tokenizer = AutoTokenizer.from_pretrained("jvamvas/swissbert-gsw-vocab")
    return "swissbert-gsw", tokenizer


def load_tokenizers() -> Dict[str, object]:
    """
    Load all tokenizers.

    Returns:
        Dictionary mapping tokenizer names to tokenizer objects
    """
    tokenizers = {}

    # OpenAI tokenizers, loaded via tiktoken.
    openai_names = ["gpt-4o"]
    for name in openai_names:
        tokenizer_name, tokenizer = load_openai_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    # Hugging Face tokenizers, loaded via AutoTokenizer.
    hf_names = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
        "ZurichNLP/swissbert",
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    ]
    for name in hf_names:
        tokenizer_name, tokenizer = load_hf_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    return tokenizers
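

# Usage sketch: load everything once and reuse the dictionary. The keys are the
# model identifiers listed above (e.g. "gpt-4o", "ZurichNLP/swissbert").
#
#     tokenizers = load_tokenizers()
#     print(sorted(tokenizers.keys()))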


MODEL_DISPLAY_NAMES = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
    "deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
    "ZurichNLP/swissbert": "SwissBERT 🇨🇭",
    "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
    "google/gemma-3-27b-it": "Gemma 3",
    "gpt-4o": "ChatGPT (gpt-4o)",
    "CohereLabs/aya-expanse-8b": "Aya Expanse",
}


def tokenize(s: str, tokenizer) -> List[str]:
    """
    Tokenize a string using any tokenizer returned by load_tokenizers().
    For the SwissBERT tokenizer, the string is tokenized with both the standard
    SwissBERT vocabulary and the SwissBERT-GSW vocabulary, and the shorter
    tokenization is returned.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer returned by load_tokenizers()

    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
    """
    if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
        _, gsw_tokenizer = load_gsw_tokenizer()

        swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
        gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)

        # Prefer the tokenization that needs fewer tokens; strip a leading space marker if present.
        shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
            shorter_tokens[0] = shorter_tokens[0][1:]
        return shorter_tokens

    return _tokenize_with_tokenizer(s, tokenizer)
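

# Usage sketch: the exact token strings depend on each model's vocabulary, so no
# expected output is shown here; the sample sentence is an arbitrary choice.
#
#     for name, tok in load_tokenizers().items():
#         print(name, tokenize("Grüezi mitenand!", tok))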


def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
    """
    Internal helper to tokenize a string with a given tokenizer.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer object

    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
    """
    if hasattr(tokenizer, "tokenize"):
        # Hugging Face tokenizer.
        encoded = tokenizer.encode(s, add_special_tokens=False)
        if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
            # For byte-level BPE vocabularies, decoding each id individually yields readable pieces.
            tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
        else:
            tokens = tokenizer.convert_ids_to_tokens(encoded)

        filtered_tokens = []
        for t in tokens:
            if t.startswith("<") or t.startswith("["):
                # Drop special-token-like pieces such as <s> or [CLS].
                continue
            elif "Ġ" in t:
                # GPT-2-style marker for a leading space.
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                # GPT-2-style marker for a newline; rendered as a space here.
                filtered_tokens.append(t.replace("Ċ", " "))
            elif t.startswith("▁"):
                # SentencePiece marker for a leading space.
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)

        # Remove subword continuation markers (WordPiece "##", BPE "@@") from token tails.
        return [t.removesuffix("##").removesuffix("@@") for t in filtered_tokens]

    elif hasattr(tokenizer, "encode"):
        # tiktoken encoding: decode each id back to its string piece.
        token_ids = tokenizer.encode(s)
        return [tokenizer.decode([token_id]) for token_id in token_ids]

    else:
        raise ValueError("Unsupported tokenizer type")


def get_uzh_color(index):
    """Get a color from the UZH color palette based on index."""
    return UZH_COLORS[index % len(UZH_COLORS)]


def visualize_tokens(text: str, tokenizers: Dict[str, object]):
    """
    Tokenize text with each tokenizer and visualize the tokens with colors.
    Colors are consistent across tokenizers for the same tokens and are
    deterministic with respect to token content.

    Args:
        text: The input text to tokenize
        tokenizers: Dictionary of tokenizers

    Returns:
        Dictionary mapping tokenizer names to HTML visualizations
    """
    results = {}

    # Collect every token produced by any tokenizer so that identical tokens
    # receive identical colors across tokenizers.
    all_tokens = set()
    for tokenizer in tokenizers.values():
        tokens = tokenize(text, tokenizer)
        all_tokens.update(tokens)

    # Assign each token a palette color derived from a stable hash of its content
    # (built-in hash() is salted per process and would change colors between runs).
    token_colors = {}
    for token in all_tokens:
        token_hash = int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16)
        index = token_hash % len(UZH_COLORS)
        token_colors[token] = get_uzh_color(index)

    # Render one HTML string per tokenizer, with each token in a colored span.
    for name, tokenizer in tokenizers.items():
        tokens = tokenize(text, tokenizer)

        html_parts = []
        for token in tokens:
            color = token_colors[token]
            html_parts.append(
                f'<span style="background-color: {color}; padding: 2px; margin: 1px; border-radius: 3px;">{html.escape(token)}</span>'
            )

        results[name] = "".join(html_parts)

    return results
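

# Minimal demo sketch: run this module directly to tokenize a sample sentence
# and write the visualizations to an HTML file. The sample text and the output
# path ("token_visualization.html") are illustrative choices, not part of the
# original tooling; loading the tokenizers requires network access to the
# Hugging Face Hub (and credentials for gated repositories such as Llama 4).
if __name__ == "__main__":
    sample_text = "Grüezi mitenand! Wie gaht's?"
    loaded_tokenizers = load_tokenizers()

    # Print a per-model token count for the sample text.
    for name, tok in loaded_tokenizers.items():
        display_name = MODEL_DISPLAY_NAMES.get(name, name)
        print(f"{display_name}: {len(tokenize(sample_text, tok))} tokens")

    # Write the colored token visualizations to a standalone HTML page.
    visualizations = visualize_tokens(sample_text, loaded_tokenizers)
    html_doc = "<html><body>" + "".join(
        f"<h3>{html.escape(MODEL_DISPLAY_NAMES.get(name, name))}</h3><p>{snippet}</p>"
        for name, snippet in visualizations.items()
    ) + "</body></html>"
    Path("token_visualization.html").write_text(html_doc, encoding="utf-8")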