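"""Arabic Tokenizers Leaderboard.

A Gradio app that benchmarks Hugging Face tokenizers on Arabic text: for each
tokenizer it measures the total number of tokens produced, the fertility score
(tokens per word), and whether tashkeel (diacritics) survive an encode/decode
round trip, then shows the results as a leaderboard and a tokenizer playground.
"""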
from transformers import AutoTokenizer
from tqdm import tqdm
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from pathlib import Path
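# Tokenizers benchmarked by default; models submitted through the UI are
# appended to the saved leaderboard file as well.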
initial_list_of_models = [
    "asafaya/bert-base-arabic",
    "Xenova/gpt-4o",
    "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
    "FreedomIntelligence/AceGPT-13B",
    "Qwen/Qwen1.5-7B-Chat",
    "Qwen/Qwen1.5-110B-Chat",
    "microsoft/Phi-3-mini-128k-instruct",
    "unsloth/gemma-2b-bnb-4bit",
    "NousResearch/Meta-Llama-3-8B",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "core42/jais-13b",
    "core42/jais-30b-chat-v3",
]
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
if dataframe_path.exists():
    df = pd.read_json(dataframe_path, lines=True)
else:
    df = pd.DataFrame(
        columns=[
            "👳 Tokenize Tashkeel",
            "📛 Models",
            "🪺 Fertility Score",
            "➕ Total Number of Tokens",
            "📘 Vocab Size",
            "Tokenizer Class",
        ]
    )
# Datasets used for calculating the number of tokens
arabic_dataset1 = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
print(f"Total number of samples: {len(all_data)}")
all_text = " ".join(all_data)
all_words = all_text.split()
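# The fertility score reported below is `total tokens / total words`, where
# words are the whitespace-split tokens of the concatenated corpus above.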
def benchmark_tokenizer(model_name: str) -> dict:
    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    total_number_of_tokens = len(tokenizer.tokenize(all_text))
    # Check whether the tokenizer preserves the tashkeel (diacritics) after an
    # encode/decode round trip
    dummy_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"
    decoded_text = tokenizer.decode(tokenizer.encode(dummy_text), skip_special_tokens=True)
    tashkeel_maintainer = "✅" if decoded_text == dummy_text else "❌"
    return {
        "👳 Tokenize Tashkeel": tashkeel_maintainer,
        "📛 Models": model_name,
        "🪺 Fertility Score": round(total_number_of_tokens / len(all_words), 3),
        "📘 Vocab Size": vocab_size,
        "➕ Total Number of Tokens": total_number_of_tokens,
        "Tokenizer Class": tokenizer.__class__.__name__,
    }
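# Benchmark every model in the initial list that is not already present in the
# leaderboard, then persist the updated dataframe.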
for model_name in tqdm(initial_list_of_models):
    if model_name in df["📛 Models"].values:
        continue
    benchmark_data = benchmark_tokenizer(model_name)
    df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
# Sort the dataframe by the number of tokens
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
# Save the dataframe to a JSONL file
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
def submit(model_name):
    global df
    if model_name in df["📛 Models"].values:
        return (
            gr.Dataframe(df),
            gr.BarPlot(df),
            gr.Dropdown(choices=df["📛 Models"].tolist()),
        )
    benchmark_data = benchmark_tokenizer(model_name)
    df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
    df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
    df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )
def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format."""
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")
    # To keep colors distinct, estimate an initial spacing between channel
    # values: roughly 256 divided by the cube root of n
    spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
    max_val = 256 - spacing
    # Set to keep track of used colors
    used_colors = set()
    # List to store the result colors
    result = []
    attempts = 0
    while len(result) < n:
        # Generate a color with a random start and controlled spacing
        r = random.randint(0, max_val)
        g = random.randint(0, max_val)
        b = random.randint(0, max_val)
        # Scale up by the spacing to push channel values apart (capped at 255)
        r = min(255, r * spacing)
        g = min(255, g * spacing)
        b = min(255, b * spacing)
        # Format the color in hexadecimal
        color = f"#{r:02X}{g:02X}{b:02X}"
        # Ensure this color hasn't been used
        if color not in used_colors:
            used_colors.add(color)
            result.append(color)
        else:
            attempts += 1
            if attempts > 50:
                # Reduce the spacing dynamically if generation gets stuck
                spacing = max(1, spacing - 1)
                max_val = 256 - spacing
                attempts = 0
    return result
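# Byte-level BPE tokenizers (GPT-2 style) mark a leading space with the "Ġ"
# character; the helper below restores that space so tokens read naturally.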
def decode_bpe_tokens(tokens):
    fixed_tokens = []
    for token in tokens:
        # Check if the token starts with the special BPE space marker "Ġ"
        if token.startswith("Ġ"):
            try:
                # Restore the leading space and keep the rest of the token if
                # it survives a UTF-8 round trip
                fixed_token = " " + token[1:].encode("utf-8").decode("utf-8")
            except UnicodeDecodeError:
                fixed_token = token  # Use the original token if decoding fails
        else:
            try:
                # Keep the token as-is if it survives a UTF-8 round trip
                fixed_token = token.encode("utf-8").decode("utf-8")
            except UnicodeDecodeError:
                fixed_token = token  # Use the original token if decoding fails
        fixed_tokens.append(fixed_token)
    return fixed_tokens
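# Visualize how a chosen tokenizer splits a text: each token is highlighted
# with its own color in the "Try tokenizers" tab.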
def tokenize_text(text, chosen_model, better_tokenization=False):
    tokenizer = AutoTokenizer.from_pretrained(chosen_model)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    random_colors = generate_distinct_colors(len(tokenized_text))
    if better_tokenization:
        # Re-tokenize growing prefixes of the input so each highlighted span
        # maps back to the exact characters of the original text
        final_tokenized_text = []
        for token in tokenized_text:
            correct_tokenized_text = ""
            for char in text:
                correct_tokenized_text += char
                current_token = decode_bpe_tokens(
                    tokenizer.tokenize(correct_tokenized_text)
                )
                if current_token[0] == token:
                    final_tokenized_text.append(correct_tokenized_text)
                    text = text[len(correct_tokenized_text) :]
                    break
    else:
        final_tokenized_text = tokenized_text
    print(final_tokenized_text)
    output = []
    color_map = {}
    for idx, token in enumerate(final_tokenized_text):
        output.append((token, str(idx)))
        color_map[str(idx)] = random_colors[idx % len(random_colors)]
    return gr.HighlightedText(output, color_map=color_map)
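# Re-read the saved leaderboard file so the UI reflects the latest submissions.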
def refresh():
    global df
    df = pd.read_json(dataframe_path, lines=True)
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )
leaderboard_description = """The `Total Number of Tokens` in this leaderboard is the number of tokens obtained from the Arabic section of the [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset (chosen because it represents Fusha Arabic text in a small and concentrated form), together with the additional datasets listed in the notes below.

**A tokenizer that ranks high in this leaderboard should be efficient at parsing Arabic in its different dialects and forms.**

## Updates/Notes:

1. New datasets were added for the evaluation (e.g. [arabic-quotes](https://huggingface.co/datasets/HeshamHaroon/arabic-quotes), [Moroccan_Arabic_Wikipedia_20230101_nobots](https://huggingface.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots)).
1. `Fertility Score` is calculated by dividing the total number of tokens by the total number of words in the dataset (lower is better).
1. `Tokenize Tashkeel` indicates whether the tokenizer preserves the tashkeel when tokenizing (`✅` for yes, `❌` for no).
1. `Vocab Size` is the total number of tokens in the tokenizer's vocabulary (e.g. `10000` tokens).
1. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`).
1. `Total Number of Tokens` is the total number of tokens in the dataset after tokenization (lower is better).

**Note**: Press `Refresh` to get the latest data available in the leaderboard (the initial state may be stale).
"""
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)
    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="📛 Models",
                y="➕ Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["📘 Vocab Size", "🪺 Fertility Score"],
                vertical=False,
                x_label_angle=30,
            )
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        with gr.Row():
            submit_new_model_btn = gr.Button(
                value="Submit New Model", variant="primary", scale=3
            )
            refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)
    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(
            label="Enter a text",
            lines=5,
            value="السلام عليكم ورحمة الله",
            rtl=True,
            text_align="right",
        )
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=df["📛 Models"].tolist(),
            value=df["📛 Models"].tolist()[0],
        )
        with gr.Row():
            submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
            checkbox = gr.Checkbox(
                label="Better tokenization for Arabic Text", value=False, scale=1
            )
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")
    submit_new_model_btn.click(
        submit, model_name, outputs=[dataframe, barplot, dropdown]
    )
    refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
    submit_text_btn.click(
        tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox]
    )
demo.launch()