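"""Gradio app for the Arabic Tokenizers Leaderboard.

Benchmarks Hugging Face tokenizers on Arabic text: total token count, fertility
score (tokens per word), vocabulary size, and whether the tashkeel (diacritics)
survive an encode/decode round trip.
"""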
import random
from pathlib import Path

import gradio as gr
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer
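
# Models benchmarked by default when the leaderboard is first built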
initial_list_of_models = [
    "asafaya/bert-base-arabic",
    "Xenova/gpt-4o",
    "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
    "FreedomIntelligence/AceGPT-13B",
    "Qwen/Qwen1.5-7B-Chat",
    "Qwen/Qwen1.5-110B-Chat",
    "microsoft/Phi-3-mini-128k-instruct",
    "unsloth/gemma-2b-bnb-4bit",
    "NousResearch/Meta-Llama-3-8B",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "core42/jais-13b",
    "core42/jais-30b-chat-v3",
]
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"

if dataframe_path.exists():
    df = pd.read_json(dataframe_path, lines=True)
else:
    df = pd.DataFrame(
        columns=[
            "👳 Tokenize Tashkeel",
            "📛 Models",
            "🪺 Fertility Score",
            "➕ Total Number of Tokens",
            "📘 Vocab Size",
            "Tokenizer Class",
        ]
    )
# Datasets used for calculating the number of tokens
arabic_dataset1 = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]

all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
print(f"Total number of samples: {len(all_data)}")

all_text = " ".join(all_data)
all_words = all_text.split()
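# `all_words` is the whitespace-split word list; its length is the denominator of the
# fertility score (tokens per word) computed in benchmark_tokenizer below.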


def benchmark_tokenizer(model_name) -> dict:
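    """Benchmark a single tokenizer on the combined Arabic corpus.

    Returns a dict with the leaderboard columns: tashkeel preservation, model name,
    fertility score (tokens per word), vocab size, total token count, and the
    tokenizer class name.
    """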
    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    total_number_of_tokens = len(tokenizer.tokenize(all_text))

    # Check if the tokenizer maintains the tashkeel
    dummy_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"
    tokenized_text = tokenizer.decode(tokenizer.encode(dummy_text), skip_special_tokens=True)
    tashkeel_maintainer = "✅" if tokenized_text == dummy_text else "❌"

    return {
        "👳 Tokenize Tashkeel": tashkeel_maintainer,
        "📛 Models": model_name,
        "🪺 Fertility Score": round(total_number_of_tokens / len(all_words), 3),
        "📘 Vocab Size": vocab_size,
        "➕ Total Number of Tokens": total_number_of_tokens,
        "Tokenizer Class": tokenizer.__class__.__name__,
    }

for model_name in tqdm(initial_list_of_models):
    if model_name in df["📛 Models"].values:
        continue
    benchmark_data = benchmark_tokenizer(model_name)
    df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)

# Sort the dataframe by the total number of tokens (ascending)
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)

# Save the dataframe to a JSON Lines file
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)


def submit(model_name):
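    """Benchmark a newly submitted model, persist the result, and refresh the UI.

    If the model is already in the leaderboard, the current data is returned unchanged.
    """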
    global df
    if model_name in df["📛 Models"].values:
        return (
            gr.Dataframe(df),
            gr.BarPlot(df),
            gr.Dropdown(choices=df["📛 Models"].tolist()),
        )
    benchmark_data = benchmark_tokenizer(model_name)
    df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
    df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
    df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )


def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format."""
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")

    # To ensure colors are distinct, calculate an appropriate distance between colors.
    # The cube root of 256**3 (total colors) divided by n gives a crude initial spacing estimate.
    spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
    max_val = 256 - spacing

    # Set to keep track of used colors
    used_colors = set()
    # List to store the result colors
    result = []

    attempts = 0
    while len(result) < n:
        # Generate a color with a random start and controlled spacing
        r = random.randint(0, max_val)
        g = random.randint(0, max_val)
        b = random.randint(0, max_val)

        # Scale up by spacing to ensure minimum distance between colors
        r = min(255, r * spacing)
        g = min(255, g * spacing)
        b = min(255, b * spacing)

        # Format the color in hexadecimal
        color = f"#{r:02X}{g:02X}{b:02X}"

        # Ensure this color hasn't been used
        if color not in used_colors:
            used_colors.add(color)
            result.append(color)
        else:
            attempts += 1
            if attempts > 50:
                # Dynamically adjust spacing if stuck
                spacing = max(1, spacing - 1)
                max_val = 256 - spacing
                attempts = 0
    return result


def decode_bpe_tokens(tokens):
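    """Convert byte-level BPE tokens into human-readable strings where possible.

    Tokens that cannot be reinterpreted as valid UTF-8 are returned unchanged.
    """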
    fixed_tokens = []
    for token in tokens:
        # Byte-level BPE tokenizers mark a leading space with the special character 'Ġ'
        if token.startswith("Ġ"):
            try:
                # Reinterpret the byte-level characters as UTF-8 text
                fixed_token = " " + token[1:].encode("latin-1").decode("utf-8")
            except (UnicodeEncodeError, UnicodeDecodeError):
                # Keep the raw token if it does not map back to valid UTF-8
                fixed_token = " " + token[1:]
        else:
            try:
                fixed_token = token.encode("latin-1").decode("utf-8")
            except (UnicodeEncodeError, UnicodeDecodeError):
                # Keep the raw token if it does not map back to valid UTF-8
                fixed_token = token
        fixed_tokens.append(fixed_token)
    return fixed_tokens


def tokenize_text(text, chosen_model, better_tokenization=False):
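    """Tokenize `text` with the chosen model and return a color-coded gr.HighlightedText.

    With `better_tokenization=True`, each token is mapped back to the exact span of the
    original text that produced it, which renders Arabic more faithfully.
    """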
    tokenizer = AutoTokenizer.from_pretrained(chosen_model, trust_remote_code=True)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    random_colors = generate_distinct_colors(len(tokenized_text))

    if better_tokenization:
        # Re-tokenize growing prefixes of the text so that each displayed chunk is the
        # exact span of the original text that produced the corresponding token
        final_tokenized_text = []
        for token in tokenized_text:
            correct_tokenized_text = ""
            for char in text:
                correct_tokenized_text += char
                current_token = decode_bpe_tokens(
                    tokenizer.tokenize(correct_tokenized_text)
                )
                if current_token[0] == token:
                    final_tokenized_text.append(correct_tokenized_text)
                    text = text[len(correct_tokenized_text):]
                    break
    else:
        final_tokenized_text = tokenized_text

    print(final_tokenized_text)

    output = []
    color_map = {}
    for idx, token in enumerate(final_tokenized_text):
        output.append((token, str(idx)))
        color_map[str(idx)] = random_colors[idx % len(random_colors)]
    return gr.HighlightedText(output, color_map=color_map)


def refresh():
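    """Reload the leaderboard from disk and rebuild the dataframe, barplot, and dropdown."""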
    global df
    df = pd.read_json(dataframe_path, lines=True)
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )


leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens obtained from the Arabic section of the [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset (this dataset was chosen because it represents Fusha Arabic text in a small and concentrated form).

**A tokenizer that ranks high in this leaderboard should be efficient at parsing Arabic in its different dialects and forms.**

## Updates/Notes:

1. New datasets were added for the evaluation (e.g. [arabic-quotes](https://huggingface.co/datasets/HeshamHaroon/arabic-quotes), [Moroccan_Arabic_Wikipedia_20230101_nobots](https://huggingface.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots)).
1. `Fertility Score` is calculated by dividing the total number of tokens by the total number of words in the dataset (lower is better).
1. `Tokenize Tashkeel` indicates whether the tokenizer preserves the tashkeel when tokenizing (`✅` for yes, `❌` for no).
1. `Vocab Size` is the total number of tokens in the tokenizer's vocabulary (e.g. `10000` tokens).
1. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`).
1. `Total Number of Tokens` is the total number of tokens in the dataset after tokenization (lower is better).

**Note**: Press `Refresh` to get the latest data available in the leaderboard (the initial state may be out of date).
"""

with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)

    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="📛 Models",
                y="➕ Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["📘 Vocab Size", "🪺 Fertility Score"],
                vertical=False,
                x_label_angle=30,
            )
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        with gr.Row():
            submit_new_model_btn = gr.Button(
                value="Submit New Model", variant="primary", scale=3
            )
            refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)

    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(
            label="Enter a text",
            lines=5,
            value="السلام عليكم ورحمة الله",
            rtl=True,
            text_align="right",
        )
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=df["📛 Models"].tolist(),
            value=df["📛 Models"].tolist()[0],
        )
        with gr.Row():
            submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
            checkbox = gr.Checkbox(
                label="Better tokenization for Arabic Text", value=False, scale=1
            )
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")

    submit_new_model_btn.click(
        submit, inputs=model_name, outputs=[dataframe, barplot, dropdown]
    )
    refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
    submit_text_btn.click(
        tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox]
    )

demo.launch()