# Gradio demo app for the Sanskrit BPE tokenizer.
import html
import os
import random

import gradio as gr

from src.tokenizer import SanskritBPETokenizer
# Initialize the shared tokenizer instance used by all handlers below.
# Merges and token tables both live in the same vocab directory.
VOCAB_DIR = 'data/vocab'
tokenizer = SanskritBPETokenizer(
    merges_path=VOCAB_DIR,
    token_path=VOCAB_DIR,
)
def generate_color(token_id: int) -> str:
    """Generate a deterministic pastel HSL color string for a token ID.

    Args:
        token_id: Integer token ID; the same ID always maps to the same color.

    Returns:
        A CSS color string of the form ``hsl(H, 80%, 80%)``.
    """
    # Use a dedicated RNG seeded with the token ID instead of random.seed():
    # random.Random(seed).randint(...) yields the exact same hue as the old
    # seed-then-randint code, but no longer clobbers the global random state
    # as a side effect.
    hue = random.Random(token_id).randint(0, 360)
    return f"hsl({hue}, 80%, 80%)"
def colorize_tokens(text: str) -> str:
    """Convert text to HTML with one colored <span> per BPE token.

    Args:
        text: Raw input text to tokenize.

    Returns:
        HTML markup string, or "" for blank/whitespace-only input.
    """
    if not text.strip():
        return ""
    pieces = []
    for token_id in tokenizer.encode(text):
        # Escape the decoded piece so '<', '>' and '&' in the input cannot
        # break the markup or inject HTML into the gr.HTML output.
        decoded_text = html.escape(tokenizer.decode([token_id]))
        color = generate_color(token_id)
        pieces.append(
            f'<span style="background-color: {color}; color: black; '
            f'padding: 0 2px; border-radius: 3px; margin: 0 1px;" '
            f'title="Token {token_id}">{decoded_text}</span>'
        )
    return "".join(pieces)
def count_tokens(text: str, show_tokens: bool = False) -> tuple:
    """Return a (token-count summary, HTML token visualization) pair.

    When *show_tokens* is true, the summary additionally lists the raw
    token IDs and the round-tripped decoded string.
    """
    # Guard clause: nothing to tokenize.
    if not text.strip():
        return "0 tokens", ""
    tokens = tokenizer.encode(text)
    summary = f"{len(tokens)} tokens"
    if show_tokens:
        summary += f"\nTokens: {tokens}\nDecoded: {tokenizer.decode(tokens)}"
    return summary, colorize_tokens(text)
# Custom CSS for better visualization:
# - hides the default Gradio footer
# - styles the token visualization panel (.token-text, applied to token_viz
#   via elem_classes below)
# - caps the overall app width
custom_css = """
footer {visibility: hidden}
.token-text {
font-family: monospace;
line-height: 1.8;
padding: 10px;
border-radius: 5px;
background: white;
margin: 10px 0;
color: black;
}
.gradio-container {
max-width: 1000px !important;
}
"""
# Create the Gradio interface: a two-column Blocks layout with live updates.
with gr.Blocks(css=custom_css) as demo:
    # Intro / usage header.
    gr.Markdown(
        """
# Sanskrit BPE Tokenizer
Test how the Sanskrit BPE tokenizer processes text. Enter Sanskrit text below to see how many tokens it uses.
Each colored span represents one token.
"""
    )
    with gr.Row():
        # Left column: text input and display options.
        with gr.Column():
            text_input = gr.Textbox(
                label="Content",
                placeholder="Enter Sanskrit text here...",
                lines=5
            )
            show_tokens = gr.Checkbox(
                label="Show token IDs and decoded text",
                value=False
            )
        # Right column: computed outputs (read-only).
        with gr.Column():
            token_count = gr.Textbox(
                label="Token count",
                lines=2,
                interactive=False  # output-only field
            )
            token_viz = gr.HTML(
                label="Token visualization",
                elem_classes=["token-text"]  # styled by custom_css above
            )
    # Update token count and visualization when text changes or checkbox is toggled.
    # Both events call the same handler with the same inputs/outputs.
    text_input.change(
        fn=count_tokens,
        inputs=[text_input, show_tokens],
        outputs=[token_count, token_viz]
    )
    show_tokens.change(
        fn=count_tokens,
        inputs=[text_input, show_tokens],
        outputs=[token_count, token_viz]
    )
    gr.Markdown(
        """
### Examples
Try these Sanskrit text samples:
"""
    )
    # Clickable sample inputs; selecting one fills text_input, whose change
    # event then refreshes the outputs.
    gr.Examples(
        examples=[
            ["विश्वामित्रवचः श्रुत्वा राघवः सहलक्ष्मणः।"],
            ["धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः।"],
            ["यदा यदा हि धर्मस्य ग्लानिर्भवति भारत।"],
        ],
        inputs=text_input
    )
    # Footer links.
    gr.Markdown(
        """
---
Built with [Gradio](https://gradio.app) | [GitHub Repository](https://github.com/PRIYE/SanskritBPETokenizer)
"""
    )
# Launch the app only when executed as a script (not on import).
# Note: removed a stray trailing "|" artifact that made this line a syntax error.
if __name__ == "__main__":
    demo.launch()