import gradio as gr
from src.tokenizer import SanskritBPETokenizer
import html
import random

# Load the BPE merges and token tables from data/vocab
tokenizer = SanskritBPETokenizer(
    merges_path='data/vocab',
    token_path='data/vocab'
)

def generate_color(token_id: int) -> str:
    """Generate a consistent color for a token ID"""
    random.seed(token_id)  # Make color consistent for same token
    hue = random.randint(0, 360)
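    # Fixed saturation and lightness (80%, 80%) give light pastel backgrounds that stay readable with black text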
    return f"hsl({hue}, 80%, 80%)"

def colorize_tokens(text: str) -> str:
    """Convert text to HTML with colored token spans"""
    if not text.strip():
        return ""
        
    tokens = tokenizer.encode(text)
    decoded_pieces = []
    
    for token_id in tokens:
        decoded_text = tokenizer.decode([token_id])
        color = generate_color(token_id)
        # Escape the decoded text so HTML-special characters in the input render literally
        span = f'<span style="background-color: {color}; color: black; padding: 0 2px; border-radius: 3px; margin: 0 1px;" title="Token {token_id}">{html.escape(decoded_text)}</span>'
        decoded_pieces.append(span)
    
    return "".join(decoded_pieces)

def count_tokens(text: str, show_tokens: bool = False) -> tuple[str, str]:
    """Count tokens and return token visualization"""
    if not text.strip():
        return "0 tokens", ""
        
    tokens = tokenizer.encode(text)
    token_count = len(tokens)
    
    if show_tokens:
        decoded = tokenizer.decode(tokens)
        token_info = f"{token_count} tokens\nTokens: {tokens}\nDecoded: {decoded}"
    else:
        token_info = f"{token_count} tokens"
        
    colored_text = colorize_tokens(text)
    return token_info, colored_text
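
# count_tokens returns (token_info, colored_html); both change events below feed this
# tuple into the token-count Textbox and the HTML visualization component.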

# Custom CSS for better visualization
custom_css = """
footer {visibility: hidden}
.token-text {
    font-family: monospace;
    line-height: 1.8;
    padding: 10px;
    border-radius: 5px;
    background: white;
    margin: 10px 0;
    color: black;
}
.gradio-container {
    max-width: 1000px !important;
}
"""

# Create the Gradio interface
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown(
        """
        # Sanskrit BPE Tokenizer
        
        Test how the Sanskrit BPE tokenizer processes text. Enter Sanskrit text below to see how many tokens it uses.
        Each colored span represents one token.
        """
    )
    
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Content",
                placeholder="Enter Sanskrit text here...",
                lines=5
            )
            show_tokens = gr.Checkbox(
                label="Show token IDs and decoded text",
                value=False
            )
        
        with gr.Column():
            token_count = gr.Textbox(
                label="Token count",
                lines=2,
                interactive=False
            )
            token_viz = gr.HTML(
                label="Token visualization",
                elem_classes=["token-text"]
            )
    
    # Update token count and visualization when text changes or checkbox is toggled
    text_input.change(
        fn=count_tokens,
        inputs=[text_input, show_tokens],
        outputs=[token_count, token_viz]
    )
    show_tokens.change(
        fn=count_tokens,
        inputs=[text_input, show_tokens],
        outputs=[token_count, token_viz]
    )

    gr.Markdown(
        """
        ### Examples
        Try these Sanskrit text samples:
        """
    )
    
    gr.Examples(
        examples=[
            ["विश्वामित्रवचः श्रुत्वा राघवः सहलक्ष्मणः।"],
            ["धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः।"],
            ["यदा यदा हि धर्मस्य ग्लानिर्भवति भारत।"],
        ],
        inputs=text_input
    )

    gr.Markdown(
        """
        ---
        Built with [Gradio](https://gradio.app) | [GitHub Repository](https://github.com/PRIYE/SanskritBPETokenizer)
        """
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()