import gradio as gr
from src.tokenizer import SanskritBPETokenizer
import html
import random

# Load the BPE merges and token tables from data/vocab
tokenizer = SanskritBPETokenizer(
    merges_path='data/vocab',
    token_path='data/vocab'
)

def generate_color(token_id: int) -> str:
    """Generate a consistent color for a token ID"""
    random.seed(token_id)  # Make color consistent for same token
    hue = random.randint(0, 360)
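    # Fixed saturation and lightness (80%, 80%) give light pastel backgrounds that stay readable with black text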
    return f"hsl({hue}, 80%, 80%)"

def colorize_tokens(text: str) -> str:
    """Convert text to HTML with colored token spans"""
    if not text.strip():
        return ""
        
    tokens = tokenizer.encode(text)
    decoded_pieces = []
    
    for token_id in tokens:
        decoded_text = tokenizer.decode([token_id])
        color = generate_color(token_id)
        # Escape the decoded text so HTML-special characters in the input render literally
        span = f'<span style="background-color: {color}; color: black; padding: 0 2px; border-radius: 3px; margin: 0 1px;" title="Token {token_id}">{html.escape(decoded_text)}</span>'
        decoded_pieces.append(span)
    
    return "".join(decoded_pieces)

def count_tokens(text: str, show_tokens: bool = False) -> tuple[str, str]:
    """Count tokens and return token visualization"""
    if not text.strip():
        return "0 tokens", ""
        
    tokens = tokenizer.encode(text)
    token_count = len(tokens)
    
    if show_tokens:
        decoded = tokenizer.decode(tokens)
        token_info = f"{token_count} tokens\nTokens: {tokens}\nDecoded: {decoded}"
    else:
        token_info = f"{token_count} tokens"
        
    colored_text = colorize_tokens(text)
    return token_info, colored_text
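
# count_tokens returns (token_info, colored_html); both change events below feed this
# tuple into the token-count Textbox and the HTML visualization component.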

# Custom CSS for better visualization
custom_css = """
footer {visibility: hidden}
.token-text {
    font-family: monospace;
    line-height: 1.8;
    padding: 10px;
    border-radius: 5px;
    background: white;
    margin: 10px 0;
    color: black;
}
.gradio-container {
    max-width: 1000px !important;
}
"""

# Create the Gradio interface
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown(
        """
        # Sanskrit BPE Tokenizer
        
        Test how the Sanskrit BPE tokenizer processes text. Enter Sanskrit text below to see how many tokens it uses.
        Each colored span represents one token.
        """
    )
    
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Content",
                placeholder="Enter Sanskrit text here...",
                lines=5
            )
            show_tokens = gr.Checkbox(
                label="Show token IDs and decoded text",
                value=False
            )
        
        with gr.Column():
            token_count = gr.Textbox(
                label="Token count",
                lines=2,
                interactive=False
            )
            token_viz = gr.HTML(
                label="Token visualization",
                elem_classes=["token-text"]
            )
    
    # Update token count and visualization when text changes or checkbox is toggled
    text_input.change(
        fn=count_tokens,
        inputs=[text_input, show_tokens],
        outputs=[token_count, token_viz]
    )
    show_tokens.change(
        fn=count_tokens,
        inputs=[text_input, show_tokens],
        outputs=[token_count, token_viz]
    )

    gr.Markdown(
        """
        ### Examples
        Try these Sanskrit text samples:
        """
    )
    
    gr.Examples(
        examples=[
            ["विश्वामित्रवचः श्रुत्वा राघवः सहलक्ष्मणः।"],
            ["धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः।"],
            ["यदा यदा हि धर्मस्य ग्लानिर्भवति भारत।"],
        ],
        inputs=text_input
    )

    gr.Markdown(
        """
        ---
        Built with [Gradio](https://gradio.app) | [GitHub Repository](https://github.com/PRIYE/SanskritBPETokenizer)
        """
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()