PriyePrabhakar committed on
Commit 232c9b6 · 1 Parent(s): 9bbe09e

Added files for SanskritBPE tokenizer

.DS_Store ADDED
Binary file (6.15 kB).
 
README.md CHANGED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,139 @@
+ import gradio as gr
+ from src.tokenizer import SanskritBPETokenizer
+ import os
+ import random
+
+ # Initialize tokenizer
+ tokenizer = SanskritBPETokenizer(
+     merges_path='data/vocab',
+     token_path='data/vocab'
+ )
+
+ def generate_color(token_id: int) -> str:
+     """Generate a consistent color for a token ID"""
+     random.seed(token_id)  # Make color consistent for same token
+     hue = random.randint(0, 360)
+     return f"hsl({hue}, 80%, 80%)"
+
+ def colorize_tokens(text: str) -> str:
+     """Convert text to HTML with colored token spans"""
+     if not text.strip():
+         return ""
+
+     tokens = tokenizer.encode(text)
+     decoded_pieces = []
+
+     for i, token_id in enumerate(tokens):
+         decoded_text = tokenizer.decode([token_id])
+         color = generate_color(token_id)
+         span = f'<span style="background-color: {color}; color: black; padding: 0 2px; border-radius: 3px; margin: 0 1px;" title="Token {token_id}">{decoded_text}</span>'
+         decoded_pieces.append(span)
+
+     return "".join(decoded_pieces)
+
+ def count_tokens(text: str, show_tokens: bool = False) -> tuple:
+     """Count tokens and return token visualization"""
+     if not text.strip():
+         return "0 tokens", ""
+
+     tokens = tokenizer.encode(text)
+     token_count = len(tokens)
+
+     if show_tokens:
+         decoded = tokenizer.decode(tokens)
+         token_info = f"{token_count} tokens\nTokens: {tokens}\nDecoded: {decoded}"
+     else:
+         token_info = f"{token_count} tokens"
+
+     colored_text = colorize_tokens(text)
+     return token_info, colored_text
+
+ # Custom CSS for better visualization
+ custom_css = """
+ footer {visibility: hidden}
+ .token-text {
+     font-family: monospace;
+     line-height: 1.8;
+     padding: 10px;
+     border-radius: 5px;
+     background: white;
+     margin: 10px 0;
+     color: black;
+ }
+ .gradio-container {
+     max-width: 1000px !important;
+ }
+ """
+
+ # Create the Gradio interface
+ with gr.Blocks(css=custom_css) as demo:
+     gr.Markdown(
+         """
+         # Sanskrit BPE Tokenizer
+
+         Test how the Sanskrit BPE tokenizer processes text. Enter Sanskrit text below to see how many tokens it uses.
+         Each colored span represents one token.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             text_input = gr.Textbox(
+                 label="Content",
+                 placeholder="Enter Sanskrit text here...",
+                 lines=5
+             )
+             show_tokens = gr.Checkbox(
+                 label="Show token IDs and decoded text",
+                 value=False
+             )
+
+         with gr.Column():
+             token_count = gr.Textbox(
+                 label="Token count",
+                 lines=2,
+                 interactive=False
+             )
+             token_viz = gr.HTML(
+                 label="Token visualization",
+                 elem_classes=["token-text"]
+             )
+
+     # Update token count and visualization when text changes or checkbox is toggled
+     text_input.change(
+         fn=count_tokens,
+         inputs=[text_input, show_tokens],
+         outputs=[token_count, token_viz]
+     )
+     show_tokens.change(
+         fn=count_tokens,
+         inputs=[text_input, show_tokens],
+         outputs=[token_count, token_viz]
+     )
+
+     gr.Markdown(
+         """
+         ### Examples
+         Try these Sanskrit text samples:
+         """
+     )
+
+     gr.Examples(
+         examples=[
+             ["विश्वामित्रवचः श्रुत्वा राघवः सहलक्ष्मणः।"],
+             ["धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः।"],
+             ["यदा यदा हि धर्मस्य ग्लानिर्भवति भारत।"],
+         ],
+         inputs=text_input
+     )
+
+     gr.Markdown(
+         """
+         ---
+         Built with [Gradio](https://gradio.app) | [GitHub Repository](https://github.com/PRIYE/SanskritBPETokenizer)
+         """
+     )
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
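
For a quick check of the callback wiring above without launching the UI, something along these lines should work from the repo root (a sketch only: it assumes the data/vocab pickles are present and reuses one of the app's own example sentences):

# Run from the repo root; importing app builds the Blocks but does not launch them.
from app import count_tokens

info, html = count_tokens("धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः।", show_tokens=True)
print(info)        # token count plus the raw IDs and decoded text
print(html[:120])  # start of the colored-span markup that feeds gr.HTML
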
data/vocab/merges_saved.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:831479c60a34b724bae34fa1d16571748b7f5d008d6e71cb1cc6e8489d596f48
+ size 54143
data/vocab/saved.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfd750f97d79eaba6b14d575f7d71c723365311b22658f850699a5bed75397b8
+ size 41003929
merges_saved.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:831479c60a34b724bae34fa1d16571748b7f5d008d6e71cb1cc6e8489d596f48
+ size 54143
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ numpy>=1.21.0
+ tqdm>=4.65.0
+ gradio>=4.11.0
+ datasets>=2.0.0
src/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .tokenizer import SanskritBPETokenizer
+ from .utils import save_merges, load_merges
+
+ __all__ = ['SanskritBPETokenizer', 'save_merges', 'load_merges']
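
These re-exports let callers import from the package root rather than the individual modules (app.py imports from src.tokenizer directly, but both routes resolve to the same class). A minimal sketch, assuming the data/vocab pickles are available:

from src import SanskritBPETokenizer

tok = SanskritBPETokenizer(merges_path='data/vocab', token_path='data/vocab')
ids = tok.encode("राघवः")
assert tok.decode(ids) == "राघवः"  # byte-level BPE round trip is lossless
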
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (320 Bytes).
 
src/__pycache__/tokenizer.cpython-312.pyc ADDED
Binary file (9.35 kB).
 
src/__pycache__/utils.cpython-312.pyc ADDED
Binary file (1.92 kB).
 
src/tokenizer.py ADDED
@@ -0,0 +1,190 @@
+ import json
+ from pathlib import Path
+ import pickle
+ from typing import List, Dict, Tuple, Optional
+ from datasets import load_dataset
+ import re
+
+ class SanskritBPETokenizer:
+     def __init__(self, vocab_path: Optional[str] = None, merges_path: Optional[str] = None, token_path: Optional[str] = None):
+         """Initialize the tokenizer with vocabulary and merges"""
+         self.vocab = {}
+         self.merges = {}
+         if merges_path:
+             self.load_vocab(merges_path)
+         if token_path:
+             self.load_tokens(token_path)
+         if vocab_path:
+             self.create_tokens(vocab_path, token_path, merges_path)
+
+     def create_tokens(self, vocab_path, token_path, merges_path):
+         dataset = load_dataset(vocab_path)
+         text = ''.join([i['translation']['sn'] for i in dataset['train']])
+         text = self.regex_sanskrit_tokenize(text)  # pre-split the corpus before byte-level BPE
+         tokens = text.encode("utf-8")  # raw bytes
+         tokens = list(map(int, tokens))  # convert to a list of integers in range 0..255 for convenience
+         with open(token_path + '/saved.pkl', 'wb') as f:
+             pickle.dump(tokens, f, pickle.HIGHEST_PROTOCOL)
+         vocab_size = 5250  # the desired final vocabulary size
+         num_merges = vocab_size - 256
+         ids = list(tokens)  # copy so we don't destroy the original list
+         merges = {}  # (int, int) -> int
+         for i in range(num_merges):
+             stats = self.get_stats(ids)
+             pair = max(stats, key=stats.get)
+             idx = 256 + i
+             print(f"merging {pair} into a new token {idx}")
+             ids = self.merge(ids, pair, idx)
+             merges[pair] = idx
+         with open(merges_path + '/merges_saved.pkl', 'wb') as f:
+             pickle.dump(merges, f, pickle.HIGHEST_PROTOCOL)
+         print("tokens length:", len(tokens))
+         print("ids length:", len(ids))
+         print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
+
+
+     def regex_sanskrit_tokenize(self, text):
+         # Basic sandhi patterns
+         sandhi_patterns = [
+             # # Visarga sandhi
+             # r'ः\s*([कखगघङचछजझञटठडढणतथदधनपफबभम])',
+
+             # # Vowel sandhi
+             # r'([अआइईउऊऋॠऌॡएऐओऔ])्?\s*([अआइईउऊऋॠऌॡएऐओऔ])',
+
+             # # Consonant sandhi
+             # r'([क-ह])्\s*([क-ह])',
+
+             # # Common contractions and combinations
+             # r'([क-ह])्([यरलवहमनञणन])',
+
+             # # Anusvara and chandrabindu combinations
+             # r'[ंँ]([क-ह])',
+
+             # # Handle special cases like ज्ञ, क्ष
+             # r'(ज्ञ|क्ष)',
+
+             # # Handle numbers and punctuation
+             # r'([०-९])|([।॥,])',
+             # # Handle specific compound formations
+             # r'([क-ह])्य',  # -ya formations
+             # r'([क-ह])्र',  # -ra formations
+
+             # # Handle specific prefixes
+             # r'(प्र|उप|अभि|नि|वि|आ|उद्|परि)',
+
+             # # Handle specific suffixes
+             # r'(तया|त्वम्|त्वात्)',
+
+             ##################
+             # Anusvara and visarga combinations
+             r'ं|ः',
+
+             # Common vowel sandhis
+             r'ा|ि|ी|ु|ू|ृ|ॄ|ॢ|ॣ|े|ै|ो|ौ',
+
+             # Virama (halant) combinations
+             r'्',
+
+             # Common consonant combinations
+             r'त्त|त्र|त्व|न्त|न्द|न्ध|श्च|श्व|ष्ट|स्त|स्थ|ह्म|ह्य',
+
+             # Basic word boundaries
+             r'\s+',
+
+             # Punctuation and numbers
+             r'[।॥॰,!?०-९]+',
+         ]
+
+         # Combine all patterns
+         pattern = '|'.join(sandhi_patterns)
+
+         # Function to process each match
+         def split_token(match):
+             token = match.group(0)
+             # Add spaces around the matched token
+             return f' {token} '
+
+         # Apply the regex
+         tokenized_text = re.sub(pattern, split_token, text)
+         print('tokenized_text', tokenized_text)
+
+         # Clean up extra spaces and split
+         tokens = [token.strip() for token in tokenized_text.split() if token.strip()]
+
+         return ' '.join(tokens)
+
+     def load_tokens(self, token_path: str):
+         """Load the saved training token stream from file"""
+         with open(token_path + "/saved.pkl", "rb") as f:
+             self.tokens = pickle.load(f)
+         print("tokens length:", len(self.tokens))
+         chars = sorted(list(set(self.tokens)))
+
+
+     def load_vocab(self, vocab_path: str):
+         """Load merges from file and rebuild the vocabulary"""
+         with open(vocab_path + "/merges_saved.pkl", "rb") as f:
+             self.merges = pickle.load(f)
+         #print(self.merges)
+         # Create reverse vocab from merges
+         self.vocab = {idx: bytes([idx]) for idx in range(256)}
+         for (p0, p1), idx in self.merges.items():
+             self.vocab[idx] = self.vocab[p0] + self.vocab[p1]
+         #print(self.vocab)
+
+     def get_stats(self, tokens: List[int]) -> Dict[Tuple[int, int], int]:
+         """Count frequency of token pairs"""
+         stats = {}
+         for pair in zip(tokens, tokens[1:]):  # Pythonic way to iterate consecutive elements
+             stats[pair] = stats.get(pair, 0) + 1
+         return stats
+
+     def merge(self, tokens: List[int], pair: Tuple[int, int], idx: int) -> List[int]:
+         """Merge all occurrences of a token pair"""
+         new_tokens = []
+         i = 0
+         while i < len(tokens):
+             if i < len(tokens) - 1 and tokens[i] == pair[0] and tokens[i + 1] == pair[1]:
+                 new_tokens.append(idx)
+                 i += 2
+             else:
+                 new_tokens.append(tokens[i])
+                 i += 1
+         return new_tokens
+
+     def encode(self, text: str) -> List[int]:
+         """Encode text to token IDs"""
+         tokens = list(text.encode("utf-8"))
+         while len(tokens) >= 2:
+             stats = self.get_stats(tokens)
+             pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+             if pair not in self.merges:
+                 break  # nothing else can be merged
+             idx = self.merges[pair]
+             tokens = self.merge(tokens, pair, idx)
+         return tokens
+
+     def decode(self, ids: List[int]) -> str:
+         """Decode token IDs back to text"""
+         tokens = b"".join(self.vocab[idx] for idx in ids)
+         text = tokens.decode("utf-8", errors="replace")
+         return text
+
+ if __name__ == "__main__":
+     # Create tokens from text
+     vocab_path = 'rahular/itihasa'  # loading Sanskrit text from Hugging Face
+     #SanskritBPETokenizer(vocab_path=vocab_path, merges_path='/Users/priye/Desktop/ERAV3/SanskritBPETokenizer', token_path='/Users/priye/Desktop/ERAV3/SanskritBPETokenizer')
+
+     # Example usage
+     tokenizer = SanskritBPETokenizer(merges_path='/Users/priye/Desktop/ERAV3/SanskritBPETokenizer/data/vocab', token_path='/Users/priye/Desktop/ERAV3/SanskritBPETokenizer/data/vocab')
+
+     sample_text = "विश्वामित्रवचः श्रुत्वा राघवः सहलक्ष्मणः। विस्मयं परमं गत्वा विश्वामित्रमथाब्रवीत्॥"
+     encoded = tokenizer.encode(sample_text)
+     decoded = tokenizer.decode(encoded)
+
+     print(f"Original text: {sample_text}")
+     print(f"Encoded tokens: {encoded}")
+     print(f"Decoded text: {decoded}")
+     print(tokenizer.decode(tokenizer.encode(sample_text)))
+     assert sample_text == tokenizer.decode(tokenizer.encode(sample_text))
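
As a toy illustration of what one training step in create_tokens does, the get_stats and merge helpers can be exercised on a made-up byte list (the values below are arbitrary; real training runs over the UTF-8 bytes of the itihasa corpus):

tok = SanskritBPETokenizer()              # no paths given, so nothing is loaded; the helpers still work
ids = [224, 164, 176, 224, 164, 176]      # two repeats of a three-byte sequence
stats = tok.get_stats(ids)                # {(224, 164): 2, (164, 176): 2, (176, 224): 1}
best = max(stats, key=stats.get)          # most frequent adjacent pair, here (224, 164)
print(tok.merge(ids, best, 256))          # [256, 176, 256, 176] -- the pair becomes new token 256
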
src/utils.py ADDED
@@ -0,0 +1,23 @@
+ import json
+ from pathlib import Path
+ from typing import Dict, Tuple
+
+ def save_merges(merges: Dict[Tuple[int, int], int], save_path: str):
+     """Save merges dictionary to JSON file"""
+     # Convert tuple keys to strings for JSON serialization
+     serializable_merges = {f"{k[0]},{k[1]}": v for k, v in merges.items()}
+
+     save_dir = Path(save_path)
+     save_dir.mkdir(parents=True, exist_ok=True)
+
+     with open(save_dir / "merges.json", "w", encoding="utf-8") as f:
+         json.dump(serializable_merges, f, ensure_ascii=False, indent=2)
+
+ def load_merges(load_path: str) -> Dict[Tuple[int, int], int]:
+     """Load merges dictionary from JSON file"""
+     with open(Path(load_path) / "merges.json", "r", encoding="utf-8") as f:
+         serialized_merges = json.load(f)
+
+     # Convert string keys back to tuples
+     merges = {tuple(map(int, k.split(","))): v for k, v in serialized_merges.items()}
+     return merges
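
A quick round trip through these helpers (the temporary directory and the toy merge table below are purely illustrative):

from src.utils import save_merges, load_merges

toy_merges = {(104, 105): 256, (256, 33): 257}    # hypothetical (pair) -> new-token-id table
save_merges(toy_merges, "/tmp/sanskrit_bpe_vocab")
assert load_merges("/tmp/sanskrit_bpe_vocab") == toy_merges
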