Spaces:

orionweller
/

gemma-3-token-counter

Running

App Files Files Community

orionweller committed 27 days ago

Commit

c1072ca

verified ·

1 Parent(s): 2a4dd78

Create app.py

Browse files

Files changed (1) hide show

app.py +69 -0

app.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import gradio as gr
+from transformers import AutoTokenizer
+import json
def _load_tokenizer(model_id):
    """Return the tokenizer for *model_id*, or None when it cannot be loaded.

    The tokenizer files are fetched through ``AutoTokenizer.from_pretrained``
    (downloaded from the Hugging Face Hub when needed). Any failure is
    reported on stdout and swallowed so that the app can still start and
    show an error message instead of crashing at import time.
    """
    try:
        loaded = AutoTokenizer.from_pretrained(model_id)
        print("Tokenizer loaded successfully.")
    except Exception as load_error:
        print(f"Error loading tokenizer: {load_error}")
        # Fallback: callers must check for None before tokenizing.
        return None
    return loaded


# Module-level tokenizer shared by every request; None means loading failed.
tokenizer = _load_tokenizer("google/gemma-3-4b-it")
def _parse_string_list(text_input):
    """Return *text_input* decoded as a JSON list of strings, else None.

    None is returned when the input is not valid JSON, is not something
    ``json.loads`` accepts, decodes to a non-list value (number, string,
    object, ...), or decodes to a list containing non-string items.
    """
    try:
        parsed = json.loads(text_input)
    except (json.JSONDecodeError, TypeError):
        return None
    if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
        return parsed
    return None


def count_tokens(text_input):
    """
    Counts the number of tokens in a single string or a list of strings.

    The input is first interpreted as JSON. If it decodes to a list of
    strings (e.g. ``["hello world", "gemma is great"]``), each string is
    tokenized separately. Any other string input -- including valid JSON
    that is not a list of strings, such as ``42`` or ``{"a": 1}`` -- is
    tokenized as one plain string, so the function always returns a message.

    Args:
        text_input (str): A string that can be a single sentence or a
                          JSON-formatted list of strings.
    Returns:
        str: A string describing the token count(s), or an error message
             when the tokenizer is unavailable or the input is not a string.
    """
    # Explicit None check: None is the only "not loaded" value, and this
    # avoids relying on the tokenizer object's own truthiness.
    if tokenizer is None:
        return "Error: Tokenizer not loaded. Please check the model name and your internet connection."

    # Parsing is kept separate from tokenizing so that an error raised by the
    # tokenizer is never mistaken for "input was not JSON".
    string_list = _parse_string_list(text_input)
    if string_list is not None:
        token_counts = [len(tokenizer.encode(text)) for text in string_list]
        return f"Token counts for each string: {token_counts}"

    if not isinstance(text_input, str):
        return "Invalid input type. Please provide a string or a JSON-formatted list of strings."

    # Not a JSON list of strings: count the raw text as a single string.
    num_tokens = len(tokenizer.encode(text_input))
    return f"The number of tokens is: {num_tokens}"
# Define the Gradio interface: a single multi-line text box feeding
# count_tokens, with the resulting message shown in a text output.
demo = gr.Interface(
    fn=count_tokens,
    inputs=gr.Textbox(
        lines=5,
        label="Input Text or List of Strings",
        placeholder='Enter a single string, or a list of strings in JSON format (e.g., ["string 1", "string 2"]).',
    ),
    outputs=gr.Text(label="Token Count Result"),
    title="Gemma Token Counter",
    description="Calculate the number of tokens for a given input string or a list of strings using the 'google/gemma-3-4b-it' tokenizer. To input a list, please format it as a valid JSON array.",
    examples=[
        ["Hello, how are you?"],
        # This example must be valid JSON so that it demonstrates the
        # per-string list mode rather than being counted as one plain string.
        ['["This is the first sentence.", "This is the second one."]'],
        ["Tokenization is the process of breaking down text."],
    ],
    allow_flagging="never",
)

# Launch the Gradio app only when this file is executed as a script,
# so importing the module does not start a server.
if __name__ == "__main__":
    demo.launch()