import gradio as gr
from transformers import AutoTokenizer
import json
import os

# Load the tokenizer for the specified Gemma model.
# This will download the tokenizer files from the Hugging Face Hub.
try:
    tokenizer = AutoTokenizer.from_pretrained(
        "google/gemma-3-4b-it", token=os.environ.get("HF_TOKEN")
    )
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    # Use a fallback or raise the exception if the tokenizer is critical.
    tokenizer = None


def count_tokens(text_input):
    """
    Counts the number of tokens in a single string or a list of strings.

    Args:
        text_input (str): A single sentence, or a JSON-formatted list of strings.

    Returns:
        str: A string describing the token count(s).
    """
    if not tokenizer:
        return (
            "Error: Tokenizer not loaded. Please check the model name, your "
            "HF_TOKEN, and your internet connection."
        )

    # Try to interpret the input as a JSON list.
    # This allows users to input lists like ["hello world", "gemma is great"].
    try:
        input_list = json.loads(text_input)
        if isinstance(input_list, list):
            # If it's a list, count tokens for each item separately.
            token_counts = [len(tokenizer.encode(text)) for text in input_list]
            return f"Token counts for each string: {token_counts}"
    except (json.JSONDecodeError, TypeError):
        pass

    # If it's not a valid JSON list, treat the input as a single string.
    if isinstance(text_input, str):
        num_tokens = len(tokenizer.encode(text_input))
        return f"The number of tokens is: {num_tokens}"

    return "Invalid input type. Please provide a string or a JSON-formatted list of strings."


# Define the Gradio interface.
# This creates the web UI for the count_tokens function.
demo = gr.Interface(
    fn=count_tokens,
    inputs=gr.Textbox(
        lines=5,
        label="Input Text or List of Strings",
        placeholder='Enter a single string, or a list of strings in JSON format (e.g., ["string 1", "string 2"]).',
    ),
    outputs=gr.Text(label="Token Count Result"),
    title="Gemma Token Counter",
    description=(
        "Calculate the number of tokens for a given input string or a list of "
        "strings using the 'google/gemma-3-4b-it' tokenizer. To input a list, "
        "please format it as a valid JSON array."
    ),
    examples=[
        ["Hello, how are you?"],
        ['["This is the first sentence.", "This is the second one."]'],
        ["Tokenization is the process of breaking down text."],
    ],
    allow_flagging="never",
)

# Launch the Gradio app.
if __name__ == "__main__":
    demo.launch()