orionweller committed on
Commit
c1072ca
·
verified ·
1 Parent(s): 2a4dd78

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -0
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer
3
+ import json
4
+
5
# Load the Gemma tokenizer once, at import time.
# from_pretrained fetches the tokenizer files from the Hugging Face Hub.
try:
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")
    print("Tokenizer loaded successfully.")
except Exception as load_error:
    # Best-effort startup: report the failure and keep going with no
    # tokenizer, so the module still imports and later code can check
    # for None instead of crashing here.
    print(f"Error loading tokenizer: {load_error}")
    tokenizer = None
14
+
15
def count_tokens(text_input):
    """
    Count the tokens in a single string or in each string of a JSON list.

    The input is first parsed as JSON. If it decodes to a list whose items
    are all strings, every item is tokenized separately. Anything else
    (plain text, invalid JSON, or valid JSON that is not a list of strings,
    such as ``42`` or ``{"a": 1}``) is tokenized as one literal string, so
    the function always returns a message rather than ``None``.

    Counts are ``len(tokenizer.encode(text))`` with the tokenizer's default
    arguments.
    NOTE(review): with default arguments the count may include special
    tokens added by the tokenizer -- confirm this is the intended number.

    Args:
        text_input (str): A single piece of text, or a JSON-formatted list
            of strings such as ``["hello world", "gemma is great"]``.

    Returns:
        str: A message with the token count(s), or an error message when
            the tokenizer is unavailable or the input is not text.
    """
    if tokenizer is None:
        return "Error: Tokenizer not loaded. Please check the model name and your internet connection."

    try:
        # Try to interpret the input as JSON so users can submit lists like
        # ["hello world", "gemma is great"].
        parsed = json.loads(text_input)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError (not JSON at all),
        # TypeError covers non-text input, and RecursionError covers
        # pathologically nested brackets. None of these is a usable list.
        parsed = None

    # Only a JSON array made up entirely of strings is treated as a list
    # of inputs; an empty array yields an empty list of counts.
    if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
        token_counts = [len(tokenizer.encode(item)) for item in parsed]
        return f"Token counts for each string: {token_counts}"

    # Everything else that is text is counted verbatim as a single string.
    if isinstance(text_input, str):
        num_tokens = len(tokenizer.encode(text_input))
        return f"The number of tokens is: {num_tokens}"

    return "Invalid input type. Please provide a string or a JSON-formatted list of strings."
45
+
46
+
47
# Define the Gradio interface
# This creates the web UI for count_tokens: a multi-line textbox as input
# and a single text field for the result.
demo = gr.Interface(
    fn=count_tokens,
    inputs=gr.Textbox(
        lines=5,
        label="Input Text or List of Strings",
        placeholder='Enter a single string, or a list of strings in JSON format (e.g., ["string 1", "string 2"]).',
    ),
    outputs=gr.Text(label="Token Count Result"),
    title="Gemma Token Counter",
    description="Calculate the number of tokens for a given input string or a list of strings using the 'google/gemma-3-4b-it' tokenizer. To input a list, please format it as a valid JSON array.",
    examples=[
        ["Hello, how are you?"],
        # This example must be valid JSON so that it demonstrates the
        # list-of-strings input rather than being counted as plain text.
        ['["This is the first sentence.", "This is the second one."]'],
        ["Tokenization is the process of breaking down text."],
    ],
    allow_flagging="never",
)

# Launch the Gradio app only when this file is run as a script, not when
# it is imported.
if __name__ == "__main__":
    demo.launch()