Spaces:

ManishThota
/

gemma-2b-it-CHAT

Runtime error

App Files Files Community

cnmoro committed on Feb 21, 2024

Commit

3d5a205

verified ·

1 Parent(s): eabeea0

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -34

app.py CHANGED Viewed

@@ -1,61 +1,36 @@
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
 from threading import Thread
 torch.set_num_threads(2)
 # Loading the tokenizer and model from Hugging Face's model hub.
-tokenizer = AutoTokenizer.from_pretrained("cnmoro/teenytinyllama-460m-text-simplification-ptbr")
-model = AutoModelForCausalLM.from_pretrained("cnmoro/teenytinyllama-460m-text-simplification-ptbr")
-# using CUDA for an optimal experience
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model = model.to(device)
 def count_tokens(text):
     return len(tokenizer.tokenize(text))
-class EOSStoppingCriteria(StoppingCriteria):
-    """
-    Custom stopping criteria that stops the generation when the "</s>" token is found.
-    """
-    def __init__(self, eos_token_id):
-        self.eos_token_id = eos_token_id
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
-        # Check if the last generated token is the EOS token.
-        is_eos = input_ids[0, -1] == self.eos_token_id
-        return is_eos
-# Find the EOS token ID for the specific token "</s>" in your tokenizer
-eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
 # Function to generate model predictions.
 def predict(message, history):
-    formatted_prompt = f"<s><system>O objetivo é comprimir e estruturar o texto a seguir<texto>{message}</texto>"
-    model_inputs = tokenizer([
-        formatted_prompt
-    ], return_tensors="pt").to(device)
-    # Instantiate your custom stopping criteria
-    stopping_criteria = EOSStoppingCriteria(eos_token_id=eos_token_id)
     streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         model_inputs,
         streamer=streamer,
-        max_new_tokens=3072 - count_tokens(formatted_prompt),
         top_p=0.2,
         top_k=20,
         temperature=0.1,
         repetition_penalty=2.0,
         length_penalty=-0.5,
-        num_beams=1,
-        stopping_criteria=StoppingCriteriaList([stopping_criteria])
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()  # Starting the generation in a separate thread.
@@ -66,6 +41,6 @@ def predict(message, history):
 # Setting up the Gradio chat interface.
 gr.ChatInterface(predict,
-                 title="TextStructurization_TeenyTinyLlama460m_CPU",
-                 description="Pass a text to be structurized"
                  ).launch()  # Launching the web interface.

 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import StoppingCriteria, TextIteratorStreamer
 from threading import Thread
 torch.set_num_threads(2)
 # Loading the tokenizer and model from Hugging Face's model hub.
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
+model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
 def count_tokens(text):
     """Return the number of tokens the module-level ``tokenizer`` splits ``text`` into.

     NOTE(review): ``tokenize`` is called on the raw text only, so any special
     tokens the tokenizer adds when encoding a full prompt are presumably not
     counted here -- confirm if an exact budget matters.
     """
     return len(tokenizer.tokenize(text))
 # Function to generate model predictions.
 def predict(message, history):
+    formatted_prompt = f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
+    model_inputs = tokenizer(input_text, return_tensors="pt")
     streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         model_inputs,
         streamer=streamer,
+        max_new_tokens=2048 - count_tokens(formatted_prompt),
         top_p=0.2,
         top_k=20,
         temperature=0.1,
         repetition_penalty=2.0,
         length_penalty=-0.5,
+        num_beams=1
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()  # Starting the generation in a separate thread.
 # Setting up the Gradio chat interface.
 # `predict` is the chat callback: it is defined above with the signature
 # (message, history). No description text is shown (description=None).
 gr.ChatInterface(predict,
+                 title="Gemma 2b Instruct Chat",
+                 description=None
                  ).launch()  # Launching the web interface.