Update app.py
app.py CHANGED
@@ -30,25 +30,6 @@ MODEL_OPTIONS = [
 models = {}
 tokenizers = {}
 
-# Custom chat templates
-MISTRAL_TEMPLATE = """<s>[INST] {instruction} [/INST]
-{response}
-</s>
-<s>[INST] {instruction} [/INST]
-"""
-
-LLAMA_TEMPLATE = """<s>[INST] <<SYS>>
-You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
-
-If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
-<</SYS>>
-
-{instruction} [/INST]
-{response}
-</s>
-<s>[INST] {instruction} [/INST]
-"""
-
 for model_id in MODEL_OPTIONS:
     tokenizers[model_id] = AutoTokenizer.from_pretrained(model_id)
     models[model_id] = AutoModelForCausalLM.from_pretrained(
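Note: the removed MISTRAL_TEMPLATE / LLAMA_TEMPLATE strings are str.format()-style patterns, whereas anything assigned to tokenizer.chat_template has to be a Jinja template over a `messages` list. A minimal sketch of the Jinja form, in case a custom template is ever reintroduced (the checkpoint name is only illustrative):

from transformers import AutoTokenizer

# Illustrative checkpoint; any causal-LM tokenizer behaves the same way here.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# chat_template is a Jinja template over `messages`, not a str.format() pattern.
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}<s>[INST] {{ message['content'] }} [/INST]"
    "{% else %} {{ message['content'] }}</s>{% endif %}"
    "{% endfor %}"
)

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello"}],
    tokenize=False,
    add_generation_prompt=False,
)
print(prompt)  # <s>[INST] Hello [/INST]

With tokenize=False the call returns the rendered prompt string, so a template can be checked without loading the model.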
@@ -58,11 +39,9 @@ for model_id in MODEL_OPTIONS:
     )
     models[model_id].eval()
 
-    # Set the chat template for each model
-    if "Navarna" in model_id:
-        tokenizers[model_id].chat_template = MISTRAL_TEMPLATE
-    elif "OpenHathi" in model_id:
-        tokenizers[model_id].chat_template = LLAMA_TEMPLATE
+    # Set pad_token_id to eos_token_id if it's not set
+    if tokenizers[model_id].pad_token_id is None:
+        tokenizers[model_id].pad_token_id = tokenizers[model_id].eos_token_id
 
 # Initialize Flask app
 app = Flask(__name__)
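Note: Llama- and Mistral-style tokenizers usually ship without a dedicated pad token, so the fallback added above keeps batched tokenization and generate() from complaining about a missing pad_token_id. A standalone sketch of the same idiom (gpt2 is just a small, ungated stand-in):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative: any checkpoint without a pad token

# Many causal-LM tokenizers define no pad token; fall back to EOS so that
# batched encoding and generation have a valid padding id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

batch = tokenizer(["short prompt", "a somewhat longer prompt"],
                  padding=True, return_tensors="pt")
print(batch.input_ids.shape, batch.attention_mask.shape)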
@@ -74,6 +53,25 @@ def log_results():
     print("Logged:", json.dumps(data, indent=2))
     return jsonify({"status": "success"}), 200
 
+def prepare_input(model_id: str, message: str, chat_history: List[Tuple[str, str]]):
+    if "OpenHathi" in model_id:
+        # OpenHathi model doesn't use a specific chat template
+        full_prompt = message
+        for history_message in chat_history:
+            full_prompt = f"{history_message[0]}\n{history_message[1]}\n{full_prompt}"
+        return tokenizers[model_id](full_prompt, return_tensors="pt")
+    elif "Navarna" in model_id:
+        # Navarna model uses a chat template
+        conversation = []
+        for user, assistant in chat_history:
+            conversation.extend([
+                {"role": "user", "content": user},
+                {"role": "assistant", "content": assistant},
+            ])
+        conversation.append({"role": "user", "content": message})
+        prompt = tokenizers[model_id].apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+        return tokenizers[model_id](prompt, return_tensors="pt")
+
 @spaces.GPU(duration=90)
 def generate(
     model_id: str,
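Note: both branches of prepare_input return a BatchEncoding, so the caller can read .input_ids and .attention_mask uniformly regardless of model. The Navarna branch relies on the tokenizer's built-in chat template; roughly how that renders, using an arbitrary chat-tuned checkpoint as a stand-in:

from transformers import AutoTokenizer

# Illustrative chat checkpoint; any tokenizer that defines a chat template works.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

chat_history = [("Hi", "Hello! How can I help?")]
message = "Tell me a joke."

conversation = []
for user, assistant in chat_history:
    conversation.extend([
        {"role": "user", "content": user},
        {"role": "assistant", "content": assistant},
    ])
conversation.append({"role": "user", "content": message})

# tokenize=False returns the rendered prompt string; add_generation_prompt=True
# appends the assistant-turn header so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt")
print(inputs.input_ids.shape, inputs.attention_mask.shape)

One wrinkle with re-tokenizing the rendered string is that tokenizers which insert a BOS token automatically may end up with a duplicate; passing add_special_tokens=False to the second call is the usual guard, depending on the tokenizer.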
@@ -86,29 +84,28 @@ def generate(
     model = models[model_id]
     tokenizer = tokenizers[model_id]
 
-    conversation = []
-    for user, assistant in chat_history:
-        conversation.extend([
-            {"role": "user", "content": user},
-            {"role": "assistant", "content": assistant},
-        ])
-    conversation.append({"role": "user", "content": message})
+    inputs = prepare_input(model_id, message, chat_history)
+    input_ids = inputs.input_ids
+    attention_mask = inputs.attention_mask
 
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        attention_mask = attention_mask[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
+    attention_mask = attention_mask.to(model.device)
 
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
+        attention_mask=attention_mask,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         top_p=top_p,
         temperature=temperature,
         num_beams=1,
+        pad_token_id=tokenizer.eos_token_id,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
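Note: TextIteratorStreamer is what lets the blocking model.generate() call run on a worker thread while the handler iterates partial text as it arrives. A self-contained sketch of the pattern (gpt2 stands in for the checkpoints in MODEL_OPTIONS):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "gpt2"  # illustrative; the Space uses the checkpoints in MODEL_OPTIONS
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Once upon a time", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs on a worker thread while we consume the streamer.
thread = Thread(target=model.generate, kwargs=dict(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    streamer=streamer,
    max_new_tokens=32,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
))
thread.start()

partial = ""
for new_text in streamer:   # yields decoded text chunks as they are generated
    partial += new_text
    print(partial)          # in the Space this would be `yield partial` back to Gradio
thread.join()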
@@ -215,5 +212,5 @@ if __name__ == "__main__":
     flask_thread = Thread(target=app.run, kwargs={"host": "0.0.0.0", "port": 5000})
     flask_thread.start()
 
-    # Start Gradio app
-    demo.queue(max_size=10).launch()
+    # Start Gradio app with public link
+    demo.queue(max_size=10).launch(share=True)
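Note: Flask's app.run() blocks, so it stays on its own thread while Gradio keeps the main thread; share=True only adds a temporary public link in front of the local server. A stripped-down sketch of the same layout (the /log route and the echo handler are placeholders, not the Space's real endpoints):

from threading import Thread
from flask import Flask, jsonify, request
import gradio as gr

app = Flask(__name__)

@app.route("/log", methods=["POST"])          # route path is illustrative
def log_results():
    print("Logged:", request.get_json())
    return jsonify({"status": "success"}), 200

def echo(message, history):                   # placeholder chat handler
    return f"You said: {message}"

demo = gr.ChatInterface(echo)

if __name__ == "__main__":
    # Flask serves the logging endpoint in the background (daemon flag is a sketch choice)...
    Thread(target=app.run, kwargs={"host": "0.0.0.0", "port": 5000}, daemon=True).start()
    # ...while Gradio blocks the main thread; share=True requests a public URL.
    demo.queue(max_size=10).launch(share=True)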