Spaces:

Smilyai-labs
/

Sam-chat-full

Sleeping

App Files Files Community

Boning c committed on Jul 17

Commit

23339f6

verified ·

1 Parent(s): abccee4

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -17

app.py CHANGED Viewed

@@ -16,7 +16,7 @@ primary_model = primary_tokenizer = None
 # Fallback model/tokenizer handles; load_models() declares both as globals
 # and assigns fallback_model.
 fallback_model = fallback_tokenizer = None
 # Per-client usage records; respond() looks entries up by request host and
 # expects the keys "count" and "last_seen".
 usage_info = {}
-# Load models
 def load_models():
     # Populate the module-level model/tokenizer globals and return a status
     # string naming both checkpoints.
     global primary_model, primary_tokenizer, fallback_model, fallback_tokenizer
     # trust_remote_code=True allows the checkpoint's own Python code to run
     # while the tokenizer is loaded.
     primary_tokenizer = AutoTokenizer.from_pretrained(PRIMARY_MODEL, trust_remote_code=True)
     # NOTE(review): the hunk header below marks lines this diff view elides;
     # presumably primary_model and fallback_tokenizer are loaded there -
     # confirm against the full app.py.
@@ -25,31 +25,39 @@ def load_models():
     # Fallback weights are requested in float16, moved to `device`, and
     # switched to eval mode.
     fallback_model = AutoModelForCausalLM.from_pretrained(FALLBACK_MODEL, torch_dtype=torch.float16).to(device).eval()
     return f"✅ Loaded: {PRIMARY_MODEL} with fallback {FALLBACK_MODEL}"
-# Format multi-turn history
 def build_chat_prompt(history, user_input, reasoning_enabled):
     # Flatten earlier turns plus the new user message into one prompt string
     # that ends on an open assistant tag.
     # The prompt begins with "/think " or "/no_think " depending on
     # reasoning_enabled.
     prefix = "/think " if reasoning_enabled else "/no_think "
     prompt = ""
     # history is iterated as 2-item (user_msg, bot_msg) pairs.
     for user_msg, bot_msg in history:
-        prompt += f"<|user|>\n{user_msg}\n<|assistant|>\n{bot_msg}\n"
-    prompt += f"<|user|>\n{user_input}\n<|assistant|>\n"
     return prefix + prompt
-# Collapse <think> block
 def format_thinking(text):
     # Turn model output into HTML: a closed <think>...</think> block becomes
     # a collapsible "Show reasoning" section; anything else is just escaped.
     # re.search returns only the first closed block; a <think> without its
     # closing tag does not match and falls through to the plain escape.
     match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
     if match:
         # NOTE(review): `escape` is imported outside this view - presumably
         # html.escape; confirm.
         reasoning = escape(match.group(1).strip())
         # re.sub replaces every closed block with the placeholder, while only
         # the first block's content was captured above.
         visible = re.sub(r"<think>.*?</think>", "[thinking...]", text, flags=re.DOTALL).strip()
-        return f"{escape(visible)}<br><details><summary>🧠 Show reasoning</summary><pre>{reasoning}</pre></details>"
     return escape(text)
-# Token stream generator
 def generate_stream(prompt, use_fallback=False, max_length=100, temperature=0.2, top_p=0.9):
     # Generator: sample at most max_length tokens one at a time and yield
     # the accumulated text after each token.
     # use_fallback selects which model/tokenizer pair is used.
     model = fallback_model if use_fallback else primary_model
     tokenizer = fallback_tokenizer if use_fallback else primary_tokenizer
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
     generated = input_ids
     # Seeding the output with the decoded prompt means every yielded string
     # starts with the prompt text.
-    output = tokenizer.decode(input_ids[0])
     for _ in range(max_length):
         # The whole sequence so far is passed to the model on every step;
         # only the last position's logits are kept, divided by temperature.
         logits = model(generated).logits[:, -1, :] / temperature
         # NOTE(review): the hunk header below marks elided lines;
         # sorted_indices and mask are defined there, presumably by the
         # top_p filtering - confirm against the full app.py.
@@ -60,20 +68,29 @@ def generate_stream(prompt, use_fallback=False, max_length=100, temperature=0.2,
         mask[..., 0] = 0
         filtered = logits.clone()
         filtered[:, sorted_indices[mask]] = -float("Inf")
         # Sample one token id from the filtered distribution and append it.
         next_token = torch.multinomial(torch.softmax(filtered, dim=-1), 1)
         generated = torch.cat([generated, next_token], dim=-1)
         # Only the newly sampled token is decoded here.
         new_text = tokenizer.decode(next_token[0])
-        output += new_text
-        yield output
         # The EOS check runs after the yield, so the EOS token's decoded text
         # is already part of the last yielded string.
         if next_token.item() == tokenizer.eos_token_id:
             break
-# Response pipeline
 def respond(message, history, reasoning_enabled, request: gr.Request):
     # Generator handler wired to send_btn: each yield is a
     # (chat_box, chat_state, usage_counter) update while the reply streams.
     # Usage is tracked per client host; "unknown" is used when request is falsy.
     ip = request.client.host if request else "unknown"
     now = time.time()
     info = usage_info.get(ip, {"count": 0, "last_seen": 0})
     # Start the count over once the client has been idle for longer than
     # RESET_AFTER_SECONDS.
     if now - info["last_seen"] > RESET_AFTER_SECONDS:
         info["count"] = 0
     # NOTE(review): the hunk header below marks elided lines; use_fallback,
     # model_used and remaining (used further down) are presumably defined
     # there - confirm against the full app.py.
@@ -88,19 +105,21 @@ def respond(message, history, reasoning_enabled, request: gr.Request):
     # The prompt gets the stripped message; the history entry keeps it as typed.
     prompt = build_chat_prompt(history, message.strip(), reasoning_enabled)
     # `+` builds a new list, so the list object passed in is not appended to.
     history = history + [[message, ""]]
-    for output in generate_stream(prompt, use_fallback=use_fallback):
-        formatted = format_thinking(output)
         # NOTE(review): this HTML (formatted reply plus model label) is stored
         # in the history that is yielded as chat_state, so build_chat_prompt()
         # receives it as bot_msg on the next turn.
         history[-1][1] = f"{formatted}<br><sub style='color:gray'>({model_used})</sub>"
         yield history, history, f"🧠 A3 messages left: {remaining}"
 def clear_chat():
     # Reset values for (chat_box, chat_state, usage_counter); the counter
     # text is hard-coded to 5 messages.
     return [], [], "🧠 A3 messages left: 5"
-# UI
 with gr.Blocks() as demo:
     # Page title.
-    gr.Markdown("# 🤖 SamAI – Reasoning Chat (Chat Mode Enabled)")
     # Read-only status field labelled "Model Status".
     model_status = gr.Textbox(interactive=False, label="Model Status")
     # Read-only counter; its initial text matches what clear_chat() returns.
     usage_counter = gr.Textbox(value="🧠 A3 messages left: 5", interactive=False, show_label=False)
     chat_box = gr.Chatbot(type="tuples")
     # Conversation state handed to respond() as `history`.
     chat_state = gr.State([])
     # NOTE(review): the hunk header below marks elided lines; user_input,
     # reason_toggle, send_btn and clear_btn are presumably created there -
     # confirm against the full app.py.
@@ -115,9 +134,8 @@ with gr.Blocks() as demo:
     # respond() is a generator, so each yield updates the three outputs.
     send_btn.click(
         respond,
         inputs=[user_input, chat_state, reason_toggle],
-        outputs=[chat_box, chat_state, usage_counter]
     )
     clear_btn.click(fn=clear_chat, inputs=[], outputs=[chat_box, chat_state, usage_counter])
 # Turn on Gradio's event queue for this app.
 demo.queue()

 # Fallback model/tokenizer handles; load_models() declares both as globals
 # and assigns fallback_model.
 fallback_model = fallback_tokenizer = None
 # Per-client usage records; respond() looks entries up by request host and
 # expects the keys "count" and "last_seen".
 usage_info = {}
+# Load both models
 def load_models():
     # Populate the module-level model/tokenizer globals and return a status
     # string naming both checkpoints.
     global primary_model, primary_tokenizer, fallback_model, fallback_tokenizer
     # trust_remote_code=True allows the checkpoint's own Python code to run
     # while the tokenizer is loaded.
     primary_tokenizer = AutoTokenizer.from_pretrained(PRIMARY_MODEL, trust_remote_code=True)
     # NOTE(review): no visible statement assigns primary_model or
     # fallback_tokenizer although both are declared global; the diff view
     # appears to elide those lines here - confirm against the full app.py.
     # Fallback weights are requested in float16, moved to `device`, and
     # switched to eval mode.
     fallback_model = AutoModelForCausalLM.from_pretrained(FALLBACK_MODEL, torch_dtype=torch.float16).to(device).eval()
     return f"✅ Loaded: {PRIMARY_MODEL} with fallback {FALLBACK_MODEL}"
+# Build a Qwen-style chat prompt
 def build_chat_prompt(history, user_input, reasoning_enabled):
     # Flatten earlier turns plus the new user message into one prompt string
     # that ends on an open assistant tag.
     # The prompt begins with "/think " or "/no_think " depending on
     # reasoning_enabled.
     prefix = "/think " if reasoning_enabled else "/no_think "
     prompt = ""
     # history is iterated as 2-item (user_msg, bot_msg) pairs.
     # NOTE(review): confirm these turn tags match the chat template the
     # loaded checkpoints were trained with.
     for user_msg, bot_msg in history:
+        prompt += "<|user|>\n" + user_msg + "\n<|assistant|>\n" + bot_msg + "\n"
+    prompt += "<|user|>\n" + user_input + "\n<|assistant|>\n"
     return prefix + prompt
+# Collapse <think> blocks into hidden details
 def format_thinking(text):
     # Turn model output into HTML: a closed <think>...</think> block becomes
     # a collapsible "Show reasoning" section; anything else is just escaped.
     # re.search returns only the first closed block; a <think> without its
     # closing tag does not match and falls through to the plain escape.
     match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
     if match:
         # NOTE(review): `escape` is imported outside this view - presumably
         # html.escape; confirm.
         reasoning = escape(match.group(1).strip())
         # re.sub replaces every closed block with the placeholder, while only
         # the first block's content was captured above.
         visible = re.sub(r"<think>.*?</think>", "[thinking...]", text, flags=re.DOTALL).strip()
         # Both pieces are escaped before being placed inside the markup.
+        return (
+            escape(visible)
+            + "<br><details><summary>🧠 Show reasoning</summary><pre>"
+            + reasoning
+            + "</pre></details>"
+        )
     return escape(text)
+# Stream only the new assistant tokens (no prompt echo)
 def generate_stream(prompt, use_fallback=False, max_length=100, temperature=0.2, top_p=0.9):
     # Generator: sample at most max_length tokens one at a time and yield
     # the text generated so far (without the prompt) after each token.
     # use_fallback selects which model/tokenizer pair is used.
     model = fallback_model if use_fallback else primary_model
     tokenizer = fallback_tokenizer if use_fallback else primary_tokenizer
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
     generated = input_ids
+    # We’ll accumulate only the new assistant text:
+    assistant_text = ""
     for _ in range(max_length):
         # The whole sequence so far is passed to the model on every step;
         # only the last position's logits are kept, divided by temperature.
         logits = model(generated).logits[:, -1, :] / temperature
         # NOTE(review): sorted_indices and mask are not defined in the visible
         # lines; the diff view appears to elide the statements that build
         # them (presumably the top_p filtering) - confirm against app.py.
         mask[..., 0] = 0
         filtered = logits.clone()
         filtered[:, sorted_indices[mask]] = -float("Inf")
         # Sample one token id from the filtered distribution and append it.
         next_token = torch.multinomial(torch.softmax(filtered, dim=-1), 1)
         generated = torch.cat([generated, next_token], dim=-1)
         # Only the newly sampled token is decoded here.
         new_text = tokenizer.decode(next_token[0])
+        assistant_text += new_text
+        # Strip leading assistant tag if it shows up
+        if assistant_text.startswith("<|assistant|>"):
+            assistant_text = assistant_text[len("<|assistant|>"):]
+        yield assistant_text
         # The EOS check runs after the yield, so the EOS token's decoded text
         # is already part of the last yielded string.
         if next_token.item() == tokenizer.eos_token_id:
             break
+# Main respond handler
 def respond(message, history, reasoning_enabled, request: gr.Request):
     # Generator handler wired to send_btn: each yield is a
     # (chat_box, chat_state, usage_counter) update while the reply streams.
     # Usage is tracked per client host; "unknown" is used when request is falsy.
     ip = request.client.host if request else "unknown"
     now = time.time()
     info = usage_info.get(ip, {"count": 0, "last_seen": 0})
+    # Reset count if idle
     if now - info["last_seen"] > RESET_AFTER_SECONDS:
         info["count"] = 0
     # NOTE(review): use_fallback, model_used and remaining are used below but
     # not defined in the visible lines; the diff view appears to elide the
     # statements that set them - confirm against the full app.py.
     # The prompt gets the stripped message; the history entry keeps it as typed.
     prompt = build_chat_prompt(history, message.strip(), reasoning_enabled)
     # `+` builds a new list, so the list object passed in is not appended to.
     history = history + [[message, ""]]
+    # Stream only the assistant’s new text
+    for partial in generate_stream(prompt, use_fallback=use_fallback):
+        formatted = format_thinking(partial)
         # NOTE(review): this HTML (formatted reply plus model label) is stored
         # in the history that is yielded as chat_state, so build_chat_prompt()
         # receives it as bot_msg on the next turn.
         history[-1][1] = f"{formatted}<br><sub style='color:gray'>({model_used})</sub>"
         yield history, history, f"🧠 A3 messages left: {remaining}"
 def clear_chat():
     # Reset values for (chat_box, chat_state, usage_counter); the counter
     # text is hard-coded to 5 messages.
     return [], [], "🧠 A3 messages left: 5"
+# Build Gradio UI
 with gr.Blocks() as demo:
     # Page title.
+    gr.Markdown("# 🤖 SamAI – Qwen-Chat Mode")
     # Read-only status field labelled "Model Status".
     model_status = gr.Textbox(interactive=False, label="Model Status")
     # Read-only counter; its initial text matches what clear_chat() returns.
     usage_counter = gr.Textbox(value="🧠 A3 messages left: 5", interactive=False, show_label=False)
     chat_box = gr.Chatbot(type="tuples")
     # Conversation state handed to respond() as `history`.
     chat_state = gr.State([])
     # NOTE(review): user_input, reason_toggle, send_btn and clear_btn are not
     # created in the visible lines; the diff view appears to elide them -
     # confirm against the full app.py.
     # respond() is a generator, so each yield updates the three outputs.
     send_btn.click(
         respond,
         inputs=[user_input, chat_state, reason_toggle],
+        outputs=[chat_box, chat_state, usage_counter],
     )
     clear_btn.click(fn=clear_chat, inputs=[], outputs=[chat_box, chat_state, usage_counter])
 # Turn on Gradio's event queue for this app.
 demo.queue()