Spaces:

Smilyai-labs
/

Sam-chat-full

Sleeping

App Files Files Community

Boning c commited on Jul 17

Commit

9fe966b

verified ·

1 Parent(s): b612b32

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -69

app.py CHANGED Viewed

@@ -1,26 +1,33 @@
 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import re, time, json
 from html import escape
 PRIMARY_MODEL = "Smilyai-labs/Sam-reason-A3"
 FALLBACK_MODEL = "Smilyai-labs/Sam-reason-A1"
-USAGE_LIMIT = 5
-RESET_MS = 20 * 60 * 1000
-device = "cuda" if torch.cuda.is_available() else "cpu"
 primary_model = primary_tokenizer = None
 fallback_model = fallback_tokenizer = None
 def load_models():
     global primary_model, primary_tokenizer, fallback_model, fallback_tokenizer
     primary_tokenizer = AutoTokenizer.from_pretrained(PRIMARY_MODEL, trust_remote_code=True)
-    primary_model = AutoModelForCausalLM.from_pretrained(PRIMARY_MODEL, torch_dtype=torch.float16).to(device).eval()
-    fallback_tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL, trust_remote_code=True)
-    fallback_model = AutoModelForCausalLM.from_pretrained(FALLBACK_MODEL, torch_dtype=torch.float16).to(device).eval()
-    return f"✅ Loaded {PRIMARY_MODEL} + fallback {FALLBACK_MODEL}"
 def build_chat_prompt(history, user_input, reasoning_enabled):
     system_flag = "/think" if reasoning_enabled else "/no_think"
     prompt = f"<|system|>\n{system_flag}\n"
@@ -29,124 +36,151 @@ def build_chat_prompt(history, user_input, reasoning_enabled):
     prompt += f"<|user|>\n{user_input}\n<|assistant|>\n"
     return prompt
 def format_thinking(text):
     match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
     if not match:
         return escape(text)
     reasoning = escape(match.group(1).strip())
     visible = re.sub(r"<think>.*?</think>", "[thinking...]", text, flags=re.DOTALL).strip()
-    return escape(visible) + "<br><details><summary>🧠 Show reasoning</summary><pre>" + reasoning + "</pre></details>"
-def generate_stream(prompt, use_fallback=False, max_length=100, temperature=0.2, top_p=0.9):
-    model = fallback_model if use_fallback else primary_model
     tokenizer = fallback_tokenizer if use_fallback else primary_tokenizer
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
     generated = input_ids
     assistant_text = ""
     for _ in range(max_length):
         logits = model(generated).logits[:, -1, :] / temperature
-        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
         probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
         mask = probs > top_p
         mask[..., 1:] = mask[..., :-1].clone()
-        mask[..., 0] = 0
         filtered = logits.clone()
-        filtered[:, sorted_indices[mask]] = -float("Inf")
         next_token = torch.multinomial(torch.softmax(filtered, dim=-1), 1)
         generated = torch.cat([generated, next_token], dim=-1)
         new_text = tokenizer.decode(next_token[0], skip_special_tokens=False)
         assistant_text += new_text
         if assistant_text.startswith("<|assistant|>"):
             assistant_text = assistant_text[len("<|assistant|>"):]
         if "<|user|>" in new_text:
             break
         yield assistant_text
         if next_token.item() == tokenizer.eos_token_id:
             break
 def respond(message, history, reasoning_enabled, limit_json):
-    info = json.loads(limit_json) if limit_json else {"count": 0}
     count = info.get("count", 0)
     use_fallback = count > USAGE_LIMIT
-    remaining = max(0, USAGE_LIMIT - count)
-    model_label = "A3" if not use_fallback else "Fallback A1"
     prompt = build_chat_prompt(history, message.strip(), reasoning_enabled)
     history = history + [[message, ""]]
     yield history, history, f"🧠 A3 left: {remaining}", "Generating…"
-    for chunk in generate_stream(prompt, use_fallback=use_fallback):
         formatted = format_thinking(chunk)
-        history[-1][1] = f"{formatted}<br><sub style='color:gray'>({model_label})</sub>"
         yield history, history, f"🧠 A3 left: {remaining}", "Generating…"
     yield history, history, f"🧠 A3 left: {remaining}", "Send"
 def clear_chat():
     return [], [], "🧠 A3 left: 5", "Send"
 with gr.Blocks() as demo:
     gr.HTML(f"""
-    <script>
-      function updateUsageLimit() {{
-        let key = "samai_limit";
-        let now = Date.now();
-        let record = JSON.parse(localStorage.getItem(key) || "null");
-        if (!record || (now - record.lastSeen) > {RESET_MS}) {{
-          record = {{count: 0, lastSeen: now}};
-        }}
-        record.count += 1;
-        record.lastSeen = now;
-        localStorage.setItem(key, JSON.stringify(record));
-        document.getElementById("limit_json").value = JSON.stringify(record);
-      }}
-      function setGeneratingText() {{
-        document.getElementById("send_btn").innerText = "Generating…";
-      }}
-      function setIdleText() {{
-        document.getElementById("send_btn").innerText = "Send";
-      }}
-    </script>
-    <style>
-      .send-circle {{
-        border-radius: 50%;
-        height: 40px;
-        width: 40px;
-        padding: 0;
-        font-size: 12px;
-        text-align: center;
-      }}
-    </style>
-    """)
-    gr.Markdown("# 🤖 SamAI – Chat Reasoning (Gradio v3 Compatible)")
-    limit_json = gr.Textbox(visible=False, elem_id="limit_json")
-    model_status = gr.Textbox(interactive=False, label="Model Status")
     usage_counter = gr.Textbox("🧠 A3 left: 5", interactive=False, show_label=False)
-    chat_box = gr.Chatbot(type="tuples")
     chat_state = gr.State([])
     with gr.Row():
-        user_input = gr.Textbox(placeholder="Ask anything...", show_label=False, scale=6)
         reason_toggle = gr.Checkbox(label="Reason", value=True, scale=1)
-        send_btn = gr.Button("Send", elem_id="send_btn", elem_classes=["send-circle"], scale=1)
-    update_btn = gr.Button(visible=False)
     clear_btn = gr.Button("Clear")
     model_status.value = load_models()
-    update_btn.click(None, _js="updateUsageLimit")
-    send_btn.click(None, _js="setGeneratingText").then(
         fn=respond,
         inputs=[user_input, chat_state, reason_toggle, limit_json],
         outputs=[chat_box, chat_state, usage_counter, send_btn]
-    ).then(fn=None, _js="setIdleText")
-    clear_btn.click(fn=clear_chat,
-                    inputs=[],
-                    outputs=[chat_box, chat_state, usage_counter, send_btn]
     )
 demo.queue()

 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
+import re, json
 from html import escape
+# ─── Config ─────────────────────────────────────────────────────────
 PRIMARY_MODEL = "Smilyai-labs/Sam-reason-A3"
 FALLBACK_MODEL = "Smilyai-labs/Sam-reason-A1"
+USAGE_LIMIT   = 5
+RESET_MS      = 20 * 60 * 1000   # 20 minutes in milliseconds
+device        = "cuda" if torch.cuda.is_available() else "cpu"
 primary_model = primary_tokenizer = None
 fallback_model = fallback_tokenizer = None
+# ─── Load Models ───────────────────────────────────────────────────────
 def load_models():
     global primary_model, primary_tokenizer, fallback_model, fallback_tokenizer
     primary_tokenizer = AutoTokenizer.from_pretrained(PRIMARY_MODEL, trust_remote_code=True)
+    primary_model     = AutoModelForCausalLM.from_pretrained(
+        PRIMARY_MODEL, torch_dtype=torch.float16
+    ).to(device).eval()
+    fallback_tokenizer= AutoTokenizer.from_pretrained(FALLBACK_MODEL, trust_remote_code=True)
+    fallback_model    = AutoModelForCausalLM.from_pretrained(
+        FALLBACK_MODEL, torch_dtype=torch.float16
+    ).to(device).eval()
+    return f"✅ Loaded {PRIMARY_MODEL} with fallback {FALLBACK_MODEL}"
+# ─── Build Qwen-style Prompt ──────────────────────────────────────────
 def build_chat_prompt(history, user_input, reasoning_enabled):
     system_flag = "/think" if reasoning_enabled else "/no_think"
     prompt = f"<|system|>\n{system_flag}\n"
     prompt += f"<|user|>\n{user_input}\n<|assistant|>\n"
     return prompt
+# ─── Collapse <think> Blocks ──────────────────────────────────────────
 def format_thinking(text):
     match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
     if not match:
         return escape(text)
     reasoning = escape(match.group(1).strip())
     visible = re.sub(r"<think>.*?</think>", "[thinking...]", text, flags=re.DOTALL).strip()
+    return (
+        escape(visible)
+        + "<br><details><summary>🧠 Show reasoning</summary>"
+        + f"<pre>{reasoning}</pre></details>"
+    )
+# ─── Token‐by‐Token Streaming ─────────────────────────────────────────
+def generate_stream(prompt, use_fallback=False,
+                    max_length=100, temperature=0.2, top_p=0.9):
+    model     = fallback_model if use_fallback else primary_model
     tokenizer = fallback_tokenizer if use_fallback else primary_tokenizer
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
     generated = input_ids
     assistant_text = ""
     for _ in range(max_length):
         logits = model(generated).logits[:, -1, :] / temperature
+        sorted_logits, idxs = torch.sort(logits, descending=True)
         probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
         mask = probs > top_p
         mask[..., 1:] = mask[..., :-1].clone()
+        mask[..., 0]  = 0
         filtered = logits.clone()
+        filtered[:, idxs[mask]] = -float("Inf")
         next_token = torch.multinomial(torch.softmax(filtered, dim=-1), 1)
         generated = torch.cat([generated, next_token], dim=-1)
         new_text = tokenizer.decode(next_token[0], skip_special_tokens=False)
         assistant_text += new_text
+        # strip the opening assistant tag
         if assistant_text.startswith("<|assistant|>"):
             assistant_text = assistant_text[len("<|assistant|>"):]
+        # stop if model tries to start a new user turn
         if "<|user|>" in new_text:
             break
         yield assistant_text
         if next_token.item() == tokenizer.eos_token_id:
             break
+# ─── Main Respond Handler ─────────────────────────────────────────────
 def respond(message, history, reasoning_enabled, limit_json):
+    info  = json.loads(limit_json) if limit_json else {"count": 0}
     count = info.get("count", 0)
     use_fallback = count > USAGE_LIMIT
+    remaining    = max(0, USAGE_LIMIT - count)
+    model_label  = "A3" if not use_fallback else "Fallback A1"
+    # show "Generating…" immediately
     prompt = build_chat_prompt(history, message.strip(), reasoning_enabled)
     history = history + [[message, ""]]
     yield history, history, f"🧠 A3 left: {remaining}", "Generating…"
+    # stream assistant reply
+    for chunk in generate_stream(prompt, use_fallback):
         formatted = format_thinking(chunk)
+        history[-1][1] = (
+            f"{formatted}<br><sub style='color:gray'>({model_label})</sub>"
+        )
         yield history, history, f"🧠 A3 left: {remaining}", "Generating…"
+    # final: reset to Send
     yield history, history, f"🧠 A3 left: {remaining}", "Send"
 def clear_chat():
     return [], [], "🧠 A3 left: 5", "Send"
+# ─── Gradio UI ─────────────────────────────────────────────────────────
 with gr.Blocks() as demo:
+    # Inject client-side JS + CSS
     gr.HTML(f"""
+<script>
+  // bump/reset usage in localStorage and write to hidden textbox
+  function updateUsageLimit() {{
+    const key = "samai_limit";
+    const now = Date.now();
+    let rec = JSON.parse(localStorage.getItem(key) || "null");
+    if (!rec || now - rec.lastSeen > {RESET_MS}) {{
+      rec = {{count:0, lastSeen: now}};
+    }}
+    rec.count += 1;
+    rec.lastSeen = now;
+    localStorage.setItem(key, JSON.stringify(rec));
+    document.getElementById("limit_json").value = JSON.stringify(rec);
+  }}
+  // on Send click: update limit & flip button text
+  document.addEventListener("DOMContentLoaded", () => {{
+    const btn = document.getElementById("send_btn");
+    btn.addEventListener("click", () => {{
+      updateUsageLimit();
+      btn.innerText = "Generating…";
+    }});
+  }});
+</script>
+<style>
+  .send-circle {{
+    border-radius: 50%;
+    height: 40px;
+    width: 40px;
+    padding: 0;
+    font-size: 12px;
+    text-align: center;
+  }}
+</style>
+""")
+    gr.Markdown("# 🤖 SamAI – Chat Reasoning (Final)")
+    # carry usage JSON from JS → Python
+    limit_json    = gr.Textbox(visible=False, elem_id="limit_json")
+    model_status  = gr.Textbox(interactive=False, label="Model Status")
     usage_counter = gr.Textbox("🧠 A3 left: 5", interactive=False, show_label=False)
+    chat_box   = gr.Chatbot(type="tuples")
     chat_state = gr.State([])
     with gr.Row():
+        user_input    = gr.Textbox(placeholder="Ask anything...", show_label=False, scale=6)
         reason_toggle = gr.Checkbox(label="Reason", value=True, scale=1)
+        send_btn      = gr.Button("Send", elem_id="send_btn", elem_classes=["send-circle"], scale=1)
     clear_btn = gr.Button("Clear")
     model_status.value = load_models()
+    send_btn.click(
         fn=respond,
         inputs=[user_input, chat_state, reason_toggle, limit_json],
         outputs=[chat_box, chat_state, usage_counter, send_btn]
+    )
+    clear_btn.click(
+        fn=clear_chat,
+        inputs=[],
+        outputs=[chat_box, chat_state, usage_counter, send_btn]
     )
 demo.queue()