Boning c committed · verified
Commit 7a55e85 · 1 Parent(s): d5529e8

Update app.py

Files changed (1)
  1. app.py  +73 -89
app.py CHANGED
@@ -4,32 +4,27 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 import re, time, json
 from html import escape

-# ─── Model Config ─────────────────────────────────────────────────────────────
-PRIMARY_MODEL = "Smilyai-labs/Sam-reason-A3"
+# ─── Config ───────────────────────────────────────────────────
+PRIMARY_MODEL = "Smilyai-labs/Sam-reason-A3"
 FALLBACK_MODEL = "Smilyai-labs/Sam-reason-A1"
-USAGE_LIMIT = 5  # max messages before fallback
-RESET_MS = 20 * 60 * 1000  # 20 minutes in milliseconds
-device = "cuda" if torch.cuda.is_available() else "cpu"
+USAGE_LIMIT = 5
+RESET_MS = 20 * 60 * 1000  # 20 min in ms
+device = "cuda" if torch.cuda.is_available() else "cpu"

 primary_model = primary_tokenizer = None
 fallback_model = fallback_tokenizer = None

-# ─── Load Models ────────────────────────────────────────────────────────────────
+# ─── Load Models ───────────────────────────────────────────────
 def load_models():
     global primary_model, primary_tokenizer, fallback_model, fallback_tokenizer
     primary_tokenizer = AutoTokenizer.from_pretrained(PRIMARY_MODEL, trust_remote_code=True)
-    primary_model = AutoModelForCausalLM.from_pretrained(PRIMARY_MODEL,
-                                                         torch_dtype=torch.float16
-                                                         ).to(device).eval()
-    fallback_tokenizer= AutoTokenizer.from_pretrained(FALLBACK_MODEL, trust_remote_code=True)
-    fallback_model = AutoModelForCausalLM.from_pretrained(FALLBACK_MODEL,
-                                                          torch_dtype=torch.float16
-                                                          ).to(device).eval()
-    return f"✅ Loaded: {PRIMARY_MODEL} with fallback {FALLBACK_MODEL}"
-
-# ─── Prompt Builder ────────────────────────────────────────────
+    primary_model = AutoModelForCausalLM.from_pretrained(PRIMARY_MODEL, torch_dtype=torch.float16).to(device).eval()
+    fallback_tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL, trust_remote_code=True)
+    fallback_model = AutoModelForCausalLM.from_pretrained(FALLBACK_MODEL, torch_dtype=torch.float16).to(device).eval()
+    return f"✅ Loaded {PRIMARY_MODEL} with fallback {FALLBACK_MODEL}"
+
+# ─── Prompt Builder ────────────────────────────────────────────
 def build_chat_prompt(history, user_input, reasoning_enabled):
-    # inject think/no_think as a system role
     system_flag = "/think" if reasoning_enabled else "/no_think"
     prompt = f"<|system|>\n{system_flag}\n"
     for u, a in history:
@@ -37,53 +32,42 @@ def build_chat_prompt(history, user_input, reasoning_enabled):
     prompt += f"<|user|>\n{user_input}\n<|assistant|>\n"
     return prompt

-# ─── Collapse <think> Blocks ───────────────────────────────────
+# ─── Collapse <think> blocks ──────────────────────────────────
 def format_thinking(text):
     match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
     if not match:
         return escape(text)
     reasoning = escape(match.group(1).strip())
-    visible = re.sub(r"<think>.*?</think>", "[thinking...]", text,
-                     flags=re.DOTALL).strip()
-    return (
-        escape(visible)
-        + "<br><details><summary>🧠 Show reasoning</summary>"
-        + "<pre>" + reasoning + "</pre></details>"
-    )
+    visible = re.sub(r"<think>.*?</think>", "[thinking...]", text, flags=re.DOTALL).strip()
+    return escape(visible) + "<br><details><summary>🧠 Show reasoning</summary><pre>" + reasoning + "</pre></details>"

-# ─── Token-Stream Generator ───────────────────────────────────
-def generate_stream(prompt, use_fallback=False,
-                    max_length=100, temperature=0.2, top_p=0.9):
-    model = fallback_model if use_fallback else primary_model
+# ─── Stream Generator ─────────────────────────────────────────
+def generate_stream(prompt, use_fallback=False, max_length=100, temperature=0.2, top_p=0.9):
+    model = fallback_model if use_fallback else primary_model
     tokenizer = fallback_tokenizer if use_fallback else primary_tokenizer
-
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
     generated = input_ids
     assistant_text = ""

     for _ in range(max_length):
         logits = model(generated).logits[:, -1, :] / temperature
-        sorted_logits, indices = torch.sort(logits, descending=True)
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
         probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)

-        # top-p filtering
         mask = probs > top_p
         mask[..., 1:] = mask[..., :-1].clone()
-        mask[..., 0] = 0
+        mask[..., 0] = 0
         filtered = logits.clone()
-        filtered[:, indices[mask]] = -float("Inf")
+        filtered[:, sorted_indices[mask]] = -float("Inf")

-        # sample next token
         next_token = torch.multinomial(torch.softmax(filtered, dim=-1), 1)
         generated = torch.cat([generated, next_token], dim=-1)
         new_text = tokenizer.decode(next_token[0], skip_special_tokens=False)
         assistant_text += new_text

-        # strip opening assistant tag
         if assistant_text.startswith("<|assistant|>"):
             assistant_text = assistant_text[len("<|assistant|>"):]

-        # stop if model begins a new user turn
         if "<|user|>" in new_text:
             break

@@ -92,91 +76,91 @@ def generate_stream(prompt, use_fallback=False,
         if next_token.item() == tokenizer.eos_token_id:
             break

-# ─── Main Respond Handler ─────────────────────────────────────
-def respond(user_msg, history, reasoning_enabled, limit_json):
-    # parse usage info from localStorage
+# ─── Respond Handler ──────────────────────────────────────────
+def respond(message, history, reasoning_enabled, limit_json):
     info = json.loads(limit_json) if limit_json else {"count": 0}
     count = info.get("count", 0)

     use_fallback = count > USAGE_LIMIT
-    remaining = max(0, USAGE_LIMIT - count)
-    model_label = "A3" if not use_fallback else "Fallback A1"
+    remaining = max(0, USAGE_LIMIT - count)
+    model_label = "A3" if not use_fallback else "Fallback A1"
+    prompt = build_chat_prompt(history, message.strip(), reasoning_enabled)
+    history = history + [[message, ""]]

-    # build prompt & init history
-    prompt = build_chat_prompt(history, user_msg.strip(), reasoning_enabled)
-    history = history + [[user_msg, ""]]
+    yield history, history, f"🧠 A3 left: {remaining}", "Generating…"

-    # stream assistant reply
     for chunk in generate_stream(prompt, use_fallback=use_fallback):
         formatted = format_thinking(chunk)
-        history[-1][1] = (
-            f"{formatted}<br><sub style='color:gray'>({model_label})</sub>"
-        )
-        # during streaming, show Generating
-        yield history, history, f"🧠 A3 left: {remaining}", "Generating..."
+        history[-1][1] = f"{formatted}<br><sub style='color:gray'>({model_label})</sub>"
+        yield history, history, f"🧠 A3 left: {remaining}", "Generating…"

-    # final update: set status back to Idle
-    yield history, history, f"🧠 A3 left: {remaining}", "Idle"
+    yield history, history, f"🧠 A3 left: {remaining}", "Send"

 def clear_chat():
-    return [], [], "🧠 A3 left: 5", "Idle"
+    return [], [], "🧠 A3 left: 5", "Send"

-# ─── Gradio UI ─────────────────────────────────────────────────
+# ─── Gradio UI ────────────────────────────────────────────────
 with gr.Blocks() as demo:
-    gr.HTML(  # inject localStorage logic
-        """
-        <script>
-        function updateUsageLimit() {
-            const key = "samai_limit";
-            let now = Date.now();
-            let record = JSON.parse(localStorage.getItem(key) || "null");
-            if (!record || (now - record.lastSeen) > {RESET_MS}) {{
-                record = {{count: 0, lastSeen: now}};
-            }}
-            record.count += 1;
-            record.lastSeen = now;
-            localStorage.setItem(key, JSON.stringify(record));
-            return record;
-        }
-        </script>
-        """.replace("{RESET_MS}", str(RESET_MS))
+    gr.HTML(
+        """
+        <script>
+        function updateUsageLimit() {
+            let key = "samai_limit";
+            let now = Date.now();
+            let record = JSON.parse(localStorage.getItem(key) || "null");
+            if (!record || (now - record.lastSeen) > """ + str(RESET_MS) + """) {
+                record = {count: 0, lastSeen: now};
+            }
+            record.count += 1;
+            record.lastSeen = now;
+            localStorage.setItem(key, JSON.stringify(record));
+            return record;
+        }
+        </script>
+        <style>
+        .send-circle {
+            border-radius: 50%;
+            height: 40px;
+            width: 40px;
+            padding: 0;
+            font-size: 12px;
+            text-align: center;
+        }
+        </style>
+        """
     )

-    gr.Markdown("# 🤖 SamAI – Qwen Chat with Client-Side Limits")
+    gr.Markdown("# 🤖 SamAI – Chat Reasoning UI")

-    # hidden box to carry JSON string from JS → Python
     limit_json = gr.Textbox(visible=False)
     model_status = gr.Textbox(interactive=False, label="Model Status")
-    usage_counter = gr.Textbox("🧠 A3 left: 5", interactive=False, show_label=False)
-    status_display = gr.Textbox("Idle", interactive=False, label="Status")
+    usage_counter = gr.Textbox(value="🧠 A3 left: 5", interactive=False, show_label=False)

     chat_box = gr.Chatbot(type="tuples")
     chat_state= gr.State([])

     with gr.Row():
-        user_input = gr.Textbox(placeholder="Ask me anything…", show_label=False, scale=6)
+        user_input = gr.Textbox(placeholder="Ask anything...", show_label=False, scale=6)
         reason_toggle= gr.Checkbox(label="Reason", value=True, scale=1)
-        send_btn = gr.Button("Send", scale=1)
+        send_btn = gr.Button("Send", elem_classes=["send-circle"], scale=1)

-    clear_btn = gr.Button("Clear Chat")
+    clear_btn = gr.Button("Clear")

     model_status.value = load_models()

-    # first: JS updates localStorage → limit_json
     send_btn.click(
-        fn=None,
-        _js="() => JSON.stringify(updateUsageLimit())",
-        outputs=[limit_json]
+        None,
+        _js="() => JSON.stringify(updateUsageLimit())",
+        outputs=[limit_json]
     ).then(
-        # then: call our Python respond() with that JSON
-        fn=respond,
-        inputs=[user_input, chat_state, reason_toggle, limit_json],
-        outputs=[chat_box, chat_state, usage_counter, status_display]
+        fn=respond,
+        inputs=[user_input, chat_state, reason_toggle, limit_json],
+        outputs=[chat_box, chat_state, usage_counter, send_btn]
     )

     clear_btn.click(fn=clear_chat,
-        inputs=[],
-        outputs=[chat_box, chat_state, usage_counter, status_display]
+        inputs=[],
+        outputs=[chat_box, chat_state, usage_counter, send_btn]
     )

 demo.queue()
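
For reference, a minimal standalone sketch of the nucleus (top-p) filtering step that generate_stream applies at each decoding step; the dummy logits and vocabulary size below are illustrative only and are not part of the commit.

    import torch

    top_p = 0.9
    # Fake next-token logits for a 6-token vocabulary (batch size 1).
    logits = torch.tensor([[2.0, 1.0, 0.5, 0.1, -1.0, -2.0]])

    # Sort logits, take the cumulative probability of the sorted distribution,
    # and mask every token past the top_p threshold (shifted by one position
    # so the token that crosses the boundary is still kept).
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
    mask = probs > top_p
    mask[..., 1:] = mask[..., :-1].clone()
    mask[..., 0] = 0

    # Ban the masked tokens and sample from what remains.
    filtered = logits.clone()
    filtered[:, sorted_indices[mask]] = -float("Inf")
    next_token = torch.multinomial(torch.softmax(filtered, dim=-1), 1)
    print(next_token.item())

With batch size 1, sorted_indices[mask] flattens to the list of token ids to exclude, which is why the column assignment on filtered works in this setting.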
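
A separate small sketch of how the localStorage record written by updateUsageLimit() feeds the fallback decision in respond(); the count and timestamp are made-up example values, and only the record shape comes from the diff.

    import json

    USAGE_LIMIT = 5

    # Example payload the _js hook would place into the hidden limit_json textbox.
    limit_json = json.dumps({"count": 7, "lastSeen": 1700000000000})

    info = json.loads(limit_json) if limit_json else {"count": 0}
    count = info.get("count", 0)
    use_fallback = count > USAGE_LIMIT        # True here, so Sam-reason-A1 would be used
    remaining = max(0, USAGE_LIMIT - count)   # 0 A3 messages left
    print(use_fallback, remaining)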