Boning c committed
Commit d5529e8 · verified · 1 Parent(s): d84b5b3

Update app.py

Files changed (1)
    app.py  +119  -68
app.py CHANGED
@@ -1,75 +1,89 @@
 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import re
-import time
+import re, time, json
 from html import escape

-# Model config
-PRIMARY_MODEL = "Smilyai-labs/Sam-reason-A3"
+# ─── Model Config ─────────────────────────────────────────────────────────────
+PRIMARY_MODEL = "Smilyai-labs/Sam-reason-A3"
 FALLBACK_MODEL = "Smilyai-labs/Sam-reason-A1"
-USAGE_LIMIT = 5
-RESET_AFTER_SECONDS = 20 * 60
-device = "cuda" if torch.cuda.is_available() else "cpu"
+USAGE_LIMIT = 5            # max messages before fallback
+RESET_MS = 20 * 60 * 1000  # 20 minutes in milliseconds
+device = "cuda" if torch.cuda.is_available() else "cpu"

 primary_model = primary_tokenizer = None
 fallback_model = fallback_tokenizer = None
-usage_info = {}

-# Load models
+# ─── Load Models ──────────────────────────────────────────────────────────────
 def load_models():
     global primary_model, primary_tokenizer, fallback_model, fallback_tokenizer
     primary_tokenizer = AutoTokenizer.from_pretrained(PRIMARY_MODEL, trust_remote_code=True)
-    primary_model = AutoModelForCausalLM.from_pretrained(PRIMARY_MODEL, torch_dtype=torch.float16).to(device).eval()
-    fallback_tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL, trust_remote_code=True)
-    fallback_model = AutoModelForCausalLM.from_pretrained(FALLBACK_MODEL, torch_dtype=torch.float16).to(device).eval()
+    primary_model = AutoModelForCausalLM.from_pretrained(PRIMARY_MODEL,
+                                                         torch_dtype=torch.float16
+                                                         ).to(device).eval()
+    fallback_tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL, trust_remote_code=True)
+    fallback_model = AutoModelForCausalLM.from_pretrained(FALLBACK_MODEL,
+                                                          torch_dtype=torch.float16
+                                                          ).to(device).eval()
     return f"✅ Loaded: {PRIMARY_MODEL} with fallback {FALLBACK_MODEL}"

-# Build prompt with full chat context
+# ─── Prompt Builder ───────────────────────────────────────────────────────────
 def build_chat_prompt(history, user_input, reasoning_enabled):
-    system = "/think" if reasoning_enabled else "/no_think"
-    prompt = f"<|system|>\n{system}\n"
-    for user_msg, bot_msg in history:
-        prompt += f"<|user|>\n{user_msg}\n<|assistant|>\n{bot_msg}\n"
+    # inject think/no_think as a system role
+    system_flag = "/think" if reasoning_enabled else "/no_think"
+    prompt = f"<|system|>\n{system_flag}\n"
+    for u, a in history:
+        prompt += f"<|user|>\n{u}\n<|assistant|>\n{a}\n"
     prompt += f"<|user|>\n{user_input}\n<|assistant|>\n"
     return prompt

-# Collapse <think> reasoning blocks
+# ─── Collapse <think> Blocks ──────────────────────────────────────────────────
 def format_thinking(text):
     match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
-    if match:
-        reasoning = escape(match.group(1).strip())
-        visible = re.sub(r"<think>.*?</think>", "[thinking...]", text, flags=re.DOTALL).strip()
-        return f"{escape(visible)}<br><details><summary>🧠 Show reasoning</summary><pre>{reasoning}</pre></details>"
-    return escape(text)
-
-# Stream tokens and stop on <|user|> tag
-def generate_stream(prompt, use_fallback=False, max_length=100, temperature=0.2, top_p=0.9):
-    model = fallback_model if use_fallback else primary_model
+    if not match:
+        return escape(text)
+    reasoning = escape(match.group(1).strip())
+    visible = re.sub(r"<think>.*?</think>", "[thinking...]", text,
+                     flags=re.DOTALL).strip()
+    return (
+        escape(visible)
+        + "<br><details><summary>🧠 Show reasoning</summary>"
+        + "<pre>" + reasoning + "</pre></details>"
+    )
+
+# ─── Token-Stream Generator ───────────────────────────────────────────────────
+def generate_stream(prompt, use_fallback=False,
+                    max_length=100, temperature=0.2, top_p=0.9):
+    model = fallback_model if use_fallback else primary_model
     tokenizer = fallback_tokenizer if use_fallback else primary_tokenizer
+
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
     generated = input_ids
     assistant_text = ""

     for _ in range(max_length):
         logits = model(generated).logits[:, -1, :] / temperature
-        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        sorted_logits, indices = torch.sort(logits, descending=True)
         probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
+
+        # top-p filtering
         mask = probs > top_p
         mask[..., 1:] = mask[..., :-1].clone()
-        mask[..., 0] = 0
+        mask[..., 0] = 0
         filtered = logits.clone()
-        filtered[:, sorted_indices[mask]] = -float("Inf")
+        filtered[:, indices[mask]] = -float("Inf")

+        # sample next token
         next_token = torch.multinomial(torch.softmax(filtered, dim=-1), 1)
         generated = torch.cat([generated, next_token], dim=-1)
-        new_text = tokenizer.decode(next_token[0])
+        new_text = tokenizer.decode(next_token[0], skip_special_tokens=False)
         assistant_text += new_text

+        # strip opening assistant tag
         if assistant_text.startswith("<|assistant|>"):
             assistant_text = assistant_text[len("<|assistant|>"):]

-        # ⛔️ Stop if model starts new user turn
+        # stop if model begins a new user turn
         if "<|user|>" in new_text:
             break

@@ -78,55 +92,92 @@ def generate_stream(prompt, use_fallback=False, max_length=100, temperature=0.2,
         if next_token.item() == tokenizer.eos_token_id:
             break

-# Respond to incoming message
-def respond(message, history, reasoning_enabled, request: gr.Request):
-    ip = request.client.host if request else "unknown"
-    now = time.time()
-    info = usage_info.get(ip, {"count": 0, "last_seen": 0})
-    if now - info["last_seen"] > RESET_AFTER_SECONDS:
-        info["count"] = 0
-    info["count"] += 1
-    info["last_seen"] = now
-    usage_info[ip] = info
-
-    use_fallback = info["count"] > USAGE_LIMIT
-    remaining = max(0, USAGE_LIMIT - info["count"])
-    model_used = "A3" if not use_fallback else "Fallback A1"
-
-    prompt = build_chat_prompt(history, message.strip(), reasoning_enabled)
-    history = history + [[message, ""]]
-
-    for partial in generate_stream(prompt, use_fallback=use_fallback):
-        formatted = format_thinking(partial)
-        history[-1][1] = f"{formatted}<br><sub style='color:gray'>({model_used})</sub>"
-        yield history, history, f"🧠 A3 messages left: {remaining}"
+# ─── Main Respond Handler ─────────────────────────────────────────────────────
+def respond(user_msg, history, reasoning_enabled, limit_json):
+    # parse usage info from localStorage
+    info = json.loads(limit_json) if limit_json else {"count": 0}
+    count = info.get("count", 0)
+
+    use_fallback = count > USAGE_LIMIT
+    remaining = max(0, USAGE_LIMIT - count)
+    model_label = "A3" if not use_fallback else "Fallback A1"
+
+    # build prompt & init history
+    prompt = build_chat_prompt(history, user_msg.strip(), reasoning_enabled)
+    history = history + [[user_msg, ""]]
+
+    # stream assistant reply
+    for chunk in generate_stream(prompt, use_fallback=use_fallback):
+        formatted = format_thinking(chunk)
+        history[-1][1] = (
+            f"{formatted}<br><sub style='color:gray'>({model_label})</sub>"
+        )
+        # during streaming, show Generating
+        yield history, history, f"🧠 A3 left: {remaining}", "Generating..."
+
+    # final update: set status back to Idle
+    yield history, history, f"🧠 A3 left: {remaining}", "Idle"

 def clear_chat():
-    return [], [], "🧠 A3 messages left: 5"
+    return [], [], "🧠 A3 left: 5", "Idle"

-# UI Layout
+# ─── Gradio UI ────────────────────────────────────────────────────────────────
 with gr.Blocks() as demo:
-    gr.Markdown("# 🤖 SamAI – Chat Reasoning (Qwen-Style)")
-    model_status = gr.Textbox(interactive=False, label="Model Status")
-    usage_counter = gr.Textbox(value="🧠 A3 messages left: 5", interactive=False, show_label=False)
-    chat_box = gr.Chatbot(type="tuples")
-    chat_state = gr.State([])
+    gr.HTML(  # inject localStorage logic
+        """
+        <script>
+        function updateUsageLimit() {
+            const key = "samai_limit";
+            let now = Date.now();
+            let record = JSON.parse(localStorage.getItem(key) || "null");
+            if (!record || (now - record.lastSeen) > {RESET_MS}) {{
+                record = {{count: 0, lastSeen: now}};
+            }}
+            record.count += 1;
+            record.lastSeen = now;
+            localStorage.setItem(key, JSON.stringify(record));
+            return record;
+        }
+        </script>
+        """.replace("{RESET_MS}", str(RESET_MS))
+    )
+
+    gr.Markdown("# 🤖 SamAI – Qwen Chat with Client-Side Limits")
+
+    # hidden box to carry JSON string from JS → Python
+    limit_json = gr.Textbox(visible=False)
+    model_status = gr.Textbox(interactive=False, label="Model Status")
+    usage_counter = gr.Textbox("🧠 A3 left: 5", interactive=False, show_label=False)
+    status_display = gr.Textbox("Idle", interactive=False, label="Status")
+
+    chat_box = gr.Chatbot(type="tuples")
+    chat_state = gr.State([])

     with gr.Row():
-        user_input = gr.Textbox(placeholder="Ask anything...", show_label=False, scale=6)
-        reason_toggle = gr.Checkbox(label="Reason", value=True, scale=1)
-        send_btn = gr.Button("Send", scale=1)
+        user_input = gr.Textbox(placeholder="Ask me anything…", show_label=False, scale=6)
+        reason_toggle = gr.Checkbox(label="Reason", value=True, scale=1)
+        send_btn = gr.Button("Send", scale=1)

     clear_btn = gr.Button("Clear Chat")
+
     model_status.value = load_models()

+    # first: JS updates localStorage → limit_json
     send_btn.click(
-        respond,
-        inputs=[user_input, chat_state, reason_toggle],
-        outputs=[chat_box, chat_state, usage_counter]
+        fn=None,
+        _js="() => JSON.stringify(updateUsageLimit())",
+        outputs=[limit_json]
+    ).then(
+        # then: call our Python respond() with that JSON
+        fn=respond,
+        inputs=[user_input, chat_state, reason_toggle, limit_json],
+        outputs=[chat_box, chat_state, usage_counter, status_display]
     )

-    clear_btn.click(fn=clear_chat, inputs=[], outputs=[chat_box, chat_state, usage_counter])
+    clear_btn.click(fn=clear_chat,
+                    inputs=[],
+                    outputs=[chat_box, chat_state, usage_counter, status_display]
+    )

 demo.queue()
 demo.launch()
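
For reference, the sampling loop in the new generate_stream above performs nucleus (top-p) filtering by hand. Below is a minimal standalone sketch of that one step, assuming PyTorch and a single [1, vocab_size] logits row; sample_top_p is an illustrative name and is not part of this commit.

import torch

def sample_top_p(logits, temperature=0.2, top_p=0.9):
    """Nucleus (top-p) sampling for a single [1, vocab_size] logits row."""
    logits = logits / temperature
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)

    # Mark everything outside the nucleus; shift right so the token that
    # crosses the threshold stays, and always keep the most likely token.
    mask = cumulative > top_p
    mask[..., 1:] = mask[..., :-1].clone()
    mask[..., 0] = False

    filtered = logits.clone()
    filtered[:, sorted_indices[mask]] = float("-inf")
    return torch.multinomial(torch.softmax(filtered, dim=-1), num_samples=1)

# Example: draw one token id from random logits over a toy 32-token vocabulary.
print(sample_top_p(torch.randn(1, 32)).item())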