Committed by Boning c (verified)
Commit 24a676b · Parent: 62ca512

Update latest_stable.txt

Files changed (1): latest_stable.txt +92 -0
latest_stable.txt CHANGED
@@ -0,0 +1,92 @@
+ import gradio as gr
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # Model definitions
+ PRIMARY_MODEL = "Smilyai-labs/Sam-reason-A1"
+ FALLBACK_MODEL = "Smilyai-labs/Sam-reason-S2.1"
+ USAGE_LIMIT = 10
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Globals for models and tokenizers
+ primary_model, primary_tokenizer = None, None
+ fallback_model, fallback_tokenizer = None, None
+
+ # IP-based usage tracking
+ usage_counts = {}
+
+ def load_models():
+     global primary_model, primary_tokenizer, fallback_model, fallback_tokenizer
+     primary_tokenizer = AutoTokenizer.from_pretrained(PRIMARY_MODEL)
+     primary_model = AutoModelForCausalLM.from_pretrained(PRIMARY_MODEL).to(device).eval()
+     fallback_tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL)
+     fallback_model = AutoModelForCausalLM.from_pretrained(FALLBACK_MODEL).to(device).eval()
+     return f"Models loaded: {PRIMARY_MODEL} + fallback {FALLBACK_MODEL}"
+
+ def generate_stream(prompt, use_fallback=False, max_length=100, temperature=0.7, top_p=0.9):
+     model = fallback_model if use_fallback else primary_model
+     tokenizer = fallback_tokenizer if use_fallback else primary_tokenizer
+     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+     generated = input_ids
+     output_text = tokenizer.decode(input_ids[0])
+
+     for _ in range(max_length):
+         with torch.no_grad(): outputs = model(generated)  # no gradients needed for generation
+         logits = outputs.logits[:, -1, :] / temperature
+         sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+         probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
+         mask = probs > top_p
+         mask[..., 1:] = mask[..., :-1].clone()  # shift right so the first token past top_p is kept
+         mask[..., 0] = False  # never drop the most likely token
+         filtered = logits.clone()
+         filtered[:, sorted_indices[mask]] = -float("Inf")  # ban everything outside the nucleus
+         next_token = torch.multinomial(torch.softmax(filtered, dim=-1), 1)
+         generated = torch.cat([generated, next_token], dim=-1)
+         new_text = tokenizer.decode(next_token[0])
+         output_text += new_text
+         yield output_text
+         if next_token.item() == tokenizer.eos_token_id:
+             break
+
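The nucleus-sampling block inside the loop above is the least obvious part of the commit. As an editor's illustration (not part of the diff), here is a minimal, self-contained sketch of the same filtering step on made-up logits, assuming only torch is installed:

    import torch

    toy_logits = torch.tensor([[2.0, 1.0, 0.0, -1.0]])               # batch of 1, vocabulary of 4
    sorted_logits, sorted_indices = torch.sort(toy_logits, descending=True)
    cum_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)   # ~[0.64, 0.88, 0.97, 1.00]
    mask = cum_probs > 0.9                                            # same role as top_p above
    mask[..., 1:] = mask[..., :-1].clone()                            # shift: keep the first token past the threshold
    mask[..., 0] = False                                              # never drop the most likely token
    filtered = toy_logits.clone()
    filtered[:, sorted_indices[mask]] = -float("Inf")                 # only token id 3 is banned here
    print(torch.softmax(filtered, dim=-1))                            # probability mass left on tokens 0, 1, 2

With top_p = 0.9, tokens are kept until their cumulative probability first exceeds 0.9 (here, the three most likely tokens), and the rest are set to -inf so multinomial sampling can never pick them.
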
+ def respond(msg, history, reasoning_enabled, request: gr.Request):
+     ip = request.client.host if request else "unknown"
+     usage_counts[ip] = usage_counts.get(ip, 0) + 1
+     use_fallback = usage_counts[ip] > USAGE_LIMIT
+     model_used = "A1" if not use_fallback else "Fallback S2.1"
+     prefix = "/think " if reasoning_enabled else "/no_think "
+     prompt = prefix + msg.strip()
+     history = history + [[msg, ""]]
+     for output in generate_stream(prompt, use_fallback):
+         history[-1][1] = output + f" ({model_used})"
+         yield history, history
+
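As a quick illustration of the per-IP limit in respond (again an editor's sketch, not part of the diff), the eleventh request from the same address is the first one routed to the fallback model; the address below is a made-up documentation example:

    counts = {}
    ip = "203.0.113.7"                                # example client address
    for request_number in range(1, 13):
        counts[ip] = counts.get(ip, 0) + 1
        if request_number in (10, 11):
            print(request_number, counts[ip] > 10)    # USAGE_LIMIT is 10 -> prints "10 False", then "11 True"
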
+ def clear_chat():
+     return [], []
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🤖 SmilyAI Reasoning Chat • Token-by-Token + IP Usage Limits")
+
+     model_status = gr.Textbox(label="Model Load Status", interactive=False)
+     chat_box = gr.Chatbot(label="Chat", type="tuples")
+     chat_state = gr.State([])
+
+     with gr.Row():
+         user_input = gr.Textbox(placeholder="Your message here...", show_label=False, scale=6)
+         reason_toggle = gr.Checkbox(label="Reason", value=True, scale=1)
+         send_btn = gr.Button("Send", scale=1)
+
+     clear_btn = gr.Button("Clear Chat")
+
+     model_status.value = load_models()  # load both checkpoints while the UI is being built
+
+     send_btn.click(
+         respond,
+         inputs=[user_input, chat_state, reason_toggle],
+         outputs=[chat_box, chat_state]
+     )
+
+     clear_btn.click(fn=clear_chat, inputs=[], outputs=[chat_box, chat_state])
+
+ demo.queue()
+ demo.launch()
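
For a quick sanity check of the generation path outside the Gradio UI, something along these lines should work once the definitions above are in scope (for example, pasted into a notebook, or imported with demo.launch() temporarily commented out, since that call blocks); the prompt text is made up for illustration:

    print(load_models())                              # loads both Smilyai-labs checkpoints onto `device`
    latest = None
    for partial in generate_stream("/think What is 17 * 3?", max_length=40):
        latest = partial                              # each yield is the full decoded text so far
    print(latest)

Running the file directly (saved under a .py name, with gradio, torch and transformers installed) starts the full chat UI instead; Gradio serves it on http://localhost:7860 by default.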