Update app.py
app.py
CHANGED
@@ -1,64 +1,267 @@
--- app.py (old version)
 import gradio as gr
-from huggingface_hub import InferenceClient
 
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
 
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
 
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
 
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 
 
 if __name__ == "__main__":
-    demo.launch()
+++ app.py (new version)
 import gradio as gr
+import os
+import time
+from typing import Iterator
+import threading
 
+# Global variables
+llm = None
+model_loading = True
+model_error = None
 
+def load_model():
+    """Load the GGUF model"""
+    global llm, model_loading, model_error
+
+    try:
+        print("🔄 Loading model...")
+        from llama_cpp import Llama
+
+        # Initialize model with optimized settings for CPU-only inference
+        llm = Llama.from_pretrained(
+            repo_id="Tohirju/Ameena_Qwen3-8B_e3_Quantised_gguf",
+            filename="Ameena_Qwen3-8B_e3.gguf",
+            # CPU-optimized settings
+            n_ctx=2048,  # Context length
+            n_threads=None,  # Use all available CPU threads
+            n_gpu_layers=0,  # CPU only
+            use_mmap=True,  # Memory mapping for efficiency
+            use_mlock=False,  # Don't lock memory (can cause issues on some systems)
+            n_batch=512,  # Batch size for prompt processing
+            verbose=False,  # Reduce output noise
+            # Additional optimizations
+            offload_kqv=False,  # Keep KV cache on CPU
+            f16_kv=True,  # Use 16-bit for KV cache
+        )
+
+        model_loading = False
+        print("✅ Model loaded successfully!")
+
+    except Exception as e:
+        model_error = f"Model loading failed: {str(e)}"
+        model_loading = False
+        print(f"❌ {model_error}")
 
+def chat_with_model(
+    message: str,
+    history: list,
+    system_message: str = "Шумо ёвари хуб ҳастед ва ба забони тоҷикӣ ҷавоб медиҳед.",
+    max_tokens: int = 150,
+    temperature: float = 0.7,
+    top_p: float = 0.9,
+) -> Iterator[str]:
+    """
+    Chat function that streams responses
+    """
+    # Check if model is ready
+    if model_loading:
+        yield "⏳ Model is still loading, please wait..."
+        return
+
+    if model_error:
+        yield f"❌ Model error: {model_error}"
+        return
+
+    if llm is None:
+        yield "❌ Model not loaded. Please refresh the page."
+        return
+
+    try:
+        # Build conversation history
+        messages = []
+
+        # Add system message if provided
+        if system_message.strip():
+            messages.append({"role": "system", "content": system_message})
+
+        # Add conversation history
+        for user_msg, assistant_msg in history:
+            if user_msg:
+                messages.append({"role": "user", "content": user_msg})
+            if assistant_msg:
+                messages.append({"role": "assistant", "content": assistant_msg})
+
+        # Add current message
+        messages.append({"role": "user", "content": message})
+
+        # Generate response with streaming
+        response_stream = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            stream=True,
+            stop=["</s>", "User:", "Human:", "Assistant:"],
+            repeat_penalty=1.1,
+        )
+
+        # Stream the response
+        partial_response = ""
+        for chunk in response_stream:
+            if chunk["choices"][0]["delta"].get("content"):
+                partial_response += chunk["choices"][0]["delta"]["content"]
+                yield partial_response
+
+    except Exception as e:
+        yield f"❌ Generation error: {str(e)}"
 
+def get_model_status():
+    """Get current model status"""
+    if model_loading:
+        return "🔄 Loading model... Please wait."
+    elif model_error:
+        return f"❌ Error: {model_error}"
+    elif llm is not None:
+        return "✅ Model ready!"
+    else:
+        return "❓ Unknown status"
 
+# Load model in background thread
+model_thread = threading.Thread(target=load_model, daemon=True)
+model_thread.start()
 
+# Create Gradio interface
+with gr.Blocks(
+    title="🇹🇯 Ameena Qwen3-8B Tajik Language Model",
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {
+        max-width: 800px !important;
+        margin: auto !important;
+    }
+    """
+) as demo:
+
+    gr.Markdown("""
+    # 🇹🇯 Ameena Qwen3-8B - Tajik Language Model
+
+    **Model**: Quantized GGUF (4GB) | **Backend**: CPU Only | **Language**: Tajik
+
+    Base model: Qwen3-8B fine-tuned for Tajik language
+    """)
+
+    # Model status
+    status_display = gr.Markdown(get_model_status())
+
+    # Main chat interface
+    chatbot = gr.Chatbot(
+        height=400,
+        show_label=False,
+        show_copy_button=True,
+    )
+
+    with gr.Row():
+        msg = gr.Textbox(
+            placeholder="Салом! Саволи худро дар ин ҷо бинависед... (Hello! Write your question here...)",
+            show_label=False,
+            scale=4
+        )
+        submit_btn = gr.Button("Send", scale=1, variant="primary")
+
+    # Advanced settings
+    with gr.Accordion("⚙️ Settings", open=False):
+        system_msg = gr.Textbox(
+            value="Шумо ёвари хуб ҳастед ва ба забони тоҷикӣ ҷавоб медиҳед.",
+            label="System Message (Tajik)",
+            info="Instructions for the model in Tajik language"
+        )
+
+        with gr.Row():
+            max_tokens = gr.Slider(
+                minimum=50,
+                maximum=300,
+                value=150,
+                step=10,
+                label="Max Tokens",
+                info="Maximum response length"
+            )
+            temperature = gr.Slider(
+                minimum=0.1,
+                maximum=1.5,
+                value=0.7,
+                step=0.1,
+                label="Temperature",
+                info="Response creativity (higher = more creative)"
+            )
+            top_p = gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.9,
+                step=0.05,
+                label="Top-p",
+                info="Nucleus sampling parameter"
+            )
+
+    # Example prompts
+    gr.Examples(
+        examples=[
+            ["Салом! Чӣ хел ҳастед?"],
+            ["Тоҷикистон дар куҷо ҷойгир аст?"],
+            ["Барномасозӣ чист ва чӣ гуна кор мекунад?"],
+            ["Оиди забони тоҷикӣ маълумот диҳед"],
+            ["Шеър дар бораи табиат нависед"],
+        ],
+        inputs=msg,
+        label="💡 Example Questions"
+    )
+
+    def respond(message, history, system_message, max_tokens, temperature, top_p):
+        """Handle user message and generate response"""
+        if not message.strip():
+            return history, ""
+
+        # Add user message to history
+        history.append([message, None])
+
+        # Generate response
+        response_generator = chat_with_model(
+            message, history[:-1], system_message, max_tokens, temperature, top_p
+        )
+
+        # Stream response
+        for partial_response in response_generator:
+            history[-1][1] = partial_response
+            yield history, ""
+
+        return history, ""
+
+    def clear_chat():
+        """Clear chat history"""
+        return [], ""
+
+    def update_status():
+        """Update model status display"""
+        return get_model_status()
+
+    # Event handlers
+    submit_btn.click(
+        respond,
+        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p],
+        outputs=[chatbot, msg]
+    )
+
+    msg.submit(
+        respond,
+        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p],
+        outputs=[chatbot, msg]
+    )
+
+    # Clear button
+    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
+    clear_btn.click(clear_chat, outputs=[chatbot, msg])
+
+    # Refresh status button
+    refresh_btn = gr.Button("🔄 Refresh Status", variant="secondary")
+    refresh_btn.click(update_status, outputs=status_display)
+
+    # Auto-refresh status every 5 seconds during loading
+    demo.load(update_status, outputs=status_display, every=5)
 
 if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        share=False,
+        quiet=False,
+    )
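
Note: this commit swaps the hosted InferenceClient for local llama-cpp-python inference, so the Space must now install that package. A minimal requirements.txt sketch under that assumption (pins omitted; llama-cpp-python typically compiles from source on CPU Spaces, so the first build is slow):

    gradio
    llama-cpp-python
    huggingface_hub  # Llama.from_pretrained uses it to download the GGUF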
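To sanity-check the model outside Gradio, the same llama-cpp-python calls the app makes can be replayed directly. A minimal sketch, assuming the packages above are installed and enough free disk for the roughly 4 GB GGUF download; the prompt is taken from the app's own examples, and max_tokens=50 is illustrative:

    from llama_cpp import Llama

    # Same repo and file the app loads; CPU-only, modest context for a quick check
    llm = Llama.from_pretrained(
        repo_id="Tohirju/Ameena_Qwen3-8B_e3_Quantised_gguf",
        filename="Ameena_Qwen3-8B_e3.gguf",
        n_ctx=2048,
        n_gpu_layers=0,
        verbose=False,
    )

    # Non-streaming variant; the app passes stream=True and accumulates deltas
    out = llm.create_chat_completion(
        messages=[{"role": "user", "content": "Салом! Чӣ хел ҳастед?"}],
        max_tokens=50,
    )
    print(out["choices"][0]["message"]["content"])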