Spaces:

Tohirju
/

Ameena_e3

Paused

App Files Files Community

Tohirju committed on Jul 23

Commit

858b02e

verified ·

1 Parent(s): 337b29f

Update app.py

Browse files

Files changed (1) hide show

app.py +258 -55

app.py CHANGED Viewed

@@ -1,64 +1,267 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import os
+import time
+from typing import Iterator
+import threading
+# Global variables
+# Module-level state written by load_model() and read by chat_with_model()
+# and get_model_status().
+# llm: the loaded Llama instance, or None until loading succeeds.
+llm = None
+# model_loading: True until load_model() has finished, whether it succeeded or failed.
+model_loading = True
+# model_error: failure message recorded by load_model(), or None if no error occurred.
+model_error = None
+def load_model():
+    """Fetch and initialise the GGUF model, recording the outcome in module state.
+
+    On success the module-level ``llm`` holds the ``Llama`` instance. On any
+    ``Exception`` the message is stored in ``model_error`` instead. In both
+    cases ``model_loading`` is set to ``False`` so callers stop reporting
+    "loading". Progress is printed to stdout.
+    """
+    global llm, model_loading, model_error
+    try:
+        print("🔄 Loading model...")
+        # Imported here so that an import failure is reported via model_error.
+        from llama_cpp import Llama
+        # Settings for CPU-only inference.
+        cpu_settings = dict(
+            n_ctx=2048,          # context window, in tokens
+            n_threads=None,      # let llama-cpp-python pick the thread count
+                                 # NOTE(review): confirm whether None means all cores
+            n_gpu_layers=0,      # no layers offloaded to a GPU
+            use_mmap=True,       # memory-map the model file
+            use_mlock=False,     # do not lock the model in RAM
+            n_batch=512,         # prompt-processing batch size
+            verbose=False,       # quieter backend logging
+            offload_kqv=False,   # do not offload the KV cache
+            f16_kv=True,         # NOTE(review): may be ignored by recent releases; confirm
+        )
+        llm = Llama.from_pretrained(
+            repo_id="Tohirju/Ameena_Qwen3-8B_e3_Quantised_gguf",
+            filename="Ameena_Qwen3-8B_e3.gguf",
+            **cpu_settings,
+        )
+        model_loading = False
+        print("✅ Model loaded successfully!")
+    except Exception as exc:
+        # Record the error before clearing the loading flag, so a reader never
+        # sees "not loading" without an error message being available.
+        model_error = f"Model loading failed: {exc!s}"
+        model_loading = False
+        print(f"❌ {model_error}")
+def chat_with_model(
+    message: str,
+    history: list,
+    system_message: str = "Шумо ёвари хуб ҳастед ва ба забони тоҷикӣ ҷавоб медиҳед.",
+    max_tokens: int = 150,
+    temperature: float = 0.7,
+    top_p: float = 0.9,
+) -> Iterator[str]:
+    """Stream the model's reply to ``message`` as a progressively longer string.
+
+    Args:
+        message: The new user message.
+        history: Earlier turns as ``(user_text, assistant_text)`` pairs; falsy
+            entries are skipped.
+        system_message: Optional system prompt; ignored when blank.
+        max_tokens: Upper bound on generated tokens.
+        temperature: Sampling temperature.
+        top_p: Nucleus-sampling cutoff.
+
+    Yields:
+        The accumulated reply after each non-empty streamed chunk. If the
+        model is not usable, or generation raises, a single status or error
+        string is yielded instead.
+    """
+    # Work out whether the model is usable; checks run in the same order as
+    # the loader's state transitions (loading -> error -> missing).
+    if model_loading:
+        notice = "⏳ Model is still loading, please wait..."
+    elif model_error:
+        notice = f"❌ Model error: {model_error}"
+    elif llm is None:
+        notice = "❌ Model not loaded. Please refresh the page."
+    else:
+        notice = None
+    if notice is not None:
+        yield notice
+        return
+    try:
+        conversation = []
+        if system_message.strip():
+            conversation.append({"role": "system", "content": system_message})
+        # Replay earlier turns, user first and then assistant, dropping empty sides.
+        for user_text, assistant_text in history:
+            for role, text in (("user", user_text), ("assistant", assistant_text)):
+                if text:
+                    conversation.append({"role": role, "content": text})
+        conversation.append({"role": "user", "content": message})
+        chunks = llm.create_chat_completion(
+            messages=conversation,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            stream=True,
+            stop=["</s>", "User:", "Human:", "Assistant:"],
+            repeat_penalty=1.1,
+        )
+        reply_so_far = ""
+        for chunk in chunks:
+            # Only chunks whose delta has non-empty "content" extend the reply.
+            piece = chunk["choices"][0]["delta"].get("content")
+            if piece:
+                reply_so_far += piece
+                yield reply_so_far
+    except Exception as exc:
+        yield f"❌ Generation error: {exc!s}"
+def get_model_status():
+    """Return a one-line status string describing the loader's current state."""
+    if model_loading:
+        return "🔄 Loading model... Please wait."
+    if model_error:
+        return f"❌ Error: {model_error}"
+    if llm is None:
+        # Not loading, no error, and no model: a state the loader should not produce.
+        return "❓ Unknown status"
+    return "✅ Model ready!"
+# Load model in background thread
+# The thread is started at import time, before the interface below is built,
+# so the rest of the module keeps executing while load_model() runs.
+# daemon=True means this thread does not keep the process alive on exit.
+model_thread = threading.Thread(target=load_model, daemon=True)
+model_thread.start()
+# Create Gradio interface
+# Components are created in the order they appear inside this `with` block.
+with gr.Blocks(
+    title="🇹🇯 Ameena Qwen3-8B Tajik Language Model",
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {
+        max-width: 800px !important;
+        margin: auto !important;
+    }
+    """
+) as demo:
+    gr.Markdown("""
+    # 🇹🇯 Ameena Qwen3-8B - Tajik Language Model
+    **Model**: Quantized GGUF (4GB) | **Backend**: CPU Only | **Language**: Tajik
+    Base model: Qwen3-8B fine-tuned for Tajik language
+    """)
+    # Model status
+    # The initial text is computed once, when the interface is built; the
+    # handlers that output to status_display replace it later.
+    status_display = gr.Markdown(get_model_status())
+    # Main chat interface
+    chatbot = gr.Chatbot(
+        height=400,
+        show_label=False,
+        show_copy_button=True,
+    )
+    # Message box and Send button share one row (scale 4 : 1).
+    with gr.Row():
+        msg = gr.Textbox(
+            placeholder="Салом! Саволи худро дар ин ҷо бинависед... (Hello! Write your question here...)",
+            show_label=False,
+            scale=4
+        )
+        submit_btn = gr.Button("Send", scale=1, variant="primary")
+    # Advanced settings
+    # Collapsed by default; these values are passed to respond() as inputs.
+    with gr.Accordion("⚙️ Settings", open=False):
+        system_msg = gr.Textbox(
+            value="Шумо ёвари хуб ҳастед ва ба забони тоҷикӣ ҷавоб медиҳед.",
+            label="System Message (Tajik)",
+            info="Instructions for the model in Tajik language"
+        )
+        with gr.Row():
+            max_tokens = gr.Slider(
+                minimum=50,
+                maximum=300,
+                value=150,
+                step=10,
+                label="Max Tokens",
+                info="Maximum response length"
+            )
+            temperature = gr.Slider(
+                minimum=0.1,
+                maximum=1.5,
+                value=0.7,
+                step=0.1,
+                label="Temperature",
+                info="Response creativity (higher = more creative)"
+            )
+            top_p = gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.9,
+                step=0.05,
+                label="Top-p",
+                info="Nucleus sampling parameter"
+            )
+    # Example prompts
+    # Each example is a one-element list because there is a single input (msg).
+    gr.Examples(
+        examples=[
+            ["Салом! Чӣ хел ҳастед?"],
+            ["Тоҷикистон дар куҷо ҷойгир аст?"],
+            ["Барномасозӣ чист ва чӣ гуна кор мекунад?"],
+            ["Оиди забони тоҷикӣ маълумот диҳед"],
+            ["Шеър дар бораи табиат нависед"],
+        ],
+        inputs=msg,
+        label="💡 Example Questions"
+    )
+    def respond(message, history, system_message, max_tokens, temperature, top_p):
+        """Append the user's message to the chat and stream the model's reply.
+
+        This is a generator. Every yielded value is a
+        ``(chat_history, textbox_value)`` pair matching ``outputs=[chatbot, msg]``;
+        the textbox value is always ``""`` so the input box is cleared.
+
+        Args:
+            message: Text from the input box; blank input adds no turn.
+            history: Current chatbot value as ``[user, assistant]`` pairs
+                (``None`` or empty before the first turn).
+            system_message, max_tokens, temperature, top_p: Forwarded
+                unchanged to ``chat_with_model``.
+
+        Yields:
+            The updated history after the user turn is added, then again each
+            time the partial assistant reply grows.
+        """
+        # Work on a copy so the caller's list is not mutated, and tolerate a
+        # chatbot value of None.
+        history = list(history or [])
+        # In a generator, `return value` is never delivered as an output, so
+        # the blank-input case must yield explicitly before stopping.
+        if not message or not message.strip():
+            yield history, ""
+            return
+        prior_turns = list(history)
+        history.append([message, None])
+        # Show the user's message right away instead of waiting for the first
+        # generated token (or forever, if the model produces no text).
+        yield history, ""
+        for partial_response in chat_with_model(
+            message, prior_turns, system_message, max_tokens, temperature, top_p
+        ):
+            history[-1][1] = partial_response
+            yield history, ""
+    def clear_chat():
+        """Return an empty history and an empty textbox value (for [chatbot, msg])."""
+        return [], ""
+    def update_status():
+        """Return the current status string from get_model_status()."""
+        return get_model_status()
+    # Event handlers
+    # Clicking Send and pressing Enter in the textbox run the same handler
+    # with the same inputs and outputs.
+    submit_btn.click(
+        respond,
+        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p],
+        outputs=[chatbot, msg]
+    )
+    msg.submit(
+        respond,
+        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p],
+        outputs=[chatbot, msg]
+    )
+    # Clear button
+    # Created here, so it is placed after the examples section in the layout.
+    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
+    clear_btn.click(clear_chat, outputs=[chatbot, msg])
+    # Refresh status button
+    refresh_btn = gr.Button("🔄 Refresh Status", variant="secondary")
+    refresh_btn.click(update_status, outputs=status_display)
+    # Refresh the status on page load and then every 5 seconds; nothing here
+    # stops the polling once the model has finished loading.
+    # NOTE(review): newer Gradio releases may require gr.Timer instead of
+    # `every=` on event listeners; confirm against the installed version.
+    demo.load(update_status, outputs=status_display, every=5)
 if __name__ == "__main__":
+    # Start the web server only when run as a script. It listens on all
+    # network interfaces (0.0.0.0) at port 7860, shows errors in the UI, and
+    # does not create a public share link.
+    # NOTE(review): 7860 is presumably the port the hosting platform expects; confirm.
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        share=False,
+        quiet=False,
+    )