Upload 8 files
- app.py +277 -314
- public/app.js +228 -0
- public/index.html +18 -0
- public/styles.css +335 -0
- readme.md +124 -0
- requirements.txt +10 -8
- utils/__init__.py +0 -0
- utils/model_utils.py +94 -0
app.py
CHANGED
@@ -1,330 +1,293 @@
Removed (previous version; most deleted lines are elided in the diff and only fragments remain):

#!/usr/bin/env python3
"""
…
- Configuration via environment variables with sensible defaults (see the AppConfig class)
- Deterministic model/tokenizer loading with optional prewarm via snapshot_download
- Separate sections: configuration, model, API schemas, routes
"""

from __future__ import annotations

import os
import …
import logging
from typing import …
import torch
from …
from fastapi …
from pydantic import BaseModel
…
    content: str
…
    model: …
    temperature: Optional[float] = …
…
        "app_name": CONFIG.app_name,
        "lang": CONFIG.app_lang,
        "model": CONFIG.model_id,
        "alias": CONFIG.model_alias,
        "device": DEVICE,
        "cache_dir": CACHE_DIR,
    }


@app.get("/v1/models")
def list_models():
    return {"object": "list", "data": [{"id": CONFIG.model_id, "object": "model"}]}


@app.post("/v1/chat/completions")
def chat_completions(req: ChatCompletionsRequest):
    """OpenAI-compatible Chat Completions (without streaming)."""
    # Convert messages to the format expected by the chat template
    msgs = [{"role": m.role, "content": m.content} for m in req.messages]

    if req.stream:
        def event_gen():
            try:
                input_ids = TOKENIZER.apply_chat_template(
                    msgs,
                    tokenize=True,
                    add_generation_prompt=True,
                    return_tensors="pt",
                ).to(MODEL.device)

                streamer = TextIteratorStreamer(
                    TOKENIZER,
                    skip_prompt=True,
                    skip_special_tokens=True,
                )

                gen_kwargs = dict(
                    input_ids=input_ids,
                    max_new_tokens=req.max_tokens or 1024,
                    do_sample=(req.temperature or 0) > 0,
                    temperature=req.temperature or 0.2,
                    top_p=req.top_p or 0.95,
                    pad_token_id=TOKENIZER.eos_token_id,
                    eos_token_id=TOKENIZER.eos_token_id,
                    use_cache=True,
                    streamer=streamer,
                )

                thread = threading.Thread(target=MODEL.generate, kwargs=gen_kwargs, daemon=True)
                thread.start()

                started = False
                for piece in streamer:
                    # first token => "typing" indication
                    if not started:
                        started = True
                    now = int(time.time())
                    chunk = {
                        "id": f"chatcmpl-{now}",
                        "object": "chat.completion.chunk",
                        "created": now,
                        "model": req.model or CONFIG.model_id,
                        "choices": [
                            {"index": 0, "delta": {"content": piece}, "finish_reason": None}
                        ],
                    }
                    yield f"data: {json.dumps(chunk, ensure_ascii=False)}\n\n"

                # closing chunk
                now = int(time.time())
                done_chunk = {
                    "id": f"chatcmpl-{now}",
                    "object": "chat.completion.chunk",
                    "created": now,
                    "model": req.model or CONFIG.model_id,
                    "choices": [
                        {"index": 0, "delta": {}, "finish_reason": "stop"}
                    ],
                }
                yield f"data: {json.dumps(done_chunk, ensure_ascii=False)}\n\n"
                yield "data: [DONE]\n\n"
            except Exception as e:
                err = {"error": str(e)}
                yield f"data: {json.dumps(err, ensure_ascii=False)}\n\n"
                yield "data: [DONE]\n\n"

        return StreamingResponse(event_gen(), media_type="text/event-stream")

    # Non-streaming variant
    input_ids = TOKENIZER.apply_chat_template(
        msgs,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(MODEL.device)

    outputs = MODEL.generate(
        input_ids=input_ids,
        max_new_tokens=req.max_tokens or 1024,
        do_sample=(req.temperature or 0) > 0,
        temperature=req.temperature or 0.2,
        top_p=req.top_p or 0.95,
        pad_token_id=TOKENIZER.eos_token_id,
        eos_token_id=TOKENIZER.eos_token_id,
        use_cache=True,
    )

    # Newly generated part after the prompt
    gen_ids = outputs[0][input_ids.shape[-1]:]
    text = TOKENIZER.decode(gen_ids, skip_special_tokens=True).strip()

    now = int(time.time())
    usage = {
        "prompt_tokens": int(input_ids.numel()),
        "completion_tokens": int(gen_ids.numel()),
        "total_tokens": int(input_ids.numel() + gen_ids.numel()),
    }

    return {
        "id": f"chatcmpl-{now}",
        "object": "chat.completion",
        "created": now,
        "model": req.model or CONFIG.model_id,
        "choices": [
            {
                "index": 0,
                "message": {
                    …
                }
            }
        ],
        …
    }

…
import uvicorn
…
Added (new version):

#!/usr/bin/env python3
"""
AI Chat Application for HuggingFace Spaces
Integration with Qwen/Qwen3-Coder-30B-A3B-Instruct model
OpenAI API compatibility features
"""

import os
import sys
import json
import logging
import time
from typing import Optional, Dict, Any, Generator
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import gradio as gr
from fastapi import FastAPI, HTTPException, Response
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import redis
import asyncio
import threading
from threading import Thread

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_NAME = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
DEFAULT_MAX_TOKENS = 1024
DEFAULT_TEMPERATURE = 0.7

class ConversationManager:
    """Manage conversation history and caching"""

    def __init__(self):
        self.redis_client = None
        try:
            self.redis_client = redis.Redis(host='localhost', port=6379, db=0)
            self.redis_client.ping()
        except Exception:
            logger.warning("Redis not available, using in-memory storage")
            self.conversations = {}

    def save_conversation(self, conv_id: str, messages: list) -> None:
        """Save conversation to cache"""
        try:
            if self.redis_client:
                self.redis_client.setex(conv_id, 86400, json.dumps(messages))  # 24 hours expiry
            else:
                self.conversations[conv_id] = messages
        except Exception as e:
            logger.error(f"Error saving conversation: {e}")

    def load_conversation(self, conv_id: str) -> list:
        """Load conversation from cache"""
        try:
            if self.redis_client:
                data = self.redis_client.get(conv_id)
                if data:
                    return json.loads(data)
            else:
                return self.conversations.get(conv_id, [])
        except Exception as e:
            logger.error(f"Error loading conversation: {e}")
            return []

class ModelManager:
    """Manage Qwen model loading and inference"""

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_model()

    def load_model(self) -> None:
        """Load the Qwen model"""
        try:
            logger.info(f"Loading model {MODEL_NAME} on {self.device}")
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                low_cpu_mem_usage=True,
                device_map="auto"
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

    def generate_response(self, prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE) -> str:
        """Generate response from the model"""
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

            # Generate without streaming for simple response
            generated = self.model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

            response = self.tokenizer.decode(generated[0], skip_special_tokens=True)
            # Remove the prompt from the response
            response = response[len(prompt):].strip()
            return response
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            raise

    def generate_streaming_response(self, prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE) -> Generator[str, None, None]:
        """Generate streaming response from the model"""
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

            # Create streamer for streaming response
            streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)

            # Start generation in a separate thread
            generation_kwargs = dict(
                inputs,
                streamer=streamer,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

            thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
            thread.start()

            # Yield tokens as they are generated
            for new_text in streamer:
                yield new_text

        except Exception as e:
            logger.error(f"Error generating streaming response: {e}")
            yield f"Error: {str(e)}"

# Initialize managers
conversation_manager = ConversationManager()
model_manager = ModelManager()

# FastAPI app for OpenAI API compatibility
app = FastAPI(title="AI Chat API", description="OpenAI API compatible interface for Qwen model")

class ChatMessage(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    messages: list[ChatMessage]
    model: str = MODEL_NAME
    max_tokens: Optional[int] = DEFAULT_MAX_TOKENS
    temperature: Optional[float] = DEFAULT_TEMPERATURE

class ChatResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: list
    usage: Dict[str, int]

@app.post("/v1/chat/completions", response_model=ChatResponse)
async def chat_completion(request: ChatRequest):
    """OpenAI API compatible chat completion endpoint"""
    try:
        # Convert messages to prompt
        prompt = ""
        for msg in request.messages:
            if msg.role == "system":
                prompt += f"System: {msg.content}\n"
            elif msg.role == "user":
                prompt += f"User: {msg.content}\n"
            elif msg.role == "assistant":
                prompt += f"Assistant: {msg.content}\n"

        # Generate response
        response_text = model_manager.generate_response(
            prompt,
            request.max_tokens or DEFAULT_MAX_TOKENS,
            request.temperature or DEFAULT_TEMPERATURE
        )

        # Return in OpenAI format
        return ChatResponse(
            id="chatcmpl-" + str(hash(prompt))[:10],
            created=int(time.time()),
            model=request.model,
            choices=[{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_text
                },
                "finish_reason": "stop"
            }],
            usage={
                "prompt_tokens": len(prompt.split()),
                "completion_tokens": len(response_text.split()),
                "total_tokens": len(prompt.split()) + len(response_text.split())
            }
        )
    except Exception as e:
        logger.error(f"Error in chat completion: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/chat")
async def chat_endpoint(request: dict):
    """Endpoint for frontend chat interface"""
    try:
        message = request.get("message", "")
        history = request.get("history", [])

        # Convert history to prompt
        prompt = ""
        for msg in history:
            if msg["role"] == "user":
                prompt += f"User: {msg['content']}\n"
            elif msg["role"] == "assistant":
                prompt += f"Assistant: {msg['content']}\n"
        prompt += f"User: {message}\nAssistant:"

        # Return streaming response
        return StreamingResponse(
            model_manager.generate_streaming_response(prompt),
            media_type="text/plain"
        )
    except Exception as e:
        logger.error(f"Error in chat endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e))

# Gradio interface
def predict(message, history):
    """Gradio prediction function"""
    # Convert history to prompt
    prompt = ""
    for human, ai in history:
        prompt += f"User: {human}\nAssistant: {ai}\n"
    prompt += f"User: {message}\nAssistant:"

    # Generate response
    response = model_manager.generate_response(prompt)
    return response

# Create Gradio interface
gradio_interface = gr.ChatInterface(
    fn=predict,
    title="AI Chat with Qwen Coder",
    description="Chat with Qwen/Qwen3-Coder-30B-A3B-Instruct model",
    examples=[
        ["Hello, how are you today?"],
        ["Can you explain quantum computing in simple terms?"],
        ["Write a Python function to calculate Fibonacci numbers"]
    ],
    cache_examples=False
)

# Serve static files
from fastapi.staticfiles import StaticFiles

# Combine FastAPI and Gradio
def launch_app():
    """Launch the combined FastAPI and Gradio app"""
    from fastapi.middleware.cors import CORSMiddleware

    # Add CORS middleware
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Mount static files
    app.mount("/public", StaticFiles(directory="public"), name="public")

    # Mount Gradio interface (gr.mount_gradio_app is Gradio's supported way
    # to attach a Blocks app to an existing FastAPI app)
    gr.mount_gradio_app(app, gradio_interface, path="/")

    # Run the app
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)

if __name__ == "__main__":
    launch_app()
public/app.js
ADDED
@@ -0,0 +1,228 @@
// AI Chat Application JavaScript Logic
// This file contains the React component for the chat interface

// Main App component
function App() {
    const [messages, setMessages] = React.useState([]);
    const [inputValue, setInputValue] = React.useState('');
    const [isLoading, setIsLoading] = React.useState(false);
    const [darkMode, setDarkMode] = React.useState(false);
    const messagesEndRef = React.useRef(null);

    // Scroll to bottom of messages
    const scrollToBottom = () => {
        messagesEndRef.current?.scrollIntoView({ behavior: "smooth" });
    };

    // Scroll to bottom when messages change
    React.useEffect(() => {
        scrollToBottom();
    }, [messages]);

    // Toggle dark mode
    const toggleDarkMode = () => {
        setDarkMode(!darkMode);
        document.documentElement.classList.toggle('dark', !darkMode);
    };

    // Handle input change
    const handleInputChange = (e) => {
        setInputValue(e.target.value);
    };

    // Handle form submission
    const handleSubmit = async (e) => {
        e.preventDefault();
        if (!inputValue.trim() || isLoading) return;

        // Add user message to chat
        const userMessage = { id: Date.now(), text: inputValue, sender: 'user' };
        setMessages(prev => [...prev, userMessage]);
        setInputValue('');
        setIsLoading(true);

        // Id of the placeholder AI message (declared before the try block so the
        // catch handler below can reference it as well)
        const aiMessageId = Date.now() + 1;

        try {
            // Add temporary AI message
            setMessages(prev => [...prev, { id: aiMessageId, text: '', sender: 'ai', isLoading: true }]);

            // Send request to backend
            const response = await fetch('/chat', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify({
                    message: inputValue,
                    history: messages.filter(m => !m.isLoading).map(m => ({
                        role: m.sender === 'user' ? 'user' : 'assistant',
                        content: m.text
                    }))
                })
            });

            if (!response.ok) {
                throw new Error(`HTTP error! status: ${response.status}`);
            }

            // Process streaming response
            const reader = response.body.getReader();
            const decoder = new TextDecoder();
            let aiResponse = '';

            while (true) {
                const { done, value } = await reader.read();
                if (done) break;

                const chunk = decoder.decode(value);
                aiResponse += chunk;

                // Update AI message with new content
                setMessages(prev => prev.map(msg =>
                    msg.id === aiMessageId
                        ? { ...msg, text: aiResponse, isLoading: false }
                        : msg
                ));
            }
        } catch (error) {
            console.error('Error sending message:', error);
            setMessages(prev => prev.map(msg =>
                msg.id === aiMessageId
                    ? { ...msg, text: 'Sorry, I encountered an error. Please try again.', isLoading: false, error: true }
                    : msg
            ));
        } finally {
            setIsLoading(false);
        }
    };

    // Copy message to clipboard
    const copyToClipboard = (text) => {
        navigator.clipboard.writeText(text).then(() => {
            // Show success message (could be a toast notification)
            console.log('Copied to clipboard');
        }).catch(err => {
            console.error('Failed to copy: ', err);
        });
    };

    // Clear chat history
    const clearChat = () => {
        setMessages([]);
    };

    return (
        <div className="chat-container">
            {/* Header */}
            <div className="chat-header flex justify-between items-center">
                <h1 className="text-2xl font-bold">AI Chat with Qwen Coder</h1>
                <div className="flex gap-2">
                    <button
                        onClick={toggleDarkMode}
                        className="btn btn-secondary"
                        aria-label="Toggle dark mode"
                    >
                        {darkMode ? (
                            <i className="fas fa-sun"></i>
                        ) : (
                            <i className="fas fa-moon"></i>
                        )}
                    </button>
                    <button
                        onClick={clearChat}
                        className="btn btn-secondary"
                        aria-label="Clear chat"
                    >
                        <i className="fas fa-trash"></i>
                    </button>
                </div>
            </div>

            {/* Chat messages area */}
            <div className="chat-messages">
                {messages.length === 0 ? (
                    <div className="flex flex-col items-center justify-center h-full text-center">
                        <h2 className="text-2xl font-bold mb-4">Welcome to AI Chat</h2>
                        <p className="text-lg mb-8">Start a conversation with Qwen Coder by typing a message below</p>
                        <div className="grid grid-cols-1 md:grid-cols-2 gap-4 w-full max-w-2xl">
                            <div className="bg-gray-100 dark:bg-gray-800 p-4 rounded-lg">
                                <h3 className="font-bold mb-2">Examples</h3>
                                <ul className="text-left">
                                    <li>"Explain quantum computing in simple terms"</li>
                                    <li>"Write a Python function to calculate Fibonacci numbers"</li>
                                    <li>"How do I make an HTTP request in JavaScript?"</li>
                                </ul>
                            </div>
                            <div className="bg-gray-100 dark:bg-gray-800 p-4 rounded-lg">
                                <h3 className="font-bold mb-2">Capabilities</h3>
                                <ul className="text-left">
                                    <li>Remembers previous conversation</li>
                                    <li>Understands complex instructions</li>
                                    <li>Generates code and explanations</li>
                                </ul>
                            </div>
                        </div>
                    </div>
                ) : (
                    messages.map((message) => (
                        <div
                            key={message.id}
                            className={`message-bubble relative ${message.sender === 'user' ? 'user' : 'ai'}`}
                        >
                            {message.sender === 'ai' && !message.isLoading && (
                                <button
                                    onClick={() => copyToClipboard(message.text)}
                                    className="copy-button"
                                    aria-label="Copy message"
                                >
                                    <i className="fas fa-copy"></i>
                                </button>
                            )}
                            {message.isLoading ? (
                                <div className="typing-indicator">
                                    <div className="typing-dot"></div>
                                    <div className="typing-dot"></div>
                                    <div className="typing-dot"></div>
                                </div>
                            ) : (
                                <div>{message.text}</div>
                            )}
                        </div>
                    ))
                )}
                <div ref={messagesEndRef} />
            </div>

            {/* Input area */}
            <div className="chat-input-area">
                <form onSubmit={handleSubmit} className="flex gap-2">
                    <input
                        type="text"
                        value={inputValue}
                        onChange={handleInputChange}
                        placeholder="Type your message here..."
                        className="chat-input"
                        disabled={isLoading}
                    />
                    <button
                        type="submit"
                        className="btn"
                        disabled={isLoading || !inputValue.trim()}
                    >
                        {isLoading ? (
                            <i className="fas fa-spinner fa-spin"></i>
                        ) : (
                            <i className="fas fa-paper-plane"></i>
                        )}
                    </button>
                </form>
                <div className="text-xs text-center mt-2 text-gray-500 dark:text-gray-400">
                    Qwen Coder can make mistakes. Consider checking important information.
                </div>
            </div>
        </div>
    );
}

// Render the app
ReactDOM.render(<App />, document.getElementById('root'));
public/index.html
ADDED
@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>AI Chat with Qwen Coder</title>
  <script src="https://cdn.tailwindcss.com"></script>
  <script src="https://unpkg.com/react@18/umd/react.development.js"></script>
  <script src="https://unpkg.com/react-dom@18/umd/react-dom.development.js"></script>
  <script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
  <link rel="stylesheet" href="styles.css">
</head>
<body class="bg-gray-50 dark:bg-gray-900 text-gray-900 dark:text-gray-100">
  <div id="root"></div>
  <script type="text/babel" src="app.js"></script>
</body>
</html>
public/styles.css
ADDED
@@ -0,0 +1,335 @@
/* Custom CSS variables for theming */
:root {
  /* Primary color palette */
  --primary-50: 240 249 255;
  --primary-100: 224 242 254;
  --primary-200: 186 230 253;
  --primary-300: 125 211 252;
  --primary-400: 56 189 248;
  --primary-500: 14 165 233;
  --primary-600: 2 132 199;
  --primary-700: 3 105 161;
  --primary-800: 7 89 133;
  --primary-900: 12 74 110;

  /* Secondary color palette */
  --secondary-50: 248 250 252;
  --secondary-100: 241 245 249;
  --secondary-200: 226 232 240;
  --secondary-300: 203 213 225;
  --secondary-400: 148 163 184;
  --secondary-500: 100 116 139;
  --secondary-600: 71 85 105;
  --secondary-700: 51 65 85;
  --secondary-800: 30 41 59;
  --secondary-900: 15 23 42;

  /* Accent colors */
  --accent-50: 254 249 195;
  --accent-100: 254 240 138;
  --accent-200: 253 230 138;
  --accent-300: 252 211 77;
  --accent-400: 251 191 36;
  --accent-500: 245 158 11;
  --accent-600: 217 119 6;
  --accent-700: 180 83 9;
  --accent-800: 146 64 14;
  --accent-900: 120 53 15;

  /* Gradient definitions */
  --gradient-primary: linear-gradient(135deg, hsl(var(--primary-500)), hsl(var(--accent-500)));
  --gradient-secondary: linear-gradient(135deg, hsl(var(--secondary-700)), hsl(var(--secondary-900)));

  /* Shadows */
  --shadow-sm: 0 1px 2px 0 rgba(0, 0, 0, 0.05);
  --shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1), 0 1px 2px -1px rgba(0, 0, 0, 0.1);
  --shadow-md: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -2px rgba(0, 0, 0, 0.1);
  --shadow-lg: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -4px rgba(0, 0, 0, 0.1);
  --shadow-xl: 0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 8px 10px -6px rgba(0, 0, 0, 0.1);
  --shadow-2xl: 0 25px 50px -12px rgba(0, 0, 0, 0.25);

  /* Transitions */
  --transition-fast: all 0.15s cubic-bezier(0.4, 0, 0.2, 1);
  --transition-normal: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
  --transition-slow: all 0.5s cubic-bezier(0.4, 0, 0.2, 1);
}

/* Dark mode variables */
.dark {
  --primary-50: 236 254 255;
  --primary-100: 207 250 254;
  --primary-200: 165 243 252;
  --primary-300: 103 232 249;
  --primary-400: 34 211 238;
  --primary-500: 6 182 212;
  --primary-600: 8 145 178;
  --primary-700: 14 116 144;
  --primary-800: 21 94 117;
  --primary-900: 22 78 99;
}

/* Base styles */
body {
  font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
  background-color: hsl(var(--secondary-50));
  transition: background-color var(--transition-normal);
}

.dark body {
  background-color: hsl(var(--secondary-900));
}

/* Chat container */
.chat-container {
  max-width: 1200px;
  margin: 0 auto;
  height: 100vh;
  display: flex;
  flex-direction: column;
  background-color: hsl(var(--secondary-50));
  transition: background-color var(--transition-normal);
}

.dark .chat-container {
  background-color: hsl(var(--secondary-900));
}

/* Header */
.chat-header {
  padding: 1rem;
  border-bottom: 1px solid hsl(var(--secondary-200));
  background-color: hsl(var(--secondary-50));
  transition: all var(--transition-normal);
}

.dark .chat-header {
  border-bottom: 1px solid hsl(var(--secondary-800));
  background-color: hsl(var(--secondary-900));
}

/* Chat messages area */
.chat-messages {
  flex: 1;
  overflow-y: auto;
  padding: 1rem;
  display: flex;
  flex-direction: column;
  gap: 1rem;
  background-color: hsl(var(--secondary-50));
  transition: background-color var(--transition-normal);
}

.dark .chat-messages {
  background-color: hsl(var(--secondary-900));
}

/* Message bubble */
.message-bubble {
  max-width: 80%;
  padding: 1rem 1.5rem;
  border-radius: 1rem;
  box-shadow: var(--shadow);
  transition: all var(--transition-normal);
}

.message-bubble.user {
  align-self: flex-end;
  background-color: hsl(var(--primary-500));
  color: white;
}

.message-bubble.ai {
  align-self: flex-start;
  background-color: hsl(var(--secondary-100));
  color: hsl(var(--secondary-900));
}

.dark .message-bubble.ai {
  background-color: hsl(var(--secondary-800));
  color: hsl(var(--secondary-100));
}

/* Input area */
.chat-input-area {
  padding: 1rem;
  border-top: 1px solid hsl(var(--secondary-200));
  background-color: hsl(var(--secondary-50));
  transition: all var(--transition-normal);
}

.dark .chat-input-area {
  border-top: 1px solid hsl(var(--secondary-800));
  background-color: hsl(var(--secondary-900));
}

/* Input field */
.chat-input {
  width: 100%;
  padding: 0.75rem 1rem;
  border-radius: 0.5rem;
  border: 1px solid hsl(var(--secondary-300));
  background-color: hsl(var(--secondary-100));
  color: hsl(var(--secondary-900));
  transition: all var(--transition-normal);
}

.dark .chat-input {
  border: 1px solid hsl(var(--secondary-700));
  background-color: hsl(var(--secondary-800));
  color: hsl(var(--secondary-100));
}

.chat-input:focus {
  outline: none;
  border-color: hsl(var(--primary-500));
  box-shadow: 0 0 0 3px hsla(var(--primary-500), 0.2);
}

/* Buttons */
.btn {
  padding: 0.5rem 1rem;
  border-radius: 0.5rem;
  font-weight: 500;
  transition: all var(--transition-normal);
  cursor: pointer;
  border: none;
  background-color: hsl(var(--primary-500));
  color: white;
}

.btn:hover {
  background-color: hsl(var(--primary-600));
}

.btn-secondary {
  background-color: hsl(var(--secondary-200));
  color: hsl(var(--secondary-900));
}

.dark .btn-secondary {
  background-color: hsl(var(--secondary-700));
  color: hsl(var(--secondary-100));
}

.btn-secondary:hover {
  background-color: hsl(var(--secondary-300));
}

.dark .btn-secondary:hover {
  background-color: hsl(var(--secondary-600));
}

/* Copy button */
.copy-button {
  position: absolute;
  top: 0.5rem;
  right: 0.5rem;
  padding: 0.25rem;
  border-radius: 0.25rem;
  background-color: hsl(var(--secondary-200));
  color: hsl(var(--secondary-700));
  opacity: 0;
  transition: all var(--transition-normal);
}

.message-bubble:hover .copy-button {
  opacity: 1;
}

.dark .copy-button {
  background-color: hsl(var(--secondary-700));
  color: hsl(var(--secondary-200));
}

/* Typing indicator */
.typing-indicator {
  display: flex;
  align-items: center;
  gap: 0.25rem;
  padding: 1rem 1.5rem;
  background-color: hsl(var(--secondary-100));
  border-radius: 1rem;
  width: fit-content;
  max-width: 80%;
  align-self: flex-start;
}

.dark .typing-indicator {
  background-color: hsl(var(--secondary-800));
}

.typing-dot {
  width: 0.5rem;
  height: 0.5rem;
  border-radius: 50%;
  background-color: hsl(var(--secondary-500));
  animation: typing 1.4s infinite ease-in-out;
}

.typing-dot:nth-child(1) {
  animation-delay: 0s;
}

.typing-dot:nth-child(2) {
  animation-delay: 0.2s;
}

.typing-dot:nth-child(3) {
  animation-delay: 0.4s;
}

@keyframes typing {
  0%, 60%, 100% {
    transform: translateY(0);
  }
  30% {
    transform: translateY(-5px);
  }
}

/* Responsive design */
@media (max-width: 768px) {
  .message-bubble {
    max-width: 90%;
  }

  .chat-header, .chat-input-area {
    padding: 0.75rem;
  }

  .chat-messages {
    padding: 0.75rem;
  }
}

/* Scrollbar styling */
::-webkit-scrollbar {
  width: 8px;
}

::-webkit-scrollbar-track {
  background: hsl(var(--secondary-100));
}

.dark ::-webkit-scrollbar-track {
  background: hsl(var(--secondary-800));
}

::-webkit-scrollbar-thumb {
  background: hsl(var(--secondary-300));
  border-radius: 4px;
}

.dark ::-webkit-scrollbar-thumb {
  background: hsl(var(--secondary-600));
}

::-webkit-scrollbar-thumb:hover {
  background: hsl(var(--secondary-400));
}

.dark ::-webkit-scrollbar-thumb:hover {
  background: hsl(var(--secondary-500));
}
readme.md
ADDED
@@ -0,0 +1,124 @@
# AI Chat Application for HuggingFace Spaces

A fully functional AI chat application for HuggingFace Spaces integrating Qwen3-Coder with an OpenAI-compatible API.

## Features

- Integration with the Qwen/Qwen3-Coder-30B-A3B-Instruct model
- OpenAI-compatible API endpoints
- Professional web interface modeled on the Perplexity AI design
- Responsive layout with TailwindCSS styling
- Dark/light mode support
- Real-time streaming responses
- Conversation history management
- Copy-response functionality
- Typing indicators
- Full GPU optimization
- Robust error handling and automatic connection recovery
- Caching mechanisms
- Ready for immediate deployment on HuggingFace Spaces

## Technology Stack

- **Backend**: Python, Gradio, FastAPI, Transformers, PyTorch
- **Frontend**: TailwindCSS, JavaScript, HTML5
- **Infrastructure**: Redis for caching, HuggingFace Spaces deployment

## Requirements

- Python 3.8+
- GPU with at least 24 GB VRAM (for the Qwen/Qwen3-Coder-30B-A3B-Instruct model)
- Redis server (optional, for conversation caching)

## Installation

1. Clone this repository:
```bash
git clone <repository-url>
cd ai-chat-app
```

2. Install dependencies:
```bash
pip install -r requirements.txt
```

3. Run the application:
```bash
python app.py
```

## Usage

### Web Interface

The application provides a web interface accessible at `http://localhost:7860` when running locally. The interface features:

- A chat interface similar to Perplexity AI
- Dark/light mode toggle
- Conversation history sidebar
- Copy buttons for responses
- Typing indicators during response generation

### API Endpoints

The application exposes OpenAI-compatible endpoints:

- `POST /v1/chat/completions` - Chat completion endpoint

Example request:
```json
{
  "messages": [
    {"role": "user", "content": "Hello, how are you?"}
  ],
  "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
  "max_tokens": 1024,
  "temperature": 0.7
}
```
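
The same request can be sent from Python once the server is running. A minimal sketch (it assumes the server is reachable at `http://localhost:7860` and that the `requests` package is available; `requests` is not pinned in `requirements.txt`):

```python
import requests

# Call the OpenAI-compatible endpoint exposed by app.py
resp = requests.post(
    "http://localhost:7860/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Hello, how are you?"}],
        "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
        "max_tokens": 1024,
        "temperature": 0.7,
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```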

## Deployment to HuggingFace Spaces

1. Create a new Space on HuggingFace with the following configuration:
   - SDK: Gradio
   - Hardware: GPU (recommended)

2. Upload all files to your Space repository

3. The application will automatically start and be accessible through your Space URL

## Configuration

The application can be configured through environment variables (see the sketch after this list):

- `MODEL_NAME`: The HuggingFace model identifier (default: Qwen/Qwen3-Coder-30B-A3B-Instruct)
- `MAX_TOKENS`: Default maximum tokens for responses (default: 1024)
- `TEMPERATURE`: Default temperature for generation (default: 0.7)
- `REDIS_URL`: Redis connection URL for caching (optional)
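
Note that the current `app.py` hardcodes `MODEL_NAME`, `DEFAULT_MAX_TOKENS`, and `DEFAULT_TEMPERATURE`, so these variables are not read yet. A minimal sketch of how the module-level configuration could pick them up (the names match the list above; the fallbacks are the current defaults):

```python
import os

# Read configuration from the environment, falling back to the built-in defaults
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen3-Coder-30B-A3B-Instruct")
DEFAULT_MAX_TOKENS = int(os.getenv("MAX_TOKENS", "1024"))
DEFAULT_TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
REDIS_URL = os.getenv("REDIS_URL")  # e.g. redis://localhost:6379/0, optional

# ConversationManager could then connect via the URL instead of a hardcoded host/port:
# redis.Redis.from_url(REDIS_URL) if REDIS_URL else None
```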

## Troubleshooting

### GPU Memory Issues

If you encounter GPU memory issues:

1. Ensure your GPU has at least 24 GB VRAM
2. Try reducing the `max_tokens` parameter
3. Use quantization techniques for model loading (see the sketch below)
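
`bitsandbytes` is already listed in `requirements.txt`, so 4-bit loading is one option. A hedged sketch of how `ModelManager.load_model` could be adapted (the parameter values are illustrative, not tuned for this model):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization roughly quarters weight memory compared to fp16,
# at some cost in output quality.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-Coder-30B-A3B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)
```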

### Model Loading Errors

If the model fails to load (a pre-download sketch follows this list):

1. Check your internet connection
2. Ensure you have sufficient disk space
3. Verify the model identifier is correct
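
One way to separate download problems from loading problems is to pre-fetch the weights before starting the app. A small sketch using `huggingface_hub`, which is installed as a dependency of `transformers`:

```python
from huggingface_hub import snapshot_download

# Downloads (or resumes) the full model snapshot into the local HF cache;
# network and disk-space errors surface here instead of during app startup.
snapshot_download(repo_id="Qwen/Qwen3-Coder-30B-A3B-Instruct")
```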

## Contributing

Contributions are welcome! Please fork the repository and submit a pull request with your changes.

## License

This project is licensed under the MIT License - see the LICENSE file for details.
requirements.txt
CHANGED
@@ -1,8 +1,10 @@
gradio>=3.0.0
transformers>=4.30.0
torch>=2.0.0
fastapi>=0.68.0
uvicorn>=0.15.0
redis>=3.5.0
aiohttp>=3.7.0
pydantic>=1.8.0
accelerate>=0.20.0
bitsandbytes>=0.39.0
utils/__init__.py
ADDED
File without changes
utils/model_utils.py
ADDED
@@ -0,0 +1,94 @@
"""
Model utilities for working with Qwen/Qwen3-Coder-30B-A3B-Instruct model
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import logging
from typing import Generator, Optional

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_NAME = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
DEFAULT_MAX_TOKENS = 1024
DEFAULT_TEMPERATURE = 0.7

class ModelManager:
    """Manage Qwen model loading and inference"""

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_model()

    def load_model(self) -> None:
        """Load the Qwen model"""
        try:
            logger.info(f"Loading model {MODEL_NAME} on {self.device}")
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                low_cpu_mem_usage=True,
                device_map="auto"
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

    def generate_response(self, prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE) -> str:
        """Generate response from the model"""
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

            # Generate without streaming for simple response
            generated = self.model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

            response = self.tokenizer.decode(generated[0], skip_special_tokens=True)
            # Remove the prompt from the response
            response = response[len(prompt):].strip()
            return response
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            raise

    def generate_streaming_response(self, prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS, temperature: float = DEFAULT_TEMPERATURE) -> Generator[str, None, None]:
        """Generate streaming response from the model"""
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

            # Create streamer for streaming response
            streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)

            # Start generation in a separate thread
            generation_kwargs = dict(
                inputs,
                streamer=streamer,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

            thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
            thread.start()

            # Yield tokens as they are generated
            for new_text in streamer:
                yield new_text

        except Exception as e:
            logger.error(f"Error generating streaming response: {e}")
            yield f"Error: {str(e)}"