sudipta26889 committed
Commit ccd721b · 1 Parent(s): f9d9584

Switch to local GPT-OSS-20B loading with CPU-only approach to avoid CUDA issues

Files changed (2):
  1. app.py +107 -125
  2. requirements.txt +4 -1
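
The heart of the change: app.py stops calling the hosted Inference API (and dropping down to smaller fallback models) and instead loads openai/gpt-oss-20b locally with transformers, pinned to the CPU. A condensed, self-contained sketch of that load-and-generate pattern, using only the model id and settings that appear in the diff below (illustrative, and very memory-hungry for a 20B checkpoint in float32):

# Condensed sketch of the CPU-only pattern introduced in this commit.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "openai/gpt-oss-20b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,   # float32 for CPU compatibility
    device_map="cpu",            # never initialize CUDA
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval()

messages = [{"role": "user", "content": "Reasoning: high\n\nWhat is Gradio?"}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))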
app.py CHANGED
@@ -1,8 +1,8 @@
 # app.py
 # Hugging Face Space: Gradio Docs Chat with GPT-OSS-20B and MCP Integration
 # Features:
-# • GPT-OSS-20B with harmony format for excellent reasoning
-# • Fallback to reliable smaller models when GPT-OSS-20B is paused
+# • GPT-OSS-20B with local transformers loading for fast inference
+# • CPU-only loading to avoid CUDA initialization issues
 # • MCP tool-calling for Gradio docs access
 # • Streaming responses with live tool logs
 # • Optional "Concise / Detailed" answer styles
@@ -26,7 +26,6 @@ except ImportError:
     pass
 
 import gradio as gr
-import requests
 
 # Try to import MCPClient with fallback
 try:
@@ -56,13 +55,8 @@ GRADIO_DOCS_MCP_SSE = os.environ.get(
     "https://gradio-docs-mcp.hf.space/gradio_api/mcp/sse",
 )
 
-# Model configuration - primary and fallback models
-PRIMARY_MODEL = "openai/gpt-oss-20b"
-FALLBACK_MODELS = [
-    "microsoft/DialoGPT-medium",
-    "microsoft/DialoGPT-large",
-    "microsoft/DialoGPT-small"
-]
+# Model configuration - local GPT-OSS-20B loading
+MODEL_ID = "openai/gpt-oss-20b"
 PROVIDER = os.environ.get("CHAT_PROVIDER", "auto")
 HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
 
@@ -81,9 +75,11 @@ DETAILED_SUFFIX = " Provide a detailed, step-by-step answer with short code wher
 # Model Clients (lazy initialization)
 # ----------------------------
 mcp_client: Optional[MCPClient] = None
+gpt_oss_tokenizer = None
+gpt_oss_model = None
 _initialized = False
 _init_lock = asyncio.Lock()
-_current_model = PRIMARY_MODEL
+_model_loading_lock = asyncio.Lock()
 
 def _current_system_prompt(style: str) -> str:
     """Get the system prompt with style suffix."""
@@ -91,8 +87,10 @@ def _current_system_prompt(style: str) -> str:
 
 def _reset_clients():
     """Reset all global clients."""
-    global mcp_client, _initialized
+    global mcp_client, gpt_oss_tokenizer, gpt_oss_model, _initialized
     mcp_client = None
+    gpt_oss_tokenizer = None
+    gpt_oss_model = None
     _initialized = False
 
 def get_mcp_client(model_id: str, provider: str, api_key: Optional[str]) -> MCPClient:
@@ -104,108 +102,105 @@ def get_mcp_client(model_id: str, provider: str, api_key: Optional[str]) -> MCPC
     mcp_client = MCPClient(model=model_id, provider=provider, api_key=api_key)
     return mcp_client
 
-async def call_inference_api(messages: List[Dict[str, Any]], model_id: str) -> str:
-    """Call model via HF Inference API with fallback support."""
-    if not HF_TOKEN:
-        raise ValueError("HF_TOKEN or HUGGING_FACE_HUB_TOKEN required for inference API")
+async def get_gpt_oss_model_and_tokenizer():
+    """Get or create GPT-OSS-20B model and tokenizer with CPU-only loading."""
+    global gpt_oss_tokenizer, gpt_oss_model
 
-    # Convert messages to appropriate format based on model
-    if "gpt-oss" in model_id.lower():
-        # GPT-OSS format with reasoning
-        formatted_messages = []
+    # Check if already loaded
+    if gpt_oss_tokenizer is not None and gpt_oss_model is not None:
+        return gpt_oss_tokenizer, gpt_oss_model
+
+    # Use lock to prevent multiple simultaneous loads
+    async with _model_loading_lock:
+        # Double-check after acquiring lock
+        if gpt_oss_tokenizer is not None and gpt_oss_model is not None:
+            return gpt_oss_tokenizer, gpt_oss_model
+
+        try:
+            # Import here to avoid CUDA initialization in main process
+            import torch
+            from transformers import AutoTokenizer, AutoModelForCausalLM
+
+            print("🔄 Loading GPT-OSS-20B tokenizer...")
+            gpt_oss_tokenizer = AutoTokenizer.from_pretrained(
+                MODEL_ID,
+                trust_remote_code=True,
+            )
+
+            print("🔄 Loading GPT-OSS-20B model (CPU-only)...")
+            # Force CPU-only loading to avoid CUDA initialization issues
+            gpt_oss_model = AutoModelForCausalLM.from_pretrained(
+                MODEL_ID,
+                torch_dtype=torch.float32,  # Use float32 for CPU compatibility
+                device_map="cpu",  # Force CPU loading
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,
+            )
+
+            # Set model to evaluation mode
+            gpt_oss_model.eval()
+
+            print("✅ GPT-OSS-20B loaded successfully on CPU!")
+            return gpt_oss_tokenizer, gpt_oss_model
+
+        except Exception as e:
+            print(f"❌ Failed to load GPT-OSS-20B: {e}")
+            # Reset globals on error
+            gpt_oss_tokenizer = None
+            gpt_oss_model = None
+            raise e
+
+async def generate_with_gpt_oss(messages: List[Dict[str, Any]]) -> str:
+    """Generate response using local GPT-OSS-20B model."""
+    try:
+        # Lazy load model only when needed
+        tokenizer, model = await get_gpt_oss_model_and_tokenizer()
+
+        # Convert messages to GPT-OSS format with reasoning
+        gpt_oss_messages = []
         for msg in messages:
             if msg["role"] == "system":
-                formatted_messages.append({
+                gpt_oss_messages.append({
                     "role": "user",
                     "content": f"Reasoning: high\n\n{msg['content']}"
                 })
             else:
-                formatted_messages.append(msg)
-    else:
-        # Standard chat format for other models
-        formatted_messages = messages
-
-    # Prepare the request payload
-    payload = {
-        "inputs": formatted_messages,
-        "parameters": {
-            "max_new_tokens": 512,
-            "temperature": 0.7,
-            "do_sample": True,
-            "return_full_text": False,
-        }
-    }
-
-    headers = {
-        "Authorization": f"Bearer {HF_TOKEN}",
-        "Content-Type": "application/json"
-    }
-
-    # Make the API call
-    try:
-        response = requests.post(
-            f"https://api-inference.huggingface.co/models/{model_id}",
-            headers=headers,
-            json=payload,
-            timeout=120  # 2 minute timeout
+                gpt_oss_messages.append(msg)
+
+        # Apply chat template and generate
+        inputs = tokenizer.apply_chat_template(
+            gpt_oss_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
         )
 
-        if response.status_code == 200:
-            result = response.json()
-            if isinstance(result, list) and len(result) > 0:
-                return result[0].get("generated_text", "")
-            elif isinstance(result, dict):
-                return result.get("generated_text", "")
-            else:
-                return str(result)
-        else:
-            error_msg = f"API Error {response.status_code}: {response.text}"
-            print(f"❌ {error_msg}")
-            raise Exception(error_msg)
-
-    except requests.exceptions.Timeout:
-        raise Exception("Request timed out after 120 seconds")
-    except requests.exceptions.RequestException as e:
-        raise Exception(f"Request failed: {str(e)}")
-
-def reset_to_primary_model():
-    """Reset to use the primary model on next request."""
-    global _current_model
-    _current_model = PRIMARY_MODEL
-    print(f"🔄 Reset to primary model: {PRIMARY_MODEL}")
-
-async def call_model_with_fallback(messages: List[Dict[str, Any]]) -> Tuple[str, str]:
-    """Call model with automatic fallback to smaller models."""
-    global _current_model
-
-    # Try current model first (could be primary or a previously successful fallback)
-    try:
-        print(f"🔄 Trying current model: {_current_model}")
-        result = await call_inference_api(messages, _current_model)
-        return result, _current_model
-    except Exception as e:
-        error_msg = str(e)
-        print(f"❌ {_current_model} failed: {error_msg}")
+        # Generate with timeout protection
+        try:
+            import torch
+            with torch.no_grad():  # Disable gradients for inference
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=512,
+                    do_sample=True,
+                    temperature=0.7,
+                    pad_token_id=tokenizer.eos_token_id,
+                    max_time=60.0,  # 60 second timeout
+                )
+        except Exception as gen_error:
+            raise Exception(f"Generation Error: {str(gen_error)}")
 
-    # If current model fails, try all models in order (primary + fallbacks)
-    all_models = [PRIMARY_MODEL] + FALLBACK_MODELS
+        # Decode the generated text
+        generated_text = tokenizer.decode(
+            outputs[0][inputs["input_ids"].shape[-1]:],
+            skip_special_tokens=True
+        )
 
-    for model in all_models:
-        if model == _current_model:  # Skip the one we just tried
-            continue
-
-        try:
-            print(f"🔄 Trying model: {model}")
-            result = await call_inference_api(messages, model)
-            _current_model = model  # Update current model
-            print(f"✅ Successfully using model: {model}")
-            return result, model
-        except Exception as model_error:
-            print(f"❌ {model} failed: {str(model_error)}")
-            continue
+        return generated_text
 
-    # If all models fail, provide a helpful error message
-    raise Exception(f"All models failed. Primary model ({PRIMARY_MODEL}) and fallback models are unavailable. Please try again later.")
+    except Exception as e:
+        raise Exception(f"GPT-OSS-20B Error: {str(e)}")
 
 async def ensure_mcp_init(model_id: str, provider: str, api_key: Optional[str]):
     """Initialize MCP server connection."""
@@ -288,15 +283,11 @@ async def stream_answer(
     tool_log: List[str] = []
     citations: List[Tuple[str, Optional[str]]] = []
 
-    # Handle GPT-OSS-20B via Inference API with fallback
+    # Handle GPT-OSS-20B via local model
     if USE_GPT_OSS:
        try:
-            # Call the inference API with fallback
-            generated_text, used_model = await call_model_with_fallback(messages_for_llm)
-
-            # Add model info to tool log
-            if used_model != PRIMARY_MODEL:
-                _append_log(tool_log, f"⚠️ Using fallback model: {used_model}")
+            # Generate response using local model
+            generated_text = await generate_with_gpt_oss(messages_for_llm)
 
             # Stream character by character
             for char in generated_text:
@@ -308,7 +299,7 @@ async def stream_answer(
 
         except Exception as e:
             yield {
-                "delta": f"❌ Model Error: {str(e)}",
+                "delta": f"❌ {str(e)}",
                 "tool_log": _format_tool_log(tool_log),
                 "citations": _format_citations(citations),
             }
@@ -437,7 +428,7 @@ async def stream_answer(
 with gr.Blocks(fill_height=True) as demo:
     gr.Markdown(
         "# 🤖 Gradio Docs Chat\n"
-        "Ask anything about **Gradio**. Powered by GPT-OSS-20B with automatic fallback to reliable models."
+        "Ask anything about **Gradio**. Powered by GPT-OSS-20B with local transformers loading."
     )
 
     with gr.Row():
@@ -465,12 +456,10 @@ with gr.Blocks(fill_height=True) as demo:
                 value="Detailed",
             )
             model_info = gr.Markdown(
-                f"**Primary Model:** `{PRIMARY_MODEL}` \n"
-                f"**Current Model:** `{_current_model}` \n"
+                f"**Model:** `{MODEL_ID}` (Local Loading) \n"
                 f"**Provider:** `{PROVIDER}` \n"
-                "_(Auto-fallback to smaller models if primary is paused)_"
+                "_(CPU-only loading to avoid CUDA issues)_"
             )
-            reset_model_btn = gr.Button("🔄 Reset to Primary Model", variant="secondary", size="sm")
 
             with gr.Accordion("🛠 Tool Activity (live)", open=True):
                 tool_log_md = gr.Markdown("_No tool activity yet._")
@@ -487,28 +476,21 @@ with gr.Blocks(fill_height=True) as demo:
 
         messages_for_llm = to_llm_messages(history_msgs[:-1], user_msg, style_choice)
 
-        async for chunk in stream_answer(messages_for_llm, PRIMARY_MODEL, PROVIDER, HF_TOKEN):
+        async for chunk in stream_answer(messages_for_llm, MODEL_ID, PROVIDER, HF_TOKEN):
            delta = chunk.get("delta", "")
            if delta:
                history_msgs[-1]["content"] += delta
            yield history_msgs, gr.update(value=chunk.get("tool_log", "")), gr.update(value=chunk.get("citations", ""))
 
-    def on_reset_model():
-        """Reset to primary model and update UI."""
-        reset_to_primary_model()
-        return gr.update(value=f"**Primary Model:** `{PRIMARY_MODEL}` \n**Current Model:** `{_current_model}` \n**Provider:** `{PROVIDER}` \n_(Auto-fallback to smaller models if primary is paused)_")
-
    # Wire up event handlers
    msg.submit(on_submit, inputs=[msg, chat, style], outputs=[chat, tool_log_md, citations_md], queue=True)
    send_btn.click(on_submit, inputs=[msg, chat, style], outputs=[chat, tool_log_md, citations_md], queue=True)
-    reset_model_btn.click(on_reset_model, outputs=[model_info])
 
 # ----------------------------
 # Launch App
 # ----------------------------
-print(f"🚀 Starting Gradio Docs Chat with GPT-OSS-20B + Fallback Models")
-print(f"📍 Primary Model: {PRIMARY_MODEL}")
-print(f"📍 Fallback Models: {', '.join(FALLBACK_MODELS)}")
+print(f"🚀 Starting Gradio Docs Chat with GPT-OSS-20B (Local Loading)")
+print(f"📍 Model: {MODEL_ID}")
 print(f"🔗 MCP Server: {GRADIO_DOCS_MCP_SSE}")
 
 demo = demo.queue(max_size=32)
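
The model/tokenizer cache above is guarded by _model_loading_lock with a second check inside the lock, so concurrent chat requests trigger at most one expensive load. A minimal, self-contained illustration of that double-checked lazy-load pattern (the names here are generic stand-ins, not from app.py):

# Double-checked lazy loading behind an asyncio.Lock: many concurrent callers,
# exactly one slow initialization. Mirrors get_gpt_oss_model_and_tokenizer().
import asyncio

_cache = None
_lock = asyncio.Lock()

async def get_resource():
    global _cache
    if _cache is not None:           # fast path: already loaded
        return _cache
    async with _lock:
        if _cache is not None:       # double check after acquiring the lock
            return _cache
        await asyncio.sleep(1)       # stand-in for the slow model load
        _cache = "loaded"
        return _cache

async def main():
    results = await asyncio.gather(*(get_resource() for _ in range(10)))
    print(results)                   # the 1-second "load" ran only once

asyncio.run(main())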
requirements.txt CHANGED
@@ -1,4 +1,7 @@
 gradio>=5.0.0
 huggingface_hub>=0.34.0
 python-dotenv>=1.0.1
-requests>=2.31.0
+transformers>=4.40.0
+torch>=2.0.0
+accelerate>=0.20.0
+safetensors>=0.4.0
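
The heavier runtime stack (transformers, torch, accelerate, safetensors) replaces requests. An optional, hypothetical sanity check, not part of the commit, to confirm the new dependencies import cleanly and that the runtime is effectively CPU-only:

# Hypothetical dependency check (not part of this commit).
import accelerate
import safetensors
import torch
import transformers

print("transformers:", transformers.__version__)
print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("accelerate:", accelerate.__version__, "| safetensors:", safetensors.__version__)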