Commit 8fd3d30
1 Parent(s): 079d1c0

Switch to Qwen3-30B model with fixed duplicate torch_dtype parameter and improved CPU loading
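The "duplicate torch_dtype parameter" called out above refers to the old loading call passing torch_dtype twice, once as an explicit keyword and once through an unpacked dict, which Python rejects at call time. A minimal, hypothetical reproduction (the stub below only stands in for from_pretrained; it is not the app's code):

```python
# Hypothetical stand-in for AutoModelForCausalLM.from_pretrained, used only to
# show why supplying torch_dtype twice fails before any model is loaded.
def from_pretrained_stub(model_id, torch_dtype=None, **kwargs):
    return model_id, torch_dtype, kwargs

# Fine: the keyword appears once.
from_pretrained_stub("some/model", torch_dtype="float32")

# Broken, as in the old code: torch_dtype given explicitly and again via **{...}.
try:
    from_pretrained_stub(
        "some/model",
        torch_dtype="float32",
        **{"torch_dtype": "float32", "device": "cpu"},
    )
except TypeError as err:
    print(err)  # ... got multiple values for keyword argument 'torch_dtype'
```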
app.py CHANGED
@@ -60,8 +60,8 @@ GRADIO_DOCS_MCP_SSE = os.environ.get(
     "https://gradio-docs-mcp.hf.space/gradio_api/mcp/sse",
 )
 
-# Model configuration - local GPT-OSS-20B
-MODEL_ID = "
+# Model configuration - local Qwen model loading (more efficient than GPT-OSS-20B)
+MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507"
 PROVIDER = os.environ.get("CHAT_PROVIDER", "auto")
 HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
 
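Since MODEL_ID is now hardcoded rather than taken from the environment, a quick pre-flight check that the repo id resolves with the configured token can catch typos before the Space attempts the multi-tens-of-gigabyte download. A minimal sketch, not part of the app, assuming huggingface_hub is available (it is installed as a transformers dependency):

```python
# Sketch: verify the new MODEL_ID exists on the Hub and is reachable with HF_TOKEN.
import os
from huggingface_hub import model_info

MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507"
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

info = model_info(MODEL_ID, token=HF_TOKEN)
print(info.id, info.sha)  # repo id and current revision hash
```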
@@ -130,22 +130,18 @@ async def get_gpt_oss_model_and_tokenizer():
         torch.cuda.is_available = lambda: False
         torch.cuda.device_count = lambda: 0
 
-        print("Loading
-        gpt_oss_tokenizer = AutoTokenizer.from_pretrained(
-            MODEL_ID,
-            trust_remote_code=True,
-        )
+        print("Loading Qwen tokenizer...")
+        gpt_oss_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
-        print("Loading
-        #
+        print("Loading Qwen model (CPU-only)...")
+        # Clean CPU-only loading configuration (no duplicate parameters)
         gpt_oss_model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             torch_dtype=torch.float32,  # Use float32 for CPU compatibility
             device_map=None,  # Don't use device mapping
             trust_remote_code=True,
             low_cpu_mem_usage=True,
-
-            **{"torch_dtype": torch.float32, "device": "cpu"}
+
         )
 
         # Explicitly move to CPU
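Pulled out of the Space, the CPU-only loading path added in this hunk looks roughly like the sketch below. It assumes transformers and torch are installed and that the machine has enough RAM; float32 weights cost about 4 bytes per parameter, so a 30B-parameter model needs well over 100 GB.

```python
# Minimal sketch of the CPU-only loading configuration from the hunk above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,   # float32 for CPU compatibility
    device_map=None,             # no accelerate device mapping, plain nn.Module
    trust_remote_code=True,
    low_cpu_mem_usage=True,      # stream weights in instead of a second full copy
)
model.to("cpu")  # the app also moves the model explicitly before eval()
model.eval()
```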
@@ -154,11 +150,11 @@ async def get_gpt_oss_model_and_tokenizer():
         # Set model to evaluation mode
         gpt_oss_model.eval()
 
-        print("
+        print("Qwen model loaded successfully on CPU!")
         return gpt_oss_tokenizer, gpt_oss_model
 
     except Exception as e:
-        print(f"Failed to load
+        print(f"Failed to load Qwen model: {e}")
         # Reset globals on error
         gpt_oss_tokenizer = None
         gpt_oss_model = None
@@ -170,36 +166,34 @@ async def generate_with_gpt_oss(messages: List[Dict[str, Any]]) -> str:
         # Lazy load model only when needed
         tokenizer, model = await get_gpt_oss_model_and_tokenizer()
 
-        # Convert messages to
-
+        # Convert messages to Qwen format
+        qwen_messages = []
         for msg in messages:
             if msg["role"] == "system":
-
+                # Convert system message to user message for Qwen
+                qwen_messages.append({
                     "role": "user",
-                    "content": f"
+                    "content": f"System: {msg['content']}"
                 })
             else:
-
+                qwen_messages.append(msg)
 
         # Apply chat template and generate
-
-
+        text = tokenizer.apply_chat_template(
+            qwen_messages,
+            tokenize=False,
             add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
         )
 
-
-        inputs = {k: v.to("cpu") if hasattr(v, "to") else v for k, v in inputs.items()}
+        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
 
         # Generate with timeout protection
         try:
             import torch
             with torch.no_grad():  # Disable gradients for inference
-
-                **
-                    max_new_tokens=512,
+                generated_ids = model.generate(
+                    **model_inputs,
+                    max_new_tokens=512,  # Reduced from 16384 for faster response
                     do_sample=True,
                     temperature=0.7,
                     pad_token_id=tokenizer.eos_token_id,
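The prompt-building and generation flow added in this hunk is the standard apply_chat_template pattern; folding the system turn into a user turn is this app's choice rather than a Qwen requirement. A short sketch, assuming tokenizer and model were already loaded as in the previous sketch:

```python
import torch

messages = [
    {"role": "system", "content": "You answer questions about Gradio."},
    {"role": "user", "content": "How do I stream chatbot output?"},
]

# Fold the system message into a user turn, mirroring the diff above.
qwen_messages = []
for msg in messages:
    if msg["role"] == "system":
        qwen_messages.append({"role": "user", "content": f"System: {msg['content']}"})
    else:
        qwen_messages.append(msg)

# Render the chat template to a plain string, then tokenize it separately.
text = tokenizer.apply_chat_template(
    qwen_messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

with torch.no_grad():  # inference only, no gradients
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
```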
@@ -209,15 +203,13 @@ async def generate_with_gpt_oss(messages: List[Dict[str, Any]]) -> str:
             raise Exception(f"Generation Error: {str(gen_error)}")
 
         # Decode the generated text
-
-
-            skip_special_tokens=True
-        )
+        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+        generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)
 
         return generated_text
 
     except Exception as e:
-        raise Exception(f"
+        raise Exception(f"Qwen Model Error: {str(e)}")
 
 async def ensure_mcp_init(model_id: str, provider: str, api_key: Optional[str]):
     """Initialize MCP server connection."""
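Decoding then slices off the prompt tokens so only the newly generated completion is returned, with skip_special_tokens dropping the end-of-turn markers. Continuing the sketch above:

```python
# Keep only the tokens produced after the prompt, then decode them to text.
prompt_len = len(model_inputs.input_ids[0])
output_ids = generated_ids[0][prompt_len:].tolist()
generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)
print(generated_text)
```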
@@ -445,7 +437,7 @@ async def stream_answer(
 with gr.Blocks(fill_height=True) as demo:
     gr.Markdown(
         "# Gradio Docs Chat\n"
-        "Ask anything about **Gradio**. Powered by
+        "Ask anything about **Gradio**. Powered by Qwen3-30B with local transformers loading."
     )
 
     with gr.Row():
@@ -475,7 +467,7 @@ with gr.Blocks(fill_height=True) as demo:
         model_info = gr.Markdown(
             f"**Model:** `{MODEL_ID}` (Local Loading) \n"
             f"**Provider:** `{PROVIDER}` \n"
-            "_(CPU-only loading
+            "_(CPU-only loading for stable inference)_"
         )
 
     with gr.Accordion("Tool Activity (live)", open=True):
@@ -506,7 +498,7 @@ with gr.Blocks(fill_height=True) as demo:
 # ----------------------------
 # Launch App
 # ----------------------------
-print(f"Starting Gradio Docs Chat with
+print(f"Starting Gradio Docs Chat with Qwen3-30B (Local Loading)")
 print(f"Model: {MODEL_ID}")
 print(f"MCP Server: {GRADIO_DOCS_MCP_SSE}")
 