Spaces:

amd
/

gpt-oss-120b-chatbot

Running on CPU Upgrade

App Files Files Community

mahdicv committed on Aug 25

Commit

ba7492c

1 Parent(s): 507da18

updating with web browsing + reasoning effort

Browse files

Files changed (4) hide show

README.md +1 -3
app.py +79 -46
gateway.py +88 -25
utils.py +2 -5

README.md CHANGED Viewed

@@ -8,9 +8,7 @@ sdk_version: 5.36.2
 app_file: app.py
 pinned: false
 license: apache-2.0
-models:
-- openai/gpt-oss-120b
-short_description: 'gpt-oss-120b model running on AMD MI300 infrastructure.'
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 app_file: app.py
 pinned: false
 license: apache-2.0
+short_description: 'UPDATED: openai/gpt-oss-120b with web browsing & reasoning effort on AMD MI300X GPUs.'
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
-import os, re, logging, gradio as gr
 from openai import OpenAI
 from gateway import request_generation
 from utils import LATEX_DELIMS
 openai_api_key = os.getenv("API_KEY")
 openai_api_base = os.getenv("API_ENDPOINT")
-MODEL = os.getenv("MODEL_NAME", "")
 client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
 MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
 CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
@@ -13,26 +14,26 @@ QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))
 logging.basicConfig(level=logging.INFO)
-def format_analysis_response(text):
-    m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
-    if m:
-        reasoning = m.group(1).strip()
-        response = text.split("assistantfinal", 1)[-1].strip()
-        return (
-            f"**🤔 Analysis:**\n\n*{reasoning}*\n\n---\n\n"
-            f"**💬 Response:**\n\n{response}"
-        )
-    return text.strip()
-def generate(message, history,
-             system_prompt, temperature,
-             frequency_penalty, presence_penalty,
-             max_new_tokens):
     if not message.strip():
         yield "Please enter a prompt."
         return
     msgs = []
     for h in history:
         if isinstance(h, dict):
@@ -45,59 +46,92 @@ def generate(message, history,
     logging.info(f"[User] {message}")
     logging.info(f"[System] {system_prompt} | Temp={temperature}")
-    collected, buffer = "", ""
-    yielded_once = False
     try:
-        for delta in request_generation(
             api_key=openai_api_key, api_base=openai_api_base,
             message=message, system_prompt=system_prompt,
-            model_name=MODEL, chat_history=msgs,
-            temperature=temperature,
-            frequency_penalty=frequency_penalty,
-            presence_penalty=presence_penalty,
-            max_new_tokens=max_new_tokens,
         ):
-            if not delta:
                 continue
-            collected += delta
-            buffer += delta
-            if not yielded_once:
-                yield delta
-                buffer = ""
-                yielded_once = True
                 continue
-            if "\n" in buffer or len(buffer) > 150:
-                yield collected
-                buffer = ""
-        final = format_analysis_response(collected)
-        if final.count("$") % 2:
-            final += "$"
-        yield final
     except Exception as e:
         logging.exception("Stream failed")
         yield f"❌ Error: {e}"
 chatbot_ui = gr.ChatInterface(
     fn=generate,
     type="messages",
     chatbot=gr.Chatbot(
         label="OSS vLLM Chatbot",
         type="messages",
-        scale=2,
         height=600,
         latex_delimiters=LATEX_DELIMS,
     ),
-    stop_btn=True,
     additional_inputs=[
         gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
         gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
     ],
     examples=[
         ["Explain the difference between supervised and unsupervised learning."],
         ["Summarize the plot of Inception in two sentences."],
@@ -106,11 +140,10 @@ chatbot_ui = gr.ChatInterface(
         ["Derive the gradient of softmax cross-entropy loss."],
         ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
     ],
-    # title="Open-source GPT-OSS-120B on AMD MI300X",
     title=" GPT-OSS-120B on AMD MI300X",
     description="This Space is an Alpha release that demonstrates gpt-oss-120b model running on AMD MI300 infrastructure. The space is built with Apache 2.0 License.",
 )
 if __name__ == "__main__":
     chatbot_ui.queue(max_size=QUEUE_SIZE,
-                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()

+import os, logging, gradio as gr
+from pydoc import html
 from openai import OpenAI
 from gateway import request_generation
 from utils import LATEX_DELIMS
 openai_api_key = os.getenv("API_KEY")
 openai_api_base = os.getenv("API_ENDPOINT")
+model_name = os.getenv("MODEL_NAME")
 client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
 MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
 CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
 logging.basicConfig(level=logging.INFO)
+def format_final(analysis_text: str, visible_text: str) -> str:
+    """Render final message with collapsible analysis + normal Markdown answer."""
+    reasoning_safe = html.escape((analysis_text or "").strip())
+    response = (visible_text or "").strip()
+    # Collapsible analysis, normal markdown answer
+    return (
+        "<details><summary><strong>🤔 Analysis</strong></summary>\n"
+        "<pre style='white-space:pre-wrap;'>"
+        f"{reasoning_safe}"
+        "</pre>\n</details>\n\n"
+        "**💬 Response:**\n\n"
+        f"{response}"
+    )
+def generate(message, history, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens):
     if not message.strip():
         yield "Please enter a prompt."
         return
+    # Flatten gradio history
     msgs = []
     for h in history:
         if isinstance(h, dict):
     logging.info(f"[User] {message}")
     logging.info(f"[System] {system_prompt} | Temp={temperature}")
+    tools = [{"type": "web_search_preview"}] if enable_browsing else None
+    tool_choice = "auto" if enable_browsing else None
+    in_analysis = False
+    in_visible  = False
+    raw_analysis = ""
+    raw_visible  = ""
+    raw_started = False
+    last_flush_len = 0
+    def make_raw_preview() -> str:
+        return (
+            "```text\n"
+            "Analysis (live):\n"
+            f"{raw_analysis}\n\n"
+            "Response (draft):\n"
+            f"{raw_visible}\n"
+            "```"
+        )
     try:
+        for chunk in request_generation(
             api_key=openai_api_key, api_base=openai_api_base,
             message=message, system_prompt=system_prompt,
+            model_name=model_name, chat_history=msgs,
+            temperature=temperature, reasoning_effort=reasoning_effort,
+            max_new_tokens=max_new_tokens, tools=tools, tool_choice=tool_choice,
         ):
+            if chunk == "analysis":
+                in_analysis, in_visible = True, False
+                if not raw_started:
+                    raw_started = True
+                    yield make_raw_preview()
                 continue
+            if chunk == "assistantfinal":
+                in_analysis, in_visible = False, True
+                if not raw_started:
+                    raw_started = True
+                    yield make_raw_preview()
                 continue
+            if in_analysis:
+                raw_analysis += chunk
+            elif in_visible:
+                raw_visible += chunk
+            else:
+                raw_visible += chunk
+            total_len = len(raw_analysis) + len(raw_visible)
+            if total_len - last_flush_len >= 120 or "\n" in chunk:
+                last_flush_len = total_len
+                yield make_raw_preview()
+        final_markdown = format_final(raw_analysis, raw_visible)
+        if final_markdown.count("$") % 2:
+            final_markdown += "$"
+        # This replaces the raw preview in-place with the pretty final message
+        yield final_markdown
     except Exception as e:
         logging.exception("Stream failed")
         yield f"❌ Error: {e}"
 chatbot_ui = gr.ChatInterface(
     fn=generate,
     type="messages",
     chatbot=gr.Chatbot(
         label="OSS vLLM Chatbot",
         type="messages",
         height=600,
         latex_delimiters=LATEX_DELIMS,
     ),
+    additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True),
     additional_inputs=[
         gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
         gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
+        gr.Radio(label="Reasoning Effort", choices=["low","medium","high"], value="medium"),
+        gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False),
     ],
+    stop_btn=True,
     examples=[
         ["Explain the difference between supervised and unsupervised learning."],
         ["Summarize the plot of Inception in two sentences."],
         ["Derive the gradient of softmax cross-entropy loss."],
         ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
     ],
     title=" GPT-OSS-120B on AMD MI300X",
     description="This Space is an Alpha release that demonstrates gpt-oss-120b model running on AMD MI300 infrastructure. The space is built with Apache 2.0 License.",
 )
 if __name__ == "__main__":
     chatbot_ui.queue(max_size=QUEUE_SIZE,
+                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()

gateway.py CHANGED Viewed

@@ -1,8 +1,7 @@
-import logging
-from openai import OpenAI
 from typing import List, Generator, Optional
-logging.basicConfig(level=logging.INFO)
 def request_generation(
     api_key: str,
@@ -12,54 +11,118 @@ def request_generation(
     model_name: str,
     chat_history: Optional[List[dict]] = None,
     temperature: float = 0.3,
-    frequency_penalty: float = 0.0,
-    presence_penalty: float = 0.0,
     max_new_tokens: int = 1024,
     tools: Optional[List[dict]] = None,
     tool_choice: Optional[str] = None,
 ) -> Generator[str, None, None]:
     """
-    Sends a streaming chat request to an OpenAI-compatible backend using the official OpenAI client.
-    Buffers output to improve LaTeX rendering.
     """
     client = OpenAI(api_key=api_key, base_url=api_base)
-    messages = [{"role": "system", "content": system_prompt}]
     if chat_history:
-        messages.extend(chat_history)
-    messages.append({"role": "user", "content": message})
     request_args = {
         "model": model_name,
-        "messages": messages,
         "temperature": temperature,
-        "frequency_penalty": frequency_penalty,
-        "presence_penalty": presence_penalty,
-        "max_tokens": max_new_tokens,
         "stream": True,
     }
     if tools:
         request_args["tools"] = tools
     if tool_choice:
         request_args["tool_choice"] = tool_choice
-    logging.info(f"[Gateway] Request to {api_base} | Model: {model_name}")
     try:
-        stream = client.chat.completions.create(**request_args)
-        collected = ""
         buffer = ""
-        for chunk in stream:
-            delta = chunk.choices[0].delta.content or ""
-            collected += delta
-            buffer += delta
-            if "\n" in buffer or len(buffer) > 150:
-                yield buffer
-                buffer = ""
         if buffer:
             yield buffer

+import json, logging
 from typing import List, Generator, Optional
+from openai import OpenAI
 def request_generation(
     api_key: str,
     model_name: str,
     chat_history: Optional[List[dict]] = None,
     temperature: float = 0.3,
     max_new_tokens: int = 1024,
+    reasoning_effort: str = "off",
     tools: Optional[List[dict]] = None,
     tool_choice: Optional[str] = None,
 ) -> Generator[str, None, None]:
     """
+    Streams Responses API events. Emits:
+      - "analysis" sentinel once, then raw reasoning deltas
+      - "assistantfinal" sentinel once, then visible output deltas
+    If no visible deltas, emits a tool-call fallback message.
     """
     client = OpenAI(api_key=api_key, base_url=api_base)
+    input_messages: List[dict] = []
     if chat_history:
+        input_messages.extend(m for m in chat_history if m.get("role") != "system")
+    input_messages.append({"role": "user", "content": message})
     request_args = {
         "model": model_name,
+        "input": input_messages,
+        "instructions": system_prompt,
         "temperature": temperature,
+        "max_output_tokens": max_new_tokens,
+        "reasoning": {
+            "effort": reasoning_effort,
+            "generate_summary": "detailed",
+            "summary": "detailed",
+        },
         "stream": True,
     }
     if tools:
         request_args["tools"] = tools
     if tool_choice:
         request_args["tool_choice"] = tool_choice
+    raw_reasoning, raw_visible = [], []
     try:
+        stream = client.responses.create(**request_args)
+        reasoning_started = False
+        reasoning_closed = False
+        saw_visible_output = False
+        last_tool_name = None
+        last_tool_args = None
         buffer = ""
+        for event in stream:
+            et = getattr(event, "type", "")
+            if et == "response.reasoning_text.delta":
+                if not reasoning_started:
+                    yield "analysis"
+                    reasoning_started = True
+                rdelta = getattr(event, "delta", "") or ""
+                if rdelta:
+                    raw_reasoning.append(rdelta)
+                    yield rdelta
+                continue
+            if et == "response.output_text.delta":
+                if reasoning_started and not reasoning_closed:
+                    yield "assistantfinal"
+                    reasoning_closed = True
+                saw_visible_output = True
+                delta = getattr(event, "delta", "") or ""
+                raw_visible.append(delta)
+                buffer += delta
+                if "\n" in buffer or len(buffer) > 150:
+                    yield buffer
+                    buffer = ""
+                continue
+            if et.startswith("response.tool") or et.startswith("response.function_call"):
+                name = getattr(event, "name", None)
+                args = getattr(event, "arguments", None)
+                if args is None:
+                    args = getattr(event, "args", None) or getattr(event, "delta", None) or getattr(event, "data", None)
+                if name:
+                    last_tool_name = name
+                if args is not None:
+                    last_tool_args = args
+                continue
+            if et in ("response.completed", "response.error"):
+                if buffer:
+                    yield buffer
+                    buffer = ""
+                if reasoning_started and not reasoning_closed:
+                    yield "assistantfinal"
+                    reasoning_closed = True
+                if not saw_visible_output:
+                    msg = "I attempted to call a tool, but tools aren't executed in this environment, so no final answer was produced."
+                    if last_tool_name:
+                        try:
+                            args_text = json.dumps(last_tool_args, ensure_ascii=False, default=str)
+                        except Exception:
+                            args_text = str(last_tool_args)
+                        msg += f"\n\n• Tool requested: **{last_tool_name}**\n• Arguments: `{args_text}`"
+                    yield msg
+                if et == "response.error":
+                    err = getattr(event, "error", None)
+                    emsg = getattr(err, "message", "") if err else "Unknown error"
+                    yield f"Error: {emsg}"
+                break
         if buffer:
             yield buffer

utils.py CHANGED Viewed

@@ -4,9 +4,6 @@
 # ----------------------------------------------------------------------
 LATEX_DELIMS = [
-    {"left": "$$",  "right": "$$",  "display": True},
-    {"left": "$",   "right": "$",   "display": False},
-    {"left": "\\[", "right": "\\]", "display": True},
-    {"left": "\\(", "right": "\\)", "display": False},
 ]

 # ----------------------------------------------------------------------
 LATEX_DELIMS = [
+    {"left": "\\[", "right": "\\]", "display": True},
+    {"left": "\\(", "right": "\\)", "display": False},
 ]