import html
import logging
import os

import gradio as gr
from openai import OpenAI
from gateway import request_generation
from utils import LATEX_DELIMS
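# Connection details, model name, and runtime limits are supplied via environment variables.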
openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
model_name = os.getenv("MODEL_NAME")
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)  # not used below; streaming goes through gateway.request_generation
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))
logging.basicConfig(level=logging.INFO)
def format_final(analysis_text: str, visible_text: str) -> str:
"""Render final message with collapsible analysis + normal Markdown answer."""
reasoning_safe = html.escape((analysis_text or "").strip())
response = (visible_text or "").strip()
# Collapsible analysis, normal markdown answer
return (
"<details><summary><strong>🤔 Analysis</strong></summary>\n"
"<pre style='white-space:pre-wrap;'>"
f"{reasoning_safe}"
"</pre>\n</details>\n\n"
"**💬 Response:**\n\n"
f"{response}"
)
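# Streaming callback for gr.ChatInterface: each yield replaces the currently displayed
# message, so a plain-text preview streams first and the formatted final answer
# overwrites it at the end.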
def generate(message, history, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens):
if not message.strip():
yield "Please enter a prompt."
return
# Flatten gradio history
msgs = []
for h in history:
if isinstance(h, dict):
msgs.append(h)
elif isinstance(h, (list, tuple)) and len(h) == 2:
u, a = h
if u: msgs.append({"role": "user", "content": u})
if a: msgs.append({"role": "assistant", "content": a})
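    # Optionally expose the web_search_preview tool and let the model decide when to call it.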
tools = [{"type": "web_search_preview"}] if enable_browsing else None
tool_choice = "auto" if enable_browsing else None
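    # Streaming state: the stream carries the channel markers "analysis" and
    # "assistantfinal"; every chunk that follows belongs to that channel.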
in_analysis = False
in_visible = False
raw_analysis = ""
raw_visible = ""
raw_started = False
last_flush_len = 0
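    # Plain-text preview shown while tokens are still streaming in.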
def make_raw_preview() -> str:
return (
"```text\n"
"Analysis (live):\n"
f"{raw_analysis}\n\n"
"Response (draft):\n"
f"{raw_visible}\n"
"```"
)
try:
for chunk in request_generation(
api_key=openai_api_key, api_base=openai_api_base,
message=message, system_prompt=system_prompt,
model_name=model_name, chat_history=msgs,
temperature=temperature, reasoning_effort=reasoning_effort,
max_new_tokens=max_new_tokens, tools=tools, tool_choice=tool_choice,
):
if chunk == "analysis":
in_analysis, in_visible = True, False
if not raw_started:
raw_started = True
yield make_raw_preview()
continue
if chunk == "assistantfinal":
in_analysis, in_visible = False, True
if not raw_started:
raw_started = True
yield make_raw_preview()
continue
if in_analysis:
raw_analysis += chunk
elif in_visible:
raw_visible += chunk
else:
raw_visible += chunk
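            # Throttle UI updates: re-render only after ~120 new characters or when a chunk contains a newline.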
total_len = len(raw_analysis) + len(raw_visible)
if total_len - last_flush_len >= 120 or "\n" in chunk:
last_flush_len = total_len
yield make_raw_preview()
final_markdown = format_final(raw_analysis, raw_visible)
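        # Balance an unmatched "$" so the LaTeX renderer does not treat the rest of the message as math.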
if final_markdown.count("$") % 2:
final_markdown += "$"
# This replaces the raw preview in-place with the pretty final message
yield final_markdown
except Exception as e:
logging.exception("Stream failed")
yield f"❌ Error: {e}"
chatbot_ui = gr.ChatInterface(
fn=generate,
type="messages",
chatbot=gr.Chatbot(
label="OSS vLLM Chatbot",
type="messages",
height=600,
latex_delimiters=LATEX_DELIMS,
),
additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True),
    additional_inputs=[
        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
        gr.Radio(label="Reasoning Effort", choices=["low", "medium", "high"], value="medium"),
        gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False),
        # Feeds the max_new_tokens parameter of generate(); without it the callback is missing an argument.
        gr.Slider(label="Max new tokens", minimum=128, maximum=MAX_NEW_TOKENS, step=64, value=MAX_NEW_TOKENS),
    ],
stop_btn=True,
examples=[
["Explain the difference between supervised and unsupervised learning."],
["Summarize the plot of Inception in two sentences."],
["Show me the LaTeX for the quadratic formula."],
["What are advantages of AMD Instinct MI300X GPU?"],
["Derive the gradient of softmax cross-entropy loss."],
["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
],
    title="GPT-OSS-120B on AMD MI300X",
    description="This Space is an Alpha release that demonstrates the gpt-oss-120b model running on AMD Instinct MI300X infrastructure. The Space is released under the Apache 2.0 License. ***DISCLAIMER:*** The analysis is shown alongside the final response to give users insight into the model's chain of thought, but it may contain content not deemed suitable for the final response.",
)
if __name__ == "__main__":
    chatbot_ui.queue(max_size=QUEUE_SIZE,
                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()