import os
import logging
import html

import gradio as gr
from openai import OpenAI

from gateway import request_generation
from utils import LATEX_DELIMS

openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
model_name = os.getenv("MODEL_NAME")

client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "1024"))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", "20"))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", str(CONCURRENCY_LIMIT * 4)))

logging.basicConfig(level=logging.INFO)


def format_final(analysis_text: str, visible_text: str) -> str:
    """Render the final message: collapsible analysis + normal Markdown answer."""
    reasoning_safe = html.escape((analysis_text or "").strip())
    response = (visible_text or "").strip()
    # Collapsible analysis, normal markdown answer
    return (
        "<details open><summary>🤔 Analysis</summary>\n"
        "<pre style='white-space:pre-wrap'>"
        f"{reasoning_safe}"
        "</pre>\n"
        "</details>\n\n"
        "**💬 Response:**\n\n"
        f"{response}"
    )
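
# `gateway.request_generation` is defined elsewhere in this repo. Inferred purely
# from how it is called and consumed below (an assumption, not the gateway's
# actual code), it is a generator that first yields the literal marker
# "analysis", then reasoning tokens, then the marker "assistantfinal", then
# answer tokens:
#
#     def request_generation(*, api_key, api_base, message, system_prompt,
#                            model_name, chat_history, temperature,
#                            reasoning_effort, max_new_tokens,
#                            tools=None, tool_choice=None):
#         yield "analysis"              # marker: reasoning tokens follow
#         yield "Thinking about it."    # streamed chain-of-thought text
#         yield "assistantfinal"        # marker: user-visible answer follows
#         yield "The answer is 42."     # streamed answer text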
\n\n" "**💬 Response:**\n\n" f"{response}" ) def generate(message, history, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens): if not message.strip(): yield "Please enter a prompt." return # Flatten gradio history msgs = [] for h in history: if isinstance(h, dict): msgs.append(h) elif isinstance(h, (list, tuple)) and len(h) == 2: u, a = h if u: msgs.append({"role": "user", "content": u}) if a: msgs.append({"role": "assistant", "content": a}) logging.info(f"[User] {message}") logging.info(f"[System] {system_prompt} | Temp={temperature}") tools = [{"type": "web_search_preview"}] if enable_browsing else None tool_choice = "auto" if enable_browsing else None in_analysis = False in_visible = False raw_analysis = "" raw_visible = "" raw_started = False last_flush_len = 0 def make_raw_preview() -> str: return ( "```text\n" "Analysis (live):\n" f"{raw_analysis}\n\n" "Response (draft):\n" f"{raw_visible}\n" "```" ) try: for chunk in request_generation( api_key=openai_api_key, api_base=openai_api_base, message=message, system_prompt=system_prompt, model_name=model_name, chat_history=msgs, temperature=temperature, reasoning_effort=reasoning_effort, max_new_tokens=max_new_tokens, tools=tools, tool_choice=tool_choice, ): if chunk == "analysis": in_analysis, in_visible = True, False if not raw_started: raw_started = True yield make_raw_preview() continue if chunk == "assistantfinal": in_analysis, in_visible = False, True if not raw_started: raw_started = True yield make_raw_preview() continue if in_analysis: raw_analysis += chunk elif in_visible: raw_visible += chunk else: raw_visible += chunk total_len = len(raw_analysis) + len(raw_visible) if total_len - last_flush_len >= 120 or "\n" in chunk: last_flush_len = total_len yield make_raw_preview() final_markdown = format_final(raw_analysis, raw_visible) if final_markdown.count("$") % 2: final_markdown += "$" # This replaces the raw preview in-place with the pretty final message yield final_markdown except Exception as e: logging.exception("Stream failed") yield f"❌ Error: {e}" chatbot_ui = gr.ChatInterface( fn=generate, type="messages", chatbot=gr.Chatbot( label="OSS vLLM Chatbot", type="messages", height=600, latex_delimiters=LATEX_DELIMS, ), additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True), additional_inputs=[ gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2), gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7), gr.Radio(label="Reasoning Effort", choices=["low","medium","high"], value="medium"), gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False), ], stop_btn=True, examples=[ ["Explain the difference between supervised and unsupervised learning."], ["Summarize the plot of Inception in two sentences."], ["Show me the LaTeX for the quadratic formula."], ["What are advantages of AMD Instinct MI300X GPU?"], ["Derive the gradient of softmax cross-entropy loss."], ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."], ], title=" GPT-OSS-120B on AMD MI300X", description="This Space is an Alpha release that demonstrates gpt-oss-120b model running on AMD MI300 infrastructure. The space is built with Apache 2.0 License. ***DISCLAIMER:*** Analysis is provided along with final response to allow users to gain insight into model's chain of thought, but may contain content not deemed suitable to include in final response.", ) if __name__ == "__main__": chatbot_ui.queue(max_size=QUEUE_SIZE, default_concurrency_limit=CONCURRENCY_LIMIT).launch()