import html
import logging
import os

import gradio as gr
from openai import OpenAI

from gateway import request_generation
from utils import LATEX_DELIMS

openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
model_name = os.getenv("MODEL_NAME")

client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))

logging.basicConfig(level=logging.INFO)


def format_final(analysis_text: str, visible_text: str) -> str:
    """Render the final message: collapsible analysis + normal Markdown answer."""
    reasoning_safe = html.escape((analysis_text or "").strip())
    response = (visible_text or "").strip()
    # Collapsible analysis, normal Markdown answer.
    return (
        "<details><summary><strong>🤔 Analysis</strong></summary>\n"
        "<pre style='white-space:pre-wrap;'>"
        f"{reasoning_safe}"
        "</pre>\n</details>\n\n"
        "**💬 Response:**\n\n"
        f"{response}"
    )
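
# For illustration only (hypothetical inputs):
# format_final("The user asks for 2+2; basic arithmetic.", "2 + 2 = 4")
# returns a string that renders roughly as:
#
#   <details><summary><strong>🤔 Analysis</strong></summary>
#   <pre style='white-space:pre-wrap;'>The user asks for 2+2; basic arithmetic.</pre>
#   </details>
#
#   **💬 Response:**
#
#   2 + 2 = 4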


def generate(message, history, system_prompt, temperature, reasoning_effort,
             enable_browsing, max_new_tokens):
    if not message.strip():
        yield "Please enter a prompt."
        return

    # Flatten the Gradio history (message dicts or (user, assistant) pairs)
    # into OpenAI-style chat messages.
    msgs = []
    for h in history:
        if isinstance(h, dict):
            msgs.append(h)
        elif isinstance(h, (list, tuple)) and len(h) == 2:
            u, a = h
            if u:
                msgs.append({"role": "user", "content": u})
            if a:
                msgs.append({"role": "assistant", "content": a})

    logging.info(f"[User] {message}")
    logging.info(f"[System] {system_prompt} | Temp={temperature}")

    tools = [{"type": "web_search_preview"}] if enable_browsing else None
    tool_choice = "auto" if enable_browsing else None
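
    # The tool spec mirrors the OpenAI Responses API built-in web-search tool
    # ({"type": "web_search_preview"}); whether it is actually executed is up
    # to gateway.request_generation and the backing server, so treat this as a
    # pass-through hint rather than a guarantee of browsing.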

    # Streaming state: which channel the stream is currently in, plus the text
    # accumulated for each channel.
    in_analysis = False
    in_visible = False
    raw_analysis = ""
    raw_visible = ""
    raw_started = False
    last_flush_len = 0

    def make_raw_preview() -> str:
        return (
            "```text\n"
            "Analysis (live):\n"
            f"{raw_analysis}\n\n"
            "Response (draft):\n"
            f"{raw_visible}\n"
            "```"
        )
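
    # The preview is fenced as ```text so Gradio renders it verbatim: partial
    # Markdown, HTML, or LaTeX fragments are not interpreted mid-stream.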

    try:
        for chunk in request_generation(
            api_key=openai_api_key, api_base=openai_api_base,
            message=message, system_prompt=system_prompt,
            model_name=model_name, chat_history=msgs,
            temperature=temperature, reasoning_effort=reasoning_effort,
            max_new_tokens=max_new_tokens, tools=tools, tool_choice=tool_choice,
        ):
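            # The gateway is expected to interleave two sentinel strings into
            # the stream: "analysis" (start of the reasoning channel) and
            # "assistantfinal" (start of the user-facing answer), matching the
            # gpt-oss harmony channel markers. Anything arriving before the
            # first sentinel is treated as visible text below.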
            if chunk == "analysis":
                in_analysis, in_visible = True, False
                if not raw_started:
                    raw_started = True
                    yield make_raw_preview()
                continue
            if chunk == "assistantfinal":
                in_analysis, in_visible = False, True
                if not raw_started:
                    raw_started = True
                    yield make_raw_preview()
                continue

            if in_analysis:
                raw_analysis += chunk
            elif in_visible:
                raw_visible += chunk
            else:
                # No channel marker seen yet: treat the text as visible output.
                raw_visible += chunk

            # Re-render the preview roughly every 120 characters or whenever a
            # newline arrives, rather than on every token, to keep UI updates cheap.
            total_len = len(raw_analysis) + len(raw_visible)
            if total_len - last_flush_len >= 120 or "\n" in chunk:
                last_flush_len = total_len
                yield make_raw_preview()

        final_markdown = format_final(raw_analysis, raw_visible)
        # An odd number of $ signs would leave a LaTeX delimiter open and make
        # the renderer treat the rest of the message as math; close it.
        if final_markdown.count("$") % 2:
            final_markdown += "$"
        # This replaces the raw preview in-place with the pretty final message.
        yield final_markdown
    except Exception as e:
        logging.exception("Stream failed")
        yield f"❌ Error: {e}"


chatbot_ui = gr.ChatInterface(
    fn=generate,
    type="messages",
    chatbot=gr.Chatbot(
        label="OSS vLLM Chatbot",
        type="messages",
        height=600,
        latex_delimiters=LATEX_DELIMS,
    ),
    additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True),
    additional_inputs=[
        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
        gr.Radio(label="Reasoning Effort", choices=["low", "medium", "high"], value="medium"),
        gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False),
        # generate() expects a max_new_tokens argument, so expose it here; the
        # slider bounds are assumptions, with the default taken from the
        # MAX_NEW_TOKENS environment variable.
        gr.Slider(label="Max new tokens", minimum=64, maximum=MAX_NEW_TOKENS,
                  step=64, value=MAX_NEW_TOKENS),
    ],
    stop_btn=True,
| examples=[ | |
| ["Explain the difference between supervised and unsupervised learning."], | |
| ["Summarize the plot of Inception in two sentences."], | |
| ["Show me the LaTeX for the quadratic formula."], | |
| ["What are advantages of AMD Instinct MI300X GPU?"], | |
| ["Derive the gradient of softmax cross-entropy loss."], | |
| ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."], | |
| ], | |
    title="GPT-OSS-120B on AMD MI300X",
    description=(
        "This Space is an Alpha release demonstrating the gpt-oss-120b model "
        "running on AMD MI300X infrastructure. The Space is released under the "
        "Apache 2.0 License. ***DISCLAIMER:*** The analysis is shown alongside "
        "the final response to give insight into the model's chain of thought, "
        "but it may contain content not deemed suitable for the final response."
    ),
)

if __name__ == "__main__":
    chatbot_ui.queue(max_size=QUEUE_SIZE,
                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()
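
# Minimal sketch of a local run, assuming an OpenAI-compatible vLLM endpoint;
# the key, URL, and model name below are illustrative, not fixed values:
#
#   export API_KEY=EMPTY
#   export API_ENDPOINT=http://localhost:8000/v1
#   export MODEL_NAME=openai/gpt-oss-120b
#   python app.py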