import html
import logging
import os

import gradio as gr
from openai import OpenAI

from gateway import request_generation
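# Assumed contract (inferred from the parsing loop in generate() below):
# request_generation is a generator that yields the literal markers
# "analysis" and "assistantfinal" to switch channels, and plain token
# chunks in between.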
from utils import LATEX_DELIMS
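# Assumption: LATEX_DELIMS follows Gradio's latex_delimiters format, e.g.
# [{"left": "$$", "right": "$$", "display": True},
#  {"left": "$", "right": "$", "display": False}]  (illustrative values).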

openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
model_name = os.getenv("MODEL_NAME")

client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "1024"))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", "20"))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))

logging.basicConfig(level=logging.INFO)


def format_final(analysis_text: str, visible_text: str) -> str:
    """Render the final message: collapsible analysis + normal Markdown answer."""
    reasoning_safe = html.escape((analysis_text or "").strip())
    response = (visible_text or "").strip()
    # Collapsible analysis block, then the answer as normal Markdown.
    return (
        "<details><summary><strong>🤔 Analysis</strong></summary>\n"
        "<pre style='white-space:pre-wrap;'>"
        f"{reasoning_safe}"
        "</pre>\n</details>\n\n"
        "**💬 Response:**\n\n"
        f"{response}"
    )
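
# Example (hypothetical strings): format_final("Keep it brief.", "Paris.")
# returns a collapsed <details> block containing the escaped analysis,
# followed by the answer under the "💬 Response:" heading.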


def generate(message, history, system_prompt, temperature, reasoning_effort,
             enable_browsing, max_new_tokens):
    if not message.strip():
        yield "Please enter a prompt."
        return

    # Flatten Gradio history into OpenAI-style message dicts.
    msgs = []
    for h in history:
        if isinstance(h, dict):
            msgs.append(h)
        elif isinstance(h, (list, tuple)) and len(h) == 2:
            u, a = h
            if u:
                msgs.append({"role": "user", "content": u})
            if a:
                msgs.append({"role": "assistant", "content": a})

    tools = [{"type": "web_search_preview"}] if enable_browsing else None
    tool_choice = "auto" if enable_browsing else None

    # Streaming state: current channel and the text accumulated so far.
    in_analysis = False
    in_visible = False
    raw_analysis = ""
    raw_visible = ""
    raw_started = False
    last_flush_len = 0

    def make_raw_preview() -> str:
        return (
            "```text\n"
            "Analysis (live):\n"
            f"{raw_analysis}\n\n"
            "Response (draft):\n"
            f"{raw_visible}\n"
            "```"
        )
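
    # The ```text fence keeps the half-finished stream rendering as plain
    # text, so partial Markdown/LaTeX does not render (and flicker) mid-stream.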
    try:
        for chunk in request_generation(
            api_key=openai_api_key,
            api_base=openai_api_base,
            message=message,
            system_prompt=system_prompt,
            model_name=model_name,
            chat_history=msgs,
            temperature=temperature,
            reasoning_effort=reasoning_effort,
            max_new_tokens=max_new_tokens,
            tools=tools,
            tool_choice=tool_choice,
        ):
            # Channel markers switch between analysis and visible output.
            if chunk == "analysis":
                in_analysis, in_visible = True, False
                if not raw_started:
                    raw_started = True
                    yield make_raw_preview()
                continue
            if chunk == "assistantfinal":
                in_analysis, in_visible = False, True
                if not raw_started:
                    raw_started = True
                    yield make_raw_preview()
                continue

            if in_analysis:
                raw_analysis += chunk
            else:
                # Anything outside an explicit channel counts as visible output.
                raw_visible += chunk

            # Re-render the preview every ~120 new characters or on a newline.
            total_len = len(raw_analysis) + len(raw_visible)
            if total_len - last_flush_len >= 120 or "\n" in chunk:
                last_flush_len = total_len
                yield make_raw_preview()

        final_markdown = format_final(raw_analysis, raw_visible)
        # Balance a dangling $ so the LaTeX renderer doesn't eat the tail.
        if final_markdown.count("$") % 2:
            final_markdown += "$"
        # This replaces the raw preview in place with the pretty final message.
        yield final_markdown
    except Exception as e:
        logging.exception("Stream failed")
        yield f"❌ Error: {e}"


chatbot_ui = gr.ChatInterface(
    fn=generate,
    type="messages",
    chatbot=gr.Chatbot(
        label="OSS vLLM Chatbot",
        type="messages",
        height=600,
        latex_delimiters=LATEX_DELIMS,
    ),
    additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True),
    additional_inputs=[
        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
        gr.Radio(label="Reasoning Effort", choices=["low", "medium", "high"], value="medium"),
        gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False),
        # generate() takes five additional inputs; without this slider Gradio
        # would call it one argument short.
        gr.Slider(label="Max new tokens", minimum=64, maximum=MAX_NEW_TOKENS,
                  step=64, value=MAX_NEW_TOKENS),
    ],
    stop_btn=True,
    examples=[
        ["Explain the difference between supervised and unsupervised learning."],
        ["Summarize the plot of Inception in two sentences."],
        ["Show me the LaTeX for the quadratic formula."],
        ["What are the advantages of the AMD Instinct MI300X GPU?"],
        ["Derive the gradient of the softmax cross-entropy loss."],
        ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
    ],
    title="GPT-OSS-120B on AMD MI300X",
    description=(
        "This Space is an Alpha release demonstrating the gpt-oss-120b model "
        "running on AMD MI300X infrastructure. The Space is released under the "
        "Apache 2.0 License. ***DISCLAIMER:*** The analysis is shown alongside "
        "the final response to give insight into the model's chain of thought, "
        "but it may contain content not deemed suitable for the final response."
    ),
)

if __name__ == "__main__":
    chatbot_ui.queue(
        max_size=QUEUE_SIZE,
        default_concurrency_limit=CONCURRENCY_LIMIT,
    ).launch()