import html
import logging
import os

import gradio as gr
from openai import OpenAI
from gateway import request_generation
from utils import LATEX_DELIMS
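# Connection details, model name, and runtime limits are supplied via environment variables.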
openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
model_name = os.getenv("MODEL_NAME")
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)  # not used below; streaming goes through gateway.request_generation
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))
logging.basicConfig(level=logging.INFO)
def format_final(analysis_text: str, visible_text: str) -> str:
"""Render final message with collapsible analysis + normal Markdown answer."""
reasoning_safe = html.escape((analysis_text or "").strip())
response = (visible_text or "").strip()
# Collapsible analysis, normal markdown answer
return (
"<details><summary><strong>🤔 Analysis</strong></summary>\n"
"<pre style='white-space:pre-wrap;'>"
f"{reasoning_safe}"
"</pre>\n</details>\n\n"
"**💬 Response:**\n\n"
f"{response}"
)
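# Streaming callback for gr.ChatInterface: each yield replaces the currently displayed
# message, so a plain-text preview streams first and the formatted final answer
# overwrites it at the end.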
def generate(message, history, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens):
if not message.strip():
yield "Please enter a prompt."
return
# Flatten gradio history
msgs = []
for h in history:
if isinstance(h, dict):
msgs.append(h)
elif isinstance(h, (list, tuple)) and len(h) == 2:
u, a = h
if u: msgs.append({"role": "user", "content": u})
if a: msgs.append({"role": "assistant", "content": a})
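    # Optionally expose the web_search_preview tool and let the model decide when to call it.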
tools = [{"type": "web_search_preview"}] if enable_browsing else None
tool_choice = "auto" if enable_browsing else None
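    # Streaming state: the stream carries the channel markers "analysis" and
    # "assistantfinal"; every chunk that follows belongs to that channel.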
in_analysis = False
in_visible = False
raw_analysis = ""
raw_visible = ""
raw_started = False
last_flush_len = 0
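    # Plain-text preview shown while tokens are still streaming in.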
def make_raw_preview() -> str:
return (
"```text\n"
"Analysis (live):\n"
f"{raw_analysis}\n\n"
"Response (draft):\n"
f"{raw_visible}\n"
"```"
)
try:
for chunk in request_generation(
api_key=openai_api_key, api_base=openai_api_base,
message=message, system_prompt=system_prompt,
model_name=model_name, chat_history=msgs,
temperature=temperature, reasoning_effort=reasoning_effort,
max_new_tokens=max_new_tokens, tools=tools, tool_choice=tool_choice,
):
if chunk == "analysis":
in_analysis, in_visible = True, False
if not raw_started:
raw_started = True
yield make_raw_preview()
continue
if chunk == "assistantfinal":
in_analysis, in_visible = False, True
if not raw_started:
raw_started = True
yield make_raw_preview()
continue
if in_analysis:
raw_analysis += chunk
elif in_visible:
raw_visible += chunk
else:
raw_visible += chunk
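            # Throttle UI updates: re-render only after ~120 new characters or when a chunk contains a newline.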
total_len = len(raw_analysis) + len(raw_visible)
if total_len - last_flush_len >= 120 or "\n" in chunk:
last_flush_len = total_len
yield make_raw_preview()
final_markdown = format_final(raw_analysis, raw_visible)
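        # Balance an unmatched "$" so the LaTeX renderer does not treat the rest of the message as math.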
if final_markdown.count("$") % 2:
final_markdown += "$"
# This replaces the raw preview in-place with the pretty final message
yield final_markdown
except Exception as e:
logging.exception("Stream failed")
yield f"❌ Error: {e}"
chatbot_ui = gr.ChatInterface(
fn=generate,
type="messages",
chatbot=gr.Chatbot(
label="OSS vLLM Chatbot",
type="messages",
height=600,
latex_delimiters=LATEX_DELIMS,
),
additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True),
    additional_inputs=[
        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
        gr.Radio(label="Reasoning Effort", choices=["low", "medium", "high"], value="medium"),
        gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False),
        # Feeds the max_new_tokens parameter of generate(); without it the callback is missing an argument.
        gr.Slider(label="Max new tokens", minimum=128, maximum=MAX_NEW_TOKENS, step=64, value=MAX_NEW_TOKENS),
    ],
stop_btn=True,
examples=[
["Explain the difference between supervised and unsupervised learning."],
["Summarize the plot of Inception in two sentences."],
["Show me the LaTeX for the quadratic formula."],
["What are advantages of AMD Instinct MI300X GPU?"],
["Derive the gradient of softmax cross-entropy loss."],
["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
],
    title="GPT-OSS-120B on AMD MI300X",
    description="This Space is an Alpha release that demonstrates the gpt-oss-120b model running on AMD Instinct MI300X infrastructure. The Space is released under the Apache 2.0 License. ***DISCLAIMER:*** The analysis is shown alongside the final response to give users insight into the model's chain of thought, but it may contain content not deemed suitable for the final response.",
)
if __name__ == "__main__":
    chatbot_ui.queue(max_size=QUEUE_SIZE,
                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()