import html
import logging
import os

import gradio as gr
from openai import OpenAI

from gateway import request_generation
from utils import LATEX_DELIMS
# Endpoint, credentials, and model name are supplied via environment variables.
openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
model_name = os.getenv("MODEL_NAME")

# Note: generation below streams through gateway.request_generation, so this
# client is not used on the streaming path.
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "1024"))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", "20"))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", str(CONCURRENCY_LIMIT * 4)))

logging.basicConfig(level=logging.INFO)

def format_final(analysis_text: str, visible_text: str) -> str:
"""Render final message with collapsible analysis + normal Markdown answer."""
reasoning_safe = html.escape((analysis_text or "").strip())
response = (visible_text or "").strip()
# Collapsible analysis, normal markdown answer
return (
"<details><summary><strong>🤔 Analysis</strong></summary>\n"
"<pre style='white-space:pre-wrap;'>"
f"{reasoning_safe}"
"</pre>\n</details>\n\n"
"**💬 Response:**\n\n"
f"{response}"
)


# max_new_tokens defaults from the env var because the ChatInterface below
# wires only four additional inputs (there is no slider for it).
def generate(message, history, system_prompt, temperature, reasoning_effort,
             enable_browsing, max_new_tokens=MAX_NEW_TOKENS):
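    """Stream a reply: yield a raw live preview while tokens arrive, then
    replace it with a final message containing the collapsible analysis."""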
if not message.strip():
yield "Please enter a prompt."
return
    # Normalize Gradio chat history into OpenAI-style message dicts.
msgs = []
for h in history:
if isinstance(h, dict):
msgs.append(h)
elif isinstance(h, (list, tuple)) and len(h) == 2:
u, a = h
if u: msgs.append({"role": "user", "content": u})
if a: msgs.append({"role": "assistant", "content": a})
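
    # Expose the hosted web-search tool only when browsing is enabled.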
tools = [{"type": "web_search_preview"}] if enable_browsing else None
tool_choice = "auto" if enable_browsing else None
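
    # The stream is channel-tagged: an "analysis" marker switches accumulation
    # to the chain-of-thought buffer, "assistantfinal" switches to the
    # user-facing answer, and anything before either marker is treated as
    # visible output.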
in_analysis = False
in_visible = False
raw_analysis = ""
raw_visible = ""
raw_started = False
last_flush_len = 0
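
    # Plain-text preview yielded while tokens stream in; the final yield
    # below replaces it with the formatted message.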
def make_raw_preview() -> str:
return (
"```text\n"
"Analysis (live):\n"
f"{raw_analysis}\n\n"
"Response (draft):\n"
f"{raw_visible}\n"
"```"
)
try:
for chunk in request_generation(
api_key=openai_api_key, api_base=openai_api_base,
message=message, system_prompt=system_prompt,
model_name=model_name, chat_history=msgs,
temperature=temperature, reasoning_effort=reasoning_effort,
max_new_tokens=max_new_tokens, tools=tools, tool_choice=tool_choice,
):
if chunk == "analysis":
in_analysis, in_visible = True, False
if not raw_started:
raw_started = True
yield make_raw_preview()
continue
if chunk == "assistantfinal":
in_analysis, in_visible = False, True
if not raw_started:
raw_started = True
yield make_raw_preview()
continue
if in_analysis:
raw_analysis += chunk
elif in_visible:
raw_visible += chunk
else:
raw_visible += chunk
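            # Throttle UI updates: re-render after ~120 new characters or
            # whenever the chunk contains a newline.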
total_len = len(raw_analysis) + len(raw_visible)
if total_len - last_flush_len >= 120 or "\n" in chunk:
last_flush_len = total_len
yield make_raw_preview()
final_markdown = format_final(raw_analysis, raw_visible)
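        # An unpaired "$" would make the LaTeX renderer treat the rest of the
        # message as math, so append a closing "$" to balance it.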
if final_markdown.count("$") % 2:
final_markdown += "$"
# This replaces the raw preview in-place with the pretty final message
yield final_markdown
except Exception as e:
logging.exception("Stream failed")
yield f"❌ Error: {e}"
chatbot_ui = gr.ChatInterface(
fn=generate,
type="messages",
chatbot=gr.Chatbot(
label="OSS vLLM Chatbot",
type="messages",
height=600,
latex_delimiters=LATEX_DELIMS,
),
additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True),
additional_inputs=[
gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
gr.Radio(label="Reasoning Effort", choices=["low","medium","high"], value="medium"),
gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False),
],
stop_btn=True,
    examples=[
        ["Explain the difference between supervised and unsupervised learning."],
        ["Summarize the plot of Inception in two sentences."],
        ["Show me the LaTeX for the quadratic formula."],
        ["What are the advantages of the AMD Instinct MI300X GPU?"],
        ["Derive the gradient of the softmax cross-entropy loss."],
        ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
    ],
    title="GPT-OSS-120B on AMD MI300X",
    description="This Space is an Alpha release demonstrating the gpt-oss-120b model running on AMD MI300X infrastructure. It is built under the Apache 2.0 License. ***DISCLAIMER:*** The analysis is shown alongside the final response to give insight into the model's chain of thought, but it may contain content not deemed suitable for the final response.",
)

if __name__ == "__main__":
    chatbot_ui.queue(
        max_size=QUEUE_SIZE,
        default_concurrency_limit=CONCURRENCY_LIMIT,
    ).launch()