import html
import logging
import os

import gradio as gr
from openai import OpenAI

from gateway import request_generation
from utils import LATEX_DELIMS
# Endpoint, credentials, and model name are supplied via environment variables.
openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
model_name = os.getenv("MODEL_NAME")

# Note: generation below streams through gateway.request_generation, so this
# client is not used on the streaming path.
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "1024"))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", "20"))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", str(CONCURRENCY_LIMIT * 4)))

logging.basicConfig(level=logging.INFO)

def format_final(analysis_text: str, visible_text: str) -> str:
"""Render final message with collapsible analysis + normal Markdown answer."""
reasoning_safe = html.escape((analysis_text or "").strip())
response = (visible_text or "").strip()
# Collapsible analysis, normal markdown answer
return (
"<details><summary><strong>🤔 Analysis</strong></summary>\n"
"<pre style='white-space:pre-wrap;'>"
f"{reasoning_safe}"
"</pre>\n</details>\n\n"
"**💬 Response:**\n\n"
f"{response}"
)


# max_new_tokens defaults from the env var because the ChatInterface below
# wires only four additional inputs (there is no slider for it).
def generate(message, history, system_prompt, temperature, reasoning_effort,
             enable_browsing, max_new_tokens=MAX_NEW_TOKENS):
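    """Stream a reply: yield a raw live preview while tokens arrive, then
    replace it with a final message containing the collapsible analysis."""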
if not message.strip():
yield "Please enter a prompt."
return
    # Normalize Gradio chat history into OpenAI-style message dicts.
msgs = []
for h in history:
if isinstance(h, dict):
msgs.append(h)
elif isinstance(h, (list, tuple)) and len(h) == 2:
u, a = h
if u: msgs.append({"role": "user", "content": u})
if a: msgs.append({"role": "assistant", "content": a})
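
    # Expose the hosted web-search tool only when browsing is enabled.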
tools = [{"type": "web_search_preview"}] if enable_browsing else None
tool_choice = "auto" if enable_browsing else None
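
    # The stream is channel-tagged: an "analysis" marker switches accumulation
    # to the chain-of-thought buffer, "assistantfinal" switches to the
    # user-facing answer, and anything before either marker is treated as
    # visible output.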
in_analysis = False
in_visible = False
raw_analysis = ""
raw_visible = ""
raw_started = False
last_flush_len = 0
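
    # Plain-text preview yielded while tokens stream in; the final yield
    # below replaces it with the formatted message.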
def make_raw_preview() -> str:
return (
"```text\n"
"Analysis (live):\n"
f"{raw_analysis}\n\n"
"Response (draft):\n"
f"{raw_visible}\n"
"```"
)
try:
for chunk in request_generation(
api_key=openai_api_key, api_base=openai_api_base,
message=message, system_prompt=system_prompt,
model_name=model_name, chat_history=msgs,
temperature=temperature, reasoning_effort=reasoning_effort,
max_new_tokens=max_new_tokens, tools=tools, tool_choice=tool_choice,
):
if chunk == "analysis":
in_analysis, in_visible = True, False
if not raw_started:
raw_started = True
yield make_raw_preview()
continue
if chunk == "assistantfinal":
in_analysis, in_visible = False, True
if not raw_started:
raw_started = True
yield make_raw_preview()
continue
if in_analysis:
raw_analysis += chunk
elif in_visible:
raw_visible += chunk
else:
raw_visible += chunk
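            # Throttle UI updates: re-render after ~120 new characters or
            # whenever the chunk contains a newline.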
total_len = len(raw_analysis) + len(raw_visible)
if total_len - last_flush_len >= 120 or "\n" in chunk:
last_flush_len = total_len
yield make_raw_preview()
final_markdown = format_final(raw_analysis, raw_visible)
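        # An unpaired "$" would make the LaTeX renderer treat the rest of the
        # message as math, so append a closing "$" to balance it.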
if final_markdown.count("$") % 2:
final_markdown += "$"
# This replaces the raw preview in-place with the pretty final message
yield final_markdown
except Exception as e:
logging.exception("Stream failed")
yield f"❌ Error: {e}"
chatbot_ui = gr.ChatInterface(
fn=generate,
type="messages",
chatbot=gr.Chatbot(
label="OSS vLLM Chatbot",
type="messages",
height=600,
latex_delimiters=LATEX_DELIMS,
),
additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True),
additional_inputs=[
gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
gr.Radio(label="Reasoning Effort", choices=["low","medium","high"], value="medium"),
gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False),
],
stop_btn=True,
    examples=[
        ["Explain the difference between supervised and unsupervised learning."],
        ["Summarize the plot of Inception in two sentences."],
        ["Show me the LaTeX for the quadratic formula."],
        ["What are the advantages of the AMD Instinct MI300X GPU?"],
        ["Derive the gradient of the softmax cross-entropy loss."],
        ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
    ],
    title="GPT-OSS-120B on AMD MI300X",
    description="This Space is an Alpha release demonstrating the gpt-oss-120b model running on AMD MI300X infrastructure. It is built under the Apache 2.0 License. ***DISCLAIMER:*** The analysis is shown alongside the final response to give insight into the model's chain of thought, but it may contain content not deemed suitable for the final response.",
)

if __name__ == "__main__":
    chatbot_ui.queue(
        max_size=QUEUE_SIZE,
        default_concurrency_limit=CONCURRENCY_LIMIT,
    ).launch()