mahdicv committed · Commit ba7492c · 1 Parent(s): 507da18

updating with web browsing + reasoning effort

Files changed (4)
  1. README.md +1 -3
  2. app.py +79 -46
  3. gateway.py +88 -25
  4. utils.py +2 -5
README.md CHANGED
@@ -8,9 +8,7 @@ sdk_version: 5.36.2
 app_file: app.py
 pinned: false
 license: apache-2.0
-models:
-- openai/gpt-oss-120b
-short_description: 'gpt-oss-120b model running on AMD MI300 infrastructure.'
+short_description: 'UPDATED: openai/gpt-oss-120b with web browsing & reasoning effort on AMD MI300X GPUs.'
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,11 +1,12 @@
-import os, re, logging, gradio as gr
+import os, logging, gradio as gr
+from pydoc import html
 from openai import OpenAI
 from gateway import request_generation
 from utils import LATEX_DELIMS
-
+
 openai_api_key = os.getenv("API_KEY")
 openai_api_base = os.getenv("API_ENDPOINT")
-MODEL = os.getenv("MODEL_NAME", "")
+model_name = os.getenv("MODEL_NAME")
 client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
 MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
 CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
@@ -13,26 +14,26 @@ QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))
 
 logging.basicConfig(level=logging.INFO)
 
-def format_analysis_response(text):
-    m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
-    if m:
-        reasoning = m.group(1).strip()
-        response = text.split("assistantfinal", 1)[-1].strip()
-        return (
-            f"**🤔 Analysis:**\n\n*{reasoning}*\n\n---\n\n"
-            f"**💬 Response:**\n\n{response}"
-        )
-    return text.strip()
-
-def generate(message, history,
-             system_prompt, temperature,
-             frequency_penalty, presence_penalty,
-             max_new_tokens):
+def format_final(analysis_text: str, visible_text: str) -> str:
+    """Render final message with collapsible analysis + normal Markdown answer."""
+    reasoning_safe = html.escape((analysis_text or "").strip())
+    response = (visible_text or "").strip()
+    # Collapsible analysis, normal markdown answer
+    return (
+        "<details><summary><strong>🤔 Analysis</strong></summary>\n"
+        "<pre style='white-space:pre-wrap;'>"
+        f"{reasoning_safe}"
+        "</pre>\n</details>\n\n"
+        "**💬 Response:**\n\n"
+        f"{response}"
+    )
 
+def generate(message, history, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens):
     if not message.strip():
         yield "Please enter a prompt."
         return
 
+    # Flatten gradio history
     msgs = []
     for h in history:
         if isinstance(h, dict):
@@ -45,59 +46,92 @@ def generate(message, history,
     logging.info(f"[User] {message}")
     logging.info(f"[System] {system_prompt} | Temp={temperature}")
 
-    collected, buffer = "", ""
-    yielded_once = False
+    tools = [{"type": "web_search_preview"}] if enable_browsing else None
+    tool_choice = "auto" if enable_browsing else None
+
+    in_analysis = False
+    in_visible = False
+
+    raw_analysis = ""
+    raw_visible = ""
+
+    raw_started = False
+    last_flush_len = 0
+
+    def make_raw_preview() -> str:
+        return (
+            "```text\n"
+            "Analysis (live):\n"
+            f"{raw_analysis}\n\n"
+            "Response (draft):\n"
+            f"{raw_visible}\n"
+            "```"
+        )
 
     try:
-        for delta in request_generation(
+        for chunk in request_generation(
             api_key=openai_api_key, api_base=openai_api_base,
             message=message, system_prompt=system_prompt,
-            model_name=MODEL, chat_history=msgs,
-            temperature=temperature,
-            frequency_penalty=frequency_penalty,
-            presence_penalty=presence_penalty,
-            max_new_tokens=max_new_tokens,
+            model_name=model_name, chat_history=msgs,
+            temperature=temperature, reasoning_effort=reasoning_effort,
+            max_new_tokens=max_new_tokens, tools=tools, tool_choice=tool_choice,
         ):
-            if not delta:
+            if chunk == "analysis":
+                in_analysis, in_visible = True, False
+                if not raw_started:
+                    raw_started = True
+                    yield make_raw_preview()
                 continue
 
-            collected += delta
-            buffer += delta
-
-            if not yielded_once:
-                yield delta
-                buffer = ""
-                yielded_once = True
+            if chunk == "assistantfinal":
+                in_analysis, in_visible = False, True
+                if not raw_started:
+                    raw_started = True
+                    yield make_raw_preview()
                 continue
 
-            if "\n" in buffer or len(buffer) > 150:
-                yield collected
-                buffer = ""
+            if in_analysis:
+                raw_analysis += chunk
+            elif in_visible:
+                raw_visible += chunk
+            else:
+                raw_visible += chunk
+
+            total_len = len(raw_analysis) + len(raw_visible)
+            if total_len - last_flush_len >= 120 or "\n" in chunk:
+                last_flush_len = total_len
+                yield make_raw_preview()
+
+        final_markdown = format_final(raw_analysis, raw_visible)
 
-        final = format_analysis_response(collected)
-        if final.count("$") % 2:
-            final += "$"
-        yield final
+        if final_markdown.count("$") % 2:
+            final_markdown += "$"
+
+        # This replaces the raw preview in-place with the pretty final message
+        yield final_markdown
 
     except Exception as e:
         logging.exception("Stream failed")
         yield f"❌ Error: {e}"
 
+
 chatbot_ui = gr.ChatInterface(
     fn=generate,
     type="messages",
     chatbot=gr.Chatbot(
         label="OSS vLLM Chatbot",
         type="messages",
-        scale=2,
         height=600,
         latex_delimiters=LATEX_DELIMS,
     ),
-    stop_btn=True,
+    additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True),
    additional_inputs=[
         gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
         gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
+        gr.Radio(label="Reasoning Effort", choices=["low","medium","high"], value="medium"),
+        gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False),
     ],
+    stop_btn=True,
     examples=[
         ["Explain the difference between supervised and unsupervised learning."],
         ["Summarize the plot of Inception in two sentences."],
@@ -106,11 +140,10 @@ chatbot_ui = gr.ChatInterface(
         ["Derive the gradient of softmax cross-entropy loss."],
         ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
     ],
-    # title="Open-source GPT-OSS-120B on AMD MI300X",
     title=" GPT-OSS-120B on AMD MI300X",
     description="This Space is an Alpha release that demonstrates gpt-oss-120b model running on AMD MI300 infrastructure. The space is built with Apache 2.0 License.",
 )
+
 if __name__ == "__main__":
     chatbot_ui.queue(max_size=QUEUE_SIZE,
-                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()
-
+                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()
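
Note (not part of the commit): generate() now treats the strings "analysis" and "assistantfinal" yielded by request_generation as sentinels that switch between the reasoning section and the visible answer. A minimal sketch of that state machine, driven by a hypothetical fake_stream generator instead of a live backend:

    # Sketch only: fake_stream stands in for request_generation(...)
    def fake_stream():
        yield "analysis"
        yield "The user wants a short greeting."
        yield "assistantfinal"
        yield "Hello! How can I help you today?"

    in_analysis, in_visible = False, False
    raw_analysis, raw_visible = "", ""

    for chunk in fake_stream():
        if chunk == "analysis":            # sentinel: start of reasoning
            in_analysis, in_visible = True, False
            continue
        if chunk == "assistantfinal":      # sentinel: start of visible answer
            in_analysis, in_visible = False, True
            continue
        if in_analysis:
            raw_analysis += chunk
        else:
            raw_visible += chunk

    print("Analysis:", raw_analysis)
    print("Response:", raw_visible)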
 
gateway.py CHANGED
@@ -1,8 +1,7 @@
-import logging
-from openai import OpenAI
+import json, logging
 from typing import List, Generator, Optional
+from openai import OpenAI
 
-logging.basicConfig(level=logging.INFO)
 
 def request_generation(
     api_key: str,
@@ -12,54 +11,118 @@
     model_name: str,
     chat_history: Optional[List[dict]] = None,
     temperature: float = 0.3,
-    frequency_penalty: float = 0.0,
-    presence_penalty: float = 0.0,
     max_new_tokens: int = 1024,
+    reasoning_effort: str = "off",
     tools: Optional[List[dict]] = None,
     tool_choice: Optional[str] = None,
 ) -> Generator[str, None, None]:
     """
-    Sends a streaming chat request to an OpenAI-compatible backend using the official OpenAI client.
-    Buffers output to improve LaTeX rendering.
+    Streams Responses API events. Emits:
+      - "analysis" sentinel once, then raw reasoning deltas
+      - "assistantfinal" sentinel once, then visible output deltas
+    If no visible deltas, emits a tool-call fallback message.
     """
     client = OpenAI(api_key=api_key, base_url=api_base)
 
-    messages = [{"role": "system", "content": system_prompt}]
+    input_messages: List[dict] = []
     if chat_history:
-        messages.extend(chat_history)
-    messages.append({"role": "user", "content": message})
+        input_messages.extend(m for m in chat_history if m.get("role") != "system")
+    input_messages.append({"role": "user", "content": message})
 
     request_args = {
         "model": model_name,
-        "messages": messages,
+        "input": input_messages,
+        "instructions": system_prompt,
         "temperature": temperature,
-        "frequency_penalty": frequency_penalty,
-        "presence_penalty": presence_penalty,
-        "max_tokens": max_new_tokens,
+        "max_output_tokens": max_new_tokens,
+        "reasoning": {
+            "effort": reasoning_effort,
+            "generate_summary": "detailed",
+            "summary": "detailed",
+        },
         "stream": True,
     }
-
     if tools:
         request_args["tools"] = tools
     if tool_choice:
         request_args["tool_choice"] = tool_choice
 
-    logging.info(f"[Gateway] Request to {api_base} | Model: {model_name}")
+
+    raw_reasoning, raw_visible = [], []
 
     try:
-        stream = client.chat.completions.create(**request_args)
+        stream = client.responses.create(**request_args)
 
-        collected = ""
+        reasoning_started = False
+        reasoning_closed = False
+        saw_visible_output = False
+        last_tool_name = None
+        last_tool_args = None
         buffer = ""
 
-        for chunk in stream:
-            delta = chunk.choices[0].delta.content or ""
-            collected += delta
-            buffer += delta
+        for event in stream:
+            et = getattr(event, "type", "")
+
+            if et == "response.reasoning_text.delta":
+                if not reasoning_started:
+                    yield "analysis"
+                    reasoning_started = True
+                rdelta = getattr(event, "delta", "") or ""
+                if rdelta:
+                    raw_reasoning.append(rdelta)
+                    yield rdelta
+                continue
+
+            if et == "response.output_text.delta":
+                if reasoning_started and not reasoning_closed:
+                    yield "assistantfinal"
+                    reasoning_closed = True
+
+                saw_visible_output = True
+                delta = getattr(event, "delta", "") or ""
+                raw_visible.append(delta)
+                buffer += delta
+
+                if "\n" in buffer or len(buffer) > 150:
+                    yield buffer
+                    buffer = ""
+                continue
+
+            if et.startswith("response.tool") or et.startswith("response.function_call"):
+                name = getattr(event, "name", None)
+                args = getattr(event, "arguments", None)
+                if args is None:
+                    args = getattr(event, "args", None) or getattr(event, "delta", None) or getattr(event, "data", None)
+                if name:
+                    last_tool_name = name
+                if args is not None:
+                    last_tool_args = args
+                continue
+
+            if et in ("response.completed", "response.error"):
+                if buffer:
+                    yield buffer
+                    buffer = ""
+
+                if reasoning_started and not reasoning_closed:
+                    yield "assistantfinal"
+                    reasoning_closed = True
+
+                if not saw_visible_output:
+                    msg = "I attempted to call a tool, but tools aren't executed in this environment, so no final answer was produced."
+                    if last_tool_name:
+                        try:
+                            args_text = json.dumps(last_tool_args, ensure_ascii=False, default=str)
+                        except Exception:
+                            args_text = str(last_tool_args)
+                        msg += f"\n\n• Tool requested: **{last_tool_name}**\n• Arguments: `{args_text}`"
+                    yield msg
 
-            if "\n" in buffer or len(buffer) > 150:
-                yield buffer
-                buffer = ""
+                if et == "response.error":
+                    err = getattr(event, "error", None)
+                    emsg = getattr(err, "message", "") if err else "Unknown error"
+                    yield f"Error: {emsg}"
+                break
 
         if buffer:
             yield buffer
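
Note (not part of the commit): request_generation can also be driven from a plain script. A minimal usage sketch, assuming API_KEY, API_ENDPOINT and MODEL_NAME point at an OpenAI-compatible Responses API backend, the script sits next to gateway.py, and the prompt text is made up:

    import os
    from gateway import request_generation

    analysis, answer, mode = [], [], None
    for chunk in request_generation(
        api_key=os.getenv("API_KEY"),
        api_base=os.getenv("API_ENDPOINT"),
        message="What is the capital of France?",
        system_prompt="You are a helpful assistant.",
        model_name=os.getenv("MODEL_NAME"),
        temperature=0.7,
        reasoning_effort="low",
        max_new_tokens=256,
    ):
        if chunk in ("analysis", "assistantfinal"):
            mode = chunk  # sentinels switch between reasoning and answer sections
            continue
        (analysis if mode == "analysis" else answer).append(chunk)

    print("".join(answer))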
utils.py CHANGED
@@ -4,9 +4,6 @@
 # ----------------------------------------------------------------------
 
 LATEX_DELIMS = [
-    {"left": "$$", "right": "$$", "display": True},
-    {"left": "$", "right": "$", "display": False},
-    {"left": "\\[", "right": "\\]", "display": True},
-    {"left": "\\(", "right": "\\)", "display": False},
+    {"left": "\\[", "right": "\\]", "display": True},
+    {"left": "\\(", "right": "\\)", "display": False},
 ]
-
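
Note (not part of the commit): with the dollar-sign delimiters dropped, only the backslash forms \[...\] and \(...\) are registered with Gradio, which app.py already wires up via latex_delimiters. A minimal sketch:

    import gradio as gr
    from utils import LATEX_DELIMS

    # Only the backslash-delimited forms render as LaTeX; a plain "$" in chat text is left alone.
    chatbot = gr.Chatbot(type="messages", latex_delimiters=LATEX_DELIMS)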