Nymbo committed
Commit 542c2ac · verified · 1 Parent(s): 4c18bfc

Update app.py

Files changed (1): app.py (+227 −55)
app.py CHANGED
@@ -2,8 +2,19 @@ import gradio as gr
 from openai import OpenAI
 import os
 
-# Retrieve the access token from the environment variable
-ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")
 
 # Initialize the OpenAI client with the Hugging Face Inference API endpoint
@@ -21,104 +32,265 @@ def respond(
     temperature,
     top_p,
     frequency_penalty,
-    seed
 ):
     """
-    This function handles the chatbot response. It takes in:
-    - message: the user's new message
-    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
-    - system_message: the system prompt
-    - max_tokens: the maximum number of tokens to generate in the response
-    - temperature: sampling temperature
-    - top_p: top-p (nucleus) sampling
-    - frequency_penalty: penalize repeated tokens in the output
-    - seed: a fixed seed for reproducibility; -1 will mean 'random'
     """
 
     print(f"Received message: {message}")
     print(f"History: {history}")
     print(f"System message: {system_message}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
 
     # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
 
-    # Construct the messages array required by the API
-    messages = [{"role": "system", "content": system_message}]
 
-    # Add conversation history to the context
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
         if user_part:
             messages.append({"role": "user", "content": user_part})
-            print(f"Added user message to context: {user_part}")
         if assistant_part:
             messages.append({"role": "assistant", "content": assistant_part})
-            print(f"Added assistant message to context: {assistant_part}")
 
     # Append the latest user message
     messages.append({"role": "user", "content": message})
 
     # Start with an empty string to build the response as tokens stream in
     response = ""
-    print("Sending request to OpenAI API.")
 
     # Make the streaming request to the HF Inference API via openai-like client
     for message_chunk in client.chat.completions.create(
-        model="meta-llama/Llama-3.3-70B-Instruct",  # You can update this to your specific model
         max_tokens=max_tokens,
-        stream=True,  # Stream the response
         temperature=temperature,
         top_p=top_p,
-        frequency_penalty=frequency_penalty,  # <-- NEW
-        seed=seed,  # <-- NEW
         messages=messages,
     ):
-        # Extract the token text from the response chunk
         token_text = message_chunk.choices[0].delta.content or ""  # 'or ""' guards None deltas
-        print(f"Received token: {token_text}")
         response += token_text
         yield response
 
     print("Completed response generation.")
 
 # Create a Chatbot component with a specified height
 chatbot = gr.Chatbot(height=600)
-print("Chatbot interface created.")
-
-# Create the Gradio ChatInterface
-# We add two new sliders for Frequency Penalty and Seed
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="", label="System message"),
-        gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
-        gr.Slider(
-            minimum=-2.0,
-            maximum=2.0,
-            value=0.0,
-            step=0.1,
-            label="Frequency Penalty"
-        ),
-        gr.Slider(
-            minimum=-1,
-            maximum=65535,  # Arbitrary upper limit for demonstration
-            value=-1,
-            step=1,
-            label="Seed (-1 for random)"
-        ),
-    ],
-    fill_height=True,
-    chatbot=chatbot,
-    theme="Nymbo/Nymbo_Theme",
-)
 print("Gradio interface initialized.")
 
 if __name__ == "__main__":
-    print("Launching the demo application.")
     demo.launch()
 
 from openai import OpenAI
 import os
 
+# --------------------------------------------------------------------------------
+# Serverless-TextGen-Hub
+# This application is a Gradio-based UI for text generation using
+# Hugging Face's serverless Inference API. We also incorporate features
+# inspired by the ImgGen-Hub, such as:
+#   - A "Featured Models" accordion with text filtering.
+#   - A "Custom Model" textbox for specifying a non-featured model.
+#   - An "Information" tab with accordions for "Featured Models" and
+#     "Parameters Overview" containing helpful user guides.
+# --------------------------------------------------------------------------------
+
+# Retrieve the access token from environment variables
+ACCESS_TOKEN = os.getenv("HF_TOKEN")  # HF_TOKEN is your Hugging Face Inference API key
 print("Access token loaded.")
 
 # Initialize the OpenAI client with the Hugging Face Inference API endpoint
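The client that the context line above refers to is created outside the hunks shown, so its initialization never appears in this diff. A minimal sketch of what that setup typically looks like, assuming Hugging Face's OpenAI-compatible serverless endpoint (the exact base_url used by app.py is not visible here):

import os
from openai import OpenAI

# Assumed setup, not part of this commit's visible hunks: point the OpenAI
# client at Hugging Face's serverless Inference API and authenticate with HF_TOKEN.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",  # assumed endpoint
    api_key=os.getenv("HF_TOKEN"),
)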
 
     temperature,
     top_p,
     frequency_penalty,
+    seed,
+    # NEW inputs for model selection
+    model_search,
+    selected_model,
+    custom_model
 ):
     """
+    This function handles the chatbot response.
+
+    Parameters:
+    - message: The user's newest message (string).
+    - history: The list of previous messages in the conversation, each as a tuple (user_msg, assistant_msg).
+    - system_message: The system prompt provided.
+    - max_tokens: The maximum number of tokens to generate in the response.
+    - temperature: Sampling temperature (float).
+    - top_p: Top-p (nucleus) sampling (float).
+    - frequency_penalty: Penalize repeated tokens in the output (float).
+    - seed: A fixed seed for reproducibility; -1 means 'random'.
+    - model_search: The text used to filter the "Featured Models" Radio button list (unused here directly, but updated by the UI).
+    - selected_model: The model selected via the "Featured Models" Radio button.
+    - custom_model: If not empty, overrides selected_model with this custom path.
     """
 
+    # DEBUG LOGGING
     print(f"Received message: {message}")
     print(f"History: {history}")
     print(f"System message: {system_message}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
+    print(f"Model search text: {model_search}")
+    print(f"Selected featured model: {selected_model}")
+    print(f"Custom model (overrides if not empty): {custom_model}")
 
     # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
 
+    # Determine the final model name to use:
+    # if the custom_model textbox is non-empty, we use that;
+    # otherwise, we use the selected model from the Radio buttons.
+    if custom_model.strip():
+        model_to_use = custom_model.strip()
+    else:
+        model_to_use = selected_model
 
+    # Construct the messages array required by the OpenAI-like HF API
+    messages = [{"role": "system", "content": system_message}]  # system prompt
+    # Add conversation history to the context
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
         if user_part:
             messages.append({"role": "user", "content": user_part})
         if assistant_part:
             messages.append({"role": "assistant", "content": assistant_part})
 
     # Append the latest user message
     messages.append({"role": "user", "content": message})
 
     # Start with an empty string to build the response as tokens stream in
     response = ""
+    print(f"Using model: {model_to_use}")
+    print("Sending request to OpenAI API...")
 
     # Make the streaming request to the HF Inference API via openai-like client
+    # Below, we pass 'model_to_use' instead of a hard-coded model
     for message_chunk in client.chat.completions.create(
+        model=model_to_use,  # <-- model is now dynamically selected
         max_tokens=max_tokens,
+        stream=True,  # Stream the response
         temperature=temperature,
         top_p=top_p,
+        frequency_penalty=frequency_penalty,
+        seed=seed,
         messages=messages,
     ):
+        # Extract token text from the response chunk (may be None on some chunks)
         token_text = message_chunk.choices[0].delta.content or ""  # 'or ""' guards None deltas
         response += token_text
+        # As we get new tokens, we stream them back to the user
         yield response
 
     print("Completed response generation.")
 
 # Create a Chatbot component with a specified height
 chatbot = gr.Chatbot(height=600)
+
+# ------------------------------------------------------------
+# Below: we define the UI with additional features integrated.
+# We replicate some of the style from the ImgGen-Hub code:
+#   - A "Featured Models" accordion with the ability to filter
+#   - A "Custom Model" text box
+#   - An "Information" tab with a "Featured Models" table and
+#     a "Parameters Overview" with markdown descriptions.
+# ------------------------------------------------------------
+
+# List of placeholder "Featured Models" for demonstration
+featured_models_list = [
+    "meta-llama/Llama-3.3-70B-Instruct",
+    "meta-llama/Llama-2-70B-chat-hf",
+    "meta-llama/Llama-2-13B-chat-hf",
+    "bigscience/bloom",
+    "google/flan-t5-xxl",
+]
+
+# This function filters the models in featured_models_list based on user input
+def filter_models(search_term):
+    """
+    Filters featured_models_list based on the text in 'search_term'.
+    """
+    filtered = [m for m in featured_models_list if search_term.lower() in m.lower()]
+    return gr.update(choices=filtered)
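The filter is a plain case-insensitive substring match, and returning gr.update(choices=...) swaps the Radio's options in place rather than rebuilding the component. For example (illustrative only, not part of app.py):

# filter_models("llama-2") produces an update whose choices are
# ["meta-llama/Llama-2-70B-chat-hf", "meta-llama/Llama-2-13B-chat-hf"];
# an empty search term restores the full featured_models_list.
update = filter_models("llama-2")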
+
+print("Initializing Gradio interface...")  # Debug log
+
+# We build a custom Blocks layout to incorporate tabs and advanced UI elements
+with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
+
+    # Top-level heading for clarity
+    gr.Markdown("# Serverless-TextGen-Hub\nA Comprehensive UI for Text Generation")
+
+    with gr.Tab("Chat"):
+        # We place the ChatInterface within this tab
+
+        # Create the additional UI elements in a collapsible or visible layout
+        with gr.Accordion("Featured Models", open=False):
+            with gr.Row():
+                model_search = gr.Textbox(
+                    label="Filter Models",
+                    placeholder="Search for a featured model...",
+                    lines=1,
+                )
+            with gr.Row():
+                model_radio = gr.Radio(
+                    label="Select a featured model below",
+                    choices=featured_models_list,
+                    value="meta-llama/Llama-3.3-70B-Instruct",
+                    interactive=True,
+                )
+            # On change of model_search, we update the radio choices
+            model_search.change(
+                filter_models,
+                inputs=model_search,
+                outputs=model_radio
+            )
+
+        # Textbox for specifying a custom model that overrides the featured selection if not empty
+        custom_model = gr.Textbox(
+            label="Custom Model Path (overrides Featured Models if not empty)",
+            placeholder="e.g. meta-llama/Llama-2-13B-chat-hf",
+            lines=1
+        )
+
+        # Build the chat interface itself.
+        # We pass model_search, model_radio, and custom_model as additional inputs
+        # so that the 'respond' function can see them and decide which model to use.
+        chatbot_interface = gr.ChatInterface(
+            fn=respond,  # the function that generates the text
+            additional_inputs=[
+                gr.Textbox(
+                    value="You are a helpful AI assistant.",
+                    label="System message",
+                    lines=2
+                ),  # system_message
+                gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),  # max_tokens
+                gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),  # temperature
+                gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),  # top_p
+                gr.Slider(
+                    minimum=-2.0,
+                    maximum=2.0,
+                    value=0.0,
+                    step=0.1,
+                    label="Frequency Penalty"
+                ),  # frequency_penalty
+                gr.Slider(
+                    minimum=-1,
+                    maximum=65535,
+                    value=-1,
+                    step=1,
+                    label="Seed (-1 for random)"
+                ),  # seed
+                model_search,  # exposed to respond, though typically not edited mid-conversation
+                model_radio,
+                custom_model
+            ],
+            chatbot=chatbot,
+            title="Serverless-TextGen-Hub",
+            # fill_height ensures the chat area expands
+            fill_height=True
+        )
+
+    # A new tab for "Information" about Featured Models and Parameters
+    with gr.Tab("Information"):
+        gr.Markdown("## Learn More About the Parameters and Models")
+
+        # Accordion for "Featured Models"
+        with gr.Accordion("Featured Models (WiP)", open=False):
+            gr.HTML(
+                """
+                <p>Below is a small table of example models. In practice, you can pick from
+                thousands of available text-generation models on Hugging Face.
+                <br>
+                Use the <b>Filter Models</b> box under the <b>Featured Models</b> accordion
+                in the Chat tab to search by name, or enter a <b>Custom Model</b> path.</p>
+                <table style="width:100%; text-align:center; margin:auto;">
+                    <tr>
+                        <th>Model Name</th>
+                        <th>Is It Large?</th>
+                        <th>Notes</th>
+                    </tr>
+                    <tr>
+                        <td>meta-llama/Llama-3.3-70B-Instruct</td>
+                        <td>Yes</td>
+                        <td>Placeholder example</td>
+                    </tr>
+                    <tr>
+                        <td>meta-llama/Llama-2-13B-chat-hf</td>
+                        <td>Medium</td>
+                        <td>Placeholder example</td>
+                    </tr>
+                    <tr>
+                        <td>google/flan-t5-xxl</td>
+                        <td>Yes</td>
+                        <td>Placeholder example</td>
+                    </tr>
+                </table>
+                """
+            )
+
+        # Accordion for "Parameters Overview"
+        with gr.Accordion("Parameters Overview", open=False):
+            gr.Markdown(
+                """
+                ### Max New Tokens
+                Controls how many tokens can be generated in the response. A token is roughly a word or a piece of a word. If you need longer answers, increase this.
+
+                ### Temperature
+                A higher temperature makes the AI more 'creative' and random in its responses; a lower temperature keeps it more focused and deterministic.
+
+                ### Top-P
+                This is 'nucleus sampling.' It dictates the proportion of probability mass the model considers. At 1.0 it considers all words; lower it to focus on the most likely words.
+
+                ### Frequency Penalty
+                Penalizes repeated tokens in the output. If you see a lot of repetition, increase this slightly to reduce it.
+
+                ### Seed
+                If set to -1, the randomness is different each time. Setting a specific number ensures the same result on each run, making responses reproducible.
+
+                ### Custom Model
+                If this field is filled, it overrides the selection from Featured Models. This way, you can try out any text-generation model on the HF Hub, e.g.
+                <code>meta-llama/Llama-2-70B-chat-hf</code> or <code>bigscience/bloom</code>.
+                """
+            )
+
 print("Gradio interface initialized.")
 
+# ------------------------------------------------------------
+# Finally, we launch the app if the script is run directly.
+# ------------------------------------------------------------
 if __name__ == "__main__":
+    print("Launching the demo application...")
     demo.launch()
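One way to see what the new seed parameter buys you outside the UI. Illustrative only, not part of app.py; reuses the same client as above and assumes the backend honors seed:

# Hypothetical reproducibility check for the new `seed` parameter.
kwargs = dict(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[{"role": "user", "content": "Say hi in five words."}],
    max_tokens=16,
    seed=42,  # fixed seed instead of -1/None
)
first = client.chat.completions.create(**kwargs).choices[0].message.content
second = client.chat.completions.create(**kwargs).choices[0].message.content
print(first == second)  # expected True on backends that honor `seed`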