Nymbo committed on
Commit 880ced6 · verified · 1 Parent(s): e419588

Update app.py

Files changed (1): app.py +214 -226
app.py CHANGED
@@ -2,19 +2,8 @@ import gradio as gr
 from openai import OpenAI
 import os
 
-# --------------------------------------------------------------------------------
-# Serverless-TextGen-Hub
-# This application is a Gradio-based UI for text generation using
-# Hugging Face's serverless Inference API. We also incorporate features
-# inspired by the ImgGen-Hub, such as:
-# - A "Featured Models" accordion with text filtering.
-# - A "Custom Model" textbox for specifying a non-featured model.
-# - An "Information" tab with accordions for "Featured Models" and
-#   "Parameters Overview" containing helpful user guides.
-# --------------------------------------------------------------------------------
-
-# Retrieve the access token from environment variables
-ACCESS_TOKEN = os.getenv("HF_TOKEN")  # HF_TOKEN is your Hugging Face Inference API key
+# Retrieve the access token from the environment variable
+ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")
 
 # Initialize the OpenAI client with the Hugging Face Inference API endpoint
@@ -28,269 +17,268 @@ def respond(
     message,
     history: list[tuple[str, str]],
     system_message,
+    custom_model,
+    model,
     max_tokens,
     temperature,
     top_p,
     frequency_penalty,
-    seed,
-    # NEW inputs for model selection
-    model_search,
-    selected_model,
-    custom_model
+    seed
 ):
     """
-    This function handles the chatbot response.
-
-    Parameters:
-    - message: The user's newest message (string).
-    - history: The list of previous messages in the conversation, each as a tuple (user_msg, assistant_msg).
-    - system_message: The system prompt provided.
-    - max_tokens: The maximum number of tokens to generate in the response.
-    - temperature: Sampling temperature (float).
-    - top_p: Top-p (nucleus) sampling (float).
-    - frequency_penalty: Penalize repeated tokens in the output (float).
-    - seed: A fixed seed for reproducibility; -1 means 'random'.
-    - model_search: The text used to filter the "Featured Models" Radio button list (unused here directly, but updated by the UI).
-    - selected_model: The model selected via the "Featured Models" Radio button.
-    - custom_model: If not empty, overrides selected_model with this custom path.
+    This function handles the chatbot response. It takes in:
+    - message: the user's new message
+    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
+    - system_message: the system prompt
+    - custom_model: custom model path (if any)
+    - model: selected model from featured models
+    - max_tokens: the maximum number of tokens to generate in the response
+    - temperature: sampling temperature
+    - top_p: top-p (nucleus) sampling
+    - frequency_penalty: penalize repeated tokens in the output
+    - seed: a fixed seed for reproducibility; -1 will mean 'random'
    """
 
-    # DEBUG LOGGING
     print(f"Received message: {message}")
     print(f"History: {history}")
     print(f"System message: {system_message}")
+    print(f"Custom model: {custom_model}")
+    print(f"Selected model: {model}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
-    print(f"Model search text: {model_search}")
-    print(f"Selected featured model: {selected_model}")
-    print(f"Custom model (overrides if not empty): {custom_model}")
 
     # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
 
-    # Determine the final model name to use
-    # If the custom_model textbox is non-empty, we use that.
-    # Otherwise, we use the selected model from the Radio buttons.
-    if custom_model.strip():
-        model_to_use = custom_model.strip()
-    else:
-        model_to_use = selected_model
+    # Construct the messages array required by the API
+    messages = [{"role": "system", "content": system_message}]
 
-    # Construct the messages array required by the OpenAI-like HF API
-    messages = [{"role": "system", "content": system_message}]  # System prompt
-    # Add conversation history to context
+    # Add conversation history to the context
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
         if user_part:
             messages.append({"role": "user", "content": user_part})
+            print(f"Added user message to context: {user_part}")
         if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
+            print(f"Added assistant message to context: {assistant_part}")
 
     # Append the latest user message
     messages.append({"role": "user", "content": message})
 
     # Start with an empty string to build the response as tokens stream in
     response = ""
-    print(f"Using model: {model_to_use}")
-    print("Sending request to OpenAI API...")
+    print("Sending request to OpenAI API.")
+
+    # Determine which model to use
+    if custom_model.strip():
+        selected_model = custom_model.strip()
+    else:
+        # Map the display names to actual model paths
+        model_mapping = {
+            "Llama 2 70B": "meta-llama/Llama-2-70b-chat-hf",
+            "Mixtral 8x7B": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "Zephyr 7B": "HuggingFaceH4/zephyr-7b-beta",
+            "OpenChat 3.5": "openchat/openchat-3.5-0106",
+        }
+        selected_model = model_mapping.get(model, "meta-llama/Llama-2-70b-chat-hf")
 
     # Make the streaming request to the HF Inference API via openai-like client
-    # Below, we pass 'model_to_use' instead of a hard-coded model
     for message_chunk in client.chat.completions.create(
-        model=model_to_use,  # <-- model is now dynamically selected
+        model=selected_model,
         max_tokens=max_tokens,
-        stream=True,  # Stream the response
+        stream=True,
        temperature=temperature,
         top_p=top_p,
         frequency_penalty=frequency_penalty,
         seed=seed,
         messages=messages,
     ):
-        # Extract token text from the response chunk
+        # Extract the token text from the response chunk
        token_text = message_chunk.choices[0].delta.content
+        print(f"Received token: {token_text}")
         response += token_text
-        # As we get new tokens, we stream them back to the user
        yield response
 
     print("Completed response generation.")
 
 # Create a Chatbot component with a specified height
 chatbot = gr.Chatbot(height=600)
+print("Chatbot interface created.")
 
-# ------------------------------------------------------------
-# Below: We define the UI with additional features integrated.
-# We'll replicate some of the style from the ImgGen-Hub code:
-#  - A "Featured Models" accordion with the ability to filter
-#  - A "Custom Model" text box
-#  - An "Information" tab with "Featured Models" table and
-#    "Parameters Overview" containing markdown descriptions.
-# ------------------------------------------------------------
-
-# List of placeholder "Featured Models" for demonstration
-featured_models_list = [
-    "meta-llama/Llama-3.3-70B-Instruct",
-    "meta-llama/Llama-2-70B-chat-hf",
-    "meta-llama/Llama-2-13B-chat-hf",
-    "bigscience/bloom",
-    "google/flan-t5-xxl",
-]
-
-# This function filters the models in featured_models_list based on user input
-def filter_models(search_term):
-    """
-    Filters featured_models_list based on the text in 'search_term'.
-    """
-    filtered = [m for m in featured_models_list if search_term.lower() in m.lower()]
-    return gr.update(choices=filtered)
-
-print("Initializing Gradio interface...")  # Debug log
-
-# We build a custom Blocks layout to incorporate tabs and advanced UI elements
+# Create the Gradio interface with tabs
 with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
-
-    # Top-level heading for clarity
-    gr.Markdown("# Serverless-TextGen-Hub\nA Comprehensive UI for Text Generation")
-
-    with gr.Tab("Chat"):
-        # We'll place the ChatInterface within this tab
-
-        # Create the additional UI elements in a collapsible or visible layout
-        with gr.Accordion("Featured Models", open=False):
-            with gr.Row():
-                model_search = gr.Textbox(
-                    label="Filter Models",
-                    placeholder="Search for a featured model...",
-                    lines=1,
-                )
-            with gr.Row():
-                model_radio = gr.Radio(
-                    label="Select a featured model below",
-                    choices=featured_models_list,
-                    value="meta-llama/Llama-3.3-70B-Instruct",
-                    interactive=True,
-                )
-            # On change of model_search, we update the radio choices
-            model_search.change(
-                filter_models,
-                inputs=model_search,
-                outputs=model_radio
-            )
-
-        # Textbox for specifying a custom model that overrides the featured selection if not empty
-        custom_model = gr.Textbox(
-            label="Custom Model Path (overrides Featured Models if not empty)",
-            placeholder="e.g. meta-llama/Llama-2-13B-chat-hf",
-            lines=1
-        )
-
-        # Build the chat interface itself
-        # We'll pass "model_search", "model_radio", and "custom_model" as additional inputs
-        # so that the 'respond' function can see them and decide which model to use
-        chatbot_interface = gr.ChatInterface(
-            fn=respond,  # The function that generates the text
-            additional_inputs=[
-                gr.Textbox(
-                    value="You are a helpful AI assistant.",
+    with gr.Row():
+        with gr.Column():
+            # Basic Settings Tab
+            with gr.Tab("Settings"):
+                # System Message
+                system_message = gr.Textbox(
+                    value="",
                     label="System message",
-                    lines=2
-                ),  # system_message
-                gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),  # max_tokens
-                gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),  # temperature
-                gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),  # top_p
-                gr.Slider(
-                    minimum=-2.0,
-                    maximum=2.0,
-                    value=0.0,
-                    step=0.1,
-                    label="Frequency Penalty"
-                ),  # frequency_penalty
-                gr.Slider(
-                    minimum=-1,
-                    maximum=65535,
-                    value=-1,
-                    step=1,
-                    label="Seed (-1 for random)"
-                ),  # seed
-                model_search,  # Exposed but won't be typed into during conversation,
-                model_radio,
-                custom_model
-            ],
-            chatbot=chatbot,
-            title="Serverless-TextGen-Hub",
-            # The fill_height ensures the chat area expands
-            fill_height=True
-        )
-
-    # A new tab for "Information" about Featured Models and Parameters
-    with gr.Tab("Information"):
-        gr.Markdown("## Learn More About the Parameters and Models")
-
-        # Accordion for "Featured Models"
-        with gr.Accordion("Featured Models (WiP)", open=False):
-            gr.HTML(
-                """
-                <p>Below is a small table of example models. In practice, you can pick from
-                thousands of available text generation models on Hugging Face.
-                <br>
-                Use the <b>Filter Models</b> box under the <b>Featured Models</b> accordion
-                in the Chat tab to search by name, or enter a <b>Custom Model</b> path.</p>
-                <table style="width:100%; text-align:center; margin:auto;">
-                    <tr>
-                        <th>Model Name</th>
-                        <th>Is It Large?</th>
-                        <th>Notes</th>
-                    </tr>
-                    <tr>
-                        <td>meta-llama/Llama-3.3-70B-Instruct</td>
-                        <td>Yes</td>
-                        <td>Placeholder example</td>
-                    </tr>
-                    <tr>
-                        <td>meta-llama/Llama-2-13B-chat-hf</td>
-                        <td>Medium</td>
-                        <td>Placeholder example</td>
-                    </tr>
-                    <tr>
-                        <td>google/flan-t5-xxl</td>
-                        <td>Yes</td>
-                        <td>Placeholder example</td>
-                    </tr>
-                </table>
-                """
-            )
-
-        # Accordion for "Parameters Overview"
-        with gr.Accordion("Parameters Overview", open=False):
-            gr.Markdown(
-                """
-                ### Max New Tokens
-                Controls how many tokens can be generated in the response. A token is roughly a word or a piece of a word. If you need longer answers, increase this.
-
-                ### Temperature
-                A higher temperature makes the AI more 'creative' and random in its responses. Lower temperature keeps it more focused and deterministic.
-
-                ### Top-P
-                This is 'nucleus sampling.' It dictates the proportion of probability mass the model considers. At 1.0, it considers all words. Lower it to focus on the most likely words.
-
-                ### Frequency Penalty
-                Penalizes repeated tokens in the output. If you see a lot of repetition, increase this slightly to reduce the repetition.
-
-                ### Seed
-                If set to -1, the randomness is different each time. Setting a specific number ensures the same result each run, making responses reproducible.
-
-                ### Custom Model
-                If this field is filled, it overrides the selection from Featured Models. This way, you can try out any model on the HF Hub, e.g.
-                <code>meta-llama/Llama-2-70B-chat-hf</code> or <code>bigscience/bloom</code>.
-                """
-            )
-
-print("Gradio interface initialized.")
-
-# ------------------------------------------------------------
-# Finally, we launch the app if the script is run directly.
-# ------------------------------------------------------------
-if __name__ == "__main__":
-    print("Launching the demo application...")
-    demo.launch()
+                    placeholder="Enter a system message to guide the model's behavior"
+                )
+
+                # Model Selection Section
+                with gr.Accordion("Featured Models", open=True):
+                    # Model Search
+                    model_search = gr.Textbox(
+                        label="Filter Models",
+                        placeholder="Search for a featured model...",
+                        lines=1
+                    )
+
+                    # Featured Models List
+                    models_list = [
+                        "Llama 2 70B",
+                        "Mixtral 8x7B",
+                        "Zephyr 7B",
+                        "OpenChat 3.5"
+                    ]
+
+                    model = gr.Radio(
+                        label="Select a model",
+                        choices=models_list,
+                        value="Llama 2 70B"
+                    )
+
+                    # Custom Model Input
+                    custom_model = gr.Textbox(
+                        label="Custom Model",
+                        info="Hugging Face model path (optional)",
+                        placeholder="meta-llama/Llama-2-70b-chat-hf"
+                    )
+
+                    # Function to filter models
+                    def filter_models(search_term):
+                        filtered_models = [m for m in models_list if search_term.lower() in m.lower()]
+                        return gr.update(choices=filtered_models)
+
+                    # Update model list when search box is used
+                    model_search.change(filter_models, inputs=model_search, outputs=model)
+
+                # Generation Parameters
+                with gr.Row():
+                    max_tokens = gr.Slider(
+                        minimum=1,
+                        maximum=4096,
+                        value=512,
+                        step=1,
+                        label="Max new tokens"
+                    )
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=4.0,
+                        value=0.7,
+                        step=0.1,
+                        label="Temperature"
+                    )
+
+                with gr.Row():
+                    top_p = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=0.95,
+                        step=0.05,
+                        label="Top-P"
+                    )
+                    frequency_penalty = gr.Slider(
+                        minimum=-2.0,
+                        maximum=2.0,
+                        value=0.0,
+                        step=0.1,
+                        label="Frequency Penalty"
+                    )
+
+                with gr.Row():
+                    seed = gr.Slider(
+                        minimum=-1,
+                        maximum=65535,
+                        value=-1,
+                        step=1,
+                        label="Seed (-1 for random)"
+                    )
+
+            # Information Tab
+            with gr.Tab("Information"):
+                # Featured Models Table
+                with gr.Accordion("Featured Models", open=True):
+                    gr.HTML(
+                        """
+                        <p><a href="https://huggingface.co/models?inference=warm&pipeline_tag=text-to-text">See all available models</a></p>
+                        <table style="width:100%; text-align:center; margin:auto;">
+                            <tr>
+                                <th>Model Name</th>
+                                <th>Size</th>
+                                <th>Notes</th>
+                            </tr>
+                            <tr>
+                                <td>Llama 2 70B</td>
+                                <td>70B</td>
+                                <td>Meta's flagship model</td>
+                            </tr>
+                            <tr>
+                                <td>Mixtral 8x7B</td>
+                                <td>47B</td>
+                                <td>Mistral AI's MoE model</td>
+                            </tr>
+                            <tr>
+                                <td>Zephyr 7B</td>
+                                <td>7B</td>
+                                <td>Efficient fine-tuned model</td>
+                            </tr>
+                            <tr>
+                                <td>OpenChat 3.5</td>
+                                <td>7B</td>
+                                <td>High performance chat model</td>
+                            </tr>
+                        </table>
+                        """
+                    )
+
+                # Parameters Overview
+                with gr.Accordion("Parameters Overview", open=False):
+                    gr.Markdown(
+                        """
+                        ## System Message
+                        A message that sets the context and behavior for the model. This helps guide the model's responses.
+
+                        ## Max New Tokens
+                        Controls the maximum length of the generated response. Higher values allow for longer outputs but may take more time.
+
+                        ## Temperature
+                        Controls randomness in the output:
+                        - Lower values (0.1-0.5): More focused and deterministic
+                        - Higher values (0.7-1.0): More creative and diverse
+                        - Very high values (>1.0): More random and potentially chaotic
+
+                        ## Top-P (Nucleus Sampling)
+                        Controls the cumulative probability threshold for token selection:
+                        - Lower values: More focused on highly likely tokens
+                        - Higher values: Considers a wider range of possibilities
+
+                        ## Frequency Penalty
+                        Adjusts the likelihood of token repetition:
+                        - Negative values: May encourage repetition
+                        - Zero: Neutral
+                        - Positive values: Discourages repetition
+
+                        ## Seed
+                        A number that controls the randomness in generation:
+                        - -1: Random seed each time
+                        - Fixed value: Reproducible outputs with same parameters
+                        """
+                    )
+
+    # Set up the chat interface
+    chatbot = gr.Chatbot(height=600)
+    msg = gr.Textbox(label="Message")
+
+    clear = gr.ClearButton([msg, chatbot])
+
+    msg.submit(respond, [msg, chatbot, system_message, custom_model, model, max_tokens, temperature, top_p, frequency_penalty, seed], [chatbot, msg])
+
+print("Launching the demo application.")
+demo.launch(show_api=False, share=False)
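
For context on the change: the rewritten respond() generator streams tokens from Hugging Face's serverless Inference API through the OpenAI-compatible client. Below is a minimal standalone sketch of that pattern. The base_url and model ID are illustrative assumptions (the client = OpenAI(...) initialization sits outside this diff's context lines); any OpenAI-compatible endpoint and available chat model should behave the same way.

# Minimal sketch of the streaming pattern used by respond(), run outside Gradio.
# Assumptions: HF_TOKEN is set in the environment, and the base_url below
# matches the client initialization that this diff does not show.
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",  # assumed endpoint
    api_key=os.getenv("HF_TOKEN"),
)

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Say hello in one sentence."},
]

response = ""
for chunk in client.chat.completions.create(
    model="HuggingFaceH4/zephyr-7b-beta",  # any model from the featured list
    messages=messages,
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
    frequency_penalty=0.0,
    seed=42,  # a fixed seed makes the output reproducible; None means random
    stream=True,
):
    # Streamed deltas can be None (e.g. on the final chunk), so guard before
    # concatenating -- the committed respond() relies on every chunk carrying text.
    token_text = chunk.choices[0].delta.content or ""
    response += token_text
    print(token_text, end="", flush=True)

The `or ""` guard is worth noting when adapting this code: with stream=True, delta.content can be None, and concatenating None onto a string raises a TypeError.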