Nymbo committed on
Commit e13eb1b · verified · 1 Parent(s): cf508a7

Update app.py

Files changed (1):
  1. app.py +164 -126

app.py CHANGED
@@ -4,166 +4,204 @@ import os
 
 # Retrieve the access token from the environment variable
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
 
-# Initialize the OpenAI API client
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
 )
 
 def respond(
     message,
-    history,
     system_message,
     max_tokens,
     temperature,
     top_p,
     frequency_penalty,
-    seed
 ):
-    # Process the incoming message
     print(f"Received message: {message}")
     print(f"History: {history}")
-    print(f"System Message: {system_message}")
-    print(f"Max Tokens: {max_tokens}, Temperature: {temperature}, Top P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
 
-    # Convert seed to None if -1 (random)
     if seed == -1:
         seed = None
 
-    # Construct the messages list for the API
     messages = [{"role": "system", "content": system_message}]
 
     # Add conversation history to the context
-    for user_message, assistant_message in history:
-        if user_message:
-            messages.append({"role": "user", "content": user_message})
-            print(f"Added user message: {user_message}")
-        if assistant_message:
-            messages.append({"role": "assistant", "content": assistant_message})
-            print(f"Added assistant message: {assistant_message}")
-
-    # Append the latest message
     messages.append({"role": "user", "content": message})
 
-    # Initialize response
     response = ""
 
-    # Make the API request
-    for chunk in client.chat.completions.create(
-        model="meta-llama/Llama-3.3-70B-Instruct",
-        messages=messages,
         max_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
-        frequency_penalty=frequency_penalty,
-        seed=seed,
-        stream=True,
     ):
         # Extract the token text from the response chunk
-        token = chunk.choices[0].message.content
-        response += token
         yield response
 
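One fix worth calling out in the removed loop above: with stream=True, each streamed chunk exposes its incremental text on choices[0].delta.content; streamed chunks carry a delta rather than a full message, so token = chunk.choices[0].message.content raises an AttributeError on the first chunk. The rewritten version below switches to the delta field:

    # streamed chunks carry incremental text in `delta`, not `message`
    token_text = message_chunk.choices[0].delta.content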
-# Create the Gradio Chatbot component
-chatbot = gr.Chatbot(height=600)
-
-# Define the Gradio ChatInterface
-demo = gr.ChatInterface(
-    chatbot=chatbot,
-    fn=respond,
-    inputs=[
-        gr.Textbox(lines=1, placeholder="Enter your message..."),
-        gr.Chatbot(label="Conversation History"),
-        gr.Textbox(label="System Message"),
-        gr.Slider(minimum=10, maximum=200, step=1, label="Max Tokens"),
-        gr.Slider(minimum=0, maximum=2, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0, maximum=1, step=0.05, label="Top P"),
-        gr.Slider(minimum=-2, maximum=2, step=0.1, label="Frequency Penalty"),
-        gr.Slider(minimum=-1, maximum=1000000, step=1, label="Seed (-1 for random)"),
-    ],
-    theme="Nymbo/Nymbo_Theme",
-)
-
-# Create the "Featured Models" accordion
-with gr.Accordion("Featured Models", open=True) as featured_models:
-    # Textbox for searching models
-    model_search = gr.Textbox(label="Filter Models")
-    # List of featured models
-    models = [
-        "meta-llama/Llama-3.3-70B-Instruct",
-        "meta-llama/Llama-2-70B-Chat-hf",
-        "TheBloke/Llama-2-13B-Chat-GGML",
-        "TheBloke/Llama-2-70B-Chat-GGML",
-        "TheBloke/Llama-2-13B-Chat-GGML-v2",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2",
-        "TheBloke/Llama-2-70B-Chat-HF-API-compatible-GGML",
-        "TheBloke/Llama-2-70b-chat-hf",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-7-13B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
-        # Add more models as needed...
-    ]
-    # Radio buttons for selecting a model
-    model_radio = gr.Radio(choices=models, label="Select a Model")
-
-    # Update the model list based on search input
-    def filter_models(search_term):
-        filtered_models = [model for model in models if search_term.lower() in model.lower()]
-        return gr.update(choices=filtered_models)
-
-    # Update the model list when the search box is used
-    model_search.change(filter_models, inputs=model_search, outputs=model_radio)
-
-    # Create a "Custom Model" textbox
-    custom_model = gr.Textbox(label="Custom Model", placeholder="Hugging Face model path")
-
-# Create the "Information" tab
-with gr.Tab("Information"):
-    # Featured Models accordion
-    with gr.Accordion("Featured Models", open=False):
-        gr.Markdown(
-            """
-            # Featured Models
-
-            Here's a list of some popular models available on Hugging Face:
-
-            - meta-llama/Llama-3.3-70B-Instruct
-            - meta-llama/Llama-2-70B-Chat-hf
-            - TheBloke/Llama-2-13B-Chat-GGML
-            - TheBloke/Llama-2-70B-Chat-GGML
-            - TheBloke/Llama-2-13B-Chat-GGML-v2
-            - TheBloke/Llama-2-70B-Chat-GGML-v2
-            - ... (and many more)
-
-            You can search and select a model from the list above, or use your own custom model path.
-            """
-        )
-
-    # Parameters Overview accordion
-    with gr.Accordion("Parameters Overview", open=False):
-        gr.Markdown(
-            """
-            # Parameters Overview
-
-            Here's a brief explanation of the parameters you can adjust:
-
-            - **Max Tokens**: The maximum number of tokens to generate in the response.
-            - **Temperature**: Controls the randomness of the output. Higher values make the output more random.
-            - **Top P**: Also known as nucleus sampling, it filters the least probable tokens, encouraging the model to be more creative.
-            - **Frequency Penalty**: Penalizes repeated tokens to avoid repetition.
-            - **Seed**: A fixed seed for reproducibility. Use -1 for a random seed.
-
-            Feel free to experiment with these settings to achieve the desired output.
-            """
         )
-
-# Launch the Gradio interface
-demo.launch(share=True)
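A note on why the interface block above was replaced rather than patched: gr.ChatInterface does not take an inputs= list the way gr.Interface does; extra controls are supplied through additional_inputs, roughly as in this minimal sketch (an illustration, not part of the commit):

    demo = gr.ChatInterface(
        fn=respond,
        chatbot=gr.Chatbot(height=600),
        additional_inputs=[
            gr.Textbox(label="System Message"),
            gr.Slider(minimum=10, maximum=200, step=1, label="Max Tokens"),
        ],
        theme="Nymbo/Nymbo_Theme",
    )

The commit instead rebuilds the UI with gr.Blocks, which gives full control over tabs and event wiring.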
 
 # Retrieve the access token from the environment variable
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
+print("Access token loaded.")
 
+# Initialize the OpenAI client with the Hugging Face Inference API endpoint
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
 )
+print("OpenAI client initialized.")
 
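Note that os.getenv returns None when HF_TOKEN is unset, so "Access token loaded." prints even when no token was found, and the failure only surfaces on the first API call. A stricter variant (a suggestion, not in the commit) would fail fast:

    if ACCESS_TOKEN is None:
        raise RuntimeError("HF_TOKEN environment variable is not set")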
 def respond(
     message,
+    history: list[tuple[str, str]],
     system_message,
     max_tokens,
     temperature,
     top_p,
     frequency_penalty,
+    seed,
+    selected_model,
 ):
+    """
+    This function handles the chatbot response. It takes in:
+    - message: the user's new message
+    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
+    - system_message: the system prompt
+    - max_tokens: the maximum number of tokens to generate in the response
+    - temperature: sampling temperature
+    - top_p: top-p (nucleus) sampling
+    - frequency_penalty: penalize repeated tokens in the output
+    - seed: a fixed seed for reproducibility; -1 means 'random'
+    - selected_model: the model to use for generating the response
+    """
+
     print(f"Received message: {message}")
     print(f"History: {history}")
+    print(f"System message: {system_message}")
+    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
+    print(f"Selected model: {selected_model}")
 
+    # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
 
+    # Construct the messages array required by the API
     messages = [{"role": "system", "content": system_message}]
 
     # Add conversation history to the context
+    for val in history:
+        user_part = val[0]
+        assistant_part = val[1]
+        if user_part:
+            messages.append({"role": "user", "content": user_part})
+            print(f"Added user message to context: {user_part}")
+        if assistant_part:
+            messages.append({"role": "assistant", "content": assistant_part})
+            print(f"Added assistant message to context: {assistant_part}")
+
+    # Append the latest user message
     messages.append({"role": "user", "content": message})
 
+    # Start with an empty string to build the response as tokens stream in
     response = ""
+    print("Sending request to OpenAI API.")
 
+    # Make the streaming request to the HF Inference API via the OpenAI-compatible client
+    for message_chunk in client.chat.completions.create(
+        model=selected_model,  # Use the selected model
         max_tokens=max_tokens,
+        stream=True,  # Stream the response
         temperature=temperature,
         top_p=top_p,
+        frequency_penalty=frequency_penalty,  # <-- NEW
+        seed=seed,  # <-- NEW
+        messages=messages,
     ):
         # Extract the token text from the response chunk
+        token_text = message_chunk.choices[0].delta.content
+        print(f"Received token: {token_text}")
+        response += token_text
         yield response
 
+    print("Completed response generation.")
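Two practical notes on the streaming loop above. First, delta.content can be None on some chunks (for example, the role-announcing first chunk or the final stop chunk), in which case response += token_text raises a TypeError; a defensive variant (a suggestion, not in the commit) skips empty deltas:

    token_text = message_chunk.choices[0].delta.content
    if token_text is not None:  # skip role-only and stop chunks
        response += token_text
        yield response

Second, because respond is a generator of cumulative strings, it can be exercised without any UI; each iteration prints a longer prefix of the reply:

    for partial in respond("Hello!", [], "You are a helpful assistant.", 64,
                           0.7, 0.95, 0.0, -1, "meta-llama/Llama-3.3-70B-Instruct"):
        print(partial)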
 
+# Create a Chatbot component with a specified height
+chatbot = gr.Chatbot(height=600)
+print("Chatbot interface created.")
+
+# Define the list of featured models
+featured_models = [
+    "meta-llama/Llama-3.3-70B-Instruct",
+    "google/flan-t5-xl",
+    "facebook/bart-large-cnn",
+    "EleutherAI/gpt-neo-2.7B",
+    # Add more featured models here
+]
+
+# Create the Gradio Blocks interface
+with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
+    # Tab for model selection
+    with gr.Tab("Models"):
+        with gr.Row():
+            with gr.Column():
+                with gr.Accordion("Featured Models", open=True):
+                    model_search = gr.Textbox(label="Filter Models", placeholder="Search for a featured model...", lines=1)
+                    model = gr.Dropdown(label="Select a model below", choices=featured_models, value="meta-llama/Llama-3.3-70B-Instruct", interactive=True)
+
+                    def filter_models(search_term):
+                        filtered_models = [m for m in featured_models if search_term.lower() in m.lower()]
+                        return gr.update(choices=filtered_models)
+
+                    model_search.change(filter_models, inputs=model_search, outputs=model)
+
+                custom_model = gr.Textbox(label="Custom Model", placeholder="Enter a custom model ID here", interactive=True)
+
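filter_models is a plain case-insensitive substring match over the featured list, so with the four models defined above:

    filter_models("flan")  # gr.update(choices=["google/flan-t5-xl"])
    filter_models("")      # gr.update(choices=featured_models), every name matches the empty string

Note that custom_model is collected here but never wired into the chat handlers below, which receive only model, so a custom ID currently has no effect.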
+    # Tab for chat interface
+    with gr.Tab("Chat"):
+        with gr.Row():
+            with gr.Column():
+                txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
+
+        # Additional parameters
+        with gr.Row():
+            with gr.Column():
+                system_message = gr.Textbox(label="System Message", value="", lines=3)
+                max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max New Tokens")
+                temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
+                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
+                frequency_penalty = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
+                seed = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")
+
+        # Chatbot display
+        chatbot = gr.Chatbot(height=600)
+
+        # Submit button
+        submit_btn = gr.Button("Submit")
+
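Two caveats in the Chat tab above. The chained .style(container=False) call was deprecated during Gradio 3.x and removed in Gradio 4, where styling options moved into component constructors; on Gradio 4 the equivalent line would be roughly:

    txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter", container=False)

Also, this gr.Chatbot(height=600) rebinds the chatbot name, and the module-level Chatbot created before the Blocks context is never rendered, so that earlier component and its log line are effectively dead code.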
+    # Tab for information
+    with gr.Tab("Information"):
+        with gr.Row():
+            gr.Markdown(
+                """
+                # Featured Models
+
+                - **meta-llama/Llama-3.3-70B-Instruct**: A large language model from Meta.
+                - **google/flan-t5-xl**: A pretrained encoder-decoder model from Google.
+                - **facebook/bart-large-cnn**: A pretrained sequence-to-sequence model from Facebook.
+                - **EleutherAI/gpt-neo-2.7B**: A large autoregressive language model from EleutherAI.
+
+                # Parameters Overview
+
+                - **System Message**: Sets the behavior and context for the assistant.
+                - **Max New Tokens**: Limits the length of the generated response.
+                - **Temperature**: Controls the randomness of the output. Higher values make the output more random.
+                - **Top-P**: Controls the diversity of the text by selecting tokens that account for the top-p probability mass.
+                - **Frequency Penalty**: Decreases the model's likelihood of repeating the same lines.
+                - **Seed**: Ensures reproducibility of results; set to -1 for a random seed.
+                """
+            )
+
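As a concrete illustration of the Seed parameter (a sketch that assumes the serving backend honors seed, which not every provider guarantees): two calls with identical inputs and the same fixed seed should produce identical text:

    *_, out1 = respond("Tell me a joke.", [], "", 64, 0.7, 0.95, 0.0, 42, "meta-llama/Llama-3.3-70B-Instruct")
    *_, out2 = respond("Tell me a joke.", [], "", 64, 0.7, 0.95, 0.0, 42, "meta-llama/Llama-3.3-70B-Instruct")
    assert out1 == out2  # holds only if the backend applies the seed deterministically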
+    # Function to handle chat submission
+    def user(user_message, history):
+        return "", history + [[user_message, None]]
+
+    # Function to process the chat
+    def bot(history, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, selected_model):
+        # Get the last user message
+        user_message = history[-1][0]
+        # Generate response
+        response_iter = respond(
+            user_message,
+            history[:-1],  # Exclude the last user message, which doesn't have a response yet
+            system_message,
+            max_tokens,
+            temperature,
+            top_p,
+            frequency_penalty,
+            seed,
+            selected_model,
         )
+        # Collect the entire response
+        full_response = ""
+        for resp in response_iter:
+            full_response = resp
+        # Update history with the bot's response
+        history[-1][1] = full_response
+        return history
+
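Because bot drains the respond generator completely before returning, the browser sees a single update at the end; the token-by-token streaming inside respond never reaches the UI. A streaming variant (a sketch, not in the commit) would yield intermediate histories instead, which Gradio renders incrementally when the event runs through its queue:

    def bot(history, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, selected_model):
        user_message = history[-1][0]
        for resp in respond(user_message, history[:-1], system_message, max_tokens,
                            temperature, top_p, frequency_penalty, seed, selected_model):
            history[-1][1] = resp  # update the pending assistant slot as tokens arrive
            yield history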
+    # Set up the chat flow
+    txt.submit(user, [txt, chatbot], [txt, chatbot], queue=False).then(
+        bot, [chatbot, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, model], chatbot
+    )
+    submit_btn.click(user, [txt, chatbot], [txt, chatbot], queue=False).then(
+        bot, [chatbot, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, model], chatbot
+    )
+
+print("Gradio interface initialized.")
+
+if __name__ == "__main__":
+    print("Launching the demo application.")
+    demo.launch()