import gradio as gr
import os
from openai import OpenAI

################################################
# INITIAL SETUP
################################################

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")
# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")
# Our main response-generating function
def respond(
    user_message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,
    custom_model
):
    """
    This function handles the chatbot response. It takes in:
    - user_message: the user's new message
    - history: the list of previous messages, each as [user_text, assistant_text]
    - system_message: the system prompt
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
    - featured_model: the model chosen from the Featured Models radio buttons
    - custom_model: a user-specified custom model that overrides featured_model if not empty
    """
    print(f"New user message: {user_message}")
    print(f"History so far: {history}")
    print(f"System message: {system_message}")
    print(f"max_tokens: {max_tokens}, temperature: {temperature}, top_p: {top_p}")
    print(f"frequency_penalty: {frequency_penalty}, seed: {seed}")
    print(f"Featured Model: {featured_model}")
    print(f"Custom Model: {custom_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Determine which model to use:
    # if the user typed something in custom_model, that overrides the featured model;
    # otherwise we use the model selected in the radio; if neither, default to "meta-llama/Llama-3.3-70B-Instruct".
    if custom_model.strip():
        model_to_use = custom_model.strip()
    elif featured_model is not None and featured_model.strip():
        model_to_use = featured_model.strip()
    else:
        model_to_use = "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")
    # Construct the conversation messages for the HF Inference API
    messages = [{"role": "system", "content": system_message}]
    for user_text, assistant_text in history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": user_message})

    # We'll collect and stream the response
    response_so_far = ""

    # Make the streaming request to the HF Inference API
    print("Sending request to the Hugging Face Inference API...")
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # The content of the partial chunk; it can be None (e.g. for the final chunk),
        # so fall back to an empty string before appending
        token_text = message_chunk.choices[0].delta.content or ""
        response_so_far += token_text
        # Yield the partial response so Gradio can display it in real time
        yield response_so_far

    print("Completed response generation.")
################################################
# GRADIO UI + STATE MANAGEMENT
################################################

def user_submit(user_message, history):
    """
    This function is called when the user sends a message.
    We simply add the user message to the conversation history.
    """
    print("user_submit triggered.")
    # Append the new user message to history
    if not history:
        history = []
    history = history + [[user_message, None]]
    return history, ""
def bot_reply(history, system_message, max_tokens, temperature, top_p,
              frequency_penalty, seed, featured_model, custom_model):
    """
    This function is triggered to produce the bot's response after the user has submitted.
    We call 'respond' for streaming text.
    """
    print("bot_reply triggered.")
    # The last conversation item holds [user_message, None]
    user_message = history[-1][0]
    # We will stream the partial responses from 'respond'
    bot_response = respond(
        user_message=user_message,
        history=history[:-1],  # all items except the last, because we pass the last user msg separately
        system_message=system_message,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        featured_model=featured_model,
        custom_model=custom_model
    )
    # As we yield from the generator, we update the last item in history with the partial response.
    # Gradio streaming logic: yield the partial updates as they come in
    for partial_text in bot_response:
        history[-1][1] = partial_text
        yield history
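# Yielding the full history works because gr.Chatbot (in its default pairs/'tuples'
# format) renders a list of [user, assistant] entries, which is exactly what 'history'
# holds; each yield therefore refreshes the chat display with the latest partial reply.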
# We define a small list of placeholder featured models for demonstration
models_list = [
    "meta-llama/Llama-2-13B-Chat-hf",
    "bigscience/bloom",
    "EleutherAI/gpt-neo-2.7B",
    "meta-llama/Llama-3.3-70B-Instruct"
]

def filter_models(search_term):
    """
    Filter function triggered when the user types in the model_search box.
    Returns an updated list of models that contain the search term.
    """
    filtered = [m for m in models_list if search_term.lower() in m.lower()]
    return gr.update(choices=filtered)
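# Note: this callback only narrows the visible choices; it does not change the currently
# selected value, so the previously selected model may no longer appear in the filtered list.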
################################################
# BUILDING THE GRADIO LAYOUT
################################################

with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.Markdown(
        """
        # Serverless-TextGen-Hub
        **A UI for text generation using Hugging Face's Inference API.**

        Below is a simple chat interface. You can pick from **Featured Models** or specify a **Custom Model**
        to override the choice. If you're not sure, just use the default.
        """
    )

    # State to hold the conversation history, as a list of [user, bot] pairs
    conversation_state = gr.State([])

    # Accordion for the system message + advanced settings
    with gr.Accordion("Advanced Settings", open=False):
        system_message = gr.Textbox(
            label="System Message",
            value="You are a helpful assistant.",
            lines=2,
            info="Provides background or personality instructions to the model."
        )
        max_tokens = gr.Slider(
            minimum=1,
            maximum=4096,
            value=512,
            step=1,
            label="Max new tokens"
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-P"
        )
        frequency_penalty = gr.Slider(
            minimum=-2.0,
            maximum=2.0,
            value=0.0,
            step=0.1,
            label="Frequency Penalty"
        )
        seed = gr.Slider(
            minimum=-1,
            maximum=65535,
            value=-1,
            step=1,
            label="Seed (-1 for random)"
        )

    # Featured Models + filtering
    with gr.Accordion("Featured Models", open=False):
        model_search = gr.Textbox(
            label="Filter Models",
            placeholder="Search for a featured model...",
            lines=1
        )
        featured_model_radio = gr.Radio(
            label="Select a featured model below",
            choices=models_list,
            value=models_list[0],  # default selection
            interactive=True
        )
        model_search.change(
            filter_models,
            inputs=model_search,
            outputs=featured_model_radio
        )

    # This is the Custom Model box (overrides Featured Models if not empty)
    custom_model = gr.Textbox(
        label="Custom Model",
        value="",
        info="(Optional) Provide a custom HF model path. If not empty, it overrides the Featured Model."
    )
    # The main Chatbot interface
    chatbot = gr.Chatbot(height=600)

    # Textbox for the user to type a new message
    with gr.Row():
        user_input = gr.Textbox(
            show_label=False,
            placeholder="Type your message here (press Enter or click 'Submit')",
            lines=2
        )
        submit_btn = gr.Button("Submit", variant="primary")
    # When the user submits, we first update the conversation state, then the bot replies,
    # streaming its output. Chaining with .then() ensures the new user message is already
    # stored in conversation_state before bot_reply reads it.
    submit_btn.click(
        fn=user_submit,
        inputs=[user_input, conversation_state],
        outputs=[conversation_state, user_input],
    ).then(
        fn=bot_reply,
        inputs=[
            conversation_state,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            featured_model_radio,
            custom_model
        ],
        outputs=[chatbot],
        # 'bot_reply' is a generator, so the request goes through the queue
        # so that partial updates can be streamed to the Chatbot
        queue=True
    )
    # We also allow pressing Enter in user_input to do the same thing
    user_input.submit(
        fn=user_submit,
        inputs=[user_input, conversation_state],
        outputs=[conversation_state, user_input],
    ).then(
        fn=bot_reply,
        inputs=[
            conversation_state,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            featured_model_radio,
            custom_model
        ],
        outputs=[chatbot],
        queue=True
    )
    gr.HTML("""
    <br>
    <p style='text-align:center;'>
        Developed by <strong>Nymbo</strong>.
        Powered by the <strong>Hugging Face Inference API</strong>.
    </p>
    """)
# Finally, launch the app
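# Note (an added remark, not in the original): on older Gradio 3.x releases, streaming
# generator outputs require enabling the queue explicitly (e.g. `demo.queue()` before
# `launch()`); on Gradio 4.x and later the queue is enabled by default.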
if __name__ == "__main__":
    print("Launching the Serverless-TextGen-Hub application...")
    demo.launch()