# Page-header residue from the web view this file was copied from, kept as a comment:
# author "Nymbo", commit 7de1759 (verified), message "Update app.py", size 10.7 kB.
import gradio as gr
from openai import OpenAI
import os
# Retrieve the access token from the environment variable.
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    # os.getenv returns None when the variable is missing; report that instead
    # of claiming the token was loaded, so a misconfigured deployment is visible
    # in the startup log.
    print("Warning: HF_TOKEN environment variable is not set.")

# Initialize the OpenAI client against the Hugging Face Inference API endpoint.
# The client object is shared by every call to respond().
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")
def respond(
    user_message,
    chat_history,
    system_msg,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,
    custom_model
):
    """
    Stream the assistant's reply for one chat turn.

    This is a generator: each yielded value is the full reply text accumulated
    so far, so the caller can redraw the partial answer as it grows.

    Parameters:
    - user_message: the user's newly typed message
    - chat_history: the list of (user, assistant) message pairs; either side of
      a pair may be empty/None and is then skipped
    - system_msg: the system instruction or system-level context (may be empty or None)
    - max_tokens: the maximum number of tokens to generate
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
    - featured_model: the chosen model name from 'Featured Models' radio
    - custom_model: the optional custom model that overrides the featured one if provided

    Yields:
    - str: the reply text generated so far. If the request fails, a final value
      is yielded consisting of the text so far followed by an "[ERROR] ..." line.
    """
    print(f"Received user message: {user_message}")
    print(f"System message: {system_msg}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}, Freq-Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Featured model: {featured_model}")
    print(f"Custom model: {custom_model}")

    # Convert the seed to None if user set it to -1 (meaning random)
    if seed == -1:
        seed = None

    # Decide which model to actually use: a non-empty custom model wins, then
    # the featured model, then a hard-coded default. None is treated like an
    # empty string so a missing value cannot crash the call before the request.
    custom_name = (custom_model or "").strip()
    featured_name = (featured_model or "").strip()
    model_to_use = custom_name or featured_name or "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")

    # Construct the conversation in the chat-completions message format
    messages = []
    system_text = (system_msg or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})

    # Add the conversation history, skipping empty halves of a turn
    for user_text, assistant_text in chat_history or []:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})

    # Add the new user message to the conversation
    messages.append({"role": "user", "content": user_message})

    # We'll build the response piece-by-piece in a streaming loop
    response_so_far = ""
    print("Sending request to the Hugging Face Inference API...")

    try:
        for resp_chunk in client.chat.completions.create(
            model=model_to_use,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            seed=seed,
            messages=messages,
        ):
            # A streamed chunk can arrive without choices or with
            # delta.content set to None (for example a role-only or finishing
            # chunk). Concatenating None would raise TypeError and turn a
            # successful reply into an "[ERROR]" message, so skip those chunks.
            if not resp_chunk.choices:
                continue
            token_text = resp_chunk.choices[0].delta.content
            if not token_text:
                continue
            response_so_far += token_text
            # Yield the updated message to display partial progress in the chatbot
            yield response_so_far
    except Exception as e:
        # Boundary handler: surface any failure in the chat instead of
        # crashing the UI callback.
        error_text = f"[ERROR] {str(e)}"
        print(error_text)
        yield response_so_far + "\n\n" + error_text

    print("Completed response generation.")
#
# BUILDING THE GRADIO INTERFACE BELOW
#
# Featured models offered in the "Featured Models" radio group.
# The first entry is used as the radio's default selection, and filter_models()
# searches this list when the user types in the filter box.
# The entries are placeholders: adjust or replace them with real
# text-generation models.
models_list = [
"meta-llama/Llama-3.3-70B-Instruct",
"meta-llama/Llama-2-13B-chat-hf",
"bigscience/bloom",
"openlm-research/open_llama_7b",
"facebook/opt-6.7b",
"google/flan-t5-xxl",
]
def filter_models(search_term):
    """Return a Radio update listing the featured models that match *search_term*.

    Matching is a case-insensitive substring test against each entry of
    ``models_list``. Surrounding whitespace in the search term is ignored, and
    an empty or None term matches every model.

    Returns:
        The value of ``gr.update(choices=...)`` carrying the filtered list.
    """
    # Normalise once, outside the comprehension; a stray leading/trailing space
    # would otherwise make every model fail to match.
    needle = (search_term or "").strip().lower()
    filtered = [m for m in models_list if needle in m.lower()]
    return gr.update(choices=filtered)
# Build the whole UI: settings tabs, the chat area, and the event wiring.
with gr.Blocks(theme="Nymbo/Nymbo_Theme_5") as demo:
    gr.Markdown("# Serverless-TextGen-Hub (Enhanced)")
    gr.Markdown("**A comprehensive UI for text generation with a featured-models dropdown and a custom override**.")

    # We keep track of the conversation in a Gradio state variable
    # (list of (user, assistant) tuples; assistant is None until answered)
    chat_history = gr.State([])

    # Tabs for organization
    with gr.Tab("Basic Settings"):
        with gr.Row():
            with gr.Column(elem_id="prompt-container"):
                # System Message
                system_msg = gr.Textbox(
                    label="System message",
                    placeholder="Enter system-level instructions or context here.",
                    lines=2
                )
                # Accordion for featured models
                with gr.Accordion("Featured Models", open=True):
                    model_search = gr.Textbox(
                        label="Filter Models",
                        placeholder="Search for a featured model...",
                        lines=1
                    )
                    # The radio that lists our featured models
                    model_radio = gr.Radio(
                        label="Select a featured model below",
                        choices=models_list,
                        value=models_list[0],  # default
                        interactive=True
                    )
                    # Link the search box to update the model_radio choices
                    model_search.change(filter_models, inputs=model_search, outputs=model_radio)
                # Custom Model
                custom_model_box = gr.Textbox(
                    label="Custom Model (Optional)",
                    info="If provided, overrides the featured model above. e.g. 'meta-llama/Llama-3.3-70B-Instruct'",
                    placeholder="Your huggingface.co/username/model_name path"
                )

    with gr.Tab("Advanced Settings"):
        with gr.Row():
            max_tokens_slider = gr.Slider(
                minimum=1,
                maximum=4096,
                value=512,
                step=1,
                label="Max new tokens"
            )
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=4.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
            top_p_slider = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-P"
            )
        with gr.Row():
            freq_penalty_slider = gr.Slider(
                minimum=-2.0,
                maximum=2.0,
                value=0.0,
                step=0.1,
                label="Frequency Penalty"
            )
            seed_slider = gr.Slider(
                minimum=-1,
                maximum=65535,
                value=-1,
                step=1,
                label="Seed (-1 for random)"
            )

    # Chat interface area: user input -> assistant output
    with gr.Row():
        chatbot = gr.Chatbot(
            label="TextGen Chat",
            height=500
        )

    # The user types a message here
    user_input = gr.Textbox(
        label="Your message",
        placeholder="Type your text prompt here..."
    )
    # "Send" button triggers user_submission() and then bot_response()
    send_button = gr.Button("Send")
    # A Clear Chat button to reset the conversation
    clear_button = gr.Button("Clear Chat")

    def user_submission(user_text, history):
        """
        First step of a send: record the user's message in the conversation.

        Returns the updated history (unchanged when the message is blank) and
        an empty string that clears the input box.
        """
        if (user_text or "").strip() == "":
            return history, ""
        # Build a new list rather than mutating the state value in place
        history = history + [(user_text, None)]
        return history, ""

    def bot_response(
        history,
        system_msg,
        max_tokens,
        temperature,
        top_p,
        freq_penalty,
        seed,
        featured_model,
        custom_model
    ):
        """
        Second step of a send: stream the assistant's reply to the last message.

        This is a generator. Every yield is a pair (chatbot value, state value)
        holding the same conversation list, so the visible chat and the stored
        history stay in sync and the reply is remembered for the next turn.
        """
        # Nothing to answer when the conversation is empty, or when the last
        # turn already has a reply (a blank submit adds no new user message).
        if not history or history[-1][1] is not None:
            yield history, history
            return

        # The pending user message is the first element of the last pair
        user_message = history[-1][0]
        earlier_turns = history[:-1]

        # Show the user's message immediately, before the first token arrives
        yield history, history

        # We pass everything to the respond() generator
        bot_stream = respond(
            user_message=user_message,
            chat_history=earlier_turns,  # all except the newly appended user message
            system_msg=system_msg,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=freq_penalty,
            seed=seed,
            featured_model=featured_model,
            custom_model=custom_model
        )
        for partial_text in bot_stream:
            # Replace the last pair with the user's message plus the text so far
            updated_history = earlier_turns + [(user_message, partial_text)]
            yield updated_history, updated_history

    # Chain the two steps with .then() so bot_response only starts after
    # user_submission has stored the new message. Two independent click
    # listeners give no ordering guarantee, so bot_response could otherwise
    # read the history before the message was appended.
    send_button.click(
        fn=user_submission,
        inputs=[user_input, chat_history],
        outputs=[chat_history, user_input]
    ).then(
        fn=bot_response,
        inputs=[
            chat_history,
            system_msg,
            max_tokens_slider,
            temperature_slider,
            top_p_slider,
            freq_penalty_slider,
            seed_slider,
            model_radio,
            custom_model_box
        ],
        # Write the reply back into the state as well as the visible chat
        outputs=[chatbot, chat_history]
    )

    def clear_chat():
        """Reset the stored history, the visible chat, and the input box."""
        return [], [], ""

    clear_button.click(
        fn=clear_chat,
        inputs=[],
        outputs=[chat_history, chatbot, user_input]
    )
# Script entry point: announce startup, then start the Gradio app.
if __name__ == "__main__":
    startup_banner = "Launching the Serverless-TextGen-Hub with Featured Models & Custom Model override."
    print(startup_banner)
    demo.launch()