import gradio as gr
from openai import OpenAI
import os

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")


def respond(
    user_message,
    chat_history,
    system_msg,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,
    custom_model
):
    """
    Handle the chatbot response. Arguments:
    - user_message: the user's newly typed message
    - chat_history: the list of (user, assistant) message pairs
    - system_msg: the system instruction or system-level context
    - max_tokens: the maximum number of tokens to generate
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
    - featured_model: the model name chosen in the 'Featured Models' radio
    - custom_model: an optional custom model that overrides the featured one if provided
    """
    print(f"Received user message: {user_message}")
    print(f"System message: {system_msg}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}, "
          f"Freq-Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Featured model: {featured_model}")
    print(f"Custom model: {custom_model}")

    # Convert the seed to None if the user set it to -1 (meaning random)
    if seed == -1:
        seed = None

    # Decide which model to actually use:
    # if custom_model is non-empty, it overrides the chosen featured_model
    model_to_use = custom_model.strip() if custom_model.strip() != "" else featured_model

    # Provide a default fallback if for some reason both are empty
    if model_to_use.strip() == "":
        model_to_use = "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")

    # Construct the conversation history in the format required by the chat completions API
    messages = []
    if system_msg.strip():
        messages.append({"role": "system", "content": system_msg.strip()})

    # Add the conversation history
    for user_text, assistant_text in chat_history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})

    # Add the new user message to the conversation
    messages.append({"role": "user", "content": user_message})

    # Build the response token-by-token in a streaming loop
    response_so_far = ""
    print("Sending request to the Hugging Face Inference API...")

    # Make the streaming request to the HF Inference API
    try:
        for resp_chunk in client.chat.completions.create(
            model=model_to_use,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            seed=seed,
            messages=messages,
        ):
            # delta.content can be None (e.g. in the final chunk), so guard against it
            token_text = resp_chunk.choices[0].delta.content or ""
            response_so_far += token_text
            # Yield the updated message to display partial progress in the chatbot
            yield response_so_far
    except Exception as e:
        # If there's an error, at least surface it in the chat
        error_text = f"[ERROR] {str(e)}"
        print(error_text)
        yield response_so_far + "\n\n" + error_text

    print("Completed response generation.")


#
# BUILDING THE GRADIO INTERFACE BELOW
#

# List of featured models; adjust or replace these placeholders with real text-generation models
models_list = [
    "meta-llama/Llama-3.3-70B-Instruct",
    "meta-llama/Llama-2-13B-chat-hf",
    "bigscience/bloom",
"openlm-research/open_llama_7b", "facebook/opt-6.7b", "google/flan-t5-xxl", ] def filter_models(search_term): """Filters the models_list by the given search_term and returns an update for the Radio component.""" filtered = [m for m in models_list if search_term.lower() in m.lower()] return gr.update(choices=filtered) with gr.Blocks(theme="Nymbo/Nymbo_Theme_5") as demo: gr.Markdown("# Serverless-TextGen-Hub (Enhanced)") gr.Markdown("**A comprehensive UI for text generation with a featured-models dropdown and a custom override**.") # We keep track of the conversation in a Gradio state variable (list of tuples) chat_history = gr.State([]) # Tabs for organization with gr.Tab("Basic Settings"): with gr.Row(): with gr.Column(elem_id="prompt-container"): # System Message system_msg = gr.Textbox( label="System message", placeholder="Enter system-level instructions or context here.", lines=2 ) # Accordion for featured models with gr.Accordion("Featured Models", open=True): model_search = gr.Textbox( label="Filter Models", placeholder="Search for a featured model...", lines=1 ) # The radio that lists our featured models model_radio = gr.Radio( label="Select a featured model below", choices=models_list, value=models_list[0], # default interactive=True ) # Link the search box to update the model_radio choices model_search.change(filter_models, inputs=model_search, outputs=model_radio) # Custom Model custom_model_box = gr.Textbox( label="Custom Model (Optional)", info="If provided, overrides the featured model above. e.g. 'meta-llama/Llama-3.3-70B-Instruct'", placeholder="Your huggingface.co/username/model_name path" ) with gr.Tab("Advanced Settings"): with gr.Row(): max_tokens_slider = gr.Slider( minimum=1, maximum=4096, value=512, step=1, label="Max new tokens" ) temperature_slider = gr.Slider( minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature" ) top_p_slider = gr.Slider( minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P" ) with gr.Row(): freq_penalty_slider = gr.Slider( minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty" ) seed_slider = gr.Slider( minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)" ) # Chat interface area: user input -> assistant output with gr.Row(): chatbot = gr.Chatbot( label="TextGen Chat", height=500 ) # The user types a message here user_input = gr.Textbox( label="Your message", placeholder="Type your text prompt here..." ) # "Send" button triggers our respond() function, updates the chatbot send_button = gr.Button("Send") # A Clear Chat button to reset the conversation clear_button = gr.Button("Clear Chat") # Define how the Send button updates the state and chatbot def user_submission(user_text, history): """ This function gets called first to add the user's message to the chat. We return the updated chat_history with the user's message appended, plus an empty string for the next user input box. """ if user_text.strip() == "": return history, "" # Append user message to chat history = history + [(user_text, None)] return history, "" send_button.click( fn=user_submission, inputs=[user_input, chat_history], outputs=[chat_history, user_input] ) # Then we run the respond function (streaming) to generate the assistant message def bot_response( history, system_msg, max_tokens, temperature, top_p, freq_penalty, seed, featured_model, custom_model ): """ This function is called to generate the assistant's response based on the conversation so far, system message, etc. We do the streaming here. 
""" if not history: yield history # The last user message is in history[-1][0] user_message = history[-1][0] if history else "" # We pass everything to respond() generator bot_stream = respond( user_message=user_message, chat_history=history[:-1], # all except the newly appended user message system_msg=system_msg, max_tokens=max_tokens, temperature=temperature, top_p=top_p, frequency_penalty=freq_penalty, seed=seed, featured_model=featured_model, custom_model=custom_model ) partial_text = "" for partial_text in bot_stream: # We'll keep updating the last message in the conversation with partial_text updated_history = history[:-1] + [(history[-1][0], partial_text)] yield updated_history send_button.click( fn=bot_response, inputs=[ chat_history, system_msg, max_tokens_slider, temperature_slider, top_p_slider, freq_penalty_slider, seed_slider, model_radio, custom_model_box ], outputs=chatbot ) # Clear chat just resets the state def clear_chat(): return [], "" clear_button.click( fn=clear_chat, inputs=[], outputs=[chat_history, user_input] ) # Launch the application if __name__ == "__main__": print("Launching the Serverless-TextGen-Hub with Featured Models & Custom Model override.") demo.launch()