Nymbo committed on
Commit 880ced6 · verified · 1 Parent(s): e419588

Update app.py

Files changed (1): app.py +214 -226
app.py CHANGED
@@ -2,19 +2,8 @@ import gradio as gr
 from openai import OpenAI
 import os
 
-# --------------------------------------------------------------------------------
-# Serverless-TextGen-Hub
-# This application is a Gradio-based UI for text generation using
-# Hugging Face's serverless Inference API. We also incorporate features
-# inspired by the ImgGen-Hub, such as:
-# - A "Featured Models" accordion with text filtering.
-# - A "Custom Model" textbox for specifying a non-featured model.
-# - An "Information" tab with accordions for "Featured Models" and
-#   "Parameters Overview" containing helpful user guides.
-# --------------------------------------------------------------------------------
-
-# Retrieve the access token from environment variables
-ACCESS_TOKEN = os.getenv("HF_TOKEN")  # HF_TOKEN is your Hugging Face Inference API key
+# Retrieve the access token from the environment variable
+ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")
 
 # Initialize the OpenAI client with the Hugging Face Inference API endpoint
@@ -28,269 +17,268 @@ def respond(
     message,
     history: list[tuple[str, str]],
     system_message,
+    custom_model,
+    model,
     max_tokens,
     temperature,
     top_p,
     frequency_penalty,
-    seed,
-    # NEW inputs for model selection
-    model_search,
-    selected_model,
-    custom_model
+    seed
 ):
     """
-    This function handles the chatbot response.
-
-    Parameters:
-    - message: The user's newest message (string).
-    - history: The list of previous messages in the conversation, each as a tuple (user_msg, assistant_msg).
-    - system_message: The system prompt provided.
-    - max_tokens: The maximum number of tokens to generate in the response.
-    - temperature: Sampling temperature (float).
-    - top_p: Top-p (nucleus) sampling (float).
-    - frequency_penalty: Penalize repeated tokens in the output (float).
-    - seed: A fixed seed for reproducibility; -1 means 'random'.
-    - model_search: The text used to filter the "Featured Models" Radio button list (unused here directly, but updated by the UI).
-    - selected_model: The model selected via the "Featured Models" Radio button.
-    - custom_model: If not empty, overrides selected_model with this custom path.
+    This function handles the chatbot response. It takes in:
+    - message: the user's new message
+    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
+    - system_message: the system prompt
+    - custom_model: custom model path (if any)
+    - model: selected model from featured models
+    - max_tokens: the maximum number of tokens to generate in the response
+    - temperature: sampling temperature
+    - top_p: top-p (nucleus) sampling
+    - frequency_penalty: penalize repeated tokens in the output
+    - seed: a fixed seed for reproducibility; -1 will mean 'random'
    """
 
-    # DEBUG LOGGING
     print(f"Received message: {message}")
     print(f"History: {history}")
     print(f"System message: {system_message}")
+    print(f"Custom model: {custom_model}")
+    print(f"Selected model: {model}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
-    print(f"Model search text: {model_search}")
-    print(f"Selected featured model: {selected_model}")
-    print(f"Custom model (overrides if not empty): {custom_model}")
 
     # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
 
-    # Determine the final model name to use
-    # If the custom_model textbox is non-empty, we use that.
-    # Otherwise, we use the selected model from the Radio buttons.
-    if custom_model.strip():
-        model_to_use = custom_model.strip()
-    else:
-        model_to_use = selected_model
+    # Construct the messages array required by the API
+    messages = [{"role": "system", "content": system_message}]
 
-    # Construct the messages array required by the OpenAI-like HF API
-    messages = [{"role": "system", "content": system_message}]  # System prompt
-    # Add conversation history to context
+    # Add conversation history to the context
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
         if user_part:
             messages.append({"role": "user", "content": user_part})
+            print(f"Added user message to context: {user_part}")
         if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
+            print(f"Added assistant message to context: {assistant_part}")
 
     # Append the latest user message
     messages.append({"role": "user", "content": message})
 
     # Start with an empty string to build the response as tokens stream in
     response = ""
-    print(f"Using model: {model_to_use}")
-    print("Sending request to OpenAI API...")
+    print("Sending request to OpenAI API.")
+
+    # Determine which model to use
+    if custom_model.strip():
+        selected_model = custom_model.strip()
+    else:
+        # Map the display names to actual model paths
+        model_mapping = {
+            "Llama 2 70B": "meta-llama/Llama-2-70b-chat-hf",
+            "Mixtral 8x7B": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "Zephyr 7B": "HuggingFaceH4/zephyr-7b-beta",
+            "OpenChat 3.5": "openchat/openchat-3.5-0106",
+        }
+        selected_model = model_mapping.get(model, "meta-llama/Llama-2-70b-chat-hf")
 
     # Make the streaming request to the HF Inference API via openai-like client
-    # Below, we pass 'model_to_use' instead of a hard-coded model
     for message_chunk in client.chat.completions.create(
-        model=model_to_use,  # <-- model is now dynamically selected
+        model=selected_model,
         max_tokens=max_tokens,
-        stream=True,  # Stream the response
+        stream=True,
        temperature=temperature,
         top_p=top_p,
         frequency_penalty=frequency_penalty,
         seed=seed,
         messages=messages,
     ):
-        # Extract token text from the response chunk
+        # Extract the token text from the response chunk
        token_text = message_chunk.choices[0].delta.content
+        print(f"Received token: {token_text}")
         response += token_text
-        # As we get new tokens, we stream them back to the user
        yield response
 
     print("Completed response generation.")
 
 # Create a Chatbot component with a specified height
 chatbot = gr.Chatbot(height=600)
+print("Chatbot interface created.")
 
-# ------------------------------------------------------------
-# Below: We define the UI with additional features integrated.
-# We'll replicate some of the style from the ImgGen-Hub code:
-#  - A "Featured Models" accordion with the ability to filter
-#  - A "Custom Model" text box
-#  - An "Information" tab with "Featured Models" table and
-#    "Parameters Overview" containing markdown descriptions.
-# ------------------------------------------------------------
-
-# List of placeholder "Featured Models" for demonstration
-featured_models_list = [
-    "meta-llama/Llama-3.3-70B-Instruct",
-    "meta-llama/Llama-2-70B-chat-hf",
-    "meta-llama/Llama-2-13B-chat-hf",
-    "bigscience/bloom",
-    "google/flan-t5-xxl",
-]
-
-# This function filters the models in featured_models_list based on user input
-def filter_models(search_term):
-    """
-    Filters featured_models_list based on the text in 'search_term'.
-    """
-    filtered = [m for m in featured_models_list if search_term.lower() in m.lower()]
-    return gr.update(choices=filtered)
-
-print("Initializing Gradio interface...")  # Debug log
-
-# We build a custom Blocks layout to incorporate tabs and advanced UI elements
+# Create the Gradio interface with tabs
 with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
-
-    # Top-level heading for clarity
-    gr.Markdown("# Serverless-TextGen-Hub\nA Comprehensive UI for Text Generation")
-
-    with gr.Tab("Chat"):
-        # We'll place the ChatInterface within this tab
-
-        # Create the additional UI elements in a collapsible or visible layout
-        with gr.Accordion("Featured Models", open=False):
-            with gr.Row():
-                model_search = gr.Textbox(
-                    label="Filter Models",
-                    placeholder="Search for a featured model...",
-                    lines=1,
-                )
-            with gr.Row():
-                model_radio = gr.Radio(
-                    label="Select a featured model below",
-                    choices=featured_models_list,
-                    value="meta-llama/Llama-3.3-70B-Instruct",
-                    interactive=True,
-                )
-            # On change of model_search, we update the radio choices
-            model_search.change(
-                filter_models,
-                inputs=model_search,
-                outputs=model_radio
-            )
-
-        # Textbox for specifying a custom model that overrides the featured selection if not empty
-        custom_model = gr.Textbox(
-            label="Custom Model Path (overrides Featured Models if not empty)",
-            placeholder="e.g. meta-llama/Llama-2-13B-chat-hf",
-            lines=1
-        )
-
-        # Build the chat interface itself
-        # We'll pass "model_search", "model_radio", and "custom_model" as additional inputs
-        # so that the 'respond' function can see them and decide which model to use
-        chatbot_interface = gr.ChatInterface(
-            fn=respond,  # The function that generates the text
-            additional_inputs=[
-                gr.Textbox(
-                    value="You are a helpful AI assistant.",
+    with gr.Row():
+        with gr.Column():
+            # Basic Settings Tab
+            with gr.Tab("Settings"):
+                # System Message
+                system_message = gr.Textbox(
+                    value="",
                     label="System message",
-                    lines=2
-                ),  # system_message
-                gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),  # max_tokens
-                gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),  # temperature
-                gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),  # top_p
-                gr.Slider(
-                    minimum=-2.0,
-                    maximum=2.0,
-                    value=0.0,
-                    step=0.1,
-                    label="Frequency Penalty"
-                ),  # frequency_penalty
-                gr.Slider(
-                    minimum=-1,
-                    maximum=65535,
-                    value=-1,
-                    step=1,
-                    label="Seed (-1 for random)"
-                ),  # seed
-                model_search,  # Exposed but won't be typed into during conversation,
-                model_radio,
-                custom_model
-            ],
-            chatbot=chatbot,
-            title="Serverless-TextGen-Hub",
-            # The fill_height ensures the chat area expands
-            fill_height=True
-        )
-
-    # A new tab for "Information" about Featured Models and Parameters
-    with gr.Tab("Information"):
-        gr.Markdown("## Learn More About the Parameters and Models")
-
-        # Accordion for "Featured Models"
-        with gr.Accordion("Featured Models (WiP)", open=False):
-            gr.HTML(
-                """
-                <p>Below is a small table of example models. In practice, you can pick from
-                thousands of available text generation models on Hugging Face.
-                <br>
-                Use the <b>Filter Models</b> box under the <b>Featured Models</b> accordion
-                in the Chat tab to search by name, or enter a <b>Custom Model</b> path.</p>
-                <table style="width:100%; text-align:center; margin:auto;">
-                    <tr>
-                        <th>Model Name</th>
-                        <th>Is It Large?</th>
-                        <th>Notes</th>
-                    </tr>
-                    <tr>
-                        <td>meta-llama/Llama-3.3-70B-Instruct</td>
-                        <td>Yes</td>
-                        <td>Placeholder example</td>
-                    </tr>
-                    <tr>
-                        <td>meta-llama/Llama-2-13B-chat-hf</td>
-                        <td>Medium</td>
-                        <td>Placeholder example</td>
-                    </tr>
-                    <tr>
-                        <td>google/flan-t5-xxl</td>
-                        <td>Yes</td>
-                        <td>Placeholder example</td>
-                    </tr>
-                </table>
-                """
-            )
-
-        # Accordion for "Parameters Overview"
-        with gr.Accordion("Parameters Overview", open=False):
-            gr.Markdown(
-                """
-                ### Max New Tokens
-                Controls how many tokens can be generated in the response. A token is roughly a word or a piece of a word. If you need longer answers, increase this.
-
-                ### Temperature
-                A higher temperature makes the AI more 'creative' and random in its responses. Lower temperature keeps it more focused and deterministic.
-
-                ### Top-P
-                This is 'nucleus sampling.' It dictates the proportion of probability mass the model considers. At 1.0, it considers all words. Lower it to focus on the most likely words.
-
-                ### Frequency Penalty
-                Penalizes repeated tokens in the output. If you see a lot of repetition, increase this slightly to reduce the repetition.
-
-                ### Seed
-                If set to -1, the randomness is different each time. Setting a specific number ensures the same result each run, making responses reproducible.
-
-                ### Custom Model
-                If this field is filled, it overrides the selection from Featured Models. This way, you can try out any model on the HF Hub, e.g.
-                <code>meta-llama/Llama-2-70B-chat-hf</code> or <code>bigscience/bloom</code>.
-                """
-            )
-
-print("Gradio interface initialized.")
-
-# ------------------------------------------------------------
-# Finally, we launch the app if the script is run directly.
-# ------------------------------------------------------------
-if __name__ == "__main__":
-    print("Launching the demo application...")
-    demo.launch()
+                    placeholder="Enter a system message to guide the model's behavior"
+                )
+
+                # Model Selection Section
+                with gr.Accordion("Featured Models", open=True):
+                    # Model Search
+                    model_search = gr.Textbox(
+                        label="Filter Models",
+                        placeholder="Search for a featured model...",
+                        lines=1
+                    )
+
+                    # Featured Models List
+                    models_list = [
+                        "Llama 2 70B",
+                        "Mixtral 8x7B",
+                        "Zephyr 7B",
+                        "OpenChat 3.5"
+                    ]
+
+                    model = gr.Radio(
+                        label="Select a model",
+                        choices=models_list,
+                        value="Llama 2 70B"
+                    )
+
+                    # Custom Model Input
+                    custom_model = gr.Textbox(
+                        label="Custom Model",
+                        info="Hugging Face model path (optional)",
+                        placeholder="meta-llama/Llama-2-70b-chat-hf"
+                    )
+
+                    # Function to filter models
+                    def filter_models(search_term):
+                        filtered_models = [m for m in models_list if search_term.lower() in m.lower()]
+                        return gr.update(choices=filtered_models)
+
+                    # Update model list when search box is used
+                    model_search.change(filter_models, inputs=model_search, outputs=model)
+
+                # Generation Parameters
+                with gr.Row():
+                    max_tokens = gr.Slider(
+                        minimum=1,
+                        maximum=4096,
+                        value=512,
+                        step=1,
+                        label="Max new tokens"
+                    )
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=4.0,
+                        value=0.7,
+                        step=0.1,
+                        label="Temperature"
+                    )
+
+                with gr.Row():
+                    top_p = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=0.95,
+                        step=0.05,
+                        label="Top-P"
+                    )
+                    frequency_penalty = gr.Slider(
+                        minimum=-2.0,
+                        maximum=2.0,
+                        value=0.0,
+                        step=0.1,
+                        label="Frequency Penalty"
+                    )
+
+                with gr.Row():
+                    seed = gr.Slider(
+                        minimum=-1,
+                        maximum=65535,
+                        value=-1,
+                        step=1,
+                        label="Seed (-1 for random)"
+                    )
+
+            # Information Tab
+            with gr.Tab("Information"):
+                # Featured Models Table
+                with gr.Accordion("Featured Models", open=True):
+                    gr.HTML(
+                        """
+                        <p><a href="https://huggingface.co/models?inference=warm&pipeline_tag=text-to-text">See all available models</a></p>
+                        <table style="width:100%; text-align:center; margin:auto;">
+                            <tr>
+                                <th>Model Name</th>
+                                <th>Size</th>
+                                <th>Notes</th>
+                            </tr>
+                            <tr>
+                                <td>Llama 2 70B</td>
+                                <td>70B</td>
+                                <td>Meta's flagship model</td>
+                            </tr>
+                            <tr>
+                                <td>Mixtral 8x7B</td>
+                                <td>47B</td>
+                                <td>Mistral AI's MoE model</td>
+                            </tr>
+                            <tr>
+                                <td>Zephyr 7B</td>
+                                <td>7B</td>
+                                <td>Efficient fine-tuned model</td>
+                            </tr>
+                            <tr>
+                                <td>OpenChat 3.5</td>
+                                <td>7B</td>
+                                <td>High performance chat model</td>
+                            </tr>
+                        </table>
+                        """
+                    )
+
+                # Parameters Overview
+                with gr.Accordion("Parameters Overview", open=False):
+                    gr.Markdown(
+                        """
+                        ## System Message
+                        A message that sets the context and behavior for the model. This helps guide the model's responses.
+
+                        ## Max New Tokens
+                        Controls the maximum length of the generated response. Higher values allow for longer outputs but may take more time.
+
+                        ## Temperature
+                        Controls randomness in the output:
+                        - Lower values (0.1-0.5): More focused and deterministic
+                        - Higher values (0.7-1.0): More creative and diverse
+                        - Very high values (>1.0): More random and potentially chaotic
+
+                        ## Top-P (Nucleus Sampling)
+                        Controls the cumulative probability threshold for token selection:
+                        - Lower values: More focused on highly likely tokens
+                        - Higher values: Considers a wider range of possibilities
+
+                        ## Frequency Penalty
+                        Adjusts the likelihood of token repetition:
+                        - Negative values: May encourage repetition
+                        - Zero: Neutral
+                        - Positive values: Discourages repetition
+
+                        ## Seed
+                        A number that controls the randomness in generation:
+                        - -1: Random seed each time
+                        - Fixed value: Reproducible outputs with same parameters
+                        """
+                    )
+
+    # Set up the chat interface
+    chatbot = gr.Chatbot(height=600)
+    msg = gr.Textbox(label="Message")
+
+    clear = gr.ClearButton([msg, chatbot])
+
+    msg.submit(respond, [msg, chatbot, system_message, custom_model, model, max_tokens, temperature, top_p, frequency_penalty, seed], [chatbot, msg])
+
+print("Launching the demo application.")
+demo.launch(show_api=False, share=False)
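
For context on the change: the rewritten respond() generator streams tokens from Hugging Face's serverless Inference API through the OpenAI-compatible client. Below is a minimal standalone sketch of that pattern. The base_url and model ID are illustrative assumptions (the client = OpenAI(...) initialization sits outside this diff's context lines); any OpenAI-compatible endpoint and available chat model should behave the same way.

# Minimal sketch of the streaming pattern used by respond(), run outside Gradio.
# Assumptions: HF_TOKEN is set in the environment, and the base_url below
# matches the client initialization that this diff does not show.
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",  # assumed endpoint
    api_key=os.getenv("HF_TOKEN"),
)

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Say hello in one sentence."},
]

response = ""
for chunk in client.chat.completions.create(
    model="HuggingFaceH4/zephyr-7b-beta",  # any model from the featured list
    messages=messages,
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
    frequency_penalty=0.0,
    seed=42,  # a fixed seed makes the output reproducible; None means random
    stream=True,
):
    # Streamed deltas can be None (e.g. on the final chunk), so guard before
    # concatenating -- the committed respond() relies on every chunk carrying text.
    token_text = chunk.choices[0].delta.content or ""
    response += token_text
    print(token_text, end="", flush=True)

The `or ""` guard is worth noting when adapting this code: with stream=True, delta.content can be None, and concatenating None onto a string raises a TypeError.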