Spaces:

Nymbo
/

Serverless-TextGen-Hub

Running

App Files Files Community

Nymbo committed 27 days ago

Commit

e13eb1b

verified ·

1 Parent(s): cf508a7

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -126

app.py CHANGED Viewed

@@ -4,166 +4,204 @@ import os
 # Retrieve the access token from the environment variable
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
-# Initialize the OpenAI API client
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
 )
 def respond(
     message,
-    history,
     system_message,
     max_tokens,
     temperature,
     top_p,
     frequency_penalty,
-    seed
 ):
-    # Process the incoming message
     print(f"Received message: {message}")
     print(f"History: {history}")
-    print(f"System Message: {system_message}")
-    print(f"Max Tokens: {max_tokens}, Temperature: {temperature}, Top P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
-    # Convert seed to None if -1 (random)
     if seed == -1:
         seed = None
-    # Construct the messages list for the API
     messages = [{"role": "system", "content": system_message}]
     # Add conversation history to the context
-    for user_message, assistant_message in history:
-        if user_message:
-            messages.append({"role": "user", "content": user_message})
-            print(f"Added user message: {user_message}")
-        if assistant_message:
-            messages.append({"role": "assistant", "content": assistant_message})
-            print(f"Added assistant message: {assistant_message}")
-    # Append the latest message
     messages.append({"role": "user", "content": message})
-    # Initialize response
     response = ""
-    # Make the API request
-    for chunk in client.chat.completions.create(
-        model="meta-llama/Llama-3.3-70B-Instruct",
-        messages=messages,
         max_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
-        frequency_penalty=frequency_penalty,
-        seed=seed,
-        stream=True,
     ):
         # Extract the token text from the response chunk
-        token = chunk.choices[0].message.content
-        response += token
         yield response
-# Create the Gradio Chatbot component
-chatbot = gr.Chatbot(height=600)
-# Define the Gradio ChatInterface
-demo = gr.ChatInterface(
-    chatbot=chatbot,
-    fn=respond,
-    inputs=[
-        gr.Textbox(lines=1, placeholder="Enter your message..."),
-        gr.Chatbot(label="Conversation History"),
-        gr.Textbox(label="System Message"),
-        gr.Slider(minimum=10, maximum=200, step=1, label="Max Tokens"),
-        gr.Slider(minimum=0, maximum=2, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0, maximum=1, step=0.05, label="Top P"),
-        gr.Slider(minimum=-2, maximum=2, step=0.1, label="Frequency Penalty"),
-        gr.Slider(minimum=-1, maximum=1000000, step=1, label="Seed (-1 for random)"),
-    ],
-    theme="Nymbo/Nymbo_Theme",
-)
-# Create the "Featured Models" accordion
-with gr.Accordion("Featured Models", open=True) as featured_models:
-    # Textbox for searching models
-    model_search = gr.Textbox(label="Filter Models")
-    # List of featured models
-    models = [
-        "meta-llama/Llama-3.3-70B-Instruct",
-        "meta-llama/Llama-2-70B-Chat-hf",
-        "TheBloke/Llama-2-13B-Chat-GGML",
-        "TheBloke/Llama-2-70B-Chat-GGML",
-        "TheBloke/Llama-2-13B-Chat-GGML-v2",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2",
-        "TheBloke/Llama-2-70B-Chat-HF-API-compatible-GGML",
-        "TheBloke/Llama-2-70b-chat-hf",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-7-13B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
-        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
-        # Add more models as needed...
-    ]
-    # Radio buttons for selecting a model
-    model_radio = gr.Radio(choices=models, label="Select a Model")
-    # Update the model list based on search input
-    def filter_models(search_term):
-        filtered_models = [model for model in models if search_term.lower() in model.lower()]
-        return gr.update(choices=filtered_models)
-    # Update the model list when the search box is used
-    model_search.change(filter_models, inputs=model_search, outputs=model_radio)
-# Create a "Custom Model" textbox
-custom_model = gr.Textbox(label="Custom Model", placeholder="Hugging Face model path")
-# Create the "Information" tab
-with gr.Tab("Information"):
-    # Featured Models accordion
-    with gr.Accordion("Featured Models", open=False):
-        gr.Markdown(
-            """
-            # Featured Models
-            Here's a list of some popular models available on Hugging Face:
-            - meta-llama/Llama-3.3-70B-Instruct
-            - meta-llama/Llama-2-70B-Chat-hf
-            - TheBloke/Llama-2-13B-Chat-GGML
-            - TheBloke/Llama-2-70B-Chat-GGML
-            - TheBloke/Llama-2-13B-Chat-GGML-v2
-            - TheBloke/Llama-2-70B-Chat-GGML-v2
-            - ... (and many more)
-            You can search and select a model from the list above, or use your own custom model path.
-            """
-        )
-    # Parameters Overview accordion
-    with gr.Accordion("Parameters Overview", open=False):
-        gr.Markdown(
-            """
-            # Parameters Overview
-            Here's a brief explanation of the parameters you can adjust:
-            - **Max Tokens**: The maximum number of tokens to generate in the response.
-            - **Temperature**: Controls the randomness of the output. Higher values make the output more random.
-            - **Top P**: Also known as nucleus sampling, it filters the least probable tokens, encouraging the model to be more creative.
-            - **Frequency Penalty**: Penalizes repeated tokens to avoid repetition.
-            - **Seed**: A fixed seed for reproducibility. Use -1 for a random seed.
-            Feel free to experiment with these settings to achieve the desired output.
-            """
         )
-# Launch the Gradio interface
-demo.launch(share=True)

 # Retrieve the access token from the environment variable
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
+# Report the outcome of the lookup without ever echoing the secret itself
+if ACCESS_TOKEN:
+    print("Access token loaded.")
+else:
+    print("Warning: HF_TOKEN is not set or empty.")
+# Initialize the OpenAI client with the Hugging Face Inference API endpoint
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
 )
+print("OpenAI client initialized.")
 def respond(
     message,
+    history: list[tuple[str, str]],
     system_message,
     max_tokens,
     temperature,
     top_p,
     frequency_penalty,
+    seed,
+    selected_model,
 ):
+    """
+    Stream a chat completion for ``message`` and yield the growing reply.
+
+    Args:
+        message: the user's new message.
+        history: the previous turns, each a (user_msg, assistant_msg) pair;
+            an empty or ``None`` half of a pair is left out of the context.
+        system_message: the system prompt, sent as the first message.
+        max_tokens: the maximum number of tokens to generate in the response.
+        temperature: sampling temperature.
+        top_p: top-p (nucleus) sampling.
+        frequency_penalty: penalize repeated tokens in the output.
+        seed: a fixed seed for reproducibility; -1 means 'random' and is
+            passed to the API as ``None``.
+        selected_model: the model to use for generating the response.
+
+    Yields:
+        The response text accumulated so far, once for every streamed chunk
+        that carries text.
+    """
     print(f"Received message: {message}")
     print(f"History: {history}")
+    print(f"System message: {system_message}")
+    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
+    print(f"Selected model: {selected_model}")
+    # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
+    # Construct the messages array required by the API
     messages = [{"role": "system", "content": system_message}]
     # Add conversation history to the context
+    for user_part, assistant_part in history:
+        if user_part:
+            messages.append({"role": "user", "content": user_part})
+            print(f"Added user message to context: {user_part}")
+        if assistant_part:
+            messages.append({"role": "assistant", "content": assistant_part})
+            print(f"Added assistant message to context: {assistant_part}")
+    # Append the latest user message
     messages.append({"role": "user", "content": message})
+    # Start with an empty string to build the response as tokens stream in
     response = ""
+    print("Sending request to OpenAI API.")
+    # Make the streaming request to the HF Inference API via openai-like client
+    for message_chunk in client.chat.completions.create(
+        model=selected_model,  # Use the selected model
         max_tokens=max_tokens,
+        stream=True,  # Stream the response
         temperature=temperature,
         top_p=top_p,
+        frequency_penalty=frequency_penalty,
+        seed=seed,
+        messages=messages,
     ):
         # Extract the token text from the response chunk
+        # A streamed chunk may have no choices at all, and its delta content
+        # may be None (e.g. a role-only or final chunk); concatenating None
+        # would raise TypeError, so such chunks are skipped.
+        if not message_chunk.choices:
+            continue
+        token_text = message_chunk.choices[0].delta.content
+        if not token_text:
+            continue
+        print(f"Received token: {token_text}")
+        response += token_text
         yield response
+    print("Completed response generation.")
+# Define the list of featured models
+# NOTE(review): respond() sends every model through the chat-completions
+# endpoint; confirm that each entry below is actually served there.
+featured_models = [
+    "meta-llama/Llama-3.3-70B-Instruct",
+    "google/flan-t5-xl",
+    "facebook/bart-large-cnn",
+    "EleutherAI/gpt-neo-2.7B",
+    # Add more featured models here
+]
+# Create the Gradio Blocks interface
+with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
+    # Tab for model selection
+    with gr.Tab("Models"):
+        with gr.Row():
+            with gr.Column():
+                with gr.Accordion("Featured Models", open=True):
+                    model_search = gr.Textbox(label="Filter Models", placeholder="Search for a featured model...", lines=1)
+                    model = gr.Dropdown(label="Select a model below", choices=featured_models, value="meta-llama/Llama-3.3-70B-Instruct", interactive=True)
+                    def filter_models(search_term):
+                        """Return a dropdown update listing the featured models that contain search_term (case-insensitive)."""
+                        needle = search_term.lower()
+                        filtered_models = [m for m in featured_models if needle in m.lower()]
+                        return gr.update(choices=filtered_models)
+                    model_search.change(filter_models, inputs=model_search, outputs=model)
+                # When filled in, this ID takes precedence over the dropdown (see bot below)
+                custom_model = gr.Textbox(label="Custom Model", placeholder="Enter a custom model ID here", interactive=True)
+    # Tab for chat interface
+    with gr.Tab("Chat"):
+        with gr.Row():
+            with gr.Column():
+                # `container` is a constructor argument; the chained `.style()`
+                # helper was removed in Gradio 4 and raises AttributeError there.
+                txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter", container=False)
+        # Additional parameters
+        with gr.Row():
+            with gr.Column():
+                system_message = gr.Textbox(label="System Message", value="", lines=3)
+                max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max New Tokens")
+                temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
+                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
+                frequency_penalty = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
+                seed = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")
+        # Chatbot display (the only Chatbot instance; it is created inside the
+        # Blocks context so that it is actually rendered)
+        chatbot = gr.Chatbot(height=600)
+        print("Chatbot interface created.")
+        # Submit button
+        submit_btn = gr.Button("Submit")
+    # Tab for information
+    with gr.Tab("Information"):
+        with gr.Row():
+            gr.Markdown(
+                """
+                # Featured Models
+                - **meta-llama/Llama-3.3-70B-Instruct**: A large language model from Meta.
+                - **google/flan-t5-xl**: A pretrained encoder-decoder model from Google.
+                - **facebook/bart-large-cnn**: A pretrained sequence-to-sequence model from Facebook.
+                - **EleutherAI/gpt-neo-2.7B**: A large autoregressive language model from EleutherAI.
+                # Parameters Overview
+                - **System Message**: Sets the behavior and context for the assistant.
+                - **Max New Tokens**: Limits the length of the generated response.
+                - **Temperature**: Controls the randomness of the output. Higher values make output more random.
+                - **Top-P**: Controls the diversity of text by selecting tokens that account for top-p probability mass.
+                - **Frequency Penalty**: Decreases the model's likelihood to repeat the same lines.
+                - **Seed**: Ensures reproducibility of results; set to -1 for random seed.
+                """
+            )
+    # Function to handle chat submission
+    def user(user_message, history):
+        """Clear the input box and append the new message as a turn that has no reply yet."""
+        # `history or []` tolerates a chatbot whose value is still None
+        return "", (history or []) + [[user_message, None]]
+    # Function to process the chat
+    def bot(history, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, selected_model, custom_model_id=""):
+        """
+        Stream the assistant reply for the last turn in history.
+
+        The last entry of history is the pending [user_message, None] turn
+        appended by user(). A non-blank custom_model_id overrides
+        selected_model. Yields the updated history after every partial
+        response produced by respond(), so the chatbot fills in as text
+        arrives instead of only once the whole reply is complete.
+        """
+        # Get the last user message
+        user_message = history[-1][0]
+        # The custom model box wins over the dropdown when it is filled in
+        model_id = (custom_model_id or "").strip() or selected_model
+        # Start from an empty reply so the turn is well-formed even if nothing is streamed
+        history[-1][1] = ""
+        for partial_response in respond(
+            user_message,
+            history[:-1],  # Exclude the last user message which doesn't have a response yet
+            system_message,
+            max_tokens,
+            temperature,
+            top_p,
+            frequency_penalty,
+            seed,
+            model_id,
+        ):
+            # respond() yields the cumulative text, so overwrite rather than append
+            history[-1][1] = partial_response
+            yield history
+        # Emit the final state once more so an empty stream still updates the chatbot
+        yield history
+    # Set up the chat flow: pressing Enter and clicking Submit run the same two steps
+    bot_inputs = [chatbot, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, model, custom_model]
+    for trigger in (txt.submit, submit_btn.click):
+        trigger(user, [txt, chatbot], [txt, chatbot], queue=False).then(
+            bot, bot_inputs, chatbot
+        )
+print("Gradio interface initialized.")
+if __name__ == "__main__":
+    print("Launching the demo application.")
+    # Generator handlers such as bot() are served through the queue
+    demo.queue().launch()