# Refer to the llama recipes for more info: https://github.com/huggingface/huggingface-llama-recipes/blob/main/inference-api.ipynb
# huggingface-llama-recipes: https://github.com/huggingface/huggingface-llama-recipes/tree/main
import gradio as gr
from openai import OpenAI
import os

ACCESS_TOKEN = os.getenv("myHFtoken")
print("Access token loaded.")
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("Client initialized.")
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    model_name,  # New parameter for model selection
):
print(f"Received message: {message}")
print(f"History: {history}")
print(f"System message: {system_message}")
print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
print(f"Selected model: {model_name}")
messages = [{"role": "system", "content": system_message}]
for val in history:
if val[0]:
messages.append({"role": "user", "content": val[0]})
print(f"Added user message to context: {val[0]}")
if val[1]:
messages.append({"role": "assistant", "content": val[1]})
print(f"Added assistant message to context: {val[1]}")
messages.append({"role": "user", "content": message})
response = ""
print("Sending request to OpenAI API.")
for message in client.chat.completions.create(
model=model_name, # Use the selected model
max_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p,
messages=messages,
):
token = message.choices[0].delta.content
print(f"Received token: {token}")
response += token
yield response
print("Completed response generation.")
chatbot = gr.Chatbot(height=400)
print("Chatbot interface created.")
# Define the list of models
models = [
    "PowerInfer/SmallThinker-3B-Preview",  # OK
    "Qwen/QwQ-32B-Preview",  # OK
    "Qwen/Qwen2.5-Coder-32B-Instruct",  # OK
    "meta-llama/Llama-3.2-3B-Instruct",  # OK
    # "Qwen/Qwen2.5-32B-Instruct",  # fail, too large
    "microsoft/Phi-3-mini-128k-instruct",  # fail
    # "microsoft/Phi-3-medium-128k-instruct",  # fail
    # "microsoft/phi-4",  # fail, too large to be loaded automatically (29GB > 10GB)
    # "meta-llama/Llama-3.3-70B-Instruct",  # fail, needs HF Pro subscription
]
# Add a title and move the model dropdown to the top
with gr.Blocks() as demo:
    gr.Markdown("# LLM Test")  # Add a title to the top of the UI

    # Add the model dropdown above the chatbot
    model_dropdown = gr.Dropdown(choices=models, value=models[0], label="Select Model:")

    # Use the existing ChatInterface
    gr.ChatInterface(
        respond,
        chatbot=chatbot,
        additional_inputs=[
            gr.Textbox(value="", label="Additional System Prompt:"),
            gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max new tokens:"),
            gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, label="Temperature:"),
            gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P:"),
            model_dropdown,  # Pass the dropdown as an additional input
        ],
        fill_height=True,
    )
print("Gradio interface initialized.")
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()
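# Usage note (assumption): with a valid Hugging Face token exported as "myHFtoken",
# e.g. `export myHFtoken=hf_xxx`, running this script starts the Gradio app,
# by default on http://localhost:7860.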