# Refer to the llama recipes for more info: https://github.com/huggingface/huggingface-llama-recipes/blob/main/inference-api.ipynb
# huggingface-llama-recipes: https://github.com/huggingface/huggingface-llama-recipes/tree/main
import gradio as gr
from openai import OpenAI
import os

ACCESS_TOKEN = os.getenv("myHFtoken")
print("Access token loaded.")
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("Client initialized.")
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    model_name,  # New parameter for model selection
):
print(f"Received message: {message}")
print(f"History: {history}")
print(f"System message: {system_message}")
print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
print(f"Selected model: {model_name}")
messages = [{"role": "system", "content": system_message}]
for val in history:
if val[0]:
messages.append({"role": "user", "content": val[0]})
print(f"Added user message to context: {val[0]}")
if val[1]:
messages.append({"role": "assistant", "content": val[1]})
print(f"Added assistant message to context: {val[1]}")
messages.append({"role": "user", "content": message})
response = ""
print("Sending request to OpenAI API.")
for message in client.chat.completions.create(
model=model_name, # Use the selected model
max_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p,
messages=messages,
):
token = message.choices[0].delta.content
print(f"Received token: {token}")
response += token
yield response
print("Completed response generation.")
chatbot = gr.Chatbot(height=400)
print("Chatbot interface created.")
# Define the list of models
models = [
    "PowerInfer/SmallThinker-3B-Preview",  # OK
    "Qwen/QwQ-32B-Preview",  # OK
    "Qwen/Qwen2.5-Coder-32B-Instruct",  # OK
    "meta-llama/Llama-3.2-3B-Instruct",  # OK
    # "Qwen/Qwen2.5-32B-Instruct",  # fail, too large
    "microsoft/Phi-3-mini-128k-instruct",  # fail
    # "microsoft/Phi-3-medium-128k-instruct",  # fail
    # "microsoft/phi-4",  # fail, too large to be loaded automatically (29GB > 10GB)
    # "meta-llama/Llama-3.3-70B-Instruct",  # fail, needs HF Pro subscription
]
# Add a title and move the model dropdown to the top
with gr.Blocks() as demo:
    gr.Markdown("# LLM Test")  # Add a title to the top of the UI

    # Add the model dropdown above the chatbot
    model_dropdown = gr.Dropdown(choices=models, value=models[0], label="Select Model:")

    # Use the existing ChatInterface
    gr.ChatInterface(
        respond,
        chatbot=chatbot,
        additional_inputs=[
            gr.Textbox(value="", label="Additional System Prompt:"),
            gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max new tokens:"),
            gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, label="Temperature:"),
            gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P:"),
            model_dropdown,  # Pass the dropdown as an additional input
        ],
        fill_height=True,
    )
print("Gradio interface initialized.")
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()
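# Usage note (assumption): with a valid Hugging Face token exported as "myHFtoken",
# e.g. `export myHFtoken=hf_xxx`, running this script starts the Gradio app,
# by default on http://localhost:7860.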