import gradio as gr
import os
from openai import OpenAI

################################################
# INITIAL SETUP
################################################

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")
# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")
# Our main response-generating function
def respond(
    user_message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,
    custom_model
):
    """
    This function handles the chatbot response. It takes in:
    - user_message: the user's new message
    - history: the list of previous messages, each as [user_text, assistant_text]
    - system_message: the system prompt
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
    - featured_model: the model chosen from the Featured Models radio buttons
    - custom_model: a user-specified custom model that overrides featured_model if not empty
    """
    print(f"New user message: {user_message}")
    print(f"History so far: {history}")
    print(f"System message: {system_message}")
    print(f"max_tokens: {max_tokens}, temperature: {temperature}, top_p: {top_p}")
    print(f"frequency_penalty: {frequency_penalty}, seed: {seed}")
    print(f"Featured Model: {featured_model}")
    print(f"Custom Model: {custom_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Determine which model to use:
    # if the user typed something in custom_model, that overrides the featured model;
    # otherwise we use the model selected in the radio; if neither, default to "meta-llama/Llama-3.3-70B-Instruct".
    if custom_model.strip():
        model_to_use = custom_model.strip()
    elif featured_model is not None and featured_model.strip():
        model_to_use = featured_model.strip()
    else:
        model_to_use = "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")
    # Construct the conversation messages for the HF Inference API
    messages = [{"role": "system", "content": system_message}]
    for user_text, assistant_text in history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": user_message})

    # We'll collect and stream the response
    response_so_far = ""

    # Make the streaming request to the HF Inference API
    print("Sending request to the Hugging Face Inference API...")
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # The content of the partial chunk; it can be None (e.g. for the final chunk),
        # so fall back to an empty string before appending
        token_text = message_chunk.choices[0].delta.content or ""
        response_so_far += token_text
        # Yield the partial response so Gradio can display it in real time
        yield response_so_far

    print("Completed response generation.")
################################################
# GRADIO UI + STATE MANAGEMENT
################################################

def user_submit(user_message, history):
    """
    This function is called when the user sends a message.
    We simply add the user message to the conversation history.
    """
    print("user_submit triggered.")
    # Append the new user message to history
    if not history:
        history = []
    history = history + [[user_message, None]]
    return history, ""
def bot_reply(history, system_message, max_tokens, temperature, top_p,
              frequency_penalty, seed, featured_model, custom_model):
    """
    This function is triggered to produce the bot's response after the user has submitted.
    We call 'respond' for streaming text.
    """
    print("bot_reply triggered.")
    # The last conversation item holds [user_message, None]
    user_message = history[-1][0]
    # We will stream the partial responses from 'respond'
    bot_response = respond(
        user_message=user_message,
        history=history[:-1],  # all items except the last, because we pass the last user msg separately
        system_message=system_message,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        featured_model=featured_model,
        custom_model=custom_model
    )
    # As we yield from the generator, we update the last item in history with the partial response.
    # Gradio streaming logic: yield the partial updates as they come in
    for partial_text in bot_response:
        history[-1][1] = partial_text
        yield history
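# Yielding the full history works because gr.Chatbot (in its default pairs/'tuples'
# format) renders a list of [user, assistant] entries, which is exactly what 'history'
# holds; each yield therefore refreshes the chat display with the latest partial reply.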
# We define a small list of placeholder featured models for demonstration
models_list = [
    "meta-llama/Llama-2-13B-Chat-hf",
    "bigscience/bloom",
    "EleutherAI/gpt-neo-2.7B",
    "meta-llama/Llama-3.3-70B-Instruct"
]

def filter_models(search_term):
    """
    Filter function triggered when the user types in the model_search box.
    Returns an updated list of models that contain the search term.
    """
    filtered = [m for m in models_list if search_term.lower() in m.lower()]
    return gr.update(choices=filtered)
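# Note: this callback only narrows the visible choices; it does not change the currently
# selected value, so the previously selected model may no longer appear in the filtered list.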
################################################
# BUILDING THE GRADIO LAYOUT
################################################

with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.Markdown(
        """
        # Serverless-TextGen-Hub
        **A UI for text generation using Hugging Face's Inference API.**

        Below is a simple chat interface. You can pick from **Featured Models** or specify a **Custom Model**
        to override the choice. If you're not sure, just use the default.
        """
    )

    # State to hold the conversation history, as a list of [user, bot] pairs
    conversation_state = gr.State([])

    # Accordion for the system message + advanced settings
    with gr.Accordion("Advanced Settings", open=False):
        system_message = gr.Textbox(
            label="System Message",
            value="You are a helpful assistant.",
            lines=2,
            info="Provides background or personality instructions to the model."
        )
        max_tokens = gr.Slider(
            minimum=1,
            maximum=4096,
            value=512,
            step=1,
            label="Max new tokens"
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-P"
        )
        frequency_penalty = gr.Slider(
            minimum=-2.0,
            maximum=2.0,
            value=0.0,
            step=0.1,
            label="Frequency Penalty"
        )
        seed = gr.Slider(
            minimum=-1,
            maximum=65535,
            value=-1,
            step=1,
            label="Seed (-1 for random)"
        )

    # Featured Models + filtering
    with gr.Accordion("Featured Models", open=False):
        model_search = gr.Textbox(
            label="Filter Models",
            placeholder="Search for a featured model...",
            lines=1
        )
        featured_model_radio = gr.Radio(
            label="Select a featured model below",
            choices=models_list,
            value=models_list[0],  # default selection
            interactive=True
        )
        model_search.change(
            filter_models,
            inputs=model_search,
            outputs=featured_model_radio
        )

    # This is the Custom Model box (overrides Featured Models if not empty)
    custom_model = gr.Textbox(
        label="Custom Model",
        value="",
        info="(Optional) Provide a custom HF model path. If not empty, it overrides the Featured Model."
    )
    # The main Chatbot interface
    chatbot = gr.Chatbot(height=600)

    # Textbox for the user to type a new message
    with gr.Row():
        user_input = gr.Textbox(
            show_label=False,
            placeholder="Type your message here (press Enter or click 'Submit')",
            lines=2
        )
        submit_btn = gr.Button("Submit", variant="primary")
    # When the user submits, we first update the conversation state, then the bot replies,
    # streaming its output. Chaining with .then() ensures the new user message is already
    # stored in conversation_state before bot_reply reads it.
    submit_btn.click(
        fn=user_submit,
        inputs=[user_input, conversation_state],
        outputs=[conversation_state, user_input],
    ).then(
        fn=bot_reply,
        inputs=[
            conversation_state,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            featured_model_radio,
            custom_model
        ],
        outputs=[chatbot],
        # 'bot_reply' is a generator, so the request goes through the queue
        # so that partial updates can be streamed to the Chatbot
        queue=True
    )
    # We also allow pressing Enter in user_input to do the same thing
    user_input.submit(
        fn=user_submit,
        inputs=[user_input, conversation_state],
        outputs=[conversation_state, user_input],
    ).then(
        fn=bot_reply,
        inputs=[
            conversation_state,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            featured_model_radio,
            custom_model
        ],
        outputs=[chatbot],
        queue=True
    )
    gr.HTML("""
    <br>
    <p style='text-align:center;'>
        Developed by <strong>Nymbo</strong>.
        Powered by the <strong>Hugging Face Inference API</strong>.
    </p>
    """)
# Finally, launch the app
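# Note (an added remark, not in the original): on older Gradio 3.x releases, streaming
# generator outputs require enabling the queue explicitly (e.g. `demo.queue()` before
# `launch()`); on Gradio 4.x and later the queue is enabled by default.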
if __name__ == "__main__":
    print("Launching the Serverless-TextGen-Hub application...")
    demo.launch()