import spaces
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp_cuda_tensorcores import Llama
REPO_ID = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GGUF"
MODEL_NAME = "Meta-Llama-3-70B-Instruct.Q3_K_L.gguf"
MAX_CONTEXT_LENGTH = 8192
CUDA = True
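# These constants select the GGUF quantization to download from the Hub, the
# context window to allocate, and whether __main__ offloads all layers to the
# GPU (CUDA -> n_gpu_layers=-1) when building the llama.cpp config.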
SYSTEM_PROMPT = """You are controlling a 2 DOF robot on a 50x50 grid. The robot can move one step in any of the four cardinal directions. The robot can perform the following actions:
- 'up': Move one unit up (increasing y coordinate by 1).
- 'down': Move one unit down (decreasing y coordinate by 1).
- 'left': Move one unit left (decreasing x coordinate by 1).
- 'right': Move one unit right (increasing x coordinate by 1).
Given a target coordinate, your task is to calculate and output the shortest sequence of commands that will move the robot from its current position to the target position.
Output Format:
- Begin with the exact phrase: 'The full list is:'.
- Provide the sequence of commands as a JSON array, with each command as a string. Commands must be exactly 'up', 'down', 'left', or 'right'.
- All coordinates should be formatted as JSON objects with keys 'x' and 'y' and integer values. For example, the starting position should be output as {'x': 0, 'y': 0}.
- When calling tools, ensure that all arguments use this JSON object format for coordinates, with keys 'x' and 'y'.
- Example of correct output:
If the target coordinate is {'x': 2, 'y': 3}, your response should include:
'The full list is: ["right", "right", "up", "up", "up"]'
And for tool calls, use:
'tool_calls': [{'function': {'name': 'validate_path', 'arguments': {'commands': ["right", "right", "up", "up", "up"], 'start_position': {'x': 0, 'y': 0}, 'target_position': {'x': 2, 'y': 3}}}}]'
Please ensure that all output strictly adheres to these formats. If any output is not in the correct format, redo the task and correct the output before providing the final answer."""
TOKEN_STOP = ["<|eot_id|>"]
SYS_MSG = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSYSTEM_PROMPT<|eot_id|>\n"
USER_PROMPT = (
"<|start_header_id|>user<|end_header_id|>\n\nUSER_PROMPT<|eot_id|>\n"
)
ASSIS_PROMPT = "<|start_header_id|>assistant<|end_header_id|>\n\n"
END_ASSIS_PREVIOUS_RESPONSE = "<|eot_id|>\n"
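# Llama 3 Instruct special tokens, used to assemble the prompt string by hand
# instead of a built-in chat handler: <|begin_of_text|> opens the sequence,
# <|start_header_id|>/<|end_header_id|> wrap the role name, and <|eot_id|>
# closes each turn (it is also the stop token defined above). The literal
# SYSTEM_PROMPT / USER_PROMPT placeholders are substituted via str.replace().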
TASK_PROMPT = {
"Assistant": SYSTEM_PROMPT,
}
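# Illustrative only (not called anywhere in the app): a sketch of the shortest
# command sequence the system prompt asks the model to produce. With only
# cardinal moves, one optimal path is all horizontal moves followed by all
# vertical moves (a Manhattan path). The helper name and signature are ours,
# not part of the original Space.
def reference_command_list(start, target):
    """Return one shortest command list from `start` to `target`.

    Both arguments are dicts with integer keys 'x' and 'y', matching the
    coordinate format described in SYSTEM_PROMPT.
    """
    dx = target["x"] - start["x"]
    dy = target["y"] - start["y"]
    commands = ["right" if dx > 0 else "left"] * abs(dx)
    commands += ["up" if dy > 0 else "down"] * abs(dy)
    return commands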
# css = ".gradio-container {background-image: url('file=./assets/background.png'); background-size: cover; background-position: center; background-repeat: no-repeat;}"
class ChatLLM:
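    """Thin wrapper around llama.cpp's Llama that lazily loads the model and
    streams completions for the Gradio chat UI."""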
def __init__(self, config_model):
self.llm = None
self.config_model = config_model
# self.load_cpp_model()
    def load_cpp_model(self):
        # Instantiate the llama.cpp model from the stored configuration.
        self.llm = Llama(**self.config_model)
def apply_chat_template(
self,
history,
system_message,
):
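        """Render the chat history into a raw Llama 3 Instruct prompt string."""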
history = history or []
messages = SYS_MSG.replace("SYSTEM_PROMPT", system_message.strip())
for msg in history:
messages += (
USER_PROMPT.replace("USER_PROMPT", msg[0]) + ASSIS_PROMPT + msg[1]
)
messages += END_ASSIS_PREVIOUS_RESPONSE if msg[1] else ""
        print(messages)  # debug: log the assembled prompt
# messages = messages[:-1]
return messages
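    # For reference, with a single user turn the template above assembles roughly:
    #   <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>\n
    #   <|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>\n
    #   <|start_header_id|>assistant<|end_header_id|>\n\n
    # leaving the prompt open for the assistant's streamed completion.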
@spaces.GPU(duration=30)
def response(
self,
history,
system_message,
max_tokens,
temperature,
top_p,
top_k,
repeat_penalty,
):
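        """Stream a completion for the latest user turn.

        Yields (chatbot_history, state) pairs so Gradio can render the reply
        incrementally; @spaces.GPU requests a GPU for this call (duration=30 s).
        """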
messages = self.apply_chat_template(history, system_message)
history[-1][1] = ""
if not self.llm:
print("Loading model")
self.load_cpp_model()
for output in self.llm(
messages,
echo=False,
stream=True,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
repeat_penalty=repeat_penalty,
stop=TOKEN_STOP,
):
answer = output["choices"][0]["text"]
            # Accumulate the streamed chunk into the assistant slot of the last turn.
            history[-1][1] += answer
            # Yield the updated history so Gradio renders the partial response as it streams.
            yield history, history
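# Gradio callback: append the new user message to the history as a
# [user, assistant] pair; the empty assistant slot is filled in by the
# streaming ChatLLM.response generator.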
def user(message, history):
history = history or []
# Append the user's message to the conversation history
history.append([message, ""])
return "", history
def clear_chat(chat_history_state, chat_message):
    # Reset the conversation history and clear the message textbox.
    return [], ""
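# Build the Gradio Blocks UI: chat panel, contextual prompt editor, sampling
# controls, and the event wiring between them.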
def gui(llm_chat):
with gr.Blocks() as app:
gr.Markdown("# Llama 3 70B Instruct GGUF")
gr.Markdown(
f"""
            ### This demo runs {MODEL_NAME} from the repository {REPO_ID}, powered by the llama.cpp backend.
"""
)
with gr.Row():
with gr.Column(scale=2):
chatbot = gr.Chatbot(
label="Chat",
height=700,
avatar_images=(
"assets/avatar_user.jpeg",
"assets/avatar_llama.jpeg",
),
)
with gr.Column(scale=1):
with gr.Row():
message = gr.Textbox(
label="Message",
placeholder="Ask me anything.",
lines=3,
)
with gr.Row():
submit = gr.Button(value="Send message", variant="primary")
clear = gr.Button(value="New chat", variant="primary")
stop = gr.Button(value="Stop", variant="secondary")
with gr.Accordion("Contextual Prompt Editor"):
default_task = "Assistant"
task_prompts_gui = gr.Dropdown(
                        list(TASK_PROMPT.keys()),
value=default_task,
label="Prompt selector",
visible=True,
interactive=True,
)
system_msg = gr.Textbox(
TASK_PROMPT[default_task],
label="System Message",
placeholder="system prompt",
lines=4,
)
def task_selector(choice):
return gr.update(value=TASK_PROMPT[choice])
task_prompts_gui.change(
task_selector,
[task_prompts_gui],
[system_msg],
)
with gr.Accordion("Advanced settings", open=False):
with gr.Column():
max_tokens = gr.Slider(
20, 4096, label="Max Tokens", step=20, value=400
)
temperature = gr.Slider(
0.2, 2.0, label="Temperature", step=0.1, value=0.8
)
top_p = gr.Slider(
0.0, 1.0, label="Top P", step=0.05, value=0.95
)
top_k = gr.Slider(
0, 100, label="Top K", step=1, value=40
)
repeat_penalty = gr.Slider(
0.0,
2.0,
label="Repetition Penalty",
step=0.1,
value=1.1,
)
chat_history_state = gr.State()
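        # "New chat" clears both the stored history/message state and the visible chatbot.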
clear.click(
clear_chat,
inputs=[chat_history_state, message],
outputs=[chat_history_state, message],
queue=False,
)
clear.click(lambda: None, None, chatbot, queue=False)
submit_click_event = submit.click(
fn=user,
inputs=[message, chat_history_state],
outputs=[message, chat_history_state],
queue=True,
).then(
fn=llm_chat.response,
inputs=[
chat_history_state,
system_msg,
max_tokens,
temperature,
top_p,
top_k,
repeat_penalty,
],
outputs=[chatbot, chat_history_state],
queue=True,
)
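        # "Stop" cancels the in-flight submit/stream event chain.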
stop.click(
fn=None,
inputs=None,
outputs=None,
cancels=[submit_click_event],
queue=False,
)
return app
if __name__ == "__main__":
model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME)
config_model = {
"model_path": model_path,
"n_ctx": MAX_CONTEXT_LENGTH,
"n_gpu_layers": -1 if CUDA else 0,
}
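    # Optional llama.cpp settings (llama-cpp-python parameter names) that could be
    # added to config_model if needed; they are not set in this Space and are
    # listed here only for reference:
    #   "n_batch": 512,    # prompt-processing batch size
    #   "verbose": False,  # silence llama.cpp load/generation logs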
llm_chat = ChatLLM(config_model)
app = gui(llm_chat)
app.queue(default_concurrency_limit=40)
app.launch(
max_threads=40,
share=False,
show_error=True,
quiet=False,
debug=True,
allowed_paths=["./assets/"],
)