import spaces
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp_cuda_tensorcores import Llama
REPO_ID = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GGUF"
MODEL_NAME = "Meta-Llama-3-70B-Instruct.Q3_K_L.gguf"
MAX_CONTEXT_LENGTH = 8192
CUDA = True
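# These constants select the GGUF quantization to download from the Hub, the
# context window to allocate, and whether __main__ offloads all layers to the
# GPU (CUDA -> n_gpu_layers=-1) when building the llama.cpp config.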
SYSTEM_PROMPT = """You are controlling a 2 DOF robot on a 50x50 grid. The robot can move one step in any of the four cardinal directions. The robot can perform the following actions:
- 'up': Move one unit up (increasing y coordinate by 1).
- 'down': Move one unit down (decreasing y coordinate by 1).
- 'left': Move one unit left (decreasing x coordinate by 1).
- 'right': Move one unit right (increasing x coordinate by 1).
Given a target coordinate, your task is to calculate and output the shortest sequence of commands that will move the robot from its current position to the target position.
Output Format:
- Begin with the exact phrase: 'The full list is:'.
- Provide the sequence of commands as a JSON array, with each command as a string. Commands must be exactly 'up', 'down', 'left', or 'right'.
- All coordinates should be formatted as JSON objects with keys 'x' and 'y' and integer values. For example, the starting position should be output as {'x': 0, 'y': 0}.
- When calling tools, ensure that all arguments use this JSON object format for coordinates, with keys 'x' and 'y'.
- Example of correct output:
If the target coordinate is {'x': 2, 'y': 3}, your response should include:
'The full list is: ["right", "right", "up", "up", "up"]'
And for tool calls, use:
'tool_calls': [{'function': {'name': 'validate_path', 'arguments': {'commands': ["right", "right", "up", "up", "up"], 'start_position': {'x': 0, 'y': 0}, 'target_position': {'x': 2, 'y': 3}}}}]'
Please ensure that all output strictly adheres to these formats. If any output is not in the correct format, redo the task and correct the output before providing the final answer."""
TOKEN_STOP = ["<|eot_id|>"]
SYS_MSG = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSYSTEM_PROMPT<|eot_id|>\n"
USER_PROMPT = (
"<|start_header_id|>user<|end_header_id|>\n\nUSER_PROMPT<|eot_id|>\n"
)
ASSIS_PROMPT = "<|start_header_id|>assistant<|end_header_id|>\n\n"
END_ASSIS_PREVIOUS_RESPONSE = "<|eot_id|>\n"
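# Llama 3 Instruct special tokens, used to assemble the prompt string by hand
# instead of a built-in chat handler: <|begin_of_text|> opens the sequence,
# <|start_header_id|>/<|end_header_id|> wrap the role name, and <|eot_id|>
# closes each turn (it is also the stop token defined above). The literal
# SYSTEM_PROMPT / USER_PROMPT placeholders are substituted via str.replace().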
TASK_PROMPT = {
"Assistant": SYSTEM_PROMPT,
}
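# Illustrative only (not called anywhere in the app): a sketch of the shortest
# command sequence the system prompt asks the model to produce. With only
# cardinal moves, one optimal path is all horizontal moves followed by all
# vertical moves (a Manhattan path). The helper name and signature are ours,
# not part of the original Space.
def reference_command_list(start, target):
    """Return one shortest command list from `start` to `target`.

    Both arguments are dicts with integer keys 'x' and 'y', matching the
    coordinate format described in SYSTEM_PROMPT.
    """
    dx = target["x"] - start["x"]
    dy = target["y"] - start["y"]
    commands = ["right" if dx > 0 else "left"] * abs(dx)
    commands += ["up" if dy > 0 else "down"] * abs(dy)
    return commands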
# css = ".gradio-container {background-image: url('file=./assets/background.png'); background-size: cover; background-position: center; background-repeat: no-repeat;}"
class ChatLLM:
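    """Thin wrapper around llama.cpp's Llama that lazily loads the model and
    streams completions for the Gradio chat UI."""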
def __init__(self, config_model):
self.llm = None
self.config_model = config_model
# self.load_cpp_model()
    def load_cpp_model(self):
        # Instantiate the llama.cpp model from the stored configuration.
        self.llm = Llama(**self.config_model)
def apply_chat_template(
self,
history,
system_message,
):
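        """Render the chat history into a raw Llama 3 Instruct prompt string."""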
history = history or []
messages = SYS_MSG.replace("SYSTEM_PROMPT", system_message.strip())
for msg in history:
messages += (
USER_PROMPT.replace("USER_PROMPT", msg[0]) + ASSIS_PROMPT + msg[1]
)
messages += END_ASSIS_PREVIOUS_RESPONSE if msg[1] else ""
        print(messages)  # debug: log the assembled prompt
# messages = messages[:-1]
return messages
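    # For reference, with a single user turn the template above assembles roughly:
    #   <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>\n
    #   <|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>\n
    #   <|start_header_id|>assistant<|end_header_id|>\n\n
    # leaving the prompt open for the assistant's streamed completion.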
@spaces.GPU(duration=30)
def response(
self,
history,
system_message,
max_tokens,
temperature,
top_p,
top_k,
repeat_penalty,
):
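        """Stream a completion for the latest user turn.

        Yields (chatbot_history, state) pairs so Gradio can render the reply
        incrementally; @spaces.GPU requests a GPU for this call (duration=30 s).
        """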
messages = self.apply_chat_template(history, system_message)
history[-1][1] = ""
if not self.llm:
print("Loading model")
self.load_cpp_model()
for output in self.llm(
messages,
echo=False,
stream=True,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
repeat_penalty=repeat_penalty,
stop=TOKEN_STOP,
):
answer = output["choices"][0]["text"]
            # Accumulate the streamed chunk into the assistant slot of the last turn.
            history[-1][1] += answer
            # Yield the updated history so Gradio renders the partial response as it streams.
            yield history, history
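# Gradio callback: append the new user message to the history as a
# [user, assistant] pair; the empty assistant slot is filled in by the
# streaming ChatLLM.response generator.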
def user(message, history):
history = history or []
# Append the user's message to the conversation history
history.append([message, ""])
return "", history
def clear_chat(chat_history_state, chat_message):
    # Reset the conversation history and clear the message textbox.
    return [], ""
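# Build the Gradio Blocks UI: chat panel, contextual prompt editor, sampling
# controls, and the event wiring between them.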
def gui(llm_chat):
with gr.Blocks() as app:
gr.Markdown("# Llama 3 70B Instruct GGUF")
gr.Markdown(
f"""
            ### This demo runs {MODEL_NAME} from the repository {REPO_ID}, powered by the llama.cpp backend.
"""
)
with gr.Row():
with gr.Column(scale=2):
chatbot = gr.Chatbot(
label="Chat",
height=700,
avatar_images=(
"assets/avatar_user.jpeg",
"assets/avatar_llama.jpeg",
),
)
with gr.Column(scale=1):
with gr.Row():
message = gr.Textbox(
label="Message",
placeholder="Ask me anything.",
lines=3,
)
with gr.Row():
submit = gr.Button(value="Send message", variant="primary")
clear = gr.Button(value="New chat", variant="primary")
stop = gr.Button(value="Stop", variant="secondary")
with gr.Accordion("Contextual Prompt Editor"):
default_task = "Assistant"
task_prompts_gui = gr.Dropdown(
                        list(TASK_PROMPT.keys()),
value=default_task,
label="Prompt selector",
visible=True,
interactive=True,
)
system_msg = gr.Textbox(
TASK_PROMPT[default_task],
label="System Message",
placeholder="system prompt",
lines=4,
)
def task_selector(choice):
return gr.update(value=TASK_PROMPT[choice])
task_prompts_gui.change(
task_selector,
[task_prompts_gui],
[system_msg],
)
with gr.Accordion("Advanced settings", open=False):
with gr.Column():
max_tokens = gr.Slider(
20, 4096, label="Max Tokens", step=20, value=400
)
temperature = gr.Slider(
0.2, 2.0, label="Temperature", step=0.1, value=0.8
)
top_p = gr.Slider(
0.0, 1.0, label="Top P", step=0.05, value=0.95
)
top_k = gr.Slider(
0, 100, label="Top K", step=1, value=40
)
repeat_penalty = gr.Slider(
0.0,
2.0,
label="Repetition Penalty",
step=0.1,
value=1.1,
)
chat_history_state = gr.State()
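        # "New chat" clears both the stored history/message state and the visible chatbot.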
clear.click(
clear_chat,
inputs=[chat_history_state, message],
outputs=[chat_history_state, message],
queue=False,
)
clear.click(lambda: None, None, chatbot, queue=False)
submit_click_event = submit.click(
fn=user,
inputs=[message, chat_history_state],
outputs=[message, chat_history_state],
queue=True,
).then(
fn=llm_chat.response,
inputs=[
chat_history_state,
system_msg,
max_tokens,
temperature,
top_p,
top_k,
repeat_penalty,
],
outputs=[chatbot, chat_history_state],
queue=True,
)
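        # "Stop" cancels the in-flight submit/stream event chain.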
stop.click(
fn=None,
inputs=None,
outputs=None,
cancels=[submit_click_event],
queue=False,
)
return app
if __name__ == "__main__":
model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME)
config_model = {
"model_path": model_path,
"n_ctx": MAX_CONTEXT_LENGTH,
"n_gpu_layers": -1 if CUDA else 0,
}
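    # Optional llama.cpp settings (llama-cpp-python parameter names) that could be
    # added to config_model if needed; they are not set in this Space and are
    # listed here only for reference:
    #   "n_batch": 512,    # prompt-processing batch size
    #   "verbose": False,  # silence llama.cpp load/generation logs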
llm_chat = ChatLLM(config_model)
app = gui(llm_chat)
app.queue(default_concurrency_limit=40)
app.launch(
max_threads=40,
share=False,
show_error=True,
quiet=False,
debug=True,
allowed_paths=["./assets/"],
)