import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
import os
import threading
import spaces
from openai import OpenAI
import sys
# Init ZeroGPU
# spaces.initialize_zero_gpu()
TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN, add_to_git_credential=False)
# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')
DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Loki πŸ‘οΈ</h1>
    <p>Loki chains <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><b>Llama3-8b</b></a> and <a href="https://platform.openai.com/docs/models/gpt-4o"><b>GPT-4o</b></a>: Llama 3 drafts a response and GPT-4o uses it to produce the final answer.</p>
</div>
'''
# Load the Llama 3 tokenizer and model onto the GPU for generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=TOKEN, torch_dtype=torch.float16).to('cuda')
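# Llama 3 chat models mark the end of each assistant turn with <|eot_id|> in addition
# to the default EOS token, so both are treated as stop tokens during generation.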
terminators = [
llama_tokenizer.eos_token_id,
llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
# Collapse the streamed output chunks into a single response string
def output_list(output: list):
    """
    Joins all non-None chunks from the output list and
    returns them as a single response string.
    """
    cleaned_output = ''.join(filter(None, output))
    return cleaned_output
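# For example, output_list(["Hello", None, " world"]) returns "Hello world".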
# Make sure Llama returns output as expected, then fold that output into a base
# prompt for GPT-4o.
def gpt_generation(input: str,
llama_output: str,
mode: str):
"""
Passes the llama output and all input,
returns the stream, so we can yield it in final generation.
"""
    if llama_output:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n
        The Llama3 LLM gave the user this response:\n\n {llama_output}\n
        Answer the user's question with the help of Llama3's response. If Llama3's response wasn't accurate,
        then ignore its output and give your own answer alone.'''
prompt = base_prompt.format(llama_input=input, llama_output=llama_output)
else:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n
        Respond in a thorough and complete way.'''
prompt = base_prompt.format(llama_input=input)
    # Set up the OpenAI client
client = OpenAI(api_key=API_KEY)
stream = client.chat.completions.create(
model=mode,
messages=[{"role": "system", "content": "You are a helpful assistant called 'Loki'."},
{"role": "user", "content": prompt}],
stream=True,
)
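    # Each streamed chunk carries its text in chunk.choices[0].delta.content;
    # bot_comms iterates over these chunks below.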
return stream
# Run Llama generation on the input and return a streamer over the output text
def llama_generation(input_text: str,
history: list,
temperature: float,
max_new_tokens: int):
"""
Pass input texts, tokenize, output and back to text.
"""
conversation = []
for user, assistant in history:
conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
conversation.append({"role": "user", "content": input_text})
    input_ids = llama_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors='pt').to(llama_model.device)
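    # TextIteratorStreamer buffers decoded text in a queue so it can be iterated
    # from this thread while generate() runs in a background thread.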
streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    # Generation arguments passed to model.generate()
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=terminators
    )
    # A temperature of 0 means greedy decoding: sampling is disabled and the
    # model picks the highest-probability token at each step.
    if temperature == 0:
        generate_kwargs["do_sample"] = False
    # Run generation in a background thread so the streamer can be consumed as tokens arrive
    thread = threading.Thread(target=llama_model.generate, kwargs=generate_kwargs)
    thread.start()
return streamer
def check_cuda():
if torch.cuda.is_available():
return f"GPU Being Used: {torch.cuda.get_device_name(0)}"
else:
return "No GPU is being used right now."
llm_mode = ""
@spaces.GPU(duration=30)
def bot_comms(input_text: str,
history: list,
temperature: float,
max_new_tokens: int):
"""
The connection between gradio and the LLM's
"""
    global llm_mode
if input_text == "system details":
yield f"Python: {sys.version}\nGradio Version: {gr.__version__}\nPyTorch Version: {torch.__version__}"
return
if input_text == "mode":
if llm_mode == "":
yield "The mode is currently at Loki Default mode"
return
else:
yield f"The current mode: {llm_mode}"
return
if input_text == "check cuda":
cuda_info = check_cuda()
yield cuda_info
return
if input_text == "switch to loki":
llm_mode = input_text
yield "Loki is on πŸ‘οΈ"
return
if input_text == "switch to llama":
llm_mode = input_text
yield "Got it! Llama is now activate for your questions only πŸ¦™"
return
if input_text == "switch to gpt-4o":
llm_mode = input_text
yield "Understood! GPT-4o is now hearing your responses only πŸ‘Ύ"
return
if input_text == "switch to gpt-3.5-turbo":
llm_mode = input_text
yield "Done. GPT-3.5-turbo is ready for your questions! πŸƒ"
return
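    # Route the actual question to the currently selected backend.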
if llm_mode == "switch to llama":
streamer = llama_generation(input_text=input_text, history=history, temperature=temperature, max_new_tokens=max_new_tokens)
outputs = []
for text in streamer:
outputs.append(text)
yield "".join(outputs)
if llm_mode == "switch to gpt-4o":
stream = gpt_generation(input=input_text, llama_output="", mode="gpt-4o")
outputs = []
for chunk in stream:
if chunk.choices[0].delta.content is not None:
text = chunk.choices[0].delta.content
outputs.append(text)
yield "".join(outputs)
if llm_mode == "switch to gpt-3.5-turbo":
stream = gpt_generation(input=input_text, llama_output="", mode="gpt-3.5-turbo")
outputs = []
for chunk in stream:
if chunk.choices[0].delta.content is not None:
text = chunk.choices[0].delta.content
outputs.append(text)
yield "".join(outputs)
    if llm_mode in (None, "", "switch to loki"):
streamer = llama_generation(input_text=input_text, history=history, temperature=temperature, max_new_tokens=max_new_tokens)
output_text = output_list([text for text in streamer])
stream = gpt_generation(input=input_text, llama_output=output_text, mode="gpt-4o")
outputs = []
for chunk in stream:
if chunk.choices[0].delta.content is not None:
text = chunk.choices[0].delta.content
outputs.append(text)
yield "".join(outputs)
chatbot = gr.Chatbot(height=600, label="Loki AI")
with gr.Blocks(fill_height=True) as demo:
gr.Markdown(DESCRIPTION)
gr.ChatInterface(
fn=bot_comms,
chatbot=chatbot,
fill_height=True,
        # Additional inputs that map to the temperature and max_new_tokens parameters of the generation functions
additional_inputs_accordion=gr.Accordion(label="βš™οΈ Parameters", open=False, render=False),
additional_inputs=[
            # Slider users can interact with to adjust the model temperature
gr.Slider(minimum=0,
maximum=1,
step=0.1,
value=0.95,
label="Temperature",
render=False),
            # Slider for the maximum number of new tokens to generate
gr.Slider(minimum=128,
maximum=1500,
step=1,
value=512,
label="Max new tokens",
render=False),
],
examples=[
["Make a poem of batman inside willy wonka"],
["How can you a burrito with just flour?"],
["How was saturn formed in 3 sentences"],
["How does the frontal lobe effect playing soccer"],
],
cache_examples=False
)
if __name__ == "__main__":
demo.launch()