import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
import os
import threading
import spaces
from openai import OpenAI
import sys
# Init ZeroGPU
# spaces.initialize_zero_gpu()
TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN, add_to_git_credential=False)
# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')
DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Loki πŸ‘οΈ</h1>
    <p>Loki chains <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><b>Llama3-8b</b></a> and <a href="https://platform.openai.com/docs/models/gpt-4o"><b>GPT-4o</b></a>: Llama 3 drafts a response and GPT-4o uses it to produce the final answer.</p>
</div>
'''
# Load the Llama 3 tokenizer and model onto the GPU for generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=TOKEN, torch_dtype=torch.float16).to('cuda')
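# Llama 3 chat models mark the end of each assistant turn with <|eot_id|> in addition
# to the default EOS token, so both are treated as stop tokens during generation.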
terminators = [
llama_tokenizer.eos_token_id,
llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
# Collapse the streamed output chunks into a single response string
def output_list(output: list):
    """
    Joins all non-None chunks from the output list and
    returns them as a single response string.
    """
    cleaned_output = ''.join(filter(None, output))
    return cleaned_output
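# For example, output_list(["Hello", None, " world"]) returns "Hello world".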
# Make sure Llama returns output as expected, then fold that output into a base
# prompt for GPT-4o.
def gpt_generation(input: str,
llama_output: str,
mode: str):
"""
Passes the llama output and all input,
returns the stream, so we can yield it in final generation.
"""
    if llama_output:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n
        The Llama3 LLM gave the user this response:\n\n {llama_output}\n
        Answer the user's question with the help of Llama3's response. If Llama3's response wasn't accurate,
        then ignore its output and give your own answer alone.'''
prompt = base_prompt.format(llama_input=input, llama_output=llama_output)
else:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n
        Respond in a thorough and complete way.'''
prompt = base_prompt.format(llama_input=input)
    # Set up the OpenAI client
client = OpenAI(api_key=API_KEY)
stream = client.chat.completions.create(
model=mode,
messages=[{"role": "system", "content": "You are a helpful assistant called 'Loki'."},
{"role": "user", "content": prompt}],
stream=True,
)
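    # Each streamed chunk carries its text in chunk.choices[0].delta.content;
    # bot_comms iterates over these chunks below.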
return stream
# Run Llama generation on the input and return a streamer over the output text
def llama_generation(input_text: str,
history: list,
temperature: float,
max_new_tokens: int):
"""
Pass input texts, tokenize, output and back to text.
"""
conversation = []
for user, assistant in history:
conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
conversation.append({"role": "user", "content": input_text})
    input_ids = llama_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors='pt').to(llama_model.device)
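    # TextIteratorStreamer buffers decoded text in a queue so it can be iterated
    # from this thread while generate() runs in a background thread.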
streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    # Generation arguments passed to model.generate()
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=terminators
    )
    # A temperature of 0 means greedy decoding: sampling is disabled and the
    # model picks the highest-probability token at each step.
    if temperature == 0:
        generate_kwargs["do_sample"] = False
    # Run generation in a background thread so the streamer can be consumed as tokens arrive
    thread = threading.Thread(target=llama_model.generate, kwargs=generate_kwargs)
    thread.start()
return streamer
def check_cuda():
if torch.cuda.is_available():
return f"GPU Being Used: {torch.cuda.get_device_name(0)}"
else:
return "No GPU is being used right now."
llm_mode = ""
@spaces.GPU(duration=30)
def bot_comms(input_text: str,
history: list,
temperature: float,
max_new_tokens: int):
"""
The connection between gradio and the LLM's
"""
    global llm_mode
if input_text == "system details":
yield f"Python: {sys.version}\nGradio Version: {gr.__version__}\nPyTorch Version: {torch.__version__}"
return
if input_text == "mode":
if llm_mode == "":
yield "The mode is currently at Loki Default mode"
return
else:
yield f"The current mode: {llm_mode}"
return
if input_text == "check cuda":
cuda_info = check_cuda()
yield cuda_info
return
if input_text == "switch to loki":
llm_mode = input_text
yield "Loki is on πŸ‘οΈ"
return
if input_text == "switch to llama":
llm_mode = input_text
yield "Got it! Llama is now activate for your questions only πŸ¦™"
return
if input_text == "switch to gpt-4o":
llm_mode = input_text
yield "Understood! GPT-4o is now hearing your responses only πŸ‘Ύ"
return
if input_text == "switch to gpt-3.5-turbo":
llm_mode = input_text
yield "Done. GPT-3.5-turbo is ready for your questions! πŸƒ"
return
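    # Route the actual question to the currently selected backend.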
if llm_mode == "switch to llama":
streamer = llama_generation(input_text=input_text, history=history, temperature=temperature, max_new_tokens=max_new_tokens)
outputs = []
for text in streamer:
outputs.append(text)
yield "".join(outputs)
if llm_mode == "switch to gpt-4o":
stream = gpt_generation(input=input_text, llama_output="", mode="gpt-4o")
outputs = []
for chunk in stream:
if chunk.choices[0].delta.content is not None:
text = chunk.choices[0].delta.content
outputs.append(text)
yield "".join(outputs)
if llm_mode == "switch to gpt-3.5-turbo":
stream = gpt_generation(input=input_text, llama_output="", mode="gpt-3.5-turbo")
outputs = []
for chunk in stream:
if chunk.choices[0].delta.content is not None:
text = chunk.choices[0].delta.content
outputs.append(text)
yield "".join(outputs)
    if llm_mode in (None, "", "switch to loki"):
streamer = llama_generation(input_text=input_text, history=history, temperature=temperature, max_new_tokens=max_new_tokens)
output_text = output_list([text for text in streamer])
stream = gpt_generation(input=input_text, llama_output=output_text, mode="gpt-4o")
outputs = []
for chunk in stream:
if chunk.choices[0].delta.content is not None:
text = chunk.choices[0].delta.content
outputs.append(text)
yield "".join(outputs)
chatbot = gr.Chatbot(height=600, label="Loki AI")
with gr.Blocks(fill_height=True) as demo:
gr.Markdown(DESCRIPTION)
gr.ChatInterface(
fn=bot_comms,
chatbot=chatbot,
fill_height=True,
        # Additional inputs that map to the temperature and max_new_tokens parameters of the generation functions
additional_inputs_accordion=gr.Accordion(label="βš™οΈ Parameters", open=False, render=False),
additional_inputs=[
            # Slider users can interact with to adjust the model temperature
gr.Slider(minimum=0,
maximum=1,
step=0.1,
value=0.95,
label="Temperature",
render=False),
            # Slider for the maximum number of new tokens to generate
gr.Slider(minimum=128,
maximum=1500,
step=1,
value=512,
label="Max new tokens",
render=False),
],
examples=[
["Make a poem of batman inside willy wonka"],
["How can you a burrito with just flour?"],
["How was saturn formed in 3 sentences"],
["How does the frontal lobe effect playing soccer"],
],
cache_examples=False
)
if __name__ == "__main__":
demo.launch()