Gemma 9B with llama.cpp b3259
Using llama.cpp release b3259 for quantization
I want Python code to run Gemma 9B with llama.cpp b3259, something like:
from llama_cpp import Llama
llm = Llama(
    model_path="./models/7B/llama-model.gguf",
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,  # Uncomment to set a specific seed
    # n_ctx=2048,  # Uncomment to increase the context window
)
output = llm(
    "Q: Name the planets in the solar system? A: ",  # Prompt
    max_tokens=32,  # Generate up to 32 tokens, set to None to generate up to the end of the context window
    stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
    echo=True,  # Echo the prompt back in the output
)  # Generate a completion, can also call create_completion
print(output)
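For Gemma 2 9B itself, roughly the same code should work once you point model_path at a Gemma GGUF. Here is a minimal sketch; the file name below is just a placeholder, and it assumes your llama-cpp-python build is based on a llama.cpp version that includes Gemma 2 support (such as b3259 or later):

from llama_cpp import Llama

# The path below is a placeholder; point it at your own Gemma 2 9B GGUF file
llm = Llama(
    model_path="./models/gemma-2-9b-it-Q4_K_M.gguf",
    n_ctx=2048,
    # n_gpu_layers=-1,  # uncomment to offload all layers to the GPU
)

# Chat-style call; llama-cpp-python usually picks up the chat template from the
# GGUF metadata, or you can pass chat_format="gemma" to Llama() explicitly
output = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Name the planets in the solar system."}],
    max_tokens=64,
)
print(output["choices"][0]["message"]["content"])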
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
# Determine the number of CPU cores available
num_cores = os.cpu_count()
num_threads = num_cores - 1 # Use all but one core
print(f'number of available cores is {num_cores}')
print(f'number of used threads is {num_threads}')
# Download the model file
model_file = "file name in repo"
repo_id = "repo-id"
model_path = hf_hub_download(repo_id=repo_id, filename=model_file) # you can replace this with your path if the model is in the same directory
print(f"Model downloaded to: {model_path}")
# Load the GGUF model
model = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=num_threads,
)
def generate_response(message, history):
    # Put your own, well-optimized prompt template here
    prompt = f"""Give concise answers to the following question:
Question: {message}
Answer:"""
    try:
        # Stream the completion and yield the accumulated text so far
        response = model(prompt, max_tokens=512, temperature=0.7, stream=True)
        full_response = ""
        for token in response:
            full_response += token['choices'][0]['text']
            yield full_response
    except Exception as e:
        yield f"An error occurred: {str(e)}"
# Specify the correct paths for your images
header_image_path = "12f.jpeg" # Update this path
user_avatar_path = "profile.png" # Update this path
assistant_avatar_path = "1.jpg" # Update this path
with gr.Blocks() as demo:
    gr.Image(header_image_path, show_label=False)
    chatbot = gr.Chatbot(
        avatar_images=(user_avatar_path, assistant_avatar_path)
    )
    msg = gr.Textbox(label="Type your message here...")
    clear = gr.Button("Clear Chat")

    def user(user_message, history):
        # Add the user's message to the history and clear the textbox
        return "", history + [[user_message, None]]

    def bot(history):
        # Stream the model's reply into the last chat turn
        bot_message = generate_response(history[-1][0], history)
        for partial_message in bot_message:
            history[-1][1] = partial_message
            yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)
demo.queue()
demo.launch()
This is a Gradio app I made. I think you can reuse the model loading and the response generation as standalone functions with no problems.
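For example, a minimal standalone sketch, assuming the model-loading code above has already run so that model and generate_response exist:

# Standalone usage: no Gradio needed, just call generate_response directly
question = "Name the planets in the solar system."
final_answer = ""
for partial in generate_response(question, history=[]):
    final_answer = partial  # each yield is the full text generated so far
print(final_answer)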