Gemma 9B with llama.cpp b3259
Using llama.cpp release b3259 for quantization
I want Python code to run Gemma 9B with llama.cpp b3259, something like:
from llama_cpp import Llama
llm = Llama(
    model_path="./models/7B/llama-model.gguf",
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,  # Uncomment to set a specific seed
    # n_ctx=2048,  # Uncomment to increase the context window
)
output = llm(
    "Q: Name the planets in the solar system? A: ",  # Prompt
    max_tokens=32,  # Generate up to 32 tokens, set to None to generate up to the end of the context window
    stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
    echo=True,  # Echo the prompt back in the output
)  # Generate a completion, can also call create_completion
print(output)
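For Gemma 2 9B itself, roughly the same code should work once you point model_path at a Gemma GGUF. Here is a minimal sketch; the file name below is just a placeholder, and it assumes your llama-cpp-python build is based on a llama.cpp version that includes Gemma 2 support (such as b3259 or later):

from llama_cpp import Llama

# The path below is a placeholder; point it at your own Gemma 2 9B GGUF file
llm = Llama(
    model_path="./models/gemma-2-9b-it-Q4_K_M.gguf",
    n_ctx=2048,
    # n_gpu_layers=-1,  # uncomment to offload all layers to the GPU
)

# Chat-style call; llama-cpp-python usually picks up the chat template from the
# GGUF metadata, or you can pass chat_format="gemma" to Llama() explicitly
output = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Name the planets in the solar system."}],
    max_tokens=64,
)
print(output["choices"][0]["message"]["content"])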
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
# Determine the number of CPU cores available
num_cores = os.cpu_count()
num_threads = num_cores - 1 # Use all but one core
print(f'number of available cores is {num_cores}')
print(f'number of used threads is {num_threads}')
# Download the model file
model_file = "file name in repo"
repo_id = "repo-id"
model_path = hf_hub_download(repo_id=repo_id, filename=model_file) # you can replace this with your path if the model is in the same directory
print(f"Model downloaded to: {model_path}")
# Load the GGUF model
model = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=num_threads,
)
def generate_response(message, history):
    # Put your own, well-optimized prompt template here
    prompt = f"""Give concise answers to the following question:
Question: {message}
Answer:"""
    try:
        # Stream the completion and yield the accumulated text so far
        response = model(prompt, max_tokens=512, temperature=0.7, stream=True)
        full_response = ""
        for token in response:
            full_response += token['choices'][0]['text']
            yield full_response
    except Exception as e:
        yield f"An error occurred: {str(e)}"
# Specify the correct paths for your images
header_image_path = "12f.jpeg" # Update this path
user_avatar_path = "profile.png" # Update this path
assistant_avatar_path = "1.jpg" # Update this path
with gr.Blocks() as demo:
    gr.Image(header_image_path, show_label=False)
    chatbot = gr.Chatbot(
        avatar_images=(user_avatar_path, assistant_avatar_path)
    )
    msg = gr.Textbox(label="Type your message here...")
    clear = gr.Button("Clear Chat")

    def user(user_message, history):
        # Add the user's message to the history and clear the textbox
        return "", history + [[user_message, None]]

    def bot(history):
        # Stream the model's reply into the last chat turn
        bot_message = generate_response(history[-1][0], history)
        for partial_message in bot_message:
            history[-1][1] = partial_message
            yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)
demo.queue()
demo.launch()
This is a Gradio app I made. I think you can reuse the model loading and the response generation as standalone functions with no problems.
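For example, a minimal standalone sketch, assuming the model-loading code above has already run so that model and generate_response exist:

# Standalone usage: no Gradio needed, just call generate_response directly
question = "Name the planets in the solar system."
final_answer = ""
for partial in generate_response(question, history=[]):
    final_answer = partial  # each yield is the full text generated so far
print(final_answer)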