import os
from typing import Iterator

from llama_cpp import Llama
from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse
import gradio as gr
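
# Bella: a local MiniCPM-V-2.6 assistant served from a single process in two
# ways: a JSON API (GET/POST /ask) via FastAPI and a streaming chat UI via Gradio.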

TOKEN_LIMIT = 256

SYSTEM_MESSAGE = """You are Bella, an expert AI assistant dedicated to supporting users across diverse domains such as coding, academic assignments (homework, computer science projects), and professional document creation. Your responses should always be accurate, comprehensive, and tailored to the user's needs, whether they are beginners or advanced learners. Prioritize clear explanations, practical advice, and step-by-step guidance to ensure user success. Do not engage in conversational filler; focus strictly on providing direct and valuable assistance."""
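

# Load the quantized GGUF checkpoint once at import time. Inference is CPU-only
# (n_gpu_layers=0); if loading fails, `llm` stays None and the handlers report it.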
llm = None
try:
    print("Loading MiniCPM-V-2_6-gguf model...")
    llm = Llama.from_pretrained(
        repo_id="openbmb/MiniCPM-V-2_6-gguf",
        filename="ggml-model-Q4_K_M.gguf",
        n_ctx=4096,
        n_threads=os.cpu_count(),
        n_batch=512,
        n_gpu_layers=0,
        verbose=False,
    )
    print("MiniCPM-V-2_6-gguf model loaded successfully.")
except Exception as e:
    print(f"Error loading MiniCPM-V-2_6-gguf model: {e}")
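

# llm_query streams a chat completion and yields the accumulated text after each
# token, so the Gradio callback can refresh the chat window incrementally.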
def llm_query(messages_history: list, max_tokens: int) -> Iterator[str]:
    if llm is None:
        yield "Error: LLM model not loaded. Cannot generate response."
        return

    try:
        common_stop_tokens = ["<|im_end|>", "</s>", "<|end_of_text|>"]

        response_generator = llm.create_chat_completion(
            messages=messages_history,
            stream=True,
            max_tokens=max_tokens,
            temperature=0.7,
            top_p=0.9,
            repeat_penalty=1.1,
            stop=common_stop_tokens,
        )

        full_response = ""
        for chunk in response_generator:
            token = chunk["choices"][0]["delta"].get("content", "")
            full_response += token
            yield full_response

    except Exception as e:
        print(f"Error during LLM inference: {e}")
        yield f"An error occurred during generation: {e}"
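

# JSON API: the /ask endpoints expose the same model for programmatic use.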
app = FastAPI()
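

# One-shot, non-streaming completion wrapped in the Bella system prompt.
# Example request, assuming a local run on the default port:
#   curl "http://localhost:7860/ask?q=Explain+Big-O+notation&tokens=128"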
@app.get("/ask")
def ask_api(q: str = Query(...), tokens: int = Query(TOKEN_LIMIT)):
    if llm is None:
        return JSONResponse(status_code=503, content={"error": "LLM model not loaded."})

    messages_for_api = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": q},
    ]
    try:
        # Note: repeat_last_n is not a create_chat_completion argument, so only
        # repeat_penalty is applied here.
        response = llm.create_chat_completion(
            messages=messages_for_api,
            max_tokens=tokens,
            temperature=0.7,
            top_p=0.9,
            repeat_penalty=1.1,
            stop=["<|im_end|>", "</s>", "<|end_of_text|>"],
        )
        return {"answer": response["choices"][0]["message"]["content"]}
    except Exception as e:
        # Returning a (dict, status) tuple does not set the status code in FastAPI;
        # use an explicit JSONResponse instead.
        return JSONResponse(status_code=500, content={"error": str(e)})
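

# POST variant: accepts a JSON body such as {"q": "...", "tokens": 128} and
# delegates to the GET handler above.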
@app.post("/ask")
def ask_post_api(body: dict):
    return ask_api(q=body.get("q", ""), tokens=body.get("tokens", TOKEN_LIMIT))
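

# Gradio callback: append the user's turn, then stream the assistant's reply
# into the last history entry, clearing the textbox on every yield.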
def chat_fn(message, history, max_tokens):
    new_history = history + [{"role": "user", "content": message}]
    yield new_history, gr.update(value="")

    messages_for_llm = [{"role": "system", "content": SYSTEM_MESSAGE}] + new_history

    full_bot_response = ""
    for chunk in llm_query(messages_for_llm, max_tokens):
        full_bot_response = chunk
        if len(new_history) > 0 and new_history[-1]["role"] == "assistant":
            new_history[-1]["content"] = full_bot_response
        else:
            new_history.append({"role": "assistant", "content": full_bot_response})

        yield new_history, gr.update(value="")
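

# Gradio UI: a messages-format chatbot plus a max-token slider, wired to chat_fn.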
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🧠 Bella: MiniCPM-V-2_6-gguf AI Assistant
        Welcome! I'm Bella, designed to assist you with coding, homework, computer science projects,
        and document writing. I provide accurate, comprehensive, and tailored guidance.
        """
    )

    chatbot = gr.Chatbot(
        height=500,
        label="Bella's Responses",
        type="messages",
        autoscroll=True,
        resizable=True,
        show_copy_button=True,
    )

    msg = gr.Textbox(placeholder="Ask Bella a question...", show_label=False, submit_btn="Ask")
    token_slider = gr.Slider(64, 1024, value=256, step=16, label="Max tokens")
    clear_btn = gr.ClearButton([msg, chatbot])

    msg.submit(
        fn=chat_fn,
        inputs=[msg, chatbot, token_slider],
        outputs=[chatbot, msg],
        queue=True,
    )
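

# Serving note: the Gradio UI is mounted onto the FastAPI app below rather than
# launched from a startup event; launching a second server there would block
# startup and contend for the single exposed port (7860 on Hugging Face Spaces).
# The /ask routes keep working because they are registered before the mount.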
demo.queue()
app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
    import uvicorn

    print("Running FastAPI app locally (if not in Hugging Face Space)...")
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))