import logging
import os

import gradio as gr
import requests
from llama_cpp import Llama

MODEL_URL = (
    "https://huggingface.co/fdtn-ai/Foundation-Sec-8B-Q4_K_M-GGUF/"
    "resolve/main/foundation-sec-8b-q4_k_m.gguf"
)
CACHE_DIR = "/tmp"
MODEL_PATH = os.path.join(CACHE_DIR, "foundation-sec-8b-q4_k_m.gguf")

# Silence the matplotlib cache warning (gradio pulls in matplotlib).
os.environ["MPLCONFIGDIR"] = CACHE_DIR

# Without this, the logging.info() calls below are dropped: the root
# logger defaults to WARNING.
logging.basicConfig(level=logging.INFO)

# Download the model once; later runs reuse the cached file.
if not os.path.exists(MODEL_PATH):
    logging.info("Downloading model …")
    with requests.get(MODEL_URL, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    logging.info("Download finished.")

llm = Llama(model_path=MODEL_PATH, n_ctx=4096, verbose=False)


# gr.ChatInterface calls its fn as (message, history); with the default
# tuple-style history, each entry is a (user, assistant) pair, which we
# replay as an OpenAI-style message list.
def chat_fn(message, history):
    messages = []
    for human, ai in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": ai})
    messages.append({"role": "user", "content": message})
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        stream=False,
    )
    return out["choices"][0]["message"]["content"]


demo = gr.ChatInterface(chat_fn, title="Foundation-Sec-8B")
demo.launch(server_name="0.0.0.0", server_port=7860)
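

# Optional: a streaming variant of chat_fn, shown as a sketch (commented out
# because demo.launch() above blocks; it is not part of the app as written).
# gr.ChatInterface accepts a generator function and renders each yielded
# string as the current reply, and llama-cpp-python yields OpenAI-style
# chunks when stream=True. To try it, pass chat_fn_stream to
# gr.ChatInterface instead of chat_fn and re-launch.
#
# def chat_fn_stream(message, history):
#     messages = []
#     for human, ai in history:
#         messages.append({"role": "user", "content": human})
#         messages.append({"role": "assistant", "content": ai})
#     messages.append({"role": "user", "content": message})
#     partial = ""
#     for chunk in llm.create_chat_completion(
#         messages=messages,
#         max_tokens=512,
#         temperature=0.7,
#         stream=True,
#     ):
#         # Each chunk carries a delta; the first holds the role, the rest
#         # hold incremental content.
#         delta = chunk["choices"][0]["delta"]
#         if delta.get("content"):
#             partial += delta["content"]
#             yield partial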