import os, gradio as gr, requests, logging
from llama_cpp import Llama
MODEL_URL = (
"https://huggingface.co/fdtn-ai/Foundation-Sec-8B-Q4_K_M-GGUF/"
"resolve/main/foundation-sec-8b-q4_k_m.gguf"
)
CACHE_DIR = "/tmp"
MODEL_PATH = os.path.join(CACHE_DIR, "foundation-sec-8b-q4_k_m.gguf")
# silence matplotlib cache warning
os.environ["MPLCONFIGDIR"] = CACHE_DIR
# download the GGUF weights once; later restarts reuse the cached copy in /tmp
if not os.path.exists(MODEL_PATH):
    logging.info("Downloading model …")
    with requests.get(MODEL_URL, stream=True) as r:
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    logging.info("Download finished.")
llm = Llama(model_path=MODEL_PATH, n_ctx=4096, verbose=False)
# ChatInterface calls fn(message, history); history is a list of (user, assistant) pairs
def chat_fn(message, history):
    # rebuild the conversation in chat-completion message format
    messages = []
    for human, ai in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": ai})
    messages.append({"role": "user", "content": message})
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        stream=False,
    )
    return out["choices"][0]["message"]["content"]
demo = gr.ChatInterface(chat_fn, title="Foundation-Sec-8B")
demo.launch(server_name="0.0.0.0", server_port=7860)