import os, gradio as gr, requests, logging
from llama_cpp import Llama

# emit the logging.info progress messages below (the root logger defaults to WARNING)
logging.basicConfig(level=logging.INFO)

MODEL_URL = (
    "https://huggingface.co/fdtn-ai/Foundation-Sec-8B-Q4_K_M-GGUF/"
    "resolve/main/foundation-sec-8b-q4_k_m.gguf"
)

CACHE_DIR = "/tmp"
MODEL_PATH = os.path.join(CACHE_DIR, "foundation-sec-8b-q4_k_m.gguf")

# silence matplotlib cache warning
os.environ["MPLCONFIGDIR"] = CACHE_DIR

# download the model once; stream to a temporary file and rename atomically so
# an interrupted download never leaves a truncated GGUF behind
if not os.path.exists(MODEL_PATH):
    logging.info("Downloading model …")
    tmp_path = MODEL_PATH + ".part"
    with requests.get(MODEL_URL, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(tmp_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    os.replace(tmp_path, MODEL_PATH)
    logging.info("Download finished.")

# load the quantized model; n_ctx sets the context window in tokens
llm = Llama(model_path=MODEL_PATH, n_ctx=4096, verbose=False)

# Gradio ChatInterface handler: `message` is the new user turn, `history` is
# the prior conversation as (user, assistant) pairs
def chat_fn(message, history):
    messages = []
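    # replay earlier turns so the model sees the whole conversation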
    for human, ai in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": ai})
    messages.append({"role": "user", "content": message})

    # one-shot, non-streaming completion from the local model
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        stream=False,
    )
    return out["choices"][0]["message"]["content"]

demo = gr.ChatInterface(chat_fn, title="Foundation-Sec-8B")
demo.launch(server_name="0.0.0.0", server_port=7860)
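
# To run locally (a sketch; assumes this file is saved as app.py):
#   pip install gradio requests llama-cpp-python
#   python app.py
# then open http://localhost:7860 in a browser.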