# llm_server/app.py
import os
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# GGUF build of Llama 3.2 3B Instruct, quantized to Q4_K_M
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

# Download cache; defaults to a writable path under /tmp
CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

app = FastAPI()

# Model handle, loaded lazily on the first request
_model = None

def get_model():
    """Download the GGUF file (if needed) and load it into llama.cpp once."""
    global _model
    if _model is not None:
        return _model
    local_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=CACHE_DIR,
        local_dir_use_symlinks=False,
    )
    _model = Llama(
        model_path=local_path,
        n_ctx=2048,                      # context window in tokens
        n_threads=os.cpu_count() or 2,   # use the available CPU cores
        n_batch=256,
        verbose=False,
    )
    return _model
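
# Optional: warm the model at startup instead of on the first request.
# A minimal sketch, assuming the slower startup is acceptable for this Space
# (the handler below works unchanged either way):
#
#   @app.on_event("startup")
#   def warm_up():
#       get_model()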

class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7

@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        # llama-cpp-python returns an OpenAI-style completion dict
        output = model(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            stop=["</s>"],
        )
        return {"ok": True, "response": output["choices"][0]["text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
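
# Example usage, assuming the app is served with uvicorn on port 7860 (the
# port Hugging Face Spaces expects); adjust host/port for local runs:
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
#   curl -X POST http://localhost:7860/generate \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Explain GGUF in one sentence.", "max_tokens": 64}'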