import os

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# GGUF checkpoint to pull from the Hugging Face Hub.
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

# Cache downloads in a writable directory (falls back to /tmp if HF_HOME is unset).
CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
os.makedirs(CACHE_DIR, exist_ok=True)
app = FastAPI()

# Lazily initialized singleton so the model is downloaded and loaded only once,
# on the first request that needs it.
_model = None


def get_model():
    global _model
    if _model is not None:
        return _model
    # Fetch the GGUF file (cached across restarts via CACHE_DIR) and load it with llama.cpp.
    local_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=CACHE_DIR,
        local_dir_use_symlinks=False,
    )
    _model = Llama(
        model_path=local_path,
        n_ctx=2048,                     # context window in tokens
        n_threads=os.cpu_count() or 2,  # use all available CPU cores
        n_batch=256,                    # prompt-processing batch size
        verbose=False,
    )
    return _model
class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7
# NOTE: the handler below needs a route decorator to be reachable; the original
# snippet omits it, so the "/generate" path here is an assumed mount point.
@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        output = model(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            stop=["</s>"],
        )
        return {"ok": True, "response": output["choices"][0]["text"]}
    except Exception as e:
        # Surface any download, loading, or inference failure as a 500.
        raise HTTPException(status_code=500, detail=str(e))
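
Once the server is running (for example via "uvicorn app:app --port 8000", where the module name app is a guess at the file name), the endpoint can be exercised with a short client call. A minimal sketch, assuming the /generate route used above and a local port of 8000; the URL, port, and prompt are placeholders:

import requests

# Hypothetical smoke test for the /generate endpoint defined above.
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Explain what a GGUF file is in one sentence.", "max_tokens": 64},
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["response"])

On Hugging Face Spaces the app is typically served on port 7860, so the base URL would be the Space's public endpoint rather than localhost.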