import os
import threading

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Quantized GGUF build of Llama 3.2 3B Instruct; Q4_K_M keeps memory use modest.
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

app = FastAPI()

# Lazily initialized singleton. The lock prevents two concurrent requests from
# racing to download and load the model twice.
_model = None
_model_lock = threading.Lock()


def get_model() -> Llama:
    global _model
    if _model is not None:
        return _model
    with _model_lock:
        if _model is None:  # re-check under the lock
            # Download the GGUF file into the shared cache (no-op if cached).
            local_path = hf_hub_download(
                repo_id=REPO_ID,
                filename=FILENAME,
                cache_dir=CACHE_DIR,
            )
            _model = Llama(
                model_path=local_path,
                n_ctx=2048,
                n_threads=os.cpu_count() or 2,
                n_batch=256,
                verbose=False,
            )
    return _model


class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7


@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        # Plain completion call; returns an OpenAI-style dict with "choices".
        output = model(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
        )
        return {"ok": True, "response": output["choices"][0]["text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
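
# Usage sketch (assumptions not in the original: the file is saved as main.py,
# served with `uvicorn main:app --port 8000`, and the `requests` package is
# installed; the first call may block for a while as the model downloads):
#
#   import requests
#
#   r = requests.post(
#       "http://127.0.0.1:8000/generate",
#       json={"prompt": "Explain GGUF in one sentence.", "max_tokens": 64},
#   )
#   r.raise_for_status()
#   print(r.json()["response"])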