import os
import traceback

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from ctransformers import AutoModelForCausalLM

# --- Config ---
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"  # 4-bit (Q4_K_M) quantization
MODEL_TYPE = "llama"

# --- Cache dir ---
CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# --- FastAPI App ---
app = FastAPI(title="Llama 3.2 3B Instruct API")

_model = None


# --- Load Model (lazy: downloaded and loaded on first use, then cached) ---
def get_model():
    global _model
    if _model is not None:
        return _model

    local_file = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=CACHE_DIR,
    )

    # ctransformers loads GGUF files via AutoModelForCausalLM.from_pretrained;
    # extra kwargs such as gpu_layers and threads are passed through to its config.
    _model = AutoModelForCausalLM.from_pretrained(
        local_file,  # direct path to the downloaded GGUF file
        model_type=MODEL_TYPE,
        gpu_layers=0,  # CPU-only inference
        threads=os.cpu_count() or 2,
    )
    return _model


# --- Request Schema ---
class PromptRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 256
    temperature: float = 0.7


# --- API Endpoint ---
@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        # The loaded model object is callable and returns the generated text.
        output = model(
            req.prompt,
            max_new_tokens=req.max_new_tokens,
            temperature=req.temperature,
        )
        return {"ok": True, "response": output}
    except Exception as e:
        tb = traceback.format_exc()
        print("❌ ERROR in /generate:\n", tb)
        raise HTTPException(status_code=500, detail={"error": str(e), "trace": tb})


# --- Health Check ---
@app.get("/health")
def health():
    try:
        _ = get_model()
        return {"ok": True}
    except Exception as e:
        return {"ok": False, "error": str(e)}
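

# --- Local run: a minimal sketch, assuming uvicorn is installed (it is not
# imported above, so this is an addition, not part of the original script).
# Example request once the server is up, with hypothetical prompt values:
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain GGUF in one sentence.", "max_new_tokens": 64}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)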