import os
import traceback

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from ctransformers import AutoModelForCausalLM

# --- Config ---
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"  # 4-bit (Q4_K_M) quantization
MODEL_TYPE = "llama"

# --- Cache dir ---
CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# --- FastAPI App ---
app = FastAPI(title="Llama 3.2 3B Instruct API")

_model = None


# --- Load Model (lazy: downloaded and loaded on first use, then cached) ---
def get_model():
    global _model
    if _model is not None:
        return _model

    local_file = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=CACHE_DIR,
    )

    # ctransformers loads GGUF files via AutoModelForCausalLM.from_pretrained;
    # extra kwargs such as gpu_layers and threads are passed through to its config.
    _model = AutoModelForCausalLM.from_pretrained(
        local_file,  # direct path to the downloaded GGUF file
        model_type=MODEL_TYPE,
        gpu_layers=0,  # CPU-only inference
        threads=os.cpu_count() or 2,
    )
    return _model


# --- Request Schema ---
class PromptRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 256
    temperature: float = 0.7


# --- API Endpoint ---
@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        # The loaded model object is callable and returns the generated text.
        output = model(
            req.prompt,
            max_new_tokens=req.max_new_tokens,
            temperature=req.temperature,
        )
        return {"ok": True, "response": output}
    except Exception as e:
        tb = traceback.format_exc()
        print("❌ ERROR in /generate:\n", tb)
        raise HTTPException(status_code=500, detail={"error": str(e), "trace": tb})


# --- Health Check ---
@app.get("/health")
def health():
    try:
        _ = get_model()
        return {"ok": True}
    except Exception as e:
        return {"ok": False, "error": str(e)}
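

# --- Local run: a minimal sketch, assuming uvicorn is installed (it is not
# imported above, so this is an addition, not part of the original script).
# Example request once the server is up, with hypothetical prompt values:
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain GGUF in one sentence.", "max_new_tokens": 64}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)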