import os
import threading

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Quantized GGUF build of Llama 3.2 3B Instruct; Q4_K_M keeps memory use modest.
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

app = FastAPI()

# Lazily initialized singleton. The lock prevents two concurrent requests from
# racing to download and load the model twice.
_model = None
_model_lock = threading.Lock()


def get_model() -> Llama:
    global _model
    if _model is not None:
        return _model
    with _model_lock:
        if _model is None:  # re-check under the lock
            # Download the GGUF file into the shared cache (no-op if cached).
            local_path = hf_hub_download(
                repo_id=REPO_ID,
                filename=FILENAME,
                cache_dir=CACHE_DIR,
            )
            _model = Llama(
                model_path=local_path,
                n_ctx=2048,
                n_threads=os.cpu_count() or 2,
                n_batch=256,
                verbose=False,
            )
    return _model


class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7


@app.post("/generate")
def generate_text(req: PromptRequest):
    try:
        model = get_model()
        # Plain completion call; returns an OpenAI-style dict with "choices".
        output = model(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
        )
        return {"ok": True, "response": output["choices"][0]["text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
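
# Usage sketch (assumptions not in the original: the file is saved as main.py,
# served with `uvicorn main:app --port 8000`, and the `requests` package is
# installed; the first call may block for a while as the model downloads):
#
#   import requests
#
#   r = requests.post(
#       "http://127.0.0.1:8000/generate",
#       json={"prompt": "Explain GGUF in one sentence.", "max_tokens": 64},
#   )
#   r.raise_for_status()
#   print(r.json()["response"])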