hackeracademy committed
Commit 222587e · 1 Parent(s): 95ee2a7

Serve immediate health endpoint; download in background

Files changed (2):
  1. Dockerfile +2 -11
  2. app.py +27 -24
Dockerfile CHANGED
@@ -1,20 +1,11 @@
-# ---- 1. Base image that already has musl ----
 FROM python:3.11-alpine
 
-# ---- 2. System deps for llama-cpp-python (CPU) ----
-RUN apk add --no-cache \
-    build-base \
-    libffi-dev \
-    cmake \
-    git
-
-# ---- 3. Python deps ----
+RUN apk add --no-cache build-base libffi-dev cmake git curl
 WORKDIR /app
+
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# ---- 4. Copy rest of the code ----
 COPY . .
-
 EXPOSE 7860
 CMD ["python", "-u", "app.py"]
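
Note: the system packages now install in a single RUN layer, with curl added on top; presumably this is so the new health route can be probed from inside the container, though the commit does not say. As a rough readiness check against that route, here is a sketch in Python using the requests library that app.py already imports. The BASE_URL, timeouts, and script name are assumptions, not part of the commit.

# wait_ready.py — illustrative readiness probe, not part of this commit
import time
import requests

BASE_URL = "http://localhost:7860"  # assumed address of the running container

def wait_until_alive(timeout: float = 120.0) -> dict:
    """Poll the health route until the FastAPI app answers."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            r = requests.get(f"{BASE_URL}/", timeout=5)
            if r.ok:
                return r.json()   # e.g. {"status": "loading model …"}
        except requests.RequestException:
            pass                  # server not up yet; keep polling
        time.sleep(2)
    raise TimeoutError("health endpoint never came up")

if __name__ == "__main__":
    print(wait_until_alive())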
app.py CHANGED
@@ -1,31 +1,34 @@
-import os, logging, requests, time
-from contextlib import asynccontextmanager
+import os, logging, requests, threading, uvicorn
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from llama_cpp import Llama
 
-# Direct public download link
 MODEL_URL = (
     "https://huggingface.co/fdtn-ai/Foundation-Sec-8B-Q4_K_M-GGUF/"
     "resolve/main/foundation-sec-8b-q4_k_m.gguf"
 )
 MODEL_PATH = "foundation-sec-8b-q4_k_m.gguf"
 
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO)
 
-    # Download once; skip if already present
+# --- tiny “alive” route so HF sees the container immediately ---
+app = FastAPI()
+
+@app.get("/")
+def root():
+    return {"status": "loading model …"}
+
+# --- download once, in a background thread so / stays alive ---
+def download_model():
     if not os.path.exists(MODEL_PATH):
-        logging.info("Downloading model … (~4.9 GB)")
+        logging.info("Downloading model …")
         with requests.get(MODEL_URL, stream=True, timeout=30) as r:
             r.raise_for_status()
            with open(MODEL_PATH, "wb") as f:
                 for chunk in r.iter_content(chunk_size=8192):
                     f.write(chunk)
         logging.info("Download finished.")
-
-    logging.info("Loading model …")
+    logging.info("Loading model into RAM …")
     app.state.llm = Llama(
         model_path=MODEL_PATH,
         n_ctx=4096,
@@ -33,27 +36,27 @@ async def lifespan(app: FastAPI):
         verbose=False
     )
     logging.info("Model ready.")
-    yield
-    logging.info("Shutting down.")
 
-app = FastAPI(lifespan=lifespan)
+threading.Thread(target=download_model, daemon=True).start()
 
 class ChatRequest(BaseModel):
     messages: list[dict]
     max_tokens: int = 256
     temperature: float = 0.7
 
-@app.get("/")
-def root():
-    return {"message": "Foundation-Sec-8B API running on HF Space"}
-
 @app.post("/v1/chat/completions")
 def chat(req: ChatRequest):
-    try:
-        return app.state.llm.create_chat_completion(
-            messages=req.messages,
-            max_tokens=req.max_tokens,
-            temperature=req.temperature
+    if not hasattr(app.state, "llm"):
+        raise HTTPException(
+            status_code=503,
+            detail="Model still loading, please retry in ~30 s"
         )
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+    return app.state.llm.create_chat_completion(
+        messages=req.messages,
+        max_tokens=req.max_tokens,
+        temperature=req.temperature
+    )
+
+# --- start uvicorn on port 7860 (HF expects this) ---
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
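
Note: after this change, /v1/chat/completions answers 503 with "Model still loading, please retry in ~30 s" until the background thread has set app.state.llm, so clients need a retry loop. A minimal client sketch follows, assuming the Space is reachable at http://localhost:7860; the URL, retry policy, and example prompt are illustrative, not part of the commit.

# chat_client.py — illustrative client, not part of this commit
import time
import requests

BASE_URL = "http://localhost:7860"  # assumed address of the Space

def chat(messages, max_tokens=256, temperature=0.7, retries=20):
    """POST a chat request, retrying while the server reports 503 (model loading)."""
    payload = {
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    for _ in range(retries):
        r = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=120)
        if r.status_code == 503:      # model still loading; wait and retry
            time.sleep(30)
            continue
        r.raise_for_status()
        return r.json()               # OpenAI-style dict from llama-cpp-python
    raise RuntimeError("model never finished loading")

if __name__ == "__main__":
    reply = chat([{"role": "user", "content": "Hello"}])
    print(reply["choices"][0]["message"]["content"])

The hasattr(app.state, "llm") check works as the readiness flag because the background thread assigns app.state.llm only after Llama() has finished loading the weights.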