hackeracademy committed
Commit 222587e · 1 Parent(s): 95ee2a7

Serve immediate health endpoint; download in background

Files changed (2):
  1. Dockerfile +2 -11
  2. app.py +27 -24
Dockerfile CHANGED
@@ -1,20 +1,11 @@
-# ---- 1. Base image that already has musl ----
 FROM python:3.11-alpine
 
-# ---- 2. System deps for llama-cpp-python (CPU) ----
-RUN apk add --no-cache \
-    build-base \
-    libffi-dev \
-    cmake \
-    git
-
-# ---- 3. Python deps ----
+RUN apk add --no-cache build-base libffi-dev cmake git curl
 WORKDIR /app
+
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# ---- 4. Copy rest of the code ----
 COPY . .
-
 EXPOSE 7860
 CMD ["python", "-u", "app.py"]
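
Note: the system packages now install in a single RUN layer, with curl added on top; presumably this is so the new health route can be probed from inside the container, though the commit does not say. As a rough readiness check against that route, here is a sketch in Python using the requests library that app.py already imports. The BASE_URL, timeouts, and script name are assumptions, not part of the commit.

# wait_ready.py — illustrative readiness probe, not part of this commit
import time
import requests

BASE_URL = "http://localhost:7860"  # assumed address of the running container

def wait_until_alive(timeout: float = 120.0) -> dict:
    """Poll the health route until the FastAPI app answers."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            r = requests.get(f"{BASE_URL}/", timeout=5)
            if r.ok:
                return r.json()   # e.g. {"status": "loading model …"}
        except requests.RequestException:
            pass                  # server not up yet; keep polling
        time.sleep(2)
    raise TimeoutError("health endpoint never came up")

if __name__ == "__main__":
    print(wait_until_alive())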
app.py CHANGED
@@ -1,31 +1,34 @@
-import os, logging, requests, time
-from contextlib import asynccontextmanager
+import os, logging, requests, threading, uvicorn
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from llama_cpp import Llama
 
-# Direct public download link
 MODEL_URL = (
     "https://huggingface.co/fdtn-ai/Foundation-Sec-8B-Q4_K_M-GGUF/"
     "resolve/main/foundation-sec-8b-q4_k_m.gguf"
 )
 MODEL_PATH = "foundation-sec-8b-q4_k_m.gguf"
 
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO)
 
-    # Download once; skip if already present
+# --- tiny “alive” route so HF sees the container immediately ---
+app = FastAPI()
+
+@app.get("/")
+def root():
+    return {"status": "loading model …"}
+
+# --- download once, in a background thread so / stays alive ---
+def download_model():
     if not os.path.exists(MODEL_PATH):
-        logging.info("Downloading model … (~4.9 GB)")
+        logging.info("Downloading model …")
         with requests.get(MODEL_URL, stream=True, timeout=30) as r:
             r.raise_for_status()
            with open(MODEL_PATH, "wb") as f:
                 for chunk in r.iter_content(chunk_size=8192):
                     f.write(chunk)
         logging.info("Download finished.")
-
-    logging.info("Loading model …")
+    logging.info("Loading model into RAM …")
     app.state.llm = Llama(
         model_path=MODEL_PATH,
         n_ctx=4096,
@@ -33,27 +36,27 @@ async def lifespan(app: FastAPI):
         verbose=False
     )
     logging.info("Model ready.")
-    yield
-    logging.info("Shutting down.")
 
-app = FastAPI(lifespan=lifespan)
+threading.Thread(target=download_model, daemon=True).start()
 
 class ChatRequest(BaseModel):
     messages: list[dict]
     max_tokens: int = 256
     temperature: float = 0.7
 
-@app.get("/")
-def root():
-    return {"message": "Foundation-Sec-8B API running on HF Space"}
-
 @app.post("/v1/chat/completions")
 def chat(req: ChatRequest):
-    try:
-        return app.state.llm.create_chat_completion(
-            messages=req.messages,
-            max_tokens=req.max_tokens,
-            temperature=req.temperature
+    if not hasattr(app.state, "llm"):
+        raise HTTPException(
+            status_code=503,
+            detail="Model still loading, please retry in ~30 s"
         )
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+    return app.state.llm.create_chat_completion(
+        messages=req.messages,
+        max_tokens=req.max_tokens,
+        temperature=req.temperature
+    )
+
+# --- start uvicorn on port 7860 (HF expects this) ---
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
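
Note: after this change, /v1/chat/completions answers 503 with "Model still loading, please retry in ~30 s" until the background thread has set app.state.llm, so clients need a retry loop. A minimal client sketch follows, assuming the Space is reachable at http://localhost:7860; the URL, retry policy, and example prompt are illustrative, not part of the commit.

# chat_client.py — illustrative client, not part of this commit
import time
import requests

BASE_URL = "http://localhost:7860"  # assumed address of the Space

def chat(messages, max_tokens=256, temperature=0.7, retries=20):
    """POST a chat request, retrying while the server reports 503 (model loading)."""
    payload = {
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    for _ in range(retries):
        r = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=120)
        if r.status_code == 503:      # model still loading; wait and retry
            time.sleep(30)
            continue
        r.raise_for_status()
        return r.json()               # OpenAI-style dict from llama-cpp-python
    raise RuntimeError("model never finished loading")

if __name__ == "__main__":
    reply = chat([{"role": "user", "content": "Hello"}])
    print(reply["choices"][0]["message"]["content"])

The hasattr(app.state, "llm") check works as the readiness flag because the background thread assigns app.state.llm only after Llama() has finished loading the weights.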