Tim Luka Horstmann committed
Commit b173427 · 1 Parent(s): 6aaa9c3

Similar setup to other model

Files changed (2):
  1. Dockerfile     +23 -16
  2. llm_server.py  +71 -20
Dockerfile CHANGED
@@ -1,34 +1,41 @@
 FROM python:3.10-slim
 
 ENV DEBIAN_FRONTEND=noninteractive \
+    RUSTUP_HOME=/root/.rustup \
+    CARGO_HOME=/root/.cargo \
+    PATH=/root/.cargo/bin:$PATH \
     MODEL_REPO="unsloth/Qwen3-0.6B-GGUF" \
     MODEL_FILE="Qwen3-0.6B-Q4_K_M.gguf" \
-    HF_HOME=/app/cache
+    HF_HOME=/app/cache \
+    TRANSFORMERS_CACHE=/app/cache
 
-# system deps + rust for llama-cpp
+WORKDIR /app
+
+# install system deps + Rust toolchain
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
-        build-essential cmake git curl wget libgomp1 ca-certificates && \
+        build-essential cmake git curl wget ninja-build libgomp1 ca-certificates \
+        gcc g++ libffi-dev libopenblas-dev libstdc++6 libgcc-s1 && \
     rm -rf /var/lib/apt/lists/* && \
     curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
-    . "$HOME/.cargo/env" && rustup default stable
+    rustup default stable
 
-WORKDIR /app
-
-RUN mkdir -p /app/cache /app/logs /app/workspace /app/pretrained_models/llm \
- && chmod -R 777 /app/cache /app/logs /app/workspace /app/pretrained_models/llm
+# cache dirs
+RUN mkdir -p /app/cache /app/pretrained_models/llm && chmod -R 777 /app/cache /app/pretrained_models/llm
 
+# Python deps (except llama-cpp-python)
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python \
- && cd /tmp/llama-cpp-python \
- && FORCE_CMAKE=1 pip install --no-cache-dir . \
- && pip install --no-cache-dir "llama-cpp-python[server]" huggingface_hub \
- && rm -rf /tmp/llama-cpp-python
+RUN sed -i '/llama-cpp-python/d' requirements.txt && \
+    pip install --no-cache-dir -r requirements.txt
 
+# build llama-cpp-python from source (with its llama.cpp submodule)
+RUN git clone --recursive https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python && \
+    cd /tmp/llama-cpp-python && \
+    git submodule update --init --recursive && \
+    python -m pip install --no-cache-dir . && \
+    rm -rf /tmp/llama-cpp-python
 
-# Copy the LLM server code
+# copy the server code
 COPY llm_server.py /app/llm_server.py
 
 EXPOSE 7860
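
Note: the rebuilt image now compiles llama-cpp-python from source after stripping it out of requirements.txt, so a quick import check inside the running container is a useful smoke test. A minimal sketch (hypothetical helper script, not part of this commit; the paths and env vars are the ones set in the Dockerfile above):

# check_build.py -- hypothetical sanity check, run inside the built image.
# Confirms the from-source llama-cpp-python import works and the model/cache
# locations from the Dockerfile ENV are wired up as expected.
import os
from pathlib import Path

import llama_cpp  # raises ImportError if the source build failed

model_dir = Path("/app/pretrained_models/llm")
model_file = os.getenv("MODEL_FILE", "Qwen3-0.6B-Q4_K_M.gguf")

print("llama-cpp-python version:", llama_cpp.__version__)
print("HF cache dir:", os.getenv("HF_HOME"))
print("model downloaded yet:", (model_dir / model_file).exists())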
llm_server.py CHANGED
@@ -1,11 +1,21 @@
-import os, time, logging
+import os
+import time
+import logging
+import asyncio
 from pathlib import Path
+
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
+
+from huggingface_hub import login, hf_hub_download
 from llama_cpp import Llama
-from huggingface_hub import hf_hub_download, login
 
+# ─── logging setup ────────────────────────────────────────────────────────────
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+logger = logging.getLogger("llm_server")
+
+# ─── FastAPI setup ────────────────────────────────────────────────────────────
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
@@ -14,54 +24,95 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-MODEL_DIR = Path("/app/pretrained_models/llm")
+# ─── model paths & env ────────────────────────────────────────────────────────
+MODEL_DIR = Path("/app/pretrained_models/llm")
 MODEL_DIR.mkdir(parents=True, exist_ok=True)
-MODEL_FILE = os.getenv("MODEL_FILE")
+
+MODEL_FILE = os.getenv("MODEL_FILE", "Qwen3-0.6B-Q4_K_M.gguf")
 MODEL_PATH = MODEL_DIR / MODEL_FILE
 
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+# ─── startup: download, init, warm-up, schedule keep-alive ────────────────────
 @app.on_event("startup")
-async def startup():
-    logging.info("Starting LLM service…")
+async def startup_event():
+    logger.info("🔧 Starting LLM service…")
+
+    # login if we have a token
+    if HF_TOKEN:
+        login(token=HF_TOKEN)
+
+    # download weights if missing
     if not MODEL_PATH.exists():
-        token = os.getenv("HF_TOKEN")
-        if token:
-            login(token=token)
+        logger.info(f"→ Downloading {MODEL_FILE} from {os.getenv('MODEL_REPO')}")
         hf_hub_download(
             repo_id=os.getenv("MODEL_REPO"),
             filename=MODEL_FILE,
-            local_dir=str(MODEL_DIR)
+            local_dir=str(MODEL_DIR),
+            token=HF_TOKEN,
         )
+        logger.info("✔ Download complete")
+
+    # instantiate llama.cpp
     global llm
     llm = Llama(
         model_path=str(MODEL_PATH),
         n_ctx=1024,
-        n_threads=2,
+        n_threads=os.cpu_count(),  # use all available vCPUs
         n_gpu_layers=0,
         use_mlock=True,
         f16_kv=True,
+        batch_prefill=True,
+        prefill_logits=False,
+        verbose=False,
     )
-    logging.info("LLM loaded.")
+    logger.info("✔ Llama model loaded")
+
+    # schedule periodic keep-alive so the Space never goes idle
+    asyncio.create_task(_keep_model_warm())
+    logger.info("🔄 Keep-alive warm-up task scheduled (every 12 min)")
 
+async def _keep_model_warm():
+    while True:
+        try:
+            logger.debug("…warm-up ping")
+            # a 1-token echo
+            llm.create_chat_completion(
+                messages=[{"role": "user", "content": "/no_think ok"}],
+                max_tokens=1,
+                stream=False,
+            )
+            logger.debug("…ping done")
+        except Exception as e:
+            logger.warning(f"Warm-up ping failed: {e}")
+        # HF Spaces idle timeout is ~15 min; ping every 12
+        await asyncio.sleep(12 * 60)
+
+# ─── OpenAI‐compatible endpoint ───────────────────────────────────────────────
 @app.post("/v1/chat/completions")
-async def chat(req: dict):
+async def chat_completions(req: dict):
     if req.get("model") != "llama-cpp":
         raise HTTPException(status_code=404, detail="Model not found")
+
+    # call into llama.cpp
     resp = llm.create_chat_completion(
         messages=req["messages"],
         max_tokens=req.get("max_tokens", 256),
         temperature=req.get("temperature", 0.7),
         top_p=req.get("top_p", 1.0),
-        stream=False
+        stream=False,
     )
+
+    # repackage into OpenAI JSON
     return JSONResponse({
-        "id": resp["id"],
-        "object": "chat.completion",
-        "created": resp.get("created", int(time.time())),
-        "model": "llama-cpp",
+        "id": resp["id"],
+        "object": "chat.completion",
+        "created": resp.get("created", int(time.time())),
+        "model": "llama-cpp",
         "choices": [{
-            "index": 0,
+            "index": 0,
             "message": {
-                "role": resp["choices"][0]["message"]["role"],
+                "role": resp["choices"][0]["message"]["role"],
                 "content": resp["choices"][0]["message"]["content"],
             },
             "finish_reason": resp["choices"][0].get("finish_reason", "stop"),