# syntax=docker/dockerfile:1
# newsglobe-backend / Dockerfile
# Uploaded by MANOJSEQ ("Upload 2 files", commit 54cb7c1, verified)
# ---- Base image ----
# Slim Python base keeps the image small; OS/build tools are added explicitly below.
FROM python:3.10-slim
# Runtime + build environment:
#   PYTHONUNBUFFERED       - flush stdout/stderr immediately (container logs)
#   PIP_NO_CACHE_DIR       - don't keep pip's wheel cache in image layers
#   HF_HUB_DISABLE_TELEMETRY - no usage pings from huggingface_hub
ENV PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
HF_HUB_DISABLE_TELEMETRY=1 \
PORT=7860 \
# Writable + persistent on Spaces: point every model/data cache at /data
HF_HOME=/data/hf_cache \
SENTENCE_TRANSFORMERS_HOME=/data/hf_cache \
NLTK_DATA=/data/nltk_data \
TLDEXTRACT_CACHE=/data/tld_cache \
HOME=/data
# Handy tools: curl for health probes/debugging, git for pip VCS requirements.
# update + install share one layer (avoids the stale-apt-cache bug) and the
# list files are removed in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# ---- Python deps ----
# The manifest is copied on its own so this dependency layer stays cached
# until requirements.txt itself changes (source edits don't reinstall deps).
COPY requirements.txt ./
# CPU-only torch is pulled from the PyTorch CPU wheel index first — far
# smaller than the default CUDA build. sentencepiece is an extra tokenizer
# backend some transformers models need at runtime.
RUN python -m pip install --upgrade pip \
    && pip install torch --index-url https://download.pytorch.org/whl/cpu \
    && pip install -r requirements.txt \
    && pip install sentencepiece
# ---- App code ----
COPY . .
# Make caches writable for the runtime user.
# NOTE(review): Spaces can run the container under an arbitrary non-root UID,
# which presumably is why world-writable 777 is used here; if the runtime UID
# were known, a targeted chown/chmod would be least-privilege — confirm.
RUN mkdir -p /data/hf_cache /data/nltk_data /data/tld_cache && chmod -R 777 /data
# ---- Warm caches into the image layer ----
# 1) Cache SBERT so the first request doesn't have to download the weights.
#    Downloads into SENTENCE_TRANSFORMERS_HOME (/data/hf_cache) set above.
#    (Print string fixed: was mojibake "βœ…" from a mis-decoded ✅.)
RUN python - <<'PY'
from sentence_transformers import SentenceTransformer
SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("✅ SBERT cached")
PY
# 2) Cache the NLTK VADER sentiment lexicon into NLTK_DATA.
#    (getenv hoisted into one variable; print string fixed from mojibake "βœ…".)
RUN python - <<'PY'
import os, nltk
target = os.getenv("NLTK_DATA", "/data/nltk_data")
os.makedirs(target, exist_ok=True)
nltk.download("vader_lexicon", download_dir=target)
print("✅ VADER cached")
PY
# 3) Pre-warm the tweet-topic classifier so the first request is instant:
#    building the pipeline downloads the model into HF_HOME, and the dummy
#    call forces tokenizer/weights to fully load once at build time.
#    (Print string fixed: was mojibake "βœ…" from a mis-decoded ✅.)
RUN python - <<'PY'
from transformers import pipeline
p = pipeline("text-classification", model="cardiffnlp/tweet-topic-21-multi", top_k=1)
p("warmup")
print("✅ Topic model cached")
PY
# Ensure everything under /data is writable after the warm steps above —
# the warms ran as root, so the cached files are root-owned until this runs.
# NOTE(review): 777 is broad; a targeted chown to the runtime UID would be
# least-privilege if that UID is known on the target platform.
RUN chmod -R 777 /data
# Documentation only (Spaces routes to $PORT); does not publish the port.
EXPOSE 7860
# ---- Run ----
# The sh wrapper exists only to expand ${PORT:-7860}; `exec` replaces the
# shell so uvicorn becomes PID 1 and receives SIGTERM directly on container
# stop (without exec, sh is PID 1 and the signal never reaches uvicorn).
CMD ["sh","-c","exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]