# syntax=docker/dockerfile:1
# (The heredoc RUN blocks below need BuildKit's Dockerfile frontend, v1.4+.)

# ---- Base image ----
FROM python:3.10-slim

ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    HF_HUB_DISABLE_TELEMETRY=1 \
    PORT=7860 \
    # ✅ Writable + persistent on Spaces
    HF_HOME=/data/hf_cache \
    SENTENCE_TRANSFORMERS_HOME=/data/hf_cache \
    NLTK_DATA=/data/nltk_data \
    TLDEXTRACT_CACHE=/data/tld_cache \
    HOME=/data

# Handy tools
RUN apt-get update && apt-get install -y --no-install-recommends curl git && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# ---- Python deps ----
COPY requirements.txt ./
RUN python -m pip install --upgrade pip && \
    # CPU-only PyTorch first, so requirements.txt doesn't pull in the larger CUDA build
    pip install torch --index-url https://download.pytorch.org/whl/cpu && \
    pip install -r requirements.txt && \
    pip install sentencepiece

# ---- App code ----
COPY . .

# ✅ Make caches writable for the runtime user
RUN mkdir -p /data/hf_cache /data/nltk_data /data/tld_cache && chmod -R 777 /data

# ---- Warm caches into the image layer ----
# 1) Cache SBERT
RUN python - <<'PY'
from sentence_transformers import SentenceTransformer
SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("✅ SBERT cached")
PY

# 2) Cache NLTK VADER
RUN python - <<'PY'
import os, nltk
os.makedirs(os.getenv("NLTK_DATA", "/data/nltk_data"), exist_ok=True)
nltk.download("vader_lexicon", download_dir=os.getenv("NLTK_DATA", "/data/nltk_data"))
print("✅ VADER cached")
PY

# 3) (Recommended) Pre-warm the tweet-topic model so the first request doesn't pay the download cost
RUN python - <<'PY'
from transformers import pipeline
p = pipeline("text-classification", model="cardiffnlp/tweet-topic-21-multi", top_k=1)
p("warmup")
print("✅ Topic model cached")
PY

# Ensure everything under /data is still writable after the warm-up steps
RUN chmod -R 777 /data

EXPOSE 7860

# ---- Run ----
CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
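
# Local build/run sketch. The module path `main:app` mirrors the CMD above;
# the image tag `sentiment-space` is just an illustrative name, not part of this repo:
#   docker build -t sentiment-space .
#   docker run -p 7860:7860 -e PORT=7860 sentiment-space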