# syntax=docker/dockerfile:1
FROM python:3.10-slim

# Toolchain needed to compile llama-cpp-python from source; apt lists are
# removed in the same layer so they never persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy the dependency manifest alone first: this layer (and the pip install
# below) stays cached until requirements.txt itself changes.
COPY requirements.txt .

# Install Python dependencies (llama-cpp-python compiled with BLAS/cuBLAS
# disabled for CPU-only HF Spaces hardware).
RUN CMAKE_ARGS="-DLLAMA_BLAS=OFF -DLLAMA_CUBLAS=OFF" \
    pip install --no-cache-dir -r requirements.txt

# Create a non-root runtime user (uid 1000 matches the HF Spaces convention)
# and give it ownership of /app so the model download and any runtime writes
# succeed without root.
RUN useradd --create-home --uid 1000 user \
    && chown user:user /app
USER user

# Pre-download the model at build time to speed up startup. This runs BEFORE
# copying the app sources so that code edits do not invalidate this multi-GB
# layer and re-trigger the download.
# NOTE(review): local_dir_use_symlinks is deprecated in recent huggingface_hub
# releases and ignored when only cache_dir is used — kept for compatibility.
RUN python -c "from huggingface_hub import hf_hub_download; \
    hf_hub_download(repo_id='bartowski/Llama-3.2-3B-Instruct-GGUF', \
    filename='Llama-3.2-3B-Instruct-Q4_K_M.gguf', \
    cache_dir='/app/models', local_dir_use_symlinks=False)"

# App code last — the most frequently changing layer. --chown keeps the files
# writable/readable by the non-root user without a layer-doubling RUN chown.
COPY --chown=user:user . .

# Documentation of the service port (HF Spaces expects 7860).
EXPOSE 7860

# Exec-form CMD: uvicorn runs as PID 1 and receives SIGTERM on stop.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]