Commit a307172 · Joash committed
Parent(s): 5f0bb6b

Add comprehensive memory optimizations for model and Docker

Files changed:
- Dockerfile (+16 −6)
- src/model_manager.py (+31 −7)
Dockerfile CHANGED

@@ -12,8 +12,8 @@ RUN apt-get update && apt-get install -y \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories with proper permissions
-RUN mkdir -p /app/logs /app/src/static /home/user/.cache/huggingface /home/user/.local \
-    && chmod -R 777 /app/logs /home/user/.cache/huggingface /home/user/.local
+RUN mkdir -p /app/logs /app/src/static /home/user/.cache/huggingface /home/user/.local /app/offload \
+    && chmod -R 777 /app/logs /home/user/.cache/huggingface /home/user/.local /app/offload
 
 # Create non-root user
 RUN useradd -m -u 1000 user \
@@ -29,8 +29,18 @@ ENV HF_HOME=/home/user/.cache/huggingface
 ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
 # Set logging to stdout
 ENV LOG_FILE=/dev/stdout
-#
+# Memory optimizations
 ENV MALLOC_ARENA_MAX=2
+ENV MALLOC_TRIM_THRESHOLD_=100000
+ENV MALLOC_MMAP_THRESHOLD_=100000
+# Transformers optimizations
+ENV TRANSFORMERS_OFFLINE=1
+ENV TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX"
+ENV CUDA_LAUNCH_BLOCKING=1
+# Model optimizations
+ENV OMP_NUM_THREADS=1
+ENV MKL_NUM_THREADS=1
+ENV NUMEXPR_NUM_THREADS=1
 
 # Switch to non-root user
 USER user
@@ -42,7 +52,7 @@ RUN pip install --user --no-cache-dir "numpy<2.0.0"
 # Copy requirements first to leverage Docker cache
 COPY --chown=user:user requirements.txt .
 
-# Install Python dependencies with
+# Install Python dependencies with memory optimizations
 RUN pip install --user --no-cache-dir -r requirements.txt
 
 # Copy application code
@@ -51,5 +61,5 @@ COPY --chown=user:user . .
 # Expose port for Hugging Face Spaces
 EXPOSE 7860
 
-# Run the application with
-CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1"]
+# Run the application with memory optimizations
+CMD ["python", "-u", "-m", "uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug", "--workers", "1", "--limit-concurrency", "1", "--timeout-keep-alive", "120"]
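Note on the new ENV block: it pins glibc allocator behavior and caps math-library threading so the single uvicorn worker has a predictable memory footprint. The sketch below is an illustration, not code from this repository; it shows how the same thread caps could be applied at process start when running outside Docker. The MALLOC_* tunables are read by glibc when the process launches, so they belong in the Dockerfile ENV (or a shell wrapper) rather than in Python.

import os

# OpenMP/MKL/NumExpr read these when they initialize, so set them before
# importing torch or numpy.
for var in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS"):
    os.environ.setdefault(var, "1")

import torch  # imported after the caps so its backends pick them up

torch.set_num_threads(1)  # also cap PyTorch's own intra-op thread pool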
src/model_manager.py CHANGED

@@ -33,7 +33,8 @@ class ModelManager:
             logger.info(f"Loading tokenizer: {self.model_name}")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
-                token=Config.HUGGING_FACE_TOKEN
+                token=Config.HUGGING_FACE_TOKEN,
+                model_max_length=1024  # Limit max length to save memory
             )
             # Ensure we have the necessary special tokens
             special_tokens = {
@@ -62,14 +63,22 @@ class ModelManager:
                 bnb_4bit_quant_type="nf4"
             )
 
-            # Load model with
+            # Load model with memory optimizations
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
                 device_map={"": self.device},
                 quantization_config=quantization_config,
                 token=Config.HUGGING_FACE_TOKEN,
-                low_cpu_mem_usage=True
+                low_cpu_mem_usage=True,
+                torch_dtype=torch.float16,  # Use fp16 for additional memory savings
+                max_memory={0: "4GB"},  # Limit memory usage
+                offload_folder="offload",  # Enable CPU offloading
+                use_cache=False  # Disable KV cache to save memory
             )
+
+            # Enable gradient checkpointing
+            self.model.gradient_checkpointing_enable()
+
             # Resize embeddings to match tokenizer
             self.model.resize_token_embeddings(len(self.tokenizer))
             logger.info("Model loaded successfully")
@@ -78,18 +87,24 @@ class ModelManager:
             logger.error(f"Error loading model: {str(e)}")
             raise
 
-    def generate_text(self, prompt: str, max_new_tokens: int =
+    def generate_text(self, prompt: str, max_new_tokens: int = 512) -> str:
         """Generate text from prompt."""
         try:
             logger.info("Starting text generation")
             logger.debug(f"Prompt length: {len(prompt)}")
 
-            # Encode the prompt
-            inputs = self.tokenizer(
+            # Encode the prompt with reduced max length
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=512,  # Reduced max length
+                padding=True
+            )
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
             logger.debug(f"Input tensor shape: {inputs['input_ids'].shape}")
 
-            # Generate response
+            # Generate response with memory optimizations
             logger.info("Generating response")
             with torch.no_grad():
                 outputs = self.model.generate(
@@ -100,8 +115,15 @@ class ModelManager:
                     top_p=Config.TOP_P,
                     pad_token_id=self.tokenizer.pad_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
+                    num_beams=1,  # Disable beam search to save memory
+                    use_cache=False,  # Disable KV cache
+                    early_stopping=True
                 )
 
+            # Clear CUDA cache after generation
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
             # Decode and return the generated text
             generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             response = generated_text[len(prompt):].strip()
@@ -113,4 +135,6 @@ class ModelManager:
         except Exception as e:
             logger.error(f"Error generating text: {str(e)}")
             logger.error(f"Error details: {type(e).__name__}")
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
             raise
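Note on the generate_text changes: torch.cuda.empty_cache() is now called on both the success path and the error path. A minimal sketch (an assumption, not the commit's code; the helper name is hypothetical) shows the same cleanup expressed once with try/finally, so the cache release lives in a single place:

import torch

def generate_with_cleanup(model, **generate_kwargs):
    """Run model.generate under no_grad and always release cached CUDA blocks."""
    try:
        with torch.no_grad():
            return model.generate(**generate_kwargs)
    finally:
        # Runs on success and on error, mirroring the duplicated
        # empty_cache() calls in the diff above.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

Here generate_kwargs would carry the same arguments the diff passes to self.model.generate (input_ids, max_new_tokens, temperature, top_p, and so on).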