Joash committed
Commit · 69455b9 · 1 Parent(s): b4ae3b7
Fix offline mode and improve model loading

Files changed:
- Dockerfile +3 -7
- src/model_manager.py +9 -9
Dockerfile CHANGED
@@ -26,21 +26,17 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV PORT=7860
 ENV PATH="/home/user/.local/bin:${PATH}"
 ENV HF_HOME=/home/user/.cache/huggingface
-ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface
-# Set logging to stdout
-ENV LOG_FILE=/dev/stdout
 # Memory optimizations
 ENV MALLOC_ARENA_MAX=2
 ENV MALLOC_TRIM_THRESHOLD_=100000
 ENV MALLOC_MMAP_THRESHOLD_=100000
-# Transformers optimizations
-ENV TRANSFORMERS_OFFLINE=1
-ENV TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX"
-ENV CUDA_LAUNCH_BLOCKING=1
 # Model optimizations
 ENV OMP_NUM_THREADS=1
 ENV MKL_NUM_THREADS=1
 ENV NUMEXPR_NUM_THREADS=1
+# Ensure offline mode is disabled
+ENV HF_HUB_OFFLINE=0
+ENV TRANSFORMERS_OFFLINE=0
 
 # Switch to non-root user
 USER user
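The old image forced TRANSFORMERS_OFFLINE=1, so any model file not already present in the cache could not be resolved at runtime; the commit flips both offline switches to 0 so the Hub stays reachable. Below is a minimal sketch of how these variables gate downloads; the model id is a placeholder, not the Space's actual model.

import os

# Set both switches before importing huggingface_hub: current versions read
# HF_HUB_OFFLINE at import time, and transformers reads TRANSFORMERS_OFFLINE
# when it decides whether to hit the network.
os.environ["HF_HUB_OFFLINE"] = "0"
os.environ["TRANSFORMERS_OFFLINE"] = "0"

from huggingface_hub import snapshot_download

# With offline mode off this downloads (or reuses) the snapshot under
# HF_HOME; with HF_HUB_OFFLINE=1 it raises unless the repo is fully cached.
path = snapshot_download("gpt2")  # placeholder model id
print("snapshot cached at:", path)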
src/model_manager.py CHANGED
@@ -3,6 +3,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 from huggingface_hub import login
 from .config import Config
+import os
 
 logger = logging.getLogger(__name__)
 
@@ -13,11 +14,15 @@ class ModelManager:
         self.model = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
+        # Ensure offline mode is disabled
+        os.environ['HF_HUB_OFFLINE'] = '0'
+        os.environ['TRANSFORMERS_OFFLINE'] = '0'
+
         # Login to Hugging Face Hub
         if Config.HUGGING_FACE_TOKEN:
             logger.info("Logging in to Hugging Face Hub")
             try:
-                login(token=Config.HUGGING_FACE_TOKEN)
+                login(token=Config.HUGGING_FACE_TOKEN, add_to_git_credential=False)
                 logger.info("Successfully logged in to Hugging Face Hub")
             except Exception as e:
                 logger.error(f"Failed to login to Hugging Face Hub: {str(e)}")
@@ -34,7 +39,8 @@ class ModelManager:
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
                 token=Config.HUGGING_FACE_TOKEN,
-                model_max_length=1024  # Limit max length to save memory
+                model_max_length=1024,  # Limit max length to save memory
+                trust_remote_code=True
             )
             # Ensure we have the necessary special tokens
             special_tokens = {
@@ -71,14 +77,8 @@ class ModelManager:
                 token=Config.HUGGING_FACE_TOKEN,
                 low_cpu_mem_usage=True,
                 torch_dtype=torch.float16,  # Use fp16 for additional memory savings
-
-                offload_folder="offload",  # Enable CPU offloading
-                use_cache=False  # Disable KV cache to save memory
+                trust_remote_code=True
             )
-
-            # Enable gradient checkpointing
-            self.model.gradient_checkpointing_enable()
-
             # Resize embeddings to match tokenizer
             self.model.resize_token_embeddings(len(self.tokenizer))
             logger.info("Model loaded successfully")
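Assembled outside the class, the new load path looks roughly like the sketch below. This is a hedged reconstruction, not the Space's actual file: the model id and token lookup are stand-ins for the Space's Config, and trust_remote_code=True only matters for repos that ship custom modeling code.

import os
import logging

import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_NAME = "gpt2"  # placeholder; the Space's Config supplies the real id
HF_TOKEN = os.getenv("HUGGING_FACE_TOKEN")  # stand-in for Config.HUGGING_FACE_TOKEN

# Mirror the commit: make sure nothing upstream left the process offline.
os.environ["HF_HUB_OFFLINE"] = "0"
os.environ["TRANSFORMERS_OFFLINE"] = "0"

if HF_TOKEN:
    try:
        # add_to_git_credential=False keeps the token out of git's credential store
        login(token=HF_TOKEN, add_to_git_credential=False)
        logger.info("Successfully logged in to Hugging Face Hub")
    except Exception as e:
        logger.error(f"Failed to login to Hugging Face Hub: {e}")

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    model_max_length=1024,  # limit max length to save memory
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,  # fp16 for additional memory savings
    trust_remote_code=True,
)
model.resize_token_embeddings(len(tokenizer))
logger.info("Model loaded successfully")

Dropping offload_folder, use_cache=False, and gradient_checkpointing_enable() is consistent with an inference-only Space: gradient checkpointing saves memory only during backprop, and disabling the KV cache mainly slows generation down.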