Update app.py
app.py (changed):

```diff
@@ -44,6 +44,7 @@ def load_model():
     # ✅ Force CPU placement before moving anything to CUDA
     torch.cuda.is_available = lambda: False  # 🔥 Trick PyTorch to think CUDA isn't available at first
 
+    """
     # ✅ Configure BitsAndBytes to use CPU first
     quantization_config = BitsAndBytesConfig(
         load_in_8bit=True,  # ✅ Uses 8-bit instead of 4-bit
@@ -53,12 +54,12 @@ def load_model():
         # bnb_4bit_use_double_quant=True,
         # bnb_4bit_quant_type="nf4"
     )
-
+    """
     model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
-        quantization_config=quantization_config,
+        # quantization_config=quantization_config,
         # load_in_4bit=True,  # Reduces memory, but requires a GPU
-
+        torch_dtype=torch.float16,
         # llm_int8_enable_fp32_cpu_offload=True,  # Offload some layers to CPU
         device_map={"": "cpu"}  # Load everything on CPU first
     )
```
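In effect, the commit swaps bitsandbytes 8-bit quantization for plain fp16 weights loaded on the CPU: the `BitsAndBytesConfig` block is fenced off inside a triple-quoted string, `quantization_config` is commented out of `from_pretrained`, and `torch_dtype=torch.float16` takes its place. Below is a minimal sketch of what `load_model()` plausibly looks like after this commit; the function name comes from the hunk header, while the imports and the `base_model_name` value are assumptions, since the diff shows only the function body.

```python
# Sketch of load_model() after this commit. The imports and
# base_model_name are hypothetical; the diff shows only the body.
import torch
from transformers import AutoModelForCausalLM

base_model_name = "some-org/some-model"  # placeholder; the real repo id is not in the diff

def load_model():
    # Force CPU placement before anything touches CUDA: later
    # availability checks in library code will now report False.
    torch.cuda.is_available = lambda: False

    # fp16 weights on CPU instead of bitsandbytes 8-bit quantization;
    # roughly halves memory versus fp32 without needing a GPU.
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map={"": "cpu"},  # load everything on CPU first
    )
    return model
```

Two caveats on this pattern: monkeypatching `torch.cuda.is_available` only fools code that checks availability after the patch runs, so it must execute before any library inspects the device, and it does not prevent an explicit `.to("cuda")` elsewhere; and passing `device_map` to `from_pretrained` requires the Accelerate package to be installed.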