Commit f093c76
Parent(s): ce75bc0
Update app_v3.py

app_v3.py CHANGED
@@ -6,9 +6,9 @@ import os
 import torch
 import subprocess
 
-os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
-os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
-os.environ["CUDA_VISIBLE_DEVICES"]="1"
+# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128' # => just makes sense with more than one GPU, since is trying to split
+#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # => just makes sense when more GPUs
+#os.environ["CUDA_VISIBLE_DEVICES"]="0,1" # => this is an example of numbers of devices
 
 
 # Define pretrained and quantized model directories
@@ -27,8 +27,6 @@ pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
 #model_name_or_path = quantized_model_dir
 model_basename = "Jackson2-4bit-128g-GPTQ"
 
-#os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-
 # Before allocating or loading the model, clear up memory
 #gc.collect()
 #torch.cuda.empty_cache()
@@ -38,15 +36,15 @@ use_triton = False
 if torch.cuda.is_available():
     torch.cuda.empty_cache()
 
-
 #tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
+device = "cuda:0" if torch.cuda.is_available() else "cpu" # best configuration besides the auto option
 model = AutoGPTQForCausalLM.from_quantized(
     pretrained_model_dir,
     model_basename=model_basename,
     use_safetensors=True,
     device=device,
+    max_memory={0: "10GIB"}
 )
 
 
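For context, the comment added on the device line refers to "the auto option" as an alternative to pinning the model to cuda:0. Below is a minimal sketch of that alternative, assuming the same model repo and basename as in the diff: device_map="auto" lets accelerate place the quantized layers under an explicit per-device memory budget instead of a fixed device. The 10 GiB GPU budget and the CPU spill-over value are illustrative assumptions, not values taken from this commit.

import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
model_basename = "Jackson2-4bit-128g-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# device_map="auto" lets accelerate shard or offload layers automatically;
# max_memory caps GPU 0 at roughly 10 GiB and allows spill-over to CPU RAM
# (both budget values here are assumptions for illustration).
model = AutoGPTQForCausalLM.from_quantized(
    pretrained_model_dir,
    model_basename=model_basename,
    use_safetensors=True,
    device_map="auto",
    max_memory={0: "10GiB", "cpu": "24GiB"},
)

Compared with the committed version (device="cuda:0" plus max_memory={0: "10GIB"}), this variant trades deterministic placement for automatic offloading when the model does not fit in the stated GPU budget.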