starnernj committed
Commit 15fc625 · verified · 1 Parent(s): ed893ee

Update app.py

Files changed (1)
  1. app.py +16 -53
app.py CHANGED
@@ -9,6 +9,7 @@ import time
 import bitsandbytes
 import traceback
 import threading
+from accelerate import Accelerator
 
 @spaces.GPU # Forces GPU allocation before execution
 def force_gpu_allocation():
@@ -21,70 +22,32 @@ print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}") # Debugging info
 
-# Global variables (initialized later in worker thread)
-model = None
-tokenizer = None
+# Base model (LLaMA 3.1 8B) from Meta
+base_model_name = "meta-llama/Llama-3.1-8B"
+
+# Your fine-tuned LoRA adapter (uploaded to Hugging Face)
+lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"
+
+
+# Function to generate responses
+def chatbot_response(user_input):
+    accelerator = Accelerator()
 
-def load_model():
-    print("Initializing model in background thread...")
-    global model, tokenizer # Use global variables
-
-    # Base model (LLaMA 3.1 8B) from Meta
-    base_model_name = "meta-llama/Llama-3.1-8B"
-
-    # Your fine-tuned LoRA adapter (uploaded to Hugging Face)
-    lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"
-
     # Login because LLaMA 3.1 8B is a gated model
     login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
-
-    # Load base model - can't do this on the free tier - not enough memory
-    # model = AutoModelForCausalLM.from_pretrained(base_model_name)
 
-    # ✅ Force CPU placement before moving anything to CUDA
-    torch.cuda.is_available = lambda: False # 👈 Trick PyTorch to think CUDA isn't available at first
-
-    """
-    # ✅ Configure BitsAndBytes to use CPU first
-    quantization_config = BitsAndBytesConfig(
-        load_in_8bit=True, # ✅ Uses 8-bit instead of 4-bit
-        device_map={"": "cpu"},
-        # load_in_4bit=True,
-        # bnb_4bit_compute_dtype=torch.float16,
-        # bnb_4bit_use_double_quant=True,
-        # bnb_4bit_quant_type="nf4"
-    )
-    """
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+
     model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
-        # quantization_config=quantization_config,
-        # load_in_4bit=True, # Reduces memory, but requires a GPU
-        torch_dtype=torch.float16,
-        # llm_int8_enable_fp32_cpu_offload=True, # Offload some layers to CPU
-        device_map={"": "cpu"} # Load everything on CPU first
     )
 
     # Load LoRA adapter
-    model = PeftModel.from_pretrained(model, lora_model_name, device_map={"": "cpu"})
-
-    # ✅ Now, allow CUDA again and move everything to GPU
-    torch.cuda.is_available = lambda: True
+    model = PeftModel.from_pretrained(model, lora_model_name)
 
-    # Move model to GPU *AFTER* loading LoRA to avoid CUDA init errors
-    model = model.to("cuda" if torch.cuda.is_available() else "cpu")
+    model = accelerator.prepare(model)
 
-    # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-
-    print("Model successfully loaded!")
-
-# Start model loading in a background thread
-threading.Thread(target=load_model, daemon=True).start()
-
-# Function to generate responses
-def chatbot_response(user_input):
-    if model is None or tokenizer is None:
-        return "Model is still loading. Please wait..."
     try:
         inputs = tokenizer(user_input, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_length=200)
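
For orientation, here is a minimal consolidated sketch of the request path implied by the new side of this diff: the tokenizer, base model, and LoRA adapter are now loaded inside chatbot_response and placed by accelerate's Accelerator, replacing the old background-thread load_model(). The surrounding imports and the decode/return step after model.generate() fall outside the visible hunks, so they are assumptions here, not part of the commit.

import os
import torch
from accelerate import Accelerator
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_name = "meta-llama/Llama-3.1-8B"
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def chatbot_response(user_input):
    # Accelerator handles device placement, replacing the manual CPU-then-CUDA
    # juggling done by the removed load_model().
    accelerator = Accelerator()

    # LLaMA 3.1 8B is a gated model, so authenticate first.
    login(token=os.getenv("HuggingFaceFineGrainedReadToken"))

    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    model = AutoModelForCausalLM.from_pretrained(base_model_name)

    # Apply the fine-tuned LoRA adapter, then let Accelerator move the model.
    model = PeftModel.from_pretrained(model, lora_model_name)
    model = accelerator.prepare(model)

    inputs = tokenizer(user_input, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_length=200)
    # Assumed: decode the generated ids back to text (not shown in the visible hunks).
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

Note the trade-off this commit makes: the 8B base model and adapter are loaded on every call to chatbot_response rather than once in a background thread, which simplifies device handling at the cost of much slower individual requests.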