Spaces:

starnernj
/

Early-Christian-Church-Fathers

Paused

App Files Community

starnernj committed on Feb 23

Commit

15fc625

verified ·

1 Parent(s): ed893ee

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -53

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ import time
 import bitsandbytes
 import traceback
 import threading
 @spaces.GPU  # Forces GPU allocation before execution
 def force_gpu_allocation():
@@ -21,70 +22,32 @@ print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")  # Debugging info
-# Global variables (initialized later in worker thread)
-model = None
-tokenizer = None
-def load_model():
-    print("Initializing model in background thread...")
-    global model, tokenizer  # Use global variables
-    # Base model (LLaMA 3.1 8B) from Meta
-    base_model_name = "meta-llama/Llama-3.1-8B"
-    # Your fine-tuned LoRA adapter (uploaded to Hugging Face)
-    lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"
     # Login because LLaMA 3.1 8B is a gated model
     login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
-    # Load base model - can't do this on the free tier - not enough memory
-    # model = AutoModelForCausalLM.from_pretrained(base_model_name)
-    # ✅ Force CPU placement before moving anything to CUDA
-    torch.cuda.is_available = lambda: False  # 👈 Trick PyTorch to think CUDA isn't available at first
-    """
-    # ✅ Configure BitsAndBytes to use CPU first
-    quantization_config = BitsAndBytesConfig(
-        load_in_8bit=True,  # ✅ Uses 8-bit instead of 4-bit
-        device_map={"": "cpu"},
-        # load_in_4bit=True,
-        # bnb_4bit_compute_dtype=torch.float16,
-        # bnb_4bit_use_double_quant=True,
-        # bnb_4bit_quant_type="nf4"
-    )
-    """
     model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
-        # quantization_config=quantization_config,
-        # load_in_4bit=True,  # Reduces memory, but requires a GPU
-        torch_dtype=torch.float16,
-        # llm_int8_enable_fp32_cpu_offload=True,  # Offload some layers to CPU
-        device_map={"": "cpu"}  # Load everything on CPU first
     )
     # Load LoRA adapter
-    model = PeftModel.from_pretrained(model, lora_model_name, device_map={"": "cpu"})
-    # ✅ Now, allow CUDA again and move everything to GPU
-    torch.cuda.is_available = lambda: True
-    # Move model to GPU *AFTER* loading LoRA to avoid CUDA init errors
-    model = model.to("cuda" if torch.cuda.is_available() else "cpu")
-    # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-    print("Model successfully loaded!")
-# Start model loading in a background thread
-threading.Thread(target=load_model, daemon=True).start()
-# Function to generate responses
-def chatbot_response(user_input):
-    if model is None or tokenizer is None:
-        return "Model is still loading. Please wait..."
     try:
         inputs = tokenizer(user_input, return_tensors="pt").to(device)
         outputs = model.generate(**inputs, max_length=200)

 import bitsandbytes
 import traceback
 import threading
+from accelerate import Accelerator
 @spaces.GPU  # Forces GPU allocation before execution
 def force_gpu_allocation():
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")  # Debugging info
+# Base model (LLaMA 3.1 8B) from Meta
+base_model_name = "meta-llama/Llama-3.1-8B"
+# Your fine-tuned LoRA adapter (uploaded to Hugging Face)
+lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"
+# Function to generate responses
+def chatbot_response(user_input):
+    accelerator = Accelerator()
     # Login because LLaMA 3.1 8B is a gated model
     login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
     model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
     )
     # Load LoRA adapter
+    model = PeftModel.from_pretrained(model, lora_model_name)
+    model = accelerator.prepare(model)
     try:
         inputs = tokenizer(user_input, return_tensors="pt").to(device)
         outputs = model.generate(**inputs, max_length=200)