Update app.py
app.py CHANGED
@@ -3,7 +3,9 @@ import gradio as gr
 from huggingface_hub import InferenceClient, login
 import os
 import time
-
+
+# Disable CUDA visibility at the start
+os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Prevents CUDA initialization
 
 @spaces.GPU  # Forces GPU allocation before execution
 def force_gpu_allocation():
@@ -18,6 +20,9 @@ lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned
 
 # Function to generate responses
 def chatbot_response(user_input):
+    # Re-enable CUDA inside the function for accelerate to manage
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Adjust based on ZeroGPU setup
+
     import traceback
     from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
     from peft import PeftModel, PeftConfig
@@ -36,7 +41,8 @@ def chatbot_response(user_input):
 
     model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
-        device_map="cpu"
+        device_map="cpu",
+        torch_dtype=torch.float32  # Avoid any GPU-related dtype defaults
     )
 
     # Load tokenizer
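
For context, a minimal sketch of how the pieces touched by this commit could fit together inside chatbot_response. The CPU/float32 load, the CUDA_VISIBLE_DEVICES toggle, and the PEFT imports come from the diff; the base-model id, tokenizer handling, and generation settings below are illustrative placeholders, not taken from app.py.

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder; the real id is set earlier in app.py
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"  # adapter repo shown in the hunk header

def chatbot_response(user_input):
    # Hand GPU visibility back to accelerate/ZeroGPU inside the request handler
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # Load the base model on CPU in float32, as in this commit
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="cpu",
        torch_dtype=torch.float32,
    )
    # Attach the LoRA adapter on top of the frozen base weights
    model = PeftModel.from_pretrained(model, lora_model_name)

    # Tokenize, generate, and decode (these settings are placeholders)
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    inputs = tokenizer(user_input, return_tensors="pt")
    output_ids = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)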