Update app.py
app.py CHANGED
@@ -3,7 +3,9 @@ import gradio as gr
 from huggingface_hub import InferenceClient, login
 import os
 import time
-
+
+# Disable CUDA visibility at the start
+os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Prevents CUDA initialization
 
 @spaces.GPU  # Forces GPU allocation before execution
 def force_gpu_allocation():
@@ -18,6 +20,9 @@ lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned
 
 # Function to generate responses
 def chatbot_response(user_input):
+    # Re-enable CUDA inside the function for accelerate to manage
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Adjust based on ZeroGPU setup
+
     import traceback
     from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
     from peft import PeftModel, PeftConfig
@@ -36,7 +41,8 @@ def chatbot_response(user_input):
 
     model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
-        device_map="cpu"
+        device_map="cpu",
+        torch_dtype=torch.float32  # Avoid any GPU-related dtype defaults
     )
 
     # Load tokenizer
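
For context, a minimal sketch of how the pieces touched by this commit could fit together inside chatbot_response. The CPU/float32 load, the CUDA_VISIBLE_DEVICES toggle, and the PEFT imports come from the diff; the base-model id, tokenizer handling, and generation settings below are illustrative placeholders, not taken from app.py.

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder; the real id is set earlier in app.py
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"  # adapter repo shown in the hunk header

def chatbot_response(user_input):
    # Hand GPU visibility back to accelerate/ZeroGPU inside the request handler
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # Load the base model on CPU in float32, as in this commit
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="cpu",
        torch_dtype=torch.float32,
    )
    # Attach the LoRA adapter on top of the frozen base weights
    model = PeftModel.from_pretrained(model, lora_model_name)

    # Tokenize, generate, and decode (these settings are placeholders)
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    inputs = tokenizer(user_input, return_tensors="pt")
    output_ids = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)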