Update app.py
app.py
CHANGED
@@ -1,12 +1,13 @@
 import os
-
-# Disable CUDA visibility at the start
-os.environ["CUDA_VISIBLE_DEVICES"] = "" # Prevents CUDA initialization
-
 import spaces
 import gradio as gr
 from huggingface_hub import InferenceClient, login
 import time
+import traceback
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel, PeftConfig
+import bitsandbytes
+import torch
 
 
 @spaces.GPU # Forces GPU allocation before execution
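Side note on the `@spaces.GPU` context line above: on a ZeroGPU Space, a GPU is attached only while a decorated function runs, which is presumably why this commit could drop the old `CUDA_VISIBLE_DEVICES` workaround. A minimal sketch of the pattern, assuming ZeroGPU hardware (the function name and body are illustrative, not the Space's actual code):

import spaces
import torch

@spaces.GPU  # each call to the decorated function is granted a GPU slice
def run_inference(prompt):
    # real code would tokenize `prompt` and call model.generate here
    return f"CUDA visible inside the call: {torch.cuda.is_available()}"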
@@ -19,47 +20,34 @@ base_model_name = "meta-llama/Llama-3.1-8B"
 # Your fine-tuned LoRA adapter (uploaded to Hugging Face)
 lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"
 
+# Login because LLaMA 3.1 8B is a gated model
+login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
 
-#
-
-#
-
-
-
-
-from peft import PeftModel, PeftConfig
-import bitsandbytes
-from accelerate import Accelerator
-import torch
-
-# Check if CUDA is already initialized (for debugging)
-if torch.cuda.is_initialized():
-    print("CUDA was already initialized before Accelerator!")
+# Enable 4-bit Quantization with BitsAndBytes
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,  # ✅ Enables 4-bit quantization for memory efficiency
+    bnb_4bit_compute_dtype=torch.float16,  # ✅ Uses float16 for performance
+    bnb_4bit_use_double_quant=True,  # ✅ Optimizes quantization
+    bnb_4bit_quant_type="nf4"  # ✅ Normalized Float-4 for better accuracy
+)
 
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_name,
+    quantization_config=quantization_config,
+    device_map="auto"
+)
 
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(base_model_name)
 
-
+# Load LoRA Adapter
+print("Loading LoRA adapter...")
+model = PeftModel.from_pretrained(base_model, lora_model_name)
 
-
-
-
-model = AutoModelForCausalLM.from_pretrained(
-    base_model_name,
-    device_map="cpu",
-    torch_dtype=torch.float32 # Avoid any GPU-related dtype defaults
-)
-
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-
-
-# Load LoRA adapter
-# model = PeftModel.from_pretrained(model, lora_model_name)
-
-model = accelerator.prepare(model)
-
+# Function to generate responses
+def chatbot_response(user_input):
     try:
-        inputs = tokenizer(user_input, return_tensors="pt").to(
+        inputs = tokenizer(user_input, return_tensors="pt").to("cuda")
         outputs = model.generate(**inputs, max_length=200)
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
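One caveat in the new `chatbot_response`: `max_length=200` bounds prompt plus completion together, so a long question leaves little room for the answer. A sketch of the same function using `max_new_tokens` instead, which caps only the generated text (standard `transformers` arguments; the error handling shown here is illustrative and leans on the `traceback` import added above):

def chatbot_response(user_input):
    try:
        inputs = tokenizer(user_input, return_tensors="pt").to("cuda")
        with torch.no_grad():  # pure inference, no gradient bookkeeping
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,  # cap the completion, not prompt + completion
                pad_token_id=tokenizer.eos_token_id,  # LLaMA ships without a pad token
            )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception:
        return traceback.format_exc()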
@@ -77,4 +65,5 @@ interface = gr.Interface(
     description="A chatbot using a fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.",
 )
 
-
+if __name__ == "__main__":
+    interface.launch()
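Since memory savings are the point of the BitsAndBytes change, a quick way to confirm the 4-bit load worked is `get_memory_footprint()`, a standard `transformers` model method. A sketch (the figures quoted are rough expectations, not measurements from this Space):

# sanity check: report how much memory the quantized weights occupy
footprint_gb = base_model.get_memory_footprint() / 1e9
print(f"Base model footprint: {footprint_gb:.2f} GB")
# an 8B model in nf4 typically lands near 5-6 GB, versus roughly 16 GB in float16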