Update app.py
app.py CHANGED

@@ -1,4 +1,4 @@
-print("
+print("Beginning import")
 import os
 import spaces
 import gradio as gr
@@ -9,7 +9,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from peft import PeftModel, PeftConfig
 import bitsandbytes
 import torch
-
+
+print("Imports completed")

 @spaces.GPU # Forces GPU allocation before execution
 def force_gpu_allocation():
@@ -23,7 +24,8 @@ lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned

 # Login because LLaMA 3.1 8B is a gated model
 login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
-print("
+print("Login to Huggin Face successful")
+
 # Enable 4-bit Quantization with BitsAndBytes
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True, # ✅ Enables 4-bit quantization for memory efficiency
@@ -32,19 +34,24 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_quant_type="nf4" # ✅ Normalized Float-4 for better accuracy
 )

+print("Loading base model")
 base_model = AutoModelForCausalLM.from_pretrained(
     base_model_name,
     quantization_config=quantization_config,
     device_map="auto"
 )
-print("
+print("Basemodel loaded successfully")
+
 # Load tokenizer
+print("Loading tokenizer")
 tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-print("
+print("Tokenizer loaded successfully")
+
 # Load LoRA Adapter
-print("Loading LoRA adapter...")
+print("Loading Peft LoRA adapter...")
 model = PeftModel.from_pretrained(base_model, lora_model_name)
-print("
+print("Peft LoRA model loaded successfully")
+
 # Function to generate responses
 def chatbot_response(user_input):
     try:
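Taken together, the hunks instrument the usual sequence of loading a 4-bit quantized base model and then applying a PEFT LoRA adapter on top of it. Below is a minimal standalone sketch of that sequence; the base model id, the compute dtype, and the token environment variable are assumptions inferred from the hunks (the adapter id is copied from the hunk header, which the diff view truncates), so it is illustrative rather than a verbatim copy of app.py.

# Minimal sketch of the load sequence the new print statements trace (assumptions noted inline)
import os
import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Assumption: the gated LLaMA 3.1 8B base; app.py defines base_model_name above the hunks shown
base_model_name = "meta-llama/Llama-3.1-8B"
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"

# Authenticate because the base model is gated
login(token=os.getenv("HuggingFaceFineGrainedReadToken"))

# 4-bit NF4 quantization keeps the 8B weights within a single Space GPU
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # assumption: compute dtype is not visible in the hunks
)

print("Loading base model")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    device_map="auto",
)

print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("Loading Peft LoRA adapter...")
model = PeftModel.from_pretrained(base_model, lora_model_name)
print("Peft LoRA model loaded successfully")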