Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
+print("Before Import")
 import os
 import spaces
 import gradio as gr
@@ -8,7 +9,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from peft import PeftModel, PeftConfig
 import bitsandbytes
 import torch
-
+print("After Import")
 
 @spaces.GPU # Forces GPU allocation before execution
 def force_gpu_allocation():
@@ -22,7 +23,7 @@ lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned
 
 # Login because LLaMA 3.1 8B is a gated model
 login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
-
+print("After login")
 # Enable 4-bit Quantization with BitsAndBytes
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True, # ✅ Enables 4-bit quantization for memory efficiency
@@ -36,14 +37,14 @@ base_model = AutoModelForCausalLM.from_pretrained(
     quantization_config=quantization_config,
     device_map="auto"
 )
-
+print("After basemodel")
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-
+print("After Tokenizer")
 # Load LoRA Adapter
 print("Loading LoRA adapter...")
 model = PeftModel.from_pretrained(base_model, lora_model_name)
-
+print("After peft model")
 # Function to generate responses
 def chatbot_response(user_input):
     try:
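Read end to end, the change threads print() checkpoints through the startup sequence (imports, login, quantization config, base model, tokenizer, LoRA adapter), so a hang while the Space boots can be localized from the container logs: the last checkpoint printed points at the step immediately after it. Below is a minimal sketch of the resulting sequence, not the commit itself. Two details are assumptions: flush=True on each checkpoint (the commit uses bare print calls, which buffered stdout can delay) and the base model name and bnb_4bit_* settings, which sit outside the hunks shown above.

import os
import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

def checkpoint(msg):
    # flush=True pushes the message to the Space logs immediately
    # (assumption; the commit itself uses bare print calls)
    print(msg, flush=True)

checkpoint("After Import")

# Login because LLaMA 3.1 8B is a gated model
login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
checkpoint("After login")

base_model_name = "meta-llama/Llama-3.1-8B"  # hypothetical; the real value sits outside the hunks
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"  # as shown in the hunk header, which may cut it off

# Enable 4-bit quantization with BitsAndBytes
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # from the diff
    bnb_4bit_quant_type="nf4",              # assumed, common choice
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumed, common choice
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    device_map="auto",
)
checkpoint("After basemodel")

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
checkpoint("After Tokenizer")

model = PeftModel.from_pretrained(base_model, lora_model_name)
checkpoint("After peft model")

In use, a Space that stalls after printing "After basemodel" but before "After Tokenizer" is known to be stuck loading the tokenizer, which is exactly the granularity these checkpoints buy.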