Spaces:

starnernj
/

Early-Christian-Church-Fathers

Paused

File size: 2,723 Bytes

fbb2b4a
e9a38d8
a9b553f
87ed98d
9b0d920
c5f1959
9ee6131
 
 
 
 
fbb2b4a
 
4c9f7f3
c17a736
 
 
b1af9dd
15fc625
 
 
 
 
 
c53f6f7
 
fbb2b4a
 
c53f6f7
 
 
 
 
 
 
 
fbb2b4a
c53f6f7
 
 
 
 
fbb2b4a
 
c53f6f7
fbb2b4a
c53f6f7
fbb2b4a
 
c53f6f7
fbb2b4a
c53f6f7
fbb2b4a
 
c53f6f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e86dea

print("Beginning import")
import os
import spaces
import gradio as gr
from huggingface_hub import InferenceClient, login
import time
import traceback
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import bitsandbytes
import torch

print("Imports completed")

@spaces.GPU
def force_gpu_allocation():
    """No-op placeholder wrapped by the ``spaces.GPU`` decorator.

    The body does nothing and nothing in this file calls it.
    NOTE(review): presumably it exists only so that a GPU-decorated function
    is registered with the Spaces runtime at import time -- confirm.
    """

# Base checkpoint: LLaMA 3.1 8B from Meta
base_model_name = "meta-llama/Llama-3.1-8B"

# Fine-tuned LoRA adapter repository on the Hugging Face Hub
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"

# Log in because LLaMA 3.1 8B is a gated model. The token comes from the
# environment; os.getenv returns None when the variable is absent, so make
# that case visible in the logs instead of passing None along silently.
_hf_token = os.getenv("HuggingFaceFineGrainedReadToken")
if not _hf_token:
    print(
        "Warning: environment variable HuggingFaceFineGrainedReadToken is not set; "
        "calling login() without a token"
    )
login(token=_hf_token)
print("Login to Huggin Face successful")

# BitsAndBytes settings handed to the base-model loader:
# 4-bit weights, NF4 quant type, double quantization, float16 compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Step 1: quantized base model, with device placement left to device_map="auto"
print("Loading base model")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name, device_map="auto", quantization_config=quantization_config
)
print("Basemodel loaded successfully")

# Step 2: tokenizer taken from the same base checkpoint
print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
)
print("Tokenizer loaded successfully")

# Step 3: wrap the base model with the LoRA adapter weights
print("Loading Peft LoRA adapter...")
model = PeftModel.from_pretrained(
    base_model,
    lora_model_name,
)
print("Peft LoRA model loaded successfully")

# Function to generate responses
def chatbot_response(user_input):
    """Generate a reply to ``user_input`` with the LoRA-adapted model.

    Args:
        user_input: Prompt text coming from the Gradio textbox.

    Returns:
        The decoded text of the first generated sequence (special tokens
        stripped), a short hint when the prompt is empty or whitespace-only,
        or a generic error message when anything raises. Exceptions are never
        propagated to the caller; details are printed to the logs instead.
    """
    try:
        # Nothing to answer: skip the model call for blank prompts.
        if not user_input or not user_input.strip():
            return "Please enter a question."
        # Send the inputs to wherever the model was placed rather than
        # hard-coding "cuda" (the model is loaded with device_map="auto").
        inputs = tokenizer(user_input, return_tensors="pt").to(model.device)
        # max_new_tokens bounds only the generated continuation; max_length
        # also counted the prompt tokens, so long prompts left no budget.
        outputs = model.generate(**inputs, max_new_tokens=200)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # UI boundary: log the real exception type and full traceback, but
        # show the user only a generic message.
        error_message = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
        print(error_message)
        return "An error occurred. Check the logs for details."


# Gradio UI: a two-line text box in, plain text out, backed by chatbot_response
interface = gr.Interface(
    title="Early Christian Church Fathers Fine-Tuned LLaMA 3.1 8B with LoRA",
    description="A chatbot using a fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.",
    fn=chatbot_response,
    inputs=gr.Textbox(
        placeholder="Ask me about the Christian Church Fathers...",
        lines=2,
    ),
    outputs="text",
)

# Launch the interface only when this file is run as a script
if __name__ == "__main__":
    interface.launch()