import spaces
import gradio as gr
from huggingface_hub import InferenceClient, login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import os
import torch
import time
import bitsandbytes
import traceback
import threading

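# On a ZeroGPU Space, decorating a function with @spaces.GPU tells the runtime that this
# app needs a GPU; this empty placeholder appears intended only to trigger that allocation.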
@spaces.GPU
def force_gpu_allocation():
    pass


print(f"Is CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

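# The model and tokenizer are loaded lazily by load_model() in a background thread below,
# so they start as None and the chat handler checks for that before generating.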
model = None
tokenizer = None

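# Loading strategy: authenticate to the Hugging Face Hub, load the gated Llama 3.1 8B base
# model on CPU in float16, attach the fine-tuned LoRA adapter with PEFT, and only then move
# the combined model to the GPU.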
def load_model():
    print("Initializing model in background thread...")
    global model, tokenizer

    base_model_name = "meta-llama/Llama-3.1-8B"
    lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"

    # Authenticate so the gated Llama 3.1 base weights can be downloaded.
    login(token=os.getenv("HuggingFaceFineGrainedReadToken"))

    # Temporarily report CUDA as unavailable so transformers/accelerate place the
    # weights on CPU during loading; the original check is restored afterwards.
    original_cuda_check = torch.cuda.is_available
    torch.cuda.is_available = lambda: False

    """
    # Configure BitsAndBytes to use CPU first
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,  # Uses 8-bit instead of 4-bit
        device_map={"": "cpu"},
        # load_in_4bit=True,
        # bnb_4bit_compute_dtype=torch.float16,
        # bnb_4bit_use_double_quant=True,
        # bnb_4bit_quant_type="nf4"
    )
    """

    # Load the base model on CPU in half precision.
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map={"": "cpu"},
    )

    # Attach the fine-tuned LoRA adapter to the base model.
    model = PeftModel.from_pretrained(model, lora_model_name, device_map={"": "cpu"})

    # Restore the real CUDA check and move the assembled model to the GPU if one is available.
    torch.cuda.is_available = original_cuda_check
    model = model.to("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    print("Model successfully loaded!")

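# Start loading in a daemon thread so the Gradio UI comes up immediately; until the
# thread finishes, chatbot_response() returns a "still loading" message.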
threading.Thread(target=load_model, daemon=True).start()

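# Gradio handler: tokenize the question, run model.generate, and return the decoded text;
# any exception is printed with a full traceback and a generic message is shown to the user.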
def chatbot_response(user_input):
    if model is None or tokenizer is None:
        return "Model is still loading. Please wait..."
    try:
        inputs = tokenizer(user_input, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_length=200)  # max_length counts prompt + generated tokens
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        error_message = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_message)
        return "An error occurred. Check the logs for details."

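# Minimal Gradio UI: a two-line textbox feeding chatbot_response, with plain text output.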
interface = gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(lines=2, placeholder="Ask me about the Christian Church Fathers..."),
    outputs="text",
    title="Early Christian Church Fathers Fine-Tuned LLaMA 3.1 8B with LoRA",
    description="A chatbot using a fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.",
)

interface.launch()