import functools
import os
import threading
import time
import traceback

# `spaces` stays ahead of torch and the other GPU-facing libraries, exactly as
# in the original import order.
import spaces
import gradio as gr
from huggingface_hub import InferenceClient, login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch
import bitsandbytes
from accelerate import Accelerator


@spaces.GPU  # Forces GPU allocation before execution
def force_gpu_allocation():
    """Dummy function whose only job is to carry the ``@spaces.GPU`` decorator."""
    # NOTE(review): nothing in this file calls this function, and
    # chatbot_response (which does the GPU work) is not decorated itself.
    # Confirm against the Spaces documentation whether the decorator belongs
    # on chatbot_response instead.


print(f"Is CUDA available: {torch.cuda.is_available()}")
# Only query the device name when CUDA exists; the unguarded call raises on a
# CPU-only host even though the code below explicitly supports a CPU fallback.
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Define the device correctly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # Debugging info

# Base model (LLaMA 3.1 8B) from Meta
base_model_name = "meta-llama/Llama-3.1-8B"

# Fine-tuned LoRA adapter (uploaded to Hugging Face)
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"


@functools.lru_cache(maxsize=1)
def _load_model_and_tokenizer():
    """Log in, then load the tokenizer and the LoRA-adapted model exactly once.

    The result is cached so that every request after the first reuses the
    same objects instead of re-reading the 8B base model. ``lru_cache`` does
    not cache exceptions, so a failed load is retried on the next request.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has already been passed
        through ``Accelerator.prepare``.
    """
    # Login because LLaMA 3.1 8B is a gated model. Skip the call when the
    # variable is unset rather than passing ``token=None`` to ``login``.
    token = os.getenv("HuggingFaceFineGrainedReadToken")
    if token:
        login(token=token)

    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

    # Attach the LoRA adapter on top of the base weights.
    model = PeftModel.from_pretrained(base_model, lora_model_name)
    model = Accelerator().prepare(model)
    return tokenizer, model


# Function to generate responses
def chatbot_response(user_input):
    """Generate a completion for ``user_input`` with the fine-tuned model.

    Args:
        user_input: Prompt text entered by the user.

    Returns:
        The decoded generation (special tokens stripped), or a generic error
        message if tokenization or generation raises. Failures while loading
        the model are not caught here and propagate to the caller, as before.
    """
    tokenizer, model = _load_model_and_tokenizer()

    try:
        inputs = tokenizer(user_input, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_length=200)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Report the real exception class; the previous message labelled
        # every failure as "AssertionError".
        error_message = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
        print(error_message)  # Logs detailed error messages
        return "An error occurred. Check the logs for details."
# Build the Gradio UI around chatbot_response and start serving it.
_question_box = gr.Textbox(
    lines=2,
    placeholder="Ask me about the Christian Church Fathers...",
)

interface = gr.Interface(
    fn=chatbot_response,
    inputs=_question_box,
    outputs="text",
    title="Early Christian Church Fathers Fine-Tuned LLaMA 3.1 8B with LoRA",
    description="A chatbot using a fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.",
)

interface.launch()