print("Beginning import") import os import spaces import gradio as gr from huggingface_hub import InferenceClient, login import time import traceback from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig from peft import PeftModel, PeftConfig import bitsandbytes import torch print("Imports completed") @spaces.GPU # Forces GPU allocation before execution def force_gpu_allocation(): pass # Dummy function to trigger GPU setup # Base model (LLaMA 3.1 8B) from Meta base_model_name = "meta-llama/Llama-3.1-8B" # Your fine-tuned LoRA adapter (uploaded to Hugging Face) lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned" # Login because LLaMA 3.1 8B is a gated model login(token=os.getenv("HuggingFaceFineGrainedReadToken")) print("Login to Huggin Face successful") # Enable 4-bit Quantization with BitsAndBytes quantization_config = BitsAndBytesConfig( load_in_4bit=True, # ✅ Enables 4-bit quantization for memory efficiency bnb_4bit_compute_dtype=torch.float16, # ✅ Uses float16 for performance bnb_4bit_use_double_quant=True, # ✅ Optimizes quantization bnb_4bit_quant_type="nf4" # ✅ Normalized Float-4 for better accuracy ) print("Loading base model") base_model = AutoModelForCausalLM.from_pretrained( base_model_name, quantization_config=quantization_config, device_map="auto" ) print("Basemodel loaded successfully") # Load tokenizer print("Loading tokenizer") tokenizer = AutoTokenizer.from_pretrained(base_model_name) print("Tokenizer loaded successfully") # Load LoRA Adapter print("Loading Peft LoRA adapter...") model = PeftModel.from_pretrained(base_model, lora_model_name) print("Peft LoRA model loaded successfully") # Function to generate responses def chatbot_response(user_input): try: inputs = tokenizer(user_input, return_tensors="pt").to("cuda") outputs = model.generate(**inputs, max_length=200) return tokenizer.decode(outputs[0], skip_special_tokens=True) except Exception as e: error_message = f"AssertionError: {str(e)}\n{traceback.format_exc()}" print(error_message) # ✅ Logs detailed error messages return "An error occurred. Check the logs for details." # Launch the Gradio chatbot interface = gr.Interface( fn=chatbot_response, inputs=gr.Textbox(lines=2, placeholder="Ask me about the Christian Church Fathers..."), outputs="text", title="Early Christian Church Fathers Fine-Tuned LLaMA 3.1 8B with LoRA", description="A chatbot using a fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.", ) if __name__ == "__main__": interface.launch()