import os

# Disable CUDA visibility at the start so nothing initializes CUDA
# before the Accelerator can manage it
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Prevents CUDA initialization

import spaces
import gradio as gr
from huggingface_hub import login

# Base model (LLaMA 3.1 8B) from Meta
base_model_name = "meta-llama/Llama-3.1-8B"

# Fine-tuned LoRA adapter (uploaded to Hugging Face)
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"

# Function to generate responses
def chatbot_response(user_input):
    # Re-enable CUDA inside the function for accelerate to manage
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Adjust based on ZeroGPU setup

    # Import CUDA-touching libraries only after CUDA_VISIBLE_DEVICES is set
    import traceback

    import torch
    from accelerate import Accelerator
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Check whether CUDA is already initialized (for debugging)
    if torch.cuda.is_initialized():
        print("CUDA was already initialized before Accelerator!")

    @spaces.GPU  # Forces GPU allocation before execution
    def force_gpu_allocation():
        pass  # Dummy function to trigger GPU setup

    force_gpu_allocation()  # Must be called, or no GPU is ever requested

    accelerator = Accelerator()

    # Login because LLaMA 3.1 8B is a gated model
    login(token=os.getenv("HuggingFaceFineGrainedReadToken"))

    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="cpu",
        torch_dtype=torch.float32,  # Avoid any GPU-related dtype defaults
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    # Apply the fine-tuned LoRA adapter on top of the base model
    model = PeftModel.from_pretrained(model, lora_model_name)

    model = accelerator.prepare(model)

    try:
        # Move inputs to the device the Accelerator placed the model on
        inputs = tokenizer(user_input, return_tensors="pt").to(accelerator.device)
        outputs = model.generate(**inputs, max_new_tokens=200)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        error_message = f"Error: {e}\n{traceback.format_exc()}"
        print(error_message)  # Logs detailed error messages
        return "An error occurred. Check the logs for details."

# Launch the Gradio chatbot
interface = gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(lines=2, placeholder="Ask me about the Christian Church Fathers..."),
    outputs="text",
    title="Early Christian Church Fathers Fine-Tuned LLaMA 3.1 8B with LoRA",
    description="A chatbot using a fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.",
)

interface.launch()