import spaces
import gradio as gr
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import os
import torch
import traceback
import threading


@spaces.GPU  # Forces GPU allocation before execution
def force_gpu_allocation():
    pass  # Dummy function to trigger GPU setup


print(f"Is CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # Debugging info

# Global variables (initialized later in the worker thread)
model = None
tokenizer = None


def load_model():
    print("Initializing model in background thread...")
    global model, tokenizer

    # Base model (LLaMA 3.1 8B) from Meta
    base_model_name = "meta-llama/Llama-3.1-8B"

    # Fine-tuned LoRA adapter (uploaded to Hugging Face)
    lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"

    # Log in because LLaMA 3.1 8B is a gated model
    login(token=os.getenv("HuggingFaceFineGrainedReadToken"))

    # Loading the base model directly isn't possible on the free tier - not enough memory
    # model = AutoModelForCausalLM.from_pretrained(base_model_name)

    # Force CPU placement before moving anything to CUDA:
    # trick PyTorch into thinking CUDA isn't available at first
    torch.cuda.is_available = lambda: False

    """
    # Configure BitsAndBytes to use CPU first
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,  # Uses 8-bit instead of 4-bit
        device_map={"": "cpu"},
        # load_in_4bit=True,
        # bnb_4bit_compute_dtype=torch.float16,
        # bnb_4bit_use_double_quant=True,
        # bnb_4bit_quant_type="nf4"
    )
    """

    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        # quantization_config=quantization_config,
        # load_in_4bit=True,  # Reduces memory, but requires a GPU
        torch_dtype=torch.float16,
        # llm_int8_enable_fp32_cpu_offload=True,  # Offload some layers to CPU
        device_map={"": "cpu"},  # Load everything on CPU first
    )

    # Load LoRA adapter
    model = PeftModel.from_pretrained(model, lora_model_name, device_map={"": "cpu"})

    # Now allow CUDA again
    torch.cuda.is_available = lambda: True

    # Move model to GPU *after* loading LoRA to avoid CUDA init errors
    model = model.to("cuda" if torch.cuda.is_available() else "cpu")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    print("Model successfully loaded!")


# Start model loading in a background thread so the UI can come up immediately
threading.Thread(target=load_model, daemon=True).start()


# Function to generate responses
def chatbot_response(user_input):
    if model is None or tokenizer is None:
        return "Model is still loading. Please wait..."

    try:
        inputs = tokenizer(user_input, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_length=200)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        error_message = f"Error: {e}\n{traceback.format_exc()}"
        print(error_message)  # Log detailed error messages
        return "An error occurred. Check the logs for details."
# Launch the Gradio chatbot
interface = gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(lines=2, placeholder="Ask me about the Christian Church Fathers..."),
    outputs="text",
    title="Early Christian Church Fathers Fine-Tuned LLaMA 3.1 8B with LoRA",
    description="A chatbot using a fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.",
)

interface.launch()