import gradio as gr
from huggingface_hub import InferenceClient, login
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os
import torch
import bitsandbytes

# Base model (LLaMA 3.1 8B) from Meta
base_model_name = "meta-llama/Llama-3.1-8B"

# Your fine-tuned LoRA adapter (uploaded to Hugging Face)
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"

# Login because LLaMA 3.1 8B is a gated model
login(token=os.getenv("HuggingFaceFineGrainedReadToken"))

# Load base model - can't do this on the free tier - not enough memory
# model = AutoModelForCausalLM.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    # load_in_4bit=True,  # ✅ Reduces memory, but requires a GPU
    torch_dtype=torch.float16,
    # device_map="auto"
    # Since I'm running on a free instance that doesn't have a GPU, I'll need to force CPU
    device_map="cpu"
)

# Load LoRA adapter
model = PeftModel.from_pretrained(model, lora_model_name)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Function to generate responses
def chatbot_response(user_input):
    inputs = tokenizer(user_input, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=400)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Launch the Gradio chatbot
interface = gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(lines=2, placeholder="Ask me anything..."),
    outputs="text",
    title="Early Christian Church Fathers Fine-Tuned LLaMA 3.1 8B with LoRA",
    description="A chatbot using my fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.",
)

interface.launch(share=True)