import spaces  # Hugging Face Spaces SDK (imported first; on ZeroGPU hardware, GPU-bound functions also need the @spaces.GPU decorator)
import os

import gradio as gr
import torch
import bitsandbytes  # noqa: F401 -- not called directly, but fails fast if the 4-bit backend is missing
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

print(f"Is CUDA available: {torch.cuda.is_available()}")  # True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")  # Tesla T4

"""

# Base model (LLaMA 3.1 8B) from Meta
base_model_name = "meta-llama/Llama-3.1-8B"

# Your fine-tuned LoRA adapter (uploaded to Hugging Face)
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"

# Login because LLaMA 3.1 8B is a gated model
login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
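
# Defensive sketch (my addition, not in the original app): os.getenv returns None
# when the Space secret is missing, and a None token makes login() behave as if no
# credentials were provided, which surfaces later as a confusing gated-model error.
# A fail-fast check just before the login call would make the misconfiguration obvious:
# if os.getenv("HuggingFaceFineGrainedReadToken") is None:
#     raise RuntimeError("Space secret 'HuggingFaceFineGrainedReadToken' is not set.")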

# Loading the base model in full precision doesn't fit in free-tier memory:
# model = AutoModelForCausalLM.from_pretrained(base_model_name)

# Load the base model quantized to 4-bit so it fits on a single modest GPU
# (requires CUDA and the bitsandbytes package)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    device_map="auto"  # let accelerate place the weights on the available GPU
)
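
# Alternative sketch (my assumption, not from the original script): newer
# transformers releases deprecate the bare load_in_4bit flag in favor of an
# explicit quantization config, which would look roughly like this:
# from transformers import BitsAndBytesConfig
# model = AutoModelForCausalLM.from_pretrained(
#     base_model_name,
#     quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
#     device_map="auto",
# )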

# Load LoRA adapter
model = PeftModel.from_pretrained(model, lora_model_name)
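
# Optional (my note, not in the original): PeftModel.merge_and_unload() folds the
# adapter weights into the base model and removes the PEFT wrappers, which can
# speed up generation. Merging over a 4-bit base has caveats, so it stays commented out:
# model = model.merge_and_unload()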

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
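
# Note (my addition): LLaMA tokenizers ship without a pad token. Single-prompt
# generation works fine, but batched generation would need something like:
# tokenizer.pad_token = tokenizer.eos_token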

# Generate a response for a single user prompt
def chatbot_response(user_input):
    # Tokenizer output defaults to CPU tensors; move them to the model's device
    # so generation doesn't fail against GPU-resident weights
    inputs = tokenizer(user_input, return_tensors="pt").to(model.device)
    # max_new_tokens bounds only the generated text; max_length would also count the prompt
    outputs = model.generate(**inputs, max_new_tokens=400)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
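
# Quick smoke test before wiring up the UI (hypothetical prompt, my addition):
# print(chatbot_response("Who was Ignatius of Antioch?"))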

# Launch the Gradio chatbot
interface = gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(lines=2, placeholder="Ask me anything..."),
    outputs="text",
    title="Early Christian Church Fathers Fine-Tuned LLaMA 3.1 8B with LoRA",
    description="A chatbot using my fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.",
)

interface.launch()  # share=True is unnecessary inside a hosted Space, which is already public
"""