import os

# Disable CUDA visibility at the start
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Prevents CUDA initialization

import spaces
import gradio as gr
from huggingface_hub import InferenceClient, login

# Base model (LLaMA 3.1 8B) from Meta
base_model_name = "meta-llama/Llama-3.1-8B"

# Your fine-tuned LoRA adapter (uploaded to Hugging Face)
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"


# Function to generate responses
def chatbot_response(user_input):
    # Re-enable CUDA inside the function for accelerate to manage
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Adjust based on ZeroGPU setup
    
    import traceback
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
    from peft import PeftModel, PeftConfig
    import bitsandbytes
    from accelerate import Accelerator
    import torch

    # Check if CUDA is already initialized (for debugging)
    if torch.cuda.is_initialized():
        print("CUDA was already initialized before Accelerator!")

    @spaces.GPU  # Forces GPU allocation before execution
    def force_gpu_allocation():
        pass  # Dummy function to trigger GPU setup

    force_gpu_allocation()  # Must actually be called for the decorator to request a GPU

    accelerator = Accelerator()

    # Login because LLaMA 3.1 8B is a gated model
    login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
    
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="cpu",
        torch_dtype=torch.float32  # Avoid any GPU-related dtype defaults
    )
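
    # Note: bitsandbytes / BitsAndBytesConfig are imported above but unused here. If 4-bit
    # quantized loading were desired instead, a sketch (assumes a CUDA device is visible at
    # load time, unlike the CPU-only load above) would look like:
    #   bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
    #   model = AutoModelForCausalLM.from_pretrained(
    #       base_model_name, quantization_config=bnb_config, device_map="auto"
    #   )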
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    
    
    # Load LoRA adapter on top of the base model
    model = PeftModel.from_pretrained(model, lora_model_name)
    
    model = accelerator.prepare(model)
    
    try:
        inputs = tokenizer(user_input, return_tensors="pt").to(accelerator.device)
        outputs = model.generate(**inputs, max_length=200)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        error_message = f"AssertionError: {str(e)}\n{traceback.format_exc()}"
        print(error_message)  # ✅ Logs detailed error messages
        return "An error occurred. Check the logs for details."


# Launch the Gradio chatbot
interface = gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(lines=2, placeholder="Ask me about the Christian Church Fathers..."),
    outputs="text",
    title="Early Christian Church Fathers Fine-Tuned LLaMA 3.1 8B with LoRA",
    description="A chatbot using a fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.",
)

interface.launch()
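
# Usage sketch (not part of this app): once the Space is running, the endpoint can be
# queried programmatically with gradio_client. The Space ID below is hypothetical and
# only illustrates the call shape for a single-function gr.Interface.
#
# from gradio_client import Client
# client = Client("starnernj/early-christian-church-fathers-chatbot")  # hypothetical Space ID
# print(client.predict("Who was Polycarp of Smyrna?", api_name="/predict"))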