starnernj committed
Commit 9ee6131 · verified · 1 Parent(s): c17a736

Update app.py

Files changed (1)
  1. app.py +29 -40
app.py CHANGED
@@ -1,12 +1,13 @@
 import os
-
-# Disable CUDA visibility at the start
-os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Prevents CUDA initialization
-
 import spaces
 import gradio as gr
 from huggingface_hub import InferenceClient, login
 import time
+import traceback
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel, PeftConfig
+import bitsandbytes
+import torch
 
 
 @spaces.GPU  # Forces GPU allocation before execution
@@ -19,47 +20,34 @@ base_model_name = "meta-llama/Llama-3.1-8B"
 # Your fine-tuned LoRA adapter (uploaded to Hugging Face)
 lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"
 
+# Login because LLaMA 3.1 8B is a gated model
+login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
 
-# Function to generate responses
-def chatbot_response(user_input):
-    # Re-enable CUDA inside the function for accelerate to manage
-    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Adjust based on ZeroGPU setup
-
-    import traceback
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-    from peft import PeftModel, PeftConfig
-    import bitsandbytes
-    from accelerate import Accelerator
-    import torch
-
-    # Check if CUDA is already initialized (for debugging)
-    if torch.cuda.is_initialized():
-        print("CUDA was already initialized before Accelerator!")
+# Enable 4-bit Quantization with BitsAndBytes
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,                     # Enables 4-bit quantization for memory efficiency
+    bnb_4bit_compute_dtype=torch.float16,  # Uses float16 for performance
+    bnb_4bit_use_double_quant=True,        # ✅ Optimizes quantization
+    bnb_4bit_quant_type="nf4"              # ✅ Normalized Float-4 for better accuracy
+)
 
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_name,
+    quantization_config=quantization_config,
+    device_map="auto"
+)
 
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(base_model_name)
 
-    accelerator = Accelerator()
+# Load LoRA Adapter
+print("Loading LoRA adapter...")
+model = PeftModel.from_pretrained(base_model, lora_model_name)
 
-    # Login because LLaMA 3.1 8B is a gated model
-    login(token=os.getenv("HuggingFaceFineGrainedReadToken"))
-
-    model = AutoModelForCausalLM.from_pretrained(
-        base_model_name,
-        device_map="cpu",
-        torch_dtype=torch.float32  # Avoid any GPU-related dtype defaults
-    )
-
-    # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-
-
-    # Load LoRA adapter
-    # model = PeftModel.from_pretrained(model, lora_model_name)
-
-    model = accelerator.prepare(model)
-
+# Function to generate responses
+def chatbot_response(user_input):
     try:
-        inputs = tokenizer(user_input, return_tensors="pt").to(device)
+        inputs = tokenizer(user_input, return_tensors="pt").to("cuda")
         outputs = model.generate(**inputs, max_length=200)
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
@@ -77,4 +65,5 @@ interface = gr.Interface(
     description="A chatbot using a fine-tuned LoRA adapter on LLaMA 3.1 8B, tuned on thousands of writings of the early Christian Church Fathers.",
 )
 
-interface.launch()
+if __name__ == "__main__":
+    interface.launch()
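
For quick verification of the loading path this commit introduces, the following is a minimal sketch that exercises the same 4-bit quantized base model plus LoRA adapter flow outside Gradio and Spaces. It assumes access to the gated meta-llama/Llama-3.1-8B model, a HuggingFaceFineGrainedReadToken environment variable, and a CUDA-capable GPU; the test prompt is an arbitrary example.

# Minimal, Gradio-free sketch of the loading path in this commit (assumptions above).
import os

import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Authenticate for the gated base model.
login(token=os.getenv("HuggingFaceFineGrainedReadToken"))

base_model_name = "meta-llama/Llama-3.1-8B"
lora_model_name = "starnernj/Early-Christian-Church-Fathers-LLaMA-3.1-Fine-Tuned"

# Same 4-bit NF4 configuration as in the commit.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Load the quantized base model, the tokenizer, and attach the LoRA adapter.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = PeftModel.from_pretrained(base_model, lora_model_name)

# One example prompt through the same generate call used by chatbot_response.
inputs = tokenizer("Who was Polycarp of Smyrna?", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

With device_map="auto", accelerate handles placement of the quantized weights during from_pretrained, which is why the earlier manual Accelerator() setup and CUDA_VISIBLE_DEVICES juggling removed by this commit are no longer needed.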