Borislav18 committed on
Commit f77e8bf · verified · 1 Parent(s): c3dfa5f

Upload 2 files

Files changed (2)
  1. finetune_llama3.py +254 -0
  2. pdf_processor.py +112 -0
finetune_llama3.py ADDED
@@ -0,0 +1,254 @@
+import os
+import argparse
+import json
+from datetime import datetime
+from typing import Dict, List, Any
+
+try:
+    import datasets
+    from transformers import AutoTokenizer, TrainingArguments
+    from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
+    from trl import SFTTrainer
+    import torch
+except ImportError:
+    print("Installing required packages...")
+    import subprocess
+    subprocess.check_call(["pip", "install",
+                           "transformers>=4.36.0",
+                           "peft>=0.7.0",
+                           "datasets>=2.14.0",
+                           "accelerate>=0.25.0",
+                           "trl>=0.7.1",
+                           "bitsandbytes>=0.40.0",
+                           "torch>=2.0.0"])
+    import datasets
+    from transformers import AutoTokenizer, TrainingArguments
+    from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
+    from trl import SFTTrainer
+    import torch
+
+def load_model_and_tokenizer(model_name_or_path: str,
+                             adapter_path: str = None,
+                             quantize: bool = True,
+                             token: str = None):
+    """
+    Load the model and tokenizer, with optional adapter and quantization.
+
+    This will load the model in 4-bit quantization by default (which is needed
+    for such a large model) and can optionally load an existing adapter.
+    """
+    from transformers import BitsAndBytesConfig, AutoModelForCausalLM
+
+    print(f"Loading model: {model_name_or_path}")
+
+    # Configure for quantization
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=quantize,
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True
+    ) if quantize else None
+
+    # Load the model
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name_or_path,
+        quantization_config=quantization_config,
+        device_map="auto",
+        token=token
+    )
+
+    # Load adapter if provided
+    if adapter_path:
+        print(f"Loading adapter from {adapter_path}")
+        from peft import PeftModel
+        model = PeftModel.from_pretrained(model, adapter_path)
+
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=token)
+
+    # Ensure we have a pad token
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    return model, tokenizer
+
+def prepare_dataset(data_path: str):
+    """Load and prepare datasets from JSON files."""
+    # Load datasets
+    if os.path.isdir(data_path):
+        train_path = os.path.join(data_path, "train.json")
+        val_path = os.path.join(data_path, "validation.json")
+
+        if not (os.path.exists(train_path) and os.path.exists(val_path)):
+            raise ValueError(f"Training data files not found in {data_path}")
+    else:
+        raise ValueError(f"Data path {data_path} is not a directory")
+
+    # Load JSON files
+    with open(train_path, 'r', encoding='utf-8') as f:
+        train_data = json.load(f)
+
+    with open(val_path, 'r', encoding='utf-8') as f:
+        val_data = json.load(f)
+
+    # Convert to datasets
+    train_dataset = datasets.Dataset.from_list(train_data)
+    eval_dataset = datasets.Dataset.from_list(val_data)
+
+    print(f"Loaded {len(train_dataset)} training examples and {len(eval_dataset)} validation examples")
+    return train_dataset, eval_dataset
+
+def finetune(
+    model_name: str,
+    dataset_path: str,
+    output_dir: str,
+    hub_model_id: str = None,
+    hf_token: str = None,
+    use_peft: bool = True,
+    num_train_epochs: int = 3,
+    learning_rate: float = 2e-5,
+    bf16: bool = True,
+    quantize: bool = True,
+    max_seq_length: int = 2048,
+    gradient_accumulation_steps: int = 2
+):
+    """Fine-tune the model with PEFT on the provided dataset."""
+    # Set up output directory
+    if not output_dir:
+        output_dir = f"llama3-finetuned-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Load datasets
+    train_dataset, eval_dataset = prepare_dataset(dataset_path)
+
+    # Load base model
+    model, tokenizer = load_model_and_tokenizer(
+        model_name,
+        quantize=quantize,
+        token=hf_token
+    )
+
+    # Set up PEFT configuration if using PEFT
+    if use_peft:
+        print("Setting up PEFT (Parameter-Efficient Fine-Tuning)")
+
+        # Prepare model for k-bit training if quantized
+        if quantize:
+            model = prepare_model_for_kbit_training(model)
+
+        # Set up LoRA configuration
+        peft_config = LoraConfig(
+            r=16,  # Rank dimension
+            lora_alpha=32,  # Scale parameter
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+            target_modules=[
+                "q_proj",
+                "k_proj",
+                "v_proj",
+                "o_proj",
+                "gate_proj",
+                "up_proj",
+                "down_proj"
+            ]
+        )
+    else:
+        peft_config = None
+
+    # Training arguments
+    training_args = TrainingArguments(
+        output_dir=output_dir,
+        num_train_epochs=num_train_epochs,
+        per_device_train_batch_size=1,  # Adjust based on GPU memory
+        gradient_accumulation_steps=gradient_accumulation_steps,
+        learning_rate=learning_rate,
+        weight_decay=0.01,
+        max_grad_norm=0.3,
+        logging_steps=10,
+        optim="paged_adamw_32bit",
+        lr_scheduler_type="cosine",
+        warmup_ratio=0.03,
+        evaluation_strategy="steps",
+        eval_steps=0.1,  # Evaluate every 10% of training
+        save_strategy="steps",
+        save_steps=0.1,  # Save every 10% of training
+        save_total_limit=3,
+        bf16=bf16,  # Use bfloat16 precision if available
+        push_to_hub=bool(hub_model_id),
+        hub_model_id=hub_model_id,
+        hub_token=hf_token,
+    )
+
+    # Initialize the SFT trainer
+    trainer = SFTTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        peft_config=peft_config,
+        tokenizer=tokenizer,
+        max_seq_length=max_seq_length,
+    )
+
+    # Train the model
+    print("Starting training...")
+    trainer.train()
+
+    # Save the fine-tuned model
+    print(f"Saving model to {output_dir}")
+    trainer.save_model()
+
+    # Push to hub if specified
+    if hub_model_id and hf_token:
+        print(f"Pushing model to Hugging Face Hub: {hub_model_id}")
+        trainer.push_to_hub()
+
+    return output_dir
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Fine-tune Llama 3.3 with your data")
+    parser.add_argument("--model_name", type=str, default="nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+                        help="Base model to fine-tune")
+    parser.add_argument("--dataset_path", type=str, required=True,
+                        help="Path to the directory containing train.json and validation.json")
+    parser.add_argument("--output_dir", type=str, default=None,
+                        help="Directory to save the fine-tuned model")
+    parser.add_argument("--hub_model_id", type=str, default=None,
+                        help="Hugging Face Hub model ID to push the model to")
+    parser.add_argument("--hf_token", type=str, default=None,
+                        help="Hugging Face token for accessing gated models and pushing to hub")
+    parser.add_argument("--no_peft", action='store_true',
+                        help="Disable PEFT/LoRA (not recommended for large models)")
+    parser.add_argument("--no_quantize", action='store_true',
+                        help="Disable quantization (requires much more VRAM)")
+    parser.add_argument("--no_bf16", action='store_true',
+                        help="Disable bf16 precision")
+    parser.add_argument("--epochs", type=int, default=3,
+                        help="Number of training epochs")
+    parser.add_argument("--learning_rate", type=float, default=2e-5,
+                        help="Learning rate")
+    parser.add_argument("--max_seq_length", type=int, default=2048,
+                        help="Maximum sequence length for training")
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=2,
+                        help="Gradient accumulation steps")
+
+    args = parser.parse_args()
+
+    # Get token from environment if not provided
+    hf_token = args.hf_token or os.environ.get("HF_TOKEN")
+
+    finetune(
+        model_name=args.model_name,
+        dataset_path=args.dataset_path,
+        output_dir=args.output_dir,
+        hub_model_id=args.hub_model_id,
+        hf_token=hf_token,
+        use_peft=not args.no_peft,
+        num_train_epochs=args.epochs,
+        learning_rate=args.learning_rate,
+        bf16=not args.no_bf16,
+        quantize=not args.no_quantize,
+        max_seq_length=args.max_seq_length,
+        gradient_accumulation_steps=args.gradient_accumulation_steps
+    )
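One gap worth noting in the trainer setup above: the datasets keep the instruction/input/output columns written by pdf_processor.py, but the SFTTrainer call passes neither a dataset_text_field nor a formatting_func, and trl generally needs one of the two for column names it does not recognize. Below is a minimal sketch of how a batched formatting function could be wired in; the "### Instruction / Input / Response" prompt template is an illustrative assumption, not something defined in this commit.

# Sketch only: reuses the model, training_args, datasets, peft_config and
# tokenizer created inside finetune() above. trl calls a batched
# formatting_func with a dict of columns and expects a list of strings back.
def formatting_prompts_func(batch):
    texts = []
    for instruction, context, response in zip(batch["instruction"], batch["input"], batch["output"]):
        # Prompt layout below is an assumed convention, not part of the commit.
        texts.append(f"### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n### Response:\n{response}")
    return texts

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    formatting_func=formatting_prompts_func,
)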
pdf_processor.py ADDED
@@ -0,0 +1,112 @@
+import os
+import json
+import argparse
+from pathlib import Path
+from typing import List, Dict, Any
+
+try:
+    from PyPDF2 import PdfReader
+    from tqdm import tqdm
+except ImportError:
+    print("Installing required dependencies...")
+    import subprocess
+    subprocess.check_call(["pip", "install", "PyPDF2", "tqdm"])
+    from PyPDF2 import PdfReader
+    from tqdm import tqdm
+
+def extract_text_from_pdf(pdf_path: str) -> str:
+    """Extract text from a PDF file."""
+    try:
+        reader = PdfReader(pdf_path)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text() + "\n"
+        return text
+    except Exception as e:
+        print(f"Error extracting text from {pdf_path}: {e}")
+        return ""
+
+def process_pdfs(pdf_dir: str, output_dir: str, chunk_size: int = 1000) -> List[Dict[str, Any]]:
+    """Process all PDFs in a directory and save the extracted text."""
+    pdf_files = list(Path(pdf_dir).glob("*.pdf"))
+
+    if not pdf_files:
+        raise ValueError(f"No PDF files found in {pdf_dir}")
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    all_data = []
+
+    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
+        try:
+            file_name = pdf_file.stem
+            print(f"Processing {file_name}")
+
+            text = extract_text_from_pdf(str(pdf_file))
+            if not text.strip():
+                print(f"Warning: No text extracted from {file_name}")
+                continue
+
+            # Split into chunks to avoid context length issues
+            words = text.split()
+            for i in range(0, len(words), chunk_size):
+                chunk = " ".join(words[i:i+chunk_size])
+                if len(chunk.strip()) > 100:  # Ensure chunk has enough content
+                    data_point = {
+                        "text": chunk,
+                        "source": file_name,
+                        "chunk_id": i // chunk_size
+                    }
+                    all_data.append(data_point)
+
+        except Exception as e:
+            print(f"Error processing {pdf_file}: {e}")
+
+    # Save all data to a single JSON file
+    with open(os.path.join(output_dir, "pdf_data.json"), "w", encoding="utf-8") as f:
+        json.dump(all_data, f, ensure_ascii=False, indent=2)
+
+    print(f"Processed {len(pdf_files)} PDFs into {len(all_data)} text chunks")
+    return all_data
+
+def prepare_training_data(pdf_data: List[Dict[str, Any]], output_dir: str):
+    """Prepare data in the format needed for fine-tuning LLMs."""
+    training_data = []
+
+    for item in pdf_data:
+        # Format for instruction fine-tuning
+        train_item = {
+            "instruction": "Use the following text from the document to answer questions or generate content about the topics it covers.",
+            "input": item["text"][:500],  # Use beginning of text as input
+            "output": item["text"][500:],  # Use rest of text as output
+        }
+        training_data.append(train_item)
+
+    # Create train/validation split (90/10)
+    split_idx = int(len(training_data) * 0.9)
+    train_data = training_data[:split_idx]
+    val_data = training_data[split_idx:]
+
+    # Save splits
+    os.makedirs(os.path.join(output_dir, "training_data"), exist_ok=True)
+
+    with open(os.path.join(output_dir, "training_data", "train.json"), "w", encoding="utf-8") as f:
+        json.dump(train_data, f, ensure_ascii=False, indent=2)
+
+    with open(os.path.join(output_dir, "training_data", "validation.json"), "w", encoding="utf-8") as f:
+        json.dump(val_data, f, ensure_ascii=False, indent=2)
+
+    print(f"Created training dataset: {len(train_data)} train, {len(val_data)} validation examples")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Process PDFs and prepare training data")
+    parser.add_argument("--pdf_dir", type=str, required=True, help="Directory containing PDF files")
+    parser.add_argument("--output_dir", type=str, default="processed_data", help="Output directory for processed data")
+    parser.add_argument("--chunk_size", type=int, default=1000, help="Number of words per chunk")
+
+    args = parser.parse_args()
+
+    pdf_data = process_pdfs(args.pdf_dir, args.output_dir, args.chunk_size)
+    prepare_training_data(pdf_data, args.output_dir)
+
+    print("PDF processing complete. Data is ready for fine-tuning.")