import os
import argparse
import json
from datetime import datetime
from typing import Dict, List, Any

try:
    import datasets
    from transformers import AutoTokenizer, TrainingArguments
    from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
    from trl import SFTTrainer
    import torch
except ImportError:
    print("Installing required packages...")
    import subprocess
    subprocess.check_call([
        "pip", "install",
        "transformers>=4.36.0", "peft>=0.7.0", "datasets>=2.14.0",
        "accelerate>=0.25.0", "trl>=0.7.1", "bitsandbytes>=0.40.0", "torch>=2.0.0"
    ])
    import datasets
    from transformers import AutoTokenizer, TrainingArguments
    from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
    from trl import SFTTrainer
    import torch


def load_model_and_tokenizer(model_name_or_path: str, adapter_path: str = None,
                             quantize: bool = True, token: str = None):
    """
    Load the model and tokenizer, with optional adapter and quantization.

    This will load the model in 4-bit quantization by default (which is needed
    for such a large model) and can optionally load an existing adapter.
    """
    from transformers import BitsAndBytesConfig, AutoModelForCausalLM

    print(f"Loading model: {model_name_or_path}")

    # Configure for quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=quantize,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True
    ) if quantize else None

    # Load the model
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        quantization_config=quantization_config,
        device_map="auto",
        token=token
    )

    # Load adapter if provided
    if adapter_path:
        print(f"Loading adapter from {adapter_path}")
        from peft import PeftModel
        model = PeftModel.from_pretrained(model, adapter_path)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=token)

    # Ensure we have a pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


def prepare_dataset(data_path: str):
    """Load and prepare datasets from JSON files."""
    # Load datasets
    if os.path.isdir(data_path):
        train_path = os.path.join(data_path, "train.json")
        val_path = os.path.join(data_path, "validation.json")
        if not (os.path.exists(train_path) and os.path.exists(val_path)):
            raise ValueError(f"Training data files not found in {data_path}")
    else:
        raise ValueError(f"Data path {data_path} is not a directory")

    # Load JSON files
    with open(train_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)
    with open(val_path, 'r', encoding='utf-8') as f:
        val_data = json.load(f)

    # Convert to datasets
    train_dataset = datasets.Dataset.from_list(train_data)
    eval_dataset = datasets.Dataset.from_list(val_data)

    print(f"Loaded {len(train_dataset)} training examples and {len(eval_dataset)} validation examples")
    return train_dataset, eval_dataset
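
# NOTE (assumption): the trainer below reads a plain "text" column, so
# train.json / validation.json are expected to be JSON arrays of records like:
#
#     [
#         {"text": "<formatted prompt + response for example 1>"},
#         {"text": "<formatted prompt + response for example 2>"}
#     ]
#
# If your data uses a different schema (e.g. separate prompt/response fields),
# rename the column here or pass a formatting_func to SFTTrainer instead.
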
def finetune(
    model_name: str,
    dataset_path: str,
    output_dir: str,
    hub_model_id: str = None,
    hf_token: str = None,
    use_peft: bool = True,
    num_train_epochs: int = 3,
    learning_rate: float = 2e-5,
    bf16: bool = True,
    quantize: bool = True,
    max_seq_length: int = 2048,
    gradient_accumulation_steps: int = 2
):
    """Fine-tune the model with PEFT on the provided dataset."""
    # Set up output directory
    if not output_dir:
        output_dir = f"llama3-finetuned-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    os.makedirs(output_dir, exist_ok=True)

    # Load datasets
    train_dataset, eval_dataset = prepare_dataset(dataset_path)

    # Load base model
    model, tokenizer = load_model_and_tokenizer(
        model_name,
        quantize=quantize,
        token=hf_token
    )

    # Set up PEFT configuration if using PEFT
    if use_peft:
        print("Setting up PEFT (Parameter-Efficient Fine-Tuning)")

        # Prepare model for k-bit training if quantized
        if quantize:
            model = prepare_model_for_kbit_training(model)

        # Set up LoRA configuration
        peft_config = LoraConfig(
            r=16,           # Rank dimension
            lora_alpha=32,  # Scale parameter
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"
            ]
        )
    else:
        peft_config = None

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=1,  # Adjust based on GPU memory
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        weight_decay=0.01,
        max_grad_norm=0.3,
        logging_steps=10,
        optim="paged_adamw_32bit",
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        evaluation_strategy="steps",
        eval_steps=0.1,   # Evaluate every 10% of training
        save_strategy="steps",
        save_steps=0.1,   # Save every 10% of training
        save_total_limit=3,
        bf16=bf16,        # Use bfloat16 precision if available
        push_to_hub=bool(hub_model_id),
        hub_model_id=hub_model_id,
        hub_token=hf_token,
    )

    # Initialize the SFT trainer
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        tokenizer=tokenizer,
        dataset_text_field="text",  # Assumes a "text" column (see note above)
        max_seq_length=max_seq_length,
    )

    # Train the model
    print("Starting training...")
    trainer.train()

    # Save the fine-tuned model
    print(f"Saving model to {output_dir}")
    trainer.save_model()

    # Push to hub if specified
    if hub_model_id and hf_token:
        print(f"Pushing model to Hugging Face Hub: {hub_model_id}")
        trainer.push_to_hub()

    return output_dir
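
# The helper below is a minimal inference sketch (not called anywhere in this
# script): it reuses load_model_and_tokenizer() to attach the saved LoRA adapter
# from the fine-tuning output directory to the base model and generate a single
# completion as a sanity check. The sampling settings and prompt format are
# assumptions -- match whatever template your training data used.
def generate_sample(base_model: str, adapter_dir: str, prompt: str,
                    max_new_tokens: int = 256, hf_token: str = None) -> str:
    """Generate one completion from the fine-tuned adapter (quick sanity check)."""
    model, tokenizer = load_model_and_tokenizer(
        base_model,
        adapter_path=adapter_dir,
        quantize=True,
        token=hf_token
    )
    model.eval()

    # Tokenize the prompt and move it to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
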
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fine-tune Llama 3.3 with your data")
    parser.add_argument("--model_name", type=str, default="nvidia/Llama-3_3-Nemotron-Super-49B-v1",
                        help="Base model to fine-tune")
    parser.add_argument("--dataset_path", type=str, required=True,
                        help="Path to the directory containing train.json and validation.json")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Directory to save the fine-tuned model")
    parser.add_argument("--hub_model_id", type=str, default=None,
                        help="Hugging Face Hub model ID to push the model to")
    parser.add_argument("--hf_token", type=str, default=None,
                        help="Hugging Face token for accessing gated models and pushing to hub")
    parser.add_argument("--no_peft", action='store_true',
                        help="Disable PEFT/LoRA (not recommended for large models)")
    parser.add_argument("--no_quantize", action='store_true',
                        help="Disable quantization (requires much more VRAM)")
    parser.add_argument("--no_bf16", action='store_true',
                        help="Disable bf16 precision")
    parser.add_argument("--epochs", type=int, default=3,
                        help="Number of training epochs")
    parser.add_argument("--learning_rate", type=float, default=2e-5,
                        help="Learning rate")
    parser.add_argument("--max_seq_length", type=int, default=2048,
                        help="Maximum sequence length for training")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=2,
                        help="Gradient accumulation steps")

    args = parser.parse_args()

    # Get token from environment if not provided
    hf_token = args.hf_token or os.environ.get("HF_TOKEN")

    finetune(
        model_name=args.model_name,
        dataset_path=args.dataset_path,
        output_dir=args.output_dir,
        hub_model_id=args.hub_model_id,
        hf_token=hf_token,
        use_peft=not args.no_peft,
        num_train_epochs=args.epochs,
        learning_rate=args.learning_rate,
        bf16=not args.no_bf16,
        quantize=not args.no_quantize,
        max_seq_length=args.max_seq_length,
        gradient_accumulation_steps=args.gradient_accumulation_steps
    )
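
# Example invocation (hypothetical script name and paths; adjust to your setup):
#
#   python finetune.py \
#       --dataset_path ./data \
#       --output_dir ./llama3-finetuned \
#       --hub_model_id your-username/llama3-finetuned \
#       --epochs 3
#
# The Hugging Face token can be supplied via --hf_token or the HF_TOKEN
# environment variable.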