# train.py
# Your dataset.py must provide a create_huggingface_dataset() function, imported below as:
#   from dataset import create_huggingface_dataset
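#
# A hedged sketch of the interface this script assumes (adjust to your own dataset.py;
# the signature mirrors the call site below and the keys mirror what the collator consumes):
#
#   def create_huggingface_dataset(dataset_path, tokenizer, max_length):
#       """Return a datasets.Dataset whose examples each contain:
#            "input_ids":       list[int]          (un-padded token ids)
#            "attention_mask":  list[int]          (1 per real token)
#            "emotion_vectors": list[list[float]]  (one EMOTION_DIMENSIONS-sized vector per token)
#          Padding and label masking are handled by DataCollatorForEmotionalLlama below.
#       """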
import os
import torch
from transformers import AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader
from dataclasses import dataclass
from typing import Dict

# Import the EmotionalLlamaModel and constants from the emotional_gemma.py file
from emotional_gemma import EmotionalLlamaModel, EMOTION_DIMENSIONS, EMOTION_DIMENSIONS_REFERENCE, MODEL_NAME



# Define the DataCollator for handling padding and adding emotion vectors
@dataclass
class DataCollatorForEmotionalLlama:
    tokenizer: AutoTokenizer
    max_length: int
    emotion_dim: int = EMOTION_DIMENSIONS # Use the constant from emotional_gemma

    def __call__(self, examples: list) -> Dict[str, torch.Tensor]:
        # Separate the components from the examples
        input_ids_list = [example.get("input_ids", []) for example in examples]
        attention_mask_list = [example.get("attention_mask", []) for example in examples]
        emotion_vectors_list = [example.get("emotion_vectors", []) for example in examples]

        # --- Find the token ID for the start of the model's turn ---
        # This is used to mask out user input and padding from the labels
        # Ensure your tokenizer and dataset preparation consistently include this sequence.
        try:
             # Tokenize the specific sequence marking the model's turn start.
             # add_special_tokens=False is crucial here to get just the tokens for the string.
             model_prompt_tokens = self.tokenizer(
                 "<start_of_turn>model\n",
                 add_special_tokens=False
             ).input_ids
             if not model_prompt_tokens:
                 raise ValueError("Tokenizer produced empty list for model prompt sequence.")
             # print(f"DEBUG: Detected model prompt start tokens: {model_prompt_tokens} (decoded: '{self.tokenizer.decode(model_prompt_tokens)}')")
        except Exception as e:
             print(f"ERROR: Could not tokenize model prompt '<start_of_turn>model\\n'. Check tokenizer and template format. Error: {e}")
             raise ValueError("Cannot proceed without identifying model start tokens for label masking.") from e

        batch_input_ids = []
        batch_attention_mask = []
        batch_labels = []
        batch_emotion_vectors = []

        # Process each example in the batch
        for i in range(len(input_ids_list)):
            input_ids = input_ids_list[i]
            attention_mask = attention_mask_list[i]
            emotion_vectors = emotion_vectors_list[i]

            # --- Padding ---
            seq_len = len(input_ids)
            pad_len = self.max_length - seq_len

            # Truncate if sequence is longer than max_length (should ideally be handled in dataset)
            if pad_len < 0:
                 input_ids = input_ids[:self.max_length]
                 attention_mask = attention_mask[:self.max_length]
                 emotion_vectors = emotion_vectors[:self.max_length]
                 seq_len = self.max_length
                 pad_len = 0 # Recalculate pad_len after truncation

            # Pad input IDs, attention mask, and emotion vectors
            padded_input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
            padded_attention_mask = attention_mask + [0] * pad_len
            # Pad emotion vectors with zero vectors
            padded_emotion_vectors = emotion_vectors + [[0.0] * self.emotion_dim] * pad_len

            # --- Create Labels and Mask User/Padding Tokens ---
            labels = list(padded_input_ids) # Start with a copy of input_ids for labels

            # Find the start index of the model's response to mask previous tokens
            model_start_idx = -1
            # Search for the model prompt token sequence within the original input_ids
            for k in range(seq_len - len(model_prompt_tokens) + 1):
                 if input_ids[k : k + len(model_prompt_tokens)] == model_prompt_tokens:
                      model_start_idx = k
                      break

            if model_start_idx != -1:
                # Mask everything before and including the model's prompt sequence
                for j in range(model_start_idx + len(model_prompt_tokens)):
                    labels[j] = -100
            else:
                print(f"Warning: Model prompt sequence not found in sample {i}. Masking all labels.")
                labels = [-100] * self.max_length # Mask everything

            # Mask padding tokens regardless of model prompt finding
            for j in range(seq_len, self.max_length): # Only mask the padded part
                 labels[j] = -100

            # Sanity check: ensure all lists have the correct length
            if len(padded_input_ids) != self.max_length or \
               len(padded_attention_mask) != self.max_length or \
               len(labels) != self.max_length or \
               len(padded_emotion_vectors) != self.max_length:
                raise ValueError(f"Length mismatch in collator for sample {i} after padding/truncation!")

            batch_input_ids.append(padded_input_ids)
            batch_attention_mask.append(padded_attention_mask)
            batch_labels.append(labels)
            batch_emotion_vectors.append(padded_emotion_vectors)

        # Convert lists to tensors
        batch = {
            "input_ids": torch.tensor(batch_input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(batch_attention_mask, dtype=torch.long),
            "labels": torch.tensor(batch_labels, dtype=torch.long),
            "emotion_vector": torch.tensor(batch_emotion_vectors, dtype=torch.float),
        }

        return batch
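
# A minimal, hedged usage sketch of the collator (names below are illustrative and assume
# a `tokenizer` and tokenized `dataset` prepared as elsewhere in this script):
#
#   collator = DataCollatorForEmotionalLlama(tokenizer=tokenizer, max_length=128)
#   batch = collator([dataset[0], dataset[1]])
#   # batch["input_ids"]      -> LongTensor  of shape [2, 128]
#   # batch["labels"]         -> LongTensor  of shape [2, 128]  (-100 on user turns and padding)
#   # batch["emotion_vector"] -> FloatTensor of shape [2, 128, EMOTION_DIMENSIONS]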


# Subclass Trainer to potentially customize dataloader behavior
class CustomTrainer(Trainer):
    def get_train_dataloader(self) -> DataLoader:
        """

        Overrides the method to explicitly use the provided data collator.

        This is mostly for clarity or if the default Trainer behavior needs bypassing.

        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        # Use the data_collator provided during Trainer initialization
        data_collator = self.data_collator

        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            shuffle=True,  # Important for training
            collate_fn=data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

def train_emotional_llama(
    model_name=MODEL_NAME,                   # Default model name from emotional_gemma.py
    dataset_path="./dataset.json",           # Path to your dataset file
    output_dir="./emotional-gemma-output",   # Directory to save results
    max_length=128,                          # Max sequence length for training
    learning_rate=1e-4,                      # Base learning rate for LoRA
    emotion_proj_lr=2e-3,                    # Higher learning rate for the emotion projection layer
    num_train_epochs=2,
    per_device_batch_size=12,
    gradient_accumulation_steps=1,
    use_lora=True,                           # Whether to use LoRA
):
    """

    Sets up and runs the training for the EmotionalLlamaModel.

    """
    print(f"Loading tokenizer: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # Set pad_token to eos_token for Gemma if not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Set padding side to right for causal models
    tokenizer.padding_side = "right"

    print(f"Loading base model: {model_name}")
    # Load the custom EmotionalLlamaModel
    model = EmotionalLlamaModel.from_pretrained(model_name)

    if use_lora:
        print("Applying LoRA configuration")
        # Define LoRA configuration
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=32, # LoRA rank
            lora_alpha=32, # LoRA scaling factor
            # lora_dropout=0.05, # Dropout for LoRA layers (optional)
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] # Modules to apply LoRA to
        )
        # Get the PEFT model by wrapping the base model
        model = get_peft_model(model, peft_config)
        # Print trainable parameters summary
        model.print_trainable_parameters()

    # Ensure the emotion projection layer remains trainable.
    # This is needed when LoRA is applied, because wrapping the model with PEFT
    # freezes all non-LoRA parameters by default.
    print("Setting emotion_proj_embed requires_grad=True")
    for param in model.emotion_proj_embed.parameters():
        param.requires_grad = True

    # --- Load and Prepare Dataset ---
    print(f"Loading dataset from: {dataset_path}")
    # Import and use your dataset creation function
    try:
        from dataset import create_huggingface_dataset
        dataset = create_huggingface_dataset(dataset_path, tokenizer, max_length)
        print(f"Dataset loaded with {len(dataset)} examples.")
    except ImportError:
        print("Error: Could not import 'create_huggingface_dataset' from dataset.py")
        print("Please ensure dataset.py exists and contains the necessary function.")
        print("Example dummy dataset creation:")
        # --- PLACEHOLDER: Dummy Dataset Creation Example ---
        # This fallback runs only if dataset.py is not available.
        # Replace this section with your actual dataset loading and processing logic.
        dummy_data = [
            {"text": "<start_of_turn>user\nHello!<end_of_turn>\n<start_of_turn>model\nHi there!", "emotion_vectors": [[0.1]*EMOTION_DIMENSIONS] * 20},
            {"text": "<start_of_turn>user\nHow are you?<end_of_turn>\n<start_of_turn>model\nI'm feeling good today.", "emotion_vectors": [[0.8]*EMOTION_DIMENSIONS] * 25},
        ]
        def dummy_process(example):
            # Simple tokenization for dummy data; leave padding to the collator,
            # which also masks the padded positions out of the labels.
            tokenized = tokenizer(example["text"], truncation=True, max_length=max_length)
            n_tokens = len(tokenized["input_ids"])
            # Align the per-token emotion vectors with the tokenized length.
            vectors = example["emotion_vectors"][:n_tokens]
            if len(vectors) < n_tokens:
                vectors += [[0.0] * EMOTION_DIMENSIONS] * (n_tokens - len(vectors))
            tokenized["emotion_vectors"] = vectors
            return tokenized

        from datasets import Dataset
        dataset = Dataset.from_list(dummy_data).map(dummy_process)
        print("Created a dummy dataset. REPLACE THIS with your actual dataset loading!")
        # --- End Dummy Dataset Example ---

    # Initialize the data collator
    data_collator = DataCollatorForEmotionalLlama(tokenizer=tokenizer, max_length=max_length)

    # --- Training Arguments ---
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps, # Accumulate gradients over steps
        warmup_ratio=0.1, # Linear warmup over the first 10% of steps
        weight_decay=0.01, # L2 regularization for most parameters
        logging_steps=10, # Log training progress every N steps
        save_steps=200, # Save checkpoint every N steps
        save_total_limit=2, # Keep only the last N checkpoints
        report_to="none", # Disable reporting to external platforms like W&B
        push_to_hub=False, # Do not push to Hugging Face Hub
        bf16=torch.cuda.is_bf16_supported(), # Use bf16 if supported
        fp16=not torch.cuda.is_bf16_supported(), # Otherwise use fp16
        lr_scheduler_type="cosine", # Cosine annealing learning rate scheduler
        optim="adamw_torch" # PyTorch AdamW optimizer
    )

    # --- Optimizer Setup ---
    # Split parameters for different learning rates and weight decay
    # LoRA parameters and other model parameters (if any are trainable beyond LoRA)
    main_params = [p for n, p in model.named_parameters() if p.requires_grad and "emotion_proj" not in n]
    # Emotion projection layer parameters
    emotion_params = [p for n, p in model.named_parameters() if "emotion_proj" in n and p.requires_grad]

    # Define parameter groups for the optimizer
    optimizer_grouped_parameters = [
        # Group for main parameters (LoRA, etc.) with weight decay
        {"params": main_params, "lr": training_args.learning_rate, "weight_decay": training_args.weight_decay},
        # Group for emotion projection layer parameters with a higher LR and NO weight decay
        {"params": emotion_params, "lr": emotion_proj_lr, "weight_decay": 0.0}
    ]

    # Initialize the optimizer
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters)

    # --- Initialize Trainer ---
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        optimizers=(optimizer, None),  # Custom optimizer; scheduler left as None so the Trainer builds it from training_args
    )

    # --- Optional: Debugging Prints for Dataloader ---
    # print("\n--- Debugging Data Collator Output (First Batch) ---")
    # for step, batch in enumerate(trainer.get_train_dataloader()):
    #     print(f"  Step {step + 1}:")
    #     print(f"    input_ids shape: {batch['input_ids'].shape}")
    #     print(f"    attention_mask shape: {batch['attention_mask'].shape}")
    #     print(f"    emotion_vector shape: {batch['emotion_vector'].shape}")
    #     print(f"    labels shape: {batch['labels'].shape}")
    #     # Print slices or stats for verification
    #     # print(f"    input_ids (first row): {batch['input_ids'][0]}")
    #     # print(f"    labels (first row): {batch['labels'][0]}")
    #     # print(f"    emotion_vector (first row, few elements): {batch['emotion_vector'][0, :10, :2]}")
    #     print(f"    emotion_vector batch MIN: {batch['emotion_vector'].min()}")
    #     print(f"    emotion_vector batch MAX: {batch['emotion_vector'].max()}")
    #     print(f"    emotion_vector batch MEAN: {batch['emotion_vector'].mean()}")
    #     break # Only print the first batch for debug
    # print("--- End Debugging Data Collator Output ---\n")
    # --- End Debugging Prints ---


    # --- Start Training ---
    print("Starting training...")
    trainer.train()
    print("Training finished.")

    # --- Save the Model ---
    # Trainer.save_model saves the full model checkpoint by default.
    # If using PEFT, model.save_pretrained() saves only the adapter weights.
    # We want to save BOTH the PEFT adapter and the custom layer weights.

    # Save the PEFT adapter weights if using LoRA
    if use_lora:
        print(f"Saving PEFT adapter model to {output_dir}")
        # This saves adapter_model.safetensors and adapter_config.json
        model.save_pretrained(output_dir)
    else:
        # If not using LoRA, save the full model checkpoint
        print(f"Saving full model checkpoint to {output_dir}")
        trainer.save_model(output_dir)

    # Manually Save Custom Layer Weights (the emotion_proj_embed layer)
    print(f"Saving custom emotion_proj_embed weights...")
    # Access the custom layer, handling the case if the model is wrapped by PEFT
    if hasattr(model, "base_model"): # Check if it's a PeftModel
        emotion_layer = model.base_model.emotion_proj_embed
    else: # If not using PEFT, the layer is directly on the model
        emotion_layer = model.emotion_proj_embed

    # Get the state dictionary of the custom layer
    emotion_state_dict = emotion_layer.state_dict()
    # Define the save path within the output directory
    save_path_emotion = os.path.join(output_dir, "emotion_proj_weights.pth")
    # Save the state dictionary
    torch.save(emotion_state_dict, save_path_emotion)
    print(f"Custom emotion_proj_embed weights saved to: {save_path_emotion}")

    # Return the trained model and tokenizer
    return model, tokenizer

if __name__ == "__main__":
    # Make sure you have a dataset.py and a dataset.json file, or rely on the dummy dataset fallback above.
    # Replace dataset_path below with the actual path to your dataset.
    train_emotional_llama(
        dataset_path="./dataset.json", # Replace with your dataset path
        output_dir="./emotional-gemma-output", # Output directory
        max_length=128,
        num_train_epochs=3,
        per_device_batch_size=4, # Adjust based on your GPU memory
        gradient_accumulation_steps=8, # Effective batch size per device = 4 * 8 = 32
        learning_rate=2e-4, # Base LR for LoRA
        emotion_proj_lr=5e-3, # Higher LR for emotion layer
        use_lora=True
    )