seemggoel committed on
Commit 74f9db3 · verified · 1 Parent(s): 41a91e8

Upload 3 files
Files changed (3)
  1. config.py +65 -0
  2. features.py +163 -0
  3. modal.py +300 -0
config.py ADDED
@@ -0,0 +1,65 @@
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    EarlyStoppingCallback
)

# def get_training_args(output_dir):
#     return TrainingArguments(
#         output_dir=output_dir,
#         num_train_epochs=5,  # Increased from 3
#         per_device_train_batch_size=4,
#         per_device_eval_batch_size=4,
#         gradient_accumulation_steps=8,  # Increased from 4
#         evaluation_strategy="steps",
#         eval_steps=50,  # More frequent evaluation
#         save_strategy="steps",
#         save_steps=50,
#         logging_dir=f"{output_dir}/logs",
#         logging_strategy="steps",
#         logging_steps=10,
#         learning_rate=5e-5,  # Lower learning rate for continued training
#         weight_decay=0.02,  # Increased from 0.01
#         warmup_ratio=0.1,  # Increased from previous value
#         lr_scheduler_type="cosine_with_restarts",  # Changed from cosine
#         load_best_model_at_end=True,
#         metric_for_best_model="eval_loss",
#         greater_is_better=False,
#         fp16=True,
#         gradient_checkpointing=True,
#         gradient_checkpointing_kwargs={"use_reentrant": False},
#         report_to="tensorboard",
#         remove_unused_columns=False,
#         optim="adamw_torch_fused",  # Using fused optimizer
#         max_grad_norm=0.5,  # Added gradient clipping
#     )


def get_training_args(output_dir):
    return TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,  # Reduced epochs for continued training
        per_device_train_batch_size=2,  # Reduced batch size
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,  # Increased for stability
        evaluation_strategy="steps",
        eval_steps=25,  # More frequent evaluation
        save_strategy="steps",
        save_steps=25,
        learning_rate=1e-5,  # Lower learning rate for fine-tuning
        weight_decay=0.03,  # Increased for better regularization
        warmup_ratio=0.15,  # Increased warmup
        lr_scheduler_type="cosine_with_restarts",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=True,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        report_to="tensorboard",
        remove_unused_columns=False,
        optim="adamw_torch_fused",
        max_grad_norm=0.3,  # Reduced for stability
    )
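For context, a minimal sketch (not part of the committed files) of how get_training_args could be wired into a Trainer together with the EarlyStoppingCallback that config.py imports. The model, tokenizer, datasets, data collator, and output path below are placeholders.

from transformers import Trainer, EarlyStoppingCallback

def build_trainer(model, tokenizer, train_ds, eval_ds, data_collator,
                  output_dir="./phi3-multimodal"):  # hypothetical output path
    args = get_training_args(output_dir)
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
        # Stop if eval_loss has not improved for 3 consecutive evaluations
        # (every 25 steps with the arguments above).
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )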
features.py ADDED
@@ -0,0 +1,163 @@
# -*- coding: utf-8 -*-
"""prepare_dataset_tokenise.py - Optimized for Multimodal Fine-tuning"""

import os
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, WhisperProcessor, WhisperForConditionalGeneration, PreTrainedModel, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import json
import librosa
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import gc
from transformers import EarlyStoppingCallback
from torch.utils.checkpoint import checkpoint_sequential

# Initialize Whisper components for audio transcription
whisper_model_name = "openai/whisper-small"
whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)

# Load embeddings with error handling
def load_embeddings(file_path):
    try:
        data = np.load(file_path)
        if 'image_ids' in data and 'embeddings' in data:
            return {'ids': data['image_ids'], 'embeddings': data['embeddings']}
        else:
            raise ValueError(f"Unexpected structure in {file_path}.")
    except Exception as e:
        print(f"Error loading embeddings: {e}")
        return None

# Process audio files
def transcribe_speech(audiopath):
    try:
        speech, rate = librosa.load(audiopath, sr=16000)
        audio_input = whisper_processor(speech, return_tensors="pt", sampling_rate=16000)
        with torch.no_grad():
            generated_ids = whisper_model.generate(audio_input["input_features"])
        return whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return None


# Previous collator implementation, kept for reference:
# def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
#     batch = {"input_ids": self.tokenizer.pad({"input_ids": [f["input_ids"] for f in features]}, padding=True, return_tensors="pt")["input_ids"]}
#     batch["attention_mask"] = torch.ones_like(batch["input_ids"])
#     batch["labels"] = batch["input_ids"].clone()
#     if "image_embeddings" in features[0]:
#         batch["image_embeddings"] = torch.stack([f["image_embeddings"] for f in features])
#     if "audio_embeddings" in features[0]:
#         batch["audio_embeddings"] = torch.stack([f["audio_embeddings"] for f in features])
#     return batch
#
# Updated on 30th November to handle mismatched shapes
# ("boolean index did not match indexed array along dimension 1;
#   dimension is 591 but corresponding boolean dimension is 590").
@dataclass
class MultimodalDataCollator:
    tokenizer: Any

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Extract input_ids, attention_mask, and labels
        input_ids = [f["input_ids"] for f in features]
        attention_mask = [f["attention_mask"] for f in features]
        labels = [f["labels"] for f in features]

        # Convert tensors to lists if they are tensors
        input_ids = [ids.tolist() if isinstance(ids, torch.Tensor) else ids for ids in input_ids]
        attention_mask = [mask.tolist() if isinstance(mask, torch.Tensor) else mask for mask in attention_mask]
        labels = [lab.tolist() if isinstance(lab, torch.Tensor) else lab for lab in labels]

        # Pad sequences to the maximum length in the batch
        max_length = max(len(ids) for ids in input_ids)
        padded_input_ids = [ids + [self.tokenizer.pad_token_id] * (max_length - len(ids)) for ids in input_ids]
        padded_attention_mask = [mask + [0] * (max_length - len(mask)) for mask in attention_mask]
        padded_labels = [lab + [-100] * (max_length - len(lab)) for lab in labels]

        # Create a batch dictionary
        batch = {
            "input_ids": torch.tensor(padded_input_ids),
            "attention_mask": torch.tensor(padded_attention_mask),
            "labels": torch.tensor(padded_labels)
        }

        # Handle image and audio embeddings if present
        if "image_embeddings" in features[0]:
            batch["image_embeddings"] = torch.stack([f["image_embeddings"] for f in features])
        if "audio_embeddings" in features[0]:
            batch["audio_embeddings"] = torch.stack([f["audio_embeddings"] for f in features])

        return batch

# Dataset preparation with better error handling and modularization
def prepare_dataset(image_embeddings_path, dataset_path, cache_dir=None):
    image_embeddings = load_embeddings(image_embeddings_path)
    with open(dataset_path, 'r') as f:
        data = json.load(f)
    processed_data = [
        {
            "conversation": item["conversations"],
            "image_embedding": (
                image_embeddings['embeddings'][np.where(image_embeddings['ids'] == item['image'])[0][0]]
                if image_embeddings and "image" in item else None
            ),
            "audio_path": item.get("audio"),
        }
        for item in data
    ]
    dataset = Dataset.from_dict({
        "conversation": [item["conversation"] for item in processed_data],
        "image_embedding": [item.get("image_embedding") for item in processed_data],
        "audio_path": [item.get("audio_path") for item in processed_data],
    })
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    # tokenizer.chat_template = """
    # {% for message in messages %}
    # {% if message.role == 'system' %}<|system|>{{message.content}}<|endoftext|>{% elif message.role == 'user' %}<|user|>{{message.content}}<|endoftext|>{% elif message.role == 'assistant' %}<|assistant|>{{message.content}}<|endoftext|>{% endif %}{% endfor %}
    # """
    tokenizer.chat_template = """
    {% for message in messages %}
    {% if message.role == 'system' %}<|system|>{{message.content}}<|endofsystem|>{% elif message.role == 'user' %}<|user|>{{message.content}}<|endoftext|>{% elif message.role == 'assistant' %}<|assistant|>{{message.content}}<|endoftext|>{% endif %}{% endfor %}
    """
    prepared_dataset = dataset.map(
        lambda examples: prepare_example(examples, tokenizer),
        batched=True,
        remove_columns=dataset.column_names,
        batch_size=1,
    ).with_format("torch")
    # dataset_dict = DatasetDict({"train": prepared_dataset.train_test_split(test_size=0.1)["train"], "test": prepared_dataset.train_test_split(test_size=0.1)["test"]})
    # Split into train and a combined validation/test pool, then split that pool
    # once so validation and test cannot overlap (two independent random splits could).
    dataset_dict = prepared_dataset.train_test_split(test_size=0.2)
    holdout = dataset_dict["test"].train_test_split(test_size=0.5)
    dataset_dict["validation"] = holdout["train"]
    dataset_dict["test"] = holdout["test"]

    # Persist the prepared splits
    drive_path = "/content/drive/MyDrive/Cap_dataset"  # Replace with your desired path in Google Drive
    dataset_dict.save_to_disk(drive_path)

    # if cache_dir:
    #     os.makedirs(cache_dir, exist_ok=True)
    #     dataset_dict.save_to_disk(cache_dir)
    return dataset_dict, tokenizer

# Example preparation for dataset rows
def prepare_example(examples, tokenizer):
    image_embeddings, audio_embeddings, tokenized_inputs = [], [], []
    for idx, conv in enumerate(examples["conversation"]):
        image_embedding = torch.tensor(examples["image_embedding"][idx]) if examples["image_embedding"][idx] is not None else None
        transcription = transcribe_speech(examples["audio_path"][idx]) if "audio_path" in examples and examples["audio_path"][idx] else None
        for i in range(0, len(conv), 2):
            if i + 1 < len(conv):
                human_msg = conv[i]["value"].replace("<image>", "").replace("<audio>", "").strip()
                if transcription:
                    human_msg += f"\nAudio Transcription: {transcription}"
                gpt_msg = conv[i + 1]["value"]
                tokenized_input = tokenizer.apply_chat_template(
                    [
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": f"{human_msg}"},
                        {"role": "assistant", "content": gpt_msg},
                    ],
                    return_tensors="pt",
                    padding=True,
                )
                tokenized_inputs.append(tokenized_input.squeeze(0))
                if image_embedding is not None:
                    image_embeddings.append(image_embedding)
    max_length = max(seq.shape[0] for seq in tokenized_inputs)
    # Pad with the pad token id so the attention mask below marks padded positions as 0
    padded_inputs = [torch.nn.functional.pad(seq, (0, max_length - seq.shape[0]), value=tokenizer.pad_token_id) for seq in tokenized_inputs]
    input_ids = torch.stack(padded_inputs)
    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100  # Ignore padding in the loss
    result = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
    if image_embeddings:
        result["image_embeddings"] = torch.stack(image_embeddings)
    if audio_embeddings:
        result["audio_embeddings"] = torch.stack(audio_embeddings)
    return result
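As a quick sanity check, a small self-contained sketch (not part of the committed files) of how MultimodalDataCollator pads a ragged batch. The stand-in tokenizer only provides the one attribute the collator reads, and the pad id 32000 is an arbitrary placeholder, not a value fixed by this repository.

import torch
from types import SimpleNamespace

dummy_tokenizer = SimpleNamespace(pad_token_id=32000)  # placeholder pad id
collator = MultimodalDataCollator(tokenizer=dummy_tokenizer)

features = [
    {"input_ids": torch.tensor([5, 6, 7]), "attention_mask": torch.tensor([1, 1, 1]), "labels": torch.tensor([5, 6, 7])},
    {"input_ids": torch.tensor([8, 9]), "attention_mask": torch.tensor([1, 1]), "labels": torch.tensor([8, 9])},
]
batch = collator(features)
print(batch["input_ids"].shape)    # torch.Size([2, 3])
print(batch["attention_mask"][1])  # tensor([1, 1, 0]) - second example padded by one position
print(batch["labels"][1])          # tensor([8, 9, -100]) - padding ignored by the loss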
modal.py ADDED
@@ -0,0 +1,300 @@
import os
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, WhisperProcessor, WhisperForConditionalGeneration, PreTrainedModel, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
from tqdm import tqdm
import json
import librosa
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import gc
import torch.nn.functional as F

# # Define multimodal projector class (original version, kept for reference)
# class ProjectionBlock(nn.Module):
#     def __init__(self, input_dim, output_dim):
#         super().__init__()
#         self.pre_norm = nn.LayerNorm(input_dim)
#         self.proj = nn.Sequential(nn.Linear(input_dim, output_dim), nn.GELU(), nn.Linear(output_dim, output_dim))
#
#     def forward(self, x):
#         return self.proj(self.pre_norm(x))


class CrossAttentionBlock(nn.Module):
    def __init__(self, embed_dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Linear(embed_dim * 4, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, context):
        # Cross-attention: queries come from x, keys/values from the other modality
        attended, _ = self.attention(
            query=self.norm1(x),
            key=self.norm1(context),
            value=self.norm1(context)
        )
        x = x + attended

        # FFN
        x = x + self.ffn(self.norm2(x))
        return x

## Updated on 23rd November
class ProjectionBlock(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.pre_norm = nn.LayerNorm(input_dim)
        self.proj = nn.Sequential(
            nn.Linear(input_dim, output_dim * 2),  # Increase intermediate dimension
            nn.GELU(),
            nn.Linear(output_dim * 2, output_dim)  # Project to final dimension
        )

    def forward(self, x):
        # Add shape validation
        if len(x.shape) == 2:  # If input is [batch_size, features]
            return self.proj(self.pre_norm(x))
        elif len(x.shape) == 3:  # If input is [batch_size, seq_len, features]
            return self.proj(self.pre_norm(x.mean(dim=1)))  # Pool sequence dimension
        else:
            raise ValueError(f"Unexpected input shape: {x.shape}")

## Updated on 23rd November
# class EnhancedMultimodalProjector(nn.Module):
#     def __init__(self, image_input_dim, audio_input_dim, output_dim, num_heads=8):
#         super().__init__()
#
#         # Adjust projectors to match Phi-3's hidden size (1024)
#         self.image_proj = ProjectionBlock(image_input_dim, output_dim)
#         self.audio_proj = ProjectionBlock(audio_input_dim, output_dim)
#
#         # Cross-attention blocks
#         self.image_audio_cross_attn = CrossAttentionBlock(output_dim, num_heads)
#         self.audio_image_cross_attn = CrossAttentionBlock(output_dim, num_heads)
#
#         # Final fusion layer
#         self.fusion_layer = nn.Sequential(
#             nn.LayerNorm(output_dim * 2),
#             nn.Linear(output_dim * 2, output_dim),
#             nn.GELU(),
#             nn.Linear(output_dim, output_dim)
#         )
class EnhancedMultimodalProjector(nn.Module):
    def __init__(self, image_input_dim, audio_input_dim=1024, output_dim=1024, num_heads=8):
        super().__init__()
        self.image_proj = ProjectionBlock(image_input_dim, output_dim)
        self.audio_proj = ProjectionBlock(audio_input_dim, output_dim)
        self.image_audio_cross_attn = CrossAttentionBlock(output_dim, num_heads)
        self.audio_image_cross_attn = CrossAttentionBlock(output_dim, num_heads)
        self.fusion_layer = nn.Sequential(
            nn.LayerNorm(output_dim * 2),
            nn.Linear(output_dim * 2, output_dim),
            nn.GELU(),
            nn.Linear(output_dim, output_dim)
        )

    def forward(self, image_embedding=None, audio_embedding=None):
        # Add shape validation and adjustment
        if image_embedding is not None and image_embedding.dim() < 2:
            raise ValueError("Expected `image_embedding` to have at least 2 dimensions.")
        if audio_embedding is not None and audio_embedding.dim() < 2:
            raise ValueError("Expected `audio_embedding` to have at least 2 dimensions.")
        if image_embedding is not None and len(image_embedding.shape) == 2:
            image_embedding = image_embedding.unsqueeze(1)  # Add sequence dimension
        if audio_embedding is not None and len(audio_embedding.shape) == 2:
            audio_embedding = audio_embedding.unsqueeze(1)  # Add sequence dimension

        # Initial projections
        projected_image = self.image_proj(image_embedding) if image_embedding is not None else None
        projected_audio = self.audio_proj(audio_embedding) if audio_embedding is not None else None

        if projected_image is not None and projected_audio is not None:
            # Ensure correct shapes for cross-attention
            attended_image = self.image_audio_cross_attn(projected_image, projected_audio)
            attended_audio = self.audio_image_cross_attn(projected_audio, projected_image)

            # Combine the attended features
            fused_features = torch.cat([attended_image, attended_audio], dim=-1)
            final_output = self.fusion_layer(fused_features)

            return final_output, final_output

        elif projected_image is not None:
            return projected_image, None
        elif projected_audio is not None:
            return None, projected_audio
        else:
            return None, None

# Update the Phi3WithProjector to use the enhanced projector
# (note: this definition is superseded by the fuller Phi3WithProjector below)
class Phi3WithProjector(PreTrainedModel):
    def __init__(self, config, phi3_model, projector):
        super().__init__(config)
        self.phi3_model = phi3_model
        self.projector = projector
        self.supports_gradient_checkpointing = True

    def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None,
                image_embeddings=None, audio_embeddings=None, labels=None, **kwargs):
        if inputs_embeds is None:
            inputs_embeds = self.phi3_model.get_input_embeddings()(input_ids)

        # Get fused embeddings from enhanced projector
        projected_features, _ = self.projector(image_embeddings, audio_embeddings)

        # Concatenate embeddings if we have projected features
        if projected_features is not None:
            combined_embeddings = torch.cat([inputs_embeds, projected_features.unsqueeze(1)], dim=1)
            # Extend attention mask
            extended_attention_mask = torch.cat([
                attention_mask,
                torch.ones((attention_mask.shape[0], 1), device=attention_mask.device)
            ], dim=1)
        else:
            combined_embeddings = inputs_embeds
            extended_attention_mask = attention_mask

        # Adjust labels if needed
        if labels is not None and projected_features is not None:
            labels = torch.cat([
                labels,
                torch.full((labels.shape[0], 1), -100, dtype=labels.dtype, device=labels.device)
            ], dim=1)

        return self.phi3_model(
            inputs_embeds=combined_embeddings,
            attention_mask=extended_attention_mask,
            labels=labels,
            **kwargs
        )


class MultimodalProjector(nn.Module):
    def __init__(self, image_input_dim, audio_input_dim, output_dim):
        super().__init__()
        self.image_proj = ProjectionBlock(image_input_dim, output_dim)
        self.audio_proj = ProjectionBlock(audio_input_dim, output_dim)

    def forward(self, image_embedding=None, audio_embedding=None):
        projected_image = self.image_proj(image_embedding) if image_embedding is not None else None
        projected_audio = self.audio_proj(audio_embedding) if audio_embedding is not None else None
        return projected_image, projected_audio


class Phi3WithProjector(PreTrainedModel):
    def __init__(self, config, phi3_model, projector):
        super().__init__(config)
        self.phi3_model = phi3_model
        self.projector = projector
        self.supports_gradient_checkpointing = True

    def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None, image_embeddings=None, audio_embeddings=None, labels=None, **kwargs):
        # Use get_input_embeddings() to retrieve the embeddings layer
        if inputs_embeds is None:
            inputs_embeds = self.phi3_model.get_input_embeddings()(input_ids)

        # Project both image and audio embeddings to the appropriate dimension
        projected_image, projected_audio = self.projector(image_embeddings, audio_embeddings)

        # Concatenate the embeddings
        embeddings_to_concat = [inputs_embeds]
        if projected_image is not None:
            embeddings_to_concat.append(projected_image.unsqueeze(1))
        if projected_audio is not None:
            embeddings_to_concat.append(projected_audio.unsqueeze(1))

        combined_embeddings = torch.cat(embeddings_to_concat, dim=1)

        # Extend the attention mask to cover the appended embedding positions
        extended_attention_mask = attention_mask.clone()  # Start with a copy

        # Extend for image and audio, if present
        if projected_image is not None:
            extended_attention_mask = torch.cat([extended_attention_mask, torch.ones_like(extended_attention_mask[:, :1])], dim=1)
        if projected_audio is not None:
            extended_attention_mask = torch.cat([extended_attention_mask, torch.ones_like(extended_attention_mask[:, :1])], dim=1)

        # Adjust labels to match the extended input sequence length
        if labels is not None:
            # Pad labels with -100 to ignore the added tokens in the loss calculation
            num_added_tokens = sum(1 for emb in [projected_image, projected_audio] if emb is not None)
            labels = torch.cat([labels, torch.full((labels.shape[0], num_added_tokens), -100, dtype=labels.dtype, device=labels.device)], dim=1)

        outputs = self.phi3_model(
            inputs_embeds=combined_embeddings,
            attention_mask=extended_attention_mask,
            labels=labels,
            **kwargs
        )

        # Add auxiliary losses for multimodal alignment
        if image_embeddings is not None or audio_embeddings is not None:
            loss = outputs.loss

            # Add contrastive loss for multimodal alignment
            if image_embeddings is not None and audio_embeddings is not None:
                img_proj, audio_proj = self.projector(image_embeddings, audio_embeddings)
                contrastive_loss = self.compute_contrastive_loss(img_proj, audio_proj)
                loss = loss + 0.1 * contrastive_loss  # Weight the auxiliary loss

            outputs.loss = loss

        return outputs

    def get_input_embeddings(self):
        """Returns the model's input embeddings."""
        return self.phi3_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Sets the model's input embeddings."""
        self.phi3_model.set_input_embeddings(value)

    # Instead, use the built-in gradient checkpointing
    def enable_gradient_checkpointing(self):
        """Enable gradient checkpointing for the model."""
        if hasattr(self.phi3_model, "gradient_checkpointing_enable"):
            self.phi3_model.gradient_checkpointing_enable()
        else:
            self.phi3_model.config.use_cache = False
            self.phi3_model.train()  # Ensure model is in training mode

    def disable_gradient_checkpointing(self):
        """Disable gradient checkpointing for the model."""
        if hasattr(self.phi3_model, "gradient_checkpointing_disable"):
            self.phi3_model.gradient_checkpointing_disable()
        else:
            self.phi3_model.config.use_cache = True

    def compute_contrastive_loss(self, img_features, audio_features):
        # Normalize features
        img_features = F.normalize(img_features, dim=-1)
        audio_features = F.normalize(audio_features, dim=-1)

        # Compute similarity matrix
        similarity = torch.matmul(img_features, audio_features.transpose(0, 1))

        # Temperature-scaled cross entropy loss
        temperature = 0.07
        labels = torch.arange(similarity.size(0)).to(similarity.device)
        loss = F.cross_entropy(similarity / temperature, labels)

        return loss
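To make the intended wiring concrete, here is a hedged assembly sketch (not part of the committed files): it loads the Phi-3 backbone, builds the enhanced projector at the backbone's hidden size, wraps both in Phi3WithProjector, and runs a dummy forward pass. The image and audio embedding widths (512 and 768) are placeholder assumptions, quantization/LoRA are omitted, and running it requires downloading the model.

import torch
from transformers import AutoModelForCausalLM

phi3 = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    trust_remote_code=True,
)

# Project both modalities to the language model's hidden size so the
# concatenation in Phi3WithProjector.forward lines up.
projector = EnhancedMultimodalProjector(
    image_input_dim=512,   # assumed image-encoder width (placeholder)
    audio_input_dim=768,   # assumed audio-encoder width (placeholder)
    output_dim=phi3.config.hidden_size,
)

model = Phi3WithProjector(phi3.config, phi3, projector)

# Dummy forward pass: one image and one audio embedding per example.
input_ids = torch.randint(0, phi3.config.vocab_size, (2, 16))
attention_mask = torch.ones_like(input_ids)
outputs = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    image_embeddings=torch.randn(2, 512),
    audio_embeddings=torch.randn(2, 768),
    labels=input_ids.clone(),
)
print(outputs.loss)  # language-modeling loss + 0.1 * contrastive alignment loss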