Update modeling_kangaroo.py
modeling_kangaroo.py  CHANGED  (+2 / -22)
@@ -107,22 +107,6 @@ class LlamaRotaryEmbedding(nn.Module):
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         # For BC we register cos and sin cached
         self.max_seq_len_cached = max_position_embeddings
-
-    #@torch.no_grad()
-    #def forward(self, x, position_ids):
-    #    # x: [bs, num_attention_heads, seq_len, head_size]
-    #    inv_freq_expanded = self.inv_freq[None, :, None].to(torch.bfloat16).expand(position_ids.shape[0], -1, 1)
-    #    position_ids_expanded = position_ids[:, None, :].to(torch.bfloat16)
-    #    # Force float32 since bfloat16 loses precision on long contexts
-    #    # See https://github.com/huggingface/transformers/pull/29285
-    #    device_type = x.device.type
-    #    device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
-    #    with torch.autocast(device_type=device_type, enabled=False):
-    #        freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
-    #        emb = torch.cat((freqs, freqs), dim=-1)
-    #        cos = emb.cos()
-    #        sin = emb.sin()
-    #    return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
     @torch.no_grad()
     def forward(self, x, position_ids):
@@ -1107,9 +1091,8 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
         super().__init__(config)
         self.model = LlamaModel(config)
         model_name = "EVA02-CLIP-L-14-448"
-        pretrained = "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-mtcv/liujiajun18/models/models--QuanSun--EVA-CLIP/snapshots/11afd202f2ae80869d6cef18b1ec775e79bd8d12/EVA02_CLIP_L_psz14_s4B.pt"
         self.vocab_size = config.vocab_size
-        model, _, preprocess = create_model_and_transforms(model_name,
+        model, _, preprocess = create_model_and_transforms(model_name, force_custom_clip=True)
         model.text = None
         model.logit_scale = None
         self.vision_tower = model.visual
@@ -1121,6 +1104,7 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
         self.angle = torch.stack([1 / torch.pow(torch.tensor(10000), torch.tensor(2 * (hid_j // 2) / hidden_dim)) for hid_j in range(hidden_dim)])
 
         self.patch_shape = self.vision_tower.patch_embed.patch_shape[0]
+        # patchify module
         self.adaptive_pooling = torch.nn.Conv3d(in_channels=self.vision_tower.num_features,
                                                 out_channels=self.vision_tower.num_features,
                                                 kernel_size=(2, 2, 2),
@@ -1164,10 +1148,6 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
         image_features = image_features.permute(0, 4, 1, 2, 3)
         image_features = self.adaptive_pooling(image_features)
         image_features = image_features.permute(0, 2, 3, 4, 1)
-        #B, T, P, _, __ = image_features.shape
-        #image_features = image_features.reshape(B, T // 2, 2, P, _, __)
-        #image_features = image_features.mean(dim=2)
-        #image_features = image_features.reshape(B, T // 2, P, _, __)
         image_features = image_features.reshape(-1, self.patch_shape*self.patch_shape // 4, image_features.shape[-1])
 
         image_features = self.mm_projector(image_features)
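Note on the second hunk: the hardcoded local checkpoint path is dropped and the EVA-CLIP backbone is now built from the registered model name with force_custom_clip=True, presumably because the vision weights ship with the main checkpoint rather than a separate .pt file. Below is a minimal sketch of the resulting vision-tower setup, assembled from the lines visible in the hunk; the import path for create_model_and_transforms is an assumption (it is not shown in this commit), and the standalone variable names are illustrative only.

    # Sketch only: the import location is assumed; the calls mirror the hunk above.
    from eva_clip import create_model_and_transforms

    model_name = "EVA02-CLIP-L-14-448"
    # No pretrained= path any more: the custom CLIP graph is built from the
    # registered config alone; weights are expected to load with the full model.
    model, _, preprocess = create_model_and_transforms(model_name, force_custom_clip=True)
    model.text = None          # text tower is unused here
    model.logit_scale = None   # contrastive head is unused here
    vision_tower = model.visual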
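Note on the last hunk: the commented-out frame-averaging variant is deleted and only the reshape to self.patch_shape * self.patch_shape // 4 tokens per pooled frame is kept, which lines up with the "patchify module" Conv3d reducing the temporal and both spatial patch dimensions. Below is a shape walk-through under stated assumptions: stride (2, 2, 2) for the Conv3d (only the kernel_size line is visible in this diff) and hypothetical sizes of 4 frames, a 32x32 patch grid, and 1024 channels.

    import torch

    # Hypothetical sizes: batch, frames, patch grid, channels.
    B, T, P, C = 1, 4, 32, 1024
    # Only kernel_size=(2, 2, 2) appears in the hunk; stride=(2, 2, 2) is an
    # assumption made so the final P*P // 4 reshape works out.
    adaptive_pooling = torch.nn.Conv3d(C, C, kernel_size=(2, 2, 2), stride=(2, 2, 2))

    x = torch.randn(B, T, P, P, C)
    x = x.permute(0, 4, 1, 2, 3)        # (B, C, T, P, P), as in the hunk
    x = adaptive_pooling(x)             # (B, C, T//2, P//2, P//2)
    x = x.permute(0, 2, 3, 4, 1)        # (B, T//2, P//2, P//2, C)

    # The reshape kept in the hunk: one token sequence of length P*P // 4
    # per pooled frame.
    x = x.reshape(-1, P * P // 4, x.shape[-1])
    print(x.shape)                      # torch.Size([2, 256, 1024])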