FrankJJHu committed · verified
Commit 152d493 · 1 Parent(s): 388bd8b

Update modeling_kangaroo.py

Files changed (1)
  1. modeling_kangaroo.py +2 -22
modeling_kangaroo.py CHANGED
@@ -107,22 +107,6 @@ class LlamaRotaryEmbedding(nn.Module):
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         # For BC we register cos and sin cached
         self.max_seq_len_cached = max_position_embeddings
-
-    #@torch.no_grad()
-    #def forward(self, x, position_ids):
-    #    # x: [bs, num_attention_heads, seq_len, head_size]
-    #    inv_freq_expanded = self.inv_freq[None, :, None].to(torch.bfloat16).expand(position_ids.shape[0], -1, 1)
-    #    position_ids_expanded = position_ids[:, None, :].to(torch.bfloat16)
-    #    # Force float32 since bfloat16 loses precision on long contexts
-    #    # See https://github.com/huggingface/transformers/pull/29285
-    #    device_type = x.device.type
-    #    device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
-    #    with torch.autocast(device_type=device_type, enabled=False):
-    #        freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
-    #        emb = torch.cat((freqs, freqs), dim=-1)
-    #        cos = emb.cos()
-    #        sin = emb.sin()
-    #    return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
     @torch.no_grad()
     def forward(self, x, position_ids):
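The block deleted above was dead code: a commented-out bfloat16 variant of the rotary forward. For context, a minimal sketch of the forward the class keeps, assuming it matches the standard transformers float32 implementation that the dead variant was shadowing:

    # Sketch, not the file's verbatim code.
    @torch.no_grad()
    def forward(self, x, position_ids):
        # x: [bs, num_attention_heads, seq_len, head_size]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Compute in float32: bfloat16 loses precision on long contexts,
        # see https://github.com/huggingface/transformers/pull/29285
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)  # tile frequencies across the full head dim
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)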
@@ -1107,9 +1091,8 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
         super().__init__(config)
         self.model = LlamaModel(config)
         model_name = "EVA02-CLIP-L-14-448"
-        pretrained = "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-mtcv/liujiajun18/models/models--QuanSun--EVA-CLIP/snapshots/11afd202f2ae80869d6cef18b1ec775e79bd8d12/EVA02_CLIP_L_psz14_s4B.pt"
         self.vocab_size = config.vocab_size
-        model, _, preprocess = create_model_and_transforms(model_name, pretrained, force_custom_clip=True)
+        model, _, preprocess = create_model_and_transforms(model_name, force_custom_clip=True)
         model.text = None
         model.logit_scale = None
         self.vision_tower = model.visual
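This hunk removes a hardcoded cluster path to the EVA02 CLIP checkpoint and drops the `pretrained` argument from `create_model_and_transforms`, so the EVA02-CLIP-L-14-448 tower is now built without a separately downloaded CLIP file. A hedged sketch of the loading flow this implies (the hub id is a hypothetical placeholder, not confirmed by this commit):

    # Assumption: the vision tower's weights ship inside the full Kangaroo
    # checkpoint, so from_pretrained overwrites the freshly built tower and
    # no private path like /mnt/dolphinfs/... is needed.
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "org/kangaroo",          # hypothetical repo id hosting modeling_kangaroo.py
        trust_remote_code=True,  # executes the custom KangarooForCausalLM class
    )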
@@ -1121,6 +1104,7 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
         self.angle = torch.stack([1 / torch.pow(torch.tensor(10000), torch.tensor(2 * (hid_j // 2) / hidden_dim)) for hid_j in range(hidden_dim)])
 
         self.patch_shape = self.vision_tower.patch_embed.patch_shape[0]
+        # patchify module
         self.adaptive_pooling = torch.nn.Conv3d(in_channels=self.vision_tower.num_features,
                                                 out_channels=self.vision_tower.num_features,
                                                 kernel_size=(2, 2, 2),
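A note on the context around the new "# patchify module" comment: the `self.angle` buffer is the standard sinusoidal inverse-frequency table. An equivalent vectorized form, as a sketch with an illustrative hidden_dim:

    import torch

    hidden_dim = 1024  # illustrative; the real value comes from the model config
    j = torch.arange(hidden_dim)
    # angle[j] = 10000 ** (-2 * (j // 2) / hidden_dim), element-for-element the
    # same values as the torch.stack([...]) comprehension above.
    angle = torch.pow(torch.tensor(10000.0), -2 * (j // 2) / hidden_dim)

The Conv3d the comment labels pools the vision tower's patch grid; the shape walk-through after the next hunk shows its effect.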
@@ -1164,10 +1148,6 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
         image_features = image_features.permute(0, 4, 1, 2, 3)
         image_features = self.adaptive_pooling(image_features)
         image_features = image_features.permute(0, 2, 3, 4, 1)
-        #B, T, P, _, __ = image_features.shape
-        #image_features = image_features.reshape(B, T // 2, 2, P, _, __)
-        #image_features = image_features.mean(dim=2)
-        #image_features = image_features.reshape(B, T // 2, P, _, __)
         image_features = image_features.reshape(-1, self.patch_shape*self.patch_shape // 4, image_features.shape[-1])
 
         image_features = self.mm_projector(image_features)
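The deleted block here was commented-out dead code that would have halved the temporal dimension a second time by averaging adjacent pooled frames; the live path goes straight from the Conv3d "patchify module" to the reshape. A runnable shape walk-through of the live path; values are illustrative, and the Conv3d stride is assumed to equal its (2, 2, 2) kernel, which is what the `// 4` reshape implies:

    import torch

    B, T, P, C = 1, 4, 32, 1024           # batch, frames, patch-grid side, features
    pool = torch.nn.Conv3d(C, C, kernel_size=(2, 2, 2), stride=(2, 2, 2))  # stride assumed

    x = torch.randn(B, T, P, P, C)        # vision-tower output: [B, T, H, W, C]
    x = x.permute(0, 4, 1, 2, 3)          # -> [B, C, T, H, W], channels-first for Conv3d
    x = pool(x)                           # -> [B, C, T/2, P/2, P/2]: halves time and space
    x = x.permute(0, 2, 3, 4, 1)          # -> [B, T/2, P/2, P/2, C]
    x = x.reshape(-1, P * P // 4, C)      # -> [B * T/2, P*P//4, C] tokens per pooled frame
    print(x.shape)                        # torch.Size([2, 256, 1024])

Each pooled frame thus contributes patch_shape*patch_shape // 4 tokens to the mm_projector, matching the reshape in the kept line.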
 