Update modeling_kangaroo.py
modeling_kangaroo.py  CHANGED  (+2 / -22)
@@ -107,22 +107,6 @@ class LlamaRotaryEmbedding(nn.Module):
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         # For BC we register cos and sin cached
         self.max_seq_len_cached = max_position_embeddings
-
-    #@torch.no_grad()
-    #def forward(self, x, position_ids):
-    #    # x: [bs, num_attention_heads, seq_len, head_size]
-    #    inv_freq_expanded = self.inv_freq[None, :, None].to(torch.bfloat16).expand(position_ids.shape[0], -1, 1)
-    #    position_ids_expanded = position_ids[:, None, :].to(torch.bfloat16)
-    #    # Force float32 since bfloat16 loses precision on long contexts
-    #    # See https://github.com/huggingface/transformers/pull/29285
-    #    device_type = x.device.type
-    #    device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
-    #    with torch.autocast(device_type=device_type, enabled=False):
-    #        freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
-    #        emb = torch.cat((freqs, freqs), dim=-1)
-    #        cos = emb.cos()
-    #        sin = emb.sin()
-    #    return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
     @torch.no_grad()
     def forward(self, x, position_ids):
@@ -1107,9 +1091,8 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
         super().__init__(config)
         self.model = LlamaModel(config)
         model_name = "EVA02-CLIP-L-14-448"
-        pretrained = "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-mtcv/liujiajun18/models/models--QuanSun--EVA-CLIP/snapshots/11afd202f2ae80869d6cef18b1ec775e79bd8d12/EVA02_CLIP_L_psz14_s4B.pt"
         self.vocab_size = config.vocab_size
-        model, _, preprocess = create_model_and_transforms(model_name,
+        model, _, preprocess = create_model_and_transforms(model_name, force_custom_clip=True)
         model.text = None
         model.logit_scale = None
         self.vision_tower = model.visual
@@ -1121,6 +1104,7 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
         self.angle = torch.stack([1 / torch.pow(torch.tensor(10000), torch.tensor(2 * (hid_j // 2) / hidden_dim)) for hid_j in range(hidden_dim)])
 
         self.patch_shape = self.vision_tower.patch_embed.patch_shape[0]
+        # patchify module
         self.adaptive_pooling = torch.nn.Conv3d(in_channels=self.vision_tower.num_features,
                                                 out_channels=self.vision_tower.num_features,
                                                 kernel_size=(2, 2, 2),
@@ -1164,10 +1148,6 @@ class KangarooForCausalLM(LlamaPreTrainedModel):
         image_features = image_features.permute(0, 4, 1, 2, 3)
         image_features = self.adaptive_pooling(image_features)
         image_features = image_features.permute(0, 2, 3, 4, 1)
-        #B, T, P, _, __ = image_features.shape
-        #image_features = image_features.reshape(B, T // 2, 2, P, _, __)
-        #image_features = image_features.mean(dim=2)
-        #image_features = image_features.reshape(B, T // 2, P, _, __)
         image_features = image_features.reshape(-1, self.patch_shape*self.patch_shape // 4, image_features.shape[-1])
 
         image_features = self.mm_projector(image_features)
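Note on the second hunk: the hardcoded local checkpoint path is dropped and the EVA-CLIP backbone is now built from the registered model name with force_custom_clip=True, presumably because the vision weights ship with the main checkpoint rather than a separate .pt file. Below is a minimal sketch of the resulting vision-tower setup, assembled from the lines visible in the hunk; the import path for create_model_and_transforms is an assumption (it is not shown in this commit), and the standalone variable names are illustrative only.

    # Sketch only: the import location is assumed; the calls mirror the hunk above.
    from eva_clip import create_model_and_transforms

    model_name = "EVA02-CLIP-L-14-448"
    # No pretrained= path any more: the custom CLIP graph is built from the
    # registered config alone; weights are expected to load with the full model.
    model, _, preprocess = create_model_and_transforms(model_name, force_custom_clip=True)
    model.text = None          # text tower is unused here
    model.logit_scale = None   # contrastive head is unused here
    vision_tower = model.visual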
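Note on the last hunk: the commented-out frame-averaging variant is deleted and only the reshape to self.patch_shape * self.patch_shape // 4 tokens per pooled frame is kept, which lines up with the "patchify module" Conv3d reducing the temporal and both spatial patch dimensions. Below is a shape walk-through under stated assumptions: stride (2, 2, 2) for the Conv3d (only the kernel_size line is visible in this diff) and hypothetical sizes of 4 frames, a 32x32 patch grid, and 1024 channels.

    import torch

    # Hypothetical sizes: batch, frames, patch grid, channels.
    B, T, P, C = 1, 4, 32, 1024
    # Only kernel_size=(2, 2, 2) appears in the hunk; stride=(2, 2, 2) is an
    # assumption made so the final P*P // 4 reshape works out.
    adaptive_pooling = torch.nn.Conv3d(C, C, kernel_size=(2, 2, 2), stride=(2, 2, 2))

    x = torch.randn(B, T, P, P, C)
    x = x.permute(0, 4, 1, 2, 3)        # (B, C, T, P, P), as in the hunk
    x = adaptive_pooling(x)             # (B, C, T//2, P//2, P//2)
    x = x.permute(0, 2, 3, 4, 1)        # (B, T//2, P//2, P//2, C)

    # The reshape kept in the hunk: one token sequence of length P*P // 4
    # per pooled frame.
    x = x.reshape(-1, P * P // 4, x.shape[-1])
    print(x.shape)                      # torch.Size([2, 256, 1024])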