xiechunyu committed
Commit 42b2552 · 1 Parent(s): c8e599b

sec commit

config.json CHANGED
@@ -1,7 +1,12 @@
 {
+  "_name_or_path": "fg-clip-base",
   "architectures": [
-    "CLIPModel"
+    "FGCLIPModel"
   ],
+  "auto_map": {
+    "AutoConfig": "modeling_fgclip.FGCLIPConfig",
+    "AutoModelForCausalLM": "modeling_fgclip.FGCLIPModel"
+  },
   "initializer_factor": 1.0,
   "logit_scale_init_value": 2.6592,
   "model_type": "clip",
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7666ec60a0a211d0822ceebc628de6c7a6f867f3b216d161857844077978578c
-size 302509106
+oid sha256:e42fdf108f56292277be193a5531a63d7b817541dbd6c250f8294b3fc76d8ab1
+size 600595252
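The rename only swaps the Git LFS pointer; to sanity-check the new weights file after pulling, a short sketch using the `safetensors` package (the file path is assumed relative to a local clone):

```python
from safetensors.torch import load_file

# Load the renamed weights file and report tensor count / total parameter count.
state_dict = load_file("model.safetensors")
print(f"{len(state_dict)} tensors, {sum(t.numel() for t in state_dict.values()):,} parameters")
```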
modeling_clip.py ADDED
The diff for this file is too large to render. See raw diff
 
modeling_fgclip.py ADDED
@@ -0,0 +1,262 @@
+import torch
+import torch.nn as nn
+import math
+
+# from transformers import CLIPConfig,AutoConfig
+from typing import Any, Optional, Tuple, Union
+import torch.distributed.nn as nn_dist
+import torch.nn.functional as F
+import numpy as np
+from collections import OrderedDict
+from typing import Tuple, Union
+from .modeling_clip import CLIPModel, CLIPTextTransformer, CLIPVisionTransformer, CLIPOutput, CLIPAttention, CLIPMLP
+
+import torch.distributed as dist
+from torch.nn import AvgPool2d
+from transformers import (
+    AutoImageProcessor,
+    AutoModel,
+    AutoTokenizer,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    set_seed,
+)
+
+from .modeling_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
+from torch import nn, einsum
+from einops import rearrange, repeat, reduce
+from einops.layers.torch import Rearrange, Reduce
+import math
+from torchvision.ops import roi_align
+
+
+class FGCLIPConfig(CLIPConfig):
+    model_type = "clip"
+
+class FGCLIPModel(CLIPModel):
+    config_class = FGCLIPConfig
+    main_input_name = "text_long"
+
+    def __init__(self, config):
+        super(CLIPModel, self).__init__(config)
+
+        if not isinstance(config.text_config, CLIPTextConfig):
+            raise ValueError(
+                "config.text_config is expected to be of type CLIPTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, CLIPVisionConfig):
+            raise ValueError(
+                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+        text_config.eos_token_id = 49407
+        text_config.pad_token_id = 49407
+        text_config.bos_token_id = 49406
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = CLIPTextTransformer(text_config)
+
+        self.vision_model = CLIPVisionTransformer(vision_config)
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+
+
+        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+        self.text_filip_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+
+
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+        self.logit_scale_finegraind = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+        self.logit_scale_hardneg = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+
+        self.embed_dim = text_config.hidden_size
+        self.world_size = 0
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+
+    def get_image_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+
+        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = vision_outputs[1]  # pooled_output
+        image_features = self.visual_projection(pooled_output)
+
+        return image_features
+
+    def get_image_box_roi_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        box_info=None,
+    ) -> torch.FloatTensor:
+
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            return_dict=return_dict
+        )
+
+        bs = pixel_values.shape[0]
+        length = vision_outputs[0].shape[1] - 1
+        h = int(math.sqrt(length))
+        w = h
+
+        feature_map = vision_outputs.hidden_states[-2]  # [:, 1:, :]
+        feature_map = self.forward_without_attn(feature_map)[:, 1:]
+
+        feature_map = self.vision_model.post_layernorm(feature_map)
+        feature_map = self.visual_projection(feature_map)
+
+        feature_map = feature_map.view(bs, h, w, -1).permute(0, 3, 1, 2)
+        x_rois = roi_align(feature_map.type(torch.float32), box_info, (1, 1), 1.0, -1, True)[..., 0, 0]
+
+        x_rois = x_rois / x_rois.norm(p=2, dim=-1, keepdim=True)
+
+        return x_rois
+
+    def get_text_features(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        walk_short_pos: Optional[bool] = True,
+        use_bbox: Optional[bool] = False
+    ) -> torch.FloatTensor:
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        pos_flag = walk_short_pos or use_bbox
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            walk_short_pos=pos_flag,
+        )
+        pooled_output = text_outputs[1]
+
+        if walk_short_pos:
+            text_features = self.text_projection(pooled_output)
+        else:
+            text_features = self.text_filip_projection(pooled_output)
+
+        return text_features
+
+    @staticmethod
+    def _denormalize_boxes(normed_boxes, x):
+        h, w = x.shape[-2:]
+        denormed_boxes = []
+        for boxes in normed_boxes:
+
+            new_boxes = boxes.clone()  # FIXME: do not change the value in normed_boxes!
+            new_boxes[:, [0, 2]] *= w
+            new_boxes[:, [1, 3]] *= h
+            denormed_boxes.append(new_boxes.type(torch.float32))
+        return denormed_boxes
+
+    def forward_without_attn(self, x):
+        # get last layer
+        residual = x
+        x = self.vision_model.encoder.layers[-1].layer_norm1(x)
+
+        x = F.linear(input=x, weight=self.vision_model.encoder.layers[-1].self_attn.v_proj.weight, bias=self.vision_model.encoder.layers[-1].self_attn.v_proj.bias)
+        x = self.vision_model.encoder.layers[-1].self_attn.out_proj(x)
+        x = residual + x
+
+        residual = x
+        x = self.vision_model.encoder.layers[-1].layer_norm2(x)
+        x = self.vision_model.encoder.layers[-1].mlp(x)
+        x = residual + x
+
+        return x
+
+
+    def get_image_dense_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding=False,
+        box_info=None,
+    ) -> torch.FloatTensor:
+
+        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            return_dict=return_dict,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+
+
+        bs = pixel_values.shape[0]
+        length = vision_outputs[0].shape[1] - 1
+        h = int(math.sqrt(length))
+        w = h
+
+        feature_map = vision_outputs.hidden_states[-2]  # [:, 1:, :]
+        feature_map = self.forward_without_attn(feature_map)[:, 1:]
+
+        feature_map = self.vision_model.post_layernorm(feature_map)
+        feature_map = self.visual_projection(feature_map)
+
+        return feature_map
+
+
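For orientation, a usage sketch of the new entry points (not part of the commit). It assumes the repo keeps the standard CLIP tokenizer and image-processor files and is loaded through the `auto_map` shown in config.json; the repo id, image path, caption, and box coordinates are placeholders:

```python
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForCausalLM, AutoTokenizer

repo_id = "fg-clip-base"  # placeholder Hub path
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(repo_id)
image_processor = AutoImageProcessor.from_pretrained(repo_id)

image = Image.open("example.jpg").convert("RGB")
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
text = tokenizer(["a photo of a cat"], padding="max_length", max_length=77, return_tensors="pt")

with torch.no_grad():
    # Global image embedding: pooled ViT output -> visual_projection.
    image_features = model.get_image_features(pixel_values=pixel_values)
    # Short-caption path: walk_short_pos=True routes through text_projection
    # (False would use text_filip_projection for long captions).
    text_features = model.get_text_features(
        input_ids=text.input_ids,
        attention_mask=text.attention_mask,
        walk_short_pos=True,
    )
    # Region embedding: box_info follows torchvision roi_align conventions
    # (one [N, 4] tensor per image); the method applies roi_align with
    # spatial_scale=1.0, so these are feature-map coordinates on the h x w grid.
    boxes = [torch.tensor([[0.0, 0.0, 7.0, 7.0]])]  # hypothetical region
    roi_features = model.get_image_box_roi_features(pixel_values=pixel_values, box_info=boxes)

image_features = image_features / image_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
similarity = image_features @ text_features.T
```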
training_args.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:15b3a0d4006d788c60e3342bb637a5c2c029a79d13b31d1b5a6cb6e9a0890f59
-size 6136