Commit e4e7193 by xiechunyu · 1 Parent(s): cb82952

sec commit

config copy.json DELETED
@@ -1,179 +0,0 @@
-{
-  "_name_or_path": "openai/clip-vit-large-patch14-336",
-  "architectures": [
-    "CLIPModel"
-  ],
-  "initializer_factor": 1.0,
-  "logit_scale_init_value": 2.6592,
-  "model_type": "clip",
-  "projection_dim": 768,
-  "text_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "bos_token_id": 0,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": 2,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "quick_gelu",
-    "hidden_size": 768,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 3072,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "max_position_embeddings": 77,
-    "min_length": 0,
-    "model_type": "clip_text_model",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 12,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_hidden_layers": 12,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": 1,
-    "prefix": null,
-    "problem_type": null,
-    "projection_dim": 768,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.21.3",
-    "typical_p": 1.0,
-    "use_bfloat16": false,
-    "vocab_size": 49408
-  },
-  "text_config_dict": {
-    "hidden_size": 768,
-    "intermediate_size": 3072,
-    "num_attention_heads": 12,
-    "num_hidden_layers": 12,
-    "projection_dim": 768
-  },
-  "torch_dtype": "float32",
-  "transformers_version": null,
-  "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "quick_gelu",
-    "hidden_size": 1024,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "image_size": 336,
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 4096,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "clip_vision_model",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 16,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_channels": 3,
-    "num_hidden_layers": 24,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "projection_dim": 768,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.21.3",
-    "typical_p": 1.0,
-    "use_bfloat16": false
-  },
-  "vision_config_dict": {
-    "hidden_size": 1024,
-    "image_size": 336,
-    "intermediate_size": 4096,
-    "num_attention_heads": 16,
-    "num_hidden_layers": 24,
-    "patch_size": 14,
-    "projection_dim": 768
-  }
-}
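Note: the deleted config copy.json is a verbatim copy of the stock openai/clip-vit-large-patch14-336 configuration (24-layer ViT-L/14 vision tower at 336 px, 12-layer text encoder, 768-d shared projection). For reference, a minimal sketch, assuming a recent transformers release, of building an equivalent CLIPConfig from the key values above; this code is illustrative only and not part of the commit.

# Illustrative sketch: rebuild the deleted config programmatically.
# Values are copied from the JSON above; CLIPConfig accepts nested dicts
# for the text and vision sub-configs.
from transformers import CLIPConfig

config = CLIPConfig(
    projection_dim=768,
    logit_scale_init_value=2.6592,
    text_config={"hidden_size": 768, "intermediate_size": 3072,
                 "num_attention_heads": 12, "num_hidden_layers": 12,
                 "max_position_embeddings": 77, "vocab_size": 49408},
    vision_config={"hidden_size": 1024, "intermediate_size": 4096,
                   "num_attention_heads": 16, "num_hidden_layers": 24,
                   "image_size": 336, "patch_size": 14},
)
print(config.vision_config.image_size)  # 336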
config.json CHANGED
@@ -1,29 +1,180 @@
 {
-  "_name_or_path": "/hbox2dir/clip-large-336-longshort",
+  "_name_or_path": "fg-clip-large",
   "architectures": [
-    "LongCLIPModel"
+    "FGCLIPModel"
   ],
+  "auto_map": {
+    "AutoConfig": "modeling_fgclip.FGCLIPConfig",
+    "AutoModelForCausalLM": "modeling_fgclip.FGCLIPModel"
+  },
   "initializer_factor": 1.0,
   "logit_scale_init_value": 2.6592,
   "model_type": "clip",
   "projection_dim": 768,
   "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
     "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
     "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
     "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 77,
+    "min_length": 0,
     "model_type": "clip_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 768,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.21.3",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "vocab_size": 49408
+  },
+  "text_config_dict": {
+    "hidden_size": 768,
+    "intermediate_size": 3072,
     "num_attention_heads": 12,
-    "pad_token_id": 49407,
+    "num_hidden_layers": 12,
     "projection_dim": 768
   },
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.34.0",
+  "torch_dtype": "float32",
+  "transformers_version": null,
   "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
     "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
     "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
     "image_size": 336,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
     "intermediate_size": 4096,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
     "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 768,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.21.3",
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  },
+  "vision_config_dict": {
+    "hidden_size": 1024,
+    "image_size": 336,
+    "intermediate_size": 4096,
     "num_attention_heads": 16,
     "num_hidden_layers": 24,
     "patch_size": 14,
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:95d1f289c527641cdf41d308f790b97dc523bec18eaaa4b446c56229a9f75a65
-size 862358450
+oid sha256:f9a1420417fb27e39fae0ca4cb78068c0dc5b5afd5a3c960521f022b18087c73
+size 1715731940
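Note: the checkpoint moves from a pickle-based pytorch_model.bin (about 862 MB) to model.safetensors (about 1.7 GB); the near doubling in size is consistent with the config change from "torch_dtype": "bfloat16" to "float32". A small inspection sketch, assuming the safetensors package is installed; only the file name comes from this commit.

# Hypothetical inspection sketch; "model.safetensors" is the renamed file from this commit.
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    for name in list(f.keys())[:5]:           # first few tensor names
        t = f.get_tensor(name)
        print(name, tuple(t.shape), t.dtype)  # expect torch.float32 per the new config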
modeling_clip.py ADDED
The diff for this file is too large to render. See raw diff
 
modeling_fgclip.py ADDED
@@ -0,0 +1,262 @@
+import torch
+import torch.nn as nn
+import math
+
+# from transformers import CLIPConfig,AutoConfig
+from typing import Any, Optional, Tuple, Union
+import torch.distributed.nn as nn_dist
+import torch.nn.functional as F
+import numpy as np
+from collections import OrderedDict
+from typing import Tuple, Union
+from .modeling_clip import CLIPModel, CLIPTextTransformer, CLIPVisionTransformer, CLIPOutput, CLIPAttention, CLIPMLP
+
+import torch.distributed as dist
+from torch.nn import AvgPool2d
+from transformers import (
+    AutoImageProcessor,
+    AutoModel,
+    AutoTokenizer,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    set_seed,
+)
+
+from .modeling_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
+from torch import nn, einsum
+from einops import rearrange, repeat, reduce
+from einops.layers.torch import Rearrange, Reduce
+import math
+from torchvision.ops import roi_align
+
+
+class FGCLIPConfig(CLIPConfig):
+    model_type = "clip"
+
+class FGCLIPModel(CLIPModel):
+    config_class = FGCLIPConfig
+    main_input_name = "text_long"
+
+    def __init__(self, config):
+        super(CLIPModel, self).__init__(config)
+
+        if not isinstance(config.text_config, CLIPTextConfig):
+            raise ValueError(
+                "config.text_config is expected to be of type CLIPTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, CLIPVisionConfig):
+            raise ValueError(
+                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+        text_config.eos_token_id = 49407
+        text_config.pad_token_id = 49407
+        text_config.bos_token_id = 49406
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = CLIPTextTransformer(text_config)
+
+        self.vision_model = CLIPVisionTransformer(vision_config)
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+
+        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+        self.text_filip_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+        self.logit_scale_finegraind = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+        self.logit_scale_hardneg = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+        self.embed_dim = text_config.hidden_size
+        self.world_size = 0
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_image_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+
+        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = vision_outputs[1]  # pooled_output
+        image_features = self.visual_projection(pooled_output)
+
+        return image_features
+
+    def get_image_box_roi_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        box_info=None,
+    ) -> torch.FloatTensor:
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            return_dict=return_dict
+        )
+
+        bs = pixel_values.shape[0]
+        length = vision_outputs[0].shape[1] - 1
+        h = int(math.sqrt(length))
+        w = h
+
+        feature_map = vision_outputs.hidden_states[-2]  # [:, 1:, :]
+        feature_map = self.forward_without_attn(feature_map)[:, 1:]
+
+        feature_map = self.vision_model.post_layernorm(feature_map)
+        feature_map = self.visual_projection(feature_map)
+
+        feature_map = feature_map.view(bs, h, w, -1).permute(0, 3, 1, 2)
+        x_rois = roi_align(feature_map.type(torch.float32), box_info, (1, 1), 1.0, -1, True)[..., 0, 0]
+
+        x_rois = x_rois / x_rois.norm(p=2, dim=-1, keepdim=True)
+
+        return x_rois
+
+    def get_text_features(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        walk_short_pos: Optional[bool] = True,
+        use_bbox: Optional[bool] = False
+    ) -> torch.FloatTensor:
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        pos_flag = walk_short_pos or use_bbox
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            walk_short_pos=pos_flag,
+        )
+        pooled_output = text_outputs[1]
+
+        if walk_short_pos:
+            text_features = self.text_projection(pooled_output)
+        else:
+            text_features = self.text_filip_projection(pooled_output)
+
+        return text_features
+
+    @staticmethod
+    def _denormalize_boxes(normed_boxes, x):
+        h, w = x.shape[-2:]
+        denormed_boxes = []
+        for boxes in normed_boxes:
+            new_boxes = boxes.clone()  # FIXME: do not change the value in normed_boxes!
+            new_boxes[:, [0, 2]] *= w
+            new_boxes[:, [1, 3]] *= h
+            denormed_boxes.append(new_boxes.type(torch.float32))
+        return denormed_boxes
+
+    def forward_without_attn(self, x):
+        # get last layer
+        residual = x
+        x = self.vision_model.encoder.layers[-1].layer_norm1(x)
+
+        x = F.linear(input=x, weight=self.vision_model.encoder.layers[-1].self_attn.v_proj.weight, bias=self.vision_model.encoder.layers[-1].self_attn.v_proj.bias)
+        x = self.vision_model.encoder.layers[-1].self_attn.out_proj(x)
+        x = residual + x
+
+        residual = x
+        x = self.vision_model.encoder.layers[-1].layer_norm2(x)
+        x = self.vision_model.encoder.layers[-1].mlp(x)
+        x = residual + x
+
+        return x
+
+    def get_image_dense_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding=False,
+        box_info=None,
+    ) -> torch.FloatTensor:
+
+        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            return_dict=return_dict,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+
+        bs = pixel_values.shape[0]
+        length = vision_outputs[0].shape[1] - 1
+        h = int(math.sqrt(length))
+        w = h
+
+        feature_map = vision_outputs.hidden_states[-2]  # [:, 1:, :]
+        feature_map = self.forward_without_attn(feature_map)[:, 1:]
+
+        feature_map = self.vision_model.post_layernorm(feature_map)
+        feature_map = self.visual_projection(feature_map)
+
+        return feature_map
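Note: FGCLIPModel extends the bundled CLIPModel with two text heads (text_projection for short captions, text_filip_projection for long ones, selected via walk_short_pos), extra logit scales for fine-grained and hard-negative objectives, and region-level features computed by roi_align over the penultimate ViT layer. A minimal inference sketch, assuming a model instance loaded as in the earlier example; the inputs are dummies, and the box coordinates are given on the 24x24 feature grid because roi_align is called with spatial_scale=1.0.

# Hypothetical end-to-end sketch; shapes follow the config (336 px images, patch 14 -> 24x24 grid).
import torch

model.eval()
pixel_values = torch.randn(1, 3, 336, 336)    # dummy image batch
input_ids = torch.randint(0, 49408, (1, 77))  # dummy short-caption token ids

with torch.no_grad():
    img = model.get_image_features(pixel_values=pixel_values)                # (1, 768)
    txt = model.get_text_features(input_ids=input_ids, walk_short_pos=True)  # (1, 768), short-text head
    img = img / img.norm(dim=-1, keepdim=True)
    txt = txt / txt.norm(dim=-1, keepdim=True)
    logits = model.logit_scale.exp() * img @ txt.t()                         # CLIP-style similarity

    # One region per image, boxes as (x1, y1, x2, y2) on the 24x24 feature map.
    boxes = [torch.tensor([[0.0, 0.0, 12.0, 12.0]])]
    rois = model.get_image_box_roi_features(pixel_values=pixel_values, box_info=boxes)  # L2-normalized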
trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
training_args.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7e7db289055bd76688862c87c1e7311ff64530d1bbc793bcf6ada94563d7920c
-size 6264