macwiatrak committed on
Commit
9e8002a
·
verified ·
1 Parent(s): 6531407

Upload BacformerForMaskedGM

Browse files
Files changed (3) hide show
  1. config.json +7 -203
  2. configuration_bacformer.py +72 -0
  3. model.safetensors +2 -2
config.json CHANGED
@@ -1,10 +1,12 @@
1
  {
2
- "_name_or_path": "/rds/user/mw896/rds-flotolab-9X9gY1OFt4M/projects/bacformer/output-data/complete-genomes/finetuning/runs-mgm/30epochs-new-masking/checkpoint-26994",
3
  "alpha_contrastive_loss": 0.5,
4
  "architectures": [
5
  "BacformerForMaskedGM"
6
  ],
7
  "attention_probs_dropout_prob": 0.1,
 
 
 
8
  "batch_size": 1,
9
  "ckpt_path": null,
10
  "dataloader_num_workers": 16,
@@ -15,212 +17,14 @@
15
  "hidden_dropout_prob": 0.1,
16
  "hidden_size": 480,
17
  "id2label": {
18
- "0": "LABEL_0",
19
- "1": "LABEL_1",
20
- "2": "LABEL_2",
21
- "3": "LABEL_3",
22
- "4": "LABEL_4",
23
- "5": "LABEL_5",
24
- "6": "LABEL_6",
25
- "7": "LABEL_7",
26
- "8": "LABEL_8",
27
- "9": "LABEL_9",
28
- "10": "LABEL_10",
29
- "11": "LABEL_11",
30
- "12": "LABEL_12",
31
- "13": "LABEL_13",
32
- "14": "LABEL_14",
33
- "15": "LABEL_15",
34
- "16": "LABEL_16",
35
- "17": "LABEL_17",
36
- "18": "LABEL_18",
37
- "19": "LABEL_19",
38
- "20": "LABEL_20",
39
- "21": "LABEL_21",
40
- "22": "LABEL_22",
41
- "23": "LABEL_23",
42
- "24": "LABEL_24",
43
- "25": "LABEL_25",
44
- "26": "LABEL_26",
45
- "27": "LABEL_27",
46
- "28": "LABEL_28",
47
- "29": "LABEL_29",
48
- "30": "LABEL_30",
49
- "31": "LABEL_31",
50
- "32": "LABEL_32",
51
- "33": "LABEL_33",
52
- "34": "LABEL_34",
53
- "35": "LABEL_35",
54
- "36": "LABEL_36",
55
- "37": "LABEL_37",
56
- "38": "LABEL_38",
57
- "39": "LABEL_39",
58
- "40": "LABEL_40",
59
- "41": "LABEL_41",
60
- "42": "LABEL_42",
61
- "43": "LABEL_43",
62
- "44": "LABEL_44",
63
- "45": "LABEL_45",
64
- "46": "LABEL_46",
65
- "47": "LABEL_47",
66
- "48": "LABEL_48",
67
- "49": "LABEL_49",
68
- "50": "LABEL_50",
69
- "51": "LABEL_51",
70
- "52": "LABEL_52",
71
- "53": "LABEL_53",
72
- "54": "LABEL_54",
73
- "55": "LABEL_55",
74
- "56": "LABEL_56",
75
- "57": "LABEL_57",
76
- "58": "LABEL_58",
77
- "59": "LABEL_59",
78
- "60": "LABEL_60",
79
- "61": "LABEL_61",
80
- "62": "LABEL_62",
81
- "63": "LABEL_63",
82
- "64": "LABEL_64",
83
- "65": "LABEL_65",
84
- "66": "LABEL_66",
85
- "67": "LABEL_67",
86
- "68": "LABEL_68",
87
- "69": "LABEL_69",
88
- "70": "LABEL_70",
89
- "71": "LABEL_71",
90
- "72": "LABEL_72",
91
- "73": "LABEL_73",
92
- "74": "LABEL_74",
93
- "75": "LABEL_75",
94
- "76": "LABEL_76",
95
- "77": "LABEL_77",
96
- "78": "LABEL_78",
97
- "79": "LABEL_79",
98
- "80": "LABEL_80",
99
- "81": "LABEL_81",
100
- "82": "LABEL_82",
101
- "83": "LABEL_83",
102
- "84": "LABEL_84",
103
- "85": "LABEL_85",
104
- "86": "LABEL_86",
105
- "87": "LABEL_87",
106
- "88": "LABEL_88",
107
- "89": "LABEL_89",
108
- "90": "LABEL_90",
109
- "91": "LABEL_91",
110
- "92": "LABEL_92",
111
- "93": "LABEL_93",
112
- "94": "LABEL_94",
113
- "95": "LABEL_95",
114
- "96": "LABEL_96",
115
- "97": "LABEL_97",
116
- "98": "LABEL_98",
117
- "99": "LABEL_99"
118
  },
119
  "initializer_range": 0.02,
120
  "input_dir": "/rds/user/mw896/rds-flotolab-9X9gY1OFt4M/projects/bacformer/input-data/eval-genomes/",
121
  "intermediate_size": 1280,
122
  "is_causal_gm": false,
123
  "label2id": {
124
- "LABEL_0": 0,
125
- "LABEL_1": 1,
126
- "LABEL_10": 10,
127
- "LABEL_11": 11,
128
- "LABEL_12": 12,
129
- "LABEL_13": 13,
130
- "LABEL_14": 14,
131
- "LABEL_15": 15,
132
- "LABEL_16": 16,
133
- "LABEL_17": 17,
134
- "LABEL_18": 18,
135
- "LABEL_19": 19,
136
- "LABEL_2": 2,
137
- "LABEL_20": 20,
138
- "LABEL_21": 21,
139
- "LABEL_22": 22,
140
- "LABEL_23": 23,
141
- "LABEL_24": 24,
142
- "LABEL_25": 25,
143
- "LABEL_26": 26,
144
- "LABEL_27": 27,
145
- "LABEL_28": 28,
146
- "LABEL_29": 29,
147
- "LABEL_3": 3,
148
- "LABEL_30": 30,
149
- "LABEL_31": 31,
150
- "LABEL_32": 32,
151
- "LABEL_33": 33,
152
- "LABEL_34": 34,
153
- "LABEL_35": 35,
154
- "LABEL_36": 36,
155
- "LABEL_37": 37,
156
- "LABEL_38": 38,
157
- "LABEL_39": 39,
158
- "LABEL_4": 4,
159
- "LABEL_40": 40,
160
- "LABEL_41": 41,
161
- "LABEL_42": 42,
162
- "LABEL_43": 43,
163
- "LABEL_44": 44,
164
- "LABEL_45": 45,
165
- "LABEL_46": 46,
166
- "LABEL_47": 47,
167
- "LABEL_48": 48,
168
- "LABEL_49": 49,
169
- "LABEL_5": 5,
170
- "LABEL_50": 50,
171
- "LABEL_51": 51,
172
- "LABEL_52": 52,
173
- "LABEL_53": 53,
174
- "LABEL_54": 54,
175
- "LABEL_55": 55,
176
- "LABEL_56": 56,
177
- "LABEL_57": 57,
178
- "LABEL_58": 58,
179
- "LABEL_59": 59,
180
- "LABEL_6": 6,
181
- "LABEL_60": 60,
182
- "LABEL_61": 61,
183
- "LABEL_62": 62,
184
- "LABEL_63": 63,
185
- "LABEL_64": 64,
186
- "LABEL_65": 65,
187
- "LABEL_66": 66,
188
- "LABEL_67": 67,
189
- "LABEL_68": 68,
190
- "LABEL_69": 69,
191
- "LABEL_7": 7,
192
- "LABEL_70": 70,
193
- "LABEL_71": 71,
194
- "LABEL_72": 72,
195
- "LABEL_73": 73,
196
- "LABEL_74": 74,
197
- "LABEL_75": 75,
198
- "LABEL_76": 76,
199
- "LABEL_77": 77,
200
- "LABEL_78": 78,
201
- "LABEL_79": 79,
202
- "LABEL_8": 8,
203
- "LABEL_80": 80,
204
- "LABEL_81": 81,
205
- "LABEL_82": 82,
206
- "LABEL_83": 83,
207
- "LABEL_84": 84,
208
- "LABEL_85": 85,
209
- "LABEL_86": 86,
210
- "LABEL_87": 87,
211
- "LABEL_88": 88,
212
- "LABEL_89": 89,
213
- "LABEL_9": 9,
214
- "LABEL_90": 90,
215
- "LABEL_91": 91,
216
- "LABEL_92": 92,
217
- "LABEL_93": 93,
218
- "LABEL_94": 94,
219
- "LABEL_95": 95,
220
- "LABEL_96": 96,
221
- "LABEL_97": 97,
222
- "LABEL_98": 98,
223
- "LABEL_99": 99
224
  },
225
  "layer_norm_eps": 1e-12,
226
  "logging_steps": 500,
@@ -260,9 +64,9 @@
260
  },
261
  "test": false,
262
  "test_after_train": false,
263
- "torch_dtype": "float32",
264
  "train_subset_prop": 1.0,
265
- "transformers_version": "4.38.2",
266
  "warmup_proportion": 0.1,
267
  "weight_decay": 0.01
268
  }
 
1
  {
 
2
  "alpha_contrastive_loss": 0.5,
3
  "architectures": [
4
  "BacformerForMaskedGM"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_bacformer.BacformerConfig"
9
+ },
10
  "batch_size": 1,
11
  "ckpt_path": null,
12
  "dataloader_num_workers": 16,
 
17
  "hidden_dropout_prob": 0.1,
18
  "hidden_size": 480,
19
  "id2label": {
20
+ "0": "LABEL_0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  },
22
  "initializer_range": 0.02,
23
  "input_dir": "/rds/user/mw896/rds-flotolab-9X9gY1OFt4M/projects/bacformer/input-data/eval-genomes/",
24
  "intermediate_size": 1280,
25
  "is_causal_gm": false,
26
  "label2id": {
27
+ "LABEL_0": 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  },
29
  "layer_norm_eps": 1e-12,
30
  "logging_steps": 500,
 
64
  },
65
  "test": false,
66
  "test_after_train": false,
67
+ "torch_dtype": "bfloat16",
68
  "train_subset_prop": 1.0,
69
+ "transformers_version": "4.50.3",
70
  "warmup_proportion": 0.1,
71
  "weight_decay": 0.01
72
  }
configuration_bacformer.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ from transformers import PretrainedConfig
4
+
5
# Names of the special tokens. A token's position in this tuple is its integer
# id (PAD -> 0, MASK -> 1, CLS -> 2, SEP -> 3, PROT_EMB -> 4, END -> 5), so the
# order must not be changed.
_SPECIAL_TOKEN_NAMES = ("PAD", "MASK", "CLS", "SEP", "PROT_EMB", "END")

# Mapping from special-token name to its integer id; BacformerConfig reads its
# default token ids and the special-token count from this mapping.
SPECIAL_TOKENS_DICT = {
    token_name: token_id
    for token_id, token_name in enumerate(_SPECIAL_TOKEN_NAMES)
}
13
+
14
+
15
class BacformerConfig(PretrainedConfig):
    """Hyperparameter container for a `BacformerModel`.

    Each named constructor argument is stored unchanged as an instance
    attribute of the same name. Any additional keyword arguments are passed
    straight through to ``PretrainedConfig.__init__``.

    Notes on selected arguments:
        hidden_size: Defaults to the esm2_t12_35M_UR50D embedding dimension.
        protein_clusters_vocab_size: Number of protein clusters plus one.
        num_labels: Used by downstream tasks.
        problem_type: Only relevant to BacformerForGenomeClassification.
        pad_token_id, mask_token_id, prot_emb_token_id, end_token_id,
            num_special_tokens: Defaults are taken from SPECIAL_TOKENS_DICT.
    """

    model_type = "bacformer"

    def __init__(
        self,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 8,
        hidden_size: int = 480,  # esm2_t12_35M_UR50D embedding dim
        intermediate_size: int = 1280,
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        max_position_embeddings: int = 6000,
        max_token_type_embeddings: int = 1000,
        layer_norm_eps: float = 1e-12,
        initializer_range: float = 0.02,
        pad_token_id: int = SPECIAL_TOKENS_DICT["PAD"],
        mask_token_id: int = SPECIAL_TOKENS_DICT["MASK"],
        prot_emb_token_id: int = SPECIAL_TOKENS_DICT["PROT_EMB"],
        end_token_id: int = SPECIAL_TOKENS_DICT["END"],
        num_special_tokens: int = len(SPECIAL_TOKENS_DICT),
        protein_clusters_vocab_size: int = 50001,  # nr of protein clusters + 1
        num_labels: int = 1,  # for downstream tasks
        is_causal_gm: bool = False,
        return_dict: bool = False,
        return_attn_weights: bool = False,
        alpha_contrastive_loss: float = 0.5,
        # only used by BacformerForGenomeClassification
        problem_type: Literal[
            "regression",
            "binary_classification",
            "single_label_classification",
            "multi_label_classification",
        ] = "single_label_classification",
        **kwargs,
    ):
        # Let the base class consume the generic options first; the
        # Bacformer-specific values are written afterwards.
        super().__init__(**kwargs)

        # (attribute name, value) pairs, applied strictly in this order.
        own_settings = (
            ("num_hidden_layers", num_hidden_layers),
            ("num_attention_heads", num_attention_heads),
            ("hidden_size", hidden_size),
            ("intermediate_size", intermediate_size),
            ("hidden_dropout_prob", hidden_dropout_prob),
            ("attention_probs_dropout_prob", attention_probs_dropout_prob),
            ("max_position_embeddings", max_position_embeddings),
            ("max_token_type_embeddings", max_token_type_embeddings),
            ("layer_norm_eps", layer_norm_eps),
            ("initializer_range", initializer_range),
            ("pad_token_id", pad_token_id),
            ("mask_token_id", mask_token_id),
            ("prot_emb_token_id", prot_emb_token_id),
            ("end_token_id", end_token_id),
            ("num_special_tokens", num_special_tokens),
            ("protein_clusters_vocab_size", protein_clusters_vocab_size),
            ("num_labels", num_labels),
            ("is_causal_gm", is_causal_gm),
            ("return_dict", return_dict),
            ("return_attn_weights", return_attn_weights),
            ("problem_type", problem_type),
            ("alpha_contrastive_loss", alpha_contrastive_loss),
        )
        for attr_name, attr_value in own_settings:
            setattr(self, attr_name, attr_value)
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26ba938a94953a3e715b43ad548737fe2e21a10c36715b025068f6b5b68daac7
3
- size 203428892
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ea7dca2b4580ef7914b438f1f4136c0424170d09073ebc3c1a5e3e262da13d0
3
+ size 101724522