Macropodus committed
Commit cbbf1e1 · verified · 1 parent: ede0f3b

Upload 8 files

Files changed (8)
  1. README.md +18 -3
  2. config.json +19 -0
  3. idx2pun.json +123 -0
  4. pytorch_model.bin +3 -0
  5. sl.config +630 -0
  6. special_tokens_map.json +7 -0
  7. tokenizer_config.json +15 -0
  8. vocab.txt +0 -0
README.md CHANGED
@@ -1,3 +1,18 @@
1
- ---
2
- license: apache-2.0
3
- ---
1
+ # bert4sl_punct_zh_public
2
+ ## Time
3
+ 2024.6
4
+
5
+ ## Training data (dataset)
6
+ Built from filtered high-quality corpora: high-quality text was collected and then filtered with perplexity (PPL) scoring, among other steps;
7
+ - [chinese-poetry/chinese-poetry](https://github.com/chinese-poetry/chinese-poetry)
8
+ - [chinese-poetry/huajianji](https://github.com/chinese-poetry/huajianji)
9
+ - [garychowcmu/daizhigev20](https://github.com/garychowcmu/daizhigev20)
10
+ - [yangjianxin1/Firefly](https://github.com/yangjianxin1/Firefly)
11
+ - [Xuexi Qiangguo, 4.28M samples](https://huggingface.co/datasets/Macropodus/xuexiqiangguo_428w); CN mirror: [Macropodus/xuexiqiangguo_428w](https://hf-mirror.com/datasets/Macropodus/xuexiqiangguo_428w)
12
+ - [xi_talk, 400k samples](https://huggingface.co/datasets/Papersnake/xi_talk); CN mirror: [Papersnake/xi_talk](https://hf-mirror.com/datasets/Papersnake/xi_talk)
13
+ - [1M well-formed sentences generated by qwen-7b]
14
+ - [People's Daily corpus, 20M sentences]
15
+
16
+ ## Training notes
17
+ At most 100k sentences per punctuation type, 5M training sentences in total, trained for 3 epochs;
18
+
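Punctuation restoration here is a character-level sequence-labeling task: for every input character the model predicts which punctuation string (if any) from `idx2pun.json` should follow it. Below is a minimal, hedged sketch of running the released files with `transformers`. The repository id, the use of a stock `BertForTokenClassification` head, and the greedy argmax decode are assumptions on my part; training actually used a custom CRF sequence-labeling framework (see `sl.config`, `"task_type": "SL-CRF"`), so the published checkpoint may not load cleanly into this head.

```python
# Hedged sketch, not the author's documented inference code: it assumes the
# checkpoint can be read as a stock BERT token classifier, whereas training
# used a CRF head (sl.config: "task_type": "SL-CRF"). The repo id is assumed.
import torch
from transformers import BertForTokenClassification, BertTokenizerFast

repo_id = "Macropodus/bert4sl_punct_zh_public"  # assumed repository id

tokenizer = BertTokenizerFast.from_pretrained(repo_id)
model = BertForTokenClassification.from_pretrained(repo_id, num_labels=137)
model.eval()

text = "今天天气很好我们去公园散步吧"
# Feed the sentence character by character, as the char-level setup expects.
enc = tokenizer(list(text), is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    label_ids = model(**enc).logits.argmax(dim=-1)[0].tolist()
# label_ids index sl.config's "i2l" map; see the decoding sketch after
# sl.config below for turning them into punctuation marks.
```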
config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "directionality": "bidi",
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 768,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 3072,
9
+ "max_position_embeddings": 512,
10
+ "num_attention_heads": 12,
11
+ "num_hidden_layers": 12,
12
+ "pooler_fc_size": 768,
13
+ "pooler_num_attention_heads": 12,
14
+ "pooler_num_fc_layers": 3,
15
+ "pooler_size_per_head": 128,
16
+ "pooler_type": "first_token_transform",
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 21128
19
+ }
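These are the standard BERT-base hyperparameters of the `hfl/chinese-roberta-wwm-ext` backbone named in `sl.config` (12 layers, 12 heads, hidden size 768, 21128-token Chinese vocab). A small sketch of reading the file into a `transformers` config object; `num_labels` is not stored here, so it is supplied by hand from `sl.config` (an assumption about how the head size should be injected).

```python
# Sketch: build a BertConfig from the raw config.json shown above.
# num_labels=137 is taken from sl.config and added manually, since
# config.json itself does not record the classification head size.
import json
from transformers import BertConfig

with open("config.json", encoding="utf-8") as f:
    cfg = BertConfig(**json.load(f), num_labels=137)

print(cfg.num_hidden_layers, cfg.hidden_size, cfg.vocab_size)  # 12 768 21128
```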
idx2pun.json ADDED
@@ -0,0 +1,123 @@
1
+ {
2
+ "0": ",",
3
+ "1": "。",
4
+ "2": "、",
5
+ "3": ";",
6
+ "4": "?",
7
+ "5": "!",
8
+ "6": ":",
9
+ "7": "“",
10
+ "8": "”",
11
+ "9": "‘",
12
+ "10": "’",
13
+ "11": "—",
14
+ "12": "…",
15
+ "13": "·",
16
+ "14": "~",
17
+ "15": "《",
18
+ "16": "》",
19
+ "17": "(",
20
+ "18": ")",
21
+ "19": "<",
22
+ "20": ">",
23
+ "21": ":“",
24
+ "22": "。”",
25
+ "23": "!”",
26
+ "24": "?”",
27
+ "25": ",“",
28
+ "26": "”,",
29
+ "27": "”。",
30
+ "28": "”!",
31
+ "29": "”?",
32
+ "30": "……",
33
+ "31": "——",
34
+ "32": "》。",
35
+ "33": "”“",
36
+ "34": "》,",
37
+ "35": ",《",
38
+ "36": "”、",
39
+ "37": "、“",
40
+ "38": "……”",
41
+ "39": "》《",
42
+ "40": "”;",
43
+ "41": "”:",
44
+ "42": "”、“",
45
+ "43": "”,“",
46
+ "44": "”——",
47
+ "45": "“‘",
48
+ "46": "’”",
49
+ "47": ",‘",
50
+ "48": ";“",
51
+ "49": "》、《",
52
+ "50": "”…",
53
+ "51": "》、",
54
+ "52": "》…",
55
+ "53": "),",
56
+ "54": ")。",
57
+ "55": ")、",
58
+ "56": "——“",
59
+ "57": "”(",
60
+ "58": "》(",
61
+ "59": "~",
62
+ "60": ":《",
63
+ "61": "”)",
64
+ "62": ":(",
65
+ "63": ")》",
66
+ "64": "「",
67
+ "65": "」",
68
+ "66": "):",
69
+ "67": "--",
70
+ "68": ")》,",
71
+ "69": "》),",
72
+ "70": ");",
73
+ "71": ",(",
74
+ "72": ";(",
75
+ "73": "《“",
76
+ "74": "》)",
77
+ "75": "”;“",
78
+ "76": "〈",
79
+ "77": "〉",
80
+ "78": "》;",
81
+ "79": ":“‘",
82
+ "80": ")“",
83
+ "81": ")”",
84
+ "82": "》:",
85
+ "83": "”:“",
86
+ "84": "[",
87
+ "85": "]",
88
+ "86": "—”",
89
+ "87": ";《",
90
+ "88": ")》(",
91
+ "89": "’、",
92
+ "90": "》,《",
93
+ "91": "(“",
94
+ "92": "’、‘",
95
+ "93": "”(《",
96
+ "94": ",“‘",
97
+ "95": "”》",
98
+ "96": "〔",
99
+ "97": "〕",
100
+ "98": ")、“",
101
+ "99": "、‘",
102
+ "100": ":‘",
103
+ "101": "(《",
104
+ "102": "?“",
105
+ "103": "」,",
106
+ "104": "《〈",
107
+ "105": "》,“",
108
+ "106": "。“",
109
+ "107": "!“",
110
+ "108": ";“",
111
+ "109": ":「",
112
+ "110": "」)",
113
+ "111": ":“《",
114
+ "112": "“《",
115
+ "113": ")(",
116
+ "114": "『",
117
+ "115": "』",
118
+ "116": "“(",
119
+ "117": ")、《",
120
+ "118": "),“",
121
+ "119": ")》《",
122
+ "120": "、("
123
+ }
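`idx2pun.json` maps each punctuation class index to the literal string inserted after a character; single marks (`"0"` → `,`) and frequent combinations such as closing quote plus period (`"22"` → `。”`) each get their own class. A small illustrative sketch of applying the map; the `restore` helper and its `pun_ids` input are made-up names for illustration, not part of the repo.

```python
# Illustrative only: given one punctuation class per character (-1 = nothing),
# rebuild the punctuated string from idx2pun.json.
import json

with open("idx2pun.json", encoding="utf-8") as f:
    idx2pun = json.load(f)  # {"0": ",", "1": "。", "4": "?", ...}

def restore(chars, pun_ids):
    out = []
    for ch, pid in zip(chars, pun_ids):
        out.append(ch)
        if pid >= 0:
            out.append(idx2pun[str(pid)])
    return "".join(out)

print(restore("你好吗我很好", [-1, -1, 4, -1, -1, 1]))  # 你好吗?我很好。
```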
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ed27d2d8fefb74a83159c1513e36cdf9f762254374ee167dcc61e2704ba3290
3
+ size 416223240
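The three lines above are only a Git LFS pointer; the actual ~416 MB checkpoint lives in LFS storage and can be fetched with `git lfs pull` or, programmatically, with `huggingface_hub` (repository id assumed):

```python
# Sketch: download the real weight file rather than the LFS pointer text.
from huggingface_hub import hf_hub_download

weights_path = hf_hub_download(
    repo_id="Macropodus/bert4sl_punct_zh_public",  # assumed repo id
    filename="pytorch_model.bin",
)
print(weights_path)  # local cache path to the ~416 MB checkpoint
```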
sl.config ADDED
@@ -0,0 +1,630 @@
1
+ {
2
+ "CUDA_VISIBLE_DEVICES": "0",
3
+ "USE_TORCH": "1",
4
+ "output_hidden_states": null,
5
+ "pretrained_model_name_or_path": "hfl/chinese-roberta-wwm-ext",
6
+ "model_save_path": "../output/sequence_labeling/bert4sl_public_v10",
7
+ "config_name": "sl.config",
8
+ "model_name": "pytorch_model.bin",
9
+ "path_train": "/opt/dataset/punct/v2_merge_punct_public.train.span",
10
+ "path_dev": "/opt/dataset/punct/v2_merge_punct_public.dev.span",
11
+ "path_tet": "/opt/dataset/punct/v2_merge_punct_public.tet.span",
12
+ "loss_type": "CIRCLE_LOSS",
13
+ "corpus_type": "DATA-SPAN",
14
+ "task_type": "SL-CRF",
15
+ "model_type": "BERT",
16
+ "active_type": "GELU",
17
+ "padding_side": "RIGHT",
18
+ "max_len_limit": 512,
19
+ "batch_size": 32,
20
+ "num_labels": 137,
21
+ "dense_lr": 3e-05,
22
+ "max_len": 128,
23
+ "epochs": 3,
24
+ "lr": 3e-05,
25
+ "grad_accum_steps": 4,
26
+ "max_grad_norm": 1.0,
27
+ "weight_decay": 0.99,
28
+ "dropout_rate": 0.1,
29
+ "adam_eps": 1e-08,
30
+ "seed": 2021,
31
+ "evaluate_steps": 3200,
32
+ "warmup_steps": 128,
33
+ "ignore_index": 0,
34
+ "save_steps": 3200,
35
+ "stop_epochs": 4,
36
+ "num_workers": 0,
37
+ "max_steps": -1,
38
+ "flag_save_model_state": true,
39
+ "flag_dynamic_encode": false,
40
+ "flag_tokenizer_char": true,
41
+ "flag_soft_label": true,
42
+ "flag_dropout": true,
43
+ "flag_shuffle": true,
44
+ "flag_active": true,
45
+ "flag_train": false,
46
+ "flag_cuda": true,
47
+ "flag_adv": false,
48
+ "save_best_mertics_key": [
49
+ "micro_avg",
50
+ "f1-score"
51
+ ],
52
+ "multi_label_threshold": 0.3,
53
+ "grid_pointer_threshold": 0,
54
+ "xy_keys_predict": [
55
+ "text",
56
+ "label"
57
+ ],
58
+ "xy_keys": [
59
+ "text",
60
+ "label"
61
+ ],
62
+ "label_sep": "|myz|",
63
+ "sl_ctype": "BIO",
64
+ "head_size": 64,
65
+ "adv_emb_name": "word_embeddings.",
66
+ "adv_eps": 1.0,
67
+ "additional_special_tokens": [],
68
+ "keys": [
69
+ "text",
70
+ "label"
71
+ ],
72
+ "version": "v10",
73
+ "row_sep": " ",
74
+ "prior": [
75
+ 0.5,
76
+ 0.4203515291420627,
77
+ 0.20280245911475547,
78
+ 0.09099409019456753,
79
+ 0.057407213264443396,
80
+ 0.0346761425207655,
81
+ 0.016178442418445606,
82
+ 0.01350873328253565,
83
+ 0.01312916600237759,
84
+ 0.012515661130986146,
85
+ 0.01244560116956664,
86
+ 0.0123608316491478,
87
+ 0.010844786654274916,
88
+ 0.010433171743857142,
89
+ 0.007847827741864325,
90
+ 0.007773167883987983,
91
+ 0.007506677179367756,
92
+ 0.006972988093749327,
93
+ 0.005921835930893178,
94
+ 0.004883320846215149,
95
+ 0.004302217443272409,
96
+ 0.003862548219299204,
97
+ 0.0036721327253112688,
98
+ 0.0026064226483939234,
99
+ 0.0025564303671212745,
100
+ 0.0022204357325213793,
101
+ 0.0020437188312785277,
102
+ 0.002031587236227531,
103
+ 0.001957786699667301,
104
+ 0.0019046098746937656,
105
+ 0.0018862608371791332,
106
+ 0.001810640561361254,
107
+ 0.001802856121203531,
108
+ 0.001711666965070206,
109
+ 0.0016780523371164027,
110
+ 0.0014267261263099216,
111
+ 0.0013526728481861295,
112
+ 0.0013433719586470319,
113
+ 0.001107867369719559,
114
+ 0.0007129839508096172,
115
+ 0.0006982238434975713,
116
+ 0.0006826549631821255,
117
+ 0.0006443898904587734,
118
+ 0.00048521325372715454,
119
+ 0.0004481107921961897,
120
+ 0.00042915517492900744,
121
+ 0.00042212895946197186,
122
+ 0.0003619764673341133,
123
+ 0.00030743483791734075,
124
+ 0.00028130136024498545,
125
+ 0.0002759937874101744,
126
+ 0.00025198333887174344,
127
+ 0.00024682741097506987,
128
+ 0.00021477978071535365,
129
+ 0.00020866343487714283,
130
+ 0.0001854617593421117,
131
+ 0.00017393674404366484,
132
+ 0.00015897444348076894,
133
+ 0.0001573568974739694,
134
+ 0.00014355720810346067,
135
+ 0.00011985004944130466,
136
+ 0.00011964785619045472,
137
+ 0.00010529213538010865,
138
+ 0.00010488774887840876,
139
+ 0.00010124827036310977,
140
+ 9.993401423258512e-05,
141
+ 9.41715065833617e-05,
142
+ 9.144189769688744e-05,
143
+ 8.886393374855065e-05,
144
+ 8.653871136377629e-05,
145
+ 8.057401046370292e-05,
146
+ 7.738946676281629e-05,
147
+ 7.294121524411751e-05,
148
+ 7.18797006771553e-05,
149
+ 7.132366923731796e-05,
150
+ 7.127312092460547e-05,
151
+ 6.171948982194559e-05,
152
+ 6.16689415092331e-05,
153
+ 6.030413706599598e-05,
154
+ 5.5047112543897416e-05,
155
+ 5.125598909046096e-05,
156
+ 5.100324752689853e-05,
157
+ 4.89307667056866e-05,
158
+ 4.670664094633721e-05,
159
+ 3.9882618730151575e-05,
160
+ 3.927603897760174e-05,
161
+ 3.811342778521456e-05,
162
+ 3.5636560462302737e-05,
163
+ 3.5232173960602844e-05,
164
+ 3.508052902246539e-05,
165
+ 2.8711441620692133e-05,
166
+ 2.7649927053729924e-05,
167
+ 2.7043347301180092e-05,
168
+ 2.4920318167255673e-05,
169
+ 2.4060996851143406e-05,
170
+ 2.390935191300595e-05,
171
+ 2.310057890960617e-05,
172
+ 2.2645644095193797e-05,
173
+ 2.244345084434385e-05,
174
+ 2.2039064342643962e-05,
175
+ 2.168522615365656e-05,
176
+ 1.9511648707019654e-05,
177
+ 1.865232739090739e-05,
178
+ 1.743916788580772e-05,
179
+ 1.7388619573095235e-05,
180
+ 1.6984233071395346e-05,
181
+ 1.6377653318845514e-05,
182
+ 1.587217019172065e-05,
183
+ 1.5164493813745846e-05,
184
+ 1.4204075872208608e-05,
185
+ 1.3850237683221205e-05,
186
+ 1.3496399494233802e-05,
187
+ 1.2940368054396454e-05,
188
+ 1.1474466985734356e-05,
189
+ 1.0968983858609495e-05,
190
+ 1.0210759167922203e-05,
191
+ 7.834988470435354e-06,
192
+ 6.874570528898117e-06,
193
+ 6.419635714485741e-06,
194
+ 6.419635714485741e-06,
195
+ 5.914152587360879e-06,
196
+ 5.459217772948504e-06,
197
+ 5.307572834811046e-06,
198
+ 3.740575140723975e-06,
199
+ 3.740575140723975e-06,
200
+ 3.336188639024086e-06,
201
+ 3.2856403263116e-06,
202
+ 2.8812538246117107e-06,
203
+ 2.274674072061877e-06,
204
+ 1.3142561305246399e-06,
205
+ 1.010966254249723e-06,
206
+ 8.087730033997784e-07,
207
+ 2.5274156356243075e-07,
208
+ 2.021932508499446e-07,
209
+ 1.010966254249723e-07,
210
+ 1.010966254249723e-07,
211
+ 1.010966254249723e-07
212
+ ],
213
+ "l2i_conll": {
214
+ "O": 0,
215
+ "B-0": 1,
216
+ "B-1": 2,
217
+ "B-2": 3,
218
+ "B-7": 4,
219
+ "B-8": 5,
220
+ "B-15": 6,
221
+ "B-4": 7,
222
+ "B-26": 8,
223
+ "B-27": 9,
224
+ "B-3": 10,
225
+ "B-17": 11,
226
+ "B-21": 12,
227
+ "B-16": 13,
228
+ "B-18": 14,
229
+ "B-5": 15,
230
+ "B-25": 16,
231
+ "B-22": 17,
232
+ "B-6": 18,
233
+ "B-30": 19,
234
+ "B-34": 20,
235
+ "B-31": 21,
236
+ "B-13": 22,
237
+ "B-23": 23,
238
+ "B-53": 24,
239
+ "B-42": 25,
240
+ "B-11": 26,
241
+ "B-24": 27,
242
+ "B-137": 28,
243
+ "B-32": 29,
244
+ "B-37": 30,
245
+ "B-49": 31,
246
+ "B-35": 32,
247
+ "B-9": 33,
248
+ "B-43": 34,
249
+ "B-54": 35,
250
+ "B-36": 36,
251
+ "B-10": 37,
252
+ "B-40": 38,
253
+ "B-55": 39,
254
+ "B-29": 40,
255
+ "B-39": 41,
256
+ "B-57": 42,
257
+ "B-41": 43,
258
+ "B-12": 44,
259
+ "B-58": 45,
260
+ "B-14": 46,
261
+ "B-28": 47,
262
+ "B-62": 48,
263
+ "B-51": 49,
264
+ "B-56": 50,
265
+ "B-44": 51,
266
+ "B-38": 52,
267
+ "B-63": 53,
268
+ "B-61": 54,
269
+ "B-108": 55,
270
+ "B-123": 56,
271
+ "B-60": 57,
272
+ "B-68": 58,
273
+ "B-72": 59,
274
+ "B-131": 60,
275
+ "B-69": 61,
276
+ "B-64": 62,
277
+ "B-71": 63,
278
+ "B-66": 64,
279
+ "B-70": 65,
280
+ "B-73": 66,
281
+ "B-75": 67,
282
+ "B-79": 68,
283
+ "B-142": 69,
284
+ "B-74": 70,
285
+ "B-78": 71,
286
+ "B-45": 72,
287
+ "B-77": 73,
288
+ "B-121": 74,
289
+ "B-47": 75,
290
+ "B-46": 76,
291
+ "B-81": 77,
292
+ "B-76": 78,
293
+ "B-96": 79,
294
+ "B-80": 80,
295
+ "B-97": 81,
296
+ "B-65": 82,
297
+ "B-88": 83,
298
+ "B-141": 84,
299
+ "B-140": 85,
300
+ "B-87": 86,
301
+ "B-90": 87,
302
+ "B-91": 88,
303
+ "B-83": 89,
304
+ "B-110": 90,
305
+ "B-93": 91,
306
+ "B-95": 92,
307
+ "B-92": 93,
308
+ "B-82": 94,
309
+ "B-98": 95,
310
+ "B-89": 96,
311
+ "B-104": 97,
312
+ "B-143": 98,
313
+ "B-94": 99,
314
+ "B-101": 100,
315
+ "B-144": 101,
316
+ "B-113": 102,
317
+ "B-111": 103,
318
+ "B-103": 104,
319
+ "B-105": 105,
320
+ "B-112": 106,
321
+ "B-130": 107,
322
+ "B-99": 108,
323
+ "B-118": 109,
324
+ "B-145": 110,
325
+ "B-146": 111,
326
+ "B-119": 112,
327
+ "B-50": 113,
328
+ "B-122": 114,
329
+ "B-117": 115,
330
+ "B-138": 116,
331
+ "B-116": 117,
332
+ "B-126": 118,
333
+ "B-100": 119,
334
+ "B-147": 120,
335
+ "B-120": 121,
336
+ "B-148": 122,
337
+ "B-133": 123,
338
+ "B-139": 124,
339
+ "B-129": 125,
340
+ "B-128": 126,
341
+ "B-114": 127,
342
+ "B-115": 128,
343
+ "B-52": 129,
344
+ "B-125": 130,
345
+ "B-134": 131,
346
+ "B-86": 132,
347
+ "B-107": 133,
348
+ "B-127": 134,
349
+ "B-109": 135,
350
+ "B-124": 136
351
+ },
352
+ "l2i": {
353
+ "O": 0,
354
+ "B-0": 1,
355
+ "B-1": 2,
356
+ "B-2": 3,
357
+ "B-7": 4,
358
+ "B-8": 5,
359
+ "B-15": 6,
360
+ "B-4": 7,
361
+ "B-26": 8,
362
+ "B-27": 9,
363
+ "B-3": 10,
364
+ "B-17": 11,
365
+ "B-21": 12,
366
+ "B-16": 13,
367
+ "B-18": 14,
368
+ "B-5": 15,
369
+ "B-25": 16,
370
+ "B-22": 17,
371
+ "B-6": 18,
372
+ "B-30": 19,
373
+ "B-34": 20,
374
+ "B-31": 21,
375
+ "B-13": 22,
376
+ "B-23": 23,
377
+ "B-53": 24,
378
+ "B-42": 25,
379
+ "B-11": 26,
380
+ "B-24": 27,
381
+ "B-137": 28,
382
+ "B-32": 29,
383
+ "B-37": 30,
384
+ "B-49": 31,
385
+ "B-35": 32,
386
+ "B-9": 33,
387
+ "B-43": 34,
388
+ "B-54": 35,
389
+ "B-36": 36,
390
+ "B-10": 37,
391
+ "B-40": 38,
392
+ "B-55": 39,
393
+ "B-29": 40,
394
+ "B-39": 41,
395
+ "B-57": 42,
396
+ "B-41": 43,
397
+ "B-12": 44,
398
+ "B-58": 45,
399
+ "B-14": 46,
400
+ "B-28": 47,
401
+ "B-62": 48,
402
+ "B-51": 49,
403
+ "B-56": 50,
404
+ "B-44": 51,
405
+ "B-38": 52,
406
+ "B-63": 53,
407
+ "B-61": 54,
408
+ "B-108": 55,
409
+ "B-123": 56,
410
+ "B-60": 57,
411
+ "B-68": 58,
412
+ "B-72": 59,
413
+ "B-131": 60,
414
+ "B-69": 61,
415
+ "B-64": 62,
416
+ "B-71": 63,
417
+ "B-66": 64,
418
+ "B-70": 65,
419
+ "B-73": 66,
420
+ "B-75": 67,
421
+ "B-79": 68,
422
+ "B-142": 69,
423
+ "B-74": 70,
424
+ "B-78": 71,
425
+ "B-45": 72,
426
+ "B-77": 73,
427
+ "B-121": 74,
428
+ "B-47": 75,
429
+ "B-46": 76,
430
+ "B-81": 77,
431
+ "B-76": 78,
432
+ "B-96": 79,
433
+ "B-80": 80,
434
+ "B-97": 81,
435
+ "B-65": 82,
436
+ "B-88": 83,
437
+ "B-141": 84,
438
+ "B-140": 85,
439
+ "B-87": 86,
440
+ "B-90": 87,
441
+ "B-91": 88,
442
+ "B-83": 89,
443
+ "B-110": 90,
444
+ "B-93": 91,
445
+ "B-95": 92,
446
+ "B-92": 93,
447
+ "B-82": 94,
448
+ "B-98": 95,
449
+ "B-89": 96,
450
+ "B-104": 97,
451
+ "B-143": 98,
452
+ "B-94": 99,
453
+ "B-101": 100,
454
+ "B-144": 101,
455
+ "B-113": 102,
456
+ "B-111": 103,
457
+ "B-103": 104,
458
+ "B-105": 105,
459
+ "B-112": 106,
460
+ "B-130": 107,
461
+ "B-99": 108,
462
+ "B-118": 109,
463
+ "B-145": 110,
464
+ "B-146": 111,
465
+ "B-119": 112,
466
+ "B-50": 113,
467
+ "B-122": 114,
468
+ "B-117": 115,
469
+ "B-138": 116,
470
+ "B-116": 117,
471
+ "B-126": 118,
472
+ "B-100": 119,
473
+ "B-147": 120,
474
+ "B-120": 121,
475
+ "B-148": 122,
476
+ "B-133": 123,
477
+ "B-139": 124,
478
+ "B-129": 125,
479
+ "B-128": 126,
480
+ "B-114": 127,
481
+ "B-115": 128,
482
+ "B-52": 129,
483
+ "B-125": 130,
484
+ "B-134": 131,
485
+ "B-86": 132,
486
+ "B-107": 133,
487
+ "B-127": 134,
488
+ "B-109": 135,
489
+ "B-124": 136
490
+ },
491
+ "i2l": {
492
+ "0": "O",
493
+ "1": "B-0",
494
+ "2": "B-1",
495
+ "3": "B-2",
496
+ "4": "B-7",
497
+ "5": "B-8",
498
+ "6": "B-15",
499
+ "7": "B-4",
500
+ "8": "B-26",
501
+ "9": "B-27",
502
+ "10": "B-3",
503
+ "11": "B-17",
504
+ "12": "B-21",
505
+ "13": "B-16",
506
+ "14": "B-18",
507
+ "15": "B-5",
508
+ "16": "B-25",
509
+ "17": "B-22",
510
+ "18": "B-6",
511
+ "19": "B-30",
512
+ "20": "B-34",
513
+ "21": "B-31",
514
+ "22": "B-13",
515
+ "23": "B-23",
516
+ "24": "B-53",
517
+ "25": "B-42",
518
+ "26": "B-11",
519
+ "27": "B-24",
520
+ "28": "B-137",
521
+ "29": "B-32",
522
+ "30": "B-37",
523
+ "31": "B-49",
524
+ "32": "B-35",
525
+ "33": "B-9",
526
+ "34": "B-43",
527
+ "35": "B-54",
528
+ "36": "B-36",
529
+ "37": "B-10",
530
+ "38": "B-40",
531
+ "39": "B-55",
532
+ "40": "B-29",
533
+ "41": "B-39",
534
+ "42": "B-57",
535
+ "43": "B-41",
536
+ "44": "B-12",
537
+ "45": "B-58",
538
+ "46": "B-14",
539
+ "47": "B-28",
540
+ "48": "B-62",
541
+ "49": "B-51",
542
+ "50": "B-56",
543
+ "51": "B-44",
544
+ "52": "B-38",
545
+ "53": "B-63",
546
+ "54": "B-61",
547
+ "55": "B-108",
548
+ "56": "B-123",
549
+ "57": "B-60",
550
+ "58": "B-68",
551
+ "59": "B-72",
552
+ "60": "B-131",
553
+ "61": "B-69",
554
+ "62": "B-64",
555
+ "63": "B-71",
556
+ "64": "B-66",
557
+ "65": "B-70",
558
+ "66": "B-73",
559
+ "67": "B-75",
560
+ "68": "B-79",
561
+ "69": "B-142",
562
+ "70": "B-74",
563
+ "71": "B-78",
564
+ "72": "B-45",
565
+ "73": "B-77",
566
+ "74": "B-121",
567
+ "75": "B-47",
568
+ "76": "B-46",
569
+ "77": "B-81",
570
+ "78": "B-76",
571
+ "79": "B-96",
572
+ "80": "B-80",
573
+ "81": "B-97",
574
+ "82": "B-65",
575
+ "83": "B-88",
576
+ "84": "B-141",
577
+ "85": "B-140",
578
+ "86": "B-87",
579
+ "87": "B-90",
580
+ "88": "B-91",
581
+ "89": "B-83",
582
+ "90": "B-110",
583
+ "91": "B-93",
584
+ "92": "B-95",
585
+ "93": "B-92",
586
+ "94": "B-82",
587
+ "95": "B-98",
588
+ "96": "B-89",
589
+ "97": "B-104",
590
+ "98": "B-143",
591
+ "99": "B-94",
592
+ "100": "B-101",
593
+ "101": "B-144",
594
+ "102": "B-113",
595
+ "103": "B-111",
596
+ "104": "B-103",
597
+ "105": "B-105",
598
+ "106": "B-112",
599
+ "107": "B-130",
600
+ "108": "B-99",
601
+ "109": "B-118",
602
+ "110": "B-145",
603
+ "111": "B-146",
604
+ "112": "B-119",
605
+ "113": "B-50",
606
+ "114": "B-122",
607
+ "115": "B-117",
608
+ "116": "B-138",
609
+ "117": "B-116",
610
+ "118": "B-126",
611
+ "119": "B-100",
612
+ "120": "B-147",
613
+ "121": "B-120",
614
+ "122": "B-148",
615
+ "123": "B-133",
616
+ "124": "B-139",
617
+ "125": "B-129",
618
+ "126": "B-128",
619
+ "127": "B-114",
620
+ "128": "B-115",
621
+ "129": "B-52",
622
+ "130": "B-125",
623
+ "131": "B-134",
624
+ "132": "B-86",
625
+ "133": "B-107",
626
+ "134": "B-127",
627
+ "135": "B-109",
628
+ "136": "B-124"
629
+ }
630
+ }
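`sl.config` records the full training recipe: CRF sequence labeling over character-level BIO tags (`"task_type": "SL-CRF"`, `"sl_ctype": "BIO"`), 137 labels, an `hfl/chinese-roberta-wwm-ext` backbone, batch size 32 with 4 gradient-accumulation steps, lr 3e-5, max length 128, 3 epochs, plus label maps and class priors. The `i2l` map turns a model output id into a BIO tag such as `B-7`, and the numeric suffix of that tag indexes `idx2pun.json`. A hedged sketch of composing the two maps follows; the helper name is illustrative, and it assumes `sl.config` parses as plain JSON, which its contents suggest.

```python
# Illustrative decode step: model label id -> BIO tag (sl.config "i2l")
# -> punctuation string (idx2pun.json). Tag "O" means no punctuation follows.
import json

with open("sl.config", encoding="utf-8") as f:
    i2l = json.load(f)["i2l"]        # e.g. {"0": "O", "2": "B-1", ...}
with open("idx2pun.json", encoding="utf-8") as f:
    idx2pun = json.load(f)

def label_id_to_punct(label_id):
    tag = i2l[str(label_id)]         # "O" or "B-<k>"
    if tag == "O":
        return ""
    key = tag.split("-", 1)[1]
    # A few tags in i2l (e.g. "B-137".."B-148") have no entry in the released
    # idx2pun.json; fall back to the empty string for those.
    return idx2pun.get(key, "")

print(repr(label_id_to_punct(2)))  # '。'  (i2l["2"] == "B-1")
print(repr(label_id_to_punct(0)))  # ''    (tag "O")
```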
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_basic_tokenize": true,
5
+ "do_lower_case": true,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "never_split": null,
9
+ "pad_token": "[PAD]",
10
+ "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "PretrainedTokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
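The recorded `tokenizer_class`, `PretrainedTokenizer`, is not a concrete class that `AutoTokenizer` can resolve. Since the repository ships a standard WordPiece `vocab.txt` with the usual BERT special tokens and `tokenize_chinese_chars` enabled, loading it explicitly as a BERT tokenizer is a reasonable assumption:

```python
# Sketch: load the shipped vocab/settings explicitly as a BERT tokenizer,
# assuming AutoTokenizer cannot resolve the "PretrainedTokenizer" class name
# recorded in tokenizer_config.json. The repo id is assumed.
from transformers import BertTokenizerFast

tok = BertTokenizerFast.from_pretrained(
    "Macropodus/bert4sl_punct_zh_public",
    do_lower_case=True,
)
print(tok.tokenize("今天天气很好"))  # one token per Chinese character
```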
vocab.txt ADDED
The diff for this file is too large to render. See raw diff