benjamin committed on
Commit
075d234
·
verified ·
1 Parent(s): 648375d

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +23 -0
  2. tokenizer.json +672 -0
  3. tokenizer_config.json +63 -0
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|eot_id|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|eot_id|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
@@ -0,0 +1,672 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 256,
8
+ "content": "<|begin_of_text|>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 265,
17
+ "content": "<|eot_id|>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 512,
26
+ "content": "ĊĊ",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": true,
31
+ "special": false
32
+ },
33
+ {
34
+ "id": 513,
35
+ "content": "user",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": true,
40
+ "special": false
41
+ },
42
+ {
43
+ "id": 514,
44
+ "content": "assistant",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": true,
49
+ "special": false
50
+ },
51
+ {
52
+ "id": 515,
53
+ "content": "system",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": true,
58
+ "special": false
59
+ }
60
+ ],
61
+ "normalizer": null,
62
+ "pre_tokenizer": {
63
+ "type": "Sequence",
64
+ "pretokenizers": [
65
+ {
66
+ "type": "Split",
67
+ "pattern": {
68
+ "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
69
+ },
70
+ "behavior": "Isolated",
71
+ "invert": false
72
+ },
73
+ {
74
+ "type": "ByteLevel",
75
+ "add_prefix_space": false,
76
+ "trim_offsets": true,
77
+ "use_regex": false
78
+ }
79
+ ]
80
+ },
81
+ "post_processor": {
82
+ "type": "Sequence",
83
+ "processors": [
84
+ {
85
+ "type": "ByteLevel",
86
+ "add_prefix_space": true,
87
+ "trim_offsets": false,
88
+ "use_regex": true
89
+ },
90
+ {
91
+ "type": "TemplateProcessing",
92
+ "single": [
93
+ {
94
+ "SpecialToken": {
95
+ "id": "<|begin_of_text|>",
96
+ "type_id": 0
97
+ }
98
+ },
99
+ {
100
+ "Sequence": {
101
+ "id": "A",
102
+ "type_id": 0
103
+ }
104
+ }
105
+ ],
106
+ "pair": [
107
+ {
108
+ "SpecialToken": {
109
+ "id": "<|begin_of_text|>",
110
+ "type_id": 0
111
+ }
112
+ },
113
+ {
114
+ "Sequence": {
115
+ "id": "A",
116
+ "type_id": 0
117
+ }
118
+ },
119
+ {
120
+ "SpecialToken": {
121
+ "id": "<|begin_of_text|>",
122
+ "type_id": 1
123
+ }
124
+ },
125
+ {
126
+ "Sequence": {
127
+ "id": "B",
128
+ "type_id": 1
129
+ }
130
+ }
131
+ ],
132
+ "special_tokens": {
133
+ "<|begin_of_text|>": {
134
+ "id": "<|begin_of_text|>",
135
+ "ids": [
136
+ 256
137
+ ],
138
+ "tokens": [
139
+ "<|begin_of_text|>"
140
+ ]
141
+ }
142
+ }
143
+ }
144
+ ]
145
+ },
146
+ "decoder": {
147
+ "type": "ByteLevel",
148
+ "add_prefix_space": true,
149
+ "trim_offsets": true,
150
+ "use_regex": true
151
+ },
152
+ "model": {
153
+ "type": "WordPiece",
154
+ "unk_token": "<|eot_id|>",
155
+ "continuing_subword_prefix": "",
156
+ "max_input_chars_per_word": 1000000,
157
+ "vocab": {
158
+ "Ā": 0,
159
+ "ā": 1,
160
+ "Ă": 2,
161
+ "ă": 3,
162
+ "Ą": 4,
163
+ "ą": 5,
164
+ "Ć": 6,
165
+ "ć": 7,
166
+ "Ĉ": 8,
167
+ "ĉ": 9,
168
+ "Ċ": 10,
169
+ "ċ": 11,
170
+ "Č": 12,
171
+ "č": 13,
172
+ "Ď": 14,
173
+ "ď": 15,
174
+ "Đ": 16,
175
+ "đ": 17,
176
+ "Ē": 18,
177
+ "ē": 19,
178
+ "Ĕ": 20,
179
+ "ĕ": 21,
180
+ "Ė": 22,
181
+ "ė": 23,
182
+ "Ę": 24,
183
+ "ę": 25,
184
+ "Ě": 26,
185
+ "ě": 27,
186
+ "Ĝ": 28,
187
+ "ĝ": 29,
188
+ "Ğ": 30,
189
+ "ğ": 31,
190
+ "Ġ": 32,
191
+ "!": 33,
192
+ "\"": 34,
193
+ "#": 35,
194
+ "$": 36,
195
+ "%": 37,
196
+ "&": 38,
197
+ "'": 39,
198
+ "(": 40,
199
+ ")": 41,
200
+ "*": 42,
201
+ "+": 43,
202
+ ",": 44,
203
+ "-": 45,
204
+ ".": 46,
205
+ "/": 47,
206
+ "0": 48,
207
+ "1": 49,
208
+ "2": 50,
209
+ "3": 51,
210
+ "4": 52,
211
+ "5": 53,
212
+ "6": 54,
213
+ "7": 55,
214
+ "8": 56,
215
+ "9": 57,
216
+ ":": 58,
217
+ ";": 59,
218
+ "<": 60,
219
+ "=": 61,
220
+ ">": 62,
221
+ "?": 63,
222
+ "@": 64,
223
+ "A": 65,
224
+ "B": 66,
225
+ "C": 67,
226
+ "D": 68,
227
+ "E": 69,
228
+ "F": 70,
229
+ "G": 71,
230
+ "H": 72,
231
+ "I": 73,
232
+ "J": 74,
233
+ "K": 75,
234
+ "L": 76,
235
+ "M": 77,
236
+ "N": 78,
237
+ "O": 79,
238
+ "P": 80,
239
+ "Q": 81,
240
+ "R": 82,
241
+ "S": 83,
242
+ "T": 84,
243
+ "U": 85,
244
+ "V": 86,
245
+ "W": 87,
246
+ "X": 88,
247
+ "Y": 89,
248
+ "Z": 90,
249
+ "[": 91,
250
+ "\\": 92,
251
+ "]": 93,
252
+ "^": 94,
253
+ "_": 95,
254
+ "`": 96,
255
+ "a": 97,
256
+ "b": 98,
257
+ "c": 99,
258
+ "d": 100,
259
+ "e": 101,
260
+ "f": 102,
261
+ "g": 103,
262
+ "h": 104,
263
+ "i": 105,
264
+ "j": 106,
265
+ "k": 107,
266
+ "l": 108,
267
+ "m": 109,
268
+ "n": 110,
269
+ "o": 111,
270
+ "p": 112,
271
+ "q": 113,
272
+ "r": 114,
273
+ "s": 115,
274
+ "t": 116,
275
+ "u": 117,
276
+ "v": 118,
277
+ "w": 119,
278
+ "x": 120,
279
+ "y": 121,
280
+ "z": 122,
281
+ "{": 123,
282
+ "|": 124,
283
+ "}": 125,
284
+ "~": 126,
285
+ "ġ": 127,
286
+ "Ģ": 128,
287
+ "ģ": 129,
288
+ "Ĥ": 130,
289
+ "ĥ": 131,
290
+ "Ħ": 132,
291
+ "ħ": 133,
292
+ "Ĩ": 134,
293
+ "ĩ": 135,
294
+ "Ī": 136,
295
+ "ī": 137,
296
+ "Ĭ": 138,
297
+ "ĭ": 139,
298
+ "Į": 140,
299
+ "į": 141,
300
+ "İ": 142,
301
+ "ı": 143,
302
+ "IJ": 144,
303
+ "ij": 145,
304
+ "Ĵ": 146,
305
+ "ĵ": 147,
306
+ "Ķ": 148,
307
+ "ķ": 149,
308
+ "ĸ": 150,
309
+ "Ĺ": 151,
310
+ "ĺ": 152,
311
+ "Ļ": 153,
312
+ "ļ": 154,
313
+ "Ľ": 155,
314
+ "ľ": 156,
315
+ "Ŀ": 157,
316
+ "ŀ": 158,
317
+ "Ł": 159,
318
+ "ł": 160,
319
+ "¡": 161,
320
+ "¢": 162,
321
+ "£": 163,
322
+ "¤": 164,
323
+ "¥": 165,
324
+ "¦": 166,
325
+ "§": 167,
326
+ "¨": 168,
327
+ "©": 169,
328
+ "ª": 170,
329
+ "«": 171,
330
+ "¬": 172,
331
+ "Ń": 173,
332
+ "®": 174,
333
+ "¯": 175,
334
+ "°": 176,
335
+ "±": 177,
336
+ "²": 178,
337
+ "³": 179,
338
+ "´": 180,
339
+ "µ": 181,
340
+ "¶": 182,
341
+ "·": 183,
342
+ "¸": 184,
343
+ "¹": 185,
344
+ "º": 186,
345
+ "»": 187,
346
+ "¼": 188,
347
+ "½": 189,
348
+ "¾": 190,
349
+ "¿": 191,
350
+ "À": 192,
351
+ "Á": 193,
352
+ "Â": 194,
353
+ "Ã": 195,
354
+ "Ä": 196,
355
+ "Å": 197,
356
+ "Æ": 198,
357
+ "Ç": 199,
358
+ "È": 200,
359
+ "É": 201,
360
+ "Ê": 202,
361
+ "Ë": 203,
362
+ "Ì": 204,
363
+ "Í": 205,
364
+ "Î": 206,
365
+ "Ï": 207,
366
+ "Ð": 208,
367
+ "Ñ": 209,
368
+ "Ò": 210,
369
+ "Ó": 211,
370
+ "Ô": 212,
371
+ "Õ": 213,
372
+ "Ö": 214,
373
+ "×": 215,
374
+ "Ø": 216,
375
+ "Ù": 217,
376
+ "Ú": 218,
377
+ "Û": 219,
378
+ "Ü": 220,
379
+ "Ý": 221,
380
+ "Þ": 222,
381
+ "ß": 223,
382
+ "à": 224,
383
+ "á": 225,
384
+ "â": 226,
385
+ "ã": 227,
386
+ "ä": 228,
387
+ "å": 229,
388
+ "æ": 230,
389
+ "ç": 231,
390
+ "è": 232,
391
+ "é": 233,
392
+ "ê": 234,
393
+ "ë": 235,
394
+ "ì": 236,
395
+ "í": 237,
396
+ "î": 238,
397
+ "ï": 239,
398
+ "ð": 240,
399
+ "ñ": 241,
400
+ "ò": 242,
401
+ "ó": 243,
402
+ "ô": 244,
403
+ "õ": 245,
404
+ "ö": 246,
405
+ "÷": 247,
406
+ "ø": 248,
407
+ "ù": 249,
408
+ "ú": 250,
409
+ "û": 251,
410
+ "ü": 252,
411
+ "ý": 253,
412
+ "þ": 254,
413
+ "ÿ": 255,
414
+ "<|begin_of_text|>": 256,
415
+ "<|end_of_text|>": 257,
416
+ "<|reserved_special_token_0|>": 258,
417
+ "<|reserved_special_token_1|>": 259,
418
+ "<|finetune_right_pad_id|>": 260,
419
+ "<|reserved_special_token_2|>": 261,
420
+ "<|start_header_id|>": 262,
421
+ "<|end_header_id|>": 263,
422
+ "<|eom_id|>": 264,
423
+ "<|eot_id|>": 265,
424
+ "<|python_tag|>": 266,
425
+ "<|reserved_special_token_3|>": 267,
426
+ "<|reserved_special_token_4|>": 268,
427
+ "<|reserved_special_token_5|>": 269,
428
+ "<|reserved_special_token_6|>": 270,
429
+ "<|reserved_special_token_7|>": 271,
430
+ "<|reserved_special_token_8|>": 272,
431
+ "<|reserved_special_token_9|>": 273,
432
+ "<|reserved_special_token_10|>": 274,
433
+ "<|reserved_special_token_11|>": 275,
434
+ "<|reserved_special_token_12|>": 276,
435
+ "<|reserved_special_token_13|>": 277,
436
+ "<|reserved_special_token_14|>": 278,
437
+ "<|reserved_special_token_15|>": 279,
438
+ "<|reserved_special_token_16|>": 280,
439
+ "<|reserved_special_token_17|>": 281,
440
+ "<|reserved_special_token_18|>": 282,
441
+ "<|reserved_special_token_19|>": 283,
442
+ "<|reserved_special_token_20|>": 284,
443
+ "<|reserved_special_token_21|>": 285,
444
+ "<|reserved_special_token_22|>": 286,
445
+ "<|reserved_special_token_23|>": 287,
446
+ "<|reserved_special_token_24|>": 288,
447
+ "<|reserved_special_token_25|>": 289,
448
+ "<|reserved_special_token_26|>": 290,
449
+ "<|reserved_special_token_27|>": 291,
450
+ "<|reserved_special_token_28|>": 292,
451
+ "<|reserved_special_token_29|>": 293,
452
+ "<|reserved_special_token_30|>": 294,
453
+ "<|reserved_special_token_31|>": 295,
454
+ "<|reserved_special_token_32|>": 296,
455
+ "<|reserved_special_token_33|>": 297,
456
+ "<|reserved_special_token_34|>": 298,
457
+ "<|reserved_special_token_35|>": 299,
458
+ "<|reserved_special_token_36|>": 300,
459
+ "<|reserved_special_token_37|>": 301,
460
+ "<|reserved_special_token_38|>": 302,
461
+ "<|reserved_special_token_39|>": 303,
462
+ "<|reserved_special_token_40|>": 304,
463
+ "<|reserved_special_token_41|>": 305,
464
+ "<|reserved_special_token_42|>": 306,
465
+ "<|reserved_special_token_43|>": 307,
466
+ "<|reserved_special_token_44|>": 308,
467
+ "<|reserved_special_token_45|>": 309,
468
+ "<|reserved_special_token_46|>": 310,
469
+ "<|reserved_special_token_47|>": 311,
470
+ "<|reserved_special_token_48|>": 312,
471
+ "<|reserved_special_token_49|>": 313,
472
+ "<|reserved_special_token_50|>": 314,
473
+ "<|reserved_special_token_51|>": 315,
474
+ "<|reserved_special_token_52|>": 316,
475
+ "<|reserved_special_token_53|>": 317,
476
+ "<|reserved_special_token_54|>": 318,
477
+ "<|reserved_special_token_55|>": 319,
478
+ "<|reserved_special_token_56|>": 320,
479
+ "<|reserved_special_token_57|>": 321,
480
+ "<|reserved_special_token_58|>": 322,
481
+ "<|reserved_special_token_59|>": 323,
482
+ "<|reserved_special_token_60|>": 324,
483
+ "<|reserved_special_token_61|>": 325,
484
+ "<|reserved_special_token_62|>": 326,
485
+ "<|reserved_special_token_63|>": 327,
486
+ "<|reserved_special_token_64|>": 328,
487
+ "<|reserved_special_token_65|>": 329,
488
+ "<|reserved_special_token_66|>": 330,
489
+ "<|reserved_special_token_67|>": 331,
490
+ "<|reserved_special_token_68|>": 332,
491
+ "<|reserved_special_token_69|>": 333,
492
+ "<|reserved_special_token_70|>": 334,
493
+ "<|reserved_special_token_71|>": 335,
494
+ "<|reserved_special_token_72|>": 336,
495
+ "<|reserved_special_token_73|>": 337,
496
+ "<|reserved_special_token_74|>": 338,
497
+ "<|reserved_special_token_75|>": 339,
498
+ "<|reserved_special_token_76|>": 340,
499
+ "<|reserved_special_token_77|>": 341,
500
+ "<|reserved_special_token_78|>": 342,
501
+ "<|reserved_special_token_79|>": 343,
502
+ "<|reserved_special_token_80|>": 344,
503
+ "<|reserved_special_token_81|>": 345,
504
+ "<|reserved_special_token_82|>": 346,
505
+ "<|reserved_special_token_83|>": 347,
506
+ "<|reserved_special_token_84|>": 348,
507
+ "<|reserved_special_token_85|>": 349,
508
+ "<|reserved_special_token_86|>": 350,
509
+ "<|reserved_special_token_87|>": 351,
510
+ "<|reserved_special_token_88|>": 352,
511
+ "<|reserved_special_token_89|>": 353,
512
+ "<|reserved_special_token_90|>": 354,
513
+ "<|reserved_special_token_91|>": 355,
514
+ "<|reserved_special_token_92|>": 356,
515
+ "<|reserved_special_token_93|>": 357,
516
+ "<|reserved_special_token_94|>": 358,
517
+ "<|reserved_special_token_95|>": 359,
518
+ "<|reserved_special_token_96|>": 360,
519
+ "<|reserved_special_token_97|>": 361,
520
+ "<|reserved_special_token_98|>": 362,
521
+ "<|reserved_special_token_99|>": 363,
522
+ "<|reserved_special_token_100|>": 364,
523
+ "<|reserved_special_token_101|>": 365,
524
+ "<|reserved_special_token_102|>": 366,
525
+ "<|reserved_special_token_103|>": 367,
526
+ "<|reserved_special_token_104|>": 368,
527
+ "<|reserved_special_token_105|>": 369,
528
+ "<|reserved_special_token_106|>": 370,
529
+ "<|reserved_special_token_107|>": 371,
530
+ "<|reserved_special_token_108|>": 372,
531
+ "<|reserved_special_token_109|>": 373,
532
+ "<|reserved_special_token_110|>": 374,
533
+ "<|reserved_special_token_111|>": 375,
534
+ "<|reserved_special_token_112|>": 376,
535
+ "<|reserved_special_token_113|>": 377,
536
+ "<|reserved_special_token_114|>": 378,
537
+ "<|reserved_special_token_115|>": 379,
538
+ "<|reserved_special_token_116|>": 380,
539
+ "<|reserved_special_token_117|>": 381,
540
+ "<|reserved_special_token_118|>": 382,
541
+ "<|reserved_special_token_119|>": 383,
542
+ "<|reserved_special_token_120|>": 384,
543
+ "<|reserved_special_token_121|>": 385,
544
+ "<|reserved_special_token_122|>": 386,
545
+ "<|reserved_special_token_123|>": 387,
546
+ "<|reserved_special_token_124|>": 388,
547
+ "<|reserved_special_token_125|>": 389,
548
+ "<|reserved_special_token_126|>": 390,
549
+ "<|reserved_special_token_127|>": 391,
550
+ "<|reserved_special_token_128|>": 392,
551
+ "<|reserved_special_token_129|>": 393,
552
+ "<|reserved_special_token_130|>": 394,
553
+ "<|reserved_special_token_131|>": 395,
554
+ "<|reserved_special_token_132|>": 396,
555
+ "<|reserved_special_token_133|>": 397,
556
+ "<|reserved_special_token_134|>": 398,
557
+ "<|reserved_special_token_135|>": 399,
558
+ "<|reserved_special_token_136|>": 400,
559
+ "<|reserved_special_token_137|>": 401,
560
+ "<|reserved_special_token_138|>": 402,
561
+ "<|reserved_special_token_139|>": 403,
562
+ "<|reserved_special_token_140|>": 404,
563
+ "<|reserved_special_token_141|>": 405,
564
+ "<|reserved_special_token_142|>": 406,
565
+ "<|reserved_special_token_143|>": 407,
566
+ "<|reserved_special_token_144|>": 408,
567
+ "<|reserved_special_token_145|>": 409,
568
+ "<|reserved_special_token_146|>": 410,
569
+ "<|reserved_special_token_147|>": 411,
570
+ "<|reserved_special_token_148|>": 412,
571
+ "<|reserved_special_token_149|>": 413,
572
+ "<|reserved_special_token_150|>": 414,
573
+ "<|reserved_special_token_151|>": 415,
574
+ "<|reserved_special_token_152|>": 416,
575
+ "<|reserved_special_token_153|>": 417,
576
+ "<|reserved_special_token_154|>": 418,
577
+ "<|reserved_special_token_155|>": 419,
578
+ "<|reserved_special_token_156|>": 420,
579
+ "<|reserved_special_token_157|>": 421,
580
+ "<|reserved_special_token_158|>": 422,
581
+ "<|reserved_special_token_159|>": 423,
582
+ "<|reserved_special_token_160|>": 424,
583
+ "<|reserved_special_token_161|>": 425,
584
+ "<|reserved_special_token_162|>": 426,
585
+ "<|reserved_special_token_163|>": 427,
586
+ "<|reserved_special_token_164|>": 428,
587
+ "<|reserved_special_token_165|>": 429,
588
+ "<|reserved_special_token_166|>": 430,
589
+ "<|reserved_special_token_167|>": 431,
590
+ "<|reserved_special_token_168|>": 432,
591
+ "<|reserved_special_token_169|>": 433,
592
+ "<|reserved_special_token_170|>": 434,
593
+ "<|reserved_special_token_171|>": 435,
594
+ "<|reserved_special_token_172|>": 436,
595
+ "<|reserved_special_token_173|>": 437,
596
+ "<|reserved_special_token_174|>": 438,
597
+ "<|reserved_special_token_175|>": 439,
598
+ "<|reserved_special_token_176|>": 440,
599
+ "<|reserved_special_token_177|>": 441,
600
+ "<|reserved_special_token_178|>": 442,
601
+ "<|reserved_special_token_179|>": 443,
602
+ "<|reserved_special_token_180|>": 444,
603
+ "<|reserved_special_token_181|>": 445,
604
+ "<|reserved_special_token_182|>": 446,
605
+ "<|reserved_special_token_183|>": 447,
606
+ "<|reserved_special_token_184|>": 448,
607
+ "<|reserved_special_token_185|>": 449,
608
+ "<|reserved_special_token_186|>": 450,
609
+ "<|reserved_special_token_187|>": 451,
610
+ "<|reserved_special_token_188|>": 452,
611
+ "<|reserved_special_token_189|>": 453,
612
+ "<|reserved_special_token_190|>": 454,
613
+ "<|reserved_special_token_191|>": 455,
614
+ "<|reserved_special_token_192|>": 456,
615
+ "<|reserved_special_token_193|>": 457,
616
+ "<|reserved_special_token_194|>": 458,
617
+ "<|reserved_special_token_195|>": 459,
618
+ "<|reserved_special_token_196|>": 460,
619
+ "<|reserved_special_token_197|>": 461,
620
+ "<|reserved_special_token_198|>": 462,
621
+ "<|reserved_special_token_199|>": 463,
622
+ "<|reserved_special_token_200|>": 464,
623
+ "<|reserved_special_token_201|>": 465,
624
+ "<|reserved_special_token_202|>": 466,
625
+ "<|reserved_special_token_203|>": 467,
626
+ "<|reserved_special_token_204|>": 468,
627
+ "<|reserved_special_token_205|>": 469,
628
+ "<|reserved_special_token_206|>": 470,
629
+ "<|reserved_special_token_207|>": 471,
630
+ "<|reserved_special_token_208|>": 472,
631
+ "<|reserved_special_token_209|>": 473,
632
+ "<|reserved_special_token_210|>": 474,
633
+ "<|reserved_special_token_211|>": 475,
634
+ "<|reserved_special_token_212|>": 476,
635
+ "<|reserved_special_token_213|>": 477,
636
+ "<|reserved_special_token_214|>": 478,
637
+ "<|reserved_special_token_215|>": 479,
638
+ "<|reserved_special_token_216|>": 480,
639
+ "<|reserved_special_token_217|>": 481,
640
+ "<|reserved_special_token_218|>": 482,
641
+ "<|reserved_special_token_219|>": 483,
642
+ "<|reserved_special_token_220|>": 484,
643
+ "<|reserved_special_token_221|>": 485,
644
+ "<|reserved_special_token_222|>": 486,
645
+ "<|reserved_special_token_223|>": 487,
646
+ "<|reserved_special_token_224|>": 488,
647
+ "<|reserved_special_token_225|>": 489,
648
+ "<|reserved_special_token_226|>": 490,
649
+ "<|reserved_special_token_227|>": 491,
650
+ "<|reserved_special_token_228|>": 492,
651
+ "<|reserved_special_token_229|>": 493,
652
+ "<|reserved_special_token_230|>": 494,
653
+ "<|reserved_special_token_231|>": 495,
654
+ "<|reserved_special_token_232|>": 496,
655
+ "<|reserved_special_token_233|>": 497,
656
+ "<|reserved_special_token_234|>": 498,
657
+ "<|reserved_special_token_235|>": 499,
658
+ "<|reserved_special_token_236|>": 500,
659
+ "<|reserved_special_token_237|>": 501,
660
+ "<|reserved_special_token_238|>": 502,
661
+ "<|reserved_special_token_239|>": 503,
662
+ "<|reserved_special_token_240|>": 504,
663
+ "<|reserved_special_token_241|>": 505,
664
+ "<|reserved_special_token_242|>": 506,
665
+ "<|reserved_special_token_243|>": 507,
666
+ "<|reserved_special_token_244|>": 508,
667
+ "<|reserved_special_token_245|>": 509,
668
+ "<|reserved_special_token_246|>": 510,
669
+ "<|reserved_special_token_247|>": 511
670
+ }
671
+ }
672
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "256": {
4
+ "content": "<|begin_of_text|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "265": {
12
+ "content": "<|eot_id|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "512": {
20
+ "content": "ĊĊ",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "513": {
28
+ "content": "user",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": false
34
+ },
35
+ "514": {
36
+ "content": "assistant",
37
+ "lstrip": false,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": false
42
+ },
43
+ "515": {
44
+ "content": "system",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ }
51
+ },
52
+ "bos_token": "<|begin_of_text|>",
53
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
54
+ "clean_up_tokenization_spaces": true,
55
+ "eos_token": "<|eot_id|>",
56
+ "model_input_names": [
57
+ "input_ids",
58
+ "attention_mask"
59
+ ],
60
+ "model_max_length": 131072,
61
+ "pad_token": "<|eot_id|>",
62
+ "tokenizer_class": "PreTrainedTokenizerFast"
63
+ }