fix(tokenizer_config): Sets fim and think tokens to non-special, and removes unk_token.
tokenizer_config.json  CHANGED  (+6 -15)
@@ -1,14 +1,6 @@
 {
   "add_prefix_space": false,
   "added_tokens_decoder": {
-    "5809": {
-      "content": "�",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
     "100256": {
       "content": "<|dummy_0|>",
       "lstrip": true,
@@ -31,7 +23,7 @@
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "100259": {
       "content": "<|fim_middle|>",
@@ -39,7 +31,7 @@
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "100260": {
       "content": "<|fim_suffix|>",
@@ -47,7 +39,7 @@
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "100261": {
       "content": "<|dummy_1|>",
@@ -767,7 +759,7 @@
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "100351": {
       "content": "</think>",
@@ -775,7 +767,7 @@
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     }
   },
   "bos_token": "<|endoftext|>",
@@ -786,6 +778,5 @@
   "model_max_length": 32768,
   "pad_token": "<|dummy_85|>",
   "padding_side": "left",
-  "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "�"
+  "tokenizer_class": "GPT2Tokenizer"
 }
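In practical terms, flipping "special" to false removes these markers from the tokenizer's special-token set, so decoding with skip_special_tokens=True no longer strips them, and dropping "unk_token" means the tokenizer simply reports none. A minimal sketch of how to check this with the standard transformers API is below; the ./model-dir path and the sample string are placeholders, not part of this commit.

from transformers import AutoTokenizer

# Placeholder path: a local checkout containing the updated tokenizer_config.json.
tok = AutoTokenizer.from_pretrained("./model-dir")

# Sample string built only from tokens that appear in the diff above.
text = "<|fim_middle|>return a + b<|fim_suffix|></think>"
ids = tok(text)["input_ids"]

# With "special": false these tokens are no longer stripped by
# skip_special_tokens=True, so the markers survive a decode round trip.
print(tok.decode(ids, skip_special_tokens=True))

# The "unk_token" entry was removed, so the tokenizer no longer reports one.
print(tok.unk_token)  # expected: None

This only covers the decode side; how a serving stack or chat template treats these markers is a separate question.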