gugarosa committed (verified)
Commit f1527a3 · Parent: f580375

fix(tokenizer_config): Sets FIM and think tokens to non-special and removes unk_token.

Files changed (1): tokenizer_config.json (+6, -15)
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -1,14 +1,6 @@
 {
   "add_prefix_space": false,
   "added_tokens_decoder": {
-    "5809": {
-      "content": "�",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
     "100256": {
       "content": "<|dummy_0|>",
       "lstrip": true,
@@ -31,7 +23,7 @@
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "100259": {
       "content": "<|fim_middle|>",
@@ -39,7 +31,7 @@
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "100260": {
       "content": "<|fim_suffix|>",
@@ -47,7 +39,7 @@
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "100261": {
       "content": "<|dummy_1|>",
@@ -767,7 +759,7 @@
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "100351": {
       "content": "</think>",
@@ -775,7 +767,7 @@
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     }
   },
   "bos_token": "<|endoftext|>",
@@ -786,6 +778,5 @@
   "model_max_length": 32768,
   "pad_token": "<|dummy_85|>",
   "padding_side": "left",
-  "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "�"
+  "tokenizer_class": "GPT2Tokenizer"
 }
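
The practical effect of these two changes, in Hugging Face transformers terms: tokens registered with "special": true are stripped by decode(..., skip_special_tokens=True), so marking the FIM and think tokens non-special keeps them visible in decoded output. Dropping unk_token is harmless because GPT2Tokenizer uses byte-level BPE, which can encode any byte sequence and never emits an unknown token. Below is a minimal sketch of both effects, assuming the updated config is loaded; "path/to/checkpoint" is a placeholder for this repository, which the commit does not name.

    # Minimal sketch; "path/to/checkpoint" is a placeholder, not a real model id.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("path/to/checkpoint")

    ids = tok.encode("<|fim_middle|>return a + b<|fim_suffix|>",
                     add_special_tokens=False)

    # Before this commit ("special": true) the FIM markers were dropped here;
    # after it ("special": false) they survive the decode round trip.
    print(tok.decode(ids, skip_special_tokens=True))

    # Byte-level BPE never produces an unknown token, so removing "unk_token"
    # loses nothing; the attribute now simply reads as unset.
    print(tok.unk_token)  # None after this commit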