chandralegend committed on
Commit
6406d04
·
verified ·
1 Parent(s): 5f9ca5d

Upload tokenizer

Browse files
Files changed (2) hide show
  1. special_tokens_map.json +18 -10
  2. tokenizer_config.json +19 -6
special_tokens_map.json CHANGED
@@ -1,24 +1,32 @@
1
  {
2
  "additional_special_tokens": [
 
3
  "<|im_start|>",
4
- "<|im_end|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  ],
6
  "bos_token": {
7
- "content": "<|im_start|>",
8
  "lstrip": false,
9
  "normalized": false,
10
  "rstrip": false,
11
  "single_word": false
12
  },
13
  "eos_token": {
14
- "content": "<|im_end|>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false
19
- },
20
- "pad_token": {
21
- "content": "<|im_end|>",
22
  "lstrip": false,
23
  "normalized": false,
24
  "rstrip": false,
 
1
  {
2
  "additional_special_tokens": [
3
+ "<|endoftext|>",
4
  "<|im_start|>",
5
+ "<|im_end|>",
6
+ "<repo_name>",
7
+ "<reponame>",
8
+ "<file_sep>",
9
+ "<filename>",
10
+ "<gh_stars>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<jupyter_script>",
19
+ "<empty_output>"
20
  ],
21
  "bos_token": {
22
+ "content": "<|endoftext|>",
23
  "lstrip": false,
24
  "normalized": false,
25
  "rstrip": false,
26
  "single_word": false
27
  },
28
  "eos_token": {
29
+ "content": "<|endoftext|>",
 
 
 
 
 
 
 
30
  "lstrip": false,
31
  "normalized": false,
32
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -139,15 +139,28 @@
139
  }
140
  },
141
  "additional_special_tokens": [
 
142
  "<|im_start|>",
143
- "<|im_end|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  ],
145
- "bos_token": "<|im_start|>",
146
- "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
147
  "clean_up_tokenization_spaces": false,
148
- "eos_token": "<|im_end|>",
149
- "model_max_length": 2048,
150
- "pad_token": "<|im_end|>",
151
  "tokenizer_class": "GPT2Tokenizer",
152
  "unk_token": "<|endoftext|>",
153
  "vocab_size": 49152
 
139
  }
140
  },
141
  "additional_special_tokens": [
142
+ "<|endoftext|>",
143
  "<|im_start|>",
144
+ "<|im_end|>",
145
+ "<repo_name>",
146
+ "<reponame>",
147
+ "<file_sep>",
148
+ "<filename>",
149
+ "<gh_stars>",
150
+ "<issue_start>",
151
+ "<issue_comment>",
152
+ "<issue_closed>",
153
+ "<jupyter_start>",
154
+ "<jupyter_text>",
155
+ "<jupyter_code>",
156
+ "<jupyter_output>",
157
+ "<jupyter_script>",
158
+ "<empty_output>"
159
  ],
160
+ "bos_token": "<|endoftext|>",
 
161
  "clean_up_tokenization_spaces": false,
162
+ "eos_token": "<|endoftext|>",
163
+ "model_max_length": 1000000000000000019884624838656,
 
164
  "tokenizer_class": "GPT2Tokenizer",
165
  "unk_token": "<|endoftext|>",
166
  "vocab_size": 49152