julioc-p committed on
Commit
ae9ed93
·
verified ·
1 Parent(s): 64f74e7

Upload tokenizer

Browse files
Files changed (3) hide show
  1. chat_template.jinja +4 -0
  2. tokenizer.json +16 -6
  3. tokenizer_config.json +1 -2
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'] + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
tokenizer.json CHANGED
@@ -58,13 +58,23 @@
58
  "special": true
59
  }
60
  ],
61
- "normalizer": null,
62
- "pre_tokenizer": {
63
- "type": "Metaspace",
64
- "replacement": "▁",
65
- "prepend_scheme": "first",
66
- "split": false
 
 
 
 
 
 
 
 
 
67
  },
 
68
  "post_processor": {
69
  "type": "TemplateProcessing",
70
  "single": [
 
58
  "special": true
59
  }
60
  ],
61
+ "normalizer": {
62
+ "type": "Sequence",
63
+ "normalizers": [
64
+ {
65
+ "type": "Prepend",
66
+ "prepend": "▁"
67
+ },
68
+ {
69
+ "type": "Replace",
70
+ "pattern": {
71
+ "String": " "
72
+ },
73
+ "content": "▁"
74
+ }
75
+ ]
76
  },
77
+ "pre_tokenizer": null,
78
  "post_processor": {
79
  "type": "TemplateProcessing",
80
  "single": [
tokenizer_config.json CHANGED
@@ -57,11 +57,10 @@
57
  "<|im_end|>"
58
  ],
59
  "bos_token": "<|im_start|>",
60
- "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
61
  "clean_up_tokenization_spaces": false,
62
  "eos_token": "<|im_end|>",
63
  "extra_special_tokens": {},
64
- "legacy": false,
65
  "model_max_length": 1000000000000000019884624838656,
66
  "pad_token": "<|im_end|>",
67
  "sp_model_kwargs": {},
 
57
  "<|im_end|>"
58
  ],
59
  "bos_token": "<|im_start|>",
 
60
  "clean_up_tokenization_spaces": false,
61
  "eos_token": "<|im_end|>",
62
  "extra_special_tokens": {},
63
+ "legacy": true,
64
  "model_max_length": 1000000000000000019884624838656,
65
  "pad_token": "<|im_end|>",
66
  "sp_model_kwargs": {},