Commit 2209b8f (verified) by farzadab · Parent(s): 21c1cd8

add <|audio|> token
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
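The added line is what `git lfs track "tokenizer.json"` writes into .gitattributes: from this commit on, tokenizer.json is stored via Git LFS rather than as a plain Git blob, presumably because the regenerated tokenizer file exceeds the Hub's size threshold for regular files.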
config.json CHANGED
@@ -48,5 +48,6 @@
   },
   "torch_dtype": "float32",
   "transformers_version": "4.44.0",
-  "vocab_size": 128256
-}
+  "vocab_size": 128256,
+  "audio_token_index": 128256
+}
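The new `audio_token_index` key tells downstream modeling code which input id stands for the audio placeholder. A minimal sketch of how the config and tokenizer line up, assuming a hypothetical repo id (a custom architecture may also need `trust_remote_code=True`):

from transformers import AutoConfig, AutoTokenizer

repo = "your-org/your-audio-llm"  # hypothetical repo id, for illustration only

config = AutoConfig.from_pretrained(repo)
tokenizer = AutoTokenizer.from_pretrained(repo)

# audio_token_index is the input id the modeling code scans for when it
# splices audio embeddings into the text sequence (an assumption about the
# consuming model, not something config.json itself enforces).
assert config.audio_token_index == 128256
assert tokenizer.convert_tokens_to_ids("<|audio|>") == config.audio_token_index

# The id equals vocab_size, i.e. it sits one slot past the original Llama-3
# vocabulary, so a backbone whose embedding table was sized for 128256 rows
# must be resized before the id is usable:
#   model.resize_token_embeddings(len(tokenizer))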
special_tokens_map.json CHANGED
@@ -1,4 +1,13 @@
 {
+  "additional_special_tokens": [
+    {
+      "content": "<|audio|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
   "bos_token": {
     "content": "<|begin_of_text|>",
     "lstrip": false,
@@ -13,5 +22,11 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<|eot_id|>"
+  "pad_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
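The per-token flags serialized here (`lstrip`, `rstrip`, `normalized`, `single_word`) are the fields of a `transformers.AddedToken`; the `pad_token` entry growing from a bare string into a full object is most likely just the newer serialization format rather than a behavioral change. A sketch of the registration step that would produce this map, assuming the stock Llama-3 instruct tokenizer as the base:

from transformers import AddedToken, AutoTokenizer

# Assumption: the base checkpoint is the stock Llama-3 instruct tokenizer.
tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

audio = AddedToken(
    "<|audio|>",
    lstrip=False,       # do not strip whitespace to the left of the token
    rstrip=False,       # ...nor to the right
    normalized=False,   # match the raw string, bypassing text normalization
    single_word=False,  # no word-boundary requirement when matching
)
n_added = tok.add_special_tokens({"additional_special_tokens": [audio]})
print(n_added)                                  # 1
print(tok.convert_tokens_to_ids("<|audio|>"))   # 128256
tok.save_pretrained("./with-audio-token")       # writes the map shown above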
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
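Even without the rendered diff, the change to tokenizer.json is mechanical: the fast-tokenizer serialization registers new tokens in its top-level "added_tokens" array. A sketch of how to confirm that from a local clone of the repo:

import json

# Inspect the regenerated file from a local clone of the repo.
with open("tokenizer.json") as f:
    data = json.load(f)

# Registered tokens live under "added_tokens"; the <|audio|> entry should
# be the newest one.
print(data["added_tokens"][-1])
# expected: {"id": 128256, "content": "<|audio|>", "special": True, ...}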
 
tokenizer_config.json CHANGED
@@ -2047,12 +2047,24 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "128256": {
+      "content": "<|audio|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
+  "additional_special_tokens": [
+    "<|audio|>"
+  ],
   "bos_token": "<|begin_of_text|>",
   "chat_template": "{{ '<|begin_of_text|>' }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|eot_id|>",
+  "extra_special_tokens": {},
   "model_input_names": [
     "input_ids",
     "attention_mask"
@@ -2061,5 +2073,5 @@
   "pad_token": "<|eot_id|>",
   "padding_side": "right",
   "split_special_tokens": false,
-  "tokenizer_class": "PreTrainedTokenizerFast"
+  "tokenizer_class": "PreTrainedTokenizer"
 }
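With <|audio|> present in added_tokens_decoder and listed under additional_special_tokens, the tokenizer treats the placeholder as atomic instead of splitting it into <, |, audio, ... pieces, and the chat template passes it through untouched. A quick end-to-end check, again against a hypothetical repo id:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("your-org/your-audio-llm")  # hypothetical repo id

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "<|audio|>\nTranscribe the clip above."},
]
# This template already appends the assistant header after a user turn,
# so no add_generation_prompt flag is needed.
text = tok.apply_chat_template(messages, tokenize=False)

# add_special_tokens=False avoids a second <|begin_of_text|>, which the
# template emits itself.
ids = tok(text, add_special_tokens=False).input_ids
print(ids.count(128256))  # 1 -> the placeholder survived as a single id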