dav1dliu commited on
Commit
904f089
·
verified ·
1 Parent(s): b7ddc96

Init Commit

Browse files
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: transformers
4
+ ---
5
+ # Introduction
6
+
7
+ **SDAR**(**S**ynergy of **D**iffusion and **A**uto**R**egression)-model is a new large language model that integrates autoregressive (AR) and discrete diffusion modeling strategies. It combines the efficient training paradigm of AR models with the highly parallel inference capability of diffusion models, while delivering performance fully on par with SOTA opensource AR models. At the same time, SDAR sets a new benchmark as the most powerful diffusion language model to date.
8
+
9
+ > The SDAR series models undergo continued training on Qwen3 models.
10
+
11
+ # Performance of SDAR-1.7B-Chat on various benchmarks
12
+
13
+ evaluation settings:
14
+ - MMLU: 5-shot
15
+ - Math500: 0-shot
16
+ - GSM8K: 0-shot
17
+ - HumanEval: 0-shot
18
+ - Sanitized_MBPP: 0-shot
19
+ - IFEval: 0-shot
20
+ - MathBench: 0-shot
21
+
22
+
23
+ | Model | MMLU | Math500 | GSM8K | HumanEval | Sanitized_MBPP | IFEval | MathBench |
24
+ |-------------------|------|---------|-------|-----------|----------------|--------|-----------|
25
+ | SDAR-1.7B-Chat | 62.9 | 63.2 | 80.06 | 61.59 | 61.09 | 43.44 | 63.55 |
26
+ | SDAR-4B-Chat | | | | | | | |
27
+ | SDAR-8B-Chat | | | | | | | |
28
+ | SDAR-30B-A3B-Chat | | | | | | | |
29
+
30
+
31
+
32
+ **Note**: The 4B, 8B, and 30B models are coming soon. Performance results for these models will be released in the near future.
33
+
34
+ ## Inference
35
+ The inference code will come soon
36
+
37
+ ## Hightlights
38
+ - **Performance**: SDAR-1.7B-Chat achieves state-of-the-art.
39
+ - **Efficiency**: SDAR provides over 2× faster inference speed compared to the same-size AR models, while maintaining comparable performance.
added_tokens.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<MASK>": 151669,
6
+ "<think>": 151667,
7
+ "<tool_call>": 151657,
8
+ "<tool_response>": 151665,
9
+ "<|box_end|>": 151649,
10
+ "<|box_start|>": 151648,
11
+ "<|endoftext|>": 151643,
12
+ "<|file_sep|>": 151664,
13
+ "<|fim_middle|>": 151660,
14
+ "<|fim_pad|>": 151662,
15
+ "<|fim_prefix|>": 151659,
16
+ "<|fim_suffix|>": 151661,
17
+ "<|im_end|>": 151645,
18
+ "<|im_start|>": 151644,
19
+ "<|image_pad|>": 151655,
20
+ "<|object_ref_end|>": 151647,
21
+ "<|object_ref_start|>": 151646,
22
+ "<|quad_end|>": 151651,
23
+ "<|quad_start|>": 151650,
24
+ "<|repo_name|>": 151663,
25
+ "<|video_pad|>": 151656,
26
+ "<|vision_end|>": 151653,
27
+ "<|vision_pad|>": 151654,
28
+ "<|vision_start|>": 151652
29
+ }
chat_template.jinja ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
27
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
28
+ {%- elif message.role == "assistant" %}
29
+ {%- set content = message.content %}
30
+ {%- set reasoning_content = '' %}
31
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
32
+ {%- set reasoning_content = message.reasoning_content %}
33
+ {%- else %}
34
+ {%- if '</think>' in message.content %}
35
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
36
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
+ {%- endif %}
38
+ {%- endif %}
39
+ {%- if loop.index0 > ns.last_query_index %}
40
+ {%- if loop.last or (not loop.last and reasoning_content) %}
41
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
42
+ {%- else %}
43
+ {{- '<|im_start|>' + message.role + '\n' + content }}
44
+ {%- endif %}
45
+ {%- else %}
46
+ {{- '<|im_start|>' + message.role + '\n' + content }}
47
+ {%- endif %}
48
+ {%- if message.tool_calls %}
49
+ {%- for tool_call in message.tool_calls %}
50
+ {%- if (loop.first and content) or (not loop.first) %}
51
+ {{- '\n' }}
52
+ {%- endif %}
53
+ {%- if tool_call.function %}
54
+ {%- set tool_call = tool_call.function %}
55
+ {%- endif %}
56
+ {{- '<tool_call>\n{"name": "' }}
57
+ {{- tool_call.name }}
58
+ {{- '", "arguments": ' }}
59
+ {%- if tool_call.arguments is string %}
60
+ {{- tool_call.arguments }}
61
+ {%- else %}
62
+ {{- tool_call.arguments | tojson }}
63
+ {%- endif %}
64
+ {{- '}\n</tool_call>' }}
65
+ {%- endfor %}
66
+ {%- endif %}
67
+ {{- '<|im_end|>\n' }}
68
+ {%- elif message.role == "tool" %}
69
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
70
+ {{- '<|im_start|>user' }}
71
+ {%- endif %}
72
+ {{- '\n<tool_response>\n' }}
73
+ {{- message.content }}
74
+ {{- '\n</tool_response>' }}
75
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
76
+ {{- '<|im_end|>\n' }}
77
+ {%- endif %}
78
+ {%- endif %}
79
+ {%- endfor %}
80
+ {%- if add_generation_prompt %}
81
+ {{- '<|im_start|>assistant\n' }}
82
+ {%- if enable_thinking is defined and enable_thinking is false %}
83
+ {{- '<think>\n\n</think>\n\n' }}
84
+ {%- endif %}
85
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SDARForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_sdar.SDARConfig",
7
+ "AutoModel": "modeling_sdar.SDARModel",
8
+ "AutoModelForCausalLM": "modeling_sdar.SDARForCausalLM"
9
+ },
10
+ "attention_bias": false,
11
+ "attention_dropout": 0.0,
12
+ "attn_implementation": "flex_attention",
13
+ "bos_token_id": 151643,
14
+ "debug": false,
15
+ "eos_token_id": 151643,
16
+ "ep_size": 1,
17
+ "fuse_cross_entropy": true,
18
+ "head_dim": 128,
19
+ "hidden_act": "silu",
20
+ "hidden_size": 4096,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 12288,
23
+ "max_position_embeddings": 32768,
24
+ "max_window_layers": 36,
25
+ "micro_forward": false,
26
+ "model_type": "qwen3",
27
+ "num_attention_heads": 32,
28
+ "num_hidden_layers": 36,
29
+ "num_key_value_heads": 8,
30
+ "rms_norm_eps": 1e-06,
31
+ "rope_scaling": null,
32
+ "rope_theta": 1000000,
33
+ "skip_checkpoint": false,
34
+ "sliding_window": null,
35
+ "tie_word_embeddings": false,
36
+ "torch_dtype": "bfloat16",
37
+ "transformers_version": "4.52.4",
38
+ "use_cache": false,
39
+ "use_deepep": false,
40
+ "use_sliding_window": false,
41
+ "vocab_size": 151936
42
+ }
configuration_sdar.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """SDAR model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.modeling_rope_utils import rope_config_validation
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ class SDARConfig(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`SDARModel`]. It is used to instantiate a
28
+ SDAR model according to the specified arguments, defining the model architecture. Instantiating a configuration
29
+ with the defaults will yield a similar configuration to that of
30
+ SDAR-1.7B [DiffuOpen/SDAR-1.7B-Chat](https://huggingface.co/DiffuOpen/SDAR-1.7B-Chat/).
31
+
32
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
33
+ documentation from [`PretrainedConfig`] for more information.
34
+
35
+
36
+ Args:
37
+ vocab_size (`int`, *optional*, defaults to 151936):
38
+ Vocabulary size of the SDAR model. Defines the number of different tokens that can be represented by the
39
+ `inputs_ids` passed when calling [`SDARModel`]
40
+ hidden_size (`int`, *optional*, defaults to 4096):
41
+ Dimension of the hidden representations.
42
+ intermediate_size (`int`, *optional*, defaults to 22016):
43
+ Dimension of the MLP representations.
44
+ num_hidden_layers (`int`, *optional*, defaults to 32):
45
+ Number of hidden layers in the Transformer encoder.
46
+ num_attention_heads (`int`, *optional*, defaults to 32):
47
+ Number of attention heads for each attention layer in the Transformer encoder.
48
+ num_key_value_heads (`int`, *optional*, defaults to 32):
49
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
50
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
51
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
52
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
53
+ by meanpooling all the original heads within that group. For more details checkout [this
54
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
55
+ head_dim (`int`, *optional*, defaults to 128):
56
+ The attention head dimension.
57
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
58
+ The non-linear activation function (function or string) in the decoder.
59
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
60
+ The maximum sequence length that this model might ever be used with.
61
+ initializer_range (`float`, *optional*, defaults to 0.02):
62
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
63
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
64
+ The epsilon used by the rms normalization layers.
65
+ use_cache (`bool`, *optional*, defaults to `True`):
66
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
67
+ relevant if `config.is_decoder=True`.
68
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
69
+ Whether the model's input and output word embeddings should be tied.
70
+ rope_theta (`float`, *optional*, defaults to 10000.0):
71
+ The base period of the RoPE embeddings.
72
+ rope_scaling (`Dict`, *optional*):
73
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
74
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
75
+ accordingly.
76
+ Expected contents:
77
+ `rope_type` (`str`):
78
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
79
+ 'llama3'], with 'default' being the original RoPE implementation.
80
+ `factor` (`float`, *optional*):
81
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
82
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
83
+ original maximum pre-trained length.
84
+ `original_max_position_embeddings` (`int`, *optional*):
85
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
86
+ pretraining.
87
+ `attention_factor` (`float`, *optional*):
88
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
89
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
90
+ `factor` field to infer the suggested value.
91
+ `beta_fast` (`float`, *optional*):
92
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
93
+ ramp function. If unspecified, it defaults to 32.
94
+ `beta_slow` (`float`, *optional*):
95
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
96
+ ramp function. If unspecified, it defaults to 1.
97
+ `short_factor` (`List[float]`, *optional*):
98
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
99
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
100
+ size divided by the number of attention heads divided by 2
101
+ `long_factor` (`List[float]`, *optional*):
102
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
103
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
104
+ size divided by the number of attention heads divided by 2
105
+ `low_freq_factor` (`float`, *optional*):
106
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
107
+ `high_freq_factor` (`float`, *optional*):
108
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
109
+ attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
110
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
111
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
112
+ Whether to use sliding window attention.
113
+ sliding_window (`int`, *optional*, defaults to 4096):
114
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
115
+ max_window_layers (`int`, *optional*, defaults to 28):
116
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
117
+ attention_dropout (`float`, *optional*, defaults to 0.0):
118
+ The dropout ratio for the attention probabilities.
119
+
120
+ ```python
121
+ >>> from transformers import SDARModel, SDARConfig
122
+
123
+ >>> # Initializing a SDAR style configuration
124
+ >>> configuration = SDARConfig()
125
+
126
+ >>> # Initializing a model from the SDAR-8B style configuration
127
+ >>> model = SDARModel(configuration)
128
+
129
+ >>> # Accessing the model configuration
130
+ >>> configuration = model.config
131
+ ```"""
132
+
133
+ model_type = "sdar"
134
+ keys_to_ignore_at_inference = ["past_key_values"]
135
+
136
+ # Default tensor parallel plan for base model `SDAR`
137
+ base_model_tp_plan = {
138
+ "layers.*.self_attn.q_proj": "colwise",
139
+ "layers.*.self_attn.k_proj": "colwise",
140
+ "layers.*.self_attn.v_proj": "colwise",
141
+ "layers.*.self_attn.o_proj": "rowwise",
142
+ "layers.*.mlp.gate_proj": "colwise",
143
+ "layers.*.mlp.up_proj": "colwise",
144
+ "layers.*.mlp.down_proj": "rowwise",
145
+ }
146
+ base_model_pp_plan = {
147
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
148
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
149
+ "norm": (["hidden_states"], ["hidden_states"]),
150
+ }
151
+
152
+ def __init__(
153
+ self,
154
+ vocab_size=151936,
155
+ hidden_size=4096,
156
+ intermediate_size=22016,
157
+ num_hidden_layers=32,
158
+ num_attention_heads=32,
159
+ num_key_value_heads=32,
160
+ head_dim=128,
161
+ hidden_act="silu",
162
+ max_position_embeddings=32768,
163
+ initializer_range=0.02,
164
+ rms_norm_eps=1e-6,
165
+ use_cache=True,
166
+ tie_word_embeddings=False,
167
+ rope_theta=10000.0,
168
+ rope_scaling=None,
169
+ attention_bias=False,
170
+ use_sliding_window=False,
171
+ sliding_window=4096,
172
+ max_window_layers=28,
173
+ attention_dropout=0.0,
174
+ **kwargs,
175
+ ):
176
+ self.vocab_size = vocab_size
177
+ self.max_position_embeddings = max_position_embeddings
178
+ self.hidden_size = hidden_size
179
+ self.intermediate_size = intermediate_size
180
+ self.num_hidden_layers = num_hidden_layers
181
+ self.num_attention_heads = num_attention_heads
182
+ self.use_sliding_window = use_sliding_window
183
+ self.sliding_window = sliding_window # we check `use_sliding_window` in the modeling code
184
+ self.max_window_layers = max_window_layers
185
+
186
+ # for backward compatibility
187
+ if num_key_value_heads is None:
188
+ num_key_value_heads = num_attention_heads
189
+
190
+ self.num_key_value_heads = num_key_value_heads
191
+ self.head_dim = head_dim
192
+ self.hidden_act = hidden_act
193
+ self.initializer_range = initializer_range
194
+ self.rms_norm_eps = rms_norm_eps
195
+ self.use_cache = use_cache
196
+ self.rope_theta = rope_theta
197
+ self.rope_scaling = rope_scaling
198
+ self.attention_bias = attention_bias
199
+ self.attention_dropout = attention_dropout
200
+ # Validate the correctness of rotary position embeddings parameters
201
+ # BC: if there is a 'type' field, move it to 'rope_type'.
202
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
203
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
204
+ rope_config_validation(self)
205
+
206
+ super().__init__(
207
+ tie_word_embeddings=tie_word_embeddings,
208
+ **kwargs,
209
+ )
210
+
211
+
212
+ __all__ = ["SDARConfig"]
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.51.0"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee7beb9c15109330ef7dff91d93ead6ffa2ac3e53d0964a5290415eeb1984de5
3
+ size 4902257696
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d47798317d24a05d9d316c512a44ddf4832881550124bf3bd642a134be6e92b
3
+ size 4915960368
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e86fbfb220a639eaa3b392f54b385e6d059e8846756899b02c23d7adf500f72
3
+ size 4983068496
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2316fc89624629a22569d471a23ff6e5b06d2d163d92d436db674d3904b1b653
3
+ size 1580230264
model.safetensors.index.json ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 16381470720
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
31
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
32
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
141
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
142
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
143
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
144
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
145
+ "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
146
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
147
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
148
+ "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
149
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
150
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
151
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
152
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
153
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
154
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
155
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
156
+ "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
157
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
158
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
159
+ "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
160
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
161
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
162
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
163
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
164
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
165
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
166
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
167
+ "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
168
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
169
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
170
+ "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
171
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
172
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
173
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
179
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
180
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
181
+ "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
182
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
183
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
184
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
273
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
275
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
277
+ "model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
278
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
279
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
282
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
283
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
284
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
285
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
286
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
287
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
288
+ "model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
289
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
290
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
292
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
293
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
294
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
295
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
296
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
297
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
299
+ "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
300
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
301
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
302
+ "model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
303
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
304
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
306
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
307
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
308
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
309
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
310
+ "model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
311
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
312
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
313
+ "model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
314
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
315
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
316
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
317
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
318
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
319
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
320
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
321
+ "model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
322
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
323
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
324
+ "model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
325
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
326
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
327
+ "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
328
+ "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
329
+ "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
330
+ "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
331
+ "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
332
+ "model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
333
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
334
+ "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
335
+ "model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
336
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
337
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
338
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
339
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
340
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
341
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
342
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
343
+ "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
344
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
345
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
346
+ "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
347
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
348
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
349
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
350
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
351
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
352
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
353
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
354
+ "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
355
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
356
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
357
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
358
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
359
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
360
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
361
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
362
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
363
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
364
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
365
+ "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
366
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
367
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
368
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
369
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
370
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
371
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
372
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
373
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
374
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
375
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
376
+ "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
377
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
378
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
379
+ "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
380
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
381
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
382
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
383
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
384
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
385
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
386
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
387
+ "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
388
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
389
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
390
+ "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
391
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
392
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
393
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
394
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
395
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
396
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
397
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
398
+ "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
399
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
400
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
401
+ "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
402
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
403
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
404
+ "model.norm.weight": "model-00004-of-00004.safetensors"
405
+ }
406
+ }
modeling_sdar.py ADDED
@@ -0,0 +1,865 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is modified based on https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen3/modeling_qwen3.py.
2
+ #
3
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
4
+ # This file was automatically generated from src/transformers/models/qwen3/modular_qwen3.py.
5
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
6
+ # the file from the modular. If any change should be done, please apply the change to the
7
+ # modular_qwen3.py file directly. One of our CI enforces this.
8
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
9
+ # coding=utf-8
10
+ # Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
11
+ #
12
+ # Licensed under the Apache License, Version 2.0 (the "License");
13
+ # you may not use this file except in compliance with the License.
14
+ # You may obtain a copy of the License at
15
+ #
16
+ # http://www.apache.org/licenses/LICENSE-2.0
17
+ #
18
+ # Unless required by applicable law or agreed to in writing, software
19
+ # distributed under the License is distributed on an "AS IS" BASIS,
20
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21
+ # See the License for the specific language governing permissions and
22
+ # limitations under the License.
23
+
24
+ from typing import Callable, Optional, Tuple, Union
25
+
26
+ import torch
27
+ from torch import nn
28
+
29
+ from transformers.activations import ACT2FN
30
+ from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
31
+ from transformers.generation import GenerationMixin
32
+ from transformers.integrations import use_kernel_forward_from_hub
33
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
34
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
35
+ from transformers.modeling_layers import GradientCheckpointingLayer
36
+ from transformers.modeling_outputs import (
37
+ BaseModelOutputWithPast,
38
+ CausalLMOutputWithPast,
39
+ QuestionAnsweringModelOutput,
40
+ SequenceClassifierOutputWithPast,
41
+ TokenClassifierOutput,
42
+ )
43
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
44
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
45
+ from transformers.processing_utils import Unpack
46
+ from transformers.utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
47
+ from .configuration_sdar import SDARConfig
48
+
49
+ from flash_attn.ops.triton.layer_norm import rms_norm_fn as flash_rms_norm
50
+
51
+ import torch.nn.functional as F
52
+ try:
53
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
54
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
55
+ except:
56
+ pass
57
+
58
+ try:
59
+ from liger_kernel.ops.swiglu import LigerSiLUMulFunction # noqa: F401
60
+ liger_kernel_is_available = True
61
+ except ImportError:
62
+ liger_kernel_is_available = False
63
+
64
+
65
+ if is_torch_flex_attn_available():
66
+ from torch.nn.attention.flex_attention import BlockMask, create_block_mask, flex_attention
67
+ from transformers.integrations.flex_attention import make_flex_block_causal_mask
68
+
69
+
70
+ logger = logging.get_logger(__name__)
71
+
72
+
73
+ @use_kernel_forward_from_hub("RMSNorm")
74
+ class SDARRMSNorm(nn.Module):
75
+ def __init__(self, hidden_size, eps=1e-6):
76
+ """
77
+ SDARRMSNorm is equivalent to T5LayerNorm
78
+ """
79
+ super().__init__()
80
+ self.weight = nn.Parameter(torch.ones(hidden_size))
81
+ self.variance_epsilon = eps
82
+
83
+ def forward(self, hidden_states):
84
+ return flash_rms_norm(
85
+ hidden_states, weight=self.weight, bias=None, eps=self.variance_epsilon)
86
+ '''
87
+ input_dtype = hidden_states.dtype
88
+ hidden_states = hidden_states.to(torch.float32)
89
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
90
+ hidden_states = hidden_states * \
91
+ torch.rsqrt(variance + self.variance_epsilon)
92
+ return self.weight * hidden_states.to(input_dtype)
93
+ '''
94
+
95
+ def extra_repr(self):
96
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
97
+
98
+
99
+ class SDARMLP(nn.Module):
100
+ def __init__(self, config):
101
+ super().__init__()
102
+ self.config = config
103
+ self.hidden_size = config.hidden_size
104
+ self.intermediate_size = config.intermediate_size
105
+ self.gate_proj = nn.Linear(
106
+ self.hidden_size, self.intermediate_size, bias=False)
107
+ self.up_proj = nn.Linear(
108
+ self.hidden_size, self.intermediate_size, bias=False)
109
+ self.down_proj = nn.Linear(
110
+ self.intermediate_size, self.hidden_size, bias=False)
111
+ self.act_fn = ACT2FN[config.hidden_act]
112
+
113
+ def forward(self, x):
114
+ if liger_kernel_is_available:
115
+ return self.down_proj(LigerSiLUMulFunction.apply(self.gate_proj(x), self.up_proj(x)))
116
+ else:
117
+ down_proj = self.down_proj(self.act_fn(
118
+ self.gate_proj(x)) * self.up_proj(x))
119
+ return down_proj
120
+
121
+
122
+ def rotate_half(x):
123
+ """Rotates half the hidden dims of the input."""
124
+ x1 = x[..., : x.shape[-1] // 2]
125
+ x2 = x[..., x.shape[-1] // 2:]
126
+ return torch.cat((-x2, x1), dim=-1)
127
+
128
+
129
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
130
+ """Applies Rotary Position Embedding to the query and key tensors.
131
+
132
+ Args:
133
+ q (`torch.Tensor`): The query tensor.
134
+ k (`torch.Tensor`): The key tensor.
135
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
136
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
137
+ position_ids (`torch.Tensor`, *optional*):
138
+ Deprecated and unused.
139
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
140
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
141
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
142
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
143
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
144
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
145
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
146
+ Returns:
147
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
148
+ """
149
+ cos = cos.unsqueeze(unsqueeze_dim)
150
+ sin = sin.unsqueeze(unsqueeze_dim)
151
+ q_embed = (q * cos) + (rotate_half(q) * sin)
152
+ k_embed = (k * cos) + (rotate_half(k) * sin)
153
+ return q_embed, k_embed
154
+
155
+
156
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
157
+ """
158
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
159
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
160
+ """
161
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
162
+ if n_rep == 1:
163
+ return hidden_states
164
+ hidden_states = hidden_states[:, :, None, :, :].expand(
165
+ batch, num_key_value_heads, n_rep, slen, head_dim)
166
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
167
+
168
+
169
+ def eager_attention_forward(
170
+ module: nn.Module,
171
+ query: torch.Tensor,
172
+ key: torch.Tensor,
173
+ value: torch.Tensor,
174
+ attention_mask: Optional[torch.Tensor],
175
+ scaling: float,
176
+ dropout: float = 0.0,
177
+ **kwargs,
178
+ ):
179
+ key_states = repeat_kv(key, module.num_key_value_groups)
180
+ value_states = repeat_kv(value, module.num_key_value_groups)
181
+
182
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
183
+ if attention_mask is not None:
184
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
185
+ attn_weights = attn_weights + causal_mask
186
+
187
+ attn_weights = nn.functional.softmax(
188
+ attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
189
+ attn_weights = nn.functional.dropout(
190
+ attn_weights, p=dropout, training=module.training)
191
+ attn_output = torch.matmul(attn_weights, value_states)
192
+ attn_output = attn_output.transpose(1, 2).contiguous()
193
+
194
+ return attn_output, attn_weights
195
+
196
+
197
+ class SDARAttention(nn.Module):
198
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
199
+
200
+ def __init__(self, config: SDARConfig, layer_idx: int):
201
+ super().__init__()
202
+ self.config = config
203
+ self.layer_idx = layer_idx
204
+ self.head_dim = getattr(
205
+ config, "head_dim", config.hidden_size // config.num_attention_heads)
206
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
207
+ self.scaling = self.head_dim**-0.5
208
+ self.attention_dropout = config.attention_dropout
209
+ self.is_causal = True
210
+
211
+ self.hidden_size = config.hidden_size
212
+ self.num_attention_heads = config.num_attention_heads
213
+ self.num_key_value_heads = config.num_key_value_heads
214
+
215
+ self.q_proj = nn.Linear(
216
+ config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
217
+ )
218
+ self.k_proj = nn.Linear(
219
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
220
+ )
221
+ self.v_proj = nn.Linear(
222
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
223
+ )
224
+ self.o_proj = nn.Linear(
225
+ config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
226
+ )
227
+ # unlike olmo, only on the head dim!
228
+ self.q_norm = SDARRMSNorm(self.head_dim, eps=config.rms_norm_eps)
229
+ # thus post q_norm does not need reshape
230
+ self.k_norm = SDARRMSNorm(self.head_dim, eps=config.rms_norm_eps)
231
+ self.sliding_window = config.sliding_window
232
+ if not (
233
+ self.config.use_sliding_window
234
+ and getattr(self.config, "sliding_window", None) is not None
235
+ and self.layer_idx >= self.config.max_window_layers
236
+ ):
237
+ self.sliding_window = None
238
+
239
+ def forward(
240
+ self,
241
+ hidden_states: torch.Tensor,
242
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
243
+ attention_mask: Optional[torch.Tensor],
244
+ past_key_value: Optional[Cache] = None,
245
+ cache_position: Optional[torch.LongTensor] = None,
246
+ **kwargs: Unpack[FlashAttentionKwargs],
247
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
248
+ input_shape = hidden_states.shape[:-1]
249
+ bsz, q_len = input_shape
250
+ hidden_shape = (*input_shape, -1, self.head_dim)
251
+
252
+ query_states = self.q_norm(self.q_proj(
253
+ hidden_states).view(hidden_shape)).transpose(1, 2)
254
+ key_states = self.k_norm(self.k_proj(
255
+ hidden_states).view(hidden_shape)).transpose(1, 2)
256
+ value_states = self.v_proj(hidden_states).view(
257
+ hidden_shape).transpose(1, 2)
258
+
259
+ cos, sin = position_embeddings
260
+ query_states, key_states = apply_rotary_pos_emb(
261
+ query_states, key_states, cos, sin)
262
+
263
+ if past_key_value is not None and kwargs.get("store_kv", False):
264
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
265
+ key_states, value_states = past_key_value.update(
266
+ key_states, value_states, self.layer_idx)
267
+ elif past_key_value is not None and not kwargs.get("store_kv", False) and len(past_key_value) > self.layer_idx:
268
+ # only retrive, do not store kv
269
+ past_key_states, past_value_states = past_key_value[self.layer_idx]
270
+ key_states = torch.cat(
271
+ [past_key_states, key_states], dim=-2)
272
+ value_states = torch.cat(
273
+ [past_value_states, value_states], dim=-2)
274
+
275
+ attention_mask = attention_mask.bool() if attention_mask is not None else None
276
+ if torch.all(attention_mask): # decoding
277
+ query_states = query_states.transpose(1, 2)
278
+ key_states = key_states.transpose(1, 2)
279
+ value_states = value_states.transpose(1, 2)
280
+ attn_output = flash_attn_func(
281
+ query_states,
282
+ key_states,
283
+ value_states,
284
+ causal=False,
285
+ softmax_scale=self.scaling
286
+ )
287
+
288
+ else: # prefilling
289
+ attn_output = F.scaled_dot_product_attention(
290
+ query=query_states,
291
+ key=key_states,
292
+ value=value_states,
293
+ attn_mask=attention_mask,
294
+ is_causal=False,
295
+ scale=self.scaling,
296
+ enable_gqa=True
297
+ )
298
+ attn_output = attn_output.transpose(1, 2).contiguous()
299
+
300
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
301
+ attn_output = self.o_proj(attn_output)
302
+ return attn_output, None # , attn_weights
303
+
304
+
305
+ class SDARDecoderLayer(GradientCheckpointingLayer):
306
+ def __init__(self, config: SDARConfig, layer_idx: int):
307
+ super().__init__()
308
+ self.hidden_size = config.hidden_size
309
+ self.self_attn = SDARAttention(config=config, layer_idx=layer_idx)
310
+ self.mlp = SDARMLP(config)
311
+ self.input_layernorm = SDARRMSNorm(
312
+ config.hidden_size, eps=config.rms_norm_eps)
313
+ self.post_attention_layernorm = SDARRMSNorm(
314
+ config.hidden_size, eps=config.rms_norm_eps)
315
+ if (
316
+ config.sliding_window and config._attn_implementation != "flash_attention_2"
317
+ ): # diff with Llama is this warning
318
+ logger.warning_once(
319
+ f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
320
+ "unexpected results may be encountered."
321
+ )
322
+
323
+ def forward(
324
+ self,
325
+ hidden_states: torch.Tensor,
326
+ attention_mask: Optional[torch.Tensor] = None,
327
+ position_ids: Optional[torch.LongTensor] = None,
328
+ past_key_value: Optional[Cache] = None,
329
+ output_attentions: Optional[bool] = False,
330
+ use_cache: Optional[bool] = False,
331
+ store_kv: Optional[bool] = False,
332
+ cache_position: Optional[torch.LongTensor] = None,
333
+ # necessary, but kept here for BC
334
+ position_embeddings: Optional[Tuple[torch.Tensor,
335
+ torch.Tensor]] = None,
336
+ **kwargs: Unpack[FlashAttentionKwargs],
337
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
338
+ residual = hidden_states
339
+ hidden_states = self.input_layernorm(hidden_states)
340
+
341
+ # Self Attention
342
+ hidden_states, self_attn_weights = self.self_attn(
343
+ hidden_states=hidden_states,
344
+ attention_mask=attention_mask,
345
+ position_ids=position_ids,
346
+ past_key_value=past_key_value,
347
+ output_attentions=output_attentions,
348
+ use_cache=use_cache,
349
+ store_kv=store_kv,
350
+ cache_position=cache_position,
351
+ position_embeddings=position_embeddings,
352
+ **kwargs,
353
+ )
354
+ hidden_states = residual + hidden_states
355
+
356
+ # Fully Connected
357
+ residual = hidden_states
358
+ hidden_states = self.post_attention_layernorm(hidden_states)
359
+ hidden_states = self.mlp(hidden_states)
360
+ hidden_states = residual + hidden_states
361
+
362
+ outputs = (hidden_states,)
363
+ if output_attentions:
364
+ outputs += (self_attn_weights,)
365
+
366
+ return outputs
367
+
368
+
369
+ @auto_docstring
370
+ class SDARPreTrainedModel(PreTrainedModel):
371
+ config_class = SDARConfig
372
+ base_model_prefix = "model"
373
+ supports_gradient_checkpointing = True
374
+ _no_split_modules = ["SDARDecoderLayer"]
375
+ _skip_keys_device_placement = ["past_key_values"]
376
+ _supports_flash_attn_2 = True
377
+ _supports_sdpa = True
378
+ _supports_flex_attn = True
379
+ _supports_cache_class = True
380
+ _supports_quantized_cache = True
381
+ _supports_static_cache = True
382
+ _supports_attention_backend = True
383
+
384
+ def _init_weights(self, module):
385
+ std = self.config.initializer_range
386
+ if isinstance(module, nn.Linear):
387
+ module.weight.data.normal_(mean=0.0, std=std)
388
+ if module.bias is not None:
389
+ module.bias.data.zero_()
390
+ elif isinstance(module, nn.Embedding):
391
+ module.weight.data.normal_(mean=0.0, std=std)
392
+ if module.padding_idx is not None:
393
+ module.weight.data[module.padding_idx].zero_()
394
+ elif isinstance(module, SDARRMSNorm):
395
+ module.weight.data.fill_(1.0)
396
+
397
+
398
+ class SDARRotaryEmbedding(nn.Module):
399
+ def __init__(self, config: SDARConfig, device=None):
400
+ super().__init__()
401
+ # BC: "rope_type" was originally "type"
402
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
403
+ self.rope_type = config.rope_scaling.get(
404
+ "rope_type", config.rope_scaling.get("type"))
405
+ else:
406
+ self.rope_type = "default"
407
+ self.max_seq_len_cached = config.max_position_embeddings
408
+ self.original_max_seq_len = config.max_position_embeddings
409
+
410
+ self.config = config
411
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
412
+
413
+ inv_freq, self.attention_scaling = self.rope_init_fn(
414
+ self.config, device)
415
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
416
+ self.original_inv_freq = self.inv_freq
417
+
418
+ @torch.no_grad()
419
+ # power user: used with advanced RoPE types (e.g. dynamic rope)
420
+ @dynamic_rope_update
421
+ def forward(self, x, position_ids):
422
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(
423
+ position_ids.shape[0], -1, 1).to(x.device)
424
+ position_ids_expanded = position_ids[:, None, :].float()
425
+
426
+ device_type = x.device.type if isinstance(
427
+ x.device.type, str) and x.device.type != "mps" else "cpu"
428
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
429
+ freqs = (inv_freq_expanded.float() @
430
+ position_ids_expanded.float()).transpose(1, 2)
431
+ emb = torch.cat((freqs, freqs), dim=-1)
432
+ cos = emb.cos() * self.attention_scaling
433
+ sin = emb.sin() * self.attention_scaling
434
+
435
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
436
+
437
+
438
+ @auto_docstring
439
+ class SDARModel(SDARPreTrainedModel):
440
+ def __init__(self, config: SDARConfig):
441
+ super().__init__(config)
442
+ self.padding_idx = config.pad_token_id
443
+ self.vocab_size = config.vocab_size
444
+
445
+ self.embed_tokens = nn.Embedding(
446
+ config.vocab_size, config.hidden_size, self.padding_idx)
447
+ self.layers = nn.ModuleList(
448
+ [SDARDecoderLayer(config, layer_idx)
449
+ for layer_idx in range(config.num_hidden_layers)]
450
+ )
451
+ self.norm = SDARRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
452
+ self.rotary_emb = SDARRotaryEmbedding(config=config)
453
+ self.gradient_checkpointing = False
454
+
455
+ # Initialize weights and apply final processing
456
+ self.post_init()
457
+
458
+ def get_input_embeddings(self):
459
+ return self.embed_tokens
460
+
461
+ def set_input_embeddings(self, value):
462
+ self.embed_tokens = value
463
+
464
+ @can_return_tuple
465
+ @auto_docstring
466
+ def forward(
467
+ self,
468
+ input_ids: Optional[torch.LongTensor] = None,
469
+ attention_mask: Optional[torch.Tensor] = None,
470
+ position_ids: Optional[torch.LongTensor] = None,
471
+ past_key_values: Optional[Cache] = None,
472
+ inputs_embeds: Optional[torch.FloatTensor] = None,
473
+ use_cache: Optional[bool] = None,
474
+ store_kv: Optional[bool] = None,
475
+ output_attentions: Optional[bool] = None,
476
+ output_hidden_states: Optional[bool] = None,
477
+ cache_position: Optional[torch.LongTensor] = None,
478
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
479
+ ) -> BaseModelOutputWithPast:
480
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
481
+ output_hidden_states = (
482
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
483
+ )
484
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
485
+
486
+ if (input_ids is None) ^ (inputs_embeds is not None):
487
+ raise ValueError(
488
+ "You must specify exactly one of input_ids or inputs_embeds")
489
+
490
+ if self.gradient_checkpointing and self.training and use_cache:
491
+ logger.warning_once(
492
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
493
+ )
494
+ use_cache = False
495
+
496
+ # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
497
+ if not isinstance(past_key_values, (type(None), Cache)):
498
+ raise ValueError(
499
+ "The `past_key_values` should be either a `Cache` object or `None`.")
500
+
501
+ if inputs_embeds is None:
502
+ inputs_embeds = self.embed_tokens(input_ids)
503
+
504
+ if use_cache and past_key_values is None:
505
+ past_key_values = DynamicCache()
506
+
507
+ if cache_position is None:
508
+ past_seen_tokens = past_key_values.get_seq_length(
509
+ ) if past_key_values is not None else 0
510
+ cache_position = torch.arange(
511
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
512
+ )
513
+
514
+ if position_ids is None:
515
+ position_ids = cache_position.unsqueeze(0)
516
+
517
+ # causal_mask = self._update_causal_mask(
518
+ # attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
519
+ # )
520
+
521
+ hidden_states = inputs_embeds
522
+
523
+ # create position embeddings to be shared across the decoder layers
524
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
525
+
526
+ # decoder layers
527
+ all_hidden_states = () if output_hidden_states else None
528
+ all_self_attns = () if output_attentions else None
529
+
530
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
531
+ if output_hidden_states:
532
+ all_hidden_states += (hidden_states,)
533
+
534
+ layer_outputs = decoder_layer(
535
+ hidden_states,
536
+ attention_mask=attention_mask,
537
+ position_ids=position_ids,
538
+ past_key_value=past_key_values,
539
+ output_attentions=output_attentions,
540
+ use_cache=use_cache,
541
+ store_kv=store_kv,
542
+ cache_position=cache_position,
543
+ position_embeddings=position_embeddings,
544
+ **flash_attn_kwargs,
545
+ )
546
+
547
+ hidden_states = layer_outputs[0]
548
+
549
+ if output_attentions:
550
+ all_self_attns += (layer_outputs[1],)
551
+
552
+ hidden_states = self.norm(hidden_states)
553
+
554
+ # add hidden states from the last decoder layer
555
+ if output_hidden_states:
556
+ all_hidden_states += (hidden_states,)
557
+
558
+ return BaseModelOutputWithPast(
559
+ last_hidden_state=hidden_states,
560
+ past_key_values=past_key_values if use_cache else None,
561
+ hidden_states=all_hidden_states,
562
+ attentions=all_self_attns,
563
+ )
564
+
565
+ def _update_causal_mask(
566
+ self,
567
+ attention_mask: Union[torch.Tensor, "BlockMask"],
568
+ input_tensor: torch.Tensor,
569
+ cache_position: torch.Tensor,
570
+ past_key_values: Cache,
571
+ output_attentions: bool = False,
572
+ ):
573
+ if self.config._attn_implementation == "flash_attention_2":
574
+ if attention_mask is not None and past_key_values is not None:
575
+ is_padding_right = attention_mask[:, -
576
+ 1].sum().item() != input_tensor.size()[0]
577
+ if is_padding_right:
578
+ raise ValueError(
579
+ "You are attempting to perform batched generation with padding_side='right'"
580
+ " this may lead to unexpected behaviour for Flash Attention version of Qwen3. Make sure to "
581
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
582
+ )
583
+ if attention_mask is not None and 0.0 in attention_mask:
584
+ return attention_mask
585
+ return None
586
+ if self.config._attn_implementation == "flex_attention":
587
+ if isinstance(attention_mask, torch.Tensor):
588
+ seq_len_q, seq_len_kv = attention_mask.shape
589
+ assert seq_len_q == seq_len_kv, f"got {attention_mask.shape=}"
590
+ attention_mask = create_block_mask(
591
+ # 2d bool tensor, shape: [2*seqlen, 2*seqlen]
592
+ lambda b, h, q_idx, kv_idx: attention_mask[q_idx, kv_idx],
593
+ B=None, H=None, Q_LEN=seq_len_q, KV_LEN=seq_len_kv,
594
+ )
595
+ else:
596
+ # Here we pass in flex mask computed externally
597
+ assert isinstance(attention_mask, BlockMask)
598
+ return attention_mask
599
+
600
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
601
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
602
+ # to infer the attention mask.
603
+ past_seen_tokens = past_key_values.get_seq_length(
604
+ ) if past_key_values is not None else 0
605
+ using_static_cache = isinstance(past_key_values, StaticCache)
606
+ using_sliding_window_cache = isinstance(
607
+ past_key_values, SlidingWindowCache)
608
+
609
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
610
+ if (
611
+ self.config._attn_implementation == "sdpa"
612
+ and not (using_static_cache or using_sliding_window_cache)
613
+ and not output_attentions
614
+ ):
615
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
616
+ attention_mask,
617
+ inputs_embeds=input_tensor,
618
+ past_key_values_length=past_seen_tokens,
619
+ sliding_window=self.config.sliding_window,
620
+ is_training=self.training,
621
+ ):
622
+ return None
623
+
624
+ dtype = input_tensor.dtype
625
+ min_dtype = torch.finfo(dtype).min
626
+ sequence_length = input_tensor.shape[1]
627
+ # SlidingWindowCache or StaticCache
628
+ if using_sliding_window_cache or using_static_cache:
629
+ target_length = past_key_values.get_max_cache_shape()
630
+ # DynamicCache or no cache
631
+ else:
632
+ target_length = (
633
+ attention_mask.shape[-1]
634
+ if isinstance(attention_mask, torch.Tensor)
635
+ else past_seen_tokens + sequence_length + 1
636
+ )
637
+
638
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
639
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
640
+ attention_mask,
641
+ sequence_length=sequence_length,
642
+ target_length=target_length,
643
+ dtype=dtype,
644
+ cache_position=cache_position,
645
+ batch_size=input_tensor.shape[0],
646
+ config=self.config,
647
+ past_key_values=past_key_values,
648
+ )
649
+
650
+ if (
651
+ self.config._attn_implementation == "sdpa"
652
+ and attention_mask is not None
653
+ and attention_mask.device.type in ["cuda", "xpu", "npu"]
654
+ and not output_attentions
655
+ ):
656
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
657
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
658
+ # Details: https://github.com/pytorch/pytorch/issues/110213
659
+ causal_mask = AttentionMaskConverter._unmask_unattended(
660
+ causal_mask, min_dtype)
661
+
662
+ return causal_mask
663
+
664
+ @staticmethod
665
+ def _prepare_4d_causal_attention_mask_with_cache_position(
666
+ attention_mask: torch.Tensor,
667
+ sequence_length: int,
668
+ target_length: int,
669
+ dtype: torch.dtype,
670
+ cache_position: torch.Tensor,
671
+ batch_size: int,
672
+ config: SDARConfig,
673
+ past_key_values: Cache,
674
+ ):
675
+ """
676
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
677
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
678
+
679
+ Args:
680
+ attention_mask (`torch.Tensor`):
681
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
682
+ sequence_length (`int`):
683
+ The sequence length being processed.
684
+ target_length (`int`):
685
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
686
+ dtype (`torch.dtype`):
687
+ The dtype to use for the 4D attention mask.
688
+ cache_position (`torch.Tensor`):
689
+ Indices depicting the position of the input sequence tokens in the sequence.
690
+ batch_size (`torch.Tensor`):
691
+ Batch size.
692
+ config (`SDARConfig`):
693
+ The model's configuration class
694
+ past_key_values (`Cache`):
695
+ The cache class that is being used currently to generate
696
+ """
697
+ if attention_mask is not None and attention_mask.dim() == 4:
698
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
699
+ causal_mask = attention_mask
700
+ else:
701
+ min_dtype = torch.finfo(dtype).min
702
+ causal_mask = torch.full(
703
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
704
+ )
705
+ diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
706
+ -1, 1
707
+ )
708
+ text_config = config.get_text_config()
709
+ if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None:
710
+ # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
711
+ # the check is needed to verify is current checkpoint was trained with sliding window or not
712
+ if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
713
+ sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
714
+ cache_position.reshape(-1, 1) -
715
+ text_config.sliding_window
716
+ )
717
+ diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
718
+ causal_mask *= diagonal_attend_mask
719
+ causal_mask = causal_mask[None, None,
720
+ :, :].expand(batch_size, 1, -1, -1)
721
+ if attention_mask is not None:
722
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
723
+ if attention_mask.shape[-1] > target_length:
724
+ attention_mask = attention_mask[:, :target_length]
725
+ mask_length = attention_mask.shape[-1]
726
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
727
+ causal_mask.device
728
+ )
729
+ padding_mask = padding_mask == 0
730
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
731
+ padding_mask, min_dtype
732
+ )
733
+ return causal_mask
734
+
735
+
736
+ class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs):
737
+ ...
738
+
739
+
740
+ @auto_docstring
741
+ class SDARForCausalLM(SDARPreTrainedModel, GenerationMixin):
742
+ _tied_weights_keys = ["lm_head.weight"]
743
+ _tp_plan = {"lm_head": "colwise_rep"}
744
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
745
+
746
+ def __init__(self, config):
747
+ super().__init__(config)
748
+ self.model = SDARModel(config)
749
+ self.vocab_size = config.vocab_size
750
+ self.lm_head = nn.Linear(
751
+ config.hidden_size, config.vocab_size, bias=False)
752
+
753
+ # Initialize weights and apply final processing
754
+ self.post_init()
755
+
756
+ def get_input_embeddings(self):
757
+ return self.model.embed_tokens
758
+
759
+ def set_input_embeddings(self, value):
760
+ self.model.embed_tokens = value
761
+
762
+ def get_output_embeddings(self):
763
+ return self.lm_head
764
+
765
+ def set_output_embeddings(self, new_embeddings):
766
+ self.lm_head = new_embeddings
767
+
768
+ def set_decoder(self, decoder):
769
+ self.model = decoder
770
+
771
+ def get_decoder(self):
772
+ return self.model
773
+
774
+ @can_return_tuple
775
+ @auto_docstring
776
+ def forward(
777
+ self,
778
+ input_ids: Optional[torch.LongTensor] = None,
779
+ attention_mask: Optional[torch.Tensor] = None,
780
+ position_ids: Optional[torch.LongTensor] = None,
781
+ past_key_values: Optional[Cache] = None,
782
+ inputs_embeds: Optional[torch.FloatTensor] = None,
783
+ labels: Optional[torch.LongTensor] = None,
784
+ use_cache: Optional[bool] = None,
785
+ output_attentions: Optional[bool] = None,
786
+ output_hidden_states: Optional[bool] = None,
787
+ cache_position: Optional[torch.LongTensor] = None,
788
+ logits_to_keep: Union[int, torch.Tensor] = 0,
789
+ **kwargs: Unpack[KwargsForCausalLM],
790
+ ) -> CausalLMOutputWithPast:
791
+ r"""
792
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
793
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
794
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
795
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
796
+
797
+ Example:
798
+
799
+ ```python
800
+ >>> from transformers import AutoTokenizer, SDARForCausalLM
801
+
802
+ >>> model = SDARForCausalLM.from_pretrained("DiffuOpen/SDAR-1.7B-Chat")
803
+ >>> tokenizer = AutoTokenizer.from_pretrained("DiffuOpen/SDAR-1.7B-Chat")
804
+
805
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
806
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
807
+
808
+ >>> # Generate
809
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
810
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
811
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
812
+ ```"""
813
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
814
+ output_hidden_states = (
815
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
816
+ )
817
+
818
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
819
+ outputs: BaseModelOutputWithPast = self.model(
820
+ input_ids=input_ids,
821
+ attention_mask=attention_mask,
822
+ position_ids=position_ids,
823
+ past_key_values=past_key_values,
824
+ inputs_embeds=inputs_embeds,
825
+ use_cache=use_cache,
826
+ output_attentions=output_attentions,
827
+ output_hidden_states=output_hidden_states,
828
+ cache_position=cache_position,
829
+ **kwargs,
830
+ )
831
+
832
+ hidden_states = outputs.last_hidden_state
833
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
834
+ slice_indices = slice(-logits_to_keep,
835
+ None) if isinstance(logits_to_keep, int) else logits_to_keep
836
+ hidden_states = hidden_states[:, slice_indices, :].contiguous()
837
+ fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training
838
+ if fuse_linear_and_cross_entropy:
839
+ # When using fused_linear_ce_loss, we do not compute the whole logits on HBM
840
+ logits = None
841
+ else:
842
+ logits = self.lm_head(hidden_states)
843
+
844
+ loss = None
845
+ if labels is not None:
846
+ # FusedLinearCrossEntropyLoss will be implemented by monkey patch when training
847
+ # We don't use it when inferencing
848
+ loss_fct = nn.CrossEntropyLoss() # nn.CE
849
+ loss = loss_fct(
850
+ logits.view(-1, self.config.vocab_size), labels.view(-1))
851
+
852
+ return CausalLMOutputWithPast(
853
+ loss=loss,
854
+ logits=logits,
855
+ past_key_values=outputs.past_key_values,
856
+ hidden_states=outputs.hidden_states,
857
+ attentions=outputs.attentions,
858
+ )
859
+
860
+
861
+ __all__ = [
862
+ "SDARForCausalLM",
863
+ "SDARModel",
864
+ "SDARPreTrainedModel",
865
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<MASK>"
17
+ ],
18
+ "eos_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<|endoftext|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ }
32
+ }
tokenization_qwen2.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Qwen2."""
16
+
17
+ import json
18
+ import os
19
+ import unicodedata
20
+ from functools import lru_cache
21
+ from typing import Optional, Tuple
22
+
23
+ import regex as re
24
+
25
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
26
+ from transformers.utils import logging
27
+
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+ VOCAB_FILES_NAMES = {
32
+ "vocab_file": "vocab.json",
33
+ "merges_file": "merges.txt",
34
+ }
35
+
36
+
37
+ MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
38
+
39
+ PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
40
+
41
+
42
+ @lru_cache()
43
+ # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
44
+ def bytes_to_unicode():
45
+ """
46
+ Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
47
+ characters the bpe code barfs on.
48
+
49
+ The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
50
+ if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
51
+ decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
52
+ tables between utf-8 bytes and unicode strings.
53
+ """
54
+ bs = (
55
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
56
+ )
57
+ cs = bs[:]
58
+ n = 0
59
+ for b in range(2**8):
60
+ if b not in bs:
61
+ bs.append(b)
62
+ cs.append(2**8 + n)
63
+ n += 1
64
+ cs = [chr(n) for n in cs]
65
+ return dict(zip(bs, cs))
66
+
67
+
68
+ # Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
69
+ def get_pairs(word):
70
+ """
71
+ Return set of symbol pairs in a word.
72
+
73
+ Word is represented as tuple of symbols (symbols being variable-length strings).
74
+ """
75
+ pairs = set()
76
+ prev_char = word[0]
77
+ for char in word[1:]:
78
+ pairs.add((prev_char, char))
79
+ prev_char = char
80
+ return pairs
81
+
82
+
83
+ class Qwen2Tokenizer(PreTrainedTokenizer):
84
+ """
85
+ Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.
86
+
87
+ Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
88
+ be encoded differently whether it is at the beginning of the sentence (without space) or not:
89
+
90
+ ```python
91
+ >>> from transformers import Qwen2Tokenizer
92
+
93
+ >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
94
+ >>> tokenizer("Hello world")["input_ids"]
95
+ [9707, 1879]
96
+
97
+ >>> tokenizer(" Hello world")["input_ids"]
98
+ [21927, 1879]
99
+ ```
100
+ This is expected.
101
+
102
+ You should not use GPT2Tokenizer instead, because of the different pretokenization rules.
103
+
104
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
105
+ this superclass for more information regarding those methods.
106
+
107
+ Args:
108
+ vocab_file (`str`):
109
+ Path to the vocabulary file.
110
+ merges_file (`str`):
111
+ Path to the merges file.
112
+ errors (`str`, *optional*, defaults to `"replace"`):
113
+ Paradigm to follow when decoding bytes to UTF-8. See
114
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
115
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
116
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
117
+ token instead.
118
+ bos_token (`str`, *optional*):
119
+ The beginning of sequence token. Not applicable for this tokenizer.
120
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
121
+ The end of sequence token.
122
+ pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
123
+ The token used for padding, for example when batching sequences of different lengths.
124
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
125
+ Whether or not the model should cleanup the spaces that were added when splitting the input text during the
126
+ tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
127
+ split_special_tokens (`bool`, *optional*, defaults to `False`):
128
+ Whether or not the special tokens should be split during the tokenization process. The default behavior is
129
+ to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
130
+ ['<|endoftext|>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will be give `['<',
131
+ '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
132
+ """
133
+
134
+ vocab_files_names = VOCAB_FILES_NAMES
135
+ model_input_names = ["input_ids", "attention_mask"]
136
+
137
+ def __init__(
138
+ self,
139
+ vocab_file,
140
+ merges_file,
141
+ errors="replace",
142
+ unk_token="<|endoftext|>",
143
+ bos_token=None,
144
+ eos_token="<|endoftext|>",
145
+ pad_token="<|endoftext|>",
146
+ clean_up_tokenization_spaces=False,
147
+ split_special_tokens=False,
148
+ **kwargs,
149
+ ):
150
+ # Qwen vocab does not contain control tokens; added tokens need to be special
151
+ bos_token = (
152
+ AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
153
+ if isinstance(bos_token, str)
154
+ else bos_token
155
+ )
156
+ eos_token = (
157
+ AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
158
+ if isinstance(eos_token, str)
159
+ else eos_token
160
+ )
161
+ unk_token = (
162
+ AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
163
+ if isinstance(unk_token, str)
164
+ else unk_token
165
+ )
166
+ pad_token = (
167
+ AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
168
+ if isinstance(pad_token, str)
169
+ else pad_token
170
+ )
171
+
172
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
173
+ self.encoder = json.load(vocab_handle)
174
+ self.decoder = {v: k for k, v in self.encoder.items()}
175
+ self.errors = errors # how to handle errors in decoding
176
+ self.byte_encoder = bytes_to_unicode()
177
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
178
+ bpe_merges = []
179
+ with open(merges_file, encoding="utf-8") as merges_handle:
180
+ for i, line in enumerate(merges_handle):
181
+ line = line.strip()
182
+ if (i == 0 and line.startswith("#version:")) or not line:
183
+ continue
184
+ bpe_merges.append(tuple(line.split()))
185
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
186
+ # NOTE: the cache can grow without bound and will get really large for long running processes
187
+ # (esp. for texts of language that do not use space between word, e.g. Chinese); technically
188
+ # not a memory leak but appears as one.
189
+ # GPT2Tokenizer has the same problem, so let's be consistent.
190
+ self.cache = {}
191
+
192
+ self.pat = re.compile(PRETOKENIZE_REGEX)
193
+
194
+ if kwargs.get("add_prefix_space", False):
195
+ logger.warning_once(
196
+ f"{self.__class__.__name} does not support `add_prefix_space`, setting it to True has no effect."
197
+ )
198
+
199
+ super().__init__(
200
+ errors=errors,
201
+ bos_token=bos_token,
202
+ eos_token=eos_token,
203
+ pad_token=pad_token,
204
+ unk_token=unk_token,
205
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
206
+ split_special_tokens=split_special_tokens,
207
+ **kwargs,
208
+ )
209
+
210
+ @property
211
+ def vocab_size(self) -> int:
212
+ return len(self.encoder)
213
+
214
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
215
+ def get_vocab(self):
216
+ return dict(self.encoder, **self.added_tokens_encoder)
217
+
218
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
219
+ def bpe(self, token):
220
+ if token in self.cache:
221
+ return self.cache[token]
222
+ word = tuple(token)
223
+ pairs = get_pairs(word)
224
+
225
+ if not pairs:
226
+ return token
227
+
228
+ while True:
229
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
230
+ if bigram not in self.bpe_ranks:
231
+ break
232
+ first, second = bigram
233
+ new_word = []
234
+ i = 0
235
+ while i < len(word):
236
+ try:
237
+ j = word.index(first, i)
238
+ except ValueError:
239
+ new_word.extend(word[i:])
240
+ break
241
+ else:
242
+ new_word.extend(word[i:j])
243
+ i = j
244
+
245
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
246
+ new_word.append(first + second)
247
+ i += 2
248
+ else:
249
+ new_word.append(word[i])
250
+ i += 1
251
+ new_word = tuple(new_word)
252
+ word = new_word
253
+ if len(word) == 1:
254
+ break
255
+ else:
256
+ pairs = get_pairs(word)
257
+ word = " ".join(word)
258
+ self.cache[token] = word
259
+ return word
260
+
261
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
262
+ def _tokenize(self, text):
263
+ """Tokenize a string."""
264
+ bpe_tokens = []
265
+ for token in re.findall(self.pat, text):
266
+ token = "".join(
267
+ self.byte_encoder[b] for b in token.encode("utf-8")
268
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
269
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
270
+ return bpe_tokens
271
+
272
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
273
+ def _convert_token_to_id(self, token):
274
+ """Converts a token (str) in an id using the vocab."""
275
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
276
+
277
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
278
+ def _convert_id_to_token(self, index):
279
+ """Converts an index (integer) in a token (str) using the vocab."""
280
+ return self.decoder.get(index)
281
+
282
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
283
+ def convert_tokens_to_string(self, tokens):
284
+ """Converts a sequence of tokens (string) in a single string."""
285
+ text = "".join(tokens)
286
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
287
+ return text
288
+
289
+ def decode(
290
+ self,
291
+ token_ids,
292
+ skip_special_tokens: bool = False,
293
+ clean_up_tokenization_spaces: Optional[bool] = False,
294
+ spaces_between_special_tokens: bool = False,
295
+ **kwargs,
296
+ ) -> str:
297
+ # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
298
+ # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
299
+ return super().decode(
300
+ token_ids,
301
+ skip_special_tokens=skip_special_tokens,
302
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
303
+ spaces_between_special_tokens=spaces_between_special_tokens,
304
+ **kwargs,
305
+ )
306
+
307
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
308
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
309
+ if not os.path.isdir(save_directory):
310
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
311
+ return
312
+ vocab_file = os.path.join(
313
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
314
+ )
315
+ merge_file = os.path.join(
316
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
317
+ )
318
+
319
+ with open(vocab_file, "w", encoding="utf-8") as f:
320
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
321
+
322
+ index = 0
323
+ with open(merge_file, "w", encoding="utf-8") as writer:
324
+ writer.write("#version: 0.2\n")
325
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
326
+ if index != token_index:
327
+ logger.warning(
328
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
329
+ " Please check that the tokenizer is not corrupted!"
330
+ )
331
+ index = token_index
332
+ writer.write(" ".join(bpe_tokens) + "\n")
333
+ index += 1
334
+
335
+ return vocab_file, merge_file
336
+
337
+ def prepare_for_tokenization(self, text, **kwargs):
338
+ text = unicodedata.normalize("NFC", text)
339
+ return (text, kwargs)
340
+
341
+
342
+ __all__ = ["Qwen2Tokenizer"]
tokenization_qwen2_fast.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Qwen2."""
16
+
17
+ from typing import Optional, Tuple
18
+
19
+ from transformers.tokenization_utils import AddedToken
20
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
21
+ from transformers.utils import logging
22
+ from .tokenization_qwen2 import Qwen2Tokenizer
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+ VOCAB_FILES_NAMES = {
28
+ "vocab_file": "vocab.json",
29
+ "merges_file": "merges.txt",
30
+ "tokenizer_file": "tokenizer.json",
31
+ }
32
+
33
+
34
+ MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
35
+
36
+
37
+ class Qwen2TokenizerFast(PreTrainedTokenizerFast):
38
+ """
39
+ Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
40
+ Byte-Pair-Encoding.
41
+
42
+ Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
43
+ be encoded differently whether it is at the beginning of the sentence (without space) or not:
44
+
45
+ ```python
46
+ >>> from transformers import Qwen2TokenizerFast
47
+
48
+ >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
49
+ >>> tokenizer("Hello world")["input_ids"]
50
+ [9707, 1879]
51
+
52
+ >>> tokenizer(" Hello world")["input_ids"]
53
+ [21927, 1879]
54
+ ```
55
+ This is expected.
56
+
57
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
58
+ refer to this superclass for more information regarding those methods.
59
+
60
+ Args:
61
+ vocab_file (`str`, *optional*):
62
+ Path to the vocabulary file.
63
+ merges_file (`str`, *optional*):
64
+ Path to the merges file.
65
+ tokenizer_file (`str`, *optional*):
66
+ Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
67
+ contains everything needed to load the tokenizer.
68
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
69
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
70
+ token instead. Not applicable to this tokenizer.
71
+ bos_token (`str`, *optional*):
72
+ The beginning of sequence token. Not applicable for this tokenizer.
73
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
74
+ The end of sequence token.
75
+ pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
76
+ The token used for padding, for example when batching sequences of different lengths.
77
+ """
78
+
79
+ vocab_files_names = VOCAB_FILES_NAMES
80
+ model_input_names = ["input_ids", "attention_mask"]
81
+ slow_tokenizer_class = Qwen2Tokenizer
82
+
83
+ def __init__(
84
+ self,
85
+ vocab_file=None,
86
+ merges_file=None,
87
+ tokenizer_file=None,
88
+ unk_token="<|endoftext|>",
89
+ bos_token=None,
90
+ eos_token="<|endoftext|>",
91
+ pad_token="<|endoftext|>",
92
+ **kwargs,
93
+ ):
94
+ # We need to at least pass vocab_file and merges_file to base class
95
+ # in case a slow tokenizer needs to be initialized; other can be
96
+ # configured through files.
97
+ # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token
98
+
99
+ bos_token = (
100
+ AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
101
+ if isinstance(bos_token, str)
102
+ else bos_token
103
+ )
104
+ eos_token = (
105
+ AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
106
+ if isinstance(eos_token, str)
107
+ else eos_token
108
+ )
109
+ unk_token = (
110
+ AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
111
+ if isinstance(unk_token, str)
112
+ else unk_token
113
+ )
114
+ pad_token = (
115
+ AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
116
+ if isinstance(pad_token, str)
117
+ else pad_token
118
+ )
119
+
120
+ super().__init__(
121
+ vocab_file=vocab_file,
122
+ merges_file=merges_file,
123
+ tokenizer_file=tokenizer_file,
124
+ unk_token=unk_token,
125
+ bos_token=bos_token,
126
+ eos_token=eos_token,
127
+ pad_token=pad_token,
128
+ **kwargs,
129
+ )
130
+
131
+ # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
132
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
133
+ files = self._tokenizer.model.save(save_directory, name=filename_prefix)
134
+ return tuple(files)
135
+
136
+
137
+ __all__ = ["Qwen2TokenizerFast"]
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151669": {
215
+ "content": "<|MASK|>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": false
221
+ }
222
+ },
223
+ "additional_special_tokens": [
224
+ "<|im_start|>",
225
+ "<|im_end|>",
226
+ "<|object_ref_start|>",
227
+ "<|object_ref_end|>",
228
+ "<|box_start|>",
229
+ "<|box_end|>",
230
+ "<|quad_start|>",
231
+ "<|quad_end|>",
232
+ "<|vision_start|>",
233
+ "<|vision_end|>",
234
+ "<|vision_pad|>",
235
+ "<|image_pad|>",
236
+ "<|video_pad|>",
237
+ "<|MASK|>"
238
+ ],
239
+ "auto_map": {
240
+ "AutoTokenizer": [
241
+ "tokenization_qwen2.Qwen2Tokenizer",
242
+ null
243
+ ]
244
+ },
245
+ "bos_token": null,
246
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
247
+ "clean_up_tokenization_spaces": false,
248
+ "eos_token": "<|endoftext|>",
249
+ "mask_token": "<|MASK|>",
250
+ "errors": "replace",
251
+ "model_max_length": 131072,
252
+ "pad_token": "<|endoftext|>",
253
+ "split_special_tokens": false,
254
+ "tokenizer_class": "Qwen2Tokenizer",
255
+ "unk_token": null
256
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff