LandyGuo committed on
Commit
5110b7e
·
1 Parent(s): 46e1d99

update 0504 version

Browse files
Ming_Uni/MingUniInference.py ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import copy
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
+ from diffusers import DPMSolverMultistepScheduler, AutoencoderDC, FlowMatchEulerDiscreteScheduler
7
+ from safetensors.torch import load_file
8
+ from .qwen2_5_vit import Qwen2_5_VisionTransformer
9
+ from .modeling_qwen2_native import Qwen2ForCausalLM
10
+ from .sana_transformer import SanaTransformer2DModel
11
+ from .sana_loss import SANALoss
12
+ from copy import deepcopy
13
+ from IPython import embed
14
+
15
+ import logging
16
+ logger = logging.getLogger(__name__)
17
+
18
+ from .Templates_native import (
19
+ DEFAULT_IMAGE_PATCH_TOKEN,
20
+ DEFAULT_IM_START_TOKEN,
21
+ DEFAULT_IM_END_TOKEN,
22
+ DEFAULT_VID_START_TOKEN,
23
+ DEFAULT_VID_END_TOKEN,
24
+ DEFAULT_GEN_IMAGE_PATCH_TOKEN,
25
+ DEFAULT_GEN_IM_START_TOKEN,
26
+ DEFAULT_GEN_IM_END_TOKEN,
27
+ PLACEHOLDER_IMAGE_TOKEN_IN_TEXT,
28
+ DEFAULT_END_OF_CHUNK_TOKEN,
29
+ DEFAULT_END_OF_AUDIO_TOKEN,
30
+ DEFAULT_AUDIO_PATCH_TOKEN,
31
+ DEFAULT_AU_START_TOKEN,
32
+ DEFAULT_AU_END_TOKEN,
33
+ DEFAULT_GEN_AUDIO_PATCH_TOKEN,
34
+ DEFAULT_GEN_AU_START_TOKEN,
35
+ DEFAULT_GEN_AU_END_TOKEN,
36
+ PLACEHOLDER_AUDIO_TOKEN_IN_TEXT,
37
+ DEFAULT_FRAME_PATCH_TOKEN,
38
+ interleave_tokens,
39
+ )
40
# Extra special tokens for the Qwen2 tokenizer: a list-item marker followed by
# the opening/closing pair of each structural tag, in the order listed below.
_PAIRED_TAG_NAMES = ("html", "body", "table", "tr", "td", "think", "answer")
additional_special_tokens_qwen2 = ["[item]"] + [
    tag
    for tag_name in _PAIRED_TAG_NAMES
    for tag in (f"<{tag_name}>", f"</{tag_name}>")
]
57
+
58
def expand_gen_embeds_as_learnable_scales(
    clip_feat,
    image_grid_thw,
    scales,
    isgen_indicators,
    learnable_queries_1d,
):
    """Swap the features of to-be-generated images for per-scale zero placeholders.

    ``clip_feat`` stacks the features of all images along dim 0.  Image ``i``
    owns ``(h // 2) * (w // 2)`` consecutive rows, where ``(t, h, w)`` is row
    ``i`` of ``image_grid_thw``.  For every image flagged in
    ``isgen_indicators`` those rows are dropped and replaced by one all-zero
    block of ``scale * scale`` rows per entry of ``scales``; unflagged images
    are passed through untouched.

    Args:
        clip_feat: 2-D tensor of shape ``(total_rows, feat_dim)``.
        image_grid_thw: 2-D integer tensor with one ``(t, h, w)`` row per
            image.  ``t`` must be 1 and ``h``/``w`` must be even.
        scales: Iterable of ints; each yields a ``scale * scale`` placeholder.
        isgen_indicators: One truthy/falsy flag per row of ``image_grid_thw``.
        learnable_queries_1d: When truthy a placeholder's grid entry is
            ``[1, 2, 2 * scale * scale]``, otherwise ``[1, 2 * scale, 2 * scale]``.

    Returns:
        Tuple ``(features, grid)``: the re-assembled feature rows (same dtype
        and device as ``clip_feat``) and the matching grid tensor (same dtype
        and device as ``image_grid_thw``).

    Raises:
        AssertionError: If the inputs are inconsistent (wrong rank, odd grid
            sizes, or row counts that do not add up to ``clip_feat.shape[0]``).
    """
    assert image_grid_thw.ndim == 2
    assert clip_feat.ndim == 2
    assert len(isgen_indicators) == len(image_grid_thw)

    feat_dim = clip_feat.shape[1]
    feature_chunks = []
    new_image_grid_thw = []
    consumed = 0  # rows of clip_feat accounted for so far

    for thw, is_gen in zip(image_grid_thw.tolist(), isgen_indicators):
        assert thw[0] == 1
        assert thw[1] % 2 == 0
        assert thw[2] % 2 == 0
        n_clip_token = (thw[1] // 2) * (thw[2] // 2)
        assert consumed + n_clip_token <= clip_feat.shape[0]

        if is_gen:
            for scale in scales:
                # new_zeros allocates directly with clip_feat's dtype/device
                # instead of building a CPU float32 tensor and converting it.
                feature_chunks.append(clip_feat.new_zeros((scale * scale, feat_dim)))
                if learnable_queries_1d:
                    new_image_grid_thw.append([1, 2, scale * scale * 2])
                else:
                    new_image_grid_thw.append([1, scale * 2, scale * 2])
        else:
            feature_chunks.append(clip_feat[consumed:consumed + n_clip_token, :])
            new_image_grid_thw.append(thw)

        # The original rows are consumed in both branches.
        consumed += n_clip_token

    assert consumed == clip_feat.shape[0]

    # torch.cat rejects an empty list, so build the empty results explicitly.
    if feature_chunks:
        encoder_hidden_states = torch.cat(feature_chunks, dim=0)
    else:
        encoder_hidden_states = clip_feat.new_zeros((0, feat_dim))

    if new_image_grid_thw:
        new_grid = torch.tensor(
            new_image_grid_thw,
            dtype=image_grid_thw.dtype,
            device=image_grid_thw.device,
        )
    else:
        new_grid = image_grid_thw.new_zeros((0, 3))

    return encoder_hidden_states, new_grid
103
+
104
def append_understand_embeds_with_learnable_scales(
    clip_feat,
    image_grid_thw,
    scales,
    dtype,
    device,
    feat_dim,
    learnable_queries_1d,
):
    """Append one to-be-generated image slot after the existing image features.

    A dummy image with grid ``(1, 32, 32)`` (hence ``16 * 16 = 256`` zero
    feature rows) is appended to ``clip_feat`` / ``image_grid_thw`` and then
    expanded into per-scale placeholders by
    ``expand_gen_embeds_as_learnable_scales`` with only that last entry
    flagged as "generate".

    Args:
        clip_feat: 2-D feature tensor of already-encoded images, or ``None``
            when there are none (``image_grid_thw`` must then be ``None`` too).
        image_grid_thw: Grid tensor matching ``clip_feat``, or ``None``.
        scales: Scales forwarded to ``expand_gen_embeds_as_learnable_scales``.
        dtype: Dtype of the dummy rows; must equal ``clip_feat.dtype``.
        device: Device of the dummy tensors; must equal ``clip_feat.device``.
        feat_dim: Feature width; must equal ``clip_feat.shape[-1]``.
        learnable_queries_1d: Forwarded unchanged.

    Returns:
        The ``(features, grid)`` tuple produced by
        ``expand_gen_embeds_as_learnable_scales``.

    Raises:
        AssertionError: If ``clip_feat`` disagrees with ``dtype``/``device``/
            ``feat_dim``, or if only one of the two optional inputs is ``None``
            in the ``clip_feat is None`` case.
    """
    if clip_feat is not None:
        assert feat_dim == clip_feat.shape[-1]
        assert dtype == clip_feat.dtype
        assert device == clip_feat.device
        assert clip_feat.ndim == 2
    else:
        assert image_grid_thw is None

    # The dummy grid and its row count are tied together: the expansion helper
    # consumes (h // 2) * (w // 2) rows for a grid of (1, h, w).
    fake_grid_side = 32
    fake_num_rows = (fake_grid_side // 2) * (fake_grid_side // 2)  # 256

    # Allocate straight on the requested dtype/device (no CPU float32 detour).
    fake_learnable_embed = torch.zeros(fake_num_rows, feat_dim, dtype=dtype, device=device)
    fake_image_grid_thw = torch.tensor(
        [[1, fake_grid_side, fake_grid_side]], dtype=torch.long, device=device
    )

    if clip_feat is not None:
        clip_feat = torch.cat([clip_feat, fake_learnable_embed], dim=0)
    else:
        clip_feat = fake_learnable_embed

    if image_grid_thw is not None:
        image_grid_thw = torch.cat([image_grid_thw, fake_image_grid_thw], dim=0)
    else:
        image_grid_thw = fake_image_grid_thw

    # Only the entry appended above is a generation slot.
    isgen_indicators = [False] * (image_grid_thw.shape[0] - 1) + [True]

    return expand_gen_embeds_as_learnable_scales(
        clip_feat,
        image_grid_thw,
        scales,
        isgen_indicators=isgen_indicators,
        learnable_queries_1d=learnable_queries_1d,
    )
133
+
134
def expand_gen_input_ids_as_learnable_scales(
    text_ids,
    labels,
    attention_mask,
    scales,
    start_token_id,
    end_token_id,
    patch_token_id,
    num_learnable_queries,
):
    """Replace each row's single image span with a multi-scale token span.

    Every row must contain exactly one ``start_token_id`` and one
    ``end_token_id`` in ``labels`` (and the same tokens at the same positions
    in ``text_ids``), separated by ``num_learnable_queries`` tokens.  That
    span is replaced, in ``text_ids`` and ``labels`` alike, by the
    concatenation over ``scales`` of
    ``[start, patch * (scale * scale), end]``; the attention mask is set to 1
    across the new span.

    Args:
        text_ids: 2-D integer tensor ``(batch, seq_len)``.
        labels: Tensor of the same shape as ``text_ids``.
        attention_mask: Tensor of the same shape as ``text_ids``.
        scales: Iterable of ints defining the per-scale patch counts.
        start_token_id: Id of the image start token.
        end_token_id: Id of the image end token.
        patch_token_id: Id of the image patch token.
        num_learnable_queries: Number of tokens expected between the start and
            end token in the incoming rows.

    Returns:
        Tuple ``(text_ids, labels, attention_mask)`` of new tensors with the
        original dtypes and devices.

    Raises:
        AssertionError: If shapes differ or a row does not contain exactly one
            well-formed image span.
    """
    assert text_ids.ndim == 2
    assert text_ids.shape == labels.shape
    assert text_ids.shape == attention_mask.shape

    # Replacement span shared by every row.
    multi_scale_span = []
    for scale in scales:
        multi_scale_span.append(start_token_id)
        multi_scale_span.extend([patch_token_id] * (scale * scale))
        multi_scale_span.append(end_token_id)
    span_mask = [1] * len(multi_scale_span)

    new_text_ids_list = []
    new_labels_list = []
    new_attention_mask_list = []
    rows = zip(text_ids.tolist(), labels.tolist(), attention_mask.tolist())
    for ids_row, labels_row, mask_row in rows:
        assert len(ids_row) == len(labels_row)
        assert len(ids_row) == len(mask_row)

        start_positions = [i for i, tok in enumerate(labels_row) if tok == start_token_id]
        end_positions = [i for i, tok in enumerate(labels_row) if tok == end_token_id]
        assert len(start_positions) == 1, start_positions
        assert len(end_positions) == 1, end_positions
        start_idx = start_positions[0]
        end_idx = end_positions[0]
        assert end_idx - start_idx == num_learnable_queries + 1, (start_idx, end_idx)
        assert ids_row[start_idx] == start_token_id and ids_row[end_idx] == end_token_id

        # Slice assignment copies the elements, so the shared span list is
        # never aliased between rows and no deepcopy is needed.
        ids_row[start_idx:end_idx + 1] = multi_scale_span
        labels_row[start_idx:end_idx + 1] = multi_scale_span
        mask_row[start_idx:end_idx + 1] = span_mask

        new_text_ids_list.append(ids_row)
        new_labels_list.append(labels_row)
        new_attention_mask_list.append(mask_row)

    return (
        torch.tensor(new_text_ids_list, dtype=text_ids.dtype, device=text_ids.device),
        torch.tensor(new_labels_list, dtype=labels.dtype, device=labels.device),
        torch.tensor(
            new_attention_mask_list,
            dtype=attention_mask.dtype,
            device=attention_mask.device,
        ),
    )
185
+
186
+
187
def append_input_ids_with_learnable_scales(
    text_ids,
    scales,
    start_token_id,
    end_token_id,
    patch_token_id,
):
    """Extend a prompt ending in the image start token with a multi-scale image span.

    The single-row prompt is first closed with ``[patch, end]`` so that it
    ends in a minimal ``[start, patch, end]`` span.  Labels are built as
    ``-100`` for every prompt position before the start token followed by that
    span.  Both are then expanded by
    ``expand_gen_input_ids_as_learnable_scales`` into the per-scale layout.

    Args:
        text_ids: Integer tensor of shape ``(1, seq_len)`` whose last token is
            ``start_token_id``.
        scales: Scales forwarded to the expansion helper.
        start_token_id: Id of the image start token.
        end_token_id: Id of the image end token.
        patch_token_id: Id of the image patch token.

    Returns:
        Tuple ``(text_ids, labels)`` of expanded tensors.

    Raises:
        AssertionError: If the batch size is not 1 or the prompt does not end
            with ``start_token_id``.
    """
    assert text_ids.shape[0] == 1
    assert text_ids[0][-1].tolist() == start_token_id

    def _row(token_ids):
        # One-row tensor matching the prompt's dtype and device.
        return torch.tensor([token_ids], dtype=text_ids.dtype, device=text_ids.device)

    labels = torch.cat(
        [
            # -100 on every prompt token except the trailing start token.
            torch.full_like(text_ids[:, :-1], -100),
            _row([start_token_id, patch_token_id, end_token_id]),
        ],
        dim=1,
    )
    text_ids = torch.cat([text_ids, _row([patch_token_id, end_token_id])], dim=1)
    assert labels.shape == text_ids.shape

    attention_mask = torch.ones_like(text_ids)
    text_ids, labels, attention_mask = expand_gen_input_ids_as_learnable_scales(
        text_ids,
        labels,
        attention_mask,
        scales,
        start_token_id,
        end_token_id,
        patch_token_id,
        num_learnable_queries=1,  # the minimal span built above holds one patch token
    )
    return text_ids, labels
221
+
222
+ class Ming_Uni_Inference(nn.Module):
223
def __init__(self, inference_model_path):
    """Store the checkpoint root and load all sub-models from it.

    Args:
        inference_model_path: Directory handed to ``load_from_huggingface``.
    """
    super().__init__()
    self.inference_model_path = inference_model_path
    print('loading from pretrained:', inference_model_path)
    self.load_from_huggingface()
229
+
230
def init_tokens(self):
    """Register the multimodal special tokens and record their ids on the LLM config.

    Adds ``additional_special_tokens_qwen2`` and ``interleave_tokens`` to
    ``self.glm_tokenizer``, then stores the id of each marker token, plus a
    set of fixed token-count settings, as attributes of ``self.glm_config``.
    """
    tokenizer = self.glm_tokenizer
    config = self.glm_config

    tokenizer.add_special_tokens(
        {"additional_special_tokens": additional_special_tokens_qwen2}
    )
    num_new_tokens = tokenizer.add_tokens(interleave_tokens, special_tokens=True)
    logger.warning("init_mm_specail_tokens: generation_num_tokens = {}".format(num_new_tokens))

    # (config attribute, token string) pairs, resolved to ids in this order.
    token_for_attr = (
        ("first_signal_token", "[IMG0]"),
        ("image_start_token", DEFAULT_IM_START_TOKEN),
        ("image_end_token", DEFAULT_IM_END_TOKEN),
        ("image_patch_token", DEFAULT_IMAGE_PATCH_TOKEN),
        ("video_start_token", DEFAULT_VID_START_TOKEN),
        ("video_end_token", DEFAULT_VID_END_TOKEN),
        ("gen_image_start_token", DEFAULT_GEN_IM_START_TOKEN),
        ("gen_image_end_token", DEFAULT_GEN_IM_END_TOKEN),
        ("gen_image_patch_token", DEFAULT_GEN_IMAGE_PATCH_TOKEN),
        ("placeholder_image_token_in_text", PLACEHOLDER_IMAGE_TOKEN_IN_TEXT),
        ("end_of_chunk_token", DEFAULT_END_OF_CHUNK_TOKEN),
        ("end_of_audio_token", DEFAULT_END_OF_AUDIO_TOKEN),
        ("audio_start_token", DEFAULT_AU_START_TOKEN),
        ("audio_end_token", DEFAULT_AU_END_TOKEN),
        ("audio_patch_token", DEFAULT_AUDIO_PATCH_TOKEN),
        ("gen_audio_start_token", DEFAULT_GEN_AU_START_TOKEN),
        ("gen_audio_end_token", DEFAULT_GEN_AU_END_TOKEN),
        ("gen_audio_patch_token", DEFAULT_GEN_AUDIO_PATCH_TOKEN),
        ("placeholder_audio_token_in_text", PLACEHOLDER_AUDIO_TOKEN_IN_TEXT),
        ("frame_patch_token", DEFAULT_FRAME_PATCH_TOKEN),
        # The video patch id is resolved from the image patch token string.
        ("video_patch_token", DEFAULT_IMAGE_PATCH_TOKEN),
    )
    for attr_name, token in token_for_attr:
        setattr(config, attr_name, tokenizer.convert_tokens_to_ids(token))

    # Fixed token-count settings stored alongside the ids.
    token_counts = (
        ("num_image_token", 2560),
        ("num_video_token", 64),
        ("num_audio_token", 32),
        ("num_decoder_image_token", 1024),
        ("num_decoder_audio_token", 512),
    )
    for attr_name, count in token_counts:
        setattr(config, attr_name, count)
276
+
277
def load_from_huggingface(self):
    """Load every sub-model and projection weight from ``self.inference_model_path``.

    Reads the following entries under the root directory: ``qwen2_5_vit``
    (vision encoder), ``qwen2_5_llm`` (tokenizer, config and language model),
    ``connector`` (connector LM) and ``mlp/model.safetensors`` (weights for
    ``linear_proj``, ``proj_in``, ``proj_out`` and the per-scale query
    tokens).  The root itself is passed to ``SANALoss``.

    Besides the modules, this sets ``eos_token_id`` (read by
    ``image_gen_generate`` to stop text decoding), ``scales``,
    ``scale_indices`` and a few mode flags.
    """
    root = self.inference_model_path
    llm_path = os.path.join(root, 'qwen2_5_llm')
    mlp_checkpoint_path = os.path.join(root, 'mlp', 'model.safetensors')

    # Vision encoder.
    self.eva_encoder = Qwen2_5_VisionTransformer.from_pretrained(
        os.path.join(root, 'qwen2_5_vit'),
        attn_implementation="flash_attention_2",
        trust_remote_code=True,
        force_download=True,
    )

    # Language model.  Only the config is read here; instantiating the full
    # model just to take its ``.config`` would load all weights twice.
    self.glm_tokenizer = AutoTokenizer.from_pretrained(llm_path)
    self.glm_config = Qwen2ForCausalLM.config_class.from_pretrained(llm_path)
    # image_gen_generate compares generated ids against self.eos_token_id,
    # which was previously never assigned (AttributeError in chat mode).
    self.eos_token_id = self.glm_tokenizer.eos_token_id

    self.init_tokens()
    self.glm_config.audio_vocab_size = 4099
    self.glm_config.audio_id_shift = 151699
    self.glm_config.spatial_merge_size = 2
    self.glm_config.tokens_per_second = 2
    self.glm_config._attn_implementation = "flash_attention_2"
    self.glm_config.use_llm_3drope = True
    self.glm_model = Qwen2ForCausalLM.from_pretrained(llm_path, config=self.glm_config)
    hidden_size = self.glm_model.config.hidden_size

    # Diffusion head (SANA).
    self.diffloss = SANALoss(
        model_path=root,
        scheduler_path=root,
        vision_dim=hidden_size,
        mlp_checkpoint_path=mlp_checkpoint_path,
        trainable_params="",
    )

    # The projection checkpoint is read once and shared by everything below.
    mlp_state = load_file(mlp_checkpoint_path)

    # Vision-feature projector: Linear -> GELU -> Linear (state-dict keys 0 and 2).
    self.image_emb_dim = 8192
    self.linear_proj = nn.Sequential(
        nn.Linear(self.image_emb_dim, hidden_size),
        nn.GELU(),
        nn.Linear(hidden_size, hidden_size),
    )
    self.linear_proj.load_state_dict(
        {
            name: mlp_state[f"linear_proj.{name}"]
            for name in ("0.weight", "0.bias", "2.weight", "2.bias")
        },
        strict=True,
    )
    self.norm_query_embeds = True

    # Connector LM, with the causal flag cleared on every attention layer.
    self.connector = AutoModelForCausalLM.from_pretrained(os.path.join(root, 'connector'))
    for layer in self.connector.model.layers:
        layer.self_attn.is_causal = False

    connector_hidden_size = self.connector.config.hidden_size
    self.proj_in = nn.Linear(hidden_size, connector_hidden_size)
    self.proj_out = nn.Linear(connector_hidden_size, hidden_size)
    self.proj_in.load_state_dict(
        {name: mlp_state[f"proj_in.{name}"] for name in ("weight", "bias")},
        strict=True,
    )
    self.proj_out.load_state_dict(
        {name: mlp_state[f"proj_out.{name}"] for name in ("weight", "bias")},
        strict=True,
    )

    self.num_learnable_queries = 256
    self.use_multi_scale = True
    self.scales = [4, 8, 16]
    self.learnable_queries_1d = True

    # Per-scale query tokens come straight from the checkpoint; a random
    # initialisation would be overwritten immediately.
    self.query_tokens_dict = nn.ParameterDict()
    for scale in self.scales:
        scale_name = f"{scale}x{scale}"
        self.query_tokens_dict[scale_name] = nn.Parameter(
            mlp_state[f"query_tokens_dict.{scale_name}"]
        )

    # Cumulative end index of each scale's tokens in the concatenated sequence.
    self.scale_indices = []
    current_idx = 0
    for scale in self.scales:
        current_idx += scale * scale
        self.scale_indices.append(current_idx)

    logger.info("All models load done.")
379
+
380
+ @torch.no_grad()
381
+ def image_gen_generate(
382
+ self,
383
+ samples,
384
+ steps=20,
385
+ seed=42,
386
+ cfg=7.0,
387
+ height=512,
388
+ width=512,
389
+ num_max_output_tokens=100,
390
+ ):
391
+ """
+ Greedy-decode text from the prompt and, if the sequence ends with the
+ image start token, generate an image through ``self.diffloss.sample``.
+
392
+ Args:
393
+ samples (dict): A dictionary containing the output of processor
394
+ steps (int): Number of inference steps for diffusion
+ seed (int): forwarded to ``self.diffloss.sample``
+ cfg (float): forwarded to ``self.diffloss.sample``
395
+ height (int): height for output image
396
+ width (int): width for output image
+ num_max_output_tokens (int): upper bound on decoding loop iterations
397
+ Returns:
398
+ result_word (str): output words
399
+ result_image (PIL.Image): output image
+ (``None`` when no image branch was taken; otherwise the sample
+ produced for the last entry of ``self.scales``)
400
+ """
401
+
+ # Only a single prompt (batch size 1) is supported.
402
+ assert samples["input_ids"].ndim == 2
403
+ assert samples["input_ids"].shape[0] == 1
404
+ if samples["input_ids"][0][-1].tolist() != self.glm_config.image_start_token:
405
+ print("Warning: No <image> found at the end of prompt, back to chat mode.")
406
+
407
+ image_embed_list = []
408
+ if ("image" in samples) and (samples["image"] is not None):
+ # NOTE(review): `.device` is read before the list check below, so a
+ # list-valued samples["image"] would fail here; `device` is reassigned
+ # from samples["input_ids"] further down anyway.
409
+ device = samples["image"].device
410
+ images = samples["image"]
411
+ if not isinstance(images, list):
412
+ images = [images]
413
+ else:
414
+ device = samples["input_ids"].device
415
+ images = []
416
+
+ # Encode every non-empty input image with the vision encoder.
417
+ image_embed_list = []
418
+ image_grid_thw = None
419
+ for idx, item in enumerate(images):
420
+ if len(images) > 0 and images[idx].size(0) > 0:
421
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
422
+ pixel_values = images[idx].type(self.eva_encoder.get_dtype())
423
+ image_grid_thw = samples["image_grid_thw"]
424
+ eva_image_feat = self.eva_encoder(pixel_values, grid_thw=image_grid_thw)
425
+
426
+ image_embed_list.append(eva_image_feat)
427
+
+ # Project the vision features into the LLM embedding space.
428
+ image_embeds = None
429
+ inputs_opt_visual = None
430
+ device = samples["input_ids"].device
431
+ if len(image_embed_list) > 0:
432
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
433
+ image_embeds = torch.cat(image_embed_list).to(device)
434
+ image_embeds = image_embeds.float()
435
+
436
+ inputs_opt_visual = self.linear_proj(image_embeds)
437
+
438
+ if self.norm_query_embeds:
439
+ inputs_opt_visual = torch.nn.functional.normalize(inputs_opt_visual, dim=-1)
440
+ else:
+ # NOTE(review): `self.query_embeds_scale` is not assigned in the visible
+ # __init__/load_from_huggingface; this branch is not taken while
+ # `norm_query_embeds` is True -- confirm before enabling it.
441
+ inputs_opt_visual = inputs_opt_visual * self.query_embeds_scale
442
+
443
+ # if self.half_glm:
444
+ # inputs_opt_visual = inputs_opt_visual.half()
445
+
446
+
+ # Static inputs shared by every decoding step.
447
+ inputs = {}
448
+ inputs["input_ids"] = samples["input_ids"].to(device)
449
+ assert "position_ids" not in samples or samples["position_ids"] is None
450
+ inputs["position_ids"] = None
451
+ inputs["attention_mask"] = samples["generation_attention_mask"].to(device)
452
+
453
+ query_embeds_image = inputs_opt_visual
454
+ query_embeds_video = None
455
+ image_grid_thw_video = None
456
+ inputs["query_embeds_image"] = query_embeds_image
457
+ inputs["query_embeds_video"] = query_embeds_video
458
+ inputs["image_grid_thw"] = image_grid_thw
459
+ inputs["image_grid_thw_video"] = image_grid_thw_video
460
+
461
+ output_str = ""
462
+ new_token_ids = None
463
+ new_query_embeds_images = None
464
+ assert inputs["input_ids"].shape[0] == 1
465
+ assert inputs["position_ids"] is None
466
+
+ # NOTE: this counter is never reassigned in this method, so it stays 0 and
+ # `true_input_ids` below always equals `curr_input_ids`.
467
+ num_remaining_image_gen_token = 0
468
+ curr_image_grid_thw = inputs["image_grid_thw"]
+ # Decoding loop: each iteration either expands an image slot, finishes the
+ # image branch, or appends one greedily chosen text token.
469
+ for _ in range(num_max_output_tokens):
470
+ assert num_remaining_image_gen_token >= 0
471
+ curr_input_ids = torch.cat([inputs["input_ids"], new_token_ids], dim=1) if new_token_ids is not None else inputs["input_ids"]
472
+ assert num_remaining_image_gen_token >= 0
473
+ true_input_ids = curr_input_ids if num_remaining_image_gen_token == 0 else curr_input_ids[:,:-1 * (num_remaining_image_gen_token + 1)]
474
+
475
+ curr_query_embeds_image = inputs["query_embeds_image"]
476
+ if new_query_embeds_images is not None:
477
+ if curr_query_embeds_image is None:
478
+ curr_query_embeds_image = new_query_embeds_images
479
+ else:
480
+ curr_query_embeds_image = torch.cat([
481
+ curr_query_embeds_image,
482
+ new_query_embeds_images
483
+ ], dim=0)
484
+
+ # Sequence ends with the image start token: append the multi-scale image
+ # span and its placeholder embeddings, then rerun the loop with them.
+ # The `assert ... is None` checks below mean this branch runs at most once
+ # and only before any text token has been generated.
485
+ if true_input_ids[0][-1].tolist() == self.glm_config.image_start_token:
486
+ assert num_remaining_image_gen_token == 0
487
+ apppended_query_embeds_image, curr_image_grid_thw = append_understand_embeds_with_learnable_scales(
488
+ clip_feat=curr_query_embeds_image,
489
+ image_grid_thw=curr_image_grid_thw,
490
+ scales=self.scales,
491
+ dtype=torch.bfloat16,
492
+ device=device,
493
+ feat_dim=self.glm_model.config.hidden_size,
494
+ learnable_queries_1d=self.learnable_queries_1d,
495
+ )
496
+ curr_input_ids, labels = append_input_ids_with_learnable_scales(
497
+ text_ids=true_input_ids,
498
+ scales=self.scales,
499
+ start_token_id=self.glm_model.config.image_start_token,
500
+ end_token_id=self.glm_model.config.image_end_token,
501
+ patch_token_id=self.glm_model.config.image_patch_token,
502
+ )
503
+
+ # Learned query tokens of all scales, concatenated in `self.scales` order.
504
+ learnable_queries_repeat = torch.cat(
505
+ [self.query_tokens_dict[f"{scale}x{scale}"] for scale in self.scales],
506
+ dim=0,
507
+ )
508
+
509
+ # Now compute inner_gen_mask from the updated text_ids and labels
510
+ image_token_mask = (curr_input_ids == self.glm_model.config.image_patch_token).to(device)
511
+ inner_gen_mask = torch.masked_select(labels, image_token_mask) == self.glm_model.config.image_patch_token
512
+ inner_gen_mask = inner_gen_mask.unsqueeze(-1).expand_as(apppended_query_embeds_image).to(apppended_query_embeds_image.device)
513
+
+ # Write the learned queries into the rows selected by inner_gen_mask.
514
+ apppended_query_embeds_image = apppended_query_embeds_image.masked_scatter(
515
+ inner_gen_mask,
516
+ learnable_queries_repeat
517
+ )
518
+ assert new_token_ids is None
519
+ new_token_ids = curr_input_ids[:, true_input_ids.shape[1]:]
520
+ assert new_query_embeds_images is None
521
+ new_query_embeds_images = apppended_query_embeds_image[curr_query_embeds_image.shape[0]:, :] if curr_query_embeds_image is not None else apppended_query_embeds_image
522
+
523
+ continue
524
+
525
+ curr_position_ids = self.glm_model.get_rope_index(curr_input_ids, curr_image_grid_thw)[0]
526
+ true_position_ids = curr_position_ids[:,:,:true_input_ids.shape[1]]
527
+
528
+ outputs = self.glm_model(
529
+ input_ids=true_input_ids,
530
+ query_embeds_image=curr_query_embeds_image,
531
+ query_embeds_video=inputs["query_embeds_video"],
532
+ query_embeds_audio=None,
533
+ target_embeds=None,
534
+ position_ids=true_position_ids,
535
+ attention_mask=None,
536
+ labels=None,
537
+ weights=None,
538
+ image_grid_thw=curr_image_grid_thw,
539
+ image_grid_thw_video=image_grid_thw_video,
540
+ )
541
+
+ # Image branch: `labels` exists only if the image-start branch above ran,
+ # which is also the only place `new_query_embeds_images` becomes non-None.
542
+ if new_query_embeds_images is not None:
543
+ assert labels.shape == true_input_ids.shape
544
+ gen_image_mask = labels == self.glm_model.config.image_patch_token
545
+ assert gen_image_mask.sum().cpu().item() == new_query_embeds_images.shape[0]
546
+ hidden_states_gen = outputs.last_hidden_state[gen_image_mask].view(outputs.last_hidden_state.shape[0], -1, outputs.last_hidden_state.shape[-1])
547
+ assert hidden_states_gen.shape[1] == new_query_embeds_images.shape[0]
548
+ scale_start_idxes = [0] + self.scale_indices[:-1]
549
+ scale_end_idxes = self.scale_indices
550
+ assert scale_end_idxes[-1] == hidden_states_gen.shape[1]
+ # From here on the name holds a dict: scale name -> conditioning embeddings.
551
+ new_query_embeds_images = {}
552
+ for scale, scale_start_idx, scale_end_idx in zip(self.scales, scale_start_idxes, scale_end_idxes):
553
+ scale_name = f"{scale}x{scale}"
554
+ scale_hidden = hidden_states_gen[:, scale_start_idx : scale_end_idx, :]
555
+
556
+
+ # proj_in -> connector (all-ones attention mask) -> proj_out -> L2 normalize.
557
+ scale_embeds = self.proj_in(scale_hidden)
558
+ seq_shape = scale_embeds.shape
559
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
560
+ scale_embeds = self.connector(
561
+ inputs_embeds=scale_embeds,
562
+ attention_mask=torch.ones(seq_shape[0],1,seq_shape[1],seq_shape[1]).to(scale_embeds.device),
563
+ output_hidden_states=True
564
+ ).hidden_states[-1]
565
+ scale_embeds = self.proj_out(scale_embeds)
566
+
567
+
568
+ scale_embeds = torch.nn.functional.normalize(scale_embeds, dim=-1)
569
+ new_query_embeds_images[scale_name] = scale_embeds
570
+
+ # Presumably at the `if` level (every scale is read after the loop):
+ # decoding stops once the image conditioning has been computed.
571
+ break
572
+
+ # Chat mode: greedy next-token selection.
573
+ assert num_remaining_image_gen_token == 0
574
+ new_token_id = outputs.logits[:,-1:,:].argmax(dim=-1)
+ # NOTE(review): `self.eos_token_id` is not assigned in the visible
+ # __init__/load_from_huggingface -- confirm it is set before chat mode is used.
575
+ if (new_token_id.tolist())[0][0] == self.eos_token_id:
576
+ break
577
+
578
+ new_token_ids = torch.cat([new_token_ids, new_token_id], dim=1) if new_token_ids is not None else new_token_id
579
+ output_str = output_str + self.glm_tokenizer.decode(new_token_id.tolist()[0])
580
+
581
+ #multiscale_result = None
+ # Sample once per scale; only the result for the last scale is returned.
582
+ if self.diffloss is not None and new_query_embeds_images is not None:
583
+ #print("curr_image_grid_thw: ", curr_image_grid_thw)
584
+ imgs = []
585
+ for scale in self.scales:
586
+ imgs.append(self.diffloss.sample(new_query_embeds_images[f"{scale}x{scale}"], steps=steps, seed=seed, cfg=cfg, height=height, width=width))
587
+
588
+ #multiscale_result = concat_horizontal(imgs)
589
+ new_query_embeds_images = imgs[-1]
590
+
591
+ # if self.use_multi_scale:
592
+ # return output_str, new_query_embeds_images, multiscale_result
593
+
594
+ return output_str, new_query_embeds_images
595
+
596
+ # Usage example:
597
+ # from MingUniInference import Ming_Uni_Inference
598
+ # model = Ming_Uni_Inference('/videomm/share/models/xinyu/test1')
Ming_Uni/Templates_native.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Special tokens: image / video markers.
DEFAULT_IMAGE_PATCH_TOKEN = "<imagePatch>"
DEFAULT_IM_START_TOKEN = "<image>"
DEFAULT_IM_END_TOKEN = "</image>"
DEFAULT_VID_START_TOKEN = "<video>"
DEFAULT_VID_END_TOKEN = "</video>"
DEFAULT_GEN_IMAGE_PATCH_TOKEN = "<gen_imagePatch>"
DEFAULT_GEN_IM_START_TOKEN = "<gen_image>"
DEFAULT_GEN_IM_END_TOKEN = "</gen_image>"
PLACEHOLDER_IMAGE_TOKEN_IN_TEXT = "<imageHere>"
DEFAULT_END_OF_CHUNK_TOKEN = "<end_of_chunk>"

# Special tokens: audio markers and the frame patch token.
DEFAULT_END_OF_AUDIO_TOKEN = "<end_of_audio>"
DEFAULT_AUDIO_PATCH_TOKEN = "<audioPatch>"
DEFAULT_AU_START_TOKEN = "<audio>"
DEFAULT_AU_END_TOKEN = "</audio>"
DEFAULT_GEN_AUDIO_PATCH_TOKEN = "<gen_audioPatch>"
DEFAULT_GEN_AU_START_TOKEN = "<gen_audio>"
DEFAULT_GEN_AU_END_TOKEN = "</gen_audio>"
PLACEHOLDER_AUDIO_TOKEN_IN_TEXT = "<audioHere>"
DEFAULT_FRAME_PATCH_TOKEN = "<framePatch>"

# Every marker token above, in declaration order.
interleave_tokens = [
    # image / video
    DEFAULT_IMAGE_PATCH_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    DEFAULT_VID_START_TOKEN,
    DEFAULT_VID_END_TOKEN,
    DEFAULT_GEN_IMAGE_PATCH_TOKEN,
    DEFAULT_GEN_IM_START_TOKEN,
    DEFAULT_GEN_IM_END_TOKEN,
    PLACEHOLDER_IMAGE_TOKEN_IN_TEXT,
    DEFAULT_END_OF_CHUNK_TOKEN,
] + [
    # audio / frame
    DEFAULT_END_OF_AUDIO_TOKEN,
    DEFAULT_AUDIO_PATCH_TOKEN,
    DEFAULT_AU_START_TOKEN,
    DEFAULT_AU_END_TOKEN,
    DEFAULT_GEN_AUDIO_PATCH_TOKEN,
    DEFAULT_GEN_AU_START_TOKEN,
    DEFAULT_GEN_AU_END_TOKEN,
    PLACEHOLDER_AUDIO_TOKEN_IN_TEXT,
    DEFAULT_FRAME_PATCH_TOKEN,
]


# Prompt pieces for Qwen2.
START_HEADER_QWEN2 = "<|im_start|>"
END_HEADER_QWEN2 = "<|im_end|>"
QWEN2_SYSTEM_PREFIX = "<|im_start|>system\nYou are a helpful assistant."
QWEN2_USER_PREFIX = "<|im_end|>\n<|im_start|>user\n"
QWEN2_ASSISTANT_PREFIX = "<|im_end|>\n<|im_start|>assistant\n"

# Special tokens for Llama 3 (token ids quoted from the original comments).
START_HEADER = "<|start_header_id|>"  # opens the role of the following message, e.g. "system" (128006)
END_HEADER = "<|end_header_id|>"  # 128007
EOT = "<|eot_id|>"  # end of the input message [128009]
SYSTEM_PREFIX = f"{START_HEADER}system{END_HEADER}\n\n"  # [128006, 9125, 128007, 271]
USER_PREFIX = f"{START_HEADER}user{END_HEADER}\n\n"  # [128006, 882, 128007, 271]
ASSISTANT_PREFIX = f"{START_HEADER}assistant{END_HEADER}\n\n"  # [128006, 78191, 128007, 271]

# Role markers for GLM-style prompts.
GLM_USER_PREFIX = "<role>HUMAN</role>"
GLM_ASSISTANT_PREFIX = "<role>ASSISTANT</role>"
Ming_Uni/__init__.py ADDED
File without changes
Ming_Uni/__pycache__/MingUniInference.cpython-38.pyc ADDED
Binary file (14.8 kB). View file
 
Ming_Uni/__pycache__/Templates_native.cpython-38.pyc ADDED
Binary file (1.74 kB). View file
 
Ming_Uni/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (162 Bytes). View file
 
Ming_Uni/__pycache__/modeling_qwen2_native.cpython-38.pyc ADDED
Binary file (44.1 kB). View file
 
Ming_Uni/__pycache__/modeling_rope_utils.cpython-38.pyc ADDED
Binary file (17.3 kB). View file
 
Ming_Uni/__pycache__/pipeline_sana.cpython-38.pyc ADDED
Binary file (32.3 kB). View file
 
Ming_Uni/__pycache__/process.cpython-38.pyc ADDED
Binary file (7.57 kB). View file
 
Ming_Uni/__pycache__/qwen2_5_vit.cpython-38.pyc ADDED
Binary file (16.1 kB). View file
 
Ming_Uni/__pycache__/qwen2vl_processor.cpython-38.pyc ADDED
Binary file (16.8 kB). View file
 
Ming_Uni/__pycache__/sana_loss.cpython-38.pyc ADDED
Binary file (7.79 kB). View file
 
Ming_Uni/__pycache__/sana_transformer.cpython-38.pyc ADDED
Binary file (17.7 kB). View file
 
Ming_Uni/modeling_qwen2_native.py ADDED
@@ -0,0 +1,1497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers.models.qwen2.modeling_qwen2 import (
6
+ Qwen2MLP,
7
+ Qwen2RMSNorm,
8
+ Qwen2PreTrainedModel,
9
+ rotate_half,
10
+ repeat_kv,
11
+ QWEN2_START_DOCSTRING,
12
+ QWEN2_INPUTS_DOCSTRING,
13
+ Qwen2RotaryEmbedding,
14
+ apply_rotary_pos_emb
15
+ )
16
+
17
+ from IPython import embed
18
+
19
+ from transformers.cache_utils import Cache, SlidingWindowCache, StaticCache
20
+ from .modeling_rope_utils import ROPE_INIT_FUNCTIONS, rope_config_validation
21
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
22
+
23
+ from dataclasses import dataclass
24
+ from typing import List, Optional, Tuple, Union, Dict, Any
25
+ from transformers.utils import (
26
+ add_start_docstrings,
27
+ add_start_docstrings_to_model_forward,
28
+ is_flash_attn_2_available,
29
+ is_flash_attn_greater_or_equal_2_10,
30
+ logging,
31
+ replace_return_docstrings
32
+ )
33
+
34
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
35
+ from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
36
+
37
+ if is_flash_attn_2_available():
38
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
39
+ else:
40
+ flash_attn_varlen_func = None
41
+
42
+ _CONFIG_FOR_DOC = "Qwen2Config"
43
+ logger = logging.get_logger(__name__)
44
+
45
@dataclass
class Bailing2CausalLMOutputWithPast(ModelOutput):
    """
    Output container returned by the Bailing2 causal (autoregressive) language model.

    Every field defaults to `None`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Next-token-prediction language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Scores of the language modeling head for each vocabulary token, before SoftMax.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            One entry per layer (`config.n_layers` in total); each entry holds 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            These are the pre-computed self-attention keys and values; feeding them back through the
            `past_key_values` input speeds up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tensors of shape `(batch_size, sequence_length, hidden_size)`: one for the embedding output
            (if the model has an embedding layer) plus one for the output of each layer.
        last_hidden_state (`torch.FloatTensor`, *optional*):
            NOTE(review): this field was undocumented; presumably the hidden states of the final layer.
            Confirm against the forward pass that fills it.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            One tensor per layer of shape `(batch_size, num_heads, sequence_length, sequence_length)`.

            Post-softmax attention weights, i.e. the weights of the weighted average computed in the
            self-attention heads.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            Difference between the sequence length and the multimodal rope index.
    """

    # Field order is part of the interface: ModelOutput also supports positional (tuple-style) access.
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    rope_deltas: Optional[torch.LongTensor] = None
83
+
84
class Qwen2_5_VLRotaryEmbedding(nn.Module):
    """
    Rotary position embedding that produces cos/sin tables for three-row position ids.

    `forward` expands `inv_freq` to a leading dimension of 3 and multiplies it with
    `position_ids`, which is therefore indexed as `(3, batch, positions)`.
    """

    def __init__(self, config: Qwen2Config, device=None):
        """
        Args:
            config: model configuration. `rope_scaling` (optional) and
                `max_position_embeddings` are read from it.
            device: forwarded to the RoPE init function that builds `inv_freq`.
        """
        super().__init__()
        # Extra keyword arguments forwarded to `rope_init_fn` when frequencies are recomputed.
        # `_dynamic_frequency_update` reads this attribute; without it any "dynamic" rope type
        # raised AttributeError as soon as a sequence outgrew the cached length.
        self.rope_kwargs = {}

        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            # Same dict object as `config.rope_scaling`: the key normalisation below is
            # therefore also visible on the config.
            self.rope_scaling = config.rope_scaling
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
            # `self` is passed as the object carrying `rope_scaling`.
            rope_config_validation(self, ignore_keys={"mrope_section"})
            self.rope_type = self.rope_scaling["rope_type"]
        else:
            self.rope_type = "default"

        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Kept so that `_dynamic_frequency_update` can restore the unscaled frequencies.
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        """
        Compute the rotary cos/sin tables for `position_ids`.

        Args:
            x: tensor used only for its device and dtype.
            position_ids: position indices, indexed here as `(3, batch, positions)`.

        Returns:
            `(cos, sin)`, both cast to `x.dtype` and multiplied by `attention_scaling`.
        """
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block. In contrast to other models, Qwen2 has different position ids for thw grids
        # So we expand the inv_freq to shape (3, ...)
        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
        position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
148
+
149
def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section=[16, 24, 24], unsqueeze_dim=1):
    """Rotate query and key tensors with the sectioned (multimodal) rotary position embedding.

    See https://qwenlm.github.io/blog/qwen2-vl/ for background.

    The last dimension of `cos` / `sin` is cut into chunks whose sizes are `mrope_section`
    repeated twice. Chunk number `i` is taken from row `i % 3` of the leading dimension, so the
    rows are used in turn (temporal, height and width in the multimodal rope scheme). For text
    tokens the three rows carry the same position index, which makes the result identical to an
    ordinary 1D rotary embedding.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding; indexed along its first
            dimension with `0..2`.
        sin (`torch.Tensor`): The sine part of the rotary embedding; same layout as `cos`.
        mrope_section (`List(int)`):
            Chunk sizes along the channel dimension. The list is repeated twice before splitting,
            so twice its sum has to equal the last dimension of `cos` / `sin`.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension inserted into the re-assembled `cos` / `sin` so that they broadcast against
            `q` and `k`: 1 when `q` / `k` are `[batch_size, heads, seq_len, head_dim]`, 2 when they
            are `[batch_size, seq_len, heads, head_dim]`.

    Returns:
        `tuple(torch.Tensor)`: the rotated query and key tensors.
    """
    # Builds a new list; the (mutable) default argument itself is never modified.
    chunk_sizes = mrope_section * 2

    def _interleave_rows(table):
        # Pick row (i % 3) for the i-th channel chunk and stitch the chunks back together.
        pieces = table.split(chunk_sizes, dim=-1)
        picked = [piece[index % 3] for index, piece in enumerate(pieces)]
        return torch.cat(picked, dim=-1).unsqueeze(unsqueeze_dim)

    cos = _interleave_rows(cos)
    sin = _interleave_rows(sin)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
192
+
193
class Qwen2Attention(nn.Module):
    """
    Eager (matmul + softmax) multi-headed attention with grouped key/value heads.

    Queries and keys are rotated with the multimodal sectioned rotary embedding when
    `config.use_llm_3drope` is set, and with the plain rotary embedding otherwise.
    """

    def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
        """
        Args:
            config: model configuration (hidden size, head counts, dropout, rope settings).
            layer_idx: index of this layer; passed to the cache in `forward`.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.is_causal = True
        self.attention_dropout = config.attention_dropout
        self.rope_scaling = config.rope_scaling

        query_width = self.head_dim * self.num_heads
        if query_width != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        kv_width = self.num_key_value_heads * self.head_dim
        self.q_proj = nn.Linear(self.hidden_size, query_width, bias=True)
        self.k_proj = nn.Linear(self.hidden_size, kv_width, bias=True)
        self.v_proj = nn.Linear(self.hidden_size, kv_width, bias=True)
        self.o_proj = nn.Linear(query_width, self.hidden_size, bias=False)

        self.use_llm_3drope = config.use_llm_3drope
        rotary_cls = Qwen2_5_VLRotaryEmbedding if self.use_llm_3drope else Qwen2RotaryEmbedding
        self.rotary_emb = rotary_cls(config=config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Run eager attention over `hidden_states`.

        Args:
            hidden_states: 3-D input, unpacked as `(batch, q_len, hidden)`.
            attention_mask: optional additive 4-D mask; its last dimension is sliced to the key length.
            position_ids: accepted for interface compatibility; not read here.
            past_key_value: optional cache, updated with this layer's keys and values.
            output_attentions: whether the attention probabilities are returned.
            use_cache: accepted for interface compatibility; not read here.
            cache_position: forwarded to the cache update.
            position_embeddings: `(cos, sin)` pair; must be provided, it is unpacked unconditionally.

        Returns:
            `(attn_output, attn_weights or None, past_key_value)`.

        Raises:
            ValueError: if the attention output does not have the expected shape.
        """
        bsz, q_len, _ = hidden_states.size()
        per_head = (bsz, q_len, -1, self.head_dim)

        projected_q = self.q_proj(hidden_states)
        projected_k = self.k_proj(hidden_states)
        projected_v = self.v_proj(hidden_states)

        # (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim)
        query_states = projected_q.view(per_head).transpose(1, 2)
        key_states = projected_k.view(per_head).transpose(1, 2)
        value_states = projected_v.view(per_head).transpose(1, 2)

        cos, sin = position_embeddings
        if not self.use_llm_3drope:
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
        else:
            query_states, key_states = apply_multimodal_rotary_pos_emb(
                query_states,
                key_states,
                cos,
                sin,
                mrope_section=self.rope_scaling["mrope_section"],
            )

        if past_key_value is not None:
            # Specific to RoPE models
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand the key/value heads so that every query head has a partner.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        scores = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            # The mask may be longer than the keys; only the leading key positions are used.
            scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]

        # float16 workaround: infinite scores are zeroed so that they cannot turn into NaN.
        if query_states.dtype == torch.float16:
            scores = torch.where(torch.isinf(scores), torch.zeros_like(scores), scores)

        # Softmax is evaluated in float32 and cast back to the working dtype.
        probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query_states.dtype)
        probs = nn.functional.dropout(probs, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(probs, value_states)

        expected_size = (bsz, self.num_heads, q_len, self.head_dim)
        if attn_output.size() != expected_size:
            raise ValueError(
                f"`attn_output` should be of size {expected_size}, but is"
                f" {attn_output.size()}"
            )

        merged = attn_output.transpose(1, 2).contiguous().reshape(bsz, q_len, -1)
        attn_output = self.o_proj(merged)

        attn_weights = probs if output_attentions else None
        return attn_output, attn_weights, past_key_value
304
+
305
class Qwen2FlashAttention2(Qwen2Attention):
    """
    Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
    as the weights of the module stays untouched. The only required change would be on the forward pass
    where it needs to correctly call the public API of flash attention and deal with padding tokens
    in case the input contains any of them. Additionally, sliding window attention is only applied to layers
    whose `layer_idx` is at least `config.max_window_layers` (see `forward`).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ):
        """
        Run attention through `_flash_attention_forward`.

        Args:
            hidden_states: 3-D input, unpacked as `(batch, q_len, hidden)`.
            attention_mask: forwarded unchanged to `_flash_attention_forward`.
            position_ids: accepted for interface compatibility; not read here.
            past_key_value: optional cache, updated with this layer's keys and values.
            output_attentions: accepted for interface compatibility. This path only obtains
                `attn_output` from `_flash_attention_forward`, so no attention weights exist to return.
            use_cache: accepted for interface compatibility; not read here.
            cache_position: forwarded to the cache update.
            position_embeddings: `(cos, sin)` pair; must be provided, it is unpacked unconditionally.

        Returns:
            `(attn_output, None, past_key_value)`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        if self.use_llm_3drope:
            query_states, key_states = apply_multimodal_rotary_pos_emb(
                query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
            )
        else:
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reshape to the layout expected by Flash Attention: (batch, seq, heads, head_dim)
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        if (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            sliding_window = self.config.sliding_window
        else:
            sliding_window = None

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            sliding_window=sliding_window,
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        # Always None: previously `attn_weights` was only assigned when `output_attentions`
        # was False, so `output_attentions=True` raised UnboundLocalError at the return below.
        attn_weights = None

        return attn_output, attn_weights, past_key_value
416
+
417
class Qwen2SdpaAttention(Qwen2Attention):
    """
    Qwen2 attention backed by `torch.nn.functional.scaled_dot_product_attention`.

    The parameters are those of `Qwen2Attention`; only the forward pass differs, and it falls
    back to the parent implementation whenever attention weights are requested.
    """

    # Adapted from Qwen2Attention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Run SDPA attention over `hidden_states`.

        Args:
            hidden_states: 3-D input, unpacked as `(batch, q_len, hidden)`.
            attention_mask: optional 4-D mask; its last dimension is sliced to the key length.
            position_ids: only forwarded to the eager fallback.
            past_key_value: optional cache, updated with this layer's keys and values.
            output_attentions: when true, the eager parent implementation is used instead.
            use_cache: only forwarded to the eager fallback.
            cache_position: forwarded to the cache update.
            position_embeddings: `(cos, sin)` pair; must be provided, it is unpacked unconditionally.

        Returns:
            `(attn_output, None, past_key_value)` on the SDPA path, or whatever the parent
            `forward` returns on the fallback path.
        """
        if output_attentions:
            logger.warning_once(
                "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        bsz, q_len, _ = hidden_states.size()
        per_head = (bsz, q_len, -1, self.head_dim)

        projected_q = self.q_proj(hidden_states)
        projected_k = self.k_proj(hidden_states)
        projected_v = self.v_proj(hidden_states)

        # (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim)
        query_states = projected_q.view(per_head).transpose(1, 2)
        key_states = projected_k.view(per_head).transpose(1, 2)
        value_states = projected_v.view(per_head).transpose(1, 2)

        cos, sin = position_embeddings
        if not self.use_llm_3drope:
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
        else:
            query_states, key_states = apply_multimodal_rotary_pos_emb(
                query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
            )

        if past_key_value is not None:
            # Specific to RoPE models
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # No matter how long the mask is, only the leading key positions are used.
        if attention_mask is None:
            causal_mask = None
        else:
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if attention_mask is not None and query_states.device.type == "cuda":
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # `is_causal` is computed as its own statement rather than inline in the SDPA call so that
        # torch.compile's dynamic shapes and full graph options keep working.
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        is_causal = causal_mask is None and q_len > 1

        dropout_p = self.attention_dropout if self.training else 0.0
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=dropout_p,
            is_causal=is_causal,
        )

        merged = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(merged)

        return attn_output, None, past_key_value
508
+
509
# Lookup table from attention-implementation name to attention module class.
QWEN2_5_ATTENTION_CLASSES = dict(
    eager=Qwen2Attention,
    flash_attention_2=Qwen2FlashAttention2,
    sdpa=Qwen2SdpaAttention,
)
514
+
515
class Qwen2DecoderLayer(nn.Module):
    """One decoder block: RMSNorm -> self-attention -> residual add, then RMSNorm -> MLP -> residual add."""

    def __init__(self, config: Qwen2Config, layer_idx: int):
        """
        Args:
            config: model configuration; `_attn_implementation` selects the attention class.
            layer_idx: index of this layer, passed on to the attention module.
        """
        super().__init__()
        self.hidden_size = config.hidden_size

        attn_impl = config._attn_implementation
        if config.use_sliding_window and attn_impl != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )
        attention_cls = QWEN2_5_ATTENTION_CLASSES[attn_impl]
        self.self_attn = attention_cls(config, layer_idx)

        self.mlp = Qwen2MLP(config)
        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Apply the decoder block to `hidden_states`.

        Args:
            hidden_states (`torch.FloatTensor`): layer input of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask, handed to the attention
                module unchanged.
            position_ids (`torch.LongTensor`, *optional*): handed to the attention module unchanged.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value
                projection states.
            output_attentions (`bool`, *optional*):
                When true, the attention weights are appended to the returned tuple.
            use_cache (`bool`, *optional*):
                When true, the key/value cache returned by the attention module is appended to the
                returned tuple.
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Cosine and sine positional embeddings, handed to the attention module unchanged.
            kwargs (`dict`, *optional*):
                Ignored; accepted for FSDP and other methods that inject arguments into the model.

        Returns:
            Tuple starting with the new hidden states, followed by the attention weights (if
            `output_attentions`) and the key/value cache (if `use_cache`).
        """
        # Attention sub-block with a pre-norm residual connection.
        attn_input = self.input_layernorm(hidden_states)
        attn_out, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        after_attention = hidden_states + attn_out

        # Feed-forward sub-block with a pre-norm residual connection.
        mlp_out = self.mlp(self.post_attention_layernorm(after_attention))
        block_output = after_attention + mlp_out

        collected = [block_output]
        if output_attentions:
            collected.append(self_attn_weights)
        if use_cache:
            collected.append(present_key_value)

        return tuple(collected)
597
+
598
+ @add_start_docstrings(
599
+ "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
600
+ QWEN2_START_DOCSTRING,
601
+ )
602
+ class Qwen2Model(Qwen2PreTrainedModel):
603
def __init__(self, config: Qwen2Config):
    """
    Build the token embedding, the decoder layer stack, the final norm and the rotary embedding.

    Args:
        config: model configuration; `use_llm_3drope` selects the rotary embedding class.
    """
    super().__init__(config)
    self.padding_idx = config.pad_token_id
    self.vocab_size = config.vocab_size

    self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
    decoder_layers = [Qwen2DecoderLayer(config, index) for index in range(config.num_hidden_layers)]
    self.layers = nn.ModuleList(decoder_layers)
    self._attn_implementation = config._attn_implementation
    self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    self.use_llm_3drope = config.use_llm_3drope
    rotary_cls = Qwen2_5_VLRotaryEmbedding if self.use_llm_3drope else Qwen2RotaryEmbedding
    self.rotary_emb = rotary_cls(config=config)

    self.gradient_checkpointing = False
    # Initialize weights and apply final processing
    self.post_init()
624
+
625
def get_input_embeddings(self):
    """Return the token embedding module stored in `embed_tokens`."""
    token_embedding = self.embed_tokens
    return token_embedding
627
+
628
def set_input_embeddings(self, value):
    """Replace the token embedding module stored in `embed_tokens` with `value`."""
    setattr(self, "embed_tokens", value)
630
+
631
def prompt_wrap(self, input_ids, query_embeds_visual=None, query_embeds_audio=None, target_embeds=None):
    """
    Embed `input_ids` and overwrite placeholder positions with externally computed embeddings.

    Positions equal to `config.image_patch_token`, `config.audio_patch_token` and
    `config.gen_image_patch_token` receive the rows of `query_embeds_visual`,
    `query_embeds_audio` and `target_embeds` respectively. Each set of embeddings is first
    flattened to 2-D, keeping its last dimension.

    Args:
        input_ids: token ids to embed with `embed_tokens`.
        query_embeds_visual: optional embeddings for the image placeholder positions.
        query_embeds_audio: optional embeddings for the audio placeholder positions.
        target_embeds: optional embeddings for the generated-image placeholder positions.

    Returns:
        The input embeddings with the placeholder positions replaced. If writing the visual
        embeddings fails, the image positions are zero-filled and the embeddings are returned
        immediately, before the audio and target embeddings are applied.
    """
    inputs_embeds = self.embed_tokens(input_ids)
    if query_embeds_visual is None and query_embeds_audio is None and target_embeds is None:
        return inputs_embeds

    if query_embeds_visual is not None:
        inputs_embeds = inputs_embeds.to(dtype=query_embeds_visual.dtype, device=query_embeds_visual.device)
        image_mask = input_ids == self.config.image_patch_token
        query_embeds_visual = query_embeds_visual.view(-1, query_embeds_visual.shape[-1])
        try:
            inputs_embeds[image_mask] = query_embeds_visual
        except Exception as e:
            # Best-effort fallback kept from the original code, but no longer silent:
            # the failure is logged so that zero-filled image positions can be traced.
            logger.warning(
                f"Could not write visual embeddings into the image placeholder positions ({e}); "
                "filling these positions with zeros instead."
            )
            temp_embeds = torch.zeros_like(inputs_embeds[image_mask]).to(
                dtype=inputs_embeds.dtype, device=inputs_embeds.device
            )
            inputs_embeds[image_mask] = temp_embeds
            # NOTE(review): the nesting of this early return could not be recovered from the
            # flattened source; it is assumed to belong to the failure path. Please confirm.
            return inputs_embeds

    if query_embeds_audio is not None:
        inputs_embeds = inputs_embeds.to(dtype=query_embeds_audio.dtype, device=query_embeds_audio.device)
        audio_mask = input_ids == self.config.audio_patch_token
        query_embeds_audio = query_embeds_audio.view(-1, query_embeds_audio.shape[-1])
        inputs_embeds[audio_mask] = query_embeds_audio

    if target_embeds is not None:
        inputs_embeds = inputs_embeds.to(dtype=target_embeds.dtype, device=target_embeds.device)
        target_mask = input_ids == self.config.gen_image_patch_token
        target_embeds = target_embeds.view(-1, target_embeds.shape[-1])
        inputs_embeds[target_mask] = target_embeds

    return inputs_embeds
661
+
662
def prompt_wrap_vision(self, input_ids, inputs_embeds, vision_embeds, image_token_id=None):
    """Scatter vision features into the placeholder positions of ``inputs_embeds``.

    Args:
        input_ids: Token ids used to locate placeholder positions.
        inputs_embeds: Embeddings to fill; returned unchanged when ``input_ids`` or
            ``vision_embeds`` is ``None``.
        vision_embeds: Feature rows. A 3-D tensor is flattened to ``(-1, last_dim)``.
        image_token_id: Placeholder id to look for; defaults to ``config.image_patch_token``.

    Returns:
        A new tensor (``masked_scatter`` result) with placeholder positions replaced.

    Raises:
        ValueError: If the number of placeholder tokens differs from the number of feature rows.
    """
    if input_ids is None or vision_embeds is None:
        return inputs_embeds

    if vision_embeds.dim() == 3:
        vision_embeds = vision_embeds.reshape(-1, vision_embeds.shape[-1])

    # NOTE: the resolved placeholder id is stored on the shared config, as the code always did.
    if image_token_id is None:
        image_token_id = self.config.image_patch_token
    self.config.image_token_id = image_token_id

    is_placeholder = input_ids == self.config.image_token_id
    n_image_tokens = is_placeholder.sum().item()
    n_image_features = vision_embeds.shape[0]
    if n_image_tokens != n_image_features:
        raise ValueError(
            f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
        )

    # Broadcast the per-token mask over the hidden dimension so masked_scatter fills whole rows.
    scatter_mask = is_placeholder.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
    features = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    return inputs_embeds.masked_scatter(scatter_mask, features)
688
+
689
def prompt_wrap_audio(self, input_ids, inputs_embeds, audio_embeds, audio_token_id=None):
    """Scatter audio features into the placeholder positions of ``inputs_embeds``.

    Args:
        input_ids: Token ids used to locate placeholder positions.
        inputs_embeds: Embeddings to fill; returned unchanged when ``input_ids`` or
            ``audio_embeds`` is ``None``.
        audio_embeds: Feature rows. A 3-D tensor is flattened to ``(-1, last_dim)``.
        audio_token_id: Placeholder id to look for; defaults to ``config.audio_patch_token``.

    Returns:
        A new tensor (``masked_scatter`` result) with placeholder positions replaced.

    Raises:
        ValueError: If the number of placeholder tokens differs from the number of feature rows.
    """
    if input_ids is None or audio_embeds is None:
        return inputs_embeds

    if audio_embeds.dim() == 3:
        audio_embeds = audio_embeds.reshape(-1, audio_embeds.shape[-1])

    # NOTE: the resolved placeholder id is stored on the shared config, as the code always did.
    if audio_token_id is None:
        audio_token_id = self.config.audio_patch_token
    self.config.audio_token_id = audio_token_id

    is_placeholder = input_ids == self.config.audio_token_id
    n_audio_tokens = is_placeholder.sum().item()
    n_audio_features = audio_embeds.shape[0]
    if n_audio_tokens != n_audio_features:
        raise ValueError(
            f"Audio features and audio tokens do not match: tokens: {n_audio_tokens}, features {n_audio_features}"
        )

    # Broadcast the per-token mask over the hidden dimension so masked_scatter fills whole rows.
    scatter_mask = is_placeholder.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
    features = audio_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    return inputs_embeds.masked_scatter(scatter_mask, features)
710
+
711
def prompt_wrap_navit(self, input_ids, query_embeds_image=None, query_embeds_video=None, query_embeds_audio=None,
                      target_embeds=None):
    """Embed ``input_ids`` and inject image, video and audio features, in that order.

    Image and video features both go through ``prompt_wrap_vision`` with its default
    placeholder id; audio features go through ``prompt_wrap_audio``.
    ``target_embeds`` is accepted but not used by this method.

    Returns:
        The token embeddings with every provided modality injected.
    """
    inputs_embeds = self.embed_tokens(input_ids)
    injections = (
        (self.prompt_wrap_vision, query_embeds_image),
        (self.prompt_wrap_vision, query_embeds_video),
        (self.prompt_wrap_audio, query_embeds_audio),
    )
    for inject, features in injections:
        if features is not None:
            inputs_embeds = inject(input_ids, inputs_embeds, features)
    return inputs_embeds
723
+
724
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    query_embeds_image: Optional[torch.Tensor] = None,
    query_embeds_video: Optional[torch.Tensor] = None,
    query_embeds_audio: Optional[torch.Tensor] = None,
    target_embeds: Optional[torch.Tensor] = None,
    img_gen_embeds: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    image_grid_thw: Optional[torch.Tensor] = None,
    image_grid_thw_video: Optional[torch.Tensor] = None,
    cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    """Run the decoder stack on token ids (optionally mixed with modality features) or on embeddings.

    Exactly one of ``input_ids`` / ``inputs_embeds`` must be given. When ``input_ids`` is given:

    * with no modality features, or for a single-token step, only the token embedding is used;
    * with features but no grid tensors, ``prompt_wrap`` injects visual/audio/target features;
    * with ``image_grid_thw`` or ``image_grid_thw_video``, ``prompt_wrap_navit`` is used instead.

    Ids are clamped into the embedding-table range before lookup. If ``img_gen_embeds`` is given,
    it overwrites the last ``img_gen_embeds.shape[1]`` positions of the embeddings in place.

    Returns:
        ``BaseModelOutputWithPast`` when ``return_dict`` resolves to true, otherwise a tuple of the
        non-``None`` values among (last hidden state, cache, all hidden states, all attentions).

    Raises:
        ValueError: If both or neither of ``input_ids`` and ``inputs_embeds`` are provided.
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if (input_ids is None) ^ (inputs_embeds is not None):
        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

    if self.gradient_checkpointing and self.training:
        if use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

    if inputs_embeds is None:
        # Clamp once; every branch below looks up the same clamped ids.
        safe_ids = input_ids.clip(0, self.embed_tokens.weight.shape[0] - 1)
        text_only = (
            query_embeds_image is None
            and query_embeds_video is None
            and query_embeds_audio is None
            and target_embeds is None
        )
        if text_only or input_ids.size(1) == 1:
            inputs_embeds = self.embed_tokens(safe_ids)
        elif image_grid_thw is None and image_grid_thw_video is None:
            inputs_embeds = self.prompt_wrap(
                safe_ids, query_embeds_image, query_embeds_audio, target_embeds  # noqa
            )
        else:
            inputs_embeds = self.prompt_wrap_navit(
                safe_ids, query_embeds_image, query_embeds_video, query_embeds_audio, target_embeds
            )

    if img_gen_embeds is not None:
        gen_length = img_gen_embeds.shape[1]
        # `x[:, -0:]` would select the WHOLE sequence, so an empty tensor must be skipped explicitly.
        if gen_length > 0:
            inputs_embeds[:, -gen_length:] = img_gen_embeds

    if cache_position is None:
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        cache_position = torch.arange(
            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
        )

    if self.use_llm_3drope:
        # the hard coded `3` is for temporal, height and width.
        if position_ids is None:
            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
        elif position_ids.dim() == 2:
            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
    else:
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

    causal_mask = self._update_causal_mask(
        attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
    )

    hidden_states = inputs_embeds

    # create position embeddings to be shared across the decoder layers
    position_embeddings = self.rotary_emb(hidden_states, position_ids)

    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = None

    for decoder_layer in self.layers:
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if self.gradient_checkpointing and self.training:
            layer_outputs = self._gradient_checkpointing_func(
                decoder_layer.__call__,
                hidden_states,
                causal_mask,
                position_ids,
                past_key_values,
                output_attentions,
                use_cache,
                cache_position,
                position_embeddings,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        hidden_states = layer_outputs[0]

        if use_cache:
            # The cache follows the attention weights in the layer output when those are returned.
            next_decoder_cache = layer_outputs[2 if output_attentions else 1]

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None

    if not return_dict:
        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
    )
869
+
870
def _update_causal_mask(
    self,
    attention_mask: torch.Tensor,
    input_tensor: torch.Tensor,
    cache_position: torch.Tensor,
    past_key_values: Cache,
    output_attentions: bool,
):
    """Pick the attention mask to hand to the decoder layers for the configured attention backend.

    * ``flash_attention_2``: returns the 2D mask when it contains a zero, otherwise ``None``;
      a warning is logged once when the last mask column is not all ones while a cache is in use.
    * ``sdpa`` without a static / sliding-window cache and without ``output_attentions``: returns
      ``None`` when ``AttentionMaskConverter._ignore_causal_mask_sdpa`` says the mask is unnecessary.
    * otherwise: returns the 4D mask built by
      ``_prepare_4d_causal_attention_mask_with_cache_position``.
    """
    attn_impl = self.config._attn_implementation

    if attn_impl == "flash_attention_2":
        if attention_mask is not None and past_key_values is not None:
            is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
            if is_padding_right:
                logger.warning_once(
                    "You are attempting to perform batched generation with padding_side='right'"
                    " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to "
                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                )
        if attention_mask is not None and 0.0 in attention_mask:
            return attention_mask
        return None

    # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
    # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
    # to infer the attention mask.
    past_seen_tokens = 0 if past_key_values is None else past_key_values.get_seq_length()
    using_static_cache = isinstance(past_key_values, StaticCache)
    using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
    fixed_size_cache = using_static_cache or using_sliding_window_cache

    # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
    if attn_impl == "sdpa" and not fixed_size_cache and not output_attentions:
        if AttentionMaskConverter._ignore_causal_mask_sdpa(
            attention_mask,
            inputs_embeds=input_tensor,
            past_key_values_length=past_seen_tokens,
            sliding_window=self.config.sliding_window,
            is_training=self.training,
        ):
            return None

    dtype, device = input_tensor.dtype, input_tensor.device
    min_dtype = torch.finfo(dtype).min
    sequence_length = input_tensor.shape[1]

    if fixed_size_cache:
        # SlidingWindowCache or StaticCache: the mask spans the cache's maximum length.
        target_length = past_key_values.get_max_length()
    elif isinstance(attention_mask, torch.Tensor):
        # DynamicCache or no cache, with an explicit mask.
        target_length = attention_mask.shape[-1]
    else:
        target_length = past_seen_tokens + sequence_length + 1

    # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
    causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask,
        sequence_length=sequence_length,
        target_length=target_length,
        dtype=dtype,
        device=device,
        min_dtype=min_dtype,
        cache_position=cache_position,
        batch_size=input_tensor.shape[0],
    )

    needs_unmask = (
        attn_impl == "sdpa"
        and attention_mask is not None
        and attention_mask.device.type in ["cuda", "xpu"]
        and not output_attentions
    )
    if needs_unmask:
        # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
        # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        # Details: https://github.com/pytorch/pytorch/issues/110213
        causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

    return causal_mask
952
+
953
+ @staticmethod
954
+ def _prepare_4d_causal_attention_mask_with_cache_position(
955
+ attention_mask: torch.Tensor,
956
+ sequence_length: int,
957
+ target_length: int,
958
+ dtype: torch.dtype,
959
+ device: torch.device,
960
+ min_dtype: float,
961
+ cache_position: torch.Tensor,
962
+ batch_size: int,
963
+ ):
964
+ """
965
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
966
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
967
+
968
+ Args:
969
+ attention_mask (`torch.Tensor`):
970
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
971
+ sequence_length (`int`):
972
+ The sequence length being processed.
973
+ target_length (`int`):
974
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
975
+ dtype (`torch.dtype`):
976
+ The dtype to use for the 4D attention mask.
977
+ device (`torch.device`):
978
+ The device to plcae the 4D attention mask on.
979
+ min_dtype (`float`):
980
+ The minimum value representable with the dtype `dtype`.
981
+ cache_position (`torch.Tensor`):
982
+ Indices depicting the position of the input sequence tokens in the sequence.
983
+ batch_size (`torch.Tensor`):
984
+ Batch size.
985
+ """
986
+ if attention_mask is not None and attention_mask.dim() == 4:
987
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
988
+ causal_mask = attention_mask
989
+ else:
990
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
991
+ if sequence_length != 1:
992
+ causal_mask = torch.triu(causal_mask, diagonal=1)
993
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
994
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
995
+ if attention_mask is not None:
996
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
997
+ mask_length = attention_mask.shape[-1]
998
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
999
+ padding_mask = padding_mask == 0
1000
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1001
+ padding_mask, min_dtype
1002
+ )
1003
+
1004
+ return causal_mask
1005
+
1006
+ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1007
+ _tied_weights_keys = ["lm_head.weight"]
1008
+
1009
def __init__(self, config: Qwen2Config):
    """Create the decoder (``Qwen2Model``) and the untied-shape LM head.

    Args:
        config: Model configuration. Must provide ``use_llm_3drope``, ``audio_vocab_size`` and
            ``audio_id_shift`` in addition to the usual size fields. When ``use_llm_3drope`` is
            true, ``config.rope_scaling`` is overwritten with the mrope settings below before
            the decoder is built.
    """
    super().__init__(config)
    self.config = config

    # Plain attributes mirrored from the config.
    self.vocab_size = config.vocab_size
    self.audio_vocab_size = config.audio_vocab_size
    self.audio_id_shift = config.audio_id_shift
    self.use_llm_3drope = config.use_llm_3drope

    if self.use_llm_3drope:
        self.config.rope_scaling = {"type": "mrope", "mrope_section": [16, 24, 24]}

    self.model = Qwen2Model(self.config)
    self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    # Initialize weights and apply final processing
    self.post_init()
1026
+
1027
def get_input_embeddings(self):
    """Return the decoder's token-embedding module (``self.model.embed_tokens``)."""
    decoder = self.model
    return decoder.embed_tokens
1029
+
1030
def set_input_embeddings(self, value):
    """Replace the decoder's token-embedding module (``self.model.embed_tokens``) with ``value``."""
    decoder = self.model
    decoder.embed_tokens = value
1032
+
1033
def get_output_embeddings(self):
    """Return the output projection stored in ``self.lm_head``."""
    output_head = self.lm_head
    return output_head
1035
+
1036
def set_output_embeddings(self, new_embeddings):
    """Replace the output projection (``self.lm_head``) with ``new_embeddings``."""
    output_head = new_embeddings
    self.lm_head = output_head
1038
+
1039
def set_decoder(self, decoder):
    """Replace the wrapped decoder (``self.model``) with ``decoder``."""
    new_decoder = decoder
    self.model = new_decoder
1041
+
1042
def get_decoder(self):
    """Return the wrapped decoder stored in ``self.model``."""
    decoder = self.model
    return decoder
1044
+
1045
def audio_decoder_sample(self, logits, topk=10, filter_value=-float("Inf")):
    """Sample one audio token per row with top-k filtering.

    Args:
        logits: Tensor of shape ``(batch, config.audio_vocab_size)``. It is NOT modified;
            filtering happens on a copy so callers that pass a view of their full logits
            keep the original values.
        topk: Number of highest-scoring entries kept per row. Values larger than the
            vocabulary size are clamped to the vocabulary size.
        filter_value: Value written over the entries that are filtered out.

    Returns:
        ``torch.LongTensor`` of shape ``(batch, 1)`` holding the sampled index for each row
        (relative to the audio vocabulary, i.e. without any id shift applied).
    """
    assert logits.dim() == 2 and logits.size(1) == self.config.audio_vocab_size
    k = min(topk, logits.size(-1))
    # The k-th largest value of each row is the cut-off; everything strictly below it is dropped.
    kth_value = torch.topk(logits, k)[0][..., -1, None]
    filtered = logits.masked_fill(logits < kth_value, filter_value)
    token_id = torch.multinomial(torch.softmax(filtered, dim=-1), num_samples=1)
    return token_id
1057
+
1058
def get_rope_index(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    second_per_grid_ts: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Compute 3-axis (temporal, height, width) rotary position ids for a batch of token ids.

    Text tokens receive the same running index on all three axes. Each image/video span
    (found via `config.image_patch_token` / `config.video_patch_token`) receives a
    `(t, h // spatial_merge_size, w // spatial_merge_size)` grid of positions, offset by the
    position reached so far; text after a span resumes at the largest position used so far plus 1.

    Example (one image with grid `(1, 4, 4)`, `spatial_merge_size=2`, i.e. 4 vision tokens):
        input_ids:          [T, T, V, V, V, V, T]
        temporal positions: [0, 1, 2, 2, 2, 2, 4]
        height positions:   [0, 1, 2, 2, 3, 3, 4]
        width positions:    [0, 1, 2, 3, 2, 3, 4]

    Temporal index of a vision token: the frame index when `second_per_grid_ts` is `None`;
    otherwise `frame_index * second_per_grid_t * config.tokens_per_second` truncated to an
    integer, where `second_per_grid_t` is `0` for images and `second_per_grid_ts[video]` for videos.

    Without any grid tensor (or without `input_ids`) positions are plain text positions:
    derived from the cumulative sum of `attention_mask` when it is given (masked positions are
    set to 1), else `0..seq_len-1`.

    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Token ids. Positions where `attention_mask` is 0 are skipped.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            Temporal, height and width grid size of each image, consumed in order of appearance.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            Temporal, height and width grid size of each video, consumed in order of appearance.
        second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
            Seconds covered by one temporal grid step of each video.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            1 for tokens that are kept, 0 for tokens that are masked. Defaults to all ones.

    Returns:
        position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
        mrope_position_deltas (`torch.Tensor` of shape `(batch_size, 1)`)
    """
    cfg = self.config
    spatial_merge_size = cfg.spatial_merge_size
    image_token_id = cfg.image_patch_token
    video_token_id = cfg.video_patch_token
    image_start_token_id = cfg.image_start_token
    video_start_token_id = cfg.video_start_token
    use_abs_time_pos = second_per_grid_ts is not None

    has_vision_grid = image_grid_thw is not None or video_grid_thw is not None
    if input_ids is None or not has_vision_grid:
        # Text-only positions.
        if attention_mask is None:
            batch, seq_len = input_ids.shape[0], input_ids.shape[1]
            position_ids = torch.arange(seq_len, device=input_ids.device).view(1, 1, -1).expand(3, batch, -1)
            mrope_position_deltas = torch.zeros(
                [batch, 1],
                device=input_ids.device,
                dtype=input_ids.dtype,
            )
            return position_ids, mrope_position_deltas

        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
        max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
        mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
        return position_ids, mrope_position_deltas

    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)
    position_ids = torch.ones(
        3,
        input_ids.shape[0],
        input_ids.shape[1],
        dtype=input_ids.dtype,
        device=input_ids.device,
    )
    attention_mask = attention_mask.to(input_ids.device)

    mrope_position_deltas = []
    # Grids are consumed across the whole batch, so these counters are NOT reset per row.
    image_index, video_index = 0, 0
    for row, full_row in enumerate(input_ids):
        keep = attention_mask[row] == 1
        row_ids = full_row[keep]

        # Count the spans in this row: a start token immediately followed by a patch token.
        image_nums, video_nums = 0, 0
        if image_grid_thw is not None:
            starts = torch.argwhere(row_ids == image_start_token_id).squeeze(1)
            image_nums = (row_ids[starts + 1] == image_token_id).sum()
        if video_grid_thw is not None:
            starts = torch.argwhere(row_ids == video_start_token_id).squeeze(1)
            video_nums = (row_ids[starts + 1] == video_token_id).sum()

        input_tokens = row_ids.tolist()
        llm_pos_ids_list: list = []
        st = 0
        remain_images, remain_videos = image_nums, video_nums
        for _ in range(image_nums + video_nums):
            # Locate the next image and the next video span; the nearer one is handled first.
            ed_image = len(input_tokens) + 1
            if image_token_id in input_tokens and remain_images > 0:
                ed_image = input_tokens.index(image_token_id, st)
            ed_video = len(input_tokens) + 1
            if video_token_id in input_tokens and remain_videos > 0:
                ed_video = input_tokens.index(video_token_id, st)

            if ed_image < ed_video:
                grid = image_grid_thw[image_index]
                second_per_grid_t = 0
                image_index += 1
                remain_images -= 1
                ed = ed_image
            else:
                grid = video_grid_thw[video_index]
                second_per_grid_t = 1.0 if second_per_grid_ts is None else second_per_grid_ts[video_index]
                video_index += 1
                remain_videos -= 1
                ed = ed_video

            t, h, w = grid[0], grid[1], grid[2]
            llm_grid_t = t.item()
            llm_grid_h = h.item() // spatial_merge_size
            llm_grid_w = w.item() // spatial_merge_size

            # Text preceding the span.
            text_len = ed - st
            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

            # The span itself.
            frame_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w)
            if use_abs_time_pos:
                time_tensor = frame_index * second_per_grid_t * self.config.tokens_per_second
                t_index = time_tensor.long().flatten()
            else:
                t_index = frame_index.long().flatten()
            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
            llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
            st = ed + llm_grid_t * llm_grid_h * llm_grid_w

        # Trailing text after the last span.
        if st < len(input_tokens):
            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
            text_len = len(input_tokens) - st
            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
        position_ids[..., row, keep] = llm_positions.to(position_ids.device)
        mrope_position_deltas.append(llm_positions.max() + 1 - len(input_ids[row]))

    mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
    return position_ids, mrope_position_deltas
1241
+
1242
def _update_model_kwargs_for_generation(
    self,
    outputs: ModelOutput,
    model_kwargs: Dict[str, Any],
    is_encoder_decoder: bool = False,
    num_new_tokens: int = 1,
) -> Dict[str, Any]:
    """Extend the parent update by carrying ``rope_deltas`` from the step output into the next step.

    Returns:
        The kwargs dict produced by the parent implementation, with ``"rope_deltas"`` set to
        ``outputs.rope_deltas`` whenever that attribute exists and is not ``None``.
    """
    updated_kwargs = super()._update_model_kwargs_for_generation(
        outputs=outputs,
        model_kwargs=model_kwargs,
        is_encoder_decoder=is_encoder_decoder,
        num_new_tokens=num_new_tokens,
    )

    step_deltas = getattr(outputs, "rope_deltas", None)
    if step_deltas is not None:
        updated_kwargs["rope_deltas"] = step_deltas

    return updated_kwargs
1260
+
1261
@add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Bailing2CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    query_embeds_image: Optional[torch.Tensor] = None,
    query_embeds_video: Optional[torch.Tensor] = None,
    query_embeds_audio: Optional[torch.Tensor] = None,
    target_embeds: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    img_gen_embeds: Optional[torch.Tensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    reduction: Optional[str] = "mean",
    weights=None,
    is_pretrain=False,
    image_grid_thw: Optional[torch.LongTensor] = None,
    image_grid_thw_video: Optional[torch.LongTensor] = None,
    cache_position: Optional[torch.LongTensor] = None,
    rope_deltas: Optional[torch.LongTensor] = None,
    second_per_grid_ts: Optional[torch.Tensor] = None,
    is_audio_generation_mode=False,
    no_image_end_prediction=False,
) -> Union[Tuple, Bailing2CausalLMOutputWithPast]:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Not supported by this forward pass: no loss is computed and the value must be `None`.
        query_embeds_image, query_embeds_video, query_embeds_audio, target_embeds, img_gen_embeds:
            Precomputed features forwarded unchanged to the decoder.
        image_grid_thw, image_grid_thw_video (`torch.LongTensor`, *optional*):
            Grid sizes of images / videos. Forwarded to the decoder and, when `use_llm_3drope` is
            enabled and `position_ids` is not given, used to compute the 3-axis position ids.
        second_per_grid_ts (`torch.Tensor`, *optional*):
            Forwarded to `get_rope_index` together with the grids.
        rope_deltas (`torch.LongTensor`, *optional*):
            Returned unchanged in the output object.
        is_audio_generation_mode (`bool`, *optional*, defaults to `False`):
            When `True`, for every row whose arg-max next token lies in the audio range
            (`>= audio_id_shift`), one audio token is sampled with `audio_decoder_sample` and its
            last-step logit is set to a very large value so that it is selected.
        reduction, weights, is_pretrain, no_image_end_prediction:
            Accepted for interface compatibility; not read by this method.

    Returns:

    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if self.use_llm_3drope and position_ids is None and input_ids is not None:
        # `get_rope_index` takes (input_ids, image_grid_thw, video_grid_thw, second_per_grid_ts,
        # attention_mask). The mask used to be passed positionally in the `second_per_grid_ts` slot,
        # so it was ignored as a mask and misread as per-video timing. Pass everything by keyword.
        rope_mask = attention_mask
        if rope_mask is not None and rope_mask.shape != input_ids.shape:
            # A mask that does not line up with `input_ids` (e.g. it also covers cached tokens)
            # cannot select tokens row by row; treat all tokens as valid, as the old call did.
            rope_mask = None
        position_ids, _ = self.get_rope_index(
            input_ids,
            image_grid_thw=image_grid_thw,
            video_grid_thw=image_grid_thw_video,
            second_per_grid_ts=second_per_grid_ts,
            attention_mask=rope_mask,
        )

    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        query_embeds_image=query_embeds_image,
        query_embeds_video=query_embeds_video,
        query_embeds_audio=query_embeds_audio,
        target_embeds=target_embeds,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        img_gen_embeds=img_gen_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        image_grid_thw=image_grid_thw,
        image_grid_thw_video=image_grid_thw_video,
        cache_position=cache_position,
    )

    hidden_states = outputs[0]
    logits = self.lm_head(hidden_states)

    if is_audio_generation_mode is True:
        # Rows whose greedy next token is an audio token get a sampled audio token forced instead.
        need_replace = torch.argmax(logits[:, -1, :], -1) >= self.audio_id_shift
        next_audio_token_logits_for_generation = logits[:, -1, self.audio_id_shift:]
        next_audio_token_for_generation = (
            self.audio_decoder_sample(next_audio_token_logits_for_generation) + self.audio_id_shift
        ).view(-1)
        batch_rows = torch.arange(logits.size(0), device=logits.device)
        logits[batch_rows[need_replace], -1, next_audio_token_for_generation[need_replace]] = 99999

    # Inference only: no loss is ever computed here.
    loss = None
    assert labels is None

    if not return_dict:
        return (logits,) + outputs[1:]

    return Bailing2CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        rope_deltas=rope_deltas,
        last_hidden_state=outputs.last_hidden_state,
    )
1391
+
1392
def prepare_inputs_for_generation(
    self,
    input_ids,
    query_embeds_image=None,
    query_embeds_video=None,
    query_embeds_audio=None,
    past_key_values=None,
    attention_mask=None,
    inputs_embeds=None,
    cache_position=None,
    position_ids=None,
    use_cache=True,
    image_grid_thw=None,
    image_grid_thw_video=None,
    second_per_grid_ts=None,
    is_audio_generation_mode=False,
    **kwargs,
):
    """Assemble the keyword arguments for one decoding step.

    Overwritten -- in specific circumstances we don't want to forward the
    image/video/audio query embeddings to the model.

    Args:
        input_ids: Token ids; sliced to the unprocessed positions when a cache is present.
        query_embeds_image / query_embeds_video / query_embeds_audio: Modality embeddings,
            forwarded only on the first step (``cache_position`` missing or starting at 0).
        past_key_values: Cache object, or ``None`` when no cache is used.
        attention_mask: Attention mask; also used to derive ``position_ids`` when they are not given.
        inputs_embeds: Optional precomputed embeddings, used only when their length matches
            ``cache_position`` (or when ``cache_position`` is not provided).
        cache_position: Indices of the positions being processed in this step. ``None`` is
            treated as "first step".
        position_ids: Optional precomputed position ids.
        use_cache, image_grid_thw, image_grid_thw_video, second_per_grid_ts,
        is_audio_generation_mode: Forwarded unchanged.
        **kwargs: Only ``rope_deltas`` is read.

    Returns:
        dict: The model inputs for the forward call.
    """
    # NOTE(review): nesting below was reconstructed from a whitespace-flattened source -- confirm
    # against the original file, in particular the placement of the `clone` call.

    # `cache_position` may be absent; in that case this is handled like the first generation step.
    is_first_step = cache_position is None or bool(cache_position[0] == 0)

    # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
    # Exception 1: when passing input_embeds, input_ids may be missing entries
    # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
    # Exception 3: If input_embeds are passed then slice it through `cache_position`, to keep only the
    #              unprocessed tokens and generate the first token for each sequence. Later use the
    #              generated input ids for continuation.
    if past_key_values is not None and cache_position is not None:
        if inputs_embeds is not None:
            input_ids = input_ids[:, -cache_position.shape[0]:]
        elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
            input_ids = input_ids[:, cache_position]

    img_gen_embeds = None

    rope_deltas = kwargs.get("rope_deltas", None)
    if attention_mask is not None and position_ids is None:
        if self.use_llm_3drope:
            if is_first_step:
                position_ids, rope_deltas = self.get_rope_index(
                    input_ids, image_grid_thw, image_grid_thw_video, attention_mask
                )
            else:
                batch_size, seq_length = input_ids.shape
                # `cache_position` is guaranteed non-None here (otherwise this would be the first step).
                delta = cache_position[0] + rope_deltas if rope_deltas is not None else 0
                position_ids = torch.arange(seq_length, device=input_ids.device)
                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                position_ids = position_ids.add(delta)
                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
        else:
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1]:]

                # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
                # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride
                # during the decoding. Here, simply using `.contiguous()` is not sufficient as in the
                # batch size = 1 case, `position_ids` is already contiguous but with varying stride which
                # retriggers a capture.
                position_ids = position_ids.clone(memory_format=torch.contiguous_format)

    # The modality embeddings are only forwarded on the first step.
    if not is_first_step:
        query_embeds_image = None
        query_embeds_video = None
        query_embeds_audio = None

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    if inputs_embeds is not None and (cache_position is None or len(cache_position) == inputs_embeds.shape[1]):
        model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
    else:
        model_inputs = {"input_ids": input_ids, "inputs_embeds": None}

    if isinstance(past_key_values, StaticCache) and attention_mask is not None and attention_mask.ndim == 2:
        if model_inputs["inputs_embeds"] is not None:
            batch_size, sequence_length, _ = inputs_embeds.shape
            device = inputs_embeds.device
        else:
            batch_size, sequence_length = input_ids.shape
            device = input_ids.device

        attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=past_key_values.get_max_cache_shape(),
            dtype=self.lm_head.weight.dtype,
            device=device,
            cache_position=cache_position,
            batch_size=batch_size,
        )

    model_inputs.update(
        {
            "position_ids": position_ids,
            "query_embeds_image": query_embeds_image,
            "query_embeds_video": query_embeds_video,
            "query_embeds_audio": query_embeds_audio,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
            "attention_mask": attention_mask,
            "img_gen_embeds": img_gen_embeds,
            "image_grid_thw": image_grid_thw,
            "image_grid_thw_video": image_grid_thw_video,
            "cache_position": cache_position,
            "rope_deltas": rope_deltas,
            "second_per_grid_ts": second_per_grid_ts,
            "is_audio_generation_mode": is_audio_generation_mode,
        }
    )
    return model_inputs
Ming_Uni/modeling_rope_utils.py ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import Optional, Tuple
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import is_torch_available, logging
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+ if is_torch_available():
24
+ import torch
25
+
26
def _compute_default_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
            Expected keys: `base` and `dim`.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Raises:
        ValueError: If both `config` and `rope_kwargs` are given, or if neither is given.
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        dim = int(head_dim * partial_rotary_factor)
    else:
        # Without this branch `base`/`dim` would be unbound and the failure would surface as an
        # opaque UnboundLocalError further down.
        raise ValueError(
            "`_compute_default_rope_parameters` requires either `config` or `**rope_kwargs` (`base`, `dim`)"
        )

    attention_factor = 1.0  # Unused in this type of RoPE

    # Compute the inverse frequencies
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor
66
+
67
def _compute_linear_scaling_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev

    The default RoPE inverse frequencies are computed first and then divided by the scaling `factor`.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration; `config.rope_scaling["factor"]` supplies the scaling factor.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    has_kwargs = len(rope_kwargs) > 0
    if has_kwargs and config is not None:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if has_kwargs:
        factor = rope_kwargs["factor"]
    elif config is not None:
        factor = config.rope_scaling["factor"]

    # Start from the unscaled (default) RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)

    # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`,
    # so dividing the inverse frequencies by the factor is equivalent.
    inv_freq /= factor
    return inv_freq, attention_factor
106
+
107
def _compute_dynamic_ntk_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
            Expected keys: `base`, `dim`, `max_position_embeddings` and `factor`.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Raises:
        ValueError: If both `config` and `rope_kwargs` are given, or if neither is given.
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
        max_position_embeddings = rope_kwargs["max_position_embeddings"]
        factor = rope_kwargs["factor"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        dim = int(head_dim * partial_rotary_factor)
        max_position_embeddings = config.max_position_embeddings
        factor = config.rope_scaling["factor"]
    else:
        # Without this branch the parameters would be unbound and the failure would surface as an
        # opaque UnboundLocalError further down.
        raise ValueError(
            "`_compute_dynamic_ntk_parameters` requires either `config` or `**rope_kwargs` "
            "(`base`, `dim`, `max_position_embeddings`, `factor`)"
        )

    attention_factor = 1.0  # Unused in this type of RoPE

    # seq_len: default to max_position_embeddings, e.g. at init time
    seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings

    # Compute the inverse frequencies
    base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor
156
+
157
def _compute_yarn_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://arxiv.org/abs/2309.00071)

    The result blends the unscaled ("extrapolation") and factor-scaled ("interpolation") inverse
    frequencies with a per-dimension linear ramp whose bounds are derived from `beta_fast`/`beta_slow`.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            Must be empty for this RoPE type.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # No need to keep BC with yarn, unreleased when this new pattern was created.
    if rope_kwargs:
        raise ValueError(
            f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
        )

    scaling = config.rope_scaling
    base = config.rope_theta
    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)
    max_position_embeddings = config.max_position_embeddings
    factor = scaling["factor"]

    # Attention factor as suggested in the paper, unless explicitly configured
    attention_factor = scaling.get("attention_factor")
    if attention_factor is None:
        attention_factor = 0.1 * math.log(factor) + 1.0

    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
    beta_fast = scaling.get("beta_fast") or 32
    beta_slow = scaling.get("beta_slow") or 1

    def correction_dim(num_rotations):
        """Inverse dimension formula to find the dimension based on the number of rotations"""
        numerator = dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
        return numerator / (2 * math.log(base))

    def ramp(lower, upper, size):
        """Linear 0 -> 1 ramp between `lower` and `upper`, clamped, over `size` entries"""
        if lower == upper:
            upper += 0.001  # Prevent singularity
        return torch.clamp((torch.arange(size, dtype=torch.float32) - lower) / (upper - lower), 0, 1)

    # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position
    # IDs to expand the possible context length. In other words, interpolation = apply scaling factor.
    pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
    inv_freq_extrapolation = 1.0 / pos_freqs
    inv_freq_interpolation = 1.0 / (factor * pos_freqs)

    # Dimension range bounds based on rotations
    low = max(math.floor(correction_dim(beta_fast)), 0)
    high = min(math.ceil(correction_dim(beta_slow)), dim - 1)

    # Get n-dimensional rotational scaling corrected for extrapolation
    extrapolation_weight = 1 - ramp(low, high, dim // 2).float().to(device)
    inv_freq = (
        inv_freq_interpolation * (1 - extrapolation_weight)
        + inv_freq_extrapolation * extrapolation_weight
    )

    return inv_freq, attention_factor
234
+
235
def _compute_longrope_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    The per-dimension `long_factor` list is applied when `seq_len` exceeds the original maximum
    position embeddings; otherwise `short_factor` is applied.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.
        rope_kwargs (`Dict`, *optional*):
            Must be empty for this RoPE type.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    # No need to keep BC with longrope, unreleased when this new pattern was created.
    if rope_kwargs:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
            f"{rope_kwargs}"
        )

    scaling = config.rope_scaling
    base = config.rope_theta
    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)
    long_factor = scaling["long_factor"]
    short_factor = scaling["short_factor"]
    factor = scaling.get("factor")
    attention_factor = scaling.get("attention_factor")

    # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these
    # two values to compute the default attention scaling factor, instead of using `factor`.
    original_max_position_embeddings = config.max_position_embeddings
    if hasattr(config, "original_max_position_embeddings"):
        original_max_position_embeddings = config.original_max_position_embeddings
        factor = config.max_position_embeddings / config.original_max_position_embeddings

    # Sets the attention factor as suggested in the paper
    if attention_factor is None:
        attention_factor = (
            1.0
            if factor <= 1.0
            else math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings))
        )

    # Compute the inverse frequencies -- scaled based on the target sequence length
    selected_factor = long_factor if (seq_len and seq_len > original_max_position_embeddings) else short_factor
    ext_factors = torch.tensor(selected_factor, dtype=torch.float32, device=device)
    exponents = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
    inv_freq = 1.0 / (ext_factors * base ** exponents)

    return inv_freq, attention_factor
296
+
297
def _compute_llama3_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies for llama 3.1.

    Starting from the default RoPE inverse frequencies, components whose wavelength exceeds the
    low-frequency wavelength are divided by `factor`, and components between the high- and
    low-frequency wavelengths are replaced by a smooth interpolation.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # Gets the default RoPE parameters
    base_inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)

    scaling = config.rope_scaling
    factor = scaling["factor"]  # `8` in the original implementation
    low_freq_factor = scaling["low_freq_factor"]  # `1` in the original implementation
    high_freq_factor = scaling["high_freq_factor"]  # `4` in the original implementation
    old_context_len = scaling["original_max_position_embeddings"]  # `8192` in the original implementation

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    wavelen = 2 * math.pi / base_inv_freq

    # wavelen < high_freq_wavelen: do nothing
    # wavelen > low_freq_wavelen: divide by factor
    scaled_inv_freq = torch.where(wavelen > low_freq_wavelen, base_inv_freq / factor, base_inv_freq)

    # otherwise: interpolate between the two, using a smooth factor
    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    smoothed_inv_freq = (1 - smooth_factor) * scaled_inv_freq / factor + smooth_factor * scaled_inv_freq
    in_medium_band = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)

    return torch.where(in_medium_band, smoothed_inv_freq, scaled_inv_freq), attention_factor
338
+
339
# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
# parameterizations, as long as the callable has the same signature.
# Every callable registered here accepts `(config, device, seq_len=None, **rope_kwargs)` and returns the tuple
# `(inv_freq, attention_factor)`.
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
    "linear": _compute_linear_scaling_rope_parameters,
    "dynamic": _compute_dynamic_ntk_parameters,
    "yarn": _compute_yarn_parameters,
    "longrope": _compute_longrope_parameters,
    "llama3": _compute_llama3_parameters,
}
350
+
351
def _check_received_keys(
    rope_type: str,
    received_keys: set,
    required_keys: set,
    optional_keys: Optional[set] = None,
    ignore_keys: Optional[set] = None,
):
    """Compare the received keys in `config.rope_scaling` against the expected and optional keys

    The caller's sets are never modified; all bookkeeping happens on local copies.

    Args:
        rope_type (`str`): RoPE type name, used only in the error/warning messages.
        received_keys (`set`): Keys present in `rope_scaling`.
        required_keys (`set`): Keys that must be present.
        optional_keys (`set`, *optional*): Keys that may be present without triggering a warning.
        ignore_keys (`set`, *optional*): Model-specific keys that are dropped before any check.

    Raises:
        KeyError: If at least one required key is missing.
    """
    # Copy first: mutating the arguments would leak the BC adjustments below into the caller's sets.
    received = set(received_keys)
    required = set(required_keys)

    # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present
    if "type" in received:
        received.discard("type")
        required.add("rope_type")

    # Some models need to store model-specific keys, and we don't want to throw warning at them
    if ignore_keys is not None:
        received -= ignore_keys

    missing_keys = required - received
    if missing_keys:
        raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")

    unused_keys = received - required
    if optional_keys is not None:
        unused_keys -= optional_keys
    if unused_keys:
        logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")
378
+
379
def _validate_default_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """Check that `config.rope_scaling` carries the keys expected for the default RoPE type."""
    scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling.get("rope_type", scaling.get("type", None))
    _check_received_keys(rope_type, set(scaling.keys()), {"rope_type"}, ignore_keys=ignore_keys)
385
+
386
def _validate_linear_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """Check the keys of `config.rope_scaling` for linear scaling and warn about an invalid `factor`."""
    scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling.get("rope_type", scaling.get("type", None))
    _check_received_keys(rope_type, set(scaling.keys()), {"rope_type", "factor"}, ignore_keys=ignore_keys)

    factor = scaling["factor"]
    # `None` is not a float, so the isinstance test also covers a missing value.
    if not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
396
+
397
def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """Check the keys of `config.rope_scaling` for dynamic NTK scaling and warn about an invalid `factor`."""
    scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling.get("rope_type", scaling.get("type", None))
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    _check_received_keys(
        rope_type,
        set(scaling.keys()),
        {"rope_type", "factor"},
        {"original_max_position_embeddings"},
        ignore_keys=ignore_keys,
    )

    factor = scaling["factor"]
    # `None` is not a float, so the isinstance test also covers a missing value.
    if not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
409
+
410
def _validate_yarn_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """Check the keys of `config.rope_scaling` for YaRN and warn about invalid field values.

    Warnings are emitted for `factor`, `attention_factor`, `beta_fast`, `beta_slow`, and for
    `beta_fast` being smaller than `beta_slow` (after applying the 32/1 defaults).
    """
    scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling.get("rope_type", scaling.get("type", None))
    _check_received_keys(
        rope_type,
        set(scaling.keys()),
        {"rope_type", "factor"},
        {"attention_factor", "beta_fast", "beta_slow"},
        ignore_keys=ignore_keys,
    )

    factor = scaling["factor"]
    # `None` is not a float, so the isinstance test also covers a missing value.
    if not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    attention_factor = scaling.get("attention_factor")
    if attention_factor is not None:
        if not isinstance(attention_factor, float) or attention_factor < 0:
            logger.warning(
                f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
            )

    # Both beta fields are optional, but must be floats when present
    betas = {}
    for name in ("beta_fast", "beta_slow"):
        value = scaling.get(name)
        betas[name] = value
        if value is not None and not isinstance(value, float):
            logger.warning(f"`rope_scaling`'s {name} field must be a float, got {value}")
    beta_fast = betas["beta_fast"]
    beta_slow = betas["beta_slow"]

    if (beta_fast or 32) < (beta_slow or 1):
        logger.warning(
            f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
            f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
        )
439
+
440
def _validate_longrope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """Check the keys of `config.rope_scaling` for LongRoPE and warn about invalid field values.

    `short_factor` and `long_factor` must each be a list of numbers with `dim // 2` entries, where
    `dim` is the rotary dimension derived from the config. Problems with field values are reported
    through warnings; only missing required keys raise (via `_check_received_keys`).

    Args:
        config ([`~transformers.PretrainedConfig`]): The model configuration to validate.
        ignore_keys (`set`, *optional*): Model-specific keys to exclude from the key check.
    """
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "short_factor", "long_factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)

    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)

    for field_name in ("short_factor", "long_factor"):
        values = rope_scaling.get(field_name)
        # The whole conjunction must be negated: the field is valid only if it is a list AND every
        # entry is a number. (Negating just the isinstance test let lists with non-numeric entries
        # pass silently and iterated over non-list values.)
        is_number_list = isinstance(values, list) and all(isinstance(x, (int, float)) for x in values)
        if not is_number_list:
            logger.warning(f"`rope_scaling`'s {field_name} field must be a list of numbers, got {values}")
        # Only sized values can be length-checked; anything else was already reported above.
        if hasattr(values, "__len__") and len(values) != dim // 2:
            logger.warning(f"`rope_scaling`'s {field_name} field must have length {dim // 2}, got {len(values)}")

    # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
    # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
    # unique to longrope (= undesirable)
    if hasattr(config, "original_max_position_embeddings"):
        logger.warning_once(
            "This model has set a `original_max_position_embeddings` field, to be used together with "
            "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`"
            "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
            "as it is compatible with most model architectures."
        )
    else:
        factor = rope_scaling.get("factor")
        if factor is None:
            logger.warning("Missing required keys in `rope_scaling`: 'factor'")
        elif not isinstance(factor, float) or factor < 1.0:
            logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    attention_factor = rope_scaling.get("attention_factor")
    if attention_factor is not None:
        if not isinstance(attention_factor, float) or attention_factor < 0.0:
            logger.warning(
                f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
            )
488
+
489
def _validate_llama3_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """
    Validate the `rope_scaling` dictionary of `config` for the "llama3" RoPE type.

    Value problems are reported with `logger.warning`. Ordering checks between two fields are only performed when
    both operands are numbers, so a `None` or otherwise non-numeric value yields its type warning instead of a
    `TypeError` from the comparison.

    Args:
        config (`PretrainedConfig`):
            Configuration to inspect. Reads `rope_scaling` and `max_position_embeddings`.
        ignore_keys (`set`, *optional*):
            Forwarded unchanged to `_check_received_keys`.
    """
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)

    numeric = (int, float)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    low_freq_factor = rope_scaling["low_freq_factor"]
    high_freq_factor = rope_scaling["high_freq_factor"]
    if low_freq_factor is None or not isinstance(low_freq_factor, float):
        logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}")
    if high_freq_factor is None or not isinstance(high_freq_factor, float):
        logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}")
    # Compare only when both sides are numbers; invalid values were already reported above.
    if (
        isinstance(high_freq_factor, numeric)
        and isinstance(low_freq_factor, numeric)
        and high_freq_factor <= low_freq_factor
    ):
        logger.warning(
            "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
            f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
        )

    original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
    if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
            f"{original_max_position_embeddings}"
        )
    # Same guard as above: skip the ordering check when either operand cannot be compared numerically.
    if (
        isinstance(original_max_position_embeddings, numeric)
        and isinstance(config.max_position_embeddings, numeric)
        and original_max_position_embeddings >= config.max_position_embeddings
    ):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
            f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
        )
523
+
524
# Registry mapping a `rope_type` string to the function that validates the matching `rope_scaling` dictionary.
# `rope_config_validation` looks the type up here and calls the function with `(config, ignore_keys=...)`.
# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
ROPE_VALIDATION_FUNCTIONS = {
    "default": _validate_default_rope_parameters,
    "linear": _validate_linear_scaling_rope_parameters,
    "dynamic": _validate_dynamic_scaling_rope_parameters,
    "yarn": _validate_yarn_parameters,
    "longrope": _validate_longrope_parameters,
    "llama3": _validate_llama3_parameters,
}
533
+
534
def rope_config_validation(config: PretrainedConfig, ignore_keys: Optional[set] = None):
    """
    Run the RoPE validation function registered for the `rope_scaling` type of a `PretrainedConfig` object.

    Does nothing when `config` has no `rope_scaling` (or it is `None`). When no validation function is registered
    for the resolved type, a warning is logged instead.
    """
    # `rope_scaling` is not a default parameter in `PretrainedConfig`
    scaling_cfg = getattr(config, "rope_scaling", None)
    if scaling_cfg is None:
        return

    # BC: "rope_type" was originally "type"
    legacy_type = scaling_cfg.get("type", "default")
    rope_type = scaling_cfg.get("rope_type", legacy_type)

    validator = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
    if validator is None:
        logger.warning(
            f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
        )
        return

    validator(config, ignore_keys=ignore_keys)
Ming_Uni/pipeline_sana.py ADDED
@@ -0,0 +1,1011 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import html
16
+ import inspect
17
+ import re
18
+ import urllib.parse as ul
19
+ import warnings
20
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
21
+
22
+ import torch
23
+ from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
24
+
25
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
26
+ from diffusers.image_processor import PixArtImageProcessor
27
+ from diffusers.loaders import SanaLoraLoaderMixin
28
+ from diffusers.models import AutoencoderDC, SanaTransformer2DModel
29
+ from diffusers.schedulers import DPMSolverMultistepScheduler
30
+ from diffusers.utils import (
31
+ BACKENDS_MAPPING,
32
+ USE_PEFT_BACKEND,
33
+ is_bs4_available,
34
+ is_ftfy_available,
35
+ is_torch_xla_available,
36
+ logging,
37
+ replace_example_docstring,
38
+ scale_lora_layers,
39
+ unscale_lora_layers,
40
+ )
41
+ from diffusers.utils.torch_utils import randn_tensor
42
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
43
+ from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha import (
44
+ ASPECT_RATIO_512_BIN,
45
+ ASPECT_RATIO_1024_BIN,
46
+ )
47
+ from diffusers.pipelines.pixart_alpha.pipeline_pixart_sigma import ASPECT_RATIO_2048_BIN
48
+ from diffusers.pipelines.sana.pipeline_output import SanaPipelineOutput
49
+
50
+
51
+ if is_torch_xla_available():
52
+ import torch_xla.core.xla_model as xm
53
+
54
+ XLA_AVAILABLE = True
55
+ else:
56
+ XLA_AVAILABLE = False
57
+
58
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
59
+
60
+ if is_bs4_available():
61
+ from bs4 import BeautifulSoup
62
+
63
+ if is_ftfy_available():
64
+ import ftfy
65
+
66
+
67
# Resolution bins for the 4096px setting. Each key is the first list entry divided by the second, rounded to two
# decimals and stored as a string (e.g. 2048 / 8192 -> "0.25"); every dimension is a multiple of 256.
# NOTE(review): presumably each value is [height, width], as in the PixArt ASPECT_RATIO_*_BIN tables -- confirm.
ASPECT_RATIO_4096_BIN = {
    "0.25": [2048.0, 8192.0],
    "0.26": [2048.0, 7936.0],
    "0.27": [2048.0, 7680.0],
    "0.28": [2048.0, 7424.0],
    "0.32": [2304.0, 7168.0],
    "0.33": [2304.0, 6912.0],
    "0.35": [2304.0, 6656.0],
    "0.4": [2560.0, 6400.0],
    "0.42": [2560.0, 6144.0],
    "0.48": [2816.0, 5888.0],
    "0.5": [2816.0, 5632.0],
    "0.52": [2816.0, 5376.0],
    "0.57": [3072.0, 5376.0],
    "0.6": [3072.0, 5120.0],
    "0.68": [3328.0, 4864.0],
    "0.72": [3328.0, 4608.0],
    "0.78": [3584.0, 4608.0],
    "0.82": [3584.0, 4352.0],
    "0.88": [3840.0, 4352.0],
    "0.94": [3840.0, 4096.0],
    "1.0": [4096.0, 4096.0],
    "1.07": [4096.0, 3840.0],
    "1.13": [4352.0, 3840.0],
    "1.21": [4352.0, 3584.0],
    "1.29": [4608.0, 3584.0],
    "1.38": [4608.0, 3328.0],
    "1.46": [4864.0, 3328.0],
    "1.67": [5120.0, 3072.0],
    "1.75": [5376.0, 3072.0],
    "2.0": [5632.0, 2816.0],
    "2.09": [5888.0, 2816.0],
    "2.4": [6144.0, 2560.0],
    "2.5": [6400.0, 2560.0],
    "2.89": [6656.0, 2304.0],
    "3.0": [6912.0, 2304.0],
    "3.11": [7168.0, 2304.0],
    "3.62": [7424.0, 2048.0],
    "3.75": [7680.0, 2048.0],
    "3.88": [7936.0, 2048.0],
    "4.0": [8192.0, 2048.0],
}
109
+
110
+ EXAMPLE_DOC_STRING = """
111
+ Examples:
112
+ ```py
113
+ >>> import torch
114
+ >>> from diffusers import SanaPipeline
115
+
116
+ >>> pipe = SanaPipeline.from_pretrained(
117
+ ... "Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers", torch_dtype=torch.float32
118
+ ... )
119
+ >>> pipe.to("cuda")
120
+ >>> pipe.text_encoder.to(torch.bfloat16)
121
+ >>> pipe.transformer = pipe.transformer.to(torch.bfloat16)
122
+
123
+ >>> image = pipe(prompt='a cyberpunk cat with a neon sign that says "Sana"')[0]
124
+ >>> image[0].save("output.png")
125
+ ```
126
+ """
127
+
128
+
129
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
130
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Configure `scheduler` through its `set_timesteps` method and return the resulting schedule.

    Exactly one of three modes is used: a custom `timesteps` list, a custom `sigmas` list, or a plain
    `num_inference_steps` count. Extra keyword arguments are passed on to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to configure and read timesteps from.
        num_inference_steps (`int`):
            Number of diffusion steps. Used only when neither `timesteps` nor `sigmas` is given.
        device (`str` or `torch.device`, *optional*):
            Passed to `scheduler.set_timesteps` as `device`.
        timesteps (`List[int]`, *optional*):
            Custom timesteps overriding the scheduler's spacing strategy. Mutually exclusive with `sigmas`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas overriding the scheduler's spacing strategy. Mutually exclusive with `timesteps`.

    Returns:
        `Tuple[torch.Tensor, int]`: `scheduler.timesteps` after configuration and the number of inference steps
        (the length of the schedule when a custom schedule was supplied, otherwise `num_inference_steps` as given).

    Raises:
        `ValueError`: If both `timesteps` and `sigmas` are passed, or if the scheduler's `set_timesteps` signature
        does not accept the custom argument that was supplied.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    # Plain step count: no signature inspection is needed.
    if timesteps is None and sigmas is None:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    supported = set(inspect.signature(scheduler.set_timesteps).parameters.keys())

    if timesteps is not None:
        if "timesteps" not in supported:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    else:
        if "sigmas" not in supported:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)

    schedule = scheduler.timesteps
    return schedule, len(schedule)
187
+
188
+
189
+ class SanaPipeline(DiffusionPipeline, SanaLoraLoaderMixin):
190
+ r"""
191
+ Pipeline for text-to-image generation using [Sana](https://huggingface.co/papers/2410.10629).
192
+ """
193
+
194
+ # fmt: off
195
+ bad_punct_regex = re.compile(r"[" + "#®•©™&@·º½¾¿¡§~" + r"\)" + r"\(" + r"\]" + r"\[" + r"\}" + r"\{" + r"\|" + "\\" + r"\/" + r"\*" + r"]{1,}")
196
+ # fmt: on
197
+
198
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
199
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
200
+
201
def __init__(
    self,
    tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
    text_encoder: Gemma2PreTrainedModel,
    vae: AutoencoderDC,
    transformer: SanaTransformer2DModel,
    scheduler: DPMSolverMultistepScheduler,
):
    r"""
    Register the pipeline components and build the image processor.

    `vae_scale_factor` is derived from the number of entries in `vae.config.encoder_block_out_channels`
    (`2 ** (n - 1)`); it falls back to 32 when no VAE is available.
    """
    super().__init__()

    self.register_modules(
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        vae=vae,
        transformer=transformer,
        scheduler=scheduler,
    )

    if hasattr(self, "vae") and self.vae is not None:
        scale = 2 ** (len(self.vae.config.encoder_block_out_channels) - 1)
    else:
        scale = 32
    self.vae_scale_factor = scale
    self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
221
+
222
def enable_vae_slicing(self):
    r"""
    Enable sliced VAE decoding by delegating to `self.vae.enable_slicing()`.

    When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
    steps. This is useful to save some memory and allow larger batch sizes.
    """
    self.vae.enable_slicing()
228
+
229
def disable_vae_slicing(self):
    r"""
    Disable sliced VAE decoding by delegating to `self.vae.disable_slicing()`.

    If `enable_vae_slicing` was previously enabled, this method will go back to computing decoding in one step.
    """
    self.vae.disable_slicing()
235
+
236
def enable_vae_tiling(self):
    r"""
    Enable tiled VAE decoding by delegating to `self.vae.enable_tiling()`.

    When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding
    in several steps. This is useful for saving a large amount of memory and to allow processing larger images.
    """
    self.vae.enable_tiling()
243
+
244
def disable_vae_tiling(self):
    r"""
    Disable tiled VAE decoding by delegating to `self.vae.disable_tiling()`.

    If `enable_vae_tiling` was previously enabled, this method will go back to computing decoding in one step.
    """
    self.vae.disable_tiling()
250
+
251
def _get_gemma_prompt_embeds(
    self,
    prompt: Union[str, List[str]],
    device: torch.device,
    dtype: torch.dtype,
    clean_caption: bool = False,
    max_sequence_length: int = 300,
    complex_human_instruction: Optional[List[str]] = None,
):
    r"""
    Tokenize the prompt(s) and run them through the text encoder.

    Args:
        prompt (`str` or `List[str]`):
            Prompt or prompts to encode; a single string is wrapped into a one-element list.
        device (`torch.device`):
            Device the token ids, attention mask and resulting embeddings are moved to.
        dtype (`torch.dtype`):
            Dtype the resulting embeddings are cast to.
        clean_caption (`bool`, defaults to `False`):
            Forwarded to `_text_preprocessing`.
        max_sequence_length (`int`, defaults to 300):
            Token length the prompts are padded/truncated to when no instruction is used.
        complex_human_instruction (`List[str]`, *optional*):
            When non-empty, the lines are joined with newlines and prepended to every prompt, and the token
            length is extended by the instruction's token count minus 2.

    Returns:
        A tuple `(prompt_embeds, prompt_attention_mask)`: the first output of the text encoder and the tokenizer's
        attention mask, both on `device`.
    """
    texts = [prompt] if isinstance(prompt, str) else prompt

    if getattr(self, "tokenizer", None) is not None:
        self.tokenizer.padding_side = "right"

    texts = self._text_preprocessing(texts, clean_caption=clean_caption)

    # prepare complex human instruction
    if complex_human_instruction:
        instruction = "\n".join(complex_human_instruction)
        texts = [instruction + text for text in texts]
        instruction_token_count = len(self.tokenizer.encode(instruction))
        # NOTE(review): the "- 2" presumably discounts special tokens counted by `encode` -- confirm.
        token_budget = instruction_token_count + max_sequence_length - 2
    else:
        token_budget = max_sequence_length

    tokenized = self.tokenizer(
        texts,
        padding="max_length",
        max_length=token_budget,
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",
    )
    input_ids = tokenized.input_ids
    attention_mask = tokenized.attention_mask.to(device)

    encoder_output = self.text_encoder(input_ids.to(device), attention_mask=attention_mask)
    embeddings = encoder_output[0].to(dtype=dtype, device=device)

    return embeddings, attention_mask
308
+
309
def encode_prompt(
    self,
    prompt: Union[str, List[str]],
    do_classifier_free_guidance: bool = True,
    negative_prompt: str = "",
    num_images_per_prompt: int = 1,
    device: Optional[torch.device] = None,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    prompt_attention_mask: Optional[torch.Tensor] = None,
    negative_prompt_attention_mask: Optional[torch.Tensor] = None,
    clean_caption: bool = False,
    max_sequence_length: int = 300,
    complex_human_instruction: Optional[List[str]] = None,
    lora_scale: Optional[float] = None,
):
    r"""
    Encodes the prompt (and, for classifier-free guidance, the negative prompt) into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        negative_prompt (`str` or `List[str]`, *optional*, defaults to `""`):
            The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
            instead. Only used when `do_classifier_free_guidance` is `True` and `negative_prompt_embeds` is not
            given.
        do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
            whether to use classifier free guidance or not
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            number of images that should be generated per prompt
        device: (`torch.device`, *optional*):
            torch device to place the resulting embeddings on; defaults to `self._execution_device`
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative text embeddings. For Sana, it should be the embeddings of the "" string.
        prompt_attention_mask (`torch.Tensor`, *optional*):
            Attention mask matching `prompt_embeds`; it is used as-is when `prompt_embeds` is passed.
        negative_prompt_attention_mask (`torch.Tensor`, *optional*):
            Attention mask matching `negative_prompt_embeds`.
        clean_caption (`bool`, defaults to `False`):
            If `True`, the function will preprocess and clean the provided caption before encoding.
        max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
        complex_human_instruction (`list[str]`, defaults to `complex_human_instruction`):
            If `complex_human_instruction` is not empty, the function will use the complex Human instruction for
            the prompt. It is never applied to the negative prompt.
        lora_scale (`float`, *optional*):
            LoRA scale stored on `self._lora_scale` and applied to the text encoder for the duration of the call
            when the PEFT backend is in use.

    Returns:
        A 4-tuple `(prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask)`.
        The last two entries are `None` when `do_classifier_free_guidance` is `False`.
    """

    if device is None:
        device = self._execution_device

    # Target dtype for the embeddings: the transformer's if available, else the text encoder's.
    if self.transformer is not None:
        dtype = self.transformer.dtype
    elif self.text_encoder is not None:
        dtype = self.text_encoder.dtype
    else:
        dtype = None

    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, SanaLoraLoaderMixin):
        self._lora_scale = lora_scale

        # dynamically adjust the LoRA scale
        if self.text_encoder is not None and USE_PEFT_BACKEND:
            scale_lora_layers(self.text_encoder, lora_scale)

    # Batch size comes from the prompt when one is given, otherwise from the pre-computed embeddings.
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if getattr(self, "tokenizer", None) is not None:
        self.tokenizer.padding_side = "right"

    # See Section 3.1. of the paper.
    max_length = max_sequence_length
    # Keep the first position plus the last `max_length - 1` positions of the encoded sequence.
    select_index = [0] + list(range(-max_length + 1, 0))

    if prompt_embeds is None:
        prompt_embeds, prompt_attention_mask = self._get_gemma_prompt_embeds(
            prompt=prompt,
            device=device,
            dtype=dtype,
            clean_caption=clean_caption,
            max_sequence_length=max_sequence_length,
            complex_human_instruction=complex_human_instruction,
        )

        prompt_embeds = prompt_embeds[:, select_index]
        prompt_attention_mask = prompt_attention_mask[:, select_index]

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
    prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
    prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        # A single negative prompt string is broadcast to the whole batch.
        negative_prompt = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
        negative_prompt_embeds, negative_prompt_attention_mask = self._get_gemma_prompt_embeds(
            prompt=negative_prompt,
            device=device,
            dtype=dtype,
            clean_caption=clean_caption,
            max_sequence_length=max_sequence_length,
            complex_human_instruction=False,
        )

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)

        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
        negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
    else:
        negative_prompt_embeds = None
        negative_prompt_attention_mask = None

    if self.text_encoder is not None:
        if isinstance(self, SanaLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

    return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
440
+
441
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
442
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Build the optional keyword arguments for `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are only included when the
    scheduler's `step` method declares a parameter of that name.

    eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. It corresponds to η in
    the DDIM paper (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

    Returns:
        `dict`: containing `"eta"` and/or `"generator"`, possibly empty.
    """
    step_parameters = set(inspect.signature(self.scheduler.step).parameters.keys())
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_parameters}
458
+
459
def check_inputs(
    self,
    prompt,
    height,
    width,
    callback_on_step_end_tensor_inputs=None,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    prompt_attention_mask=None,
    negative_prompt_attention_mask=None,
):
    """
    Validate the arguments of a pipeline call and raise `ValueError` on the first inconsistency found.

    Checks, in order: `height`/`width` divisibility by 32; callback tensor names against
    `self._callback_tensor_inputs`; that exactly one of `prompt` / `prompt_embeds` is given and `prompt` is a
    `str` or `list`; that `negative_prompt_embeds` is not combined with `prompt` or `negative_prompt`; that
    attention masks accompany pre-computed embeddings; and that positive and negative embeddings (and their
    masks) have matching shapes when both are passed.

    Returns:
        `None` when all checks pass.
    """
    if any(side % 32 != 0 for side in (height, width)):
        raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")

    if callback_on_step_end_tensor_inputs is not None and not all(
        k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
    ):
        raise ValueError(
            f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
        )

    has_prompt = prompt is not None
    has_embeds = prompt_embeds is not None
    has_negative_embeds = negative_prompt_embeds is not None

    if has_prompt and has_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if has_prompt and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    # Pre-computed embeddings are only usable together with their attention masks.
    if has_embeds and prompt_attention_mask is None:
        raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")

    if has_negative_embeds and negative_prompt_attention_mask is None:
        raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")

    if not (has_embeds and has_negative_embeds):
        return

    if prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )
    if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
        raise ValueError(
            "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
            f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
            f" {negative_prompt_attention_mask.shape}."
        )
524
+
525
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
526
+ def _text_preprocessing(self, text, clean_caption=False):
527
+ if clean_caption and not is_bs4_available():
528
+ logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
529
+ logger.warning("Setting `clean_caption` to False...")
530
+ clean_caption = False
531
+
532
+ if clean_caption and not is_ftfy_available():
533
+ logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
534
+ logger.warning("Setting `clean_caption` to False...")
535
+ clean_caption = False
536
+
537
+ if not isinstance(text, (tuple, list)):
538
+ text = [text]
539
+
540
+ def process(text: str):
541
+ if clean_caption:
542
+ text = self._clean_caption(text)
543
+ text = self._clean_caption(text)
544
+ else:
545
+ text = text.lower().strip()
546
+ return text
547
+
548
+ return [process(t) for t in text]
549
+
550
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
551
+ def _clean_caption(self, caption):
552
+ caption = str(caption)
553
+ caption = ul.unquote_plus(caption)
554
+ caption = caption.strip().lower()
555
+ caption = re.sub("<person>", "person", caption)
556
+ # urls:
557
+ caption = re.sub(
558
+ r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
559
+ "",
560
+ caption,
561
+ ) # regex for urls
562
+ caption = re.sub(
563
+ r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
564
+ "",
565
+ caption,
566
+ ) # regex for urls
567
+ # html:
568
+ caption = BeautifulSoup(caption, features="html.parser").text
569
+
570
+ # @<nickname>
571
+ caption = re.sub(r"@[\w\d]+\b", "", caption)
572
+
573
+ # 31C0—31EF CJK Strokes
574
+ # 31F0—31FF Katakana Phonetic Extensions
575
+ # 3200—32FF Enclosed CJK Letters and Months
576
+ # 3300—33FF CJK Compatibility
577
+ # 3400—4DBF CJK Unified Ideographs Extension A
578
+ # 4DC0—4DFF Yijing Hexagram Symbols
579
+ # 4E00—9FFF CJK Unified Ideographs
580
+ caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
581
+ caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
582
+ caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
583
+ caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
584
+ caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
585
+ caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
586
+ caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
587
+ #######################################################
588
+
589
+ # all types of dash --> "-"
590
+ caption = re.sub(
591
+ r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
592
+ "-",
593
+ caption,
594
+ )
595
+
596
+ # normalize quotation marks to a single standard
597
+ caption = re.sub(r"[`´«»“”¨]", '"', caption)
598
+ caption = re.sub(r"[‘’]", "'", caption)
599
+
600
+ # &quot;
601
+ caption = re.sub(r"&quot;?", "", caption)
602
+ # &amp
603
+ caption = re.sub(r"&amp", "", caption)
604
+
605
+ # ip adresses:
606
+ caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
607
+
608
+ # article ids:
609
+ caption = re.sub(r"\d:\d\d\s+$", "", caption)
610
+
611
+ # \n
612
+ caption = re.sub(r"\\n", " ", caption)
613
+
614
+ # "#123"
615
+ caption = re.sub(r"#\d{1,3}\b", "", caption)
616
+ # "#12345.."
617
+ caption = re.sub(r"#\d{5,}\b", "", caption)
618
+ # "123456.."
619
+ caption = re.sub(r"\b\d{6,}\b", "", caption)
620
+ # filenames:
621
+ caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
622
+
623
+ #
624
+ caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
625
+ caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
626
+
627
+ caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
628
+ caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
629
+
630
+ # this-is-my-cute-cat / this_is_my_cute_cat
631
+ regex2 = re.compile(r"(?:\-|\_)")
632
+ if len(re.findall(regex2, caption)) > 3:
633
+ caption = re.sub(regex2, " ", caption)
634
+
635
+ caption = ftfy.fix_text(caption)
636
+ caption = html.unescape(html.unescape(caption))
637
+
638
+ caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640
639
+ caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc
640
+ caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231
641
+
642
+ caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
643
+ caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
644
+ caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
645
+ caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
646
+ caption = re.sub(r"\bpage\s+\d+\b", "", caption)
647
+
648
+ caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
649
+
650
+ caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
651
+
652
+ caption = re.sub(r"\b\s+\:\s+", r": ", caption)
653
+ caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
654
+ caption = re.sub(r"\s+", " ", caption)
655
+
656
+ caption.strip()
657
+
658
+ caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
659
+ caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
660
+ caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
661
+ caption = re.sub(r"^\.\S+$", "", caption)
662
+
663
+ return caption.strip()
664
+
665
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
    """Return the initial latents for the denoising loop.

    Caller-supplied ``latents`` are only moved to ``device``/``dtype``.
    Otherwise fresh noise is drawn with ``randn_tensor`` in the shape
    ``(batch_size, num_channels_latents, height // vae_scale_factor,
    width // vae_scale_factor)``.

    Raises:
        ValueError: If ``generator`` is a list whose length differs from
            ``batch_size``.
    """
    if latents is None:
        latent_height = int(height) // self.vae_scale_factor
        latent_width = int(width) // self.vae_scale_factor
        noise_shape = (batch_size, num_channels_latents, latent_height, latent_width)

        # A list of generators must provide exactly one generator per sample.
        generator_count_mismatch = isinstance(generator, list) and len(generator) != batch_size
        if generator_count_mismatch:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        return randn_tensor(noise_shape, generator=generator, device=device, dtype=dtype)

    return latents.to(device=device, dtype=dtype)
683
+
684
@property
def guidance_scale(self):
    """Return the guidance scale that `__call__` stored in `self._guidance_scale`."""
    return self._guidance_scale
687
+
688
@property
def attention_kwargs(self):
    """Return the attention kwargs that `__call__` stored in `self._attention_kwargs` (may be `None`)."""
    return self._attention_kwargs
691
+
692
@property
def do_classifier_free_guidance(self):
    """Return `True` when the stored guidance scale is strictly greater than 1.0."""
    return self._guidance_scale > 1.0
695
+
696
@property
def num_timesteps(self):
    """Return the number of timesteps that `__call__` recorded in `self._num_timesteps`."""
    return self._num_timesteps
699
+
700
@property
def interrupt(self):
    """Return the interrupt flag; `__call__` resets it to `False` and skips denoising steps while it is truthy."""
    return self._interrupt
703
+
704
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    negative_prompt: Union[str, List[str]] = None,
    num_inference_steps: int = 20,
    timesteps: List[int] = None,
    sigmas: List[float] = None,
    guidance_scale: float = 4.5,
    num_images_per_prompt: Optional[int] = 1,
    height: int = 1024,
    width: int = 1024,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.Tensor] = None,
    prompt_embeds: Optional[torch.Tensor] = None,
    prompt_attention_mask: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_attention_mask: Optional[torch.Tensor] = None,
    output_type: Optional[str] = "pil",
    device: Optional[Union[str, torch.device]] = None,
    return_dict: bool = True,
    clean_caption: bool = False,
    use_resolution_binning: bool = True,
    attention_kwargs: Optional[Dict[str, Any]] = None,
    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    max_sequence_length: int = 300,
    complex_human_instruction: List[str] = [
        "Given a user prompt, generate an 'Enhanced prompt' that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:",
        "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.",
        "- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.",
        "Here are examples of how to transform or refine prompts:",
        "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.",
        "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.",
        "Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:",
        "User Prompt: ",
    ],
) -> Union[SanaPipelineOutput, Tuple]:
    """
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass
            `prompt_embeds` instead.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if
            `guidance_scale` is not greater than `1`).
        num_inference_steps (`int`, *optional*, defaults to 20):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        timesteps (`List[int]`, *optional*):
            Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
            in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
            passed will be used. Must be in descending order.
        sigmas (`List[float]`, *optional*):
            Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
            their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
            will be used.
        guidance_scale (`float`, *optional*, defaults to 4.5):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
            usually at the expense of lower image quality.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        height (`int`, *optional*, defaults to 1024):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to 1024):
            The width in pixels of the generated image.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.Tensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
        negative_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
            provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
        negative_prompt_attention_mask (`torch.Tensor`, *optional*):
            Pre-generated attention mask for negative text embeddings.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. With `"latent"` the
            final latents are returned without VAE decoding or post-processing.
        device (`str` or `torch.device`, *optional*):
            Device to run on. Defaults to `self._execution_device`.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.sana.pipeline_output.SanaPipelineOutput`] instead of a plain
            tuple.
        attention_kwargs:
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            Its optional `"scale"` entry is forwarded to `encode_prompt` as `lora_scale`.
        clean_caption (`bool`, *optional*, defaults to `False`):
            Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
            be installed. If the dependencies are not installed, the embeddings will be created from the raw
            prompt.
        use_resolution_binning (`bool` defaults to `True`):
            If set to `True`, the requested height and width are first mapped to the closest resolution of the
            aspect-ratio bin selected by `self.transformer.config.sample_size` (128 -> 4096, 64 -> 2048,
            32 -> 1024, 16 -> 512). After the produced latents are decoded into images, they are resized back to
            the requested resolution. Useful for generating non-square images.
        callback_on_step_end (`Callable`, *optional*):
            A function that calls at the end of each denoising steps during the inference. The function is called
            with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
            callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
            `callback_on_step_end_tensor_inputs`.
        callback_on_step_end_tensor_inputs (`List`, *optional*):
            The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
            will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
            `._callback_tensor_inputs` attribute of your pipeline class.
        max_sequence_length (`int` defaults to `300`):
            Maximum sequence length to use with the `prompt`.
        complex_human_instruction (`List[str]`, *optional*):
            Instructions for complex human attention:
            https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55.

    Examples:

    Returns:
        [`~pipelines.sana.pipeline_output.SanaPipelineOutput`] or `tuple`:
            If `return_dict` is `True`, [`~pipelines.sana.pipeline_output.SanaPipelineOutput`] is returned,
            otherwise a `tuple` is returned where the first element is a list with the generated images

    Raises:
        ValueError: If `use_resolution_binning` is set and the transformer sample size is not one of
            128, 64, 32 or 16.
        torch.cuda.OutOfMemoryError: Re-raised (after a warning suggesting VAE tiling) when decoding the
            latents runs out of GPU memory.
    """
    # NOTE(review): the two list defaults in the signature are shared between calls. This
    # method never mutates them; confirm `encode_prompt` does not either.

    if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
        callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

    # 1. Check inputs. Raise error if not correct
    if use_resolution_binning:
        if self.transformer.config.sample_size == 128:
            aspect_ratio_bin = ASPECT_RATIO_4096_BIN
        elif self.transformer.config.sample_size == 64:
            aspect_ratio_bin = ASPECT_RATIO_2048_BIN
        elif self.transformer.config.sample_size == 32:
            aspect_ratio_bin = ASPECT_RATIO_1024_BIN
        elif self.transformer.config.sample_size == 16:
            aspect_ratio_bin = ASPECT_RATIO_512_BIN
        else:
            raise ValueError("Invalid sample size")
        # Remember the requested size so the decoded image can be resized back to it.
        orig_height, orig_width = height, width
        height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)

    self.check_inputs(
        prompt,
        height,
        width,
        callback_on_step_end_tensor_inputs,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
        prompt_attention_mask,
        negative_prompt_attention_mask,
    )

    self._guidance_scale = guidance_scale
    self._attention_kwargs = attention_kwargs
    self._interrupt = False

    # 2. Determine the batch size from the prompt(s) or the pre-computed embeddings
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = device or self._execution_device
    lora_scale = self.attention_kwargs.get("scale", None) if self.attention_kwargs is not None else None

    # 3. Encode input prompt
    (
        prompt_embeds,
        prompt_attention_mask,
        negative_prompt_embeds,
        negative_prompt_attention_mask,
    ) = self.encode_prompt(
        prompt,
        self.do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        num_images_per_prompt=num_images_per_prompt,
        device=device,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        prompt_attention_mask=prompt_attention_mask,
        negative_prompt_attention_mask=negative_prompt_attention_mask,
        clean_caption=clean_caption,
        max_sequence_length=max_sequence_length,
        complex_human_instruction=complex_human_instruction,
        lora_scale=lora_scale,
    )
    if self.do_classifier_free_guidance:
        # Unconditional half first, conditional half second; `chunk(2)` below relies on this order.
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)

    # 4. Prepare timesteps
    timesteps, num_inference_steps = retrieve_timesteps(
        self.scheduler, num_inference_steps, device, timesteps, sigmas
    )

    # 5. Prepare latents.
    latent_channels = self.transformer.config.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        latent_channels,
        height,
        width,
        torch.float32,
        device,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7. Denoising loop
    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
    self._num_timesteps = len(timesteps)

    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            if self.interrupt:
                continue

            latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
            latent_model_input = latent_model_input.to(prompt_embeds.dtype)

            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
            timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype)
            timestep = timestep * self.transformer.config.timestep_scale

            # predict noise model_output
            noise_pred = self.transformer(
                latent_model_input,
                encoder_hidden_states=prompt_embeds,
                encoder_attention_mask=prompt_attention_mask,
                timestep=timestep,
                return_dict=False,
                attention_kwargs=self.attention_kwargs,
            )[0]

            noise_pred = noise_pred.float()

            # perform guidance
            if self.do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # learned sigma: when the model emits twice the latent channels, keep only the first half
            if self.transformer.config.out_channels // 2 == latent_channels:
                noise_pred = noise_pred.chunk(2, dim=1)[0]

            # compute previous image: x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            if callback_on_step_end is not None:
                callback_kwargs = {}
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                # The callback may replace any of these tensors for the following steps.
                latents = callback_outputs.pop("latents", latents)
                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

            # advance the progress bar on the last step and on every completed scheduler-order step after warmup
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()

            if XLA_AVAILABLE:
                xm.mark_step()

    if output_type == "latent":
        image = latents
    else:
        latents = latents.to(self.vae.dtype)
        try:
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        except torch.cuda.OutOfMemoryError as e:
            warnings.warn(
                f"{e}. \n"
                f"Try to use VAE tiling for large images. For example: \n"
                f"pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)"
            )
            # Without a decoded image there is nothing to post-process; re-raise instead of
            # failing later with an unrelated UnboundLocalError on `image`.
            raise
        if use_resolution_binning:
            image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)

    if not output_type == "latent":
        image = self.image_processor.postprocess(image, output_type=output_type)

    # Offload all models
    self.maybe_free_model_hooks()

    if not return_dict:
        return (image,)

    return SanaPipelineOutput(images=image)
Ming_Uni/process.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import random
4
+ from io import BytesIO
5
+ from contextlib import nullcontext
6
+ import numpy as np
7
+ import torch
8
+ from PIL import Image
9
+ from Ming_Uni.qwen2vl_processor import Qwen2VLImageProcessor
10
+
11
+ LLAVA_DEFAULT_IMAGE_TOKEN = "<image>"
12
+
13
+ from PIL import Image
14
+
15
+ from Ming_Uni.Templates_native import (
16
+ EOT,
17
+ SYSTEM_PREFIX,
18
+ USER_PREFIX,
19
+ ASSISTANT_PREFIX,
20
+ GLM_USER_PREFIX,
21
+ GLM_ASSISTANT_PREFIX,
22
+ QWEN2_SYSTEM_PREFIX,
23
+ QWEN2_USER_PREFIX,
24
+ QWEN2_ASSISTANT_PREFIX,
25
+ interleave_tokens,
26
+ DEFAULT_IMAGE_PATCH_TOKEN,
27
+ DEFAULT_IM_START_TOKEN,
28
+ DEFAULT_IM_END_TOKEN,
29
+ DEFAULT_AU_START_TOKEN,
30
+ DEFAULT_AU_END_TOKEN,
31
+ DEFAULT_AUDIO_PATCH_TOKEN,
32
+ DEFAULT_GEN_AU_START_TOKEN,
33
+ DEFAULT_GEN_AU_END_TOKEN,
34
+ DEFAULT_VID_START_TOKEN,
35
+ DEFAULT_VID_END_TOKEN,
36
+ DEFAULT_END_OF_CHUNK_TOKEN,
37
+ )
38
+
39
# Structural markers registered as extra special tokens: an item marker plus
# paired HTML tags for documents and tables.
additional_special_tokens_llama = [
    "[item]",
    "<html>", "</html>",
    "<body>", "</body>",
    "<table>", "</table>",
    "<tr>", "</tr>",
    "<td>", "</td>",
]
# The Qwen2 set is the LLaMA set followed by the think/answer tags.
additional_special_tokens_qwen2 = [
    *additional_special_tokens_llama,
    "<think>", "</think>",
    "<answer>", "</answer>",
]
69
def init_tokenizer(llm_model, interleave_tokens=None):
    """Load the tokenizer for ``llm_model`` and register the extra special tokens.

    ``additional_special_tokens_qwen2`` is always added. When
    ``interleave_tokens`` is non-empty, those tokens are added as special
    tokens too and the number of new tokens and the resulting vocabulary size
    are printed.

    :param llm_model: name or path passed to ``AutoTokenizer.from_pretrained``
    :param interleave_tokens: optional sequence of extra special tokens;
        ``None`` (the default) or an empty sequence adds nothing
    :return: the configured tokenizer
    """
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(llm_model)
    tokenizer.add_special_tokens(
        {"additional_special_tokens": additional_special_tokens_qwen2}
    )

    # add special_tokens to tokenizer
    # (`None` replaces the former mutable `[]` default; both mean "no extra tokens")
    if interleave_tokens:
        num_new_tokens = tokenizer.add_tokens(interleave_tokens, special_tokens=True)
        print("generation_num_tokens: {}".format(num_new_tokens))
        print("Tokenizer length after adding interleave tokens in dataset: ", len(tokenizer))
    return tokenizer
82
def center_crop(image_path, save_path, short_side=512):
    """
    Resize an image so that its shorter side equals ``short_side``, center-crop
    it to a ``short_side`` x ``short_side`` square and save the result.

    :param image_path: path of the input image
    :param save_path: path the cropped image is written to
    :param short_side: target length of the shorter side (and of the square crop), 512 by default
    """
    # Open via a context manager so the underlying file handle is always released.
    with Image.open(image_path) as img:
        # Size of the original image
        width, height = img.size

        # Scale so that the shorter side becomes `short_side`. Exact integer
        # arithmetic is used for the other side: float scaling can truncate one
        # pixel too low (e.g. int(49 * (512 / 49)) == 511), which would leave the
        # image smaller than the crop window.
        if width < height:
            new_width = short_side
            new_height = height * short_side // width
        else:
            new_height = short_side
            new_width = width * short_side // height

        # Resize only when the size actually changes
        if new_width != width or new_height != height:
            img_resized = img.resize((new_width, new_height))
        else:
            img_resized = img

        # Size after resizing
        resized_width, resized_height = img_resized.size

        # Coordinates of the centered square crop window
        left = (resized_width - short_side) // 2
        top = (resized_height - short_side) // 2
        right = left + short_side
        bottom = top + short_side

        # Crop and save while the source image is still open
        img_cropped = img_resized.crop((left, top, right, bottom))
        img_cropped.save(save_path)

    print(f'裁剪后的图像已保存到 {save_path}')
127
+
128
+
129
class MyProcessor():
    """Turn an optional image and a text prompt into the tensors fed to the model.

    The constructor configures a ``Qwen2VLImageProcessor`` and a tokenizer with
    the project's special tokens; ``process`` builds the chat-formatted token
    sequence (with image placeholder tokens when an image is given) together
    with the image features.
    """

    def __init__(self, glm_model):
        """Set up the image processor, the tokenizer and the special-token ids.

        :param glm_model: currently unused, see the NOTE below
        """
        vis_processor = Qwen2VLImageProcessor()
        # Processor settings. Both pixel bounds are set to the same value
        # (451584 == 672 * 672); presumably this fixes the pixel budget of
        # every image — TODO confirm against Qwen2VLImageProcessor.
        overrides = {
            "max_pixels": 451584,
            "min_pixels": 451584,
            "temporal_patch_size": 2,
            "merge_size": 2,
        }
        for attr, value in overrides.items():
            assert hasattr(vis_processor, attr)
            setattr(vis_processor, attr, value)

        self.vis_processor = vis_processor

        self.use_qwen2_template = True

        self.llm_model_type = 'qwen2'

        self.num_query_token=2560
        # NOTE(review): the `glm_model` argument is ignored and a fixed local
        # path is used instead. Kept as-is because the directory layout callers
        # pass is not visible here — confirm and switch to the argument.
        self.glm_model="/video_hy2/modelzoo/Qwen2.5-7B-Instruct"
        self.tokenizer = init_tokenizer(
            self.glm_model,
            interleave_tokens
        )
        self._init_special_token()

    def _encode(self, text):
        """Tokenize ``text`` and return its ``input_ids`` as a plain list of ints."""
        return (self.tokenizer(text, return_tensors="pt")["input_ids"][0]).tolist()

    def _init_special_token(self):
        """Cache the ids of the special tokens and the token ids of the fixed prompt pieces."""
        self.image_start_token = self.tokenizer.convert_tokens_to_ids(DEFAULT_IM_START_TOKEN)
        self.image_end_token = self.tokenizer.convert_tokens_to_ids(DEFAULT_IM_END_TOKEN)
        self.image_patch_token = self.tokenizer.convert_tokens_to_ids(DEFAULT_IMAGE_PATCH_TOKEN)

        self.video_start_token = self.tokenizer.convert_tokens_to_ids(DEFAULT_VID_START_TOKEN)
        self.video_end_token = self.tokenizer.convert_tokens_to_ids(DEFAULT_VID_END_TOKEN)

        self.audio_start_token = self.tokenizer.convert_tokens_to_ids(DEFAULT_AU_START_TOKEN)
        self.audio_end_token = self.tokenizer.convert_tokens_to_ids(DEFAULT_AU_END_TOKEN)
        self.audio_patch_token = self.tokenizer.convert_tokens_to_ids(DEFAULT_AUDIO_PATCH_TOKEN)
        self.end_of_chunk_token = self.tokenizer.convert_tokens_to_ids(DEFAULT_END_OF_CHUNK_TOKEN)

        bos_token = None

        if self.llm_model_type in ["qwen2"]:
            # NOTE(review): the pad token is used as BOS whenever an EOS token
            # exists; the test is on `eos_token`, not `bos_token` — confirm intent.
            bos_token = self.tokenizer.bos_token if self.tokenizer.eos_token is None else self.tokenizer.pad_token
            self.qwen2_bos_id = self.tokenizer.convert_tokens_to_ids(bos_token)
            self.qwen2_eos_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.eos_token)
            self.qwen2_pad_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)

        assert bos_token is not None
        self.llm_bos_token = bos_token
        self.llm_eos_token = self.tokenizer.eos_token
        self.llm_pad_token = self.tokenizer.pad_token

        self.img_text = DEFAULT_IM_START_TOKEN + self.num_query_token * DEFAULT_IMAGE_PATCH_TOKEN + DEFAULT_IM_END_TOKEN

        self.usr_prefix = QWEN2_USER_PREFIX
        self.assistant_prefix = QWEN2_ASSISTANT_PREFIX

        self.img_text_id = self._encode(self.img_text)
        system_prefix = QWEN2_SYSTEM_PREFIX if self.use_qwen2_template else SYSTEM_PREFIX
        self.system_prefix_id = self._encode(system_prefix)

        self.usr_prefix_id = self._encode(self.usr_prefix)
        self.assistant_prefix_id = self._encode(self.assistant_prefix)

        self.EOT_id = self._encode(EOT)
        self._n_id = self._encode("\n")

    def preprocess_text(self, question, generate_prefix=None):
        """Build the chat-formatted prompt: system, user, question, assistant, optional prefix.

        Each piece is tokenized separately and the ids are concatenated. No EOS
        token is appended.

        :param question: the user text (may already contain image placeholder tokens)
        :param generate_prefix: optional text appended after the assistant prefix
        :return: dict with ``input_ids`` (1-D tensor), ``attention_mask`` (all
            ones, int64), ``position_ids`` (always ``None``) and ``input_text``
        """
        assert self.llm_model_type in ["qwen2"]

        input_text = QWEN2_SYSTEM_PREFIX + self.usr_prefix + question + self.assistant_prefix
        input_ids = [
            *self.system_prefix_id,
            *self.usr_prefix_id,
            *self._encode(question),
            *self.assistant_prefix_id,
        ]

        if generate_prefix is not None:
            input_text += generate_prefix
            input_ids.extend(self._encode(generate_prefix))

        input_ids = torch.tensor(input_ids)
        attention_mask = torch.ones_like(input_ids, dtype=torch.int64)

        return dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=None,
            input_text=input_text,  # just for debug
        )

    def process(self, image_file, prompt, device="cpu", input_interpolate64=False, input_interpolate256=False):
        """Prepare one sample (optional image + prompt) for the model.

        :param image_file: ``None``, a ``PIL.Image.Image``, an ``http(s)`` URL or a local file path
        :param prompt: the user prompt; with an image, ``<image>`` is prepended to it
        :param device: device the returned tensors are moved to
        :param input_interpolate64: use 64 image placeholder tokens instead of the computed count
        :param input_interpolate256: use 256 image placeholder tokens instead of the computed count
            (mutually exclusive with ``input_interpolate64``)
        :return: dict of model inputs; ``image`` and ``image_grid_thw`` are ``None`` without an image
        """
        pixel_values = None
        image_grid_thw = None
        generate_prefix = "<image>"

        if image_file is not None:
            if isinstance(image_file, Image.Image):
                image = image_file
            elif image_file.startswith("http"):
                # The standard library is used because `requests` is not imported
                # in this module. `urlopen` raises `HTTPError` on error statuses.
                from urllib.request import urlopen

                with urlopen(image_file) as response:
                    image_data = BytesIO(response.read())
                image = Image.open(image_data).convert("RGB")
            else:
                image = Image.open(image_file).convert("RGB")
            prompt = f"<image>\n {prompt}" if prompt else "<image>\n"
            image_inputs = self.vis_processor(images=image, videos=None)
            image_grid_thw = image_inputs["image_grid_thw"]  # [ 1 36 34]
            pixel_values = image_inputs["pixel_values"]  # (1224, 1176)

            # One placeholder per 4 grid cells; the same // 4 is used for the feature count below.
            num_query_token = torch.prod(image_grid_thw, dim=1) // 4
            assert num_query_token.shape[0] == 1

            assert prompt.count(LLAVA_DEFAULT_IMAGE_TOKEN) == 1

            assert not (input_interpolate64 is True and input_interpolate256 is True)
            if input_interpolate64 is True:
                num_patch_tokens = 64
            elif input_interpolate256 is True:
                num_patch_tokens = 256
            else:
                # Plain int so the string repetition does not rely on tensor operator fallbacks.
                num_patch_tokens = int(num_query_token[0])
            img_text = DEFAULT_IM_START_TOKEN + num_patch_tokens * DEFAULT_IMAGE_PATCH_TOKEN + DEFAULT_IM_END_TOKEN
            prompt = prompt.replace(LLAVA_DEFAULT_IMAGE_TOKEN, img_text).strip()

        ret = self.preprocess_text(prompt, generate_prefix)

        input_text = ret["input_text"]
        input_ids = ret["input_ids"].tolist()
        attention_mask = ret["attention_mask"]
        if attention_mask is not None:
            attention_mask = attention_mask.tolist()

        if image_file is not None:
            # Debug output: positions of the image start/end tokens in the sequence.
            image_start_indices = list(torch.where(torch.tensor(input_ids) == self.image_start_token)[0])
            image_end_indices = list(torch.where(torch.tensor(input_ids) == self.image_end_token)[0])
            print(image_start_indices, image_end_indices)

        # Only image inputs are supported here: audio and video markers must not appear.
        assert DEFAULT_AU_START_TOKEN not in input_text and DEFAULT_AU_END_TOKEN not in input_text
        assert DEFAULT_GEN_AU_START_TOKEN not in input_text and DEFAULT_GEN_AU_END_TOKEN not in input_text
        assert DEFAULT_VID_START_TOKEN not in input_text and DEFAULT_VID_END_TOKEN not in input_text

        attention_mask = torch.tensor(attention_mask, dtype=torch.int32)

        assert len(input_ids) == len(attention_mask)
        if image_grid_thw is not None:
            n_image_features = int(sum(torch.prod(image_grid_thw, dim=-1) // 4))
            n_image_tokens = input_ids.count(self.image_patch_token)
            # Only reported, not enforced: the interpolate options intentionally
            # use a fixed placeholder count.
            if n_image_tokens != n_image_features:
                print(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            image_grid_thw = image_grid_thw.tolist()

        input_image = pixel_values
        result = {
            "image": input_image.to(device) if input_image is not None else None,
            "image_grid_thw": torch.tensor(image_grid_thw).to(device) if image_grid_thw is not None else None,
            "decoder_image": torch.zeros(0, 3, 224, 224).to(device),
            "task_type": "others",
            "dataset_type": "image_text",
            "input_ids": torch.tensor(input_ids).unsqueeze(0).to(device),
            "position_ids": None,
            "generation_attention_mask": attention_mask.unsqueeze(0).to(device),
            "labels": None,
            "audio": None,
            "weights": None,
            "input_text": input_text,  # just for debug
        }
        return result
Ming_Uni/qwen2_5_vit.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch Qwen2_5_ViT model."""
21
+
22
+ import math
23
+
24
+ import os
25
+ import torch
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+
29
+ from transformers.activations import ACT2FN
30
+ from transformers.modeling_utils import PreTrainedModel
31
+ from transformers.utils import (
32
+ is_flash_attn_2_available,
33
+ logging,
34
+ )
35
+
36
+ from typing import Union
37
+
38
+ from transformers.configuration_utils import PretrainedConfig
39
+
40
+ if is_flash_attn_2_available():
41
+ from flash_attn import flash_attn_varlen_func
42
+ from flash_attn.layers.rotary import apply_rotary_emb
43
+
44
+ else:
45
+ flash_attn_varlen_func = None
46
+ apply_rotary_emb = None
47
+
48
+ logger = logging.get_logger(__name__)
49
+
50
class Qwen2_5_VLVisionConfig(PretrainedConfig):
    """Configuration for `Qwen2_5_VisionTransformer`.

    Every constructor argument is stored verbatim as an attribute of the same
    name. The attributes are read by the vision transformer and its
    sub-modules (patch embedding, attention blocks, patch merger).

    Args:
        depth: Number of transformer blocks.
        hidden_size: Width of the transformer blocks.
        hidden_act: Key into `ACT2FN` selecting the MLP activation.
        intermediate_size: Inner width of the gated MLP.
        num_heads: Number of attention heads.
        in_channels: Number of input image channels.
        patch_size: Spatial patch edge length.
        spatial_merge_size: Edge length of the patch group merged into one output token.
        temporal_patch_size: Temporal extent of one patch.
        tokens_per_second: Stored on the config; not used in this module.
        window_size: Window size used for windowed attention.
        out_hidden_size: Output width produced by the patch merger.
        fullatt_block_indexes: Indices of blocks that use full (non-windowed)
            attention. `None` selects the default `[7, 15, 23, 31]`.
        _attn_implementation: Key into `QWEN2_5_VL_VISION_ATTENTION_CLASSES`.
        **kwargs: Forwarded to `PretrainedConfig.__init__`.
    """

    model_type = "qwen2_5_vit"

    def __init__(
        self,
        depth=32,
        hidden_size=3584,
        hidden_act="silu",
        intermediate_size=3420,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=None,
        _attn_implementation="flash_attention_2",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        # A list literal as the default would be shared (and mutable) across
        # every config instance, so use a None sentinel and store a private copy.
        if fullatt_block_indexes is None:
            fullatt_block_indexes = [7, 15, 23, 31]
        self.fullatt_block_indexes = list(fullatt_block_indexes)
        self.out_hidden_size = out_hidden_size
        self._attn_implementation = _attn_implementation

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Load the config, using the nested `vision_config` section when present.

        Args:
            pretrained_model_name_or_path: Location passed to `get_config_dict`.
            **kwargs: Passed to `get_config_dict`; whatever it returns unused is
                forwarded to `from_dict`.

        Returns:
            The config built by `cls.from_dict`.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite checkpoint config keeps the vision settings under this key.
        if 'vision_config' in config_dict:
            config_dict = config_dict['vision_config']

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
103
+
104
class Qwen2_5_VLMLP(nn.Module):
    """Gated feed-forward layer: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``."""

    def __init__(self, config, bias: bool = False):
        """Build the three projections from `config.hidden_size` / `config.intermediate_size`.

        The activation is looked up in `ACT2FN` by `config.hidden_act`.
        """
        super().__init__()
        width, inner_width = config.hidden_size, config.intermediate_size
        self.hidden_size = width
        self.intermediate_size = inner_width
        self.gate_proj = nn.Linear(width, inner_width, bias=bias)
        self.up_proj = nn.Linear(width, inner_width, bias=bias)
        self.down_proj = nn.Linear(inner_width, width, bias=bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_state):
        """Apply the gated projection to `hidden_state` and return the result."""
        gate = self.act_fn(self.gate_proj(hidden_state))
        value = self.up_proj(hidden_state)
        return self.down_proj(gate * value)
116
+
117
class Qwen2_5_VisionPatchEmbed(nn.Module):
    """Embed flattened patches with a bias-free 3D convolution.

    The convolution's kernel and stride both equal one patch
    (`temporal_patch_size` x `patch_size` x `patch_size`), so each patch
    yields exactly one `embed_dim`-sized vector.
    """

    def __init__(
        self,
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        in_channels: int = 3,
        embed_dim: int = 1152,
    ) -> None:
        super().__init__()
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.in_channels = in_channels
        self.embed_dim = embed_dim

        patch_shape = [temporal_patch_size, patch_size, patch_size]
        self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=patch_shape, stride=patch_shape, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return one embedding row per patch, shape `(num_patches, embed_dim)`.

        The input is viewed as `(-1, in_channels, temporal_patch_size,
        patch_size, patch_size)` and cast to the projection weight's dtype.
        """
        patches = hidden_states.view(
            -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
        )
        patches = patches.to(dtype=self.proj.weight.dtype)
        return self.proj(patches).view(-1, self.embed_dim)
141
+
142
class Qwen2_5_VisionRotaryEmbedding(nn.Module):
    """Table of rotary angles: the outer product of positions and inverse frequencies."""

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
        # Non-persistent: the buffer follows the module's device but is not saved in state_dict.
        self.register_buffer("inv_freq", 1.0 / (theta ** exponents), persistent=False)

    def forward(self, seqlen: int) -> torch.Tensor:
        """Return angles of shape `(seqlen, len(inv_freq))` for positions `0..seqlen-1`."""
        inv_freq = self.inv_freq
        positions = torch.arange(seqlen, device=inv_freq.device, dtype=inv_freq.dtype)
        return torch.outer(positions, inv_freq)
152
+
153
class Qwen2RMSNorm(nn.Module):
    """Root-mean-square layer norm with a learned per-channel scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize over the last dimension in float32, then scale by `weight`."""
        source_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        # Cast back before scaling so the product follows the original type promotion.
        return self.weight * normalized.to(source_dtype)

    def extra_repr(self):
        shape = tuple(self.weight.shape)
        return f"{shape}, eps={self.variance_epsilon}"
171
+
172
class Qwen2_5_VLPatchMerger(nn.Module):
    """Normalize patch features, group `spatial_merge_size ** 2` of them per row, and project to `dim`."""

    def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
        super().__init__()
        merged_width = context_dim * (spatial_merge_size ** 2)
        self.hidden_size = merged_width
        self.ln_q = Qwen2RMSNorm(context_dim, eps=1e-6)
        self.mlp = nn.Sequential(
            nn.Linear(merged_width, merged_width),
            nn.GELU(),
            nn.Linear(merged_width, dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the merged features, one row of width `dim` per merged group."""
        normed = self.ln_q(x)
        # Consecutive rows are concatenated into one row of width `hidden_size`.
        grouped = normed.view(-1, self.hidden_size)
        return self.mlp(grouped)
186
+
187
def apply_rotary_pos_emb_flashatt(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    """Rotate `tensor` by the angles in `freqs` using flash-attn's `apply_rotary_emb`.

    The computation runs in float32; the result is cast back to `tensor`'s type.
    """
    cos_table = freqs.cos().float()
    sin_table = freqs.sin().float()
    rotated = apply_rotary_emb(tensor.float(), cos_table, sin_table)
    return rotated.type_as(tensor)
193
+
194
class Qwen2_5_VLVisionFlashAttention2(nn.Module):
    """Self-attention over packed variable-length sequences via `flash_attn_varlen_func`."""

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: torch.Tensor = None,
    ) -> torch.Tensor:
        """Attend within each segment delimited by `cu_seqlens`.

        Args:
            hidden_states: Packed token features; dim 0 is the total token count.
            cu_seqlens: Cumulative segment boundaries, passed to flash-attn for
                both queries and keys.
            rotary_pos_emb: Rotary angles applied to queries and keys.

        Returns:
            Tensor with one row per token, after the output projection.
        """
        num_tokens = hidden_states.shape[0]
        fused = self.qkv(hidden_states).reshape(num_tokens, 3, self.num_heads, -1)
        query, key, value = fused.permute(1, 0, 2, 3).unbind(0)
        query = apply_rotary_pos_emb_flashatt(query.unsqueeze(0), rotary_pos_emb).squeeze(0)
        key = apply_rotary_pos_emb_flashatt(key.unsqueeze(0), rotary_pos_emb).squeeze(0)

        segment_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
        longest = segment_lengths.max().item()
        context = flash_attn_varlen_func(query, key, value, cu_seqlens, cu_seqlens, longest, longest)
        context = context.reshape(num_tokens, -1)
        return self.proj(context)
218
+
219
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    split_at = x.shape[-1] // 2
    front, back = x[..., :split_at], x[..., split_at:]
    # (front, back) -> (-back, front) along the last dimension.
    return torch.cat((-back, front), dim=-1)
224
+
225
def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    """Apply rotary position embedding to `tensor` in float32 and cast back.

    `freqs` holds the rotation angles. Its cos/sin tables get a head axis,
    are tiled twice along the last dimension, and get a leading batch axis
    before being combined with `tensor` and `rotate_half(tensor)`.
    """
    source_dtype = tensor.dtype
    values = tensor.float()

    def _broadcastable(table: torch.Tensor) -> torch.Tensor:
        # Insert a head axis, tile the last dim twice, then add a batch axis.
        return table.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()

    cos_table = _broadcastable(freqs.cos())
    sin_table = _broadcastable(freqs.sin())
    rotated = (values * cos_table) + (rotate_half(values) * sin_table)
    return rotated.to(source_dtype)
235
+
236
class Qwen2_5_VLVisionAttention(nn.Module):
    """Eager (pure PyTorch) self-attention over packed variable-length sequences.

    Tokens only attend to tokens inside the same segment, where segments are
    given by consecutive entries of `cu_seqlens`. This is enforced with an
    additive block-diagonal mask.

    Note: the class header used to be duplicated, which left the module-level
    class without a usable `__init__`/`forward`; there is now a single class.
    """

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        """Create the fused QKV projection and the output projection.

        Args:
            dim: Feature width of the input and output.
            num_heads: Number of attention heads; `head_dim` is `dim // num_heads`.
        """
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
    ) -> torch.Tensor:
        """Attend within each segment delimited by `cu_seqlens`.

        Args:
            hidden_states: Packed token features; dim 0 is the total token count.
            cu_seqlens: Cumulative segment boundaries; entries `i - 1` and `i`
                bound segment `i`.
            rotary_pos_emb: Rotary angles applied to queries and keys.

        Returns:
            Tensor with one row per token, after the output projection.
        """
        seq_length = hidden_states.shape[0]
        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Start fully masked (most negative finite value), then open one
        # diagonal block per segment so attention never crosses a boundary.
        attention_mask = torch.full(
            [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
        )
        for i in range(1, len(cu_seqlens)):
            attention_mask[..., cu_seqlens[i - 1]: cu_seqlens[i], cu_seqlens[i - 1]: cu_seqlens[i]] = 0

        # (tokens, heads, head_dim) -> (heads, tokens, head_dim)
        q = q.transpose(0, 1)
        k = k.transpose(0, 1)
        v = v.transpose(0, 1)
        attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
        attn_weights = attn_weights + attention_mask
        # Softmax in float32, then return to the query dtype.
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
        attn_output = torch.matmul(attn_weights, v)
        attn_output = attn_output.transpose(0, 1)
        attn_output = attn_output.reshape(seq_length, -1)
        attn_output = self.proj(attn_output)
        return attn_output
270
+
271
class Qwen2_5_VLVisionSdpaAttention(nn.Module):
    """Self-attention over packed sequences using `F.scaled_dot_product_attention`."""

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
    ) -> torch.Tensor:
        """Attend within each segment delimited by `cu_seqlens`.

        Args:
            hidden_states: Packed token features; dim 0 is the total token count.
            cu_seqlens: Cumulative segment boundaries.
            rotary_pos_emb: Rotary angles applied to queries and keys.

        Returns:
            Tensor with one row per token, after the output projection.
        """
        num_tokens = hidden_states.shape[0]
        fused = self.qkv(hidden_states).reshape(num_tokens, 3, self.num_heads, -1)
        query, key, value = fused.permute(1, 0, 2, 3).unbind(0)
        query = apply_rotary_pos_emb_vision(query.unsqueeze(0), rotary_pos_emb).squeeze(0)
        key = apply_rotary_pos_emb_vision(key.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Boolean mask: True marks pairs that may attend to each other, i.e.
        # one diagonal block per segment.
        allowed = torch.zeros([1, num_tokens, num_tokens], device=query.device, dtype=torch.bool)
        for start, stop in zip(cu_seqlens[:-1], cu_seqlens[1:]):
            allowed[..., start:stop, start:stop] = True

        # (tokens, heads, head_dim) -> (heads, tokens, head_dim)
        query, key, value = (t.transpose(0, 1) for t in (query, key, value))
        context = F.scaled_dot_product_attention(query, key, value, allowed, dropout_p=0.0)
        context = context.transpose(0, 1).reshape(num_tokens, -1)
        return self.proj(context)
297
+
298
# Registry mapping an attention-implementation name to the module class that
# `Qwen2_5_VLVisionBlock` instantiates for it. All three classes take
# `(dim, num_heads=...)` and share the same `forward` signature.
QWEN2_5_VL_VISION_ATTENTION_CLASSES = {
    "eager": Qwen2_5_VLVisionAttention,
    "flash_attention_2": Qwen2_5_VLVisionFlashAttention2,
    "sdpa": Qwen2_5_VLVisionSdpaAttention,
}
303
+
304
class Qwen2_5_VLVisionBlock(nn.Module):
    """Pre-norm transformer block: attention and gated MLP, each with a residual connection."""

    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
        """Build the block; `attn_implementation` is a key of `QWEN2_5_VL_VISION_ATTENTION_CLASSES`."""
        super().__init__()
        width = config.hidden_size
        self.norm1 = Qwen2RMSNorm(width, eps=1e-6)
        self.norm2 = Qwen2RMSNorm(width, eps=1e-6)
        attention_cls = QWEN2_5_VL_VISION_ATTENTION_CLASSES[attn_implementation]
        self.attn = attention_cls(width, num_heads=config.num_heads)
        self.mlp = Qwen2_5_VLMLP(config, bias=True)

    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
        """Run attention then the MLP, adding each result to its input."""
        attended = self.attn(
            self.norm1(hidden_states),
            cu_seqlens=cu_seqlens,
            rotary_pos_emb=rotary_pos_emb,
        )
        hidden_states = hidden_states + attended
        transformed = self.mlp(self.norm2(hidden_states))
        return hidden_states + transformed
322
+
323
class Qwen2_5_VisionTransformer(PreTrainedModel):
    """Vision transformer mixing windowed and full attention, followed by a patch merger.

    Pipeline implemented by `forward`: patch embedding -> reorder tokens so
    that each attention window is contiguous -> transformer blocks -> patch
    merger -> restore the original token order.
    """

    config_class = Qwen2_5_VLVisionConfig
    _no_split_modules = ["Qwen2_5_VLVisionBlock"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def __init__(self, config, *inputs, **kwargs) -> None:
        """Build the patch embedding, rotary table, `config.depth` blocks and the merger."""
        super().__init__(config, *inputs, **kwargs)
        self.spatial_merge_size = config.spatial_merge_size
        self.patch_size = config.patch_size
        self.fullatt_block_indexes = config.fullatt_block_indexes
        self.window_size = config.window_size
        # Number of patches that the merger folds into one output token.
        self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size

        self.patch_embed = Qwen2_5_VisionPatchEmbed(
            patch_size=config.patch_size,
            temporal_patch_size=config.temporal_patch_size,
            in_channels=config.in_channels,
            embed_dim=config.hidden_size,
        )

        head_dim = config.hidden_size // config.num_heads
        # The table is built for head_dim // 2 because `rot_pos_emb` looks up
        # a (row, column) pair per patch and flattens the two results together.
        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)

        self.blocks = nn.ModuleList(
            [Qwen2_5_VLVisionBlock(config, config._attn_implementation) for _ in range(config.depth)]
        )
        self.merger = Qwen2_5_VLPatchMerger(
            dim=config.out_hidden_size,
            context_dim=config.hidden_size,
            spatial_merge_size=config.spatial_merge_size,
        )
        self.gradient_checkpointing = False

    def get_dtype(self) -> torch.dtype:
        """Return the dtype of the first block's `mlp.down_proj` weight."""
        return self.blocks[0].mlp.down_proj.weight.dtype

    def rot_pos_emb(self, grid_thw):
        """Compute rotary angles for every patch described by `grid_thw`.

        For each `(t, h, w)` row, the row and column index of every patch is
        generated, rearranged so the patches of one `spatial_merge_size` x
        `spatial_merge_size` group are adjacent, and repeated `t` times.

        Returns:
            Tensor with one row per patch: the row-index and column-index
            angles flattened together.
        """
        pos_ids = []
        for t, h, w in grid_thw:
            # Row index of every patch, regrouped into merge-size blocks.
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
            hpos_ids = hpos_ids.flatten()

            # Column index of every patch, regrouped the same way.
            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
            wpos_ids = wpos_ids.flatten()
            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
        pos_ids = torch.cat(pos_ids, dim=0)
        # One angle table large enough for the biggest height/width in the batch.
        max_grid_size = grid_thw[:, 1:].max()
        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb

    def get_window_index(self, grid_thw):
        """Compute the token permutation and segment boundaries for windowed attention.

        Works on the merged grid (`h // spatial_merge_size`, `w //
        spatial_merge_size`). Each frame is padded with the sentinel -100 to a
        multiple of the window size, cut into square windows, and the
        non-padding indices are read out window by window.

        Returns:
            A tuple `(window_index, cu_window_seqlens)`:
            `window_index` is a 1-D tensor of merged-token indices in window
            order; `cu_window_seqlens` is a Python list of cumulative window
            lengths counted in un-merged patches (scaled by
            `spatial_merge_unit`), starting with 0.
        """
        window_index: list = []
        cu_window_seqlens: list = [0]
        # Offset that makes the indices of each grid_thw row globally unique.
        window_index_id = 0
        # Window edge length measured in merged tokens.
        vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size

        for grid_t, grid_h, grid_w in grid_thw:
            llm_grid_h, llm_grid_w = (
                grid_h // self.spatial_merge_size,
                grid_w // self.spatial_merge_size,
            )
            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
            # Padding is between 1 and a full window: an already-divisible grid
            # still gains one all-padding window, which contributes length 0.
            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
            index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
            index_padded = index_padded.reshape(
                grid_t,
                num_windows_h,
                vit_merger_window_size,
                num_windows_w,
                vit_merger_window_size,
            )
            # Bring the two window axes together so each window is contiguous.
            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
                grid_t,
                num_windows_h * num_windows_w,
                vit_merger_window_size,
                vit_merger_window_size,
            )
            # Number of real (non-padding) tokens in every window.
            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
            index_padded = index_padded.reshape(-1)
            index_new = index_padded[index_padded != -100]
            window_index.append(index_new + window_index_id)
            cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
            # `.item()` requires the grid entries to be tensors, not plain ints.
            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
        window_index = torch.cat(window_index, dim=0)

        return window_index, cu_window_seqlens

    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.Tensor`):
                Flattened patch values. `patch_embed` views them as
                `(-1, in_channels, temporal_patch_size, patch_size, patch_size)`.
            grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
                The temporal, height and width of feature shape of each image in LLM.

        Returns:
            `torch.Tensor`: merged hidden states, one row per group of
            `spatial_merge_unit` patches, in the original (pre-window) order.
        """
        hidden_states = self.patch_embed(hidden_states)
        rotary_pos_emb = self.rot_pos_emb(grid_thw)
        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
        cu_window_seqlens = torch.tensor(
            cu_window_seqlens,
            device=hidden_states.device,
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        # Drop repeated boundaries produced by empty (all-padding) windows.
        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)

        seq_len, _ = hidden_states.size()
        # Permute tokens (and their rotary angles) into window order, moving
        # whole groups of `spatial_merge_unit` patches at a time.
        hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
        hidden_states = hidden_states[window_index, :, :]
        hidden_states = hidden_states.reshape(seq_len, -1)
        rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
        rotary_pos_emb = rotary_pos_emb[window_index, :, :]
        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)

        # Full-attention boundaries: one segment of h * w patches per frame.
        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
            dim=0,
            # Select dtype based on the following factors:
            #  - FA2 requires that cu_seqlens_q must have dtype int32
            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
            # See https://github.com/huggingface/transformers/pull/34852 for more information
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)

        for layer_num, blk in enumerate(self.blocks):
            # Blocks listed in `fullatt_block_indexes` attend over whole
            # frames; every other block attends within windows only.
            if layer_num in self.fullatt_block_indexes:
                cu_seqlens_now = cu_seqlens
            else:
                cu_seqlens_now = cu_window_seqlens
            if self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(
                    blk.__call__, hidden_states, cu_seqlens_now, rotary_pos_emb
                )
            else:
                hidden_states = blk(
                    hidden_states,
                    cu_seqlens=cu_seqlens_now,
                    rotary_pos_emb=rotary_pos_emb,
                )

        hidden_states = self.merger(hidden_states)

        # Undo the window permutation so rows follow the original order again.
        reverse_indices = torch.argsort(window_index)
        hidden_states = hidden_states[reverse_indices, :]

        return hidden_states
Ming_Uni/qwen2vl_processor.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """Image processor class for Qwen2-VL."""
21
+
22
+ import math
23
+ from typing import Dict, List, Optional, Union
24
+
25
+ import numpy as np
26
+
27
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
28
+ from transformers.image_transforms import (
29
+ convert_to_rgb,
30
+ resize,
31
+ to_channel_dimension_format,
32
+ )
33
+ from transformers.image_utils import (
34
+ OPENAI_CLIP_MEAN,
35
+ OPENAI_CLIP_STD,
36
+ ChannelDimension,
37
+ ImageInput,
38
+ PILImageResampling,
39
+ VideoInput,
40
+ get_image_size,
41
+ infer_channel_dimension_format,
42
+ is_scaled_image,
43
+ is_valid_image,
44
+ make_list_of_images,
45
+ to_numpy_array,
46
+ valid_images,
47
+ validate_preprocess_arguments,
48
+ )
49
+ from transformers.utils import TensorType, is_vision_available, logging
50
+
51
+
52
+ logger = logging.get_logger(__name__)
53
+
54
+ if is_vision_available():
55
+ from PIL import Image
56
+
57
+
58
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Normalize a single image, a list of images, or a nested list of images into one flat list.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A list of images. A nested list is flattened, a flat list is
        returned as-is, and a single image is wrapped in a list.

    Raises:
        ValueError: If `images` matches none of the accepted layouts.
    """
    is_sequence = isinstance(images, (list, tuple))

    # Nested list: flatten one level.
    if is_sequence and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        flattened = []
        for group in images:
            flattened.extend(group)
        return flattened

    # Already a flat list of images.
    if is_sequence and is_valid_image(images[0]):
        return images

    # A single image.
    if is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
79
+
80
+
81
+ # Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
82
def make_batched_videos(videos) -> List[VideoInput]:
    """Normalize video input into a list of videos, each a list of frames.

    Accepted layouts: a list of frame lists (returned unchanged), a flat list
    of PIL frames (treated as one video), a list of 4-D arrays (each split
    into its frames), or a single 4-D array (one video).

    Raises:
        ValueError: If `videos` matches none of the accepted layouts.
    """
    is_sequence = isinstance(videos, (list, tuple))

    if is_sequence and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
        return videos

    if is_sequence and is_valid_image(videos[0]):
        first = videos[0]
        if isinstance(first, Image.Image):
            # A flat list of PIL frames is a single video.
            return [videos]
        if len(first.shape) == 4:
            return [list(clip) for clip in videos]
        # Any other element shape falls through to the error below.
    elif is_valid_image(videos) and len(videos.shape) == 4:
        return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
96
+
97
+
98
def smart_resize(
    height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
):
    """Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.

    Neither returned dimension is ever smaller than `factor`. When that clamp
    applies (a very thin image), the pixel count may exceed `max_pixels`, but
    the result is never a zero-sized image.

    Args:
        height: Original image height in pixels.
        width: Original image width in pixels.
        factor: Both outputs are multiples of this value.
        min_pixels: Lower bound on `h_bar * w_bar`.
        max_pixels: Upper bound on `h_bar * w_bar`.

    Returns:
        Tuple `(h_bar, w_bar)` with the target height and width.

    Raises:
        ValueError: If both sides are at least `factor` and the aspect ratio
            exceeds 200.
    """

    if height < factor or width < factor:
        # Deliberately reported instead of raised: some OCRBench images have a
        # side smaller than `factor` and must still be processed.
        print(f"height:{height} or width:{width} smaller than factor:{factor}, resize small side to factor")

    elif max(height, width) / min(height, width) > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )

    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor

    # Rounding or flooring a side shorter than `factor` can give 0, which would
    # request an empty image. Clamping only at the end leaves every result that
    # was already valid unchanged.
    return max(factor, h_bar), max(factor, w_bar)
131
+
132
+
133
+ class Qwen2VLImageProcessor(BaseImageProcessor):
134
+ r"""
135
+ Constructs a Qwen2-VL image processor that dynamically resizes images based on the original images.
136
+
137
+ Args:
138
+ do_resize (`bool`, *optional*, defaults to `True`):
139
+ Whether to resize the image's (height, width) dimensions.
140
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
141
+ Resampling filter to use when resizing the image.
142
+ do_rescale (`bool`, *optional*, defaults to `True`):
143
+ Whether to rescale the image by the specified scale `rescale_factor`.
144
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
145
+ Scale factor to use if rescaling the image.
146
+ do_normalize (`bool`, *optional*, defaults to `True`):
147
+ Whether to normalize the image.
148
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
149
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
150
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
151
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
152
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
153
+ Whether to convert the image to RGB.
154
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
155
+ The min pixels of the image to resize the image.
156
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
157
+ The max pixels of the image to resize the image.
158
+ patch_size (`int`, *optional*, defaults to 14):
159
+ The spacial patch size of the vision encoder.
160
+ temporal_patch_size (`int`, *optional*, defaults to 2):
161
+ The temporal patch size of the vision encoder.
162
+ merge_size (`int`, *optional*, defaults to 2):
163
+ The merge size of the vision encoder to llm encoder.
164
+ """
165
+
166
+ model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
167
+
168
+ def __init__(
169
+ self,
170
+ do_resize: bool = True,
171
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
172
+ do_rescale: bool = True,
173
+ rescale_factor: Union[int, float] = 1 / 255,
174
+ do_normalize: bool = True,
175
+ image_mean: Optional[Union[float, List[float]]] = None,
176
+ image_std: Optional[Union[float, List[float]]] = None,
177
+ do_convert_rgb: bool = True,
178
+ min_pixels: int = 56 * 56,
179
+ max_pixels: int = 28 * 28 * 1280,
180
+ patch_size: int = 14,
181
+ temporal_patch_size: int = 2,
182
+ merge_size: int = 2,
183
+ **kwargs,
184
+ ) -> None:
185
+ super().__init__(**kwargs)
186
+ self.do_resize = do_resize
187
+ self.resample = resample
188
+ self.do_rescale = do_rescale
189
+ self.rescale_factor = rescale_factor
190
+ self.do_normalize = do_normalize
191
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
192
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
193
+ self.min_pixels = min_pixels
194
+ self.max_pixels = max_pixels
195
+ self.patch_size = patch_size
196
+ self.temporal_patch_size = temporal_patch_size
197
+ self.merge_size = merge_size
198
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
199
+ self.do_convert_rgb = do_convert_rgb
200
+
201
def _preprocess(
    self,
    images: Union[ImageInput, VideoInput],
    do_resize: bool = None,
    resample: PILImageResampling = None,
    do_rescale: bool = None,
    rescale_factor: float = None,
    do_normalize: bool = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = None,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
    """
    Preprocess one image (or the frames of one video) into flattened vision patches.

    Args:
        images (`ImageInput` or `VideoInput`):
            A single image or the list of frames of one video. Expects pixel values ranging from 0 to 255.
            If pixel values range from 0 to 1, set `do_rescale=False`.
        do_resize (`bool`, *optional*):
            Whether to resize every frame to the size chosen by `smart_resize`.
        resample (`PILImageResampling`, *optional*):
            Resampling filter to use if resizing the image.
        do_rescale (`bool`, *optional*):
            Whether to rescale the image by `rescale_factor`.
        rescale_factor (`float`, *optional*):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*):
            Whether to normalize the image with `image_mean` / `image_std`.
        image_mean (`float` or `List[float]`, *optional*):
            Mean to use if normalizing the image.
        image_std (`float` or `List[float]`, *optional*):
            Standard deviation to use if normalizing the image.
        do_convert_rgb (`bool`, *optional*):
            Whether to convert the image to RGB.
        data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format each frame is converted to before patching.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the input. Inferred from the first frame when unset.

    Returns:
        A tuple `(flatten_patches, (grid_t, grid_h, grid_w))`. `flatten_patches` has shape
        `(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size)`.
    """
    images = make_list_of_images(images)

    if do_convert_rgb:
        images = [convert_to_rgb(image) for image in images]

    # All transformations expect numpy arrays.
    images = [to_numpy_array(image) for image in images]

    if is_scaled_image(images[0]) and do_rescale:
        logger.warning_once(
            "It looks like you are trying to rescale already rescaled images. If the input"
            " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
        )
    if input_data_format is None:
        # We assume that all images have the same channel dimension format.
        input_data_format = infer_channel_dimension_format(images[0])

    # The target size depends only on the first frame, so compute it once
    # instead of once per frame.
    height, width = get_image_size(images[0], channel_dim=input_data_format)
    resized_height, resized_width = height, width
    if do_resize:
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,
            min_pixels=self.min_pixels,
            max_pixels=self.max_pixels,
        )

    processed_images = []
    for image in images:
        if do_resize:
            image = resize(
                image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
            )

        if do_rescale:
            image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)

        if do_normalize:
            image = self.normalize(
                image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
            )

        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
        processed_images.append(image)

    patches = np.array(processed_images)
    if data_format == ChannelDimension.LAST:
        patches = patches.transpose(0, 3, 1, 2)

    # The reshape below needs the frame count to be a multiple of
    # temporal_patch_size. Repeat the last frame to fill the final temporal
    # patch; for a single frame this yields the same array as tiling it.
    missing_frames = (-patches.shape[0]) % self.temporal_patch_size
    if missing_frames:
        padding = np.repeat(patches[-1:], missing_frames, axis=0)
        patches = np.concatenate([patches, padding], axis=0)

    channel = patches.shape[1]
    grid_t = patches.shape[0] // self.temporal_patch_size
    grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
    patches = patches.reshape(
        grid_t,
        self.temporal_patch_size,
        channel,
        grid_h // self.merge_size,
        self.merge_size,
        self.patch_size,
        grid_w // self.merge_size,
        self.merge_size,
        self.patch_size,
    )
    # Reorder to (grid_t, h_block, w_block, merge_h, merge_w, channel, temporal, patch_h, patch_w).
    patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
    flatten_patches = patches.reshape(
        grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
    )

    return flatten_patches, (grid_t, grid_h, grid_w)
319
+
320
def preprocess(
    self,
    images: ImageInput,
    videos: VideoInput = None,
    do_resize: bool = None,
    size: Dict[str, int] = None,
    resample: PILImageResampling = None,
    do_rescale: bool = None,
    rescale_factor: float = None,
    do_normalize: bool = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
    """
    Turn images and/or videos into flattened vision patches plus their grid sizes.

    Args:
        images (`ImageInput`):
            Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
            passing in images with pixel values between 0 and 1, set `do_rescale=False`. May be `None`.
        videos (`VideoInput`):
            Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
            passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
        do_resize (`bool`, *optional*, defaults to `self.do_resize`):
            Whether to resize the image.
        size (`Dict[str, int]`, *optional*, defaults to `self.size`):
            Only passed to `validate_preprocess_arguments` here; the resize target itself is computed in
            `_preprocess` from `self.min_pixels` / `self.max_pixels`.
        resample (`int`, *optional*, defaults to `self.resample`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
        do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
            Rescale factor to rescale the image by if `do_rescale` is set to `True`.
        do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
            Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
        image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
            Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
            `True`.
        do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
            Whether to convert the image to RGB.
        return_tensors (`str` or `TensorType`, *optional*):
            Accepted for interface compatibility but currently ignored: the result is always built with
            `tensor_type="pt"`.
        data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format each frame is converted to before patching.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. If unset, it is inferred from the input image.

    Returns:
        `BatchFeature` holding `pixel_values` / `image_grid_thw` when images are given and
        `pixel_values_videos` / `video_grid_thw` when videos are given. Both pairs are present when both
        inputs are given; the feature is empty when neither is.

    Raises:
        ValueError: if `images` is given but is not a supported image type.
    """
    do_resize = self.do_resize if do_resize is None else do_resize
    size = self.size if size is None else size
    resample = self.resample if resample is None else resample
    do_rescale = self.do_rescale if do_rescale is None else do_rescale
    rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
    do_normalize = self.do_normalize if do_normalize is None else do_normalize
    image_mean = self.image_mean if image_mean is None else image_mean
    image_std = self.image_std if image_std is None else image_std
    do_convert_rgb = self.do_convert_rgb if do_convert_rgb is None else do_convert_rgb

    if images is not None:
        images = make_batched_images(images)
    if videos is not None:
        videos = make_batched_videos(videos)

    if images is not None and not valid_images(images):
        raise ValueError(
            "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
            "torch.Tensor, tf.Tensor or jax.ndarray."
        )

    validate_preprocess_arguments(
        rescale_factor=rescale_factor,
        do_normalize=do_normalize,
        image_mean=image_mean,
        image_std=image_std,
        do_resize=do_resize,
        size=size,
        resample=resample,
    )

    # Options shared by every `_preprocess` call below.
    shared_options = dict(
        do_resize=do_resize,
        resample=resample,
        do_rescale=do_rescale,
        rescale_factor=rescale_factor,
        do_normalize=do_normalize,
        image_mean=image_mean,
        image_std=image_std,
        data_format=data_format,
        do_convert_rgb=do_convert_rgb,
        input_data_format=input_data_format,
    )

    # Accumulate into one dict so image and video features can coexist and
    # the dict always exists, even when no visual input was passed.
    data = {}

    if images is not None:
        pixel_values, vision_grid_thws = [], []
        for image in images:
            patches, image_grid_thw = self._preprocess(image, **shared_options)
            pixel_values.extend(patches)
            vision_grid_thws.append(image_grid_thw)
        data["pixel_values"] = np.array(pixel_values)
        data["image_grid_thw"] = np.array(vision_grid_thws)

    if videos is not None:
        pixel_values, vision_grid_thws = [], []
        for frames in videos:
            patches, video_grid_thw = self._preprocess(frames, **shared_options)
            pixel_values.extend(patches)
            vision_grid_thws.append(video_grid_thw)
        data["pixel_values_videos"] = np.array(pixel_values)
        data["video_grid_thw"] = np.array(vision_grid_thws)

    # NOTE: the tensor type is deliberately forced to PyTorch; `return_tensors` is not honoured.
    return BatchFeature(data=data, tensor_type="pt")
Ming_Uni/sana_loss.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+
4
+ import copy
5
+ from diffusers import DPMSolverMultistepScheduler
6
+ import os
7
+ from collections import OrderedDict
8
+ import logging
9
+ from safetensors.torch import load_file
10
+ from diffusers import (
11
+ AutoencoderDC,
12
+ FlowMatchEulerDiscreteScheduler,
13
+ SanaTransformer2DModel
14
+ )
15
+ import torch.nn as nn
16
+ from .pipeline_sana import SanaPipeline
17
+ # from flux_encoder import tokenize_prompt, encode_prompt
18
+
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
class ToClipMLP(nn.Module):
    """Two-layer projection MLP: Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm.

    Args:
        input_dim: Size of the last dimension of the input features.
        output_dim: Size of the last dimension of the projected features.
        hidden_dim: Width of the intermediate layer (2048 by default, the
            value that was previously hard-coded).
    """

    def __init__(self, input_dim, output_dim, hidden_dim=2048):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.layer_norm1 = nn.LayerNorm(hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.layer_norm2 = nn.LayerNorm(output_dim)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states` from `input_dim` to `output_dim` along the last axis."""
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states = self.relu(hidden_states)
        hidden_states = self.fc2(hidden_states)
        hidden_states = self.layer_norm2(hidden_states)
        return hidden_states
57
+
58
+
59
class SanaModel_withMLP(nn.Module):
    """Wrap a Sana transformer so its conditioning is first projected by a `ToClipMLP`.

    Args:
        sana: The wrapped transformer; its `config` attribute is re-exported on this wrapper.
        vision_dim: Feature size of the incoming conditioning, projected to 2304 by the MLP.
    """

    def __init__(self, sana, vision_dim=1152):
        super().__init__()
        self.sana = sana
        self.dtype = torch.bfloat16
        self.mlp = ToClipMLP(vision_dim, 2304)
        self.config = sana.config

    def forward(
        self,
        hidden_states,
        timestep,
        encoder_hidden_states,
        return_dict,
        encoder_attention_mask=None,
        **kargs,
    ):
        """Project the conditioning through the MLP, then run the wrapped transformer.

        `return_dict` is accepted but not forwarded: the wrapped model is always
        called with `return_dict=False` and its result is returned unchanged.
        """
        projected_condition = self.mlp(encoder_hidden_states)
        return self.sana(
            hidden_states=hidden_states,
            encoder_hidden_states=projected_condition,
            encoder_attention_mask=encoder_attention_mask,
            timestep=timestep,
            return_dict=False,
            **kargs,
        )

    def enable_gradient_checkpointing(self):
        """Delegate gradient checkpointing to the wrapped transformer."""
        self.sana.enable_gradient_checkpointing()
88
+
89
def inference_load_denoising_pretrained_weights(
    net,
    weights_path,
    names=None,
    prefix_to_remove=None,
):
    """Load a safetensors checkpoint into `net` non-strictly.

    Keys that are missing from or unexpected in the checkpoint are tolerated
    (`strict=False`). `names` and `prefix_to_remove` are accepted so the
    signature mirrors `load_denoising_pretrained_weights`, but they are unused.
    """
    checkpoint_tensors = load_file(weights_path)
    net.load_state_dict(checkpoint_tensors, strict=False)
99
+
100
+
101
def load_denoising_pretrained_weights(
    net,
    weights_path,
    names=None,
    prefix_to_remove=None,
):
    """Load a `torch.save`d checkpoint into `net` with strict key matching.

    Args:
        net: Module that receives the weights.
        weights_path: Path to the checkpoint. The state dict may be stored
            directly or nested under a top-level "model" or "net" key.
        names: Optional list of checkpoint keys to load; when given, only
            these entries are passed to `net.load_state_dict`.
        prefix_to_remove: Optional prefix stripped from each selected key
            before loading. Keys that do not start with it are kept as-is.

    Raises:
        KeyError: if an entry of `names` is not in the checkpoint.
        RuntimeError: from `load_state_dict` when keys or shapes do not match.
    """
    # NOTE(security): torch.load unpickles the file; only use it on trusted
    # checkpoints (or pass weights_only=True if the files are pure tensors).
    state_dict = torch.load(weights_path, map_location="cpu")
    if "model" in state_dict:
        state_dict = state_dict["model"]
    elif "net" in state_dict:
        state_dict = state_dict["net"]

    if names is not None:
        selected_state_dict = OrderedDict()
        for ori_name in names:
            # Only strip the prefix when it is really there; slicing blindly
            # would mangle keys that do not carry it.
            if prefix_to_remove and ori_name.startswith(prefix_to_remove):
                name = ori_name[len(prefix_to_remove):]
            else:
                name = ori_name
            selected_state_dict[name] = state_dict[ori_name]

        state_dict = selected_state_dict

    net.load_state_dict(state_dict, strict=True)
127
+
128
+
129
class SANALoss(torch.nn.Module):
    """Bundle a Sana transformer (wrapped with a projection MLP), its VAE and schedulers.

    The constructor loads the VAE and transformer from `model_path`, optionally
    restores checkpoints, wraps the transformer in `SanaModel_withMLP`, and
    loads a flow-matching scheduler from `scheduler_path`. `sample` runs a
    `SanaPipeline` with pre-computed conditioning embeddings.
    """

    def __init__(self, model_path, scheduler_path, vision_dim=3584, diffusion_type='flow_matching', convert_vpred_to_xpred=True, checkpoint_path=None, checkpoint_path_withmlp=None, mlp_checkpoint_path=None, trainable_params='all', device='cpu', guidance_scale=3.5, revision=None, variant=None, repa_loss=False, mid_layer_idx=10, mid_loss_weight=1.0):
        """
        Args:
            model_path: Pretrained model directory with "scheduler", "vae" and "transformer" subfolders.
            scheduler_path: Directory whose "scheduler" subfolder holds the flow-matching scheduler config.
            vision_dim: Feature size of the conditioning fed to the projection MLP.
            checkpoint_path: Optional checkpoint for the bare transformer (strict load).
            checkpoint_path_withmlp: Optional checkpoint for the MLP-wrapped transformer (strict load).
            mlp_checkpoint_path: Optional safetensors checkpoint for the wrapped model (non-strict load);
                only consulted when `checkpoint_path_withmlp` is None.
            trainable_params: 'all', or name fragment(s) selecting the sub-modules to train.
            device: Device used when CUDA is unavailable; with CUDA the current CUDA device is used.
            revision, variant: Forwarded to the `from_pretrained` calls.
            repa_loss: Whether to build the intermediate-layer MLP.
            mid_layer_idx, mid_loss_weight: Stored on the instance; not otherwise used in this class.
            diffusion_type, convert_vpred_to_xpred, guidance_scale: Accepted but unused here.
        """
        super().__init__()
        self.torch_type = torch.bfloat16
        self.base_model_path = model_path
        self.use_mid_loss = repa_loss
        self.mid_loss_weight = mid_loss_weight
        self.mid_layer_idx = mid_layer_idx
        self.scheduler = DPMSolverMultistepScheduler.from_pretrained(model_path, subfolder="scheduler")

        # Keep the historical behaviour (current CUDA device) when CUDA exists,
        # but fall back to the `device` argument instead of crashing on CPU-only hosts.
        if torch.cuda.is_available():
            self.device = torch.device(torch.cuda.current_device())
        else:
            self.device = torch.device(device)
        self.scheduler_path = scheduler_path
        self.vae = AutoencoderDC.from_pretrained(
            model_path,
            subfolder="vae",
            revision=revision,
            variant=variant,
        )
        self.vae.requires_grad_(False)

        self.train_model = SanaTransformer2DModel.from_pretrained(
            model_path, subfolder="transformer", revision=revision, variant=variant
        )

        if checkpoint_path is not None:
            assert os.path.exists(checkpoint_path)
            load_denoising_pretrained_weights(self.train_model, checkpoint_path)

        self.train_model = SanaModel_withMLP(self.train_model, vision_dim=vision_dim)
        if checkpoint_path_withmlp is not None:
            assert os.path.exists(checkpoint_path_withmlp)
            load_denoising_pretrained_weights(self.train_model, checkpoint_path_withmlp)
        elif mlp_checkpoint_path is not None:
            assert os.path.exists(mlp_checkpoint_path)
            inference_load_denoising_pretrained_weights(self.train_model, mlp_checkpoint_path)

        # Create the MLP that processes intermediate-layer features.
        hidden_dim = 2240
        self.mid_layer_mlp = None
        if self.use_mid_loss:
            self.mid_layer_mlp = torch.nn.Sequential(
                torch.nn.Linear(hidden_dim, hidden_dim * 2),
                torch.nn.GELU(),
                torch.nn.Linear(hidden_dim * 2, 32),
                torch.nn.LayerNorm(32)
            )

            # Initialise the MLP weights.
            for m in self.mid_layer_mlp.modules():
                if isinstance(m, torch.nn.Linear):
                    # Kaiming initialisation for the weights.
                    torch.nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in', nonlinearity='leaky_relu')
                    if m.bias is not None:
                        # Biases start at zero.
                        torch.nn.init.zeros_(m.bias)

        self.train_model.enable_gradient_checkpointing()

        # Also reports the trainable-parameter count.
        self.set_trainable_params(trainable_params)

        self.noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
            self.scheduler_path, subfolder="scheduler"
        )
        self.noise_scheduler_copy = copy.deepcopy(self.noise_scheduler)

        logger.info("Preparation done. Starting training diffusion ...")

    def get_sigmas(self, timesteps, n_dim=4, dtype=torch.float32):
        """Look up the scheduler sigma for each timestep.

        Args:
            timesteps: 1-D tensor of timesteps; each must occur exactly once in the scheduler's timesteps.
            n_dim: Trailing singleton dimensions are appended until the result has this many dimensions.
            dtype: dtype of the returned sigmas.

        Returns:
            Tensor with one sigma per timestep, of shape `(len(timesteps), 1, ..., 1)`.
        """
        sigmas = self.noise_scheduler_copy.sigmas.to(dtype=dtype)
        schedule_timesteps = self.noise_scheduler_copy.timesteps.to(device=timesteps.device)
        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]

        sigma = sigmas[step_indices].flatten()
        while len(sigma.shape) < n_dim:
            sigma = sigma.unsqueeze(-1)
        return sigma

    def compute_text_embeddings(self, prompt, text_encoders, tokenizers):
        """Encode `prompt` without gradients and return `(prompt_embeds, pooled_prompt_embeds, text_ids)`.

        NOTE(review): `encode_prompt` is not imported in this module (its import
        from `flux_encoder` is commented out at the top of the file), so this
        method raises NameError until that import is restored.
        """
        with torch.no_grad():
            prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
                [text_encoders], [tokenizers], prompt, 77
            )
            # `local_rank` was never defined here; use this module's device instead.
            pooled_prompt_embeds = pooled_prompt_embeds.to(self.device)
        return prompt_embeds, pooled_prompt_embeds, text_ids

    def set_trainable_params(self, trainable_params):
        """Freeze the VAE and choose which parts of `train_model` receive gradients.

        Args:
            trainable_params: 'all' to train the whole wrapped transformer, or a
                name fragment / iterable of fragments; every sub-module whose
                qualified name contains a fragment is made trainable.
        """
        self.vae.requires_grad_(False)

        if trainable_params == 'all':
            self.train_model.requires_grad_(True)
        else:
            # A bare string is one fragment, not a sequence of characters.
            if isinstance(trainable_params, str):
                trainable_params = [trainable_params]
            self.train_model.requires_grad_(False)
            for name, module in self.train_model.named_modules():
                if any(fragment in name for fragment in trainable_params):
                    for params in module.parameters():
                        params.requires_grad = True

        num_parameters = 0
        num_parameters_trainable = 0
        for p in self.train_model.parameters():
            num_parameters += p.data.nelement()
            if p.requires_grad:
                num_parameters_trainable += p.data.nelement()
        logging.getLogger(__name__).info(
            "train_model parameters: %d total, %d trainable", num_parameters, num_parameters_trainable
        )

    def sample(self, encoder_hidden_states, steps=20, cfg=7.0, seed=42, height=512, width=512):
        """Generate one image from pre-computed conditioning embeddings.

        Args:
            encoder_hidden_states: Conditioning embeddings; the first two dimensions define the attention mask shape.
            steps: Number of inference steps.
            cfg: Guidance scale passed to the pipeline.
            seed: Seed passed to `torch.manual_seed`, whose generator drives sampling.
            height, width: Requested output size.

        Returns:
            The first image produced by the pipeline.
        """
        self.pipelines = SanaPipeline(vae=self.vae,
                                      transformer=self.train_model,
                                      text_encoder=None,
                                      tokenizer=None,
                                      scheduler=self.noise_scheduler,
                                      ).to(self.device)

        prompt_attention_mask = torch.ones(encoder_hidden_states.shape[:2]).to(self.device)
        negative_attention_mask = torch.ones(encoder_hidden_states.shape[:2]).to(self.device)

        image = self.pipelines(
            prompt_embeds=encoder_hidden_states,
            prompt_attention_mask=prompt_attention_mask,
            # The negative branch uses all-zero embeddings of the same shape.
            negative_prompt_embeds=encoder_hidden_states*0,
            negative_prompt_attention_mask=negative_attention_mask,
            guidance_scale=cfg,
            generator=torch.manual_seed(seed),
            num_inference_steps=steps,
            device=self.device,
            height=height,
            width=width,
            max_sequence_length=300,
        ).images[0]

        return image
Ming_Uni/sana_transformer.py ADDED
@@ -0,0 +1,640 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Any, Dict, Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from torch import nn
20
+
21
+ # from ...configuration_utils import ConfigMixin, register_to_config
22
+ # from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
23
+ # from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
24
+ # from ..attention_processor import (
25
+ # Attention,
26
+ # AttentionProcessor,
27
+ # SanaLinearAttnProcessor2_0,
28
+ # )
29
+ # from ..embeddings import PatchEmbed, PixArtAlphaTextProjection, TimestepEmbedding, Timesteps
30
+ # from ..modeling_outputs import Transformer2DModelOutput
31
+ # from ..modeling_utils import ModelMixin
32
+ # from ..normalization import AdaLayerNormSingle, RMSNorm
33
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
34
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
35
+ from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
36
+ from diffusers.models.attention_processor import (
37
+ Attention,
38
+ AttentionProcessor,
39
+ SanaLinearAttnProcessor2_0,
40
+ )
41
+ from diffusers.models.embeddings import PatchEmbed, PixArtAlphaTextProjection, TimestepEmbedding, Timesteps
42
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
43
+ from diffusers.models.modeling_utils import ModelMixin
44
+ from diffusers.models.normalization import AdaLayerNormSingle, RMSNorm
45
+
46
+
47
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
48
+
49
+
50
class GLUMBConv(nn.Module):
    """Gated inverted-bottleneck convolution block.

    A 1x1 expansion, a depthwise 3x3 convolution, a SiLU-gated split of the
    channels, and a 1x1 projection, with optional RMS norm and residual add.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        expand_ratio: float = 4,
        norm_type: Optional[str] = None,
        residual_connection: bool = True,
    ) -> None:
        super().__init__()

        hidden_channels = int(expand_ratio * in_channels)
        # The expanded tensor carries both the value half and the gate half.
        doubled_channels = hidden_channels * 2

        self.norm_type = norm_type
        self.residual_connection = residual_connection

        self.nonlinearity = nn.SiLU()
        self.conv_inverted = nn.Conv2d(in_channels, doubled_channels, kernel_size=1, stride=1, padding=0)
        self.conv_depth = nn.Conv2d(
            doubled_channels, doubled_channels, kernel_size=3, stride=1, padding=1, groups=doubled_channels
        )
        self.conv_point = nn.Conv2d(hidden_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False)

        if norm_type == "rms_norm":
            self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)
        else:
            self.norm = None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the block to a channels-first feature map and return the result."""
        skip = hidden_states if self.residual_connection else None

        expanded = self.nonlinearity(self.conv_inverted(hidden_states))
        value, gate = self.conv_depth(expanded).chunk(2, dim=1)
        output = self.conv_point(value * self.nonlinearity(gate))

        if self.norm_type == "rms_norm":
            # RMSNorm works on the last axis, so move channels there and back.
            output = self.norm(output.movedim(1, -1)).movedim(-1, 1)

        if skip is not None:
            output = output + skip

        return output
95
+
96
+
97
class SanaModulatedNorm(nn.Module):
    """LayerNorm followed by a shift/scale modulation derived from a table plus an embedding."""

    def __init__(self, dim: int, elementwise_affine: bool = False, eps: float = 1e-6):
        super().__init__()
        self.norm = nn.LayerNorm(dim, elementwise_affine=elementwise_affine, eps=eps)

    def forward(
        self, hidden_states: torch.Tensor, temb: torch.Tensor, scale_shift_table: torch.Tensor
    ) -> torch.Tensor:
        """Normalize `hidden_states`, then apply `x * (1 + scale) + shift`.

        The table (with a leading batch axis added) and `temb` (with an axis
        inserted at position 1) are summed and split in two along dim 1: the
        first half is the shift, the second half the scale.
        """
        normalized = self.norm(hidden_states)
        modulation = scale_shift_table[None] + temb[:, None].to(scale_shift_table.device)
        shift, scale = modulation.chunk(2, dim=1)
        return normalized * (1 + scale) + shift
109
+
110
+
111
class SanaCombinedTimestepGuidanceEmbeddings(nn.Module):
    """Embed a timestep and a guidance value and combine them into one conditioning vector."""

    def __init__(self, embedding_dim):
        super().__init__()
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)

        self.guidance_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)

        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)

    def forward(self, timestep: torch.Tensor, guidance: torch.Tensor = None, hidden_dtype: torch.dtype = None):
        """Return `(linear(silu(c)), c)` where `c` is the sum of both embeddings.

        Both projections are cast to `hidden_dtype` before being embedded.
        `guidance` is used unconditionally despite its `None` default.
        """
        time_features = self.time_proj(timestep).to(dtype=hidden_dtype)
        guidance_features = self.guidance_condition_proj(guidance).to(dtype=hidden_dtype)

        conditioning = self.timestep_embedder(time_features) + self.guidance_embedder(guidance_features)
        modulation = self.linear(self.silu(conditioning))

        return modulation, conditioning
132
+
133
+
134
class SanaAttnProcessor2_0:
    r"""
    Attention processor built on `torch.nn.functional.scaled_dot_product_attention` (requires PyTorch 2.0).
    """

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("SanaAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run attention for `attn`; keys/values come from `encoder_hidden_states` when given, else from `hidden_states`."""
        # Self-attention when no encoder states are given, cross-attention otherwise.
        context = hidden_states if encoder_hidden_states is None else encoder_hidden_states
        batch_size, sequence_length, _ = context.shape

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        query = attn.to_q(hidden_states)
        key = attn.to_k(context)
        value = attn.to_v(context)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        num_heads = attn.heads
        head_dim = key.shape[-1] // num_heads

        def split_heads(tensor: torch.Tensor) -> torch.Tensor:
            # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
            return tensor.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)

        query, key, value = split_heads(query), split_heads(key), split_heads(value)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        attended = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge the heads back and restore the query dtype.
        attended = attended.transpose(1, 2).reshape(batch_size, -1, num_heads * head_dim)
        attended = attended.to(query.dtype)

        # to_out[0] is the output linear projection, to_out[1] the dropout.
        attended = attn.to_out[0](attended)
        attended = attn.to_out[1](attended)

        return attended / attn.rescale_output_factor
198
+
199
+
200
class SanaTransformerBlock(nn.Module):
    r"""
    Transformer block introduced in [Sana](https://huggingface.co/papers/2410.10629).

    The block runs, in order: timestep-modulated self-attention (`attn1`, using
    `SanaLinearAttnProcessor2_0`), optional cross-attention over `encoder_hidden_states` (`attn2`, using
    `SanaAttnProcessor2_0`), and a timestep-modulated `GLUMBConv` feed-forward applied on the tokens
    reshaped to a `(height, width)` grid.

    Args:
        dim (`int`, defaults to `2240`):
            Channel width of the block; used as the attention `query_dim` and the LayerNorm size.
        num_attention_heads (`int`, defaults to `70`):
            Number of self-attention heads.
        attention_head_dim (`int`, defaults to `32`):
            Channels per self-attention head.
        dropout (`float`, defaults to `0.0`):
            Dropout passed to both attention layers.
        num_cross_attention_heads (`int`, *optional*, defaults to `20`):
            Number of cross-attention heads.
        cross_attention_head_dim (`int`, *optional*, defaults to `112`):
            Channels per cross-attention head.
        cross_attention_dim (`int`, *optional*, defaults to `2240`):
            Width of `encoder_hidden_states`. When `None`, no cross-attention layer is built and the
            cross-attention step of `forward` is skipped.
        attention_bias (`bool`, defaults to `True`):
            Passed as `bias` to the self-attention layer (the cross-attention layer always uses `bias=True`).
        norm_elementwise_affine (`bool`, defaults to `False`):
            Whether `norm2` has learnable affine parameters (`norm1` never does).
        norm_eps (`float`, defaults to `1e-6`):
            Epsilon for both LayerNorm layers.
        attention_out_bias (`bool`, defaults to `True`):
            Passed as `out_bias` to the cross-attention layer.
        mlp_ratio (`float`, defaults to `2.5`):
            Expansion ratio handed to `GLUMBConv`.
        qk_norm (`str`, *optional*, defaults to `None`):
            Query/key normalization type forwarded to both attention layers.
    """

    def __init__(
        self,
        dim: int = 2240,
        num_attention_heads: int = 70,
        attention_head_dim: int = 32,
        dropout: float = 0.0,
        num_cross_attention_heads: Optional[int] = 20,
        cross_attention_head_dim: Optional[int] = 112,
        cross_attention_dim: Optional[int] = 2240,
        attention_bias: bool = True,
        norm_elementwise_affine: bool = False,
        norm_eps: float = 1e-6,
        attention_out_bias: bool = True,
        mlp_ratio: float = 2.5,
        qk_norm: Optional[str] = None,
    ) -> None:
        super().__init__()

        # 1. Self Attention
        self.norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=norm_eps)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            kv_heads=num_attention_heads if qk_norm is not None else None,
            qk_norm=qk_norm,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=None,
            processor=SanaLinearAttnProcessor2_0(),
        )

        # 2. Cross Attention
        # `norm2` is the norm applied before the feed-forward in `forward`, so it must exist even when
        # cross-attention is disabled. It is created here (before `attn2`) to keep the module
        # registration order identical to the cross-attention case.
        self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
        if cross_attention_dim is not None:
            self.attn2 = Attention(
                query_dim=dim,
                qk_norm=qk_norm,
                kv_heads=num_cross_attention_heads if qk_norm is not None else None,
                cross_attention_dim=cross_attention_dim,
                heads=num_cross_attention_heads,
                dim_head=cross_attention_head_dim,
                dropout=dropout,
                bias=True,
                out_bias=attention_out_bias,
                processor=SanaAttnProcessor2_0(),
            )
        else:
            # `forward` tests `self.attn2 is not None`; without this assignment that attribute lookup
            # would raise AttributeError on a block built without cross-attention.
            self.attn2 = None

        # 3. Feed-forward
        self.ff = GLUMBConv(dim, dim, mlp_ratio, norm_type=None, residual_connection=False)

        # Learned base for the six modulation vectors (shift/scale/gate for attention and for the MLP).
        self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        timestep: Optional[torch.LongTensor] = None,
        height: int = None,
        width: int = None,
    ) -> torch.Tensor:
        """Apply self-attention, optional cross-attention and the feed-forward to `hidden_states`.

        Args:
            hidden_states: Token sequence; dimension 1 must unflatten to `(height, width)`.
            attention_mask: Accepted for interface compatibility; not used by this block.
            encoder_hidden_states: Conditioning sequence for cross-attention.
            encoder_attention_mask: Mask forwarded to the cross-attention layer.
            timestep: Modulation tensor; reshaped to `(batch, 6, -1)` and added to
                `scale_shift_table`. Required despite the `None` default.
            height: Token-grid height used to reshape the sequence for the feed-forward.
            width: Token-grid width used to reshape the sequence for the feed-forward.

        Returns:
            The updated hidden states, same layout as the input.
        """
        batch_size = hidden_states.shape[0]

        # 1. Modulation
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
        ).chunk(6, dim=1)

        # 2. Self Attention
        norm_hidden_states = self.norm1(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
        norm_hidden_states = norm_hidden_states.to(hidden_states.dtype)

        attn_output = self.attn1(norm_hidden_states)
        hidden_states = hidden_states + gate_msa * attn_output

        # 3. Cross Attention (applied to the un-normalized hidden states, ungated residual)
        if self.attn2 is not None:
            attn_output = self.attn2(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
            )
            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

        # (batch, height * width, dim) -> (batch, dim, height, width) for the convolutional feed-forward,
        # then back to a token sequence.
        norm_hidden_states = norm_hidden_states.unflatten(1, (height, width)).permute(0, 3, 1, 2)
        ff_output = self.ff(norm_hidden_states)
        ff_output = ff_output.flatten(2, 3).permute(0, 2, 1)
        hidden_states = hidden_states + gate_mlp * ff_output

        return hidden_states
302
+
303
+
304
class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
    r"""
    A 2D Transformer model introduced in [Sana](https://huggingface.co/papers/2410.10629) family of models.

    Args:
        in_channels (`int`, defaults to `32`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `32`):
            The number of channels in the output. Falls back to `in_channels` when falsy.
        num_attention_heads (`int`, defaults to `70`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `32`):
            The number of channels in each head.
        num_layers (`int`, defaults to `20`):
            The number of layers of Transformer blocks to use.
        num_cross_attention_heads (`int`, *optional*, defaults to `20`):
            The number of heads to use for cross-attention.
        cross_attention_head_dim (`int`, *optional*, defaults to `112`):
            The number of channels in each head for cross-attention.
        cross_attention_dim (`int`, *optional*, defaults to `2240`):
            The number of channels in the cross-attention output.
        caption_channels (`int`, defaults to `2304`):
            The number of channels in the caption embeddings.
        mlp_ratio (`float`, defaults to `2.5`):
            The expansion ratio to use in the GLUMBConv layer.
        dropout (`float`, defaults to `0.0`):
            The dropout probability.
        attention_bias (`bool`, defaults to `False`):
            Whether to use bias in the attention layer.
        sample_size (`int`, defaults to `32`):
            The base size of the input latent.
        patch_size (`int`, defaults to `1`):
            The size of the patches to use in the patch embedding layer.
        norm_elementwise_affine (`bool`, defaults to `False`):
            Whether to use elementwise affinity in the normalization layer.
        norm_eps (`float`, defaults to `1e-6`):
            The epsilon value for the normalization layer.
        interpolation_scale (`int`, *optional*, defaults to `None`):
            Forwarded to `PatchEmbed`. When not `None`, the patch embedding is built with
            `pos_embed_type="sincos"`; otherwise with no positional embedding type.
        guidance_embeds (`bool`, defaults to `False`):
            Selects `SanaCombinedTimestepGuidanceEmbeddings` (when `True`) or `AdaLayerNormSingle` as the
            timestep embedding module.
        guidance_embeds_scale (`float`, defaults to `0.1`):
            Not read by the code in this class; presumably kept in the model config for callers.
        qk_norm (`str`, *optional*, defaults to `None`):
            The normalization to use for the query and key.
        timestep_scale (`float`, defaults to `1.0`):
            The scale to use for the timesteps. Not applied inside this class's `forward`.
    """

    _supports_gradient_checkpointing = True
    _no_split_modules = ["SanaTransformerBlock", "PatchEmbed", "SanaModulatedNorm"]
    _skip_layerwise_casting_patterns = ["patch_embed", "norm"]

    @register_to_config
    def __init__(
        self,
        in_channels: int = 32,
        out_channels: Optional[int] = 32,
        num_attention_heads: int = 70,
        attention_head_dim: int = 32,
        num_layers: int = 20,
        num_cross_attention_heads: Optional[int] = 20,
        cross_attention_head_dim: Optional[int] = 112,
        cross_attention_dim: Optional[int] = 2240,
        caption_channels: int = 2304,
        mlp_ratio: float = 2.5,
        dropout: float = 0.0,
        attention_bias: bool = False,
        sample_size: int = 32,
        patch_size: int = 1,
        norm_elementwise_affine: bool = False,
        norm_eps: float = 1e-6,
        interpolation_scale: Optional[int] = None,
        guidance_embeds: bool = False,
        guidance_embeds_scale: float = 0.1,
        qk_norm: Optional[str] = None,
        timestep_scale: float = 1.0,
    ) -> None:
        super().__init__()

        out_channels = out_channels or in_channels
        inner_dim = num_attention_heads * attention_head_dim

        # 1. Patch Embedding
        self.patch_embed = PatchEmbed(
            height=sample_size,
            width=sample_size,
            patch_size=patch_size,
            in_channels=in_channels,
            embed_dim=inner_dim,
            interpolation_scale=interpolation_scale,
            pos_embed_type="sincos" if interpolation_scale is not None else None,
        )

        # 2. Additional condition embeddings
        if guidance_embeds:
            self.time_embed = SanaCombinedTimestepGuidanceEmbeddings(inner_dim)
        else:
            self.time_embed = AdaLayerNormSingle(inner_dim)

        self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
        self.caption_norm = RMSNorm(inner_dim, eps=1e-5, elementwise_affine=True)

        # 3. Transformer blocks
        self.transformer_blocks = nn.ModuleList(
            [
                SanaTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout=dropout,
                    num_cross_attention_heads=num_cross_attention_heads,
                    cross_attention_head_dim=cross_attention_head_dim,
                    cross_attention_dim=cross_attention_dim,
                    attention_bias=attention_bias,
                    norm_elementwise_affine=norm_elementwise_affine,
                    norm_eps=norm_eps,
                    mlp_ratio=mlp_ratio,
                    qk_norm=qk_norm,
                )
                for _ in range(num_layers)
            ]
        )

        # 4. Output blocks
        self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
        self.norm_out = SanaModulatedNorm(inner_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)

        self.gradient_checkpointing = False

    @property
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    def register_block_hooks(self, block_indices=None):
        """
        Register forward hooks on the selected transformer blocks to capture their outputs.

        Args:
            block_indices (list, optional): Indices of the blocks to watch; `None` watches every block.
                Indices outside `[0, len(self.transformer_blocks))` (including negative ones) are
                silently skipped.

        Returns:
            tuple: `(block_outputs, hooks)`. `block_outputs` is a dict that each hook fills in place with
            `block index -> block output` whenever that block runs (it is empty until a forward pass);
            `hooks` is the list of hook handles, to be passed to `remove_hooks` when done.
        """
        block_outputs = {}
        hooks = []

        num_blocks = len(self.transformer_blocks)
        indices = block_indices if block_indices is not None else range(num_blocks)

        def make_hook(block_idx):
            # Bind the index per hook so each closure writes to its own key.
            def hook(module, inputs, output):
                block_outputs[block_idx] = output

            return hook

        for idx in indices:
            if idx < 0 or idx >= num_blocks:
                continue
            handle = self.transformer_blocks[idx].register_forward_hook(make_hook(idx))
            hooks.append(handle)

        return block_outputs, hooks

    def remove_hooks(self, hooks):
        """Remove every hook handle in `hooks` (as returned by `register_block_hooks`)."""
        for h in hooks:
            h.remove()

    @staticmethod
    def _mask_to_bias(mask: Optional[torch.Tensor], dtype: torch.dtype) -> Optional[torch.Tensor]:
        """Convert a 2-D keep/discard mask into an additive attention bias; pass anything else through."""
        # If ndim == 2 it's a mask rather than a bias:
        #     (1 = keep, 0 = discard)  ->  (keep = +0, discard = -10000.0)
        # A singleton query_tokens dimension is added so it broadcasts over attention scores:
        #     [batch, key_tokens] -> [batch, 1, key_tokens]
        if mask is not None and mask.ndim == 2:
            mask = (1 - mask.to(dtype)) * -10000.0
            mask = mask.unsqueeze(1)
        return mask

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        timestep: torch.Tensor,
        guidance: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]:
        """Run the transformer on a latent and return the prediction in the same spatial layout.

        Args:
            hidden_states: 4-D input latent, unpacked as `(batch, channels, height, width)`.
            encoder_hidden_states: Caption embeddings; projected by `caption_projection` and normalized.
            timestep: Timestep tensor fed to `time_embed`.
            guidance: Optional guidance tensor; when given it is passed to `time_embed` as `guidance=`.
            encoder_attention_mask: Optional mask for the caption tokens; a 2-D mask is converted to a bias.
            attention_mask: Optional mask for the latent tokens; a 2-D mask is converted to a bias.
            attention_kwargs: Optional dict; its `"scale"` entry (default `1.0`) is used as the LoRA scale
                when the PEFT backend is active. The caller's dict is not mutated.
            return_dict: When `False`, return a 1-tuple instead of `Transformer2DModelOutput`.

        Returns:
            `Transformer2DModelOutput(sample=output)` or `(output,)`.
        """
        # Record whether a scale was explicitly passed *before* popping it, otherwise the
        # "ineffective scale" warning below could never trigger.
        scale_was_passed = attention_kwargs is not None and attention_kwargs.get("scale", None) is not None

        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        elif scale_was_passed:
            logger.warning(
                "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
            )

        try:
            # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
            # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
            # the bias broadcasts over attention scores, which will be in one of the following shapes:
            #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
            #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
            attention_mask = self._mask_to_bias(attention_mask, hidden_states.dtype)

            # convert encoder_attention_mask to a bias the same way we do for attention_mask
            encoder_attention_mask = self._mask_to_bias(encoder_attention_mask, hidden_states.dtype)

            # 1. Input
            batch_size, _, height, width = hidden_states.shape
            p = self.config.patch_size
            post_patch_height, post_patch_width = height // p, width // p

            hidden_states = self.patch_embed(hidden_states)

            if guidance is not None:
                timestep, embedded_timestep = self.time_embed(
                    timestep, guidance=guidance, hidden_dtype=hidden_states.dtype
                )
            else:
                timestep, embedded_timestep = self.time_embed(
                    timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype
                )

            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])

            encoder_hidden_states = self.caption_norm(encoder_hidden_states)

            # 2. Transformer blocks
            if torch.is_grad_enabled() and self.gradient_checkpointing:
                for block in self.transformer_blocks:
                    hidden_states = self._gradient_checkpointing_func(
                        block,
                        hidden_states,
                        attention_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                        timestep,
                        post_patch_height,
                        post_patch_width,
                    )

            else:
                for block in self.transformer_blocks:
                    hidden_states = block(
                        hidden_states,
                        attention_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                        timestep,
                        post_patch_height,
                        post_patch_width,
                    )

            # 3. Normalization
            hidden_states = self.norm_out(hidden_states, embedded_timestep, self.scale_shift_table)

            hidden_states = self.proj_out(hidden_states)

            # 4. Unpatchify: (batch, h * w, p * p * c) -> (batch, c, h * p, w * p)
            hidden_states = hidden_states.reshape(
                batch_size, post_patch_height, post_patch_width, self.config.patch_size, self.config.patch_size, -1
            )
            hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4)
            output = hidden_states.reshape(batch_size, -1, post_patch_height * p, post_patch_width * p)
        finally:
            # Always undo the LoRA scaling, even if the forward pass raised, so a failed call
            # does not leave the adapter layers scaled for subsequent calls.
            if USE_PEFT_BACKEND:
                # remove `lora_scale` from each PEFT layer
                unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
inference.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal image-editing example: load the Ming-Lite-Uni model, feed it one
# image plus an edit instruction, and save the generated picture.
import os
import torch
from Ming_Uni.MingUniInference import Ming_Uni_Inference
from Ming_Uni.process import MyProcessor

# Run on the currently selected CUDA device.
device = torch.device(torch.cuda.current_device())

# Load the model from the checkpoint directory, cast to bfloat16, move it to
# the device and switch to eval mode.
model_path = "../Ming-Lite-Uni/"
model = Ming_Uni_Inference(model_path)
model.to(torch.bfloat16)
model.to(device)
model.eval()

# The processor is built from the LLM sub-directory of the checkpoint.
llm_model = os.path.join(model_path, "qwen2_5_llm")
my_proc = MyProcessor(llm_model)

# Source image and the edit instruction to apply to it.
image_file = "tests/cake.jpg"
prompt = "add a candle on top of the cake"
inputs = my_proc.process(image_file=image_file, prompt=prompt, device=device)

# `image_gen_generate` returns a sequence; element 1 is the object that gets saved.
generated = model.image_gen_generate(
    inputs,
    steps=30,
    seed=42,
    cfg=5.0,
    height=512,
    width=512,
)
result = generated[1]
result.save("result.png")
tests/cake.jpg ADDED
tests/man.jpg ADDED