linoyts (HF Staff) committed
Commit 11a704b · verified · 1 Parent(s): 9b43057

Upload pipeline_qwenimage_edit_plus.py

qwenimage/pipeline_qwenimage_edit_plus.py ADDED
@@ -0,0 +1,891 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import math
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor

from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
from diffusers.loaders import QwenImageLoraLoaderMixin
from diffusers.models import AutoencoderKLQwenImage, QwenImageTransformer2DModel
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import is_torch_xla_available, logging, replace_example_docstring
from diffusers.utils.torch_utils import randn_tensor
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.qwenimage.pipeline_output import QwenImagePipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from PIL import Image
        >>> from diffusers import QwenImageEditPlusPipeline
        >>> from diffusers.utils import load_image

        >>> pipe = QwenImageEditPlusPipeline.from_pretrained("Qwen/Qwen-Image-Edit-2509", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")
        >>> image = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
        ... ).convert("RGB")
        >>> prompt = (
        ...     "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant colors"
        ... )
        >>> # Depending on the variant being used, the pipeline call will slightly vary.
        >>> # Refer to the pipeline documentation for more details.
        >>> image = pipe(image, prompt, num_inference_steps=50).images[0]
        >>> image.save("qwenimage_edit_plus.png")
        ```
"""

CONDITION_IMAGE_SIZE = 384 * 384
VAE_IMAGE_SIZE = 1024 * 1024
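# NOTE: condition images fed to the Qwen2.5-VL text encoder are resized to roughly CONDITION_IMAGE_SIZE total
# pixels, while the reference images encoded by the VAE target roughly VAE_IMAGE_SIZE total pixels
# (see `calculate_dimensions` below and the image preprocessing in `__call__`).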


# Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift
def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    mu = image_seq_len * m + b
    return mu


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


def calculate_dimensions(target_area, ratio):
    width = math.sqrt(target_area * ratio)
    height = width / ratio

    width = round(width / 32) * 32
    height = round(height / 32) * 32

    return width, height


class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
    r"""
    The Qwen-Image-Edit pipeline for image editing.

    Args:
        transformer ([`QwenImageTransformer2DModel`]):
            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        vae ([`AutoencoderKLQwenImage`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
            The multimodal text encoder, specifically the
            [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) variant.
        tokenizer ([`Qwen2Tokenizer`]):
            Tokenizer of class
            [Qwen2Tokenizer](https://huggingface.co/docs/transformers/en/model_doc/qwen2#transformers.Qwen2Tokenizer).
    """

    model_cpu_offload_seq = "text_encoder->transformer->vae"
    _callback_tensor_inputs = ["latents", "prompt_embeds"]

    def __init__(
        self,
        scheduler: FlowMatchEulerDiscreteScheduler,
        vae: AutoencoderKLQwenImage,
        text_encoder: Qwen2_5_VLForConditionalGeneration,
        tokenizer: Qwen2Tokenizer,
        processor: Qwen2VLProcessor,
        transformer: QwenImageTransformer2DModel,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            processor=processor,
            transformer=transformer,
            scheduler=scheduler,
        )
        self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
        self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16
        # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height have to be
        # divisible by the patch size. So the vae scale factor is multiplied by the patch size to account for this.
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
        self.tokenizer_max_length = 1024

        self.prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
        self.prompt_template_encode_start_idx = 64
        self.default_sample_size = 128

    # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._extract_masked_hidden
    def _extract_masked_hidden(self, hidden_states: torch.Tensor, mask: torch.Tensor):
        bool_mask = mask.bool()
        valid_lengths = bool_mask.sum(dim=1)
        selected = hidden_states[bool_mask]
        split_result = torch.split(selected, valid_lengths.tolist(), dim=0)

        return split_result

    def _get_qwen_prompt_embeds(
        self,
        prompt: Union[str, List[str]] = None,
        image: Optional[torch.Tensor] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        device = device or self._execution_device
        dtype = dtype or self.text_encoder.dtype

        prompt = [prompt] if isinstance(prompt, str) else prompt
        img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"
        if isinstance(image, list):
            base_img_prompt = ""
            for i, img in enumerate(image):
                base_img_prompt += img_prompt_template.format(i + 1)
        elif image is not None:
            base_img_prompt = img_prompt_template.format(1)
        else:
            base_img_prompt = ""

        template = self.prompt_template_encode

        drop_idx = self.prompt_template_encode_start_idx
        txt = [template.format(base_img_prompt + e) for e in prompt]

        model_inputs = self.processor(
            text=txt,
            images=image,
            padding=True,
            return_tensors="pt",
        ).to(device)

        outputs = self.text_encoder(
            input_ids=model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            pixel_values=model_inputs.pixel_values,
            image_grid_thw=model_inputs.image_grid_thw,
            output_hidden_states=True,
        )

        hidden_states = outputs.hidden_states[-1]
        split_hidden_states = self._extract_masked_hidden(hidden_states, model_inputs.attention_mask)
        split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
        attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
        max_seq_len = max([e.size(0) for e in split_hidden_states])
        prompt_embeds = torch.stack(
            [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
        )
        encoder_attention_mask = torch.stack(
            [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
        )

        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        return prompt_embeds, encoder_attention_mask

    # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline.encode_prompt
    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        image: Optional[torch.Tensor] = None,
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        prompt_embeds: Optional[torch.Tensor] = None,
        prompt_embeds_mask: Optional[torch.Tensor] = None,
        max_sequence_length: int = 1024,
    ):
        r"""

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            image (`torch.Tensor`, *optional*):
                image to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
        """
        device = device or self._execution_device

        prompt = [prompt] if isinstance(prompt, str) else prompt
        batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]

        if prompt_embeds is None:
            prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, image, device)

        _, seq_len, _ = prompt_embeds.shape
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
        prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1)
        prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len)

        return prompt_embeds, prompt_embeds_mask

    # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline.check_inputs
    def check_inputs(
        self,
        prompt,
        height,
        width,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        prompt_embeds_mask=None,
        negative_prompt_embeds_mask=None,
        callback_on_step_end_tensor_inputs=None,
        max_sequence_length=None,
    ):
        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
            logger.warning(
                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
            )

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and prompt_embeds_mask is None:
            raise ValueError(
                "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`."
            )
        if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None:
            raise ValueError(
                "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`."
            )

        if max_sequence_length is not None and max_sequence_length > 1024:
            raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}")

    @staticmethod
    # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._pack_latents
    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
        latents = latents.permute(0, 2, 4, 1, 3, 5)
        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)

        return latents

    @staticmethod
    # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._unpack_latents
    def _unpack_latents(latents, height, width, vae_scale_factor):
        batch_size, num_patches, channels = latents.shape

        # VAE applies 8x compression on images but we must also account for packing which requires
        # latent height and width to be divisible by 2.
        height = 2 * (int(height) // (vae_scale_factor * 2))
        width = 2 * (int(width) // (vae_scale_factor * 2))

        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
        latents = latents.permute(0, 3, 1, 4, 2, 5)

        latents = latents.reshape(batch_size, channels // (2 * 2), 1, height, width)

        return latents

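    # Note: `_pack_latents` maps (batch, channels, height, width) latents to a
    # (batch, (height // 2) * (width // 2), channels * 4) sequence of 2x2 patches, and `_unpack_latents`
    # inverts that mapping back to (batch, channels, 1, height, width) for the video-shaped VAE.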
    # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline._encode_vae_image
    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
        if isinstance(generator, list):
            image_latents = [
                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i], sample_mode="argmax")
                for i in range(image.shape[0])
            ]
            image_latents = torch.cat(image_latents, dim=0)
        else:
            image_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode="argmax")
        latents_mean = (
            torch.tensor(self.vae.config.latents_mean)
            .view(1, self.latent_channels, 1, 1, 1)
            .to(image_latents.device, image_latents.dtype)
        )
        latents_std = (
            torch.tensor(self.vae.config.latents_std)
            .view(1, self.latent_channels, 1, 1, 1)
            .to(image_latents.device, image_latents.dtype)
        )
        image_latents = (image_latents - latents_mean) / latents_std

        return image_latents

    def prepare_latents(
        self,
        images,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
    ):
        # VAE applies 8x compression on images but we must also account for packing which requires
        # latent height and width to be divisible by 2.
        height = 2 * (int(height) // (self.vae_scale_factor * 2))
        width = 2 * (int(width) // (self.vae_scale_factor * 2))

        shape = (batch_size, 1, num_channels_latents, height, width)

        image_latents = None
        if images is not None:
            if not isinstance(images, list):
                images = [images]
            all_image_latents = []
            for image in images:
                image = image.to(device=device, dtype=dtype)
                if image.shape[1] != self.latent_channels:
                    image_latents = self._encode_vae_image(image=image, generator=generator)
                else:
                    image_latents = image
                if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
                    # expand init_latents for batch_size
                    additional_image_per_prompt = batch_size // image_latents.shape[0]
                    image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
                elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
                    raise ValueError(
                        f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
                    )
                else:
                    image_latents = torch.cat([image_latents], dim=0)

                image_latent_height, image_latent_width = image_latents.shape[3:]
                image_latents = self._pack_latents(
                    image_latents, batch_size, num_channels_latents, image_latent_height, image_latent_width
                )
                all_image_latents.append(image_latents)
            image_latents = torch.cat(all_image_latents, dim=1)

        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )
        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
        else:
            latents = latents.to(device=device, dtype=dtype)

        return latents, image_latents

    @property
    def guidance_scale(self):
        return self._guidance_scale

    @property
    def attention_kwargs(self):
        return self._attention_kwargs

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @property
    def current_timestep(self):
        return self._current_timestep

    @property
    def interrupt(self):
        return self._interrupt

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        image: Optional[PipelineImageInput] = None,
        prompt: Union[str, List[str]] = None,
        negative_prompt: Union[str, List[str]] = None,
        true_cfg_scale: float = 4.0,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        sigmas: Optional[List[float]] = None,
        guidance_scale: Optional[float] = None,
        num_images_per_prompt: int = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        prompt_embeds_mask: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds_mask: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 512,
    ):
+ r"""
543
+ Function invoked when calling the pipeline for generation.
544
+
545
+ Args:
546
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
547
+ `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
548
+ numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
549
+ or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
550
+ list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
551
+ latents as `image`, but if passing latents directly it is not encoded again.
552
+ prompt (`str` or `List[str]`, *optional*):
553
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
554
+ instead.
555
+ negative_prompt (`str` or `List[str]`, *optional*):
556
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
557
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
558
+ not greater than `1`).
559
+ true_cfg_scale (`float`, *optional*, defaults to 1.0):
560
+ true_cfg_scale (`float`, *optional*, defaults to 1.0): Guidance scale as defined in [Classifier-Free
561
+ Diffusion Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of
562
+ equation 2. of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is
563
+ enabled by setting `true_cfg_scale > 1` and a provided `negative_prompt`. Higher guidance scale
564
+ encourages to generate images that are closely linked to the text `prompt`, usually at the expense of
565
+ lower image quality.
566
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
567
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
568
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
569
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
570
+ num_inference_steps (`int`, *optional*, defaults to 50):
571
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
572
+ expense of slower inference.
573
+ sigmas (`List[float]`, *optional*):
574
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
575
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
576
+ will be used.
577
+ guidance_scale (`float`, *optional*, defaults to None):
578
+ A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance
579
+ where the guidance scale is applied during inference through noise prediction rescaling, guidance
580
+ distilled models take the guidance scale directly as an input parameter during forward pass. Guidance
581
+ scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images
582
+ that are closely linked to the text `prompt`, usually at the expense of lower image quality. This
583
+ parameter in the pipeline is there to support future guidance-distilled models when they come up. It is
584
+ ignored when not using guidance distilled models. To enable traditional classifier-free guidance,
585
+ please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should
586
+ enable classifier-free guidance computations).
587
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
588
+ The number of images to generate per prompt.
589
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
590
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
591
+ to make generation deterministic.
592
+ latents (`torch.Tensor`, *optional*):
593
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
594
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
595
+ tensor will be generated by sampling using the supplied random `generator`.
596
+ prompt_embeds (`torch.Tensor`, *optional*):
597
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
598
+ provided, text embeddings will be generated from `prompt` input argument.
599
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
600
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
601
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
602
+ argument.
603
+ output_type (`str`, *optional*, defaults to `"pil"`):
604
+ The output format of the generate image. Choose between
605
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
606
+ return_dict (`bool`, *optional*, defaults to `True`):
607
+ Whether or not to return a [`~pipelines.qwenimage.QwenImagePipelineOutput`] instead of a plain tuple.
608
+ attention_kwargs (`dict`, *optional*):
609
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
610
+ `self.processor` in
611
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
612
+ callback_on_step_end (`Callable`, *optional*):
613
+ A function that calls at the end of each denoising steps during the inference. The function is called
614
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
615
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
616
+ `callback_on_step_end_tensor_inputs`.
617
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
618
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
619
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
620
+ `._callback_tensor_inputs` attribute of your pipeline class.
621
+ max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
622
+
623
+ Examples:
624
+
625
+ Returns:
626
+ [`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`:
627
+ [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
628
+ returning a tuple, the first element is a list with the generated images.
629
+ """
630
        image_size = image[-1].size if isinstance(image, list) else image.size
        calculated_width, calculated_height = calculate_dimensions(1024 * 1024, image_size[0] / image_size[1])
        height = height or calculated_height
        width = width or calculated_width

        multiple_of = self.vae_scale_factor * 2
        width = width // multiple_of * multiple_of
        height = height // multiple_of * multiple_of

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            height,
            width,
            negative_prompt=negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            prompt_embeds_mask=prompt_embeds_mask,
            negative_prompt_embeds_mask=negative_prompt_embeds_mask,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            max_sequence_length=max_sequence_length,
        )

        self._guidance_scale = guidance_scale
        self._attention_kwargs = attention_kwargs
        self._current_timestep = None
        self._interrupt = False

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # 3. Preprocess image
        if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
            if not isinstance(image, list):
                image = [image]
            condition_image_sizes = []
            condition_images = []
            vae_image_sizes = []
            vae_images = []
            for img in image:
                image_width, image_height = img.size
                condition_width, condition_height = calculate_dimensions(
                    CONDITION_IMAGE_SIZE, image_width / image_height
                )
                vae_width, vae_height = calculate_dimensions(VAE_IMAGE_SIZE, image_width / image_height)
                condition_image_sizes.append((condition_width, condition_height))
                vae_image_sizes.append((vae_width, vae_height))
                condition_images.append(self.image_processor.resize(img, condition_height, condition_width))
                vae_images.append(self.image_processor.preprocess(img, vae_height, vae_width).unsqueeze(2))

        has_neg_prompt = negative_prompt is not None or (
            negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
        )

        if true_cfg_scale > 1 and not has_neg_prompt:
            logger.warning(
                f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided."
            )
        elif true_cfg_scale <= 1 and has_neg_prompt:
            logger.warning(
                " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1"
            )

        do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
        prompt_embeds, prompt_embeds_mask = self.encode_prompt(
            image=condition_images,
            prompt=prompt,
            prompt_embeds=prompt_embeds,
            prompt_embeds_mask=prompt_embeds_mask,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
        )
        if do_true_cfg:
            negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(
                image=condition_images,
                prompt=negative_prompt,
                prompt_embeds=negative_prompt_embeds,
                prompt_embeds_mask=negative_prompt_embeds_mask,
                device=device,
                num_images_per_prompt=num_images_per_prompt,
                max_sequence_length=max_sequence_length,
            )

        # 4. Prepare latent variables
        num_channels_latents = self.transformer.config.in_channels // 4
        latents, image_latents = self.prepare_latents(
            vae_images,
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )
        img_shapes = [
            [
                (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2),
                *[
                    (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2)
                    for vae_width, vae_height in vae_image_sizes
                ],
            ]
        ] * batch_size
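        # `img_shapes` lists, per sample, the latent-patch grid of the generated image followed by the grid of
        # each reference image; these shapes are used to build the rotary position embeddings below.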

        # 5. Prepare timesteps
        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
        image_seq_len = latents.shape[1]
        mu = calculate_shift(
            image_seq_len,
            self.scheduler.config.get("base_image_seq_len", 256),
            self.scheduler.config.get("max_image_seq_len", 4096),
            self.scheduler.config.get("base_shift", 0.5),
            self.scheduler.config.get("max_shift", 1.15),
        )
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler,
            num_inference_steps,
            device,
            sigmas=sigmas,
            mu=mu,
        )
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
        self._num_timesteps = len(timesteps)

        # handle guidance
        if self.transformer.config.guidance_embeds and guidance_scale is None:
            raise ValueError("guidance_scale is required for guidance-distilled model.")
        elif self.transformer.config.guidance_embeds:
            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
            guidance = guidance.expand(latents.shape[0])
        elif not self.transformer.config.guidance_embeds and guidance_scale is not None:
            logger.warning(
                f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled."
            )
            guidance = None
        elif not self.transformer.config.guidance_embeds and guidance_scale is None:
            guidance = None

        if self.attention_kwargs is None:
            self._attention_kwargs = {}

        txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist() if prompt_embeds_mask is not None else None

        image_rotary_emb = self.transformer.pos_embed(img_shapes, txt_seq_lens, device=latents.device)
        if do_true_cfg:
            negative_txt_seq_lens = (
                negative_prompt_embeds_mask.sum(dim=1).tolist()
                if negative_prompt_embeds_mask is not None
                else None
            )
            uncond_image_rotary_emb = self.transformer.pos_embed(
                img_shapes, negative_txt_seq_lens, device=latents.device
            )
        else:
            uncond_image_rotary_emb = None

        # 6. Denoising loop
        self.scheduler.set_begin_index(0)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                self._current_timestep = t

                latent_model_input = latents
                if image_latents is not None:
                    latent_model_input = torch.cat([latents, image_latents], dim=1)

                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                timestep = t.expand(latents.shape[0]).to(latents.dtype)
                with self.transformer.cache_context("cond"):
                    noise_pred = self.transformer(
                        hidden_states=latent_model_input,
                        timestep=timestep / 1000,
                        guidance=guidance,
                        encoder_hidden_states_mask=prompt_embeds_mask,
                        encoder_hidden_states=prompt_embeds,
                        image_rotary_emb=image_rotary_emb,
                        attention_kwargs=self.attention_kwargs,
                        return_dict=False,
                    )[0]
                    noise_pred = noise_pred[:, : latents.size(1)]

                if do_true_cfg:
                    with self.transformer.cache_context("uncond"):
                        neg_noise_pred = self.transformer(
                            hidden_states=latent_model_input,
                            timestep=timestep / 1000,
                            guidance=guidance,
                            encoder_hidden_states_mask=negative_prompt_embeds_mask,
                            encoder_hidden_states=negative_prompt_embeds,
                            image_rotary_emb=uncond_image_rotary_emb,
                            attention_kwargs=self.attention_kwargs,
                            return_dict=False,
                        )[0]
                        neg_noise_pred = neg_noise_pred[:, : latents.size(1)]
                    comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)

                    cond_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
                    noise_norm = torch.norm(comb_pred, dim=-1, keepdim=True)
                    noise_pred = comb_pred * (cond_norm / noise_norm)
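                    # Rescale the guided prediction so its per-token norm matches the conditional prediction,
                    # which keeps large `true_cfg_scale` values from inflating the magnitude of the update.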

                # compute the previous noisy sample x_t -> x_t-1
                latents_dtype = latents.dtype
                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

                if latents.dtype != latents_dtype:
                    if torch.backends.mps.is_available():
                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                        latents = latents.to(latents_dtype)

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        self._current_timestep = None
        if output_type == "latent":
            image = latents
        else:
            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
            latents = latents.to(self.vae.dtype)
            latents_mean = (
                torch.tensor(self.vae.config.latents_mean)
                .view(1, self.vae.config.z_dim, 1, 1, 1)
                .to(latents.device, latents.dtype)
            )
            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
                latents.device, latents.dtype
            )
            latents = latents / latents_std + latents_mean
            image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
            image = self.image_processor.postprocess(image, output_type=output_type)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return QwenImagePipelineOutput(images=image)