yuanze1024 committed
Commit 1d5bb62 · 0 Parent(s)

init space
.gitattributes ADDED
@@ -0,0 +1 @@
1
+ *.glb filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,231 @@
1
+ import numpy as np
2
+ import torch
3
+ from einops import rearrange
4
+ from PIL import Image
5
+ from utils.image_generation import generate_image_condition
6
+ from utils.mesh_utils import Mesh
7
+ from utils.render_utils import render_views
8
+ from utils.texture_generation import generate_texture
9
+
10
+ import gradio as gr
11
+ from gradio_litmodel3d import LitModel3D
12
+
13
+ EXAMPLES = [
14
+ ["examples/birdhouse.glb", True, False, False, False, 42, "First View", "SDXL", False, "A rustic birdhouse featuring a snow-covered roof, wood textures, and two decorative cardinal birds. It has a circular entryway and conveys a winter-themed aesthetic."],
15
+ ["examples/mario.glb", False, False, False, True, 6666, "Third View", "FLUX", True, "Mario, a cartoon character wearing a red cap and blue overalls, with brown hair and a mustache, and white gloves, in a fighting pose. The clothes he wears are not in a reflection mode."],
16
+ ]
17
+
18
+ def tensor_to_pil(tensor, mask=None, normalize: bool = True):
19
+ """
20
+ Convert tensor to PIL Image.
21
+ :param tensor: torch.Tensor, shape can be (Nv, H, W, C), (Nv, C, H, W), (H, W, C), (C, H, W)
22
+ :param mask: torch.Tensor, same shape as tensor; used as an alpha channel when the image has 3 channels
23
+ :return: PIL.Image
24
+ """
25
+ # Move to cpu
26
+ tensor = tensor.detach()
27
+ if tensor.is_cuda:
28
+ tensor = tensor.cpu()
29
+ if mask is not None and mask.is_cuda:
30
+ mask = mask.cpu()
31
+
32
+ # Convert to float32
33
+ tensor = tensor.float()
34
+ if mask is not None:
35
+ mask = mask.float()
36
+
37
+ if normalize:
38
+ tensor = (tensor + 1.0) / 2.0
39
+ tensor = torch.clamp(tensor, 0.0, 1.0)
40
+ if mask is not None:
41
+ if mask.shape[-1] not in [1, 3]:
42
+ mask = mask.unsqueeze(-1)
43
+ tensor = torch.cat([tensor, mask], dim=-1)
44
+
45
+ shape = tensor.shape
46
+ # 4D: (Nv, H, W, C) or (Nv, C, H, W)
47
+ if len(shape) == 4:
48
+ Nv = shape[0]
49
+ if shape[-1] in [3, 4]: # (Nv, H, W, C)
50
+ tensor = rearrange(tensor, 'nv h w c -> h (nv w) c')
51
+ else: # (Nv, C, H, W)
52
+ tensor = rearrange(tensor, 'nv c h w -> h (nv w) c')
53
+ # 3D: (H, W, C) or (C, H, W)
54
+ elif len(shape) == 3:
55
+ if shape[-1] in [3, 4]: # (H, W, C)
56
+ tensor = rearrange(tensor, 'h w c -> h w c')
57
+ else: # (C, H, W)
58
+ tensor = rearrange(tensor, 'c h w -> h w c')
59
+ else:
60
+ raise ValueError(f"Unsupported tensor shape: {shape}")
61
+
62
+ # Convert to numpy
63
+ np_img = (tensor.numpy() * 255).round().astype(np.uint8)
64
+
65
+ # Create PIL Image
66
+ if np_img.shape[2] == 3:
67
+ return Image.fromarray(np_img, mode="RGB")
68
+ elif np_img.shape[2] == 4:
69
+ return Image.fromarray(np_img, mode="RGBA")
70
+ else:
71
+ raise ValueError("Only 3- or 4-channel images are supported.")
72
+
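A quick way to sanity-check `tensor_to_pil` outside the app (a minimal, hypothetical sketch, assuming `app.py` is importable; with `normalize=True` the function expects inputs in [-1, 1] and tiles multi-view batches side by side along the width):

```python
import torch
from app import tensor_to_pil

views = torch.rand(4, 256, 256, 3) * 2.0 - 1.0   # four fake views in (Nv, H, W, C), values in [-1, 1]
img = tensor_to_pil(views)                        # the views are tiled horizontally into one image
print(img.mode, img.size)                         # RGB (1024, 256)
```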
73
+ if __name__ == '__main__':
74
+ with gr.Blocks() as demo:
75
+ gr.Markdown("# 🎨 SeqTex: Generate Mesh Textures in Video Sequence")
76
+
77
+ gr.Markdown("""
78
+ ## 🚀 Welcome to SeqTex!
79
+ **SeqTex** is a cutting-edge AI system that generates high-quality textures for 3D meshes from image prompts (here, an image generator produces the prompt image from your text description).
80
+
81
+ Choose to either **try our example models** below or **upload your own 3D mesh** to create stunning textures.
82
+ """)
83
+
84
+ gr.Markdown("---")
85
+
86
+ gr.Markdown("## 🔧 Step 1: Upload & Process 3D Mesh")
87
+ gr.Markdown("""
88
+ **📋 How to prepare your 3D mesh:**
89
+ - Upload your 3D mesh in **.obj** or **.glb** format
90
+ - **💡 Pro Tip**:
91
+ - For optimal results, ensure your mesh includes only one part with <span style="color:#e74c3c; font-weight:bold;">UV parameterization</span>
92
+ - Otherwise, we'll combine all parts and generate UV parameterization using *xAtlas* (may take longer for high-poly meshes; may also fail for certain meshes)
93
+ - **⚠️ Important**: For best results, we recommend using the *Mesh Orientation Adjustments* below to make your model **Z-up oriented**
94
+ """)
95
+ position_map_tensor, normal_map_tensor, position_images_tensor, normal_images_tensor, mask_images_tensor, w2cs, mesh, mvp_matrix = gr.State(), gr.State(), gr.State(), gr.State(), gr.State(), gr.State(), gr.State(), gr.State()
96
+
97
+ # fixed_texture_map = Image.open("image.webp").convert("RGB")
98
+ # Step 1
99
+ with gr.Row():
100
+ with gr.Column():
101
+ mesh_upload = gr.File(label="📁 Upload 3D Mesh", file_types=[".obj", ".glb"])
102
+ # uv_tool = gr.Radio(["xAtlas", "UVAtlas"], label="UV parameterizer", value="xAtlas")
103
+
104
+ gr.Markdown("**🔄 Mesh Orientation Adjustments** (if needed):")
105
+ y2z = gr.Checkbox(label="Y → Z Transform", value=False, info="Rotate: Y becomes Z, -Z becomes Y")
106
+ y2x = gr.Checkbox(label="Y → X Transform", value=False, info="Rotate: Y becomes X, -X becomes Y")
107
+ z2x = gr.Checkbox(label="Z → X Transform", value=False, info="Rotate: Z becomes X, -X becomes Z")
108
+ upside_down = gr.Checkbox(label="🔃 Flip Vertically", value=False, info="Fix upside-down mesh orientation")
109
+
110
+ with gr.Column():
111
+ step1_button = gr.Button("🔄 Process Mesh & Generate Views", variant="primary")
112
+ step1_progress = gr.Textbox(label="📊 Processing Status", interactive=False)
113
+ model_input = gr.Model3D(label="📐 Processed 3D Model", height=500)
114
+
115
+ with gr.Row(equal_height=True):
116
+ rgb_views = gr.Image(label="📷 Generated Views (Front, Back, Left, Right)", type="pil", scale=3)
117
+ position_map = gr.Image(label="🗺️ Position Map", type="pil", scale=1)
118
+ normal_map = gr.Image(label="🧭 Normal Map", type="pil", scale=1)
119
+
120
+ step1_button.click(
121
+ Mesh.process,
122
+ inputs=[mesh_upload, gr.State("xAtlas"), y2z, y2x, z2x, upside_down],
123
+ outputs=[position_map_tensor, normal_map_tensor, position_images_tensor, normal_images_tensor, mask_images_tensor, w2cs, mesh, mvp_matrix, step1_progress]
124
+ ).then(
125
+ tensor_to_pil,
126
+ inputs=[normal_images_tensor, mask_images_tensor],
127
+ outputs=[rgb_views]
128
+ ).then(
129
+ tensor_to_pil,
130
+ inputs=[position_map_tensor],
131
+ outputs=[position_map]
132
+ ).then(
133
+ tensor_to_pil,
134
+ inputs=[normal_map_tensor],
135
+ outputs=[normal_map]
136
+ ).then(
137
+ Mesh.export,
138
+ inputs=[mesh],
139
+ outputs=[model_input]
140
+ )
141
+
142
+ # Step 2
143
+ gr.Markdown("---")
144
+ gr.Markdown("## 👁️ Step 2: Select View & Generate Image Condition")
145
+ gr.Markdown("""
146
+ **📋 How to generate image condition:**
147
+ - Your mesh will be rendered from **four viewpoints** (front, back, left, right)
148
+ - Choose **one view** as your image condition
149
+ - Enter a **descriptive text prompt** for the desired texture
150
+ - Select your preferred AI model:
151
+ - <span style="color:#27ae60; font-weight:bold;">🎯 SDXL</span>: Fast generation with depth + normal control, better details
152
+ - <span style="color:#3498db; font-weight:bold;">⚡ FLUX</span>: High-quality generation with depth control (slower due to CPU offloading). Works better with **Edge Refinement**
153
+ """)
154
+ with gr.Row():
155
+ with gr.Column():
156
+ img_condition_seed = gr.Number(label="🎲 Random Seed", minimum=0, maximum=9999, step=1, value=42, info="Change for different results")
157
+ selected_view = gr.Radio(["First View", "Second View", "Third View", "Fourth View"], label="📐 Camera View", value="First View", info="Choose which viewpoint to use as reference")
158
+ with gr.Row():
159
+ model_choice = gr.Radio(["SDXL", "FLUX"], label="🤖 AI Model", value="SDXL", info="SDXL: Fast, depth+normal control | FLUX: High-quality, slower processing")
160
+ edge_refinement = gr.Checkbox(label="✨ Edge Refinement", value=True, info="Smooth boundary artifacts (recommended for cleaner results)")
161
+ text_prompt = gr.Textbox(label="💬 Texture Description", placeholder="Describe the desired texture appearance (e.g., 'rustic wooden surface with weathered paint')", lines=2)
162
+ step2_button = gr.Button("🎯 Generate Image Condition", variant="primary")
163
+ step2_progress = gr.Textbox(label="📊 Generation Status", interactive=False)
164
+
165
+ with gr.Column():
166
+ condition_image = gr.Image(label="🖼️ Generated Image Condition", type="pil") # , interactive=False
167
+
168
+ step2_button.click(
169
+ generate_image_condition,
170
+ inputs=[position_images_tensor, normal_images_tensor, mask_images_tensor, w2cs, text_prompt, selected_view, img_condition_seed, model_choice, edge_refinement],
171
+ outputs=[condition_image, step2_progress],
172
+ concurrency_id="gpu_intensive"
173
+ )
174
+
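Note that this event and the Step 3 event below share `concurrency_id="gpu_intensive"`, so Gradio queues them in one concurrency group and the two GPU-heavy jobs do not run at the same time. If you want to make that limit explicit (an optional, illustrative line, assuming Gradio 4.x), you could add:

```python
demo.queue(default_concurrency_limit=1)  # events sharing a concurrency_id count against the same limit
```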
175
+ # Step 3
176
+ gr.Markdown("---")
177
+ gr.Markdown("## 🎨 Step 3: Generate Final Texture")
178
+ gr.Markdown("""
179
+ **📋 How to generate final texture:**
180
+ - The **SeqTex pipeline** will create a complete texture map for your model
181
+ - View the results from multiple angles and download your textured 3D model (note that the preview viewport is a little dark)
182
+ """)
183
+ texture_map_tensor, mv_out_tensor = gr.State(), gr.State()
184
+ with gr.Row():
185
+ with gr.Column(scale=1):
186
+ step3_button = gr.Button("🎨 Generate Final Texture", variant="primary")
187
+ step3_progress = gr.Textbox(label="📊 Texture Generation Status", interactive=False)
188
+ texture_map = gr.Image(label="🏆 Generated Texture Map", interactive=False)
189
+ with gr.Column(scale=2):
190
+ rendered_imgs = gr.Image(label="🖼️ Final Rendered Views")
191
+ mv_branch_imgs = gr.Image(label="🖼️ SeqTex Direct Output")
192
+ with gr.Column(scale=1.5):
193
+ # model_display = gr.Model3D(label="🏆 Final Textured Model", height=500)
194
+ model_display = LitModel3D(label="Model with Texture",
195
+ exposure=30.0,
196
+ height=500)
197
+
198
+ step3_button.click(
199
+ generate_texture,
200
+ inputs=[position_map_tensor, normal_map_tensor, position_images_tensor, normal_images_tensor, condition_image, text_prompt, selected_view],
201
+ outputs=[texture_map_tensor, mv_out_tensor, step3_progress],
202
+ concurrency_id="gpu_intensive"
203
+ ).then(
204
+ tensor_to_pil,
205
+ inputs=[texture_map_tensor, gr.State(None), gr.State(False)],
206
+ outputs=[texture_map]
207
+ ).then(
208
+ tensor_to_pil,
209
+ inputs=[mv_out_tensor, gr.State(None), gr.State(False)],
210
+ outputs=[mv_branch_imgs]
211
+ ).then(
212
+ render_views,
213
+ inputs=[mesh, texture_map_tensor, mvp_matrix],
214
+ outputs=[rendered_imgs]
215
+ ).then(
216
+ Mesh.export,
217
+ inputs=[mesh, gr.State(None), texture_map],
218
+ outputs=[model_display]
219
+ )
220
+
221
+ # Add example inputs for user convenience
222
+ gr.Markdown("---")
223
+ gr.Markdown("## 🚀 Try Our Examples")
224
+ gr.Markdown("**Quick Start**: Click on any example below to see SeqTex in action with pre-configured settings!")
225
+ gr.Examples(
226
+ examples=EXAMPLES,
227
+ inputs=[mesh_upload, y2z, y2x, z2x, upside_down, img_condition_seed, selected_view, model_choice, edge_refinement, text_prompt],
228
+ cache_examples=False
229
+ )
230
+
231
+ demo.launch(server_name="0.0.0.0", server_port=52424)
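For readers new to this wiring style: each step keeps its heavy intermediate tensors in `gr.State` holders and chains lightweight display conversions with `.then()`, so nothing is recomputed between steps. A minimal, self-contained sketch of the same pattern (purely illustrative names, not from the app):

```python
import gradio as gr

def compute():
    return {"value": 42}                # stands in for the heavy mesh-processing step

def render(state):
    return f"value = {state['value']}"  # display-only post-processing of the stored state

with gr.Blocks() as sketch:
    hidden = gr.State()
    out = gr.Textbox()
    btn = gr.Button("Run")
    btn.click(compute, outputs=[hidden]).then(render, inputs=[hidden], outputs=[out])

# sketch.launch()
```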
examples/birdhouse.glb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30a006774b35531831aaf4ba0dd1c7b8a5b5b58433af17ebc52c816cfbd654b9
3
+ size 10043504
examples/mario.glb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbe06e0ad2fc52811ba343dcaeccacb0b9cee1705b6f33bcd222d20de770b80c
3
+ size 1970408
utils/__init__.py ADDED
File without changes
utils/controlnet_union.py ADDED
@@ -0,0 +1,957 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ from torch import nn
19
+ from torch.nn import functional as F
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.loaders.single_file_model import FromOriginalModelMixin
23
+ from diffusers.utils import BaseOutput, logging
24
+ from diffusers.models.attention_processor import (
25
+ ADDED_KV_ATTENTION_PROCESSORS,
26
+ CROSS_ATTENTION_PROCESSORS,
27
+ AttentionProcessor,
28
+ AttnAddedKVProcessor,
29
+ AttnProcessor,
30
+ )
31
+ from diffusers.models.embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
32
+ from diffusers.models.modeling_utils import ModelMixin
33
+ from diffusers.models.unets.unet_2d_blocks import (
34
+ CrossAttnDownBlock2D,
35
+ DownBlock2D,
36
+ UNetMidBlock2DCrossAttn,
37
+ get_down_block,
38
+ )
39
+ from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
40
+
41
+
42
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
43
+
44
+
45
+ from collections import OrderedDict
46
+
47
+ # Transformer Block
48
+ # Used to exchange info between different conditions and input image
49
+ # With reference to https://github.com/TencentARC/T2I-Adapter/blob/SD/ldm/modules/encoders/adapter.py#L147
50
+ class QuickGELU(nn.Module):
51
+
52
+ def forward(self, x: torch.Tensor):
53
+ return x * torch.sigmoid(1.702 * x)
54
+
55
+ class LayerNorm(nn.LayerNorm):
56
+ """Subclass torch's LayerNorm to handle fp16."""
57
+
58
+ def forward(self, x: torch.Tensor):
59
+ orig_type = x.dtype
60
+ ret = super().forward(x)
61
+ return ret.type(orig_type)
62
+
63
+ class ResidualAttentionBlock(nn.Module):
64
+
65
+ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
66
+ super().__init__()
67
+
68
+ self.attn = nn.MultiheadAttention(d_model, n_head)
69
+ self.ln_1 = LayerNorm(d_model)
70
+ self.mlp = nn.Sequential(
71
+ OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()),
72
+ ("c_proj", nn.Linear(d_model * 4, d_model))]))
73
+ self.ln_2 = LayerNorm(d_model)
74
+ self.attn_mask = attn_mask
75
+
76
+ def attention(self, x: torch.Tensor):
77
+ self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
78
+ return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
79
+
80
+ def forward(self, x: torch.Tensor):
81
+ x = x + self.attention(self.ln_1(x))
82
+ x = x + self.mlp(self.ln_2(x))
83
+ return x
84
+ #-----------------------------------------------------------------------------------------------------
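A quick shape check for the residual attention block above (a hypothetical sketch, assuming this file is importable as `utils.controlnet_union`); the block is shape-preserving, which is what lets the Condition Transformer stack several of them:

```python
import torch
from utils.controlnet_union import ResidualAttentionBlock

block = ResidualAttentionBlock(d_model=320, n_head=8)
tokens = torch.randn(7, 2, 320)   # a small stack of 320-dimensional feature tokens
print(block(tokens).shape)        # torch.Size([7, 2, 320]) -- same shape in, same shape out
```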
85
+
86
+ @dataclass
87
+ class ControlNetOutput(BaseOutput):
88
+ """
89
+ The output of [`ControlNetModel`].
90
+
91
+ Args:
92
+ down_block_res_samples (`tuple[torch.Tensor]`):
93
+ A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
94
+ be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
95
+ used to condition the original UNet's downsampling activations.
96
+ mid_block_res_sample (`torch.Tensor`):
97
+ The activation of the middle block (the lowest sample resolution). Each tensor should be of shape
98
+ `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
99
+ Output can be used to condition the original UNet's middle block activation.
100
+ """
101
+
102
+ down_block_res_samples: Tuple[torch.Tensor]
103
+ mid_block_res_sample: torch.Tensor
104
+
105
+
106
+ class ControlNetConditioningEmbedding(nn.Module):
107
+ """
108
+ Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
109
+ [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
110
+ training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
111
+ convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
112
+ (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
113
+ model) to encode image-space conditions ... into feature maps ..."
114
+ """
115
+
116
+ # original setting is (16, 32, 96, 256)
117
+ def __init__(
118
+ self,
119
+ conditioning_embedding_channels: int,
120
+ conditioning_channels: int = 3,
121
+ block_out_channels: Tuple[int] = (48, 96, 192, 384),
122
+ ):
123
+ super().__init__()
124
+
125
+ self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
126
+
127
+ self.blocks = nn.ModuleList([])
128
+
129
+ for i in range(len(block_out_channels) - 1):
130
+ channel_in = block_out_channels[i]
131
+ channel_out = block_out_channels[i + 1]
132
+ self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
133
+ self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
134
+
135
+ self.conv_out = zero_module(
136
+ nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
137
+ )
138
+
139
+ def forward(self, conditioning):
140
+ embedding = self.conv_in(conditioning)
141
+ embedding = F.silu(embedding)
142
+
143
+ for block in self.blocks:
144
+ embedding = block(embedding)
145
+ embedding = F.silu(embedding)
146
+
147
+ embedding = self.conv_out(embedding)
148
+
149
+ return embedding
150
+
151
+
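As a concrete illustration of the docstring above (a hypothetical check, assuming this module is importable as `utils.controlnet_union`): with the enlarged `block_out_channels=(48, 96, 192, 384)` used here, three stride-2 convolutions bring a 512×512 image-space condition down to the 64×64 latent resolution, and because `conv_out` is zero-initialized the embedding starts out as all zeros:

```python
import torch
from utils.controlnet_union import ControlNetConditioningEmbedding

embed = ControlNetConditioningEmbedding(conditioning_embedding_channels=320)
cond = torch.randn(1, 3, 512, 512)   # e.g. a depth or normal map rendered in image space
feat = embed(cond)
print(feat.shape)                     # torch.Size([1, 320, 64, 64])
print(float(feat.abs().max()))        # 0.0 at initialization (zero-initialized conv_out)
```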
152
+ class ControlNetModel_Union(ModelMixin, ConfigMixin, FromOriginalModelMixin):
153
+ """
154
+ A ControlNet model.
155
+
156
+ Args:
157
+ in_channels (`int`, defaults to 4):
158
+ The number of channels in the input sample.
159
+ flip_sin_to_cos (`bool`, defaults to `True`):
160
+ Whether to flip the sin to cos in the time embedding.
161
+ freq_shift (`int`, defaults to 0):
162
+ The frequency shift to apply to the time embedding.
163
+ down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
164
+ The tuple of downsample blocks to use.
165
+ only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
166
+ block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
167
+ The tuple of output channels for each block.
168
+ layers_per_block (`int`, defaults to 2):
169
+ The number of layers per block.
170
+ downsample_padding (`int`, defaults to 1):
171
+ The padding to use for the downsampling convolution.
172
+ mid_block_scale_factor (`float`, defaults to 1):
173
+ The scale factor to use for the mid block.
174
+ act_fn (`str`, defaults to "silu"):
175
+ The activation function to use.
176
+ norm_num_groups (`int`, *optional*, defaults to 32):
177
+ The number of groups to use for the normalization. If None, normalization and activation layers are skipped
178
+ in post-processing.
179
+ norm_eps (`float`, defaults to 1e-5):
180
+ The epsilon to use for the normalization.
181
+ cross_attention_dim (`int`, defaults to 1280):
182
+ The dimension of the cross attention features.
183
+ transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
184
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
185
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
186
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
187
+ encoder_hid_dim (`int`, *optional*, defaults to None):
188
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
189
+ dimension to `cross_attention_dim`.
190
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
191
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
192
+ embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
193
+ attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
194
+ The dimension of the attention heads.
195
+ use_linear_projection (`bool`, defaults to `False`):
196
+ class_embed_type (`str`, *optional*, defaults to `None`):
197
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
198
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
199
+ addition_embed_type (`str`, *optional*, defaults to `None`):
200
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
201
+ "text". "text" will use the `TextTimeEmbedding` layer.
202
+ num_class_embeds (`int`, *optional*, defaults to 0):
203
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
204
+ class conditioning with `class_embed_type` equal to `None`.
205
+ upcast_attention (`bool`, defaults to `False`):
206
+ resnet_time_scale_shift (`str`, defaults to `"default"`):
207
+ Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
208
+ projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
209
+ The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
210
+ `class_embed_type="projection"`.
211
+ controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
212
+ The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
213
+ conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
214
+ The tuple of output channel for each block in the `conditioning_embedding` layer.
215
+ global_pool_conditions (`bool`, defaults to `False`):
216
+ """
217
+
218
+ _supports_gradient_checkpointing = True
219
+
220
+ @register_to_config
221
+ def __init__(
222
+ self,
223
+ in_channels: int = 4,
224
+ conditioning_channels: int = 3,
225
+ flip_sin_to_cos: bool = True,
226
+ freq_shift: int = 0,
227
+ down_block_types: Tuple[str] = (
228
+ "CrossAttnDownBlock2D",
229
+ "CrossAttnDownBlock2D",
230
+ "CrossAttnDownBlock2D",
231
+ "DownBlock2D",
232
+ ),
233
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
234
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
235
+ layers_per_block: int = 2,
236
+ downsample_padding: int = 1,
237
+ mid_block_scale_factor: float = 1,
238
+ act_fn: str = "silu",
239
+ norm_num_groups: Optional[int] = 32,
240
+ norm_eps: float = 1e-5,
241
+ cross_attention_dim: int = 1280,
242
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
243
+ encoder_hid_dim: Optional[int] = None,
244
+ encoder_hid_dim_type: Optional[str] = None,
245
+ attention_head_dim: Union[int, Tuple[int]] = 8,
246
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
247
+ use_linear_projection: bool = False,
248
+ class_embed_type: Optional[str] = None,
249
+ addition_embed_type: Optional[str] = None,
250
+ addition_time_embed_dim: Optional[int] = None,
251
+ num_class_embeds: Optional[int] = None,
252
+ upcast_attention: bool = False,
253
+ resnet_time_scale_shift: str = "default",
254
+ projection_class_embeddings_input_dim: Optional[int] = None,
255
+ controlnet_conditioning_channel_order: str = "rgb",
256
+ conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
257
+ global_pool_conditions: bool = False,
258
+ addition_embed_type_num_heads=64,
259
+ num_control_type = 6,
260
+ ):
261
+ super().__init__()
262
+
263
+ # If `num_attention_heads` is not defined (which is the case for most models)
264
+ # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
265
+ # The reason for this behavior is to correct for incorrectly named variables that were introduced
266
+ # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
267
+ # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
268
+ # which is why we correct for the naming here.
269
+ num_attention_heads = num_attention_heads or attention_head_dim
270
+
271
+ # Check inputs
272
+ if len(block_out_channels) != len(down_block_types):
273
+ raise ValueError(
274
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
275
+ )
276
+
277
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
278
+ raise ValueError(
279
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
280
+ )
281
+
282
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
283
+ raise ValueError(
284
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
285
+ )
286
+
287
+ if isinstance(transformer_layers_per_block, int):
288
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
289
+
290
+ # input
291
+ conv_in_kernel = 3
292
+ conv_in_padding = (conv_in_kernel - 1) // 2
293
+ self.conv_in = nn.Conv2d(
294
+ in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
295
+ )
296
+
297
+ # time
298
+ time_embed_dim = block_out_channels[0] * 4
299
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
300
+ timestep_input_dim = block_out_channels[0]
301
+ self.time_embedding = TimestepEmbedding(
302
+ timestep_input_dim,
303
+ time_embed_dim,
304
+ act_fn=act_fn,
305
+ )
306
+
307
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
308
+ encoder_hid_dim_type = "text_proj"
309
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
310
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
311
+
312
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
313
+ raise ValueError(
314
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
315
+ )
316
+
317
+ if encoder_hid_dim_type == "text_proj":
318
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
319
+ elif encoder_hid_dim_type == "text_image_proj":
320
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
321
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
322
+ # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)
323
+ self.encoder_hid_proj = TextImageProjection(
324
+ text_embed_dim=encoder_hid_dim,
325
+ image_embed_dim=cross_attention_dim,
326
+ cross_attention_dim=cross_attention_dim,
327
+ )
328
+
329
+ elif encoder_hid_dim_type is not None:
330
+ raise ValueError(
331
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
332
+ )
333
+ else:
334
+ self.encoder_hid_proj = None
335
+
336
+ # class embedding
337
+ if class_embed_type is None and num_class_embeds is not None:
338
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
339
+ elif class_embed_type == "timestep":
340
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
341
+ elif class_embed_type == "identity":
342
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
343
+ elif class_embed_type == "projection":
344
+ if projection_class_embeddings_input_dim is None:
345
+ raise ValueError(
346
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
347
+ )
348
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
349
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
350
+ # 2. it projects from an arbitrary input dimension.
351
+ #
352
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
353
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
354
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
355
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
356
+ else:
357
+ self.class_embedding = None
358
+
359
+ if addition_embed_type == "text":
360
+ if encoder_hid_dim is not None:
361
+ text_time_embedding_from_dim = encoder_hid_dim
362
+ else:
363
+ text_time_embedding_from_dim = cross_attention_dim
364
+
365
+ self.add_embedding = TextTimeEmbedding(
366
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
367
+ )
368
+ elif addition_embed_type == "text_image":
369
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
370
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
371
+ # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
372
+ self.add_embedding = TextImageTimeEmbedding(
373
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
374
+ )
375
+ elif addition_embed_type == "text_time":
376
+ self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
377
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
378
+
379
+ elif addition_embed_type is not None:
380
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
381
+
382
+ # control net conditioning embedding
383
+ self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
384
+ conditioning_embedding_channels=block_out_channels[0],
385
+ block_out_channels=conditioning_embedding_out_channels,
386
+ conditioning_channels=conditioning_channels,
387
+ )
388
+
389
+ # Copyright by Qi Xin(2024/07/06)
390
+ # Condition Transformer(fuse single/multi conditions with input image)
391
+ # The Condition Transformer augments the feature representation of the conditions.
392
+ # The overall design is somewhat like a ResNet: the output of the Condition Transformer predicts a condition bias that is added to the original condition feature.
393
+ # num_control_type = 6
394
+ num_trans_channel = 320
395
+ num_trans_head = 8
396
+ num_trans_layer = 1
397
+ num_proj_channel = 320
398
+ task_scale_factor = num_trans_channel ** 0.5
399
+
400
+ self.task_embedding = nn.Parameter(task_scale_factor * torch.randn(num_control_type, num_trans_channel))
401
+ self.transformer_layes = nn.Sequential(*[ResidualAttentionBlock(num_trans_channel, num_trans_head) for _ in range(num_trans_layer)])
402
+ self.spatial_ch_projs = zero_module(nn.Linear(num_trans_channel, num_proj_channel))
403
+ #-----------------------------------------------------------------------------------------------------
404
+
405
+ # Copyright by Qi Xin(2024/07/06)
406
+ # Control Encoder to distinguish different control conditions
407
+ # A simple but effective module, consists of an embedding layer and a linear layer, to inject the control info to time embedding.
408
+ self.control_type_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
409
+ self.control_add_embedding = TimestepEmbedding(addition_time_embed_dim * num_control_type, time_embed_dim)
410
+ #-----------------------------------------------------------------------------------------------------
411
+
412
+ self.down_blocks = nn.ModuleList([])
413
+ self.controlnet_down_blocks = nn.ModuleList([])
414
+
415
+ if isinstance(only_cross_attention, bool):
416
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
417
+
418
+ if isinstance(attention_head_dim, int):
419
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
420
+
421
+ if isinstance(num_attention_heads, int):
422
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
423
+
424
+ # down
425
+ output_channel = block_out_channels[0]
426
+
427
+ controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
428
+ controlnet_block = zero_module(controlnet_block)
429
+ self.controlnet_down_blocks.append(controlnet_block)
430
+
431
+ for i, down_block_type in enumerate(down_block_types):
432
+ input_channel = output_channel
433
+ output_channel = block_out_channels[i]
434
+ is_final_block = i == len(block_out_channels) - 1
435
+
436
+ down_block = get_down_block(
437
+ down_block_type,
438
+ num_layers=layers_per_block,
439
+ transformer_layers_per_block=transformer_layers_per_block[i],
440
+ in_channels=input_channel,
441
+ out_channels=output_channel,
442
+ temb_channels=time_embed_dim,
443
+ add_downsample=not is_final_block,
444
+ resnet_eps=norm_eps,
445
+ resnet_act_fn=act_fn,
446
+ resnet_groups=norm_num_groups,
447
+ cross_attention_dim=cross_attention_dim,
448
+ num_attention_heads=num_attention_heads[i],
449
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
450
+ downsample_padding=downsample_padding,
451
+ use_linear_projection=use_linear_projection,
452
+ only_cross_attention=only_cross_attention[i],
453
+ upcast_attention=upcast_attention,
454
+ resnet_time_scale_shift=resnet_time_scale_shift,
455
+ )
456
+ self.down_blocks.append(down_block)
457
+
458
+ for _ in range(layers_per_block):
459
+ controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
460
+ controlnet_block = zero_module(controlnet_block)
461
+ self.controlnet_down_blocks.append(controlnet_block)
462
+
463
+ if not is_final_block:
464
+ controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
465
+ controlnet_block = zero_module(controlnet_block)
466
+ self.controlnet_down_blocks.append(controlnet_block)
467
+
468
+ # mid
469
+ mid_block_channel = block_out_channels[-1]
470
+
471
+ controlnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1)
472
+ controlnet_block = zero_module(controlnet_block)
473
+ self.controlnet_mid_block = controlnet_block
474
+
475
+ self.mid_block = UNetMidBlock2DCrossAttn(
476
+ transformer_layers_per_block=transformer_layers_per_block[-1],
477
+ in_channels=mid_block_channel,
478
+ temb_channels=time_embed_dim,
479
+ resnet_eps=norm_eps,
480
+ resnet_act_fn=act_fn,
481
+ output_scale_factor=mid_block_scale_factor,
482
+ resnet_time_scale_shift=resnet_time_scale_shift,
483
+ cross_attention_dim=cross_attention_dim,
484
+ num_attention_heads=num_attention_heads[-1],
485
+ resnet_groups=norm_num_groups,
486
+ use_linear_projection=use_linear_projection,
487
+ upcast_attention=upcast_attention,
488
+ )
489
+
490
+ @classmethod
491
+ def from_unet(
492
+ cls,
493
+ unet: UNet2DConditionModel,
494
+ controlnet_conditioning_channel_order: str = "rgb",
495
+ conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
496
+ load_weights_from_unet: bool = True,
497
+ ):
498
+ r"""
499
+ Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`].
500
+
501
+ Parameters:
502
+ unet (`UNet2DConditionModel`):
503
+ The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied
504
+ where applicable.
505
+ """
506
+ transformer_layers_per_block = (
507
+ unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
508
+ )
509
+ encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None
510
+ encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None
511
+ addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None
512
+ addition_time_embed_dim = (
513
+ unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
514
+ )
515
+
516
+ controlnet = cls(
517
+ encoder_hid_dim=encoder_hid_dim,
518
+ encoder_hid_dim_type=encoder_hid_dim_type,
519
+ addition_embed_type=addition_embed_type,
520
+ addition_time_embed_dim=addition_time_embed_dim,
521
+ transformer_layers_per_block=transformer_layers_per_block,
522
+ # transformer_layers_per_block=[1, 2, 5],
523
+ in_channels=unet.config.in_channels,
524
+ flip_sin_to_cos=unet.config.flip_sin_to_cos,
525
+ freq_shift=unet.config.freq_shift,
526
+ down_block_types=unet.config.down_block_types,
527
+ only_cross_attention=unet.config.only_cross_attention,
528
+ block_out_channels=unet.config.block_out_channels,
529
+ layers_per_block=unet.config.layers_per_block,
530
+ downsample_padding=unet.config.downsample_padding,
531
+ mid_block_scale_factor=unet.config.mid_block_scale_factor,
532
+ act_fn=unet.config.act_fn,
533
+ norm_num_groups=unet.config.norm_num_groups,
534
+ norm_eps=unet.config.norm_eps,
535
+ cross_attention_dim=unet.config.cross_attention_dim,
536
+ attention_head_dim=unet.config.attention_head_dim,
537
+ num_attention_heads=unet.config.num_attention_heads,
538
+ use_linear_projection=unet.config.use_linear_projection,
539
+ class_embed_type=unet.config.class_embed_type,
540
+ num_class_embeds=unet.config.num_class_embeds,
541
+ upcast_attention=unet.config.upcast_attention,
542
+ resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
543
+ projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
544
+ controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
545
+ conditioning_embedding_out_channels=conditioning_embedding_out_channels,
546
+ )
547
+
548
+ if load_weights_from_unet:
549
+ controlnet.conv_in.load_state_dict(unet.conv_in.state_dict())
550
+ controlnet.time_proj.load_state_dict(unet.time_proj.state_dict())
551
+ controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())
552
+
553
+ if controlnet.class_embedding:
554
+ controlnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())
555
+
556
+ controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict(), strict=False)
557
+ controlnet.mid_block.load_state_dict(unet.mid_block.state_dict(), strict=False)
558
+
559
+ return controlnet
560
+
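A hedged usage sketch for `from_unet` (not from this commit; in the Space a pretrained ControlNet++/Union checkpoint is presumably loaded instead), showing how the ControlNet blocks can be initialized from an SDXL UNet's weights:

```python
from diffusers import UNet2DConditionModel
from utils.controlnet_union import ControlNetModel_Union

unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
)
controlnet = ControlNetModel_Union.from_unet(unet, load_weights_from_unet=True)
```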
561
+ @property
562
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
563
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
564
+ r"""
565
+ Returns:
566
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
567
+ indexed by weight name.
568
+ """
569
+ # set recursively
570
+ processors = {}
571
+
572
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
573
+ if hasattr(module, "get_processor"):
574
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
575
+
576
+ for sub_name, child in module.named_children():
577
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
578
+
579
+ return processors
580
+
581
+ for name, module in self.named_children():
582
+ fn_recursive_add_processors(name, module, processors)
583
+
584
+ return processors
585
+
586
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
587
+ def set_attn_processor(
588
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
589
+ ):
590
+ r"""
591
+ Sets the attention processor to use to compute attention.
592
+
593
+ Parameters:
594
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
595
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
596
+ for **all** `Attention` layers.
597
+
598
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
599
+ processor. This is strongly recommended when setting trainable attention processors.
600
+
601
+ """
602
+ count = len(self.attn_processors.keys())
603
+
604
+ if isinstance(processor, dict) and len(processor) != count:
605
+ raise ValueError(
606
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
607
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
608
+ )
609
+
610
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
611
+ if hasattr(module, "set_processor"):
612
+ if not isinstance(processor, dict):
613
+ module.set_processor(processor, _remove_lora=_remove_lora)
614
+ else:
615
+ module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
616
+
617
+ for sub_name, child in module.named_children():
618
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
619
+
620
+ for name, module in self.named_children():
621
+ fn_recursive_attn_processor(name, module, processor)
622
+
623
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
624
+ def set_default_attn_processor(self):
625
+ """
626
+ Disables custom attention processors and sets the default attention implementation.
627
+ """
628
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
629
+ processor = AttnAddedKVProcessor()
630
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
631
+ processor = AttnProcessor()
632
+ else:
633
+ raise ValueError(
634
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
635
+ )
636
+
637
+ self.set_attn_processor(processor, _remove_lora=True)
638
+
639
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice
640
+ def set_attention_slice(self, slice_size):
641
+ r"""
642
+ Enable sliced attention computation.
643
+
644
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
645
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
646
+
647
+ Args:
648
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
649
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
650
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
651
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
652
+ must be a multiple of `slice_size`.
653
+ """
654
+ sliceable_head_dims = []
655
+
656
+ def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
657
+ if hasattr(module, "set_attention_slice"):
658
+ sliceable_head_dims.append(module.sliceable_head_dim)
659
+
660
+ for child in module.children():
661
+ fn_recursive_retrieve_sliceable_dims(child)
662
+
663
+ # retrieve number of attention layers
664
+ for module in self.children():
665
+ fn_recursive_retrieve_sliceable_dims(module)
666
+
667
+ num_sliceable_layers = len(sliceable_head_dims)
668
+
669
+ if slice_size == "auto":
670
+ # half the attention head size is usually a good trade-off between
671
+ # speed and memory
672
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
673
+ elif slice_size == "max":
674
+ # make smallest slice possible
675
+ slice_size = num_sliceable_layers * [1]
676
+
677
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
678
+
679
+ if len(slice_size) != len(sliceable_head_dims):
680
+ raise ValueError(
681
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
682
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
683
+ )
684
+
685
+ for i in range(len(slice_size)):
686
+ size = slice_size[i]
687
+ dim = sliceable_head_dims[i]
688
+ if size is not None and size > dim:
689
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
690
+
691
+ # Recursively walk through all the children.
692
+ # Any children which exposes the set_attention_slice method
693
+ # gets the message
694
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
695
+ if hasattr(module, "set_attention_slice"):
696
+ module.set_attention_slice(slice_size.pop())
697
+
698
+ for child in module.children():
699
+ fn_recursive_set_attention_slice(child, slice_size)
700
+
701
+ reversed_slice_size = list(reversed(slice_size))
702
+ for module in self.children():
703
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
704
+
705
+
706
+ def _set_gradient_checkpointing(self, module, value=False):
707
+ if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
708
+ module.gradient_checkpointing = value
709
+
710
+
711
+ def forward(
712
+ self,
713
+ sample: torch.FloatTensor,
714
+ timestep: Union[torch.Tensor, float, int],
715
+ encoder_hidden_states: torch.Tensor,
716
+ controlnet_cond_list: torch.FloatTensor,
717
+ conditioning_scale: float = 1.0,
718
+ class_labels: Optional[torch.Tensor] = None,
719
+ timestep_cond: Optional[torch.Tensor] = None,
720
+ attention_mask: Optional[torch.Tensor] = None,
721
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
722
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
723
+ guess_mode: bool = False,
724
+ return_dict: bool = True,
725
+ ) -> Union[ControlNetOutput, Tuple]:
726
+ """
727
+ The [`ControlNetModel`] forward method.
728
+
729
+ Args:
730
+ sample (`torch.FloatTensor`):
731
+ The noisy input tensor.
732
+ timestep (`Union[torch.Tensor, float, int]`):
733
+ The number of timesteps to denoise an input.
734
+ encoder_hidden_states (`torch.Tensor`):
735
+ The encoder hidden states.
736
+ controlnet_cond (`torch.FloatTensor`):
737
+ The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
738
+ conditioning_scale (`float`, defaults to `1.0`):
739
+ The scale factor for ControlNet outputs.
740
+ class_labels (`torch.Tensor`, *optional*, defaults to `None`):
741
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
742
+ timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
743
+ Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
744
+ timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
745
+ embeddings.
746
+ attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
747
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
748
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
749
+ negative values to the attention scores corresponding to "discard" tokens.
750
+ added_cond_kwargs (`dict`):
751
+ Additional conditions for the Stable Diffusion XL UNet.
752
+ cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
753
+ A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
754
+ guess_mode (`bool`, defaults to `False`):
755
+ In this mode, the ControlNet encoder tries its best to recognize the content of the input even if
756
+ you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
757
+ return_dict (`bool`, defaults to `True`):
758
+ Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
759
+
760
+ Returns:
761
+ [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
762
+ If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
763
+ returned where the first element is the sample tensor.
764
+ """
765
+ # check channel order
766
+ channel_order = self.config.controlnet_conditioning_channel_order
767
+
768
+ if channel_order == "rgb":
769
+ # in rgb order by default
770
+ ...
771
+ # elif channel_order == "bgr":
772
+ # controlnet_cond = torch.flip(controlnet_cond, dims=[1])
773
+ else:
774
+ raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
775
+
776
+ # prepare attention_mask
777
+ if attention_mask is not None:
778
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
779
+ attention_mask = attention_mask.unsqueeze(1)
780
+
781
+ # 1. time
782
+ timesteps = timestep
783
+ if not torch.is_tensor(timesteps):
784
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
785
+ # This would be a good case for the `match` statement (Python 3.10+)
786
+ is_mps = sample.device.type == "mps"
787
+ if isinstance(timestep, float):
788
+ dtype = torch.float32 if is_mps else torch.float64
789
+ else:
790
+ dtype = torch.int32 if is_mps else torch.int64
791
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
792
+ elif len(timesteps.shape) == 0:
793
+ timesteps = timesteps[None].to(sample.device)
794
+
795
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
796
+ timesteps = timesteps.expand(sample.shape[0])
797
+
798
+ t_emb = self.time_proj(timesteps)
799
+
800
+ # timesteps does not contain any weights and will always return f32 tensors
801
+ # but time_embedding might actually be running in fp16. so we need to cast here.
802
+ # there might be better ways to encapsulate this.
803
+ t_emb = t_emb.to(dtype=sample.dtype)
804
+
805
+ emb = self.time_embedding(t_emb, timestep_cond)
806
+ aug_emb = None
807
+
808
+ if self.class_embedding is not None:
809
+ if class_labels is None:
810
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
811
+
812
+ if self.config.class_embed_type == "timestep":
813
+ class_labels = self.time_proj(class_labels)
814
+
815
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
816
+ emb = emb + class_emb
817
+
818
+ if self.config.addition_embed_type is not None:
819
+ if self.config.addition_embed_type == "text":
820
+ aug_emb = self.add_embedding(encoder_hidden_states)
821
+
822
+ elif self.config.addition_embed_type == "text_time":
823
+ if "text_embeds" not in added_cond_kwargs:
824
+ raise ValueError(
825
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
826
+ )
827
+ text_embeds = added_cond_kwargs.get("text_embeds")
828
+ if "time_ids" not in added_cond_kwargs:
829
+ raise ValueError(
830
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
831
+ )
832
+ time_ids = added_cond_kwargs.get("time_ids")
833
+ time_embeds = self.add_time_proj(time_ids.flatten())
834
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
835
+
836
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
837
+ add_embeds = add_embeds.to(emb.dtype)
838
+ aug_emb = self.add_embedding(add_embeds)
839
+
840
+ # Copyright by Qi Xin(2024/07/06)
841
+ # inject control type info to time embedding to distinguish different control conditions
842
+ control_type = added_cond_kwargs.get('control_type')
843
+ control_embeds = self.control_type_proj(control_type.flatten())
844
+ control_embeds = control_embeds.reshape((t_emb.shape[0], -1))
845
+ control_embeds = control_embeds.to(emb.dtype)
846
+ control_emb = self.control_add_embedding(control_embeds)
847
+ emb = emb + control_emb
848
+ #---------------------------------------------------------------------------------
849
+
850
+ emb = emb + aug_emb if aug_emb is not None else emb
851
+
852
+ # 2. pre-process
853
+ sample = self.conv_in(sample)
854
+ indices = torch.nonzero(control_type[0])
855
+
856
+ # Copyright by Qi Xin(2024/07/06)
857
+ # add single/multi conditions to the input image.
858
+ # Condition Transformer provides an easy and effective way to fuse different features naturally
859
+ inputs = []
860
+ condition_list = []
861
+
862
+ for idx in range(indices.shape[0] + 1):
863
+ if idx == indices.shape[0]:
864
+ controlnet_cond = sample
865
+ feat_seq = torch.mean(controlnet_cond, dim=(2, 3)) # N * C
866
+ else:
867
+ controlnet_cond = self.controlnet_cond_embedding(controlnet_cond_list[indices[idx][0]])
868
+ feat_seq = torch.mean(controlnet_cond, dim=(2, 3)) # N * C
869
+ feat_seq = feat_seq + self.task_embedding[indices[idx][0]]
870
+
871
+ inputs.append(feat_seq.unsqueeze(1))
872
+ condition_list.append(controlnet_cond)
873
+
874
+ x = torch.cat(inputs, dim=1) # NxLxC
875
+ x = self.transformer_layes(x)
876
+
877
+ controlnet_cond_fuser = sample * 0.0
878
+ for idx in range(indices.shape[0]):
879
+ alpha = self.spatial_ch_projs(x[:, idx])
880
+ alpha = alpha.unsqueeze(-1).unsqueeze(-1)
881
+ controlnet_cond_fuser += condition_list[idx] + alpha
882
+
883
+ sample = sample + controlnet_cond_fuser
884
+ #-------------------------------------------------------------------------------------------
885
+
886
+ # 3. down
887
+ down_block_res_samples = (sample,)
888
+ for downsample_block in self.down_blocks:
889
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
890
+ sample, res_samples = downsample_block(
891
+ hidden_states=sample,
892
+ temb=emb,
893
+ encoder_hidden_states=encoder_hidden_states,
894
+ attention_mask=attention_mask,
895
+ cross_attention_kwargs=cross_attention_kwargs,
896
+ )
897
+ else:
898
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
899
+
900
+ down_block_res_samples += res_samples
901
+
902
+ # 4. mid
903
+ if self.mid_block is not None:
904
+ sample = self.mid_block(
905
+ sample,
906
+ emb,
907
+ encoder_hidden_states=encoder_hidden_states,
908
+ attention_mask=attention_mask,
909
+ cross_attention_kwargs=cross_attention_kwargs,
910
+ )
911
+
912
+ # 5. Control net blocks
913
+
914
+ controlnet_down_block_res_samples = ()
915
+
916
+ for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
917
+ down_block_res_sample = controlnet_block(down_block_res_sample)
918
+ controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
919
+
920
+ down_block_res_samples = controlnet_down_block_res_samples
921
+
922
+ mid_block_res_sample = self.controlnet_mid_block(sample)
923
+
924
+ # 6. scaling
925
+ if guess_mode and not self.config.global_pool_conditions:
926
+ scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) # 0.1 to 1.0
927
+ scales = scales * conditioning_scale
928
+ down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
929
+ mid_block_res_sample = mid_block_res_sample * scales[-1] # last one
930
+ else:
931
+ down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
932
+ mid_block_res_sample = mid_block_res_sample * conditioning_scale
933
+
934
+ if self.config.global_pool_conditions:
935
+ down_block_res_samples = [
936
+ torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
937
+ ]
938
+ mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)
939
+
940
+ if not return_dict:
941
+ return (down_block_res_samples, mid_block_res_sample)
942
+
943
+ return ControlNetOutput(
944
+ down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
945
+ )
946
+
947
+
948
+
949
+ def zero_module(module):
950
+ for p in module.parameters():
951
+ nn.init.zeros_(p)
952
+ return module
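For reference, a minimal sketch of what `zero_module` is for: it is the usual ControlNet zero-initialization trick, so the projection blocks start as exact no-ops and the control branch adds nothing to the frozen UNet at the beginning of training. The sketch below is self-contained and assumes a plain `nn.Conv2d` as the wrapped block (the layer name and sizes are illustrative only):

import torch
from torch import nn

def zero_module(module):
    # Zero every parameter so the wrapped block initially outputs zeros.
    for p in module.parameters():
        nn.init.zeros_(p)
    return module

proj = zero_module(nn.Conv2d(320, 320, kernel_size=1))  # hypothetical projection block
x = torch.randn(1, 320, 64, 64)
print(proj(x).abs().max())  # prints a zero tensor: the control branch starts as a no-op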
953
+
954
+
955
+
956
+
957
+
utils/image_generation.py ADDED
@@ -0,0 +1,299 @@
1
+ import threading
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import spaces
6
+ import torch
7
+ import torch.nn.functional as F
8
+ # Add FLUX imports
9
+ from diffusers import (AutoencoderKL, EulerAncestralDiscreteScheduler,
10
+ FluxControlNetModel, FluxControlNetPipeline)
11
+ from einops import rearrange
12
+ from PIL import Image
13
+ from torchvision.transforms import ToPILImage
14
+
15
+ import gradio as gr
16
+
17
+ from .controlnet_union import ControlNetModel_Union
18
+ from .pipeline_controlnet_union_sd_xl import \
19
+ StableDiffusionXLControlNetUnionPipeline
20
+ from .render_utils import get_silhouette_image
21
+
22
+ IMG_PIPE = None
23
+ IMG_PIPE_LOCK = threading.Lock()
24
+ # Add FLUX pipeline variables
25
+ FLUX_PIPE = None
26
+ FLUX_PIPE_LOCK = threading.Lock()
27
+ FLUX_SUFFIX = None
28
+ FLUX_NEGATIVE = None
29
+
30
+ def lazy_get_flux_pipe():
31
+ """
32
+ Lazy load the FLUX pipeline with ControlNet for image generation.
33
+ """
34
+ global FLUX_PIPE, FLUX_SUFFIX, FLUX_NEGATIVE
35
+ if FLUX_PIPE is not None:
36
+ return FLUX_PIPE
37
+ gr.Info("First call: loading the FLUX pipeline... It may take about 1 minute.")
38
+ with FLUX_PIPE_LOCK:
39
+ if FLUX_PIPE is not None:
40
+ return FLUX_PIPE
41
+ FLUX_SUFFIX = ", albedo texture, high-quality, 8K, flat shaded, diffuse color only, orthographic view, seamless texture pattern, detailed surface texture."
42
+ FLUX_NEGATIVE = "ugly, PBR, lighting, shadows, highlights, specular, reflections, ambient occlusion, global illumination, bloom, glare, lens flare, glow, shiny, glossy, noise, grain, blurry, bokeh, depth of field."
43
+ base_model = 'black-forest-labs/FLUX.1-dev'
44
+ controlnet_model_union = 'Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro-2.0'
45
+
46
+ controlnet = FluxControlNetModel.from_pretrained(controlnet_model_union, torch_dtype=torch.bfloat16)
47
+ FLUX_PIPE = FluxControlNetPipeline.from_pretrained(
48
+ base_model,
49
+ controlnet=controlnet,
50
+ torch_dtype=torch.bfloat16
51
+ )
52
+ # Use model CPU offload for better GPU utilization during inference
53
+ FLUX_PIPE.enable_model_cpu_offload()
54
+ return FLUX_PIPE
55
+
56
+ def lazy_get_sdxl_pipe():
57
+ """
58
+ Lazy load the SDXL pipeline with ControlNet for image generation.
59
+ """
60
+ global IMG_PIPE
61
+ if IMG_PIPE is not None:
62
+ return IMG_PIPE
63
+ gr.Info("First call: loading the SDXL pipeline... It may take about 20 seconds.")
64
+ with IMG_PIPE_LOCK:
65
+ if IMG_PIPE is not None:
66
+ return IMG_PIPE
67
+ eulera_scheduler = EulerAncestralDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
68
+ # When testing with another base model, you also need to change the VAE.
69
+ vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
70
+ controlnet_model = ControlNetModel_Union.from_pretrained("xinsir/controlnet-union-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True)
71
+ IMG_PIPE = StableDiffusionXLControlNetUnionPipeline.from_pretrained(
72
+ "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet_model,
73
+ vae=vae,
74
+ torch_dtype=torch.float16,
75
+ scheduler=eulera_scheduler,
76
+ )
77
+ # Move pipeline to CUDA device
78
+ IMG_PIPE = IMG_PIPE.to("cuda")
79
+ return IMG_PIPE
80
+
81
+
82
+ def generate_sdxl_condition(depth_img, normal_img, text_prompt, mask, seed=42, edge_refinement=False, image_height=1024, image_width=1024, progress=gr.Progress()) -> Image.Image:
83
+ """
84
+ Generate image condition using SDXL model with ControlNet based on depth and normal images.
85
+ :param depth_img: Depth image from the selected view.
86
+ :param normal_img: Normal image (Camera Coordinate System) from the selected view.
87
+ :param text_prompt: Text prompt for image generation.
88
+ :param mask: A mask image to apply to guide the subsequent pipeline to focus on the foreground.
89
+ :param seed: Random seed for image generation.
90
+ :param edge_refinement: Whether to apply edge refinement to smooth mask boundaries (default: False).
91
+ :param image_height: Height of the output image.
92
+ :param image_width: Width of the output image.
93
+ :param progress: Progress callback for Gradio.
94
+ :return: Generated image condition (e.g., PIL Image).
95
+ """
96
+ progress(0.1, desc="Loading SDXL pipeline...")
97
+ pipeline = lazy_get_sdxl_pipe()
98
+ progress(0.3, desc="SDXL pipeline loaded successfully.")
99
+
100
+ positive_prompt = text_prompt + ", photo-realistic style, high quality, 8K, highly detailed texture, soft lighting, uniform color, foreground"
101
+ negative_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
102
+
103
+ img_generation_resolution = 1024 # SDXL performs better at 1024x1024
104
+ image = pipeline(prompt=[positive_prompt]*1,
105
+ image_list=[0, depth_img, 0, 0, normal_img, 0],
106
+ negative_prompt=[negative_prompt]*1,
107
+ generator=torch.Generator(device="cuda").manual_seed(seed),
108
+ width=img_generation_resolution,
109
+ height=img_generation_resolution,
110
+ num_inference_steps=50,
111
+ union_control=True,
112
+ union_control_type=torch.Tensor([0, 1, 0, 0, 1, 0]).to("cuda"), # use depth and normal images
113
+ progress=progress,
114
+ ).images[0]
115
+ progress(0.9, desc="Condition tensor generated successfully.")
116
+
117
+ rgb_tensor = torch.from_numpy(np.array(image)).float().permute(2, 0, 1).unsqueeze(0).to(pipeline.device)
118
+ mask_tensor = torch.from_numpy(np.array(mask)).float().unsqueeze(0).unsqueeze(0).to(pipeline.device) # Ensure mask is in the correct shape
119
+ mask_tensor = mask_tensor / 255.0 # Normalize mask to [0, 1]
120
+
121
+ rgb_tensor = F.interpolate(rgb_tensor, (image_height, image_width), mode="bilinear", align_corners=False)
122
+ mask_tensor = F.interpolate(mask_tensor, (image_height, image_width), mode="bilinear", align_corners=False)
123
+
124
+ # Apply edge refinement if enabled
125
+ if edge_refinement:
126
+ # Convert to CUDA device for edge refinement
127
+ rgb_tensor_cuda = rgb_tensor.to("cuda")
128
+ mask_tensor_cuda = mask_tensor.to("cuda")
129
+ rgb_tensor_cuda = refine_image_edges(rgb_tensor_cuda, mask_tensor_cuda)
130
+ rgb_tensor = rgb_tensor_cuda.to(pipeline.device)
131
+
132
+ background_tensor = torch.zeros_like(rgb_tensor)
133
+ rgb_tensor = torch.lerp(background_tensor, rgb_tensor, mask_tensor)
134
+ rgb_tensor = rearrange(rgb_tensor, "1 C H W -> C H W")
135
+ rgb_tensor = rgb_tensor / 255.
136
+ to_img = ToPILImage()
137
+ condition_image = to_img(rgb_tensor.cpu())
138
+
139
+ progress(1, desc="Condition image generated successfully.")
140
+ return condition_image
141
+
142
+ def generate_flux_condition(depth_img, text_prompt, mask, seed=42, edge_refinement=False, image_height=1024, image_width=1024, progress=gr.Progress()) -> Image.Image:
143
+ """
144
+ Generate image condition using FLUX model with ControlNet based on depth image only.
145
+ Note: FLUX.1-dev-ControlNet-Union-Pro-2.0 does not support normal control, only depth.
146
+ :param depth_img: Depth image from the selected view.
147
+ :param text_prompt: Text prompt for image generation.
148
+ :param mask: A mask image to apply to guide the subsequent pipeline to focus on the foreground.
149
+ :param seed: Random seed for image generation.
150
+ :param image_height: Height of the output image.
151
+ :param image_width: Width of the output image.
152
+ :param progress: Progress callback for Gradio.
153
+ :param edge_refinement: Whether to apply edge refinement to smooth mask boundaries (default: False).
154
+ :return: Generated image condition (PIL Image).
155
+ """
156
+ progress(0.1, desc="Loading FLUX pipeline...")
157
+ pipeline = lazy_get_flux_pipe()
158
+ progress(0.3, desc="FLUX pipeline loaded successfully.")
159
+
160
+ # Enhanced prompt for better results
161
+ positive_prompt = text_prompt + FLUX_SUFFIX
162
+ negative_prompt = FLUX_NEGATIVE
163
+
164
+ # Get image dimensions
165
+ width, height = depth_img.size
166
+
167
+ progress(0.5, desc="Generating image with FLUX (including onload and cpu offload)...")
168
+
169
+ # Generate image using FLUX ControlNet with depth control
170
+ # model_cpu_offload handles GPU loading automatically
171
+ image = pipeline(
172
+ prompt=positive_prompt,
173
+ negative_prompt=negative_prompt,
174
+ control_image=depth_img,
175
+ width=width,
176
+ height=height,
177
+ controlnet_conditioning_scale=0.8, # Recommended for depth
178
+ control_guidance_end=0.8,
179
+ num_inference_steps=30,
180
+ guidance_scale=3.5,
181
+ generator=torch.Generator(device="cuda").manual_seed(seed),
182
+ ).images[0]
183
+
184
+ progress(0.9, desc="Applying mask and resizing...")
185
+
186
+ # Convert to tensor and apply mask
187
+ rgb_tensor = torch.from_numpy(np.array(image)).float().permute(2, 0, 1).unsqueeze(0).to("cuda")
188
+ mask_tensor = torch.from_numpy(np.array(mask)).float().unsqueeze(0).unsqueeze(0).to("cuda")
189
+ mask_tensor = mask_tensor / 255.0 # Normalize mask to [0, 1]
190
+
191
+ # Resize to target dimensions
192
+ rgb_tensor = F.interpolate(rgb_tensor, (image_height, image_width), mode="bilinear", align_corners=False)
193
+ mask_tensor = F.interpolate(mask_tensor, (image_height, image_width), mode="bilinear", align_corners=False)
194
+
195
+ # Apply mask (blend with black background)
196
+ background_tensor = torch.zeros_like(rgb_tensor)
197
+ if edge_refinement:
198
+ # replace edge with inner values
199
+ rgb_tensor = refine_image_edges(rgb_tensor, mask_tensor)
200
+
201
+ rgb_tensor = torch.lerp(background_tensor, rgb_tensor, mask_tensor)
202
+
203
+ # Convert back to PIL Image
204
+ rgb_tensor = rearrange(rgb_tensor, "1 C H W -> C H W")
205
+ rgb_tensor = rgb_tensor / 255.0
206
+ to_img = ToPILImage()
207
+ condition_image = to_img(rgb_tensor.cpu())
208
+
209
+ progress(1, desc="FLUX condition image generated successfully.")
210
+ return condition_image
211
+
212
+ def refine_image_edges(rgb_tensor, mask_tensor):
213
+ """
214
+ Refine image edges using advanced morphological operations to remove white edges while preserving object boundaries.
215
+
216
+ Algorithm:
217
+ 1. Erode mask to get eroded_mask
218
+ 2. Double erode mask to get double_eroded_mask
219
+ 3. XOR eroded_mask and double_eroded_mask to get circle_valid_mask
220
+ 4. Use circle_valid_mask to extract circle_rgb (clean edge values)
221
+ 5. Dilate circle_rgb to cover the edge region
222
+ 6. Final result: use double_eroded_mask for original RGB foreground, dilated_circle_rgb for background
223
+
224
+ :param rgb_tensor: RGB image tensor of shape (1, C, H, W) on CUDA device
225
+ :param mask_tensor: Mask tensor of shape (1, 1, H, W) on CUDA device, normalized to [0, 1]
226
+ :return: refined_rgb_tensor
227
+ """
228
+ # Convert tensors to numpy for OpenCV processing
229
+ rgb_np = rgb_tensor.squeeze().permute(1, 2, 0).cpu().numpy().astype(np.uint8) # (H, W, C)
230
+ mask_np = mask_tensor.squeeze().cpu().numpy() # Remove batch and channel dimensions
231
+ original_mask_np = (mask_np * 255).astype(np.uint8) # Convert to 0-255 range
232
+
233
+ # Create morphological kernel (3x3 as requested)
234
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
235
+
236
+ # Step 1: Erode mask to get eroded_mask
237
+ eroded_mask_np = cv2.erode(original_mask_np, kernel, iterations=3)
238
+
239
+ # Step 2: Double erode mask to get double_eroded_mask
240
+ double_eroded_mask_np = cv2.erode(eroded_mask_np, kernel, iterations=5)
241
+
242
+ # Step 3: XOR eroded_mask and double_eroded_mask to get circle_valid_mask
243
+ circle_valid_mask_np = cv2.bitwise_xor(eroded_mask_np, double_eroded_mask_np)
244
+
245
+ # Step 4: Use circle_valid_mask to extract circle_rgb (clean edge values)
246
+ circle_valid_mask_3c = cv2.cvtColor(circle_valid_mask_np, cv2.COLOR_GRAY2BGR) / 255.0
247
+ circle_rgb_np = (rgb_np * circle_valid_mask_3c).astype(np.uint8)
248
+
249
+ # Step 5: Dilate circle_rgb to cover the edge region (iterations=8)
250
+ dilated_circle_rgb_np = cv2.dilate(circle_rgb_np, kernel, iterations=8)
251
+
252
+ # Step 6: Final composition
253
+ # Use double_eroded_mask for original RGB foreground, dilated_circle_rgb for background
254
+ double_eroded_mask_3c = cv2.cvtColor(double_eroded_mask_np, cv2.COLOR_GRAY2BGR) / 255.0
255
+
256
+ # Final result: original RGB where double_eroded_mask is valid, dilated_circle_rgb elsewhere
257
+ refined_rgb_np = (rgb_np * double_eroded_mask_3c +
258
+ dilated_circle_rgb_np * (1 - double_eroded_mask_3c)).astype(np.uint8)
259
+
260
+ # Convert refined RGB back to tensor
261
+ refined_rgb_tensor = torch.from_numpy(refined_rgb_np).float().permute(2, 0, 1).unsqueeze(0).to("cuda")
262
+
263
+ return refined_rgb_tensor
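For intuition, a toy sketch of the erode/XOR construction above: it builds a thin ring of clean interior pixels just inside the mask boundary, which is then dilated outward to overwrite contaminated edge colors. The disc, sizes, and iteration counts below are illustrative only, not the values used by the function:

import cv2
import numpy as np

mask = np.zeros((64, 64), np.uint8)
cv2.circle(mask, (32, 32), 20, 255, -1)  # filled disc standing in for a foreground mask

kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
eroded = cv2.erode(mask, kernel, iterations=3)
double_eroded = cv2.erode(eroded, kernel, iterations=5)
ring = cv2.bitwise_xor(eroded, double_eroded)  # thin band just inside the boundary

# Areas shrink with each erosion; the XOR keeps only the band between the two.
print(mask.sum() // 255, eroded.sum() // 255, double_eroded.sum() // 255, ring.sum() // 255)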
264
+
265
+ @spaces.GPU(duration=120)
266
+ def generate_image_condition(position_imgs, normal_imgs, mask_imgs, w2c, text_prompt, selected_view="First View", seed=42, model="SDXL", edge_refinement=True, progress=gr.Progress()):
267
+ """
268
+ Generate the image condition based on the selected view's silhouette and text prompt.
269
+ :param position_imgs: Position images from different views.
270
+ :param normal_imgs: Normal images from different views.
271
+ :param mask_imgs: Mask images from different views.
272
+ :param w2c: World-to-camera transformation matrices.
273
+ :param text_prompt: The text prompt for image generation.
274
+ :param selected_view: The selected view for image generation.
275
+ :param seed: Random seed for image generation.
276
+ :param model: The image generation model type, supports "SDXL" and "FLUX".
277
+ :param progress: Progress callback for Gradio.
278
+ :param edge_refinement: Whether to apply edge refinement to smooth mask boundaries (default: True).
279
+ :return: Generated condition image and status message.
280
+ """
281
+
282
+ progress(0, desc="Handling geometry information...")
283
+ silhouette = get_silhouette_image(position_imgs, normal_imgs, mask_imgs=mask_imgs, w2c=w2c, selected_view=selected_view)
284
+ depth_img = silhouette[0]
285
+ normal_img = silhouette[1]
286
+ mask = silhouette[2]
287
+
288
+ try:
289
+ if model == "SDXL":
290
+ condition = generate_sdxl_condition(depth_img, normal_img, text_prompt, mask, seed, edge_refinement=edge_refinement, progress=progress)
291
+ return condition, "SDXL condition generated successfully."
292
+ elif model == "FLUX":
293
+ # FLUX only supports depth control, not normal
294
+ condition = generate_flux_condition(depth_img, text_prompt, mask, seed, edge_refinement=edge_refinement, progress=progress)
295
+ return condition, "FLUX condition generated successfully (depth-only control)."
296
+ else:
297
+ raise ValueError(f"Unsupported image generation model type: {model}. Supported models: 'SDXL', 'FLUX'.")
298
+ finally:
299
+ torch.cuda.empty_cache()
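Both `lazy_get_sdxl_pipe` and `lazy_get_flux_pipe` rely on the same double-checked-locking lazy-singleton pattern: a lock-free fast path when the pipeline is already loaded, plus a re-check inside the lock so concurrent Gradio requests trigger `from_pretrained` only once. A minimal sketch of that pattern (the `load_pipeline` callable is a stand-in, not a function in this repo):

import threading

_PIPE = None
_PIPE_LOCK = threading.Lock()

def lazy_get_pipe(load_pipeline):
    """Return the shared pipeline, loading it at most once across threads."""
    global _PIPE
    if _PIPE is not None:        # fast path: already loaded, no locking needed
        return _PIPE
    with _PIPE_LOCK:
        if _PIPE is not None:    # another request may have loaded it while we waited
            return _PIPE
        _PIPE = load_pipeline()  # slow path: runs exactly once
    return _PIPE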
utils/mesh_utils.py ADDED
@@ -0,0 +1,500 @@
1
+ import os
2
+ import tempfile
3
+
4
+ import numpy as np
5
+ import torch
6
+ import trimesh
7
+ import xatlas
8
+ from PIL import Image
9
+
10
+ import gradio as gr
11
+
12
+ from .render_utils import (get_mvp_matrix, get_pure_texture, render_geo_map,
13
+ render_geo_views_tensor, render_views, setup_lights)
14
+
15
+
16
+ class Mesh:
17
+ def __init__(self, mesh_path=None, uv_tool="xAtlas", device='cuda', progress=gr.Progress()):
18
+ """
19
+ Initialize the Mesh object with a mesh file path.
20
+ :param mesh_path: Path to the mesh file (e.g., .obj or .glb).
21
+ """
22
+ self.device = device
23
+ if mesh_path is not None:
24
+ # Initialize _parts dictionary to store all parts
25
+ self._parts = {}
26
+
27
+ if mesh_path.endswith('.obj'):
28
+ progress(0., f"Loading mesh in .obj format...")
29
+ mesh_data = trimesh.load(mesh_path, process=False)
30
+
31
+ # Check if it's a mesh list (multi-part obj)
32
+ if isinstance(mesh_data, list):
33
+ progress(0.1, f"Handling part list...")
34
+ for i, mesh_part in enumerate(mesh_data):
35
+ self._add_part_to_parts(f"part_{i}", mesh_part)
36
+ # Check if it's a Scene (another multi-part format)
37
+ elif isinstance(mesh_data, trimesh.Scene):
38
+ progress(0.1, f"Handling Scenes...")
39
+ geometry = mesh_data.geometry
40
+ if len(geometry) > 0:
41
+ for key, mesh_part in geometry.items():
42
+ self._add_part_to_parts(key, mesh_part)
43
+ else:
44
+ raise ValueError("Empty scene, no mesh data found.")
45
+ else:
46
+ # Single part obj
47
+ progress(0.1, f"Handling single part...")
48
+ self._add_part_to_parts("part_0", mesh_data)
49
+
50
+ elif mesh_path.endswith('.glb'):
51
+ progress(0., f"Loading mesh in .glb format...")
52
+ mesh_loaded = trimesh.load(mesh_path)
53
+
54
+ # Check if it's a Scene (multi-part glb)
55
+ if isinstance(mesh_loaded, trimesh.Scene):
56
+ progress(0.1, f"Handling Scenes...")
57
+ geometry = mesh_loaded.geometry
58
+ if len(geometry) > 0:
59
+ for key, mesh_part in geometry.items():
60
+ self._add_part_to_parts(key, mesh_part)
61
+ else:
62
+ raise ValueError("Empty scene, no mesh data found.")
63
+ else:
64
+ # Single part glb
65
+ progress(0.1, f"Handling single part...")
66
+ self._add_part_to_parts("part_0", mesh_loaded)
67
+ else:
68
+ raise ValueError(f"Unsupported file format: {mesh_path}")
69
+
70
+ # Automatically merge all parts during initialization
71
+ progress(0.2, "Merging parts if the mesh has multiple parts.")
72
+ self._merge_parts_internal()
73
+ else:
74
+ raise ValueError("Mesh path cannot be None.")
75
+ self.to(self.device) # Move to the specified device
76
+
77
+ # Initialize transformation flags
78
+ self._upside_down_applied = False
79
+
80
+ # UV parameterization
81
+ if self.has_multi_parts or not self.has_uv:
82
+ progress(0.4, f"Using {uv_tool} for UV parameterization. It may take quite a while (several minutes) if there are many faces. We STRONGLY recommend using a mesh with UV parameterization.")
83
+ if uv_tool == "xAtlas":
84
+ self.uv_xatlas_mapping() # Use default parameters
85
+ elif uv_tool == "UVAtlas":
86
+ raise NotImplementedError("UVAtlas parameterization is not implemented yet.")
87
+ else:
88
+ raise ValueError("Unsupported UV parameterization tool.")
89
+ print("UV parameterization completed.")
90
+ else:
91
+ progress(0.4, "The mesh already has a single UV parameterization; no need to reparameterize.")
92
+ self._vmapping = None # No vmapping needed when not reparameterizing
93
+
94
+ def to(self, device):
95
+ """
96
+ Move the mesh data to the specified device.
97
+ :param device: The target device (e.g., 'cuda' or 'cpu').
98
+ """
99
+ self._v_pos = self._v_pos.to(device)
100
+ self._t_pos_idx = self._t_pos_idx.to(device)
101
+ if self._v_tex is not None:
102
+ self._v_tex = self._v_tex.to(device)
103
+ self._t_tex_idx = self._t_tex_idx.to(device)
104
+ if hasattr(self, '_vmapping') and self._vmapping is not None:
105
+ self._vmapping = self._vmapping.to(device)
106
+ self._v_normal = self._v_normal.to(device)
107
+
108
+ @property
109
+ def has_multi_parts(self):
110
+ """
111
+ Check if the mesh has multiple parts.
112
+ :return: Boolean indicating whether the mesh has multiple parts.
113
+ """
114
+ # If _parts is None, it means already merged, not multi-part
115
+ if self._parts is None:
116
+ return False
117
+ return len(self._parts) > 1
118
+
119
+ @property
120
+ def v_pos(self):
121
+ """Vertex positions property."""
122
+ return self._v_pos
123
+
124
+ @v_pos.setter
125
+ def v_pos(self, value):
126
+ self._v_pos = value
127
+
128
+ @property
129
+ def t_pos_idx(self):
130
+ """Triangle position indices property."""
131
+ return self._t_pos_idx
132
+
133
+ @t_pos_idx.setter
134
+ def t_pos_idx(self, value):
135
+ self._t_pos_idx = value
136
+
137
+ @property
138
+ def v_tex(self):
139
+ """Vertex texture coordinates property."""
140
+ return self._v_tex
141
+
142
+ @v_tex.setter
143
+ def v_tex(self, value):
144
+ self._v_tex = value
145
+
146
+ @property
147
+ def t_tex_idx(self):
148
+ """Triangle texture indices property."""
149
+ return self._t_tex_idx
150
+
151
+ @t_tex_idx.setter
152
+ def t_tex_idx(self, value):
153
+ self._t_tex_idx = value
154
+
155
+ @property
156
+ def v_normal(self):
157
+ """Vertex normals property."""
158
+ return self._v_normal
159
+
160
+ @v_normal.setter
161
+ def v_normal(self, value):
162
+ self._v_normal = value
163
+
164
+ @property
165
+ def has_uv(self):
166
+ """
167
+ Check if the mesh has a valid UV mapping.
168
+ :return: Boolean indicating whether the mesh has UV mapping.
169
+ """
170
+ return self.v_tex is not None
171
+
172
+ def uv_xatlas_mapping(self, xatlas_chart_options: dict = {}, xatlas_pack_options: dict = {}):
173
+ # Merged mesh, directly add_mesh as a whole
174
+ atlas = xatlas.Atlas()
175
+ v_pos_np = self.v_pos.detach().cpu().numpy()
176
+ t_pos_idx_np = self.t_pos_idx.cpu().numpy()
177
+ atlas.add_mesh(v_pos_np, t_pos_idx_np)
178
+
179
+ # Set reasonable pack parameters to avoid overlap
180
+ co = xatlas.ChartOptions()
181
+ po = xatlas.PackOptions()
182
+ # Recommended default parameters
183
+ if 'resolution' not in xatlas_pack_options:
184
+ po.resolution = 1024 # or larger
185
+ if 'padding' not in xatlas_pack_options:
186
+ po.padding = 2
187
+ for k, v in xatlas_chart_options.items():
188
+ setattr(co, k, v)
189
+ for k, v in xatlas_pack_options.items():
190
+ setattr(po, k, v)
191
+ atlas.generate(co, po)
192
+
193
+ # Get unpacked data
194
+ vmapping, indices, uvs = atlas.get_mesh(0)
195
+ # vmapping: new UV vertex -> original mesh vertex
196
+ # indices: new triangle face indices (based on new UV vertices)
197
+ # uvs: new UV vertex coordinates
198
+ device = self.v_pos.device
199
+ vmapping = torch.from_numpy(vmapping.astype(np.uint64, casting="same_kind").view(np.int64)).to(device).long()
200
+ uvs = torch.from_numpy(uvs).to(device).float()
201
+ indices = torch.from_numpy(indices.astype(np.uint64, casting="same_kind").view(np.int64)).to(device).long()
202
+
203
+ self.v_tex = uvs # new UV vertices
204
+ self.t_tex_idx = indices # new triangle face indices (based on UV vertices)
205
+ self._vmapping = vmapping # save UV vertex to original vertex mapping for export
206
+
207
+ def normalize(self):
208
+ """
209
+ Normalize mesh vertices to [-1, 1] range.
210
+ """
211
+ vertices = self.v_pos
212
+ bounding_box_max = vertices.max(0)[0]
213
+ bounding_box_min = vertices.min(0)[0]
214
+ mesh_scale = 2.0 # Scale to [-1, 1]
215
+ scale = mesh_scale / ((bounding_box_max - bounding_box_min).max() + 1e-6)
216
+ center_offset = (bounding_box_max + bounding_box_min) * 0.5
217
+ self.v_pos = (vertices - center_offset) * scale
218
+
219
+ def vertex_transform(self):
220
+ """
221
+ Apply coordinate transformation to mesh vertices and normals.
222
+ """
223
+ # Transform normals
224
+ pre_normals = self.v_normal
225
+ normals = torch.clone(pre_normals)
226
+ normals[:, 1] = -pre_normals[:, 2] # -z --> y
227
+ normals[:, 2] = pre_normals[:, 1] # y --> z
228
+
229
+ # Transform vertices
230
+ pre_vertices = self.v_pos
231
+ vertices = torch.clone(pre_vertices)
232
+ vertices[:, 1] = -pre_vertices[:, 2] # -z --> y
233
+ vertices[:, 2] = pre_vertices[:, 1] # y --> z
234
+
235
+ # Update mesh
236
+ self.v_normal = normals
237
+ self.v_pos = vertices
238
+
239
+ def vertex_transform_y2x(self):
240
+ """
241
+ Apply coordinate transformation to mesh vertices and normals.
242
+ """
243
+ # Transform normals
244
+ pre_normals = self.v_normal
245
+ normals = torch.clone(pre_normals)
246
+ normals[:, 1] = -pre_normals[:, 0] # -x --> y
247
+ normals[:, 0] = pre_normals[:, 1] # y --> x
248
+
249
+ # Transform vertices
250
+ pre_vertices = self.v_pos
251
+ vertices = torch.clone(pre_vertices)
252
+ vertices[:, 1] = -pre_vertices[:, 0] # -x --> y
253
+ vertices[:, 0] = pre_vertices[:, 1] # y --> x
254
+
255
+ # Update mesh
256
+ self.v_normal = normals
257
+ self.v_pos = vertices
258
+
259
+ def vertex_transform_z2x(self):
260
+ """
261
+ Apply coordinate transformation to mesh vertices and normals.
262
+ """
263
+ # Transform normals
264
+ pre_normals = self.v_normal
265
+ normals = torch.clone(pre_normals)
266
+ normals[:, 2] = -pre_normals[:, 0] # -x --> z
267
+ normals[:, 0] = pre_normals[:, 2] # z --> x
268
+
269
+ # Transform vertices
270
+ pre_vertices = self.v_pos
271
+ vertices = torch.clone(pre_vertices)
272
+ vertices[:, 2] = -pre_vertices[:, 0] # -x --> z
273
+ vertices[:, 0] = pre_vertices[:, 2] # z --> x
274
+
275
+ # Update mesh
276
+ self.v_normal = normals
277
+ self.v_pos = vertices
278
+
279
+ def vertex_transform_upsidedown(self):
280
+ """
281
+ Apply upside-down transformation to mesh vertices and normals.
282
+ """
283
+ # Transform normals
284
+ pre_normals = self.v_normal
285
+ normals = torch.clone(pre_normals)
286
+ normals[:, 2] = -pre_normals[:, 2]
287
+
288
+ # Transform vertices
289
+ pre_vertices = self.v_pos
290
+ vertices = torch.clone(pre_vertices)
291
+ vertices[:, 2] = -pre_vertices[:, 2]
292
+
293
+ # Update mesh
294
+ self.v_normal = normals
295
+ self.v_pos = vertices
296
+ # self.t_pos_idx = faces
297
+
298
+ # Mark that the upside-down transformation has been applied
299
+ self._upside_down_applied = True
300
+
301
+ def _add_part_to_parts(self, key, mesh_part):
302
+ """
303
+ Add a single mesh part to the _parts dictionary.
305
+ :param key: Key name of the part.
306
+ :param mesh_part: A trimesh object.
306
+ """
307
+ # exclude PointCloud parts and empty parts
308
+ if hasattr(mesh_part, 'vertices') and hasattr(mesh_part, 'faces') and len(mesh_part.vertices) > 0 and len(mesh_part.faces) > 0:
309
+ raw_uv = getattr(mesh_part.visual, 'uv', None)
310
+ processed_v_tex = None
311
+ processed_t_tex_idx = None
312
+
313
+ # Only process the UV data when it exists and is non-empty
314
+ if raw_uv is not None and np.asarray(raw_uv).size > 0 and np.asarray(raw_uv).shape[0] > 0:
315
+ processed_v_tex = torch.tensor(raw_uv, dtype=torch.float32)
316
+ # Assume that when the source data provides UVs, t_tex_idx uses the same face indices as t_pos_idx
317
+ # trimesh typically provides per-vertex UVs
318
+ processed_t_tex_idx = torch.tensor(mesh_part.faces, dtype=torch.int32)
319
+
320
+ self._parts[key] = {
321
+ 'v_pos': torch.tensor(mesh_part.vertices, dtype=torch.float32),
322
+ 't_pos_idx': torch.tensor(mesh_part.faces, dtype=torch.int32),
323
+ 'v_tex': processed_v_tex,
324
+ 't_tex_idx': processed_t_tex_idx,
325
+ 'v_normal': torch.tensor(mesh_part.vertex_normals, dtype=torch.float32)
326
+ }
327
+
328
+ def _merge_parts_internal(self):
329
+ """
330
+ Internal merge helper, called automatically during initialization.
331
+ Merges all parts in _parts into a single mesh representation.
332
+ """
333
+ # Simplified handling when there are no parts or only one part
334
+ if not self._parts:
335
+ raise ValueError("No mesh parts.")
336
+ elif len(self._parts) == 1:
337
+ key = next(iter(self._parts))
338
+ part = self._parts[key]
339
+ self._v_pos = part['v_pos']
340
+ self._t_pos_idx = part['t_pos_idx']
341
+ self._v_tex = part['v_tex']
342
+ self._t_tex_idx = part['t_tex_idx']
343
+ self._v_normal = part['v_normal']
344
+ self._parts = None # Clear the _parts dictionary to free memory
345
+ return
346
+
347
+ # Initialize the merged data
348
+ vertices = []
349
+ faces = []
350
+ normals = []
351
+
352
+ # Record vertex count for each part, used to adjust face indices
353
+ v_count = 0
354
+
355
+ # Iterate through all parts
356
+ for key, part in self._parts.items():
357
+ # Add vertices
358
+ vertices.append(part['v_pos'])
359
+
360
+ # Adjust face indices and add
361
+ if len(faces) > 0:
362
+ adjusted_faces = part['t_pos_idx'] + v_count
363
+ faces.append(adjusted_faces)
364
+ else:
365
+ faces.append(part['t_pos_idx'])
366
+
367
+ # Add normals
368
+ normals.append(part['v_normal'])
369
+
370
+ # Update vertex count
371
+ v_count += part['v_pos'].shape[0]
372
+
373
+ self._parts = None # Clear _parts dictionary to free memory
374
+
375
+ # Merge all data
376
+ self._v_pos = torch.cat(vertices, dim=0)
377
+ self._t_pos_idx = torch.cat(faces, dim=0)
378
+ self._v_normal = torch.cat(normals, dim=0)
379
+ self._v_tex = None # multi-parts mesh must be reparameterized
380
+ self._t_tex_idx = None # multi-parts mesh must be reparameterized
381
+ self._vmapping = None # multi-parts mesh must be reparameterized
382
+
383
+ @classmethod
384
+ def export(cls, mesh, save_path=None, texture_map: Image.Image = None):
385
+ """
386
+ Exports the mesh to a GLB file.
387
+ :param mesh: Mesh instance to export
388
+ :param save_path: Optional path to save the GLB file. If None, a temporary file will be created.
389
+ :param texture_map: Optional PIL.Image to use as the texture. If None, a default texture will be used.
390
+ :return: Path to the exported GLB file.
391
+ """
392
+ # The input mesh has always been processed, so assert it is a single part with a UV mapping
393
+ assert not mesh.has_multi_parts, "Mesh should be processed and merged to single part"
394
+ assert mesh.has_uv, "Mesh should have UV mapping after processing"
395
+
396
+ if save_path is None:
397
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".glb")
398
+ save_path = temp_file.name
399
+ temp_file.close()
400
+
401
+ # Create the material
402
+ if texture_map is not None:
403
+ if type(texture_map) is np.ndarray:
404
+ texture_map = Image.fromarray(texture_map)
405
+ assert type(texture_map) is Image.Image, "texture_map should be a PIL.Image"
406
+ texture_map = texture_map.transpose(Image.FLIP_TOP_BOTTOM).convert("RGB")
407
+ material = trimesh.visual.texture.SimpleMaterial(image=texture_map)
408
+ else:
409
+ default_texture = Image.new("RGB", (1024, 1024), (200, 200, 200))
410
+ material = trimesh.visual.texture.SimpleMaterial(image=default_texture)
411
+
412
+ # If vmapping exists (processed by xatlas), need to rebuild vertices to match UV layout
413
+ if hasattr(mesh, '_vmapping') and mesh._vmapping is not None:
414
+ # Use xatlas-generated UV layout to rebuild mesh
415
+ vertices = mesh.v_pos[mesh._vmapping].cpu().numpy()
416
+ faces = mesh.t_tex_idx.cpu().numpy()
417
+ uvs = mesh.v_tex.cpu().numpy()
418
+ else:
419
+ # Original UV mapping, directly use original vertices and faces
420
+ vertices = mesh.v_pos.cpu().numpy()
421
+ faces = mesh.t_pos_idx.cpu().numpy()
422
+ uvs = mesh.v_tex.cpu().numpy()
423
+
424
+ # If upside_down transformation was applied, need to apply face orientation correction
425
+ if hasattr(mesh, '_upside_down_applied') and mesh._upside_down_applied:
426
+ faces_corrected = faces.copy()
427
+ faces_corrected[:, [1, 2]] = faces[:, [2, 1]] # (0,1,2) -> (0,2,1)
428
+ faces = faces_corrected
429
+
430
+ # Apply inverse transformation to convert vertices from rendering coordinate system back to GLB coordinate system
431
+ # This is the inverse of vertex_transform:
432
+ # vertex_transform: y = -z, z = y
433
+ # inverse transformation: y = z, z = -y
434
+ vertices_export = vertices.copy()
435
+ vertices_export[:, 1] = vertices[:, 2] # z → y
436
+ vertices_export[:, 2] = -vertices[:, 1] # -y → z
437
+
438
+ # Create Trimesh object and set texture
439
+ mesh_export = trimesh.Trimesh(vertices=vertices_export, faces=faces, process=False)
440
+ mesh_export.visual = trimesh.visual.TextureVisuals(uv=uvs, material=material)
441
+
442
+ # Export GLB file
443
+ mesh_export.export(file_obj=save_path, file_type='glb')
444
+
445
+ return save_path
446
+
447
+ @classmethod
448
+ def process(cls, mesh_file, uv_tool="xAtlas", y2z=True, y2x=False, z2x=False, upside_down=False, img_size=(512, 512), uv_size=(1024, 1024), device='cuda', progress=gr.Progress()):
449
+ """
450
+ Handle the mesh processing, which includes normalization, parts merging, and UV mapping.
451
+ Then render the untextured mesh from four views.
452
+ :param mesh_file: uploaded mesh file.
453
+ :param uv_tool: the UV parameterization tool, default is "xAtlas".
454
+ :return: position/normal maps, per-view geometry renders and masks, camera matrices, the processed mesh, the MVP matrices, and a status message.
455
+ """
456
+ # load mesh (automatically merge multiple parts)
457
+ mesh: Mesh = cls(mesh_file, uv_tool, device, progress=progress)
458
+
459
+ progress(0.7, f"Handling transformation and normalization...")
460
+ # normalize mesh
461
+ if y2z:
462
+ mesh.vertex_transform() # transform vertices and normals
463
+ if y2x:
464
+ mesh.vertex_transform_y2x()
465
+ if z2x:
466
+ mesh.vertex_transform_z2x()
467
+ if upside_down:
468
+ mesh.vertex_transform_upsidedown()
469
+ mesh.normalize()
470
+
471
+ # render preparation
472
+ texture = get_pure_texture(uv_size).to(device) # tensor of shape (3, height, width)
473
+ # lights = setup_lights()
474
+ lights = None
475
+ mvp_matrix, w2c = get_mvp_matrix(mesh)
476
+ mvp_matrix = mvp_matrix.to(device)
477
+ w2c = w2c.to(device)
478
+
479
+ # render untextured mesh from four views
480
+ # images = render_views(mesh, texture, mvp_matrix, lights, img_size) # PIL.Image
481
+ progress(0.8, f"Rendering clay model views...")
482
+ print(f"Rendering geometry views...")
483
+ position_images, normal_images, mask_images = render_geo_views_tensor(mesh, mvp_matrix, img_size) # torch.Tensor # [batch_size, height, width, 3]
484
+ progress(0.9, f"Rendering geometry maps...")
485
+ print(f"Rendering geometry maps...")
486
+ position_map, normal_map = render_geo_map(mesh)
487
+
488
+ progress(1, f"Mesh processing completed.")
489
+ return position_map, normal_map, position_images, normal_images, mask_images.squeeze(-1), w2c, mesh, mvp_matrix, "Mesh processing completed."
490
+
491
+
492
+ if __name__ == '__main__':
493
+ glb_path = "/mnt/pfs/users/yuanze/projects/clean_seqtex/gradio/examples/multi_parts.glb"
494
+ position_map, normal_map, position_images, normal_images, w2c = Mesh.process(glb_path)
495
+ position_map.save("position_map.png")
496
+ normal_map.save("normal_map.png")
497
+
498
+ # Save the [-1, 1] range normal_images as a PIL image
499
+ # normal_images = rearrange(normal_images, "B H W C -> B C H W")
500
+ # save_image(normal_images, "normal_images.png", normalize=True, value_range=(-1, 1))
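Taken together, a hedged sketch of the intended call sequence for these utilities; the mesh path and prompt below are placeholders, and in the Space these calls run inside Gradio event handlers, which supply the `gr.Progress` callback and the ZeroGPU context:

from utils.mesh_utils import Mesh
from utils.image_generation import generate_image_condition

# 1. Load, merge, UV-parameterize, and render the geometry buffers.
(position_map, normal_map, position_images, normal_images,
 mask_images, w2c, mesh, mvp_matrix, status) = Mesh.process("path/to/your_mesh.glb")

# 2. Generate an image condition for one view from the depth/normal silhouettes.
condition, msg = generate_image_condition(
    position_images, normal_images, mask_images, w2c,
    text_prompt="a weathered wooden toy",  # placeholder prompt
    selected_view="First View", seed=42, model="SDXL",
)

# 3. Export the processed mesh (here with the default grey texture).
glb_path = Mesh.export(mesh, texture_map=None)
print(status, msg, glb_path)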
utils/pipeline_controlnet_union_sd_xl.py ADDED
@@ -0,0 +1,1397 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import inspect
17
+ import os
18
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
19
+
20
+ import numpy as np
21
+ import PIL.Image
22
+ import torch
23
+ import torch.nn.functional as F
24
+ import gradio as gr
25
+ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer,CLIPImageProcessor,CLIPVisionModelWithProjection
26
+
27
+ from diffusers.utils.import_utils import is_invisible_watermark_available
28
+
29
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
30
+ from diffusers.loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin,IPAdapterMixin
31
+ from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel,ImageProjection
32
+ from .controlnet_union import ControlNetModel_Union
33
+ from diffusers.models.attention_processor import (
34
+ AttnProcessor2_0,
35
+ LoRAAttnProcessor2_0,
36
+ LoRAXFormersAttnProcessor,
37
+ XFormersAttnProcessor,
38
+ )
39
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
40
+ from diffusers.schedulers import KarrasDiffusionSchedulers
41
+ from diffusers.utils import (
42
+ is_accelerate_available,
43
+ is_accelerate_version,
44
+ logging,
45
+ replace_example_docstring,
46
+ )
47
+ from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
48
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
49
+ from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
50
+
51
+ if is_invisible_watermark_available():
52
+ from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
53
+
54
+ from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
55
+
56
+
57
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
58
+
59
+
60
+ EXAMPLE_DOC_STRING = """
61
+ Examples:
62
+ ```py
63
+ >>> # !pip install opencv-python transformers accelerate
64
+ >>> from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
65
+ >>> from diffusers.utils import load_image
66
+ >>> import numpy as np
67
+ >>> import torch
68
+
69
+ >>> import cv2
70
+ >>> from PIL import Image
71
+
72
+ >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
73
+ >>> negative_prompt = "low quality, bad quality, sketches"
74
+
75
+ >>> # download an image
76
+ >>> image = load_image(
77
+ ... "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
78
+ ... )
79
+
80
+ >>> # initialize the models and pipeline
81
+ >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization
82
+ >>> controlnet = ControlNetModel.from_pretrained(
83
+ ... "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
84
+ ... )
85
+ >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
86
+ >>> pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
87
+ ... "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16
88
+ ... )
89
+ >>> pipe.enable_model_cpu_offload()
90
+
91
+ >>> # get canny image
92
+ >>> image = np.array(image)
93
+ >>> image = cv2.Canny(image, 100, 200)
94
+ >>> image = image[:, :, None]
95
+ >>> image = np.concatenate([image, image, image], axis=2)
96
+ >>> canny_image = Image.fromarray(image)
97
+
98
+ >>> # generate image
99
+ >>> image = pipe(
100
+ ... prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image
101
+ ... ).images[0]
102
+ ```
103
+ """
104
+
105
+
106
+ class StableDiffusionXLControlNetUnionPipeline(
107
+ DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin,IPAdapterMixin
108
+ ):
109
+ r"""
110
+ Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance.
111
+
112
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
113
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
114
+
115
+ The pipeline also inherits the following loading methods:
116
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
117
+ - [`loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
118
+ - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
119
+
120
+ Args:
121
+ vae ([`AutoencoderKL`]):
122
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
123
+ text_encoder ([`~transformers.CLIPTextModel`]):
124
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
125
+ text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]):
126
+ Second frozen text-encoder
127
+ ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
128
+ tokenizer ([`~transformers.CLIPTokenizer`]):
129
+ A `CLIPTokenizer` to tokenize text.
130
+ tokenizer_2 ([`~transformers.CLIPTokenizer`]):
131
+ A `CLIPTokenizer` to tokenize text.
132
+ unet ([`UNet2DConditionModel`]):
133
+ A `UNet2DConditionModel` to denoise the encoded image latents.
134
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
135
+ Provides additional conditioning to the `unet` during the denoising process. If you set multiple
136
+ ControlNets as a list, the outputs from each ControlNet are added together to create one combined
137
+ additional conditioning.
138
+ scheduler ([`SchedulerMixin`]):
139
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
140
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
141
+ force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
142
+ Whether the negative prompt embeddings should always be set to 0. Also see the config of
143
+ `stabilityai/stable-diffusion-xl-base-1-0`.
144
+ add_watermarker (`bool`, *optional*):
145
+ Whether to use the [invisible_watermark](https://github.com/ShieldMnt/invisible-watermark/) library to
146
+ watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no
147
+ watermarker is used.
148
+ """
149
+ model_cpu_offload_seq = (
150
+ "text_encoder->text_encoder_2->image_encoder->unet->vae" # leave controlnet out on purpose because it iterates with unet
151
+ )
152
+
153
+ def __init__(
154
+ self,
155
+ vae: AutoencoderKL,
156
+ text_encoder: CLIPTextModel,
157
+ text_encoder_2: CLIPTextModelWithProjection,
158
+ tokenizer: CLIPTokenizer,
159
+ tokenizer_2: CLIPTokenizer,
160
+ unet: UNet2DConditionModel,
161
+ controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
162
+ scheduler: KarrasDiffusionSchedulers,
163
+ feature_extractor: CLIPImageProcessor = None,
164
+ image_encoder: CLIPVisionModelWithProjection = None,
165
+ force_zeros_for_empty_prompt: bool = True,
166
+ add_watermarker: Optional[bool] = None,
167
+ ):
168
+ super().__init__()
169
+
170
+ if isinstance(controlnet, (list, tuple)):
171
+ controlnet = MultiControlNetModel(controlnet)
172
+
173
+ self.register_modules(
174
+ vae=vae,
175
+ text_encoder=text_encoder,
176
+ text_encoder_2=text_encoder_2,
177
+ tokenizer=tokenizer,
178
+ tokenizer_2=tokenizer_2,
179
+ unet=unet,
180
+ controlnet=controlnet,
181
+ scheduler=scheduler,
182
+ feature_extractor=feature_extractor,
183
+ image_encoder=image_encoder,
184
+ )
185
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
186
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
187
+ self.control_image_processor = VaeImageProcessor(
188
+ vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
189
+ )
190
+ add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
191
+
192
+ if add_watermarker:
193
+ self.watermark = StableDiffusionXLWatermarker()
194
+ else:
195
+ self.watermark = None
196
+
197
+ self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
198
+
199
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
200
+ def enable_vae_slicing(self):
201
+ r"""
202
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
203
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
204
+ """
205
+ self.vae.enable_slicing()
206
+
207
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
208
+ def disable_vae_slicing(self):
209
+ r"""
210
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
211
+ computing decoding in one step.
212
+ """
213
+ self.vae.disable_slicing()
214
+
215
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
216
+ def enable_vae_tiling(self):
217
+ r"""
218
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
219
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
220
+ processing larger images.
221
+ """
222
+ self.vae.enable_tiling()
223
+
224
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
225
+ def disable_vae_tiling(self):
226
+ r"""
227
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
228
+ computing decoding in one step.
229
+ """
230
+ self.vae.disable_tiling()
231
+
232
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
233
+ def encode_prompt(
234
+ self,
235
+ prompt: str,
236
+ prompt_2: Optional[str] = None,
237
+ device: Optional[torch.device] = None,
238
+ num_images_per_prompt: int = 1,
239
+ do_classifier_free_guidance: bool = True,
240
+ negative_prompt: Optional[str] = None,
241
+ negative_prompt_2: Optional[str] = None,
242
+ prompt_embeds: Optional[torch.FloatTensor] = None,
243
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
244
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
245
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
246
+ lora_scale: Optional[float] = None,
247
+ ):
248
+ r"""
249
+ Encodes the prompt into text encoder hidden states.
250
+
251
+ Args:
252
+ prompt (`str` or `List[str]`, *optional*):
253
+ prompt to be encoded
254
+ prompt_2 (`str` or `List[str]`, *optional*):
255
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
256
+ used in both text-encoders
257
+ device: (`torch.device`):
258
+ torch device
259
+ num_images_per_prompt (`int`):
260
+ number of images that should be generated per prompt
261
+ do_classifier_free_guidance (`bool`):
262
+ whether to use classifier free guidance or not
263
+ negative_prompt (`str` or `List[str]`, *optional*):
264
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
265
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
266
+ less than `1`).
267
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
268
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
269
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
270
+ prompt_embeds (`torch.FloatTensor`, *optional*):
271
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
272
+ provided, text embeddings will be generated from `prompt` input argument.
273
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
274
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
275
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
276
+ argument.
277
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
278
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
279
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
280
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
281
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
282
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
283
+ input argument.
284
+ lora_scale (`float`, *optional*):
285
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
286
+ """
287
+ device = device or self._execution_device
288
+
289
+ # set lora scale so that monkey patched LoRA
290
+ # function of text encoder can correctly access it
291
+ if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin,):
292
+ self._lora_scale = lora_scale
293
+
294
+ # dynamically adjust the LoRA scale
295
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
296
+ adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
297
+
298
+ if prompt is not None and isinstance(prompt, str):
299
+ batch_size = 1
300
+ elif prompt is not None and isinstance(prompt, list):
301
+ batch_size = len(prompt)
302
+ else:
303
+ batch_size = prompt_embeds.shape[0]
304
+
305
+ # Define tokenizers and text encoders
306
+ tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
307
+ text_encoders = (
308
+ [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
309
+ )
310
+
311
+ if prompt_embeds is None:
312
+ prompt_2 = prompt_2 or prompt
313
+ # textual inversion: process multi-vector tokens if necessary
314
+ prompt_embeds_list = []
315
+ prompts = [prompt, prompt_2]
316
+ for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
317
+ if isinstance(self, TextualInversionLoaderMixin):
318
+ prompt = self.maybe_convert_prompt(prompt, tokenizer)
319
+
320
+ text_inputs = tokenizer(
321
+ prompt,
322
+ padding="max_length",
323
+ max_length=tokenizer.model_max_length,
324
+ truncation=True,
325
+ return_tensors="pt",
326
+ )
327
+
328
+ text_input_ids = text_inputs.input_ids
329
+ untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
330
+
331
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
332
+ text_input_ids, untruncated_ids
333
+ ):
334
+ removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
335
+ logger.warning(
336
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
337
+ f" {tokenizer.model_max_length} tokens: {removed_text}"
338
+ )
339
+
340
+ prompt_embeds = text_encoder(
341
+ text_input_ids.to(device),
342
+ output_hidden_states=True,
343
+ )
344
+
345
+ # We are only ALWAYS interested in the pooled output of the final text encoder
346
+ pooled_prompt_embeds = prompt_embeds[0]
347
+ prompt_embeds = prompt_embeds.hidden_states[-2]
348
+
349
+ prompt_embeds_list.append(prompt_embeds)
350
+
351
+ prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
352
+
353
+ # get unconditional embeddings for classifier free guidance
354
+ zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
355
+ if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
356
+ negative_prompt_embeds = torch.zeros_like(prompt_embeds)
357
+ negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
358
+ elif do_classifier_free_guidance and negative_prompt_embeds is None:
359
+ negative_prompt = negative_prompt or ""
360
+ negative_prompt_2 = negative_prompt_2 or negative_prompt
361
+
362
+ uncond_tokens: List[str]
363
+ if prompt is not None and type(prompt) is not type(negative_prompt):
364
+ raise TypeError(
365
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
366
+ f" {type(prompt)}."
367
+ )
368
+ elif isinstance(negative_prompt, str):
369
+ uncond_tokens = [negative_prompt, negative_prompt_2]
370
+ elif batch_size != len(negative_prompt):
371
+ raise ValueError(
372
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
373
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
374
+ " the batch size of `prompt`."
375
+ )
376
+ else:
377
+ uncond_tokens = [negative_prompt, negative_prompt_2]
378
+
379
+ negative_prompt_embeds_list = []
380
+ for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
381
+ if isinstance(self, TextualInversionLoaderMixin):
382
+ negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
383
+
384
+ max_length = prompt_embeds.shape[1]
385
+ uncond_input = tokenizer(
386
+ negative_prompt,
387
+ padding="max_length",
388
+ max_length=max_length,
389
+ truncation=True,
390
+ return_tensors="pt",
391
+ )
392
+
393
+ negative_prompt_embeds = text_encoder(
394
+ uncond_input.input_ids.to(device),
395
+ output_hidden_states=True,
396
+ )
397
+ # We are always interested only in the pooled output of the final text encoder
398
+ negative_pooled_prompt_embeds = negative_prompt_embeds[0]
399
+ negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
400
+
401
+ negative_prompt_embeds_list.append(negative_prompt_embeds)
402
+
403
+ negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
404
+
405
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
406
+ bs_embed, seq_len, _ = prompt_embeds.shape
407
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
408
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
409
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
410
+
411
+ if do_classifier_free_guidance:
412
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
413
+ seq_len = negative_prompt_embeds.shape[1]
414
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
415
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
416
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
417
+
418
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
419
+ bs_embed * num_images_per_prompt, -1
420
+ )
421
+ if do_classifier_free_guidance:
422
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
423
+ bs_embed * num_images_per_prompt, -1
424
+ )
425
+
426
+ return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
427
+
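The concatenation above is what gives SDXL its 2048-dimensional text conditioning: the penultimate hidden states of the two CLIP encoders are joined along the channel axis, while only the second encoder's pooled output is kept. A minimal shape sketch with dummy tensors (the 768/1280 widths are assumptions based on the standard SDXL text encoders, not values read from this repository):

```py
import torch

# Penultimate hidden states of the two SDXL text encoders (widths assumed: 768 and 1280)
clip_l_hidden = torch.randn(1, 77, 768)    # self.text_encoder
clip_g_hidden = torch.randn(1, 77, 1280)   # self.text_encoder_2
pooled = torch.randn(1, 1280)              # pooled output of the final text encoder

prompt_embeds = torch.concat([clip_l_hidden, clip_g_hidden], dim=-1)
print(prompt_embeds.shape, pooled.shape)   # torch.Size([1, 77, 2048]) torch.Size([1, 1280])
```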
428
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
429
+ def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
430
+ dtype = next(self.image_encoder.parameters()).dtype
431
+
432
+ if not isinstance(image, torch.Tensor):
433
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
434
+
435
+ image = image.to(device=device, dtype=dtype)
436
+ if output_hidden_states:
437
+ image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
438
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
439
+ uncond_image_enc_hidden_states = self.image_encoder(
440
+ torch.zeros_like(image), output_hidden_states=True
441
+ ).hidden_states[-2]
442
+ uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
443
+ num_images_per_prompt, dim=0
444
+ )
445
+ return image_enc_hidden_states, uncond_image_enc_hidden_states
446
+ else:
447
+ image_embeds = self.image_encoder(image).image_embeds
448
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
449
+ uncond_image_embeds = torch.zeros_like(image_embeds)
450
+
451
+ return image_embeds, uncond_image_embeds
452
+
453
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
454
+ def prepare_ip_adapter_image_embeds(
455
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
456
+ ):
457
+ image_embeds = []
458
+ if do_classifier_free_guidance:
459
+ negative_image_embeds = []
460
+ if ip_adapter_image_embeds is None:
461
+ if not isinstance(ip_adapter_image, list):
462
+ ip_adapter_image = [ip_adapter_image]
463
+
464
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
465
+ raise ValueError(
466
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
467
+ )
468
+
469
+ for single_ip_adapter_image, image_proj_layer in zip(
470
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
471
+ ):
472
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
473
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
474
+ single_ip_adapter_image, device, 1, output_hidden_state
475
+ )
476
+
477
+ image_embeds.append(single_image_embeds[None, :])
478
+ if do_classifier_free_guidance:
479
+ negative_image_embeds.append(single_negative_image_embeds[None, :])
480
+ else:
481
+ for single_image_embeds in ip_adapter_image_embeds:
482
+ if do_classifier_free_guidance:
483
+ single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
484
+ negative_image_embeds.append(single_negative_image_embeds)
485
+ image_embeds.append(single_image_embeds)
486
+
487
+ ip_adapter_image_embeds = []
488
+ for i, single_image_embeds in enumerate(image_embeds):
489
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
490
+ if do_classifier_free_guidance:
491
+ single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
492
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
493
+
494
+ single_image_embeds = single_image_embeds.to(device=device)
495
+ ip_adapter_image_embeds.append(single_image_embeds)
496
+
497
+ return ip_adapter_image_embeds
498
+
499
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
500
+ def prepare_extra_step_kwargs(self, generator, eta):
501
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
502
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
503
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
504
+ # and should be between [0, 1]
505
+
506
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
507
+ extra_step_kwargs = {}
508
+ if accepts_eta:
509
+ extra_step_kwargs["eta"] = eta
510
+
511
+ # check if the scheduler accepts generator
512
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
513
+ if accepts_generator:
514
+ extra_step_kwargs["generator"] = generator
515
+ return extra_step_kwargs
516
+
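The `eta`/`generator` filtering above relies only on `inspect.signature`, so the same pattern works against any callable. A standalone sketch with a toy `step` function standing in for `scheduler.step` (hypothetical, purely to show the check):

```py
import inspect

def step(model_output, timestep, sample, generator=None):
    """Toy stand-in for scheduler.step; only its signature matters here."""
    return sample

extra_step_kwargs = {}
if "generator" in set(inspect.signature(step).parameters.keys()):
    extra_step_kwargs["generator"] = None  # would be a torch.Generator in the pipeline
print(extra_step_kwargs)  # {'generator': None}
```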
517
+ def check_inputs(
518
+ self,
519
+ prompt,
520
+ prompt_2,
521
+ image,
522
+ callback_steps,
523
+ negative_prompt=None,
524
+ negative_prompt_2=None,
525
+ prompt_embeds=None,
526
+ negative_prompt_embeds=None,
527
+ pooled_prompt_embeds=None,
528
+ negative_pooled_prompt_embeds=None,
529
+ controlnet_conditioning_scale=1.0,
530
+ control_guidance_start=0.0,
531
+ control_guidance_end=1.0,
532
+ ip_adapter_image=None,
533
+ ip_adapter_image_embeds=None,
534
+ ):
535
+ if (callback_steps is None) or (
536
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
537
+ ):
538
+ raise ValueError(
539
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
540
+ f" {type(callback_steps)}."
541
+ )
542
+
543
+ if prompt is not None and prompt_embeds is not None:
544
+ raise ValueError(
545
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
546
+ " only forward one of the two."
547
+ )
548
+ elif prompt_2 is not None and prompt_embeds is not None:
549
+ raise ValueError(
550
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
551
+ " only forward one of the two."
552
+ )
553
+ elif prompt is None and prompt_embeds is None:
554
+ raise ValueError(
555
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
556
+ )
557
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
558
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
559
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
560
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
561
+
562
+ if negative_prompt is not None and negative_prompt_embeds is not None:
563
+ raise ValueError(
564
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
565
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
566
+ )
567
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
568
+ raise ValueError(
569
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
570
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
571
+ )
572
+
573
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
574
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
575
+ raise ValueError(
576
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
577
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
578
+ f" {negative_prompt_embeds.shape}."
579
+ )
580
+
581
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
582
+ raise ValueError(
583
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
584
+ )
585
+
586
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
587
+ raise ValueError(
588
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
589
+ )
590
+
591
+ # `prompt` needs more sophisticated handling when there are multiple
592
+ # conditionings.
593
+ if isinstance(self.controlnet, MultiControlNetModel):
594
+ if isinstance(prompt, list):
595
+ logger.warning(
596
+ f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
597
+ " prompts. The conditionings will be fixed across the prompts."
598
+ )
599
+
600
+ # Check `image`
601
+ is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
602
+ self.controlnet, torch._dynamo.eval_frame.OptimizedModule
603
+ )
604
+ if (
605
+ isinstance(self.controlnet, ControlNetModel)
606
+ or is_compiled
607
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
608
+ ):
609
+ self.check_image(image, prompt, prompt_embeds)
610
+ elif (
611
+ isinstance(self.controlnet, ControlNetModel_Union)
612
+ or is_compiled
613
+ and isinstance(self.controlnet._orig_mod, ControlNetModel_Union)
614
+ ):
615
+ self.check_image(image, prompt, prompt_embeds)
616
+ elif (
617
+ isinstance(self.controlnet, MultiControlNetModel)
618
+ or is_compiled
619
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
620
+ ):
621
+ if not isinstance(image, list):
622
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
623
+
624
+ # When `image` is a nested list:
625
+ # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
626
+ elif any(isinstance(i, list) for i in image):
627
+ raise ValueError("Only a single batch of multiple conditionings is supported at the moment.")
628
+ elif len(image) != len(self.controlnet.nets):
629
+ raise ValueError(
630
+ f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
631
+ )
632
+
633
+ for image_ in image:
634
+ self.check_image(image_, prompt, prompt_embeds)
635
+ else:
636
+ assert False
637
+
638
+ # Check `controlnet_conditioning_scale`
639
+ if (
640
+ isinstance(self.controlnet, ControlNetModel)
641
+ or is_compiled
642
+ and isinstance(self.controlnet._orig_mod, ControlNetModel)
643
+ ):
644
+ if not isinstance(controlnet_conditioning_scale, float):
645
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
646
+
647
+ elif (
648
+ isinstance(self.controlnet, ControlNetModel_Union)
649
+ or is_compiled
650
+ and isinstance(self.controlnet._orig_mod, ControlNetModel_Union)
651
+ ):
652
+ if not isinstance(controlnet_conditioning_scale, float):
653
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
654
+
655
+ elif (
656
+ isinstance(self.controlnet, MultiControlNetModel)
657
+ or is_compiled
658
+ and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
659
+ ):
660
+ if isinstance(controlnet_conditioning_scale, list):
661
+ if any(isinstance(i, list) for i in controlnet_conditioning_scale):
662
+ raise ValueError("Only a single batch of multiple conditionings is supported at the moment.")
663
+ elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
664
+ self.controlnet.nets
665
+ ):
666
+ raise ValueError(
667
+ "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
668
+ " the same length as the number of controlnets"
669
+ )
670
+ else:
671
+ assert False
672
+
673
+ if not isinstance(control_guidance_start, (tuple, list)):
674
+ control_guidance_start = [control_guidance_start]
675
+
676
+ if not isinstance(control_guidance_end, (tuple, list)):
677
+ control_guidance_end = [control_guidance_end]
678
+
679
+ if len(control_guidance_start) != len(control_guidance_end):
680
+ raise ValueError(
681
+ f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
682
+ )
683
+
684
+ if isinstance(self.controlnet, MultiControlNetModel):
685
+ if len(control_guidance_start) != len(self.controlnet.nets):
686
+ raise ValueError(
687
+ f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
688
+ )
689
+
690
+ for start, end in zip(control_guidance_start, control_guidance_end):
691
+ if start >= end:
692
+ raise ValueError(
693
+ f"control guidance start: {start} cannot be larger than or equal to control guidance end: {end}."
694
+ )
695
+ if start < 0.0:
696
+ raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
697
+ if end > 1.0:
698
+ raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
699
+
700
+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
701
+ raise ValueError(
702
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
703
+ )
704
+
705
+ if ip_adapter_image_embeds is not None:
706
+ if not isinstance(ip_adapter_image_embeds, list):
707
+ raise ValueError(
708
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
709
+ )
710
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
711
+ raise ValueError(
712
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
713
+ )
714
+
715
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
716
+ def check_image(self, image, prompt, prompt_embeds):
717
+ image_is_pil = isinstance(image, PIL.Image.Image)
718
+ image_is_tensor = isinstance(image, torch.Tensor)
719
+ image_is_np = isinstance(image, np.ndarray)
720
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
721
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
722
+ image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
723
+
724
+ if (
725
+ not image_is_pil
726
+ and not image_is_tensor
727
+ and not image_is_np
728
+ and not image_is_pil_list
729
+ and not image_is_tensor_list
730
+ and not image_is_np_list
731
+ ):
732
+ raise TypeError(
733
+ f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
734
+ )
735
+
736
+ if image_is_pil:
737
+ image_batch_size = 1
738
+ else:
739
+ image_batch_size = len(image)
740
+
741
+ if prompt is not None and isinstance(prompt, str):
742
+ prompt_batch_size = 1
743
+ elif prompt is not None and isinstance(prompt, list):
744
+ prompt_batch_size = len(prompt)
745
+ elif prompt_embeds is not None:
746
+ prompt_batch_size = prompt_embeds.shape[0]
747
+
748
+ if image_batch_size != 1 and image_batch_size != prompt_batch_size:
749
+ raise ValueError(
750
+ f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
751
+ )
752
+
753
+ # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
754
+ def prepare_image(
755
+ self,
756
+ image,
757
+ width,
758
+ height,
759
+ batch_size,
760
+ num_images_per_prompt,
761
+ device,
762
+ dtype,
763
+ do_classifier_free_guidance=False,
764
+ guess_mode=False,
765
+ ):
766
+ image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)
767
+ image_batch_size = image.shape[0]
768
+
769
+ if image_batch_size == 1:
770
+ repeat_by = batch_size
771
+ else:
772
+ # image batch size is the same as prompt batch size
773
+ repeat_by = num_images_per_prompt
774
+
775
+ image = image.repeat_interleave(repeat_by, dim=0)
776
+
777
+ image = image.to(device=device, dtype=dtype)
778
+
779
+ if do_classifier_free_guidance and not guess_mode:
780
+ image = torch.cat([image] * 2)
781
+
782
+ return image
783
+
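To make the batching in `prepare_image` concrete, here is a tensor-only sketch of how one conditioning image ends up duplicated for the prompt batch and again for classifier-free guidance (sizes are illustrative):

```py
import torch

image = torch.rand(1, 3, 1024, 1024)         # a single preprocessed control image
image = image.repeat_interleave(2, dim=0)    # batch_size * num_images_per_prompt == 2
image = torch.cat([image] * 2)               # doubled again for classifier-free guidance
print(image.shape)                           # torch.Size([4, 3, 1024, 1024])
```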
784
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
785
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
786
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
787
+ if isinstance(generator, list) and len(generator) != batch_size:
788
+ raise ValueError(
789
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
790
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
791
+ )
792
+
793
+ if latents is None:
794
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
795
+ else:
796
+ latents = latents.to(device)
797
+
798
+ # scale the initial noise by the standard deviation required by the scheduler
799
+ latents = latents * self.scheduler.init_noise_sigma
800
+ return latents
801
+
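The latent resolution is the pixel resolution divided by the VAE scale factor, and the initial noise is multiplied by the scheduler's `init_noise_sigma`. A quick sketch with typical SDXL numbers (4 latent channels and a scale factor of 8 are assumptions about the usual checkpoint, not values read from this code):

```py
import torch

batch_size, num_channels_latents = 1, 4
height, width, vae_scale_factor = 1024, 1024, 8
init_noise_sigma = 1.0  # scheduler-dependent; e.g. DDIM uses 1.0

shape = (batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)
latents = torch.randn(shape) * init_noise_sigma
print(latents.shape)  # torch.Size([1, 4, 128, 128])
```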
802
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
803
+ def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
804
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
805
+
806
+ passed_add_embed_dim = (
807
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim
808
+ )
809
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
810
+
811
+ if expected_add_embed_dim != passed_add_embed_dim:
812
+ raise ValueError(
813
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
814
+ )
815
+
816
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
817
+ return add_time_ids
818
+
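The added time ids are just the six micro-conditioning integers flattened into one tensor; the consistency check compares `6 * addition_time_embed_dim + projection_dim` against the UNet's additional-embedding input width. A sketch with values assumed from the released SDXL base config (256 and 1280):

```py
original_size, crops_coords_top_left, target_size = (1024, 1024), (0, 0), (1024, 1024)
add_time_ids = list(original_size + crops_coords_top_left + target_size)
print(add_time_ids)  # [1024, 1024, 0, 0, 1024, 1024]

addition_time_embed_dim, projection_dim = 256, 1280   # assumed SDXL base config values
print(addition_time_embed_dim * len(add_time_ids) + projection_dim)  # 2816
```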
819
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
820
+ def upcast_vae(self):
821
+ dtype = self.vae.dtype
822
+ self.vae.to(dtype=torch.float32)
823
+ use_torch_2_0_or_xformers = isinstance(
824
+ self.vae.decoder.mid_block.attentions[0].processor,
825
+ (
826
+ AttnProcessor2_0,
827
+ XFormersAttnProcessor,
828
+ LoRAXFormersAttnProcessor,
829
+ LoRAAttnProcessor2_0,
830
+ ),
831
+ )
832
+ # if xformers or torch_2_0 is used attention block does not need
833
+ # to be in float32 which can save lots of memory
834
+ if use_torch_2_0_or_xformers:
835
+ self.vae.post_quant_conv.to(dtype)
836
+ self.vae.decoder.conv_in.to(dtype)
837
+ self.vae.decoder.mid_block.to(dtype)
838
+
839
+ @torch.no_grad()
840
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
841
+ def __call__(
842
+ self,
843
+ prompt: Union[str, List[str]] = None,
844
+ prompt_2: Optional[Union[str, List[str]]] = None,
845
+ image_list: PipelineImageInput = None,
846
+ height: Optional[int] = None,
847
+ width: Optional[int] = None,
848
+ num_inference_steps: int = 50,
849
+ guidance_scale: float = 5.0,
850
+ negative_prompt: Optional[Union[str, List[str]]] = None,
851
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
852
+ num_images_per_prompt: Optional[int] = 1,
853
+ eta: float = 0.0,
854
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
855
+ latents: Optional[torch.FloatTensor] = None,
856
+ prompt_embeds: Optional[torch.FloatTensor] = None,
857
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
858
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
859
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
860
+ ip_adapter_image: Optional[PipelineImageInput] = None,
861
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
862
+ output_type: Optional[str] = "pil",
863
+ return_dict: bool = True,
864
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
865
+ callback_steps: int = 1,
866
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
867
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
868
+ guess_mode: bool = False,
869
+ control_guidance_start: Union[float, List[float]] = 0.0,
870
+ control_guidance_end: Union[float, List[float]] = 1.0,
871
+ original_size: Tuple[int, int] = None,
872
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
873
+ target_size: Tuple[int, int] = None,
874
+ negative_original_size: Optional[Tuple[int, int]] = None,
875
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
876
+ negative_target_size: Optional[Tuple[int, int]] = None,
877
+ union_control = False,
878
+ union_control_type = None,
879
+ progress=gr.Progress(),
880
+
881
+ ):
882
+ r"""
883
+ The call function to the pipeline for generation.
884
+
885
+ Args:
886
+ prompt (`str` or `List[str]`, *optional*):
887
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
888
+ prompt_2 (`str` or `List[str]`, *optional*):
889
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
890
+ used in both text-encoders.
891
+ image_list (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
892
+ `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
893
+ The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
894
+ specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
895
+ accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
896
+ and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
897
+ `init`, images must be passed as a list such that each element of the list can be correctly batched for
898
+ input to a single ControlNet.
899
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
900
+ The height in pixels of the generated image. Anything below 512 pixels won't work well for
901
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
902
+ and checkpoints that are not specifically fine-tuned on low resolutions.
903
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
904
+ The width in pixels of the generated image. Anything below 512 pixels won't work well for
905
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
906
+ and checkpoints that are not specifically fine-tuned on low resolutions.
907
+ num_inference_steps (`int`, *optional*, defaults to 50):
908
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
909
+ expense of slower inference.
910
+ guidance_scale (`float`, *optional*, defaults to 5.0):
911
+ A higher guidance scale value encourages the model to generate images closely linked to the text
912
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
913
+ negative_prompt (`str` or `List[str]`, *optional*):
914
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
915
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
916
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
917
+ The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2`
918
+ and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
919
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
920
+ The number of images to generate per prompt.
921
+ eta (`float`, *optional*, defaults to 0.0):
922
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
923
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
924
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
925
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
926
+ generation deterministic.
927
+ latents (`torch.FloatTensor`, *optional*):
928
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
929
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
930
+ tensor is generated by sampling using the supplied random `generator`.
931
+ prompt_embeds (`torch.FloatTensor`, *optional*):
932
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
933
+ provided, text embeddings are generated from the `prompt` input argument.
934
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
935
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
936
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
937
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
938
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
939
+ not provided, pooled text embeddings are generated from `prompt` input argument.
940
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
941
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt
942
+ weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input
943
+ argument.
944
+ output_type (`str`, *optional*, defaults to `"pil"`):
945
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
946
+ return_dict (`bool`, *optional*, defaults to `True`):
947
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
948
+ plain tuple.
949
+ callback (`Callable`, *optional*):
950
+ A function invoked every `callback_steps` steps during inference. The function is called with the
951
+ following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
952
+ callback_steps (`int`, *optional*, defaults to 1):
953
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
954
+ every step.
955
+ cross_attention_kwargs (`dict`, *optional*):
956
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
957
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
958
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
959
+ The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
960
+ to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
961
+ the corresponding scale as a list.
962
+ guess_mode (`bool`, *optional*, defaults to `False`):
963
+ The ControlNet encoder tries to recognize the content of the input image even if you remove all
964
+ prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
965
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
966
+ The percentage of total steps at which the ControlNet starts applying.
967
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
968
+ The percentage of total steps at which the ControlNet stops applying.
969
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
970
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
971
+ `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
972
+ explained in section 2.2 of
973
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
974
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
975
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
976
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
977
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
978
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
979
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
980
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
981
+ not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
982
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
983
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
984
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
985
+ micro-conditioning as explained in section 2.2 of
986
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
987
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
988
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
989
+ To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
990
+ micro-conditioning as explained in section 2.2 of
991
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
992
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
993
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
994
+ To negatively condition the generation process based on a target image resolution. It should be as same
995
+ as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
996
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
997
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
998
+
999
+ Examples:
1000
+
1001
+ Returns:
1002
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1003
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
1004
+ otherwise a `tuple` is returned containing the output images.
1005
+ """
1006
+ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
1007
+
1008
+ # align format for control guidance
1009
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
1010
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
1011
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
1012
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
1013
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
1014
+ mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
1015
+ control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [
1016
+ control_guidance_end
1017
+ ]
1018
+
1019
+ # 1. Check inputs. Raise error if not correct
1020
+ for image in image_list:
1021
+ if image:
1022
+ self.check_inputs(
1023
+ prompt,
1024
+ prompt_2,
1025
+ image,
1026
+ callback_steps,
1027
+ negative_prompt,
1028
+ negative_prompt_2,
1029
+ prompt_embeds,
1030
+ negative_prompt_embeds,
1031
+ pooled_prompt_embeds,
1032
+ negative_pooled_prompt_embeds,
1033
+ controlnet_conditioning_scale,
1034
+ control_guidance_start,
1035
+ control_guidance_end,
1036
+ ip_adapter_image,
1037
+ ip_adapter_image_embeds,
1038
+ )
1039
+ # 2. Define call parameters
1040
+ if prompt is not None and isinstance(prompt, str):
1041
+ batch_size = 1
1042
+ elif prompt is not None and isinstance(prompt, list):
1043
+ batch_size = len(prompt)
1044
+ else:
1045
+ batch_size = prompt_embeds.shape[0]
1046
+
1047
+ device = self._execution_device
1048
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
1049
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
1050
+ # corresponds to doing no classifier free guidance.
1051
+ do_classifier_free_guidance = guidance_scale > 1.0
1052
+
1053
+ global_pool_conditions = (
1054
+ controlnet.config.global_pool_conditions
1055
+ )
1056
+ guess_mode = guess_mode or global_pool_conditions
1057
+
1058
+ # 3. Encode input prompt
1059
+ text_encoder_lora_scale = (
1060
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
1061
+ )
1062
+ (
1063
+ prompt_embeds,
1064
+ negative_prompt_embeds,
1065
+ pooled_prompt_embeds,
1066
+ negative_pooled_prompt_embeds,
1067
+ ) = self.encode_prompt(
1068
+ prompt,
1069
+ prompt_2,
1070
+ device,
1071
+ num_images_per_prompt,
1072
+ do_classifier_free_guidance,
1073
+ negative_prompt,
1074
+ negative_prompt_2,
1075
+ prompt_embeds=prompt_embeds,
1076
+ negative_prompt_embeds=negative_prompt_embeds,
1077
+ pooled_prompt_embeds=pooled_prompt_embeds,
1078
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
1079
+ lora_scale=text_encoder_lora_scale,
1080
+ )
1081
+
1082
+ # 3.2 Encode ip_adapter_image
1083
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1084
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1085
+ ip_adapter_image,
1086
+ ip_adapter_image_embeds,
1087
+ device,
1088
+ batch_size * num_images_per_prompt,
1089
+ do_classifier_free_guidance,
1090
+ )
1091
+
1092
+ # 4. Prepare image
1093
+ assert isinstance(controlnet, ControlNetModel_Union)
1094
+
1095
+
1096
+ for idx in range(len(image_list)):
1097
+ if image_list[idx]:
1098
+ image = self.prepare_image(
1099
+ image=image_list[idx],
1100
+ width=width,
1101
+ height=height,
1102
+ batch_size=batch_size * num_images_per_prompt,
1103
+ num_images_per_prompt=num_images_per_prompt,
1104
+ device=device,
1105
+ dtype=controlnet.dtype,
1106
+ do_classifier_free_guidance=do_classifier_free_guidance,
1107
+ guess_mode=guess_mode,
1108
+ )
1109
+ height, width = image.shape[-2:]
1110
+ image_list[idx] = image
1111
+
1112
+
1113
+ # 5. Prepare timesteps
1114
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
1115
+ timesteps = self.scheduler.timesteps
1116
+
1117
+ # 6. Prepare latent variables
1118
+ num_channels_latents = self.unet.config.in_channels
1119
+ latents = self.prepare_latents(
1120
+ batch_size * num_images_per_prompt,
1121
+ num_channels_latents,
1122
+ height,
1123
+ width,
1124
+ prompt_embeds.dtype,
1125
+ device,
1126
+ generator,
1127
+ latents,
1128
+ )
1129
+
1130
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1131
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1132
+
1133
+ # 7.1 Create tensor stating which controlnets to keep
1134
+ controlnet_keep = []
1135
+ for i in range(len(timesteps)):
1136
+ keeps = [
1137
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
1138
+ for s, e in zip(control_guidance_start, control_guidance_end)
1139
+ ]
1140
+ controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) or isinstance(controlnet, ControlNetModel_Union) else keeps)
1141
+
1142
+ # 7.2 Prepare added time ids & embeddings
1143
+ for image in image_list:
1144
+ if isinstance(image, torch.Tensor):
1145
+ original_size = original_size or image.shape[-2:]
1146
+
1147
+ target_size = target_size or (height, width)
1148
+ # print(original_size)
1149
+ # print(target_size)
1150
+ add_text_embeds = pooled_prompt_embeds
1151
+ add_time_ids = self._get_add_time_ids(
1152
+ original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
1153
+ )
1154
+
1155
+ if negative_original_size is not None and negative_target_size is not None:
1156
+ negative_add_time_ids = self._get_add_time_ids(
1157
+ negative_original_size,
1158
+ negative_crops_coords_top_left,
1159
+ negative_target_size,
1160
+ dtype=prompt_embeds.dtype,
1161
+ )
1162
+ else:
1163
+ negative_add_time_ids = add_time_ids
1164
+
1165
+ if do_classifier_free_guidance:
1166
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
1167
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
1168
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
1169
+
1170
+ prompt_embeds = prompt_embeds.to(device)
1171
+ add_text_embeds = add_text_embeds.to(device)
1172
+ add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
1173
+
1174
+ # 8. Denoising loop
1175
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1176
+ # with self.progress_bar(total=num_inference_steps) as progress_bar:
1177
+ # with progress.tqdm(range(num_inference_steps), desc="Diffusing...") as progress_bar:
1178
+ for i, t in progress.tqdm(enumerate(timesteps), desc="Diffusing..."):
1179
+ # expand the latents if we are doing classifier free guidance
1180
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
1181
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1182
+
1183
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids, \
1184
+ "control_type":union_control_type.reshape(1, -1).to(device, dtype=prompt_embeds.dtype).repeat(batch_size * num_images_per_prompt * 2, 1)}
1185
+
1186
+ # controlnet(s) inference
1187
+ if guess_mode and do_classifier_free_guidance:
1188
+ # Infer ControlNet only for the conditional batch.
1189
+ control_model_input = latents
1190
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
1191
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
1192
+ controlnet_added_cond_kwargs = {
1193
+ "text_embeds": add_text_embeds.chunk(2)[1],
1194
+ "time_ids": add_time_ids.chunk(2)[1],
1195
+ }
1196
+ else:
1197
+ control_model_input = latent_model_input
1198
+ controlnet_prompt_embeds = prompt_embeds
1199
+ controlnet_added_cond_kwargs = added_cond_kwargs
1200
+
1201
+ if isinstance(controlnet_keep[i], list):
1202
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
1203
+ else:
1204
+ controlnet_cond_scale = controlnet_conditioning_scale
1205
+ if isinstance(controlnet_cond_scale, list):
1206
+ controlnet_cond_scale = controlnet_cond_scale[0]
1207
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
1208
+
1209
+
1210
+ # print(image.shape)
1211
+ if isinstance(controlnet, ControlNetModel_Union):
1212
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
1213
+ control_model_input,
1214
+ t,
1215
+ encoder_hidden_states=controlnet_prompt_embeds,
1216
+ controlnet_cond_list=image_list,
1217
+ conditioning_scale=cond_scale,
1218
+ guess_mode=guess_mode,
1219
+ added_cond_kwargs=controlnet_added_cond_kwargs,
1220
+ return_dict=False,
1221
+ )
1222
+
1223
+ if guess_mode and do_classifier_free_guidance:
1224
+ # Inferred ControlNet only for the conditional batch.
1225
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
1226
+ # add 0 to the unconditional batch to keep it unchanged.
1227
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
1228
+ mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])
1229
+
1230
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1231
+ added_cond_kwargs["image_embeds"] = image_embeds
1232
+ # predict the noise residual
1233
+ noise_pred = self.unet(
1234
+ latent_model_input,
1235
+ t,
1236
+ encoder_hidden_states=prompt_embeds,
1237
+ cross_attention_kwargs=cross_attention_kwargs,
1238
+ down_block_additional_residuals=down_block_res_samples,
1239
+ mid_block_additional_residual=mid_block_res_sample,
1240
+ added_cond_kwargs=added_cond_kwargs,
1241
+ return_dict=False,
1242
+ )[0]
1243
+
1244
+ # perform guidance
1245
+ if do_classifier_free_guidance:
1246
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1247
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
1248
+
1249
+ # compute the previous noisy sample x_t -> x_t-1
1250
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1251
+
1252
+ # call the callback, if provided
1253
+ # if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1254
+ # progress_bar.update()
1255
+ # if callback is not None and i % callback_steps == 0:
1256
+ # callback(i, t, latents)
1257
+
1258
+ # manually upcast the VAE when it runs in float16 and is flagged `force_upcast`, since it can overflow otherwise
1259
+ if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
1260
+ self.upcast_vae()
1261
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
1262
+
1263
+ if not output_type == "latent":
1264
+ # make sure the VAE is in float32 mode, as it overflows in float16
1265
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
1266
+
1267
+ if needs_upcasting:
1268
+ self.upcast_vae()
1269
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
1270
+
1271
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
1272
+
1273
+ # cast back to fp16 if needed
1274
+ if needs_upcasting:
1275
+ self.vae.to(dtype=torch.float16)
1276
+ else:
1277
+ image = latents
1278
+
1279
+ if not output_type == "latent":
1280
+ # apply watermark if available
1281
+ if self.watermark is not None:
1282
+ image = self.watermark.apply_watermark(image)
1283
+
1284
+ image = self.image_processor.postprocess(image, output_type=output_type)
1285
+
1286
+ # Offload all models
1287
+ self.maybe_free_model_hooks()
1288
+
1289
+ if not return_dict:
1290
+ return (image,)
1291
+
1292
+ return StableDiffusionXLPipelineOutput(images=image)
1293
+
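For orientation, a heavily hedged sketch of how `__call__` might be driven once a pipeline instance `pipe` with a `ControlNetModel_Union` has been constructed elsewhere in this repo. The number of condition slots in `image_list`, the layout of `union_control_type`, and the file/variable names are assumptions for illustration only:

```py
import torch
from PIL import Image

cond = Image.open("normal_map.png")        # hypothetical conditioning image
union_control_type = torch.zeros(6)        # assumed: one flag per supported condition type
union_control_type[0] = 1                  # enable the slot that `cond` is placed in

images = pipe(                             # `pipe` built from this pipeline class elsewhere
    prompt="a rustic birdhouse with a snow-covered roof",
    image_list=[cond, 0, 0, 0, 0, 0],      # unused slots left falsy, mirroring the loop above
    width=1024, height=1024,
    num_inference_steps=30,
    guidance_scale=5.0,
    union_control_type=union_control_type,
).images
images[0].save("textured_view.png")
```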
1294
+ # Override to properly handle the loading and unloading of the additional text encoder.
1295
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.load_lora_weights
1296
+ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
1297
+ # We could have accessed the unet config from `lora_state_dict()` too. We pass
1298
+ # it here explicitly to be able to tell that it's coming from an SDXL
1299
+ # pipeline.
1300
+
1301
+ # Remove any existing hooks.
1302
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
1303
+ from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module
1304
+ else:
1305
+ raise ImportError("Offloading requires `accelerate v0.17.0` or higher.")
1306
+
1307
+ is_model_cpu_offload = False
1308
+ is_sequential_cpu_offload = False
1309
+ recursive = False
1310
+ for _, component in self.components.items():
1311
+ if isinstance(component, torch.nn.Module):
1312
+ if hasattr(component, "_hf_hook"):
1313
+ is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
1314
+ is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
1315
+ logger.info(
1316
+ "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
1317
+ )
1318
+ recursive = is_sequential_cpu_offload
1319
+ remove_hook_from_module(component, recurse=recursive)
1320
+ state_dict, network_alphas = self.lora_state_dict(
1321
+ pretrained_model_name_or_path_or_dict,
1322
+ unet_config=self.unet.config,
1323
+ **kwargs,
1324
+ )
1325
+ self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet)
1326
+
1327
+ text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k}
1328
+ if len(text_encoder_state_dict) > 0:
1329
+ self.load_lora_into_text_encoder(
1330
+ text_encoder_state_dict,
1331
+ network_alphas=network_alphas,
1332
+ text_encoder=self.text_encoder,
1333
+ prefix="text_encoder",
1334
+ lora_scale=self.lora_scale,
1335
+ )
1336
+
1337
+ text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k}
1338
+ if len(text_encoder_2_state_dict) > 0:
1339
+ self.load_lora_into_text_encoder(
1340
+ text_encoder_2_state_dict,
1341
+ network_alphas=network_alphas,
1342
+ text_encoder=self.text_encoder_2,
1343
+ prefix="text_encoder_2",
1344
+ lora_scale=self.lora_scale,
1345
+ )
1346
+
1347
+ # Offload back.
1348
+ if is_model_cpu_offload:
1349
+ self.enable_model_cpu_offload()
1350
+ elif is_sequential_cpu_offload:
1351
+ self.enable_sequential_cpu_offload()
1352
+
1353
+ @classmethod
1354
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.save_lora_weights
1355
+ def save_lora_weights(
1356
+ self,
1357
+ save_directory: Union[str, os.PathLike],
1358
+ unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
1359
+ text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
1360
+ text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
1361
+ is_main_process: bool = True,
1362
+ weight_name: str = None,
1363
+ save_function: Callable = None,
1364
+ safe_serialization: bool = True,
1365
+ ):
1366
+ state_dict = {}
1367
+
1368
+ def pack_weights(layers, prefix):
1369
+ layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers
1370
+ layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()}
1371
+ return layers_state_dict
1372
+
1373
+ if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers):
1374
+ raise ValueError(
1375
+ "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`."
1376
+ )
1377
+
1378
+ if unet_lora_layers:
1379
+ state_dict.update(pack_weights(unet_lora_layers, "unet"))
1380
+
1381
+ if text_encoder_lora_layers and text_encoder_2_lora_layers:
1382
+ state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder"))
1383
+ state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2"))
1384
+
1385
+ self.write_lora_layers(
1386
+ state_dict=state_dict,
1387
+ save_directory=save_directory,
1388
+ is_main_process=is_main_process,
1389
+ weight_name=weight_name,
1390
+ save_function=save_function,
1391
+ safe_serialization=safe_serialization,
1392
+ )
1393
+
1394
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._remove_text_encoder_monkey_patch
1395
+ def _remove_text_encoder_monkey_patch(self):
1396
+ self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder)
1397
+ self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2)
utils/pipeline_stable_diffusion_switcher.py ADDED
@@ -0,0 +1,1240 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+ import numpy as np
18
+ from PIL import Image
19
+ import torch
20
+ from packaging import version
21
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
22
+ import torchvision.transforms.functional as TF
23
+
24
+ from diffusers.configuration_utils import FrozenDict
25
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
26
+ from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
27
+ from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
28
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
29
+ from diffusers.schedulers import KarrasDiffusionSchedulers
30
+ from diffusers.utils import (
31
+ USE_PEFT_BACKEND,
32
+ deprecate,
33
+ logging,
34
+ replace_example_docstring,
35
+ scale_lora_layers,
36
+ unscale_lora_layers,
37
+ )
38
+ from diffusers.utils.torch_utils import randn_tensor
39
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
40
+ from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
41
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
42
+
43
+
44
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
45
+
46
+ EXAMPLE_DOC_STRING = """
47
+ Examples:
48
+ ```py
49
+ >>> import torch
50
+ >>> from diffusers import StableDiffusionPipeline
51
+
52
+ >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
53
+ >>> pipe = pipe.to("cuda")
54
+
55
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
56
+ >>> image = pipe(prompt).images[0]
57
+ ```
58
+ """
59
+
60
+
61
+ def scale_latents_rm(latents):
62
+ latents = latents * 0.9702 - 0.5742
63
+ return latents
64
+
65
+
66
+ def unscale_latents_rm(latents):
67
+ latents = (latents + 0.5742) / 0.9702
68
+ return latents
69
+
70
+
71
+ def scale_latents_bump(latents):
72
+ latents = latents * 0.9462 + 0.3770
73
+ return latents
74
+
75
+
76
+ def unscale_latents_bump(latents):
77
+ latents = (latents - 0.3770) / 0.9462
78
+ return latents
79
+
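These four helpers are affine maps and their exact inverses, so a round trip should reproduce the input up to float rounding; a quick self-check:

```py
import torch

x = torch.randn(2, 4, 64, 64)
assert torch.allclose(unscale_latents_rm(scale_latents_rm(x)), x, atol=1e-5)
assert torch.allclose(unscale_latents_bump(scale_latents_bump(x)), x, atol=1e-5)
print("round-trip ok")
```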
80
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
81
+ """
82
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
83
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
84
+ """
85
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
86
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
87
+ # rescale the results from guidance (fixes overexposure)
88
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
89
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
90
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
91
+ return noise_cfg
92
+
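A small numerical sketch of what `rescale_noise_cfg` does: with `guidance_rescale=1.0` the per-sample standard deviation of the guided prediction is pulled back to that of the text-conditioned prediction, which is the overexposure fix described in the cited paper:

```py
import torch

torch.manual_seed(0)
noise_pred_text = torch.randn(1, 4, 8, 8)
noise_pred_uncond = torch.randn(1, 4, 8, 8)
guidance_scale = 7.5

noise_cfg = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
rescaled = rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0)
print(noise_cfg.std(), rescaled.std(), noise_pred_text.std())  # rescaled.std ≈ noise_pred_text.std
```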
93
+
94
+ def retrieve_timesteps(
95
+ scheduler,
96
+ num_inference_steps: Optional[int] = None,
97
+ device: Optional[Union[str, torch.device]] = None,
98
+ timesteps: Optional[List[int]] = None,
99
+ **kwargs,
100
+ ):
101
+ """
102
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
103
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
104
+
105
+ Args:
106
+ scheduler (`SchedulerMixin`):
107
+ The scheduler to get timesteps from.
108
+ num_inference_steps (`int`):
109
+ The number of diffusion steps used when generating samples with a pre-trained model. If used,
110
+ `timesteps` must be `None`.
111
+ device (`str` or `torch.device`, *optional*):
112
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
113
+ timesteps (`List[int]`, *optional*):
114
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
115
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
116
+ must be `None`.
117
+
118
+ Returns:
119
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
120
+ second element is the number of inference steps.
121
+ """
122
+ if timesteps is not None:
123
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
124
+ if not accepts_timesteps:
125
+ raise ValueError(
126
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
127
+ f" timestep schedules. Please check whether you are using the correct scheduler."
128
+ )
129
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
130
+ timesteps = scheduler.timesteps
131
+ num_inference_steps = len(timesteps)
132
+ else:
133
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
134
+ timesteps = scheduler.timesteps
135
+ return timesteps, num_inference_steps
136
+
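A small usage sketch for `retrieve_timesteps`, reusing the function defined above; the choice of `DDIMScheduler` is an illustrative assumption, not something this file prescribes:

```python
import torch
from diffusers import DDIMScheduler

scheduler = DDIMScheduler(num_train_timesteps=1000)
timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=30, device="cpu")
# 30 timesteps in descending order, ready for the denoising loop
assert num_inference_steps == 30 and timesteps.ndim == 1
```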
137
+
138
+ class StableDiffusionPipeline(
139
+ DiffusionPipeline,
140
+ StableDiffusionMixin,
141
+ TextualInversionLoaderMixin,
142
+ LoraLoaderMixin,
143
+ IPAdapterMixin,
144
+ FromSingleFileMixin,
145
+ ):
146
+ r"""
147
+ Pipeline for text-to-image generation using Stable Diffusion.
148
+
149
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
150
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
151
+
152
+ The pipeline also inherits the following loading methods:
153
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
154
+ - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
155
+ - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
156
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
157
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
158
+
159
+ Args:
160
+ vae ([`AutoencoderKL`]):
161
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
162
+ text_encoder ([`~transformers.CLIPTextModel`]):
163
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
164
+ tokenizer ([`~transformers.CLIPTokenizer`]):
165
+ A `CLIPTokenizer` to tokenize text.
166
+ unet ([`UNet2DConditionModel`]):
167
+ A `UNet2DConditionModel` to denoise the encoded image latents.
168
+ scheduler ([`SchedulerMixin`]):
169
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
170
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
171
+ safety_checker ([`StableDiffusionSafetyChecker`]):
172
+ Classification module that estimates whether generated images could be considered offensive or harmful.
173
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
174
+ about a model's potential harms.
175
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
176
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
177
+ """
178
+
179
+ model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
180
+ _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
181
+ _exclude_from_cpu_offload = ["safety_checker"]
182
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
183
+
184
+ def __init__(
185
+ self,
186
+ vae: AutoencoderKL,
187
+ text_encoder: CLIPTextModel,
188
+ tokenizer: CLIPTokenizer,
189
+ unet: UNet2DConditionModel,
190
+ scheduler: KarrasDiffusionSchedulers,
191
+ safety_checker: StableDiffusionSafetyChecker,
192
+ feature_extractor: CLIPImageProcessor,
193
+ image_encoder: CLIPVisionModelWithProjection = None,
194
+ requires_safety_checker: bool = True,
195
+ ):
196
+ super().__init__()
197
+
198
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
199
+ deprecation_message = (
200
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
201
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
202
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
203
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
204
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
205
+ " file"
206
+ )
207
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
208
+ new_config = dict(scheduler.config)
209
+ new_config["steps_offset"] = 1
210
+ scheduler._internal_dict = FrozenDict(new_config)
211
+
212
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
213
+ deprecation_message = (
214
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
215
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
216
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
217
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
218
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
219
+ )
220
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
221
+ new_config = dict(scheduler.config)
222
+ new_config["clip_sample"] = False
223
+ scheduler._internal_dict = FrozenDict(new_config)
224
+
225
+ if safety_checker is None and requires_safety_checker:
226
+ logger.warning(
227
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
228
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
229
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
230
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
231
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
232
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
233
+ )
234
+
235
+ if safety_checker is not None and feature_extractor is None:
236
+ raise ValueError(
237
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
238
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
239
+ )
240
+
241
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
242
+ version.parse(unet.config._diffusers_version).base_version
243
+ ) < version.parse("0.9.0.dev0")
244
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
245
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
246
+ deprecation_message = (
247
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
248
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
249
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
250
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
251
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
252
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
253
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
254
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
255
+ " the `unet/config.json` file"
256
+ )
257
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
258
+ new_config = dict(unet.config)
259
+ new_config["sample_size"] = 64
260
+ unet._internal_dict = FrozenDict(new_config)
261
+
262
+ self.register_modules(
263
+ vae=vae,
264
+ text_encoder=text_encoder,
265
+ tokenizer=tokenizer,
266
+ unet=unet,
267
+ scheduler=scheduler,
268
+ safety_checker=safety_checker,
269
+ feature_extractor=feature_extractor,
270
+ image_encoder=image_encoder,
271
+ )
272
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
273
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
274
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
275
+
276
+ def _encode_prompt(
277
+ self,
278
+ prompt,
279
+ device,
280
+ num_images_per_prompt,
281
+ do_classifier_free_guidance,
282
+ negative_prompt=None,
283
+ prompt_embeds: Optional[torch.FloatTensor] = None,
284
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
285
+ lora_scale: Optional[float] = None,
286
+ **kwargs,
287
+ ):
288
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
289
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
290
+
291
+ prompt_embeds_tuple = self.encode_prompt(
292
+ prompt=prompt,
293
+ device=device,
294
+ num_images_per_prompt=num_images_per_prompt,
295
+ do_classifier_free_guidance=do_classifier_free_guidance,
296
+ negative_prompt=negative_prompt,
297
+ prompt_embeds=prompt_embeds,
298
+ negative_prompt_embeds=negative_prompt_embeds,
299
+ lora_scale=lora_scale,
300
+ **kwargs,
301
+ )
302
+
303
+ # concatenate for backwards compatibility
304
+ prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
305
+
306
+ return prompt_embeds
307
+
308
+ def encode_prompt(
309
+ self,
310
+ prompt,
311
+ device,
312
+ num_images_per_prompt,
313
+ do_classifier_free_guidance,
314
+ negative_prompt=None,
315
+ prompt_embeds: Optional[torch.FloatTensor] = None,
316
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
317
+ lora_scale: Optional[float] = None,
318
+ clip_skip: Optional[int] = None,
319
+ ):
320
+ r"""
321
+ Encodes the prompt into text encoder hidden states.
322
+
323
+ Args:
324
+ prompt (`str` or `List[str]`, *optional*):
325
+ prompt to be encoded
326
+ device: (`torch.device`):
327
+ torch device
328
+ num_images_per_prompt (`int`):
329
+ number of images that should be generated per prompt
330
+ do_classifier_free_guidance (`bool`):
331
+ whether to use classifier free guidance or not
332
+ negative_prompt (`str` or `List[str]`, *optional*):
333
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
334
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
335
+ less than `1`).
336
+ prompt_embeds (`torch.FloatTensor`, *optional*):
337
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
338
+ provided, text embeddings will be generated from `prompt` input argument.
339
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
340
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
341
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
342
+ argument.
343
+ lora_scale (`float`, *optional*):
344
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
345
+ clip_skip (`int`, *optional*):
346
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
347
+ the output of the pre-final layer will be used for computing the prompt embeddings.
348
+ """
349
+ # set lora scale so that monkey patched LoRA
350
+ # function of text encoder can correctly access it
351
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
352
+ self._lora_scale = lora_scale
353
+
354
+ # dynamically adjust the LoRA scale
355
+ if not USE_PEFT_BACKEND:
356
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
357
+ else:
358
+ scale_lora_layers(self.text_encoder, lora_scale)
359
+
360
+ if prompt is not None and isinstance(prompt, str):
361
+ batch_size = 1
362
+ elif prompt is not None and isinstance(prompt, list):
363
+ batch_size = len(prompt)
364
+ else:
365
+ batch_size = prompt_embeds.shape[0]
366
+
367
+ if prompt_embeds is None:
368
+ # textual inversion: process multi-vector tokens if necessary
369
+ if isinstance(self, TextualInversionLoaderMixin):
370
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
371
+
372
+ text_inputs = self.tokenizer(
373
+ prompt,
374
+ padding="max_length",
375
+ max_length=self.tokenizer.model_max_length,
376
+ truncation=True,
377
+ return_tensors="pt",
378
+ )
379
+ text_input_ids = text_inputs.input_ids
380
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
381
+
382
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
383
+ text_input_ids, untruncated_ids
384
+ ):
385
+ removed_text = self.tokenizer.batch_decode(
386
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
387
+ )
388
+ logger.warning(
389
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
390
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
391
+ )
392
+
393
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
394
+ attention_mask = text_inputs.attention_mask.to(device)
395
+ else:
396
+ attention_mask = None
397
+
398
+ if clip_skip is None:
399
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
400
+ prompt_embeds = prompt_embeds[0]
401
+ else:
402
+ prompt_embeds = self.text_encoder(
403
+ text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
404
+ )
405
+ # Access the `hidden_states` first, that contains a tuple of
406
+ # all the hidden states from the encoder layers. Then index into
407
+ # the tuple to access the hidden states from the desired layer.
408
+ prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
409
+ # We also need to apply the final LayerNorm here to not mess with the
410
+ # representations. The `last_hidden_states` that we typically use for
411
+ # obtaining the final prompt representations passes through the LayerNorm
412
+ # layer.
413
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
414
+
415
+ if self.text_encoder is not None:
416
+ prompt_embeds_dtype = self.text_encoder.dtype
417
+ elif self.unet is not None:
418
+ prompt_embeds_dtype = self.unet.dtype
419
+ else:
420
+ prompt_embeds_dtype = prompt_embeds.dtype
421
+
422
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
423
+
424
+ bs_embed, seq_len, _ = prompt_embeds.shape
425
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
426
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
427
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
428
+
429
+ # get unconditional embeddings for classifier free guidance
430
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
431
+ uncond_tokens: List[str]
432
+ if negative_prompt is None:
433
+ uncond_tokens = [""] * batch_size
434
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
435
+ raise TypeError(
436
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
437
+ f" {type(prompt)}."
438
+ )
439
+ elif isinstance(negative_prompt, str):
440
+ uncond_tokens = [negative_prompt]
441
+ elif batch_size != len(negative_prompt):
442
+ raise ValueError(
443
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
444
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
445
+ " the batch size of `prompt`."
446
+ )
447
+ else:
448
+ uncond_tokens = negative_prompt
449
+
450
+ # textual inversion: process multi-vector tokens if necessary
451
+ if isinstance(self, TextualInversionLoaderMixin):
452
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
453
+
454
+ max_length = prompt_embeds.shape[1]
455
+ uncond_input = self.tokenizer(
456
+ uncond_tokens,
457
+ padding="max_length",
458
+ max_length=max_length,
459
+ truncation=True,
460
+ return_tensors="pt",
461
+ )
462
+
463
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
464
+ attention_mask = uncond_input.attention_mask.to(device)
465
+ else:
466
+ attention_mask = None
467
+
468
+ negative_prompt_embeds = self.text_encoder(
469
+ uncond_input.input_ids.to(device),
470
+ attention_mask=attention_mask,
471
+ )
472
+ negative_prompt_embeds = negative_prompt_embeds[0]
473
+
474
+ if do_classifier_free_guidance:
475
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
476
+ seq_len = negative_prompt_embeds.shape[1]
477
+
478
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
479
+
480
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
481
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
482
+
483
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
484
+ # Retrieve the original scale by scaling back the LoRA layers
485
+ unscale_lora_layers(self.text_encoder, lora_scale)
486
+
487
+ return prompt_embeds, negative_prompt_embeds
488
+
489
+ def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
490
+ dtype = next(self.image_encoder.parameters()).dtype
491
+
492
+ if not isinstance(image, torch.Tensor):
493
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
494
+
495
+ image = image.to(device=device, dtype=dtype)
496
+ if output_hidden_states:
497
+ image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
498
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
499
+ uncond_image_enc_hidden_states = self.image_encoder(
500
+ torch.zeros_like(image), output_hidden_states=True
501
+ ).hidden_states[-2]
502
+ uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
503
+ num_images_per_prompt, dim=0
504
+ )
505
+ return image_enc_hidden_states, uncond_image_enc_hidden_states
506
+ else:
507
+ image_embeds = self.image_encoder(image).image_embeds
508
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
509
+ uncond_image_embeds = torch.zeros_like(image_embeds)
510
+
511
+ return image_embeds, uncond_image_embeds
512
+
513
+ def prepare_cond_image_latents(self, image, normal, mask, cond_vae, device, num_images_per_prompt, do_classifier_free_guidance):
514
+ dtype = self.vae.dtype
515
+
516
+ if isinstance(image, list):
517
+ image = torch.stack([TF.to_tensor(img) for img in image], dim=0).to(device=device, dtype=dtype)
518
+ elif isinstance(image, torch.Tensor):
519
+ image = image.to(device=device, dtype=dtype)
520
+
521
+ if isinstance(normal, list):
522
+ normal = torch.stack([TF.to_tensor(img) for img in normal], dim=0).to(device=device, dtype=dtype)
523
+ elif isinstance(normal, torch.Tensor):
524
+ normal = normal.to(device=device, dtype=dtype)
525
+
526
+ if isinstance(mask, list):
527
+ if isinstance(mask[0], np.ndarray):
528
+ mask = [Image.fromarray((img*255).astype(np.uint8), mode='L') for img in mask]
529
+ mask = [img.resize((image.shape[3]//8, image.shape[2]//8), resample=Image.NEAREST) for img in mask]
530
+ elif isinstance(mask[0], Image.Image):
531
+ mask = [img.resize((image.shape[3]//8, image.shape[2]//8), resample=Image.NEAREST) for img in mask]
532
+ mask = torch.stack([TF.to_tensor(img) for img in mask], dim=0).to(device=device, dtype=dtype)
533
+ elif isinstance(mask, torch.Tensor):
534
+ mask = Image.fromarray((mask.cpu().numpy()*255).astype(np.uint8), mode='L')
535
+ mask = mask.resize((image.shape[3]//8, image.shape[2]//8), resample=Image.NEAREST)
536
+ mask = TF.to_tensor(mask).to(device=device, dtype=dtype)
537
+
538
+ if cond_vae is not None:
539
+ image = image * 2.0 - 1.0
540
+ if normal is not None:
541
+ normal = normal * 2.0 - 1.0
542
+ image = torch.cat([image, normal], dim=1)
543
+ latents = cond_vae(image) * self.vae.config.scaling_factor
544
+ else:
545
+ # vae encoder
546
+ image = image * 2.0 - 1.0
547
+ latents = self.vae.encode(image).latent_dist.mode() * self.vae.config.scaling_factor
548
+ latents = latents.repeat(num_images_per_prompt, 1, 1, 1)
549
+
550
+ if normal is not None:
551
+ normal = normal * 2.0 - 1.0
552
+ normal_latents = self.vae.encode(normal).latent_dist.mode() * self.vae.config.scaling_factor
553
+ normal_latents = normal_latents.repeat(num_images_per_prompt, 1, 1, 1)
554
+ latents = torch.cat([latents, normal_latents], dim=1)
555
+
556
+ if mask is not None:
557
+ # mask = torch.ones_like(mask)
558
+ mask = mask * 2.0 - 1.0
559
+ mask_latents = mask.repeat(num_images_per_prompt, 1, 1, 1)
560
+ latents = torch.cat([latents, mask_latents.to(latents)], dim=1)
561
+
562
+
563
+ if do_classifier_free_guidance:
564
+ # uncond_latens = self.vae.encode(torch.zeros_like(image)).latent_dist.mode() * self.vae.config.scaling_factor
565
+ # uncond_latens.repeat(num_images_per_prompt, 1, 1, 1)
566
+ uncond_latents = torch.zeros_like(latents)  # currently unused: the conditioning latents are duplicated below, so CFG acts on the text prompt only
567
+ latents = torch.cat([latents, latents])
568
+
569
+ return latents
570
+
571
+ def prepare_init_latents(self, init_materials, device, num_images_per_prompt, do_classifier_free_guidance):
572
+ dtype = self.vae.dtype
573
+
574
+ image = torch.cat([
575
+ init_materials['albedo'][...,:3].permute(0, 3, 1, 2),
576
+ init_materials['roughness_metallic'][...,:3].permute(0, 3, 1, 2),
577
+ init_materials['bump'][...,:3].permute(0, 3, 1, 2),
578
+ ], dim=0).to(device=device, dtype=dtype)
579
+
580
+ from einops import rearrange
581
+ # vae encoder
582
+ image = image * 2.0 - 1.0
583
+ latents = self.vae.encode(image).latent_dist.mode() * self.vae.config.scaling_factor
584
+ latents = rearrange(latents, '(s b) c h w -> b (s c) h w', s=3)
585
+ latents = latents.repeat(num_images_per_prompt, 1, 1, 1)
586
+
587
+ # if do_classifier_free_guidance:
588
+ # # uncond_latens = self.vae.encode(torch.zeros_like(image)).latent_dist.mode() * self.vae.config.scaling_factor
589
+ # # uncond_latens.repeat(num_images_per_prompt, 1, 1, 1)
590
+ # # uncond_latens = torch.zeros_like(latents)
591
+ # latents = torch.cat([latents, latents])
592
+
593
+ return latents
594
+
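The `rearrange` above folds the three materials (albedo, roughness/metallic, bump), which were stacked along the batch dimension, into the channel dimension. A shape-only sketch with dummy latents:

```python
import torch
from einops import rearrange

b, c, h, w = 2, 4, 32, 32                 # per-material latent shape after the VAE
latents = torch.randn(3 * b, c, h, w)     # albedo block, then roughness/metallic, then bump
latents = rearrange(latents, '(s b) c h w -> b (s c) h w', s=3)
assert latents.shape == (b, 3 * c, h, w)  # 12 latent channels per sample
```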
595
+ def prepare_ip_adapter_image_embeds(
596
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
597
+ ):
598
+ if ip_adapter_image_embeds is None:
599
+ if not isinstance(ip_adapter_image, list):
600
+ ip_adapter_image = [ip_adapter_image]
601
+
602
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
603
+ raise ValueError(
604
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
605
+ )
606
+
607
+ image_embeds = []
608
+ for single_ip_adapter_image, image_proj_layer in zip(
609
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
610
+ ):
611
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
612
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
613
+ single_ip_adapter_image, device, 1, output_hidden_state
614
+ )
615
+ single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
616
+ single_negative_image_embeds = torch.stack(
617
+ [single_negative_image_embeds] * num_images_per_prompt, dim=0
618
+ )
619
+
620
+ if do_classifier_free_guidance:
621
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
622
+ single_image_embeds = single_image_embeds.to(device)
623
+
624
+ image_embeds.append(single_image_embeds)
625
+ else:
626
+ repeat_dims = [1]
627
+ image_embeds = []
628
+ for single_image_embeds in ip_adapter_image_embeds:
629
+ if do_classifier_free_guidance:
630
+ single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
631
+ single_image_embeds = single_image_embeds.repeat(
632
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
633
+ )
634
+ single_negative_image_embeds = single_negative_image_embeds.repeat(
635
+ num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
636
+ )
637
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
638
+ else:
639
+ single_image_embeds = single_image_embeds.repeat(
640
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
641
+ )
642
+ image_embeds.append(single_image_embeds)
643
+
644
+ return image_embeds
645
+
646
+ def run_safety_checker(self, image, device, dtype):
647
+ if self.safety_checker is None:
648
+ has_nsfw_concept = None
649
+ else:
650
+ if torch.is_tensor(image):
651
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
652
+ else:
653
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
654
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
655
+ image, has_nsfw_concept = self.safety_checker(
656
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
657
+ )
658
+ return image, has_nsfw_concept
659
+
660
+ def decode_latents(self, latents):
661
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
662
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
663
+
664
+ latents = 1 / self.vae.config.scaling_factor * latents
665
+ image = self.vae.decode(latents, return_dict=False)[0]
666
+ image = (image / 2 + 0.5).clamp(0, 1)
667
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
668
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
669
+ return image
670
+
671
+ def prepare_extra_step_kwargs(self, generator, eta):
672
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
673
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
674
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
675
+ # and should be between [0, 1]
676
+
677
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
678
+ extra_step_kwargs = {}
679
+ if accepts_eta:
680
+ extra_step_kwargs["eta"] = eta
681
+
682
+ # check if the scheduler accepts generator
683
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
684
+ if accepts_generator:
685
+ extra_step_kwargs["generator"] = generator
686
+ return extra_step_kwargs
687
+
688
+ def check_inputs(
689
+ self,
690
+ prompt,
691
+ height,
692
+ width,
693
+ callback_steps,
694
+ negative_prompt=None,
695
+ prompt_embeds=None,
696
+ negative_prompt_embeds=None,
697
+ ip_adapter_image=None,
698
+ ip_adapter_image_embeds=None,
699
+ callback_on_step_end_tensor_inputs=None,
700
+ ):
701
+ if height % 8 != 0 or width % 8 != 0:
702
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
703
+
704
+ if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
705
+ raise ValueError(
706
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
707
+ f" {type(callback_steps)}."
708
+ )
709
+ if callback_on_step_end_tensor_inputs is not None and not all(
710
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
711
+ ):
712
+ raise ValueError(
713
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
714
+ )
715
+
716
+ if prompt is not None and prompt_embeds is not None:
717
+ raise ValueError(
718
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
719
+ " only forward one of the two."
720
+ )
721
+ elif prompt is None and prompt_embeds is None:
722
+ raise ValueError(
723
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
724
+ )
725
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
726
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
727
+
728
+ if negative_prompt is not None and negative_prompt_embeds is not None:
729
+ raise ValueError(
730
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
731
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
732
+ )
733
+
734
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
735
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
736
+ raise ValueError(
737
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
738
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
739
+ f" {negative_prompt_embeds.shape}."
740
+ )
741
+
742
+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
743
+ raise ValueError(
744
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
745
+ )
746
+
747
+ if ip_adapter_image_embeds is not None:
748
+ if not isinstance(ip_adapter_image_embeds, list):
749
+ raise ValueError(
750
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
751
+ )
752
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
753
+ raise ValueError(
754
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
755
+ )
756
+
757
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None, copy_noise=False):
758
+ if copy_noise:
759
+ shape = (batch_size, num_channels_latents//3, height // self.vae_scale_factor, width // self.vae_scale_factor)
760
+ else:
761
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
762
+ if isinstance(generator, list) and len(generator) != batch_size:
763
+ raise ValueError(
764
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
765
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
766
+ )
767
+
768
+ if latents is None:
769
+ if copy_noise:
770
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
771
+ latents = torch.cat([latents, latents, latents], dim=1)
772
+ else:
773
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
774
+ else:
775
+ latents = latents.to(device)
776
+
777
+ # scale the initial noise by the standard deviation required by the scheduler
778
+ latents = latents * self.scheduler.init_noise_sigma
779
+ return latents
780
+
781
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
782
+ def get_guidance_scale_embedding(
783
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
784
+ ) -> torch.FloatTensor:
785
+ """
786
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
787
+
788
+ Args:
789
+ w (`torch.Tensor`):
790
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
791
+ embedding_dim (`int`, *optional*, defaults to 512):
792
+ Dimension of the embeddings to generate.
793
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
794
+ Data type of the generated embeddings.
795
+
796
+ Returns:
797
+ `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
798
+ """
799
+ assert len(w.shape) == 1
800
+ w = w * 1000.0
801
+
802
+ half_dim = embedding_dim // 2
803
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
804
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
805
+ emb = w.to(dtype)[:, None] * emb[None, :]
806
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
807
+ if embedding_dim % 2 == 1: # zero pad
808
+ emb = torch.nn.functional.pad(emb, (0, 1))
809
+ assert emb.shape == (w.shape[0], embedding_dim)
810
+ return emb
811
+
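For reference, the embedding above turns a batch of guidance weights into sinusoidal features that later feed `timestep_cond`. A shape sketch, assuming `pipe` is an already constructed instance of this pipeline:

```python
import torch

w = torch.tensor([7.5, 3.0])  # one guidance weight per sample
emb = pipe.get_guidance_scale_embedding(w, embedding_dim=256)
assert emb.shape == (2, 256)
```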
812
+ def _get_add_time_ids(
813
+ self, albedo_label, rough_meta_label, bump_label, dtype
814
+ ):
815
+ add_time_ids = list(albedo_label + rough_meta_label + bump_label)
816
+
817
+ passed_add_embed_dim = (
818
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) // 3
819
+ )
820
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
821
+
822
+ if expected_add_embed_dim != passed_add_embed_dim:
823
+ raise ValueError(
824
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
825
+ )
826
+
827
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
828
+ return add_time_ids
829
+
830
+ @property
831
+ def guidance_scale(self):
832
+ return self._guidance_scale
833
+
834
+ @property
835
+ def guidance_rescale(self):
836
+ return self._guidance_rescale
837
+
838
+ @property
839
+ def clip_skip(self):
840
+ return self._clip_skip
841
+
842
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
843
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
844
+ # corresponds to doing no classifier free guidance.
845
+ @property
846
+ def do_classifier_free_guidance(self):
847
+ return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
848
+
849
+ @property
850
+ def cross_attention_kwargs(self):
851
+ return self._cross_attention_kwargs
852
+
853
+ @property
854
+ def num_timesteps(self):
855
+ return self._num_timesteps
856
+
857
+ @property
858
+ def interrupt(self):
859
+ return self._interrupt
860
+
861
+ @torch.no_grad()
862
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
863
+ def __call__(
864
+ self,
865
+ prompt: Union[str, List[str]] = None,
866
+ cond_image: Optional[PipelineImageInput] = None,
867
+ normal_image: Optional[PipelineImageInput] = None,
868
+ mask_image: Optional[PipelineImageInput] = None,
869
+ init_materials: Optional[dict] = None,
870
+ masks: Optional[torch.FloatTensor] = None,
871
+ cond_vae = None,
872
+ height: Optional[int] = None,
873
+ width: Optional[int] = None,
874
+ num_inference_steps: int = 50,
875
+ timesteps: List[int] = None,
876
+ guidance_scale: float = 7.5,
877
+ negative_prompt: Optional[Union[str, List[str]]] = None,
878
+ num_images_per_prompt: Optional[int] = 1,
879
+ eta: float = 0.0,
880
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
881
+ latents: Optional[torch.FloatTensor] = None,
882
+ unscale_latents: bool = False,
883
+ copy_noise: bool = False,
884
+ prompt_embeds: Optional[torch.FloatTensor] = None,
885
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
886
+ ip_adapter_image: Optional[PipelineImageInput] = None,
887
+ ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
888
+ output_type: Optional[str] = "pil",
889
+ return_dict: bool = True,
890
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
891
+ guidance_rescale: float = 0.0,
892
+ clip_skip: Optional[int] = None,
893
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
894
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
895
+ **kwargs,
896
+ ):
897
+ r"""
898
+ The call function to the pipeline for generation.
899
+
900
+ Args:
901
+ prompt (`str` or `List[str]`, *optional*):
902
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
903
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
904
+ The height in pixels of the generated image.
905
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
906
+ The width in pixels of the generated image.
907
+ num_inference_steps (`int`, *optional*, defaults to 50):
908
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
909
+ expense of slower inference.
910
+ timesteps (`List[int]`, *optional*):
911
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
912
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
913
+ passed will be used. Must be in descending order.
914
+ guidance_scale (`float`, *optional*, defaults to 7.5):
915
+ A higher guidance scale value encourages the model to generate images closely linked to the text
916
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
917
+ negative_prompt (`str` or `List[str]`, *optional*):
918
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
919
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
920
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
921
+ The number of images to generate per prompt.
922
+ eta (`float`, *optional*, defaults to 0.0):
923
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
924
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
925
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
926
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
927
+ generation deterministic.
928
+ latents (`torch.FloatTensor`, *optional*):
929
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
930
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
931
+ tensor is generated by sampling using the supplied random `generator`.
932
+ prompt_embeds (`torch.FloatTensor`, *optional*):
933
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
934
+ provided, text embeddings are generated from the `prompt` input argument.
935
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
936
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
937
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
938
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
939
+ ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
940
+ Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of IP-Adapters.
941
+ Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
942
+ if `do_classifier_free_guidance` is set to `True`.
943
+ If not provided, embeddings are computed from the `ip_adapter_image` input argument.
944
+ output_type (`str`, *optional*, defaults to `"pil"`):
945
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
946
+ return_dict (`bool`, *optional*, defaults to `True`):
947
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
948
+ plain tuple.
949
+ cross_attention_kwargs (`dict`, *optional*):
950
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
951
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
952
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
953
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
954
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
955
+ using zero terminal SNR.
956
+ clip_skip (`int`, *optional*):
957
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
958
+ the output of the pre-final layer will be used for computing the prompt embeddings.
959
+ callback_on_step_end (`Callable`, *optional*):
960
+ A function that is called at the end of each denoising step during inference. The function is called
961
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
962
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
963
+ `callback_on_step_end_tensor_inputs`.
964
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
965
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
966
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
967
+ `._callback_tensor_inputs` attribute of your pipeline class.
968
+
969
+ Examples:
970
+
971
+ Returns:
972
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
973
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
974
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
975
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
976
+ "not-safe-for-work" (nsfw) content.
977
+ """
978
+
979
+ callback = kwargs.pop("callback", None)
980
+ callback_steps = kwargs.pop("callback_steps", None)
981
+
982
+ if callback is not None:
983
+ deprecate(
984
+ "callback",
985
+ "1.0.0",
986
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
987
+ )
988
+ if callback_steps is not None:
989
+ deprecate(
990
+ "callback_steps",
991
+ "1.0.0",
992
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
993
+ )
994
+
995
+ # 0. Default height and width to unet
996
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
997
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
998
+ # to deal with lora scaling and other possible forward hooks
999
+
1000
+ # 1. Check inputs. Raise error if not correct
1001
+ self.check_inputs(
1002
+ prompt,
1003
+ height,
1004
+ width,
1005
+ callback_steps,
1006
+ negative_prompt,
1007
+ prompt_embeds,
1008
+ negative_prompt_embeds,
1009
+ ip_adapter_image,
1010
+ ip_adapter_image_embeds,
1011
+ callback_on_step_end_tensor_inputs,
1012
+ )
1013
+
1014
+ self._guidance_scale = guidance_scale
1015
+ self._guidance_rescale = guidance_rescale
1016
+ self._clip_skip = clip_skip
1017
+ self._cross_attention_kwargs = cross_attention_kwargs
1018
+ self._interrupt = False
1019
+
1020
+ # 2. Define call parameters
1021
+ if prompt is not None and isinstance(prompt, str):
1022
+ batch_size = 1
1023
+ elif prompt is not None and isinstance(prompt, list):
1024
+ batch_size = len(prompt)
1025
+ else:
1026
+ batch_size = prompt_embeds.shape[0] // 3
1027
+
1028
+ device = self._execution_device
1029
+
1030
+ # 3. Encode input prompt
1031
+ lora_scale = (
1032
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
1033
+ )
1034
+
1035
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
1036
+ prompt,
1037
+ device,
1038
+ num_images_per_prompt,
1039
+ self.do_classifier_free_guidance,
1040
+ negative_prompt,
1041
+ prompt_embeds=prompt_embeds,
1042
+ negative_prompt_embeds=negative_prompt_embeds,
1043
+ lora_scale=lora_scale,
1044
+ clip_skip=self.clip_skip,
1045
+ )
1046
+
1047
+ # For classifier free guidance, we need to do two forward passes.
1048
+ # Here we concatenate the unconditional and text embeddings into a single batch
1049
+ # to avoid doing two forward passes
1050
+ if self.do_classifier_free_guidance:
1051
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
1052
+
1053
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1054
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1055
+ ip_adapter_image,
1056
+ ip_adapter_image_embeds,
1057
+ device,
1058
+ batch_size * num_images_per_prompt,
1059
+ self.do_classifier_free_guidance,
1060
+ )
1061
+
1062
+ # 4. Prepare timesteps
1063
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
1064
+
1065
+ # 4.1 Prepare additional class embedding
1066
+ if self.unet.config.addition_time_embed_dim is not None:
1067
+ albedo_label = (1, 0, 0)
1068
+ rough_meta_label = (0, 1, 0)
1069
+ bump_label = (0, 0, 1)
1070
+ add_time_ids = self._get_add_time_ids(
1071
+ albedo_label,
1072
+ rough_meta_label,
1073
+ bump_label,
1074
+ dtype=prompt_embeds.dtype,
1075
+ )
1076
+ negative_add_time_ids = add_time_ids
1077
+
1078
+ if self.do_classifier_free_guidance:
1079
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
1080
+ add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
1081
+
1082
+ # 5. Prepare latent variables
1083
+ num_channels_latents = self.unet.config.in_channels_no_cond
1084
+ latents = self.prepare_latents(
1085
+ batch_size * num_images_per_prompt,
1086
+ num_channels_latents,
1087
+ height,
1088
+ width,
1089
+ prompt_embeds.dtype,
1090
+ device,
1091
+ generator,
1092
+ latents,
1093
+ copy_noise,
1094
+ )
1095
+
1096
+ # 5.1 Prepare conditional image latents
1097
+ cond_latents = None
1098
+ mask_image = [mask.cpu().numpy() for mask in masks]
1099
+ if cond_image is not None:
1100
+ cond_latents = self.prepare_cond_image_latents(
1101
+ cond_image,
1102
+ normal_image,
1103
+ mask_image,
1104
+ cond_vae,
1105
+ device,
1106
+ num_images_per_prompt,
1107
+ self.do_classifier_free_guidance
1108
+ )
1109
+
1110
+ init_latents = None
1111
+ if init_materials is not None:
1112
+ init_latents = self.prepare_init_latents(
1113
+ init_materials,
1114
+ device,
1115
+ num_images_per_prompt,
1116
+ self.do_classifier_free_guidance
1117
+ )
1118
+
1119
+ import cv2  # local import; numpy and PIL.Image are already imported at module scope
1122
+ masks = cv2.erode((masks[0].cpu().numpy()*255).astype(np.uint8), kernel=np.ones((5, 5), np.uint8), iterations=4)
1123
+ masks = Image.fromarray(masks.astype(np.uint8)).convert("L")
1124
+ masks = masks.resize((height // 8, width // 8), Image.NEAREST)
1125
+ masks = TF.to_tensor(masks).to(init_latents.device, init_latents.dtype).unsqueeze(1)
1126
+ # masks = torch.zeros_like(masks)
1127
+
1128
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1129
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1130
+
1131
+ # # 6.1 Add image embeds for IP-Adapter
1132
+ # added_cond_kwargs = (
1133
+ # {"image_embeds": image_embeds}
1134
+ # if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
1135
+ # else None
1136
+ # )
1137
+
1138
+ # 6.2 Optionally get Guidance Scale Embedding
1139
+ timestep_cond = None
1140
+ if self.unet.config.time_cond_proj_dim is not None:
1141
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
1142
+ timestep_cond = self.get_guidance_scale_embedding(
1143
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1144
+ ).to(device=device, dtype=latents.dtype)
1145
+
1146
+ # 7. Denoising loop
1147
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1148
+ self._num_timesteps = len(timesteps)
1149
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1150
+ for i, t in enumerate(timesteps):
1151
+ if self.interrupt:
1152
+ continue
1153
+
1154
+ # expand the latents if we are doing classifier free guidance
1155
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
1156
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1157
+
1158
+ if cond_latents is not None:
1159
+ latent_model_input = torch.cat([latent_model_input, cond_latents], dim=1)
1160
+
1161
+ # predict the noise residual
1162
+ added_cond_kwargs = {}
1163
+ if self.unet.config.addition_time_embed_dim is not None:
1164
+ added_cond_kwargs["time_ids"] = add_time_ids
1165
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1166
+ added_cond_kwargs["image_embeds"] = image_embeds
1167
+ noise_pred = self.unet(
1168
+ latent_model_input,
1169
+ t,
1170
+ encoder_hidden_states=prompt_embeds,
1171
+ timestep_cond=timestep_cond,
1172
+ cross_attention_kwargs=self.cross_attention_kwargs,
1173
+ added_cond_kwargs=added_cond_kwargs,
1174
+ return_dict=False,
1175
+ )[0]
1176
+
1177
+ # perform guidance
1178
+ if self.do_classifier_free_guidance:
1179
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1180
+ # only do cfg for roughness, metallic and bump
1181
+ noise_pred = noise_pred_uncond[:,4:] + self.guidance_scale * (noise_pred_text[:,4:] - noise_pred_uncond[:,4:])
1182
+ noise_pred = torch.cat([noise_pred_text[:, :4], noise_pred], dim=1)
1183
+
1184
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
1185
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1186
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
1187
+
1188
+ # compute the previous noisy sample x_t -> x_t-1
1189
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False, init_latents=init_latents, masks=masks)[0]
1190
+
1191
+ if callback_on_step_end is not None:
1192
+ callback_kwargs = {}
1193
+ for k in callback_on_step_end_tensor_inputs:
1194
+ callback_kwargs[k] = locals()[k]
1195
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1196
+
1197
+ latents = callback_outputs.pop("latents", latents)
1198
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1199
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1200
+
1201
+ # call the callback, if provided
1202
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1203
+ progress_bar.update()
1204
+ if callback is not None and i % callback_steps == 0:
1205
+ step_idx = i // getattr(self.scheduler, "order", 1)
1206
+ callback(step_idx, t, latents)
1207
+
1208
+ if not output_type == "latent":
1209
+ if num_channels_latents == 12:
1210
+ latents = latents / self.vae.config.scaling_factor
1211
+ if unscale_latents:
1212
+ latents[:, 4:8] = unscale_latents_rm(latents[:, 4:8])
1213
+ latents[:, 8:] = unscale_latents_bump(latents[:, 8:])
1214
+ latents = torch.cat([latents[:, :4], latents[:, 4:8], latents[:, 8:]], dim=0)
1215
+ image = self.vae.decode(latents, return_dict=False, generator=generator)[0]
1218
+ else:
1219
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[0]
1222
+ has_nsfw_concept = None
1223
+ else:
1224
+ image = latents
1225
+ has_nsfw_concept = None
1226
+
1227
+ if has_nsfw_concept is None:
1228
+ do_denormalize = [True] * image.shape[0]
1229
+ else:
1230
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1231
+
1232
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
1233
+
1234
+ # Offload all models
1235
+ self.maybe_free_model_hooks()
1236
+
1237
+ if not return_dict:
1238
+ return (image, has_nsfw_concept)
1239
+
1240
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
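Inside the denoising loop above, classifier-free guidance is applied only to the roughness/metallic and bump channel groups (channels 4:12), while the albedo group (channels 0:4) keeps the text-conditioned prediction. A tensor-level sketch with dummy values:

```python
import torch

guidance_scale = 7.5
noise_pred = torch.randn(2, 12, 64, 64)                 # [unconditional, conditional] stacked on dim 0
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)

guided = noise_pred_uncond[:, 4:] + guidance_scale * (noise_pred_text[:, 4:] - noise_pred_uncond[:, 4:])
noise_pred = torch.cat([noise_pred_text[:, :4], guided], dim=1)
assert noise_pred.shape == (1, 12, 64, 64)
```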
utils/rasterize.py ADDED
@@ -0,0 +1,166 @@
1
+ import nvdiffrast.torch as dr
2
+ import torch
3
+
4
+ from torch import Tensor
5
+ from jaxtyping import Float, Integer
6
+ from typing import Union, Tuple
7
+
8
+ class NVDiffRasterizerContext:
9
+ def __init__(self, context_type: str, device: torch.device) -> None:
10
+ self.device = device
11
+ self.ctx = self.initialize_context(context_type, device)
12
+
13
+ def initialize_context(
14
+ self, context_type: str, device: torch.device
15
+ ) -> Union[dr.RasterizeGLContext, dr.RasterizeCudaContext]:
16
+ if context_type == "gl":
17
+ return dr.RasterizeGLContext(device=device)
18
+ elif context_type == "cuda":
19
+ return dr.RasterizeCudaContext(device=device)
20
+ else:
21
+ raise ValueError(f"Unknown rasterizer context type: {context_type}")
22
+
23
+ def vertex_transform(
24
+ self, verts: Float[Tensor, "Nv 3"], mvp_mtx: Float[Tensor, "B 4 4"]
25
+ ) -> Float[Tensor, "B Nv 4"]:
26
+ with torch.amp.autocast("cuda", enabled=False):
27
+ verts_homo = torch.cat(
28
+ [verts, torch.ones([verts.shape[0], 1]).to(verts)], dim=-1
29
+ )
30
+ verts_clip = torch.matmul(verts_homo, mvp_mtx.permute(0, 2, 1))
31
+ return verts_clip
32
+
33
+ def rasterize(
34
+ self,
35
+ pos: Float[Tensor, "B Nv 4"],
36
+ tri: Integer[Tensor, "Nf 3"],
37
+ resolution: Union[int, Tuple[int, int]],
38
+ ):
39
+ # rasterize in instance mode (single topology)
40
+ return dr.rasterize(self.ctx, pos.float(), tri.int(), resolution, grad_db=True)
41
+
42
+ def rasterize_one(
43
+ self,
44
+ pos: Float[Tensor, "Nv 4"],
45
+ tri: Integer[Tensor, "Nf 3"],
46
+ resolution: Union[int, Tuple[int, int]],
47
+ ):
48
+ # rasterize one single mesh under a single viewpoint
49
+ rast, rast_db = self.rasterize(pos[None, ...], tri, resolution)
50
+ return rast[0], rast_db[0]
51
+
52
+ def antialias(
53
+ self,
54
+ color: Float[Tensor, "B H W C"],
55
+ rast: Float[Tensor, "B H W 4"],
56
+ pos: Float[Tensor, "B Nv 4"],
57
+ tri: Integer[Tensor, "Nf 3"],
58
+ ) -> Float[Tensor, "B H W C"]:
59
+ return dr.antialias(color.float(), rast, pos.float(), tri.int())
60
+
61
+ def interpolate(
62
+ self,
63
+ attr: Float[Tensor, "B Nv C"],
64
+ rast: Float[Tensor, "B H W 4"],
65
+ tri: Integer[Tensor, "Nf 3"],
66
+ rast_db=None,
67
+ diff_attrs=None,
68
+ ) -> Float[Tensor, "B H W C"]:
69
+ return dr.interpolate(
70
+ attr.float(), rast, tri.int(), rast_db=rast_db, diff_attrs=diff_attrs
71
+ )
72
+
73
+ def interpolate_one(
74
+ self,
75
+ attr: Float[Tensor, "Nv C"],
76
+ rast: Float[Tensor, "B H W 4"],
77
+ tri: Integer[Tensor, "Nf 3"],
78
+ rast_db=None,
79
+ diff_attrs=None,
80
+ ) -> Float[Tensor, "B H W C"]:
81
+ return self.interpolate(attr[None, ...], rast, tri, rast_db, diff_attrs)
82
+
83
+ def texture_map_to_rgb(tex_map, uv_coordinates):
84
+ return dr.texture(tex_map.float(), uv_coordinates)
85
+
86
+ def render_rgb_from_texture_mesh_with_mask(
87
+ ctx,
88
+ mesh,
89
+ tex_map: Float[Tensor, "1 H W C"],
90
+ mvp_matrix: Float[Tensor, "batch 4 4"],
91
+ image_height: int,
92
+ image_width: int,
93
+ background_color: Tensor = torch.tensor([0.0, 0.0, 0.0]),
94
+ ):
95
+ batch_size = mvp_matrix.shape[0]
96
+ tex_map = tex_map.contiguous()
97
+ if tex_map.dim() == 3:
98
+ tex_map = tex_map.unsqueeze(0) # Add batch dimension if missing
99
+
100
+ vertex_positions_clip = ctx.vertex_transform(mesh.v_pos, mvp_matrix)
101
+ rasterized_output, _ = ctx.rasterize(vertex_positions_clip, mesh.t_pos_idx, (image_height, image_width))
102
+ mask = rasterized_output[..., 3:] > 0
103
+ mask_antialiased = ctx.antialias(mask.float(), rasterized_output, vertex_positions_clip, mesh.t_pos_idx)
104
+
105
+ interpolated_texture_coords, _ = ctx.interpolate_one(mesh._v_tex, rasterized_output, mesh._t_tex_idx)
106
+ rgb_foreground = texture_map_to_rgb(tex_map.float(), interpolated_texture_coords)
107
+ rgb_foreground_batched = torch.zeros(batch_size, image_height, image_width, 3).to(rgb_foreground)
108
+ rgb_background_batched = torch.zeros(batch_size, image_height, image_width, 3).to(rgb_foreground)
109
+ rgb_background_batched += background_color.view(1, 1, 1, 3).to(rgb_foreground)
110
+
111
+ selector = mask[..., 0]
112
+ rgb_foreground_batched[selector] = rgb_foreground[selector]
113
+
114
+ # Use the anti-aliased mask for blending
115
+ final_rgb = torch.lerp(rgb_background_batched, rgb_foreground_batched, mask_antialiased)
116
+ final_rgb_aa = ctx.antialias(final_rgb, rasterized_output, vertex_positions_clip, mesh.t_pos_idx)
117
+
118
+ return final_rgb_aa, selector
119
+
120
+
121
+ def render_geo_from_mesh(ctx, mesh, mvp_matrix, image_height, image_width):
122
+ device = mvp_matrix.device
123
+ vertex_positions_clip = ctx.vertex_transform(mesh.v_pos.to(device), mvp_matrix)
124
+ rasterized_output, _ = ctx.rasterize(vertex_positions_clip, mesh.t_pos_idx.to(device), (image_height, image_width))
125
+ interpolated_positions, _ = ctx.interpolate_one(mesh.v_pos.to(device), rasterized_output, mesh.t_pos_idx.to(device))
126
+ interpolated_normals, _ = ctx.interpolate_one(mesh.v_normal.to(device).contiguous(), rasterized_output, mesh.t_pos_idx.to(device))
127
+
128
+ mask = rasterized_output[..., 3:] > 0
129
+ mask_antialiased = ctx.antialias(mask.float(), rasterized_output, vertex_positions_clip, mesh.t_pos_idx.to(device))
130
+
131
+ batch_size = mvp_matrix.shape[0]
132
+ rgb_foreground_pos_batched = torch.zeros(batch_size, image_height, image_width, 3).to(interpolated_positions)
133
+ rgb_foreground_norm_batched = torch.zeros(batch_size, image_height, image_width, 3).to(interpolated_positions)
134
+ rgb_background_batched = torch.zeros(batch_size, image_height, image_width, 3).to(interpolated_positions)
135
+
136
+ selector = mask[..., 0]
137
+ rgb_foreground_pos_batched[selector] = interpolated_positions[selector]
138
+ rgb_foreground_norm_batched[selector] = interpolated_normals[selector]
139
+
140
+ final_pos_rgb = torch.lerp(rgb_background_batched, rgb_foreground_pos_batched, mask_antialiased)
141
+ final_norm_rgb = torch.lerp(rgb_background_batched, rgb_foreground_norm_batched, mask_antialiased)
142
+ final_pos_rgb_aa = ctx.antialias(final_pos_rgb, rasterized_output, vertex_positions_clip, mesh.t_pos_idx.to(device))
143
+ final_norm_rgb_aa = ctx.antialias(final_norm_rgb, rasterized_output, vertex_positions_clip, mesh.t_pos_idx.to(device))
144
+
145
+ return final_pos_rgb_aa, final_norm_rgb_aa, mask_antialiased
146
+
147
+ def rasterize_position_and_normal_maps(ctx, mesh, rasterize_height, rasterize_width):
148
+ device = ctx.device
149
+ # Convert mesh data to torch tensors
150
+ mesh_v = mesh.v_pos.to(device)
151
+ mesh_f = mesh.t_pos_idx.to(device)
152
+ uvs_tensor = mesh._v_tex.to(device)
153
+ indices_tensor = mesh._t_tex_idx.to(device)
154
+ normal_v = mesh.v_normal.to(device).contiguous()
155
+
156
+ # Interpolate mesh data
157
+ uv_clip = uvs_tensor[None, ...] * 2.0 - 1.0
158
+ uv_clip_padded = torch.cat((uv_clip, torch.zeros_like(uv_clip[..., :1]), torch.ones_like(uv_clip[..., :1])), dim=-1)
159
+ rasterized_output, _ = ctx.rasterize(uv_clip_padded, indices_tensor.int(), (rasterize_height, rasterize_width))
160
+
161
+ # Interpolate positions.
162
+ position_map, _ = ctx.interpolate_one(mesh_v, rasterized_output, mesh_f.int())
163
+ normal_map, _ = ctx.interpolate_one(normal_v, rasterized_output, mesh_f.int())
164
+ rasterization_mask = rasterized_output[..., 3:4] > 0
165
+
166
+ return position_map, normal_map, rasterization_mask
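A minimal usage sketch for the rasterizer helpers above, assuming a CUDA GPU with nvdiffrast installed; the SimpleMesh stub below only mimics the attributes this module reads (v_pos, t_pos_idx, v_normal, _v_tex, _t_tex_idx) and is not the project's real Mesh class:

import torch
from utils.rasterize import NVDiffRasterizerContext, rasterize_position_and_normal_maps

class SimpleMesh:
    """A stand-in with the attributes rasterize.py expects; not the real utils.mesh_utils.Mesh."""
    def __init__(self):
        # a single triangle with dummy UVs and normals
        self.v_pos = torch.tensor([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.]], device="cuda")
        self.t_pos_idx = torch.tensor([[0, 1, 2]], device="cuda", dtype=torch.int32)
        self.v_normal = torch.tensor([[0., 0., 1.]] * 3, device="cuda")
        self._v_tex = torch.tensor([[0., 0.], [1., 0.], [0., 1.]], device="cuda")
        self._t_tex_idx = self.t_pos_idx.clone()

ctx = NVDiffRasterizerContext("cuda", torch.device("cuda"))
pos_map, normal_map, mask = rasterize_position_and_normal_maps(ctx, SimpleMesh(), 256, 256)
print(pos_map.shape, normal_map.shape, mask.shape)  # (1, 256, 256, 3), (1, 256, 256, 3), (1, 256, 256, 1)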
utils/render_utils.py ADDED
@@ -0,0 +1,352 @@
1
+ import math
2
+ from functools import cache
3
+ from typing import Dict, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from einops import rearrange
9
+ from jaxtyping import Float
10
+ from PIL import Image
11
+ from torch import Tensor
12
+ from torchvision.transforms import ToPILImage
13
+
14
+ from .rasterize import (NVDiffRasterizerContext,
15
+ rasterize_position_and_normal_maps,
16
+ render_geo_from_mesh,
17
+ render_rgb_from_texture_mesh_with_mask)
18
+
19
+ CTX = NVDiffRasterizerContext('cuda', 'cuda')
20
+
21
+ def setup_lights():
22
+ """
23
+ Set three random point lights in the scene.
24
+ """
25
+ raise NotImplementedError("setup_lights function is not implemented yet.")
26
+
27
+ def render_views(mesh, texture, mvp_matrix, lights=None, img_size=(512, 512)) -> Image.Image:
28
+ """
29
+ Render the RGB color images of the mesh. The background will be transparent.
30
+ :param mesh: The mesh to be rendered. Class: Mesh.
31
+ :param texture: The texture of the mesh, a tensor of shape (H, W, 3).
32
+ :param mvp_matrix: The Model-View-Projection matrix for rendering, a tensor of shape (n_v, 4, 4).
33
+ :param lights: The lights in the scene.
34
+ :param img_size: The size of the output image, a tuple (height, width).
35
+ :return: A concatenated PIL Image.
36
+ """
37
+ if texture.shape[-1] != 3:
38
+ texture = texture.permute(1, 2, 0)
39
+ image_height, image_width = img_size
40
+ rgb_cond, mask = render_rgb_from_texture_mesh_with_mask(
41
+ CTX, mesh, texture, mvp_matrix, image_height, image_width, torch.tensor([0.0, 0.0, 0.0], device='cuda'))
42
+
43
+ if mvp_matrix.shape[0] == 0:
44
+ return None
45
+
46
+ pil_images = []
47
+ for i in range(mvp_matrix.shape[0]):
48
+ rgba_img = torch.cat([rgb_cond[i], mask[i].unsqueeze(-1)], dim=-1) # [H, W, 3] + [H, W, 1] -> [H, W, 4]
49
+ rgba_img = (rgba_img * 255).to(torch.uint8) # Convert to uint8
50
+ rgba_img = rgba_img.cpu().numpy() # Convert to numpy array
51
+ pil_images.append(Image.fromarray(rgba_img, mode='RGBA'))
52
+
53
+ if not pil_images:
54
+ return None
55
+
56
+ total_width = sum(img.width for img in pil_images)
57
+ max_height = max(img.height for img in pil_images)
58
+
59
+ concatenated_image = Image.new('RGBA', (total_width, max_height))
60
+
61
+ current_x = 0
62
+ for img in pil_images:
63
+ concatenated_image.paste(img, (current_x, 0))
64
+ current_x += img.width
65
+
66
+ return concatenated_image
67
+
68
+ def render_geo_views_tensor(mesh, mvp_matrix, img_size=(512, 512)) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
69
+ """
70
+ Render the geometry information (position images, normal images, and masks) from the views implied by the MVP matrices.
71
+ """
72
+ image_height, image_width = img_size
73
+ position_images, normal_images, mask_images = render_geo_from_mesh(CTX, mesh, mvp_matrix, image_height, image_width)
74
+ return position_images, normal_images, mask_images
75
+
76
+ def render_geo_map(mesh, map_size=(1024, 1024)) -> tuple[torch.Tensor, torch.Tensor]:
77
+ """
78
+ Render the geometry information including position and normal from UV parameterization.
79
+ """
80
+ map_height, map_width = map_size
81
+ position_images, normal_images, mask = rasterize_position_and_normal_maps(CTX, mesh, map_height, map_width)
82
+ # out_imgs = []
83
+ # if mask.ndim == 4:
84
+ # mask = mask[0]
85
+ # for img_map in [position_images, normal_images]:
86
+ # if img_map.ndim == 4:
87
+ # img_map = img_map[0]
88
+ # # normalize to [0, 1]
89
+ # img_map = (img_map - img_map.min()) / (img_map.max() - img_map.min() + 1e-6)
90
+
91
+ # rgba_img = torch.cat([img_map, mask], dim=-1) # [H, W, 3] + [H, W, 1] -> [H, W, 4]
92
+ # rgba_img = (rgba_img * 255).to(torch.uint8) # Convert to uint8
93
+ # rgba_img = rgba_img.cpu().numpy() # Convert to numpy array
94
+ # out_imgs.append(Image.fromarray(rgba_img, mode='RGBA'))
95
+ return position_images, normal_images
96
+
97
+ @cache
98
+ def get_pure_texture(uv_size, color=(int("0x55", 16), int("0x55", 16), int("0x55", 16))) -> torch.Tensor:
99
+ """
100
+ get a pure texture image with the specified color.
101
+ :param uv_size: The size of the UV map (height, width).
102
+ :param color: The color of the texture; default is "0x555555" (a neutral dark gray).
103
+ :return: A texture image tensor of shape (height, width, 3).
104
+ """
105
+ height, width = uv_size
106
+
107
+ color = torch.tensor(color, dtype=torch.float32).view(1, 1, 3) / 255.0
108
+ texture = color.repeat(height, width, 1)
109
+
110
+ return texture
111
+
112
+ def get_c2w(
113
+ azimuth_deg,
114
+ elevation_deg,
115
+ camera_distances,):
116
+ assert len(azimuth_deg) == len(elevation_deg) == len(camera_distances)
117
+ n_views = len(azimuth_deg)
118
+ #camera_distances = torch.full_like(elevation_deg, dis)
119
+ elevation = elevation_deg * math.pi / 180
120
+ azimuth = azimuth_deg * math.pi / 180
121
+ camera_positions = torch.stack(
122
+ [
123
+ camera_distances * torch.cos(elevation) * torch.cos(azimuth),
124
+ camera_distances * torch.cos(elevation) * torch.sin(azimuth),
125
+ camera_distances * torch.sin(elevation),
126
+ ],
127
+ dim=-1,
128
+ )
129
+ center = torch.zeros_like(camera_positions)
130
+ up = torch.as_tensor([0, 0, 1], dtype=torch.float32)[None, :].repeat(n_views, 1)
131
+ lookat = F.normalize(center - camera_positions, dim=-1)
132
+ right = F.normalize(torch.cross(lookat, up, dim=-1), dim=-1)
133
+ up = F.normalize(torch.cross(right, lookat, dim=-1), dim=-1)
134
+ c2w3x4 = torch.cat(
135
+ [torch.stack([right, up, -lookat], dim=-1), camera_positions[:, :, None]],
136
+ dim=-1,
137
+ )
138
+ c2w = torch.cat([c2w3x4, torch.zeros_like(c2w3x4[:, :1])], dim=1)
139
+ c2w[:, 3, 3] = 1.0
140
+ return c2w
141
+
142
+ def camera_strategy_test_4_90deg(
143
+ mesh: Dict,
144
+ num_views: int = 4,
145
+ **kwargs) -> Dict:
146
+ """
147
+ For supervision views: fixed elevation (10 degrees), evenly spaced azimuths, a fixed distance derived from the mesh bounding box, and a fixed fov.
148
+ :param num_views: number of supervision views
149
+ :param kwargs: additional arguments
150
+ """
151
+ # Default camera intrinsics
152
+ default_elevation = 10
153
+ default_camera_lens = 50
154
+ default_camera_sensor_width = 36
155
+ default_fovy = 2 * np.arctan(default_camera_sensor_width / (2 * default_camera_lens))
156
+
157
+ bbox_size = mesh.v_pos.max(dim=0)[0] - mesh.v_pos.min(dim=0)[0]
158
+ distance = default_camera_lens / default_camera_sensor_width * \
159
+ math.sqrt(bbox_size[0] ** 2 + bbox_size[1] ** 2 + bbox_size[2] ** 2)
160
+
161
+ all_azimuth_deg = torch.linspace(0, 360.0, num_views + 1)[:num_views] - 90
162
+
163
+ all_elevation_deg = torch.full_like(all_azimuth_deg, default_elevation)
164
+
165
+ # Get the corresponding azimuth and elevation
166
+ view_idxs = torch.arange(0, num_views)
167
+ azimuth = all_azimuth_deg[view_idxs]
168
+ elevation = all_elevation_deg[view_idxs]
169
+ camera_distances = torch.full_like(elevation, distance)
170
+ c2w = get_c2w(azimuth, elevation, camera_distances)
171
+
172
+ if c2w.ndim == 2:
173
+ w2c: Float[Tensor, "4 4"] = torch.zeros(4, 4).to(c2w)
174
+ w2c[:3, :3] = c2w[:3, :3].permute(1, 0)
175
+ w2c[:3, 3:] = -c2w[:3, :3].permute(1, 0) @ c2w[:3, 3:]
176
+ w2c[3, 3] = 1.0
177
+ else:
178
+ w2c: Float[Tensor, "B 4 4"] = torch.zeros(c2w.shape[0], 4, 4).to(c2w)
179
+ w2c[:, :3, :3] = c2w[:, :3, :3].permute(0, 2, 1)
180
+ w2c[:, :3, 3:] = -c2w[:, :3, :3].permute(0, 2, 1) @ c2w[:, :3, 3:]
181
+ w2c[:, 3, 3] = 1.0
182
+
183
+ fovy = torch.full_like(azimuth, default_fovy)
184
+
185
+ return {
186
+ 'cond_sup_view_idxs': view_idxs,
187
+ 'cond_sup_c2w': c2w,
188
+ 'cond_sup_w2c': w2c,
189
+ 'cond_sup_fovy': fovy,
190
+ # 'cond_sup_azimuth': azimuth,
191
+ # 'cond_sup_elevation': elevation,
192
+ }
193
+
194
+ def _get_projection_matrix(
195
+ fovy: Union[float, Float[Tensor, "B"]], aspect_wh: float, near: float, far: float
196
+ ) -> Float[Tensor, "*B 4 4"]:
197
+ if isinstance(fovy, float):
198
+ proj_mtx = torch.zeros(4, 4, dtype=torch.float32)
199
+ proj_mtx[0, 0] = 1.0 / (math.tan(fovy / 2.0) * aspect_wh)
200
+ proj_mtx[1, 1] = -1.0 / math.tan(
201
+ fovy / 2.0
202
+ ) # add a negative sign here as the y axis is flipped in nvdiffrast output
203
+ proj_mtx[2, 2] = -(far + near) / (far - near)
204
+ proj_mtx[2, 3] = -2.0 * far * near / (far - near)
205
+ proj_mtx[3, 2] = -1.0
206
+ else:
207
+ batch_size = fovy.shape[0]
208
+ proj_mtx = torch.zeros(batch_size, 4, 4, dtype=torch.float32)
209
+ proj_mtx[:, 0, 0] = 1.0 / (torch.tan(fovy / 2.0) * aspect_wh)
210
+ proj_mtx[:, 1, 1] = -1.0 / torch.tan(
211
+ fovy / 2.0
212
+ ) # add a negative sign here as the y axis is flipped in nvdiffrast output
213
+ proj_mtx[:, 2, 2] = -(far + near) / (far - near)
214
+ proj_mtx[:, 2, 3] = -2.0 * far * near / (far - near)
215
+ proj_mtx[:, 3, 2] = -1.0
216
+ return proj_mtx
217
+
218
+ def _get_mvp_matrix(
219
+ c2w: Float[Tensor, "*B 4 4"], proj_mtx: Float[Tensor, "*B 4 4"]
220
+ ) -> Float[Tensor, "*B 4 4"]:
221
+ # calculate w2c from c2w: R' = Rt, t' = -Rt * t
222
+ # mathematically equivalent to (c2w)^-1
223
+ if c2w.ndim == 2:
224
+ assert proj_mtx.ndim == 2
225
+ w2c: Float[Tensor, "4 4"] = torch.zeros(4, 4).to(c2w)
226
+ w2c[:3, :3] = c2w[:3, :3].permute(1, 0)
227
+ w2c[:3, 3:] = -c2w[:3, :3].permute(1, 0) @ c2w[:3, 3:]
228
+ w2c[3, 3] = 1.0
229
+ else:
230
+ w2c: Float[Tensor, "B 4 4"] = torch.zeros(c2w.shape[0], 4, 4).to(c2w)
231
+ w2c[:, :3, :3] = c2w[:, :3, :3].permute(0, 2, 1)
232
+ w2c[:, :3, 3:] = -c2w[:, :3, :3].permute(0, 2, 1) @ c2w[:, :3, 3:]
233
+ w2c[:, 3, 3] = 1.0
234
+ # calculate mvp matrix by proj_mtx @ w2c (mv_mtx)
235
+ mvp_mtx = proj_mtx @ w2c
236
+ return mvp_mtx
237
+
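The world-to-camera construction used here and in camera_strategy_test_4_90deg (transpose the rotation block, rotate and negate the translation) is the closed-form inverse of a rigid c2w transform; a small self-contained sanity check on a randomly generated pose, kept as an illustrative sketch:

import torch

q = torch.linalg.qr(torch.randn(3, 3)).Q      # random orthonormal rotation
c2w = torch.eye(4)
c2w[:3, :3] = q
c2w[:3, 3] = torch.randn(3)

w2c = torch.eye(4)
w2c[:3, :3] = c2w[:3, :3].T
w2c[:3, 3:] = -c2w[:3, :3].T @ c2w[:3, 3:]

assert torch.allclose(w2c, torch.linalg.inv(c2w), atol=1e-5)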
238
+ def get_mvp_matrix(mesh, num_views=4, width=512, height=512, strategy="strategy_test_4_90deg"):
239
+ """
240
+ Get Model-View-Projection (MVP) matrix for rendering views.
241
+ :param mesh: The mesh object to determine camera positioning.
242
+ :param num_views: Number of views to generate, default is 4.
243
+ :param width: Image width for projection matrix calculation.
244
+ :param height: Image height for projection matrix calculation.
245
+ :param strategy: Camera positioning strategy, default is "strategy_test_4_90deg".
246
+ :return: MVP matrix and world-to-camera transformation matrix.
247
+ """
248
+ if strategy == "strategy_test_4_90deg":
249
+ camera_info = camera_strategy_test_4_90deg(
250
+ mesh=mesh, # Dummy mesh for camera strategy
251
+ num_views=num_views,
252
+ )
253
+ cond_sup_fovy = camera_info["cond_sup_fovy"]
254
+ cond_sup_c2w = camera_info["cond_sup_c2w"]
255
+ cond_sup_w2c = camera_info["cond_sup_w2c"]
256
+ # cond_sup_azimuth = camera_info["cond_sup_azimuth"]
257
+ # cond_sup_elevation = camera_info["cond_sup_elevation"]
258
+ else:
259
+ raise ValueError(f"Unsupported camera strategy: {strategy}")
260
+ cond_sup_proj_mtx: Float[Tensor, "B 4 4"] = _get_projection_matrix(
261
+ cond_sup_fovy, width / height, 0.1, 1000.0
262
+ )
263
+ mvp_mtx: Float[Tensor, "B 4 4"] = _get_mvp_matrix(cond_sup_c2w, cond_sup_proj_mtx)
264
+ return mvp_mtx, cond_sup_w2c
265
+
266
+ @torch.cuda.amp.autocast(enabled=False)
267
+ def _get_depth_normal_map_with_mask(xyz_map, normal_map, mask, w2c, device="cuda", background_color=(0, 0, 0)):
268
+ """
269
+ Get depth and normal map with mask from position and normal images.
270
+ :param xyz_map: Position images in world coordinate, shape [B, Nv, H, W, 3]. It is the return value of `render_geo_views`.
271
+ :param normal_map: Normal images in world coordinate, shape [B, Nv, H, W, 3]. It is the return value of `render_geo_views`.
272
+ :param mask: Mask for the images, shape [B, Nv, H, W]. It is the return value of `render_geo_views`.
273
+ :param w2c: World to camera transformation matrix, shape [B, Nv, 4, 4].
274
+ :param device: Device to run the computation on, default is "cuda".
275
+ :param background_color: Background color for the depth and normal maps.
276
+ :return: depth_map, normal_map, mask
277
+ """
278
+ w2c = w2c.to(device)
279
+
280
+ # Render world coordinate position map and mask
281
+ B, Nv, H, W, C = xyz_map.shape # B: batch size, Nv: number of views, H/W: height/width, C: channels
282
+ assert Nv == 1
283
+ # Rearrange tensors for batch processing
284
+ xyz_map = rearrange(xyz_map, "B Nv H W C -> (B Nv) (H W) C")
285
+ normal_map = rearrange(normal_map, "B Nv H W C -> (B Nv) (H W) C")
286
+ w2c = rearrange(w2c, "B Nv C1 C2 -> (B Nv) C1 C2")
287
+
288
+ # Create homogeneous coordinates and correctly transform to camera coordinate system
289
+ # Points in world coordinate system need to be multiplied by world-to-camera transformation matrix
290
+ B_Nv, N, C = xyz_map.shape
291
+ ones = torch.ones(B_Nv, N, 1, dtype=xyz_map.dtype, device=xyz_map.device)
292
+ homogeneous_xyz = torch.cat([xyz_map, ones], dim=2) # [x,y,z,1]
293
+ zeros = torch.zeros(B_Nv, N, 1, dtype=xyz_map.dtype, device=xyz_map.device)
294
+ homogeneous_normal = torch.cat([normal_map, zeros], dim=2) # [x,y,z,1]
295
+
296
+ camera_coords = torch.bmm(homogeneous_xyz, w2c.transpose(1, 2))
297
+ camera_normals = torch.bmm(homogeneous_normal, w2c.transpose(1, 2))
298
+
299
+ depth_map = camera_coords[..., 2:3] # Z-axis is the depth direction in camera coordinate system
300
+ depth_map = rearrange(depth_map, "(B Nv) (H W) 1 -> B Nv H W", B=B, Nv=Nv, H=H, W=W)
301
+ normal_map = camera_normals[..., :3] # Keep only x, y, z components
302
+ normal_map = rearrange(normal_map, "(B Nv) (H W) c -> B Nv H W c", B=B, Nv=Nv, H=H, W=W)
303
+ assert depth_map.dtype == torch.float32, f"depth_map must be float32, otherwise there will be artifacts in the ControlNet-generated images, but got {depth_map.dtype}"
304
+
305
+ # Calculate min and max values
306
+ min_depth = depth_map.amin((1,2,3), keepdim=True)
307
+ max_depth = depth_map.amax((1,2,3), keepdim=True)
308
+
309
+ depth_map = (depth_map - min_depth) / (max_depth - min_depth + 1e-6) # Normalize to [0, 1]
310
+
311
+ depth_map = depth_map.repeat(1, 3, 1, 1) # Repeat 3 times to get RGB depth map
312
+ normal_map = normal_map * 0.5 + 0.5 # Normalize to [0, 1], [B, Nv, H, W, 3]
313
+ normal_map = normal_map[:,0].permute(0, 3, 1, 2) # [B, 3, H, W]
314
+
315
+ rgb_background_batched = torch.tensor(background_color, dtype=torch.float32, device=device).view(1, 3, 1, 1)
316
+ depth_map = torch.lerp(rgb_background_batched, depth_map, mask)
317
+ normal_map = torch.lerp(rgb_background_batched, normal_map, mask)
318
+
319
+ return depth_map, normal_map, mask
320
+
321
+ def get_silhouette_image(position_imgs, normal_imgs, mask_imgs, w2c, selected_view="First View") -> tuple[Image.Image, Image.Image, Image.Image]:
322
+ """
323
+ Get the silhouette image based on geometry image.
324
+ :param position_imgs: Position images from different views, shape [Nv, H, W, 3].
325
+ :param normal_imgs: Normal images from different views, shape [Nv, H, W, 3].
326
+ :param mask_imgs: Mask for the images, shape [Nv, H, W]. It is the return value of `render_geo_views`.
327
+ :param w2c: World to camera transformation matrix, shape [Nv, 4, 4].
328
+ :param selected_view: The view selected for generating the image condition.
329
+ :return: the depth map, the normal map (both in the camera coordinate system), and the mask, as PIL images.
330
+ """
331
+ view_id_map = {
332
+ "First View": 0,
333
+ "Second View": 1,
334
+ "Third View": 2,
335
+ "Fourth View": 3
336
+ }
337
+ view_id = view_id_map[selected_view]
338
+ position_view = position_imgs[view_id: view_id + 1]
339
+ normal_view = normal_imgs[view_id: view_id + 1]
340
+ mask_view = mask_imgs[view_id: view_id + 1]
341
+ w2c = w2c[view_id: view_id + 1] # Select the corresponding w2c for the view
342
+
343
+ depth_img, normal_img, mask = _get_depth_normal_map_with_mask(
344
+ position_view.unsqueeze(0), # Add batch dimension
345
+ normal_view.unsqueeze(0),
346
+ mask_view.unsqueeze(0),
347
+ w2c.unsqueeze(0),
348
+ )
349
+
350
+ to_img = ToPILImage()
351
+ return to_img(depth_img.squeeze(0)), to_img(normal_img.squeeze(0)), to_img(mask.squeeze(0))
352
+
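A hedged sketch of how the helpers in this module fit together; `mesh` is assumed to be an already loaded utils.mesh_utils.Mesh instance (its loading API is not part of this file), and a CUDA device with nvdiffrast is required:

from utils.render_utils import (get_mvp_matrix, get_pure_texture, render_views,
                                render_geo_views_tensor, get_silhouette_image)

# mesh: a utils.mesh_utils.Mesh loaded elsewhere (e.g. in app.py from a .glb file)
mvp_matrix, w2c = get_mvp_matrix(mesh, num_views=4, width=512, height=512)
texture = get_pure_texture((1024, 1024)).to("cuda")                 # flat gray placeholder texture
preview = render_views(mesh, texture, mvp_matrix)                   # one PIL image, four views side by side
pos, nrm, mask = render_geo_views_tensor(mesh, mvp_matrix, img_size=(512, 512))
depth_img, normal_img, mask_img = get_silhouette_image(
    pos, nrm, mask.squeeze(-1), w2c, selected_view="First View")    # mask_imgs is documented as [Nv, H, W]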
utils/texture_generation.py ADDED
@@ -0,0 +1,309 @@
1
+ import os
2
+ import threading
3
+ from dataclasses import dataclass
4
+ from urllib.parse import urlparse
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ import spaces
9
+ import torch
10
+ from diffusers.models import AutoencoderKLWan
11
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
12
+ from einops import rearrange
13
+ from jaxtyping import Float
14
+ from peft import LoraConfig
15
+ from PIL import Image
16
+ from torch import Tensor
17
+
18
+ from wan.pipeline_wan_t2tex_extra import WanT2TexPipeline
19
+ from wan.wan_t2tex_transformer_3d_extra import WanT2TexTransformer3DModel
20
+
21
+ TEX_PIPE = None
22
+ VAE = None
23
+ LATENTS_MEAN, LATENTS_STD = None, None
24
+ TEX_PIPE_LOCK = threading.Lock()
25
+
26
+ @dataclass
27
+ class Config:
28
+ video_base_name: str = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
29
+ seqtex_path: str = "https://huggingface.co/VAST-AI/SeqTex/resolve/main/.gitattributes/edm2_ema_12176_clean.pth"
30
+ min_noise_level_index: int = 15  # same as in the WorldMem paper (https://arxiv.org/pdf/2504.12369v1)
31
+
32
+ use_causal_mask: bool = False
33
+ addtional_qk_geometry: bool = False
34
+ use_normal: bool = True
35
+ use_position: bool = True
36
+ randomly_init: bool = True  # extra modules start randomly initialized; the full SeqTex checkpoint is loaded below
37
+
38
+ num_views: int = 4
39
+ uv_num_views: int = 1
40
+ mv_height: int = 512
41
+ mv_width: int = 512
42
+ uv_height: int = 1024
43
+ uv_width: int = 1024
44
+
45
+ flow_shift: float = 5.0
46
+ eval_guidance_scale: float = 1.0
47
+ eval_num_inference_steps: int = 30
48
+ eval_seed: int = 42
49
+
50
+ lora_rank: int = 128
51
+ lora_alpha: int = 64
52
+
53
+ cfg = Config()
54
+
55
+ def load_model_weights(model_path: str, map_location="cpu"):
56
+ """
57
+ Load model weights from either a URL or local file path.
58
+
59
+ Args:
60
+ model_path (str): Path to model weights, can be URL or local file path
61
+ map_location (str): Device to map the model to
62
+
63
+ Returns:
64
+ Dict: Loaded state dictionary
65
+ """
66
+ # Check if the path is a URL
67
+ parsed_url = urlparse(model_path)
68
+ if parsed_url.scheme in ('http', 'https'):
69
+ # Load from URL using torch.hub
70
+ try:
71
+ state_dict = torch.hub.load_state_dict_from_url(
72
+ model_path,
73
+ map_location=map_location,
74
+ progress=True
75
+ )
76
+ return state_dict
77
+ except Exception as e:
78
+ gr.Warning(f"Failed to load from URL: {e}")
79
+ raise e
80
+ else:
81
+ # Load from local file path
82
+ if not os.path.exists(model_path):
83
+ raise FileNotFoundError(f"Local model file not found: {model_path}")
84
+ return torch.load(model_path, map_location=map_location)
85
+
86
+ def lazy_get_seqtex_pipe():
87
+ """
88
+ Lazy load the SeqTex pipeline for texture generation.
89
+ """
90
+ global TEX_PIPE, VAE, LATENTS_MEAN, LATENTS_STD
91
+ if TEX_PIPE is not None:
92
+ return TEX_PIPE
93
+ gr.Info("First called, loading SeqTex pipeline... It may take about 1 minute.")
94
+ with TEX_PIPE_LOCK:
95
+ if TEX_PIPE is not None:
96
+ return TEX_PIPE
97
+
98
+ # Pipeline
99
+ TEX_PIPE = WanT2TexPipeline.from_pretrained(cfg.video_base_name)
100
+
101
+ # Models
102
+ transformer = WanT2TexTransformer3DModel(
103
+ TEX_PIPE.transformer,
104
+ use_causal_mask=cfg.use_causal_mask,
105
+ addtional_qk_geo=cfg.addtional_qk_geometry,
106
+ use_normal=cfg.use_normal,
107
+ use_position=cfg.use_position,
108
+ randomly_init=cfg.randomly_init,
109
+ )
110
+ transformer.add_adapter(
111
+ LoraConfig(
112
+ r=cfg.lora_rank,
113
+ lora_alpha=cfg.lora_alpha,
114
+ init_lora_weights=True,
115
+ target_modules=["attn1.to_q", "attn1.to_k", "attn1.to_v", "attn1.to_out.0", "attn1.to_out.2",
116
+ "ffn.net.0.proj", "ffn.net.2"],
117
+ )
118
+ )
119
+ # load transformer
120
+ state_dict = load_model_weights(cfg.seqtex_path, map_location="cpu")
121
+ transformer.load_state_dict(state_dict, strict=True)
122
+ TEX_PIPE.transformer = transformer
123
+
124
+ VAE = AutoencoderKLWan.from_pretrained(cfg.video_base_name, subfolder="vae", torch_dtype=torch.float32).to("cuda").requires_grad_(False)
125
+ TEX_PIPE.vae = VAE
126
+
127
+ # Some useful parameters
128
+ LATENTS_MEAN = torch.tensor(VAE.config.latents_mean).view(
129
+ 1, VAE.config.z_dim, 1, 1, 1
130
+ ).to("cuda", dtype=torch.float32)
131
+ LATENTS_STD = 1.0 / torch.tensor(VAE.config.latents_std).view(
132
+ 1, VAE.config.z_dim, 1, 1, 1
133
+ ).to("cuda", dtype=torch.float32)
134
+
135
+ scheduler: FlowMatchEulerDiscreteScheduler = (
136
+ FlowMatchEulerDiscreteScheduler.from_config(
137
+ TEX_PIPE.scheduler.config, shift=cfg.flow_shift
138
+ )
139
+ )
140
+ min_noise_level_index = scheduler.config.num_train_timesteps - cfg.min_noise_level_index  # the first timestep in this scheduler is pure noise, so this is typically 1000 - 15
141
+ setattr(TEX_PIPE, "min_noise_level_index", min_noise_level_index)
142
+ min_noise_level_timestep = scheduler.timesteps[min_noise_level_index]
143
+ setattr(TEX_PIPE, "min_noise_level_timestep", min_noise_level_timestep)
144
+ setattr(TEX_PIPE, "min_noise_level_sigma", min_noise_level_timestep / 1000.)
145
+
146
+ TEX_PIPE = TEX_PIPE.to("cuda", dtype=torch.float32) # use float32 for inference
147
+ return TEX_PIPE
148
+
149
+ @torch.amp.autocast('cuda', dtype=torch.float32)
150
+ def encode_images(
151
+ images: Float[Tensor, "B F H W C"], encode_as_first: bool = False
152
+ ) -> Float[Tensor, "B C' F H/8 W/8"]:
153
+ """
154
+ Encode images to latent space using VAE.
155
+ Every frame is seen as a separate image, without any awareness of the temporal dimension.
156
+ :param images: Input images tensor with shape [B, F, H, W, C].
157
+ :param encode_as_first: Whether to encode all frames as the first frame.
158
+ :return: Encoded latents with shape [B, C', F, H/8, W/8].
159
+ """
160
+ if images.min() < - 0.1:
161
+ # images are in [-1, 1] range
162
+ images = (images + 1.0) / 2.0 # Normalize to [0, 1] range
163
+ if encode_as_first:
164
+ # encode all the frame as the first one
165
+ B = images.shape[0]
166
+ images = rearrange(images, "B F H W C -> (B F) C 1 H W")
167
+ latents = (VAE.encode(images).latent_dist.sample() - LATENTS_MEAN) * LATENTS_STD
168
+ latents = rearrange(latents, "(B F) C 1 H W -> B C F H W", B=B)
169
+ else:
170
+ raise NotImplementedError("Currently only support encode as first frame.")
171
+
172
+ return latents
173
+
174
+ # @torch.no_grad()
175
+ # @torch.amp.autocast('cuda', dtype=torch.float32)
176
+ # def decode_images(self, latents: Float[Tensor, "B C F H W"], decode_as_first: bool = False):
177
+ # if decode_as_first:
178
+ # F = latents.shape[2]
179
+ # latents = latents.to(self.vae.dtype)
180
+ # latents = latents / self.latents_std + self.latents_mean
181
+ # latents = rearrange(latents, "B C F H W -> (B F) C 1 H W")
182
+ # images = self.vae.decode(latents, return_dict=False)[0]
183
+ # images = rearrange(images, "(B F) C Nv H W -> B C (F Nv) H W", F=F, Nv=1)
184
+ # else:
185
+ # raise NotImplementedError("Currently only support decode as first frame.")
186
+ # return images
187
+ @torch.amp.autocast('cuda', dtype=torch.float32)
188
+ def decode_images(latents: Float[Tensor, "B C F H W"], decode_as_first: bool = False):
189
+ """
190
+ Decode latents back to images using VAE.
191
+ :param latents: Input latents with shape [B, C, F, H, W].
192
+ :param decode_as_first: Whether to decode all frames as the first frame.
193
+ :return: Decoded images with shape [B, C, F*Nv, H*8, W*8].
194
+ """
195
+ if decode_as_first:
196
+ F = latents.shape[2]
197
+ latents = latents.to(VAE.dtype)
198
+ latents = latents / LATENTS_STD + LATENTS_MEAN
199
+ latents = rearrange(latents, "B C F H W -> (B F) C 1 H W")
200
+ images = VAE.decode(latents, return_dict=False)[0]
201
+ images = rearrange(images, "(B F) C Nv H W -> B C (F Nv) H W", F=F, Nv=1)
202
+ else:
203
+ raise NotImplementedError("Currently only support decode as first frame.")
204
+ return images
205
+
206
+ def convert_img_to_tensor(image: Image.Image, device="cuda") -> Float[Tensor, "H W C"]:
207
+ """
208
+ Convert a PIL Image to a tensor. If Image is RGBA, mask it with black background using a-channel mask.
209
+ :param image: PIL Image to convert. [0, 255]
210
+ :return: Tensor representation of the image. [0.0, 1.0], still [H, W, C]
211
+ """
212
+ # Convert to RGBA to ensure alpha channel exists
213
+ image = image.convert("RGBA")
214
+ np_img = np.array(image)
215
+ rgb = np_img[..., :3]
216
+ alpha = np_img[..., 3:4] / 255.0 # Normalize alpha to [0, 1]
217
+ # Blend with black background using alpha mask
218
+ rgb = rgb * alpha
219
+ rgb = rgb.astype(np.float32) / 255.0 # Normalize to [0, 1]
220
+ tensor = torch.from_numpy(rgb).to(device)
221
+ return tensor
222
+
223
+ @spaces.GPU(duration=120)
224
+ @torch.cuda.amp.autocast(dtype=torch.float32)
225
+ @torch.inference_mode
226
+ @torch.no_grad
227
+ def generate_texture(position_map, normal_map, position_images, normal_images, condition_image, text_prompt, selected_view, negative_prompt=None, device="cuda", progress=gr.Progress()):
228
+ """
229
+ Use SeqTex to generate texture for the mesh based on the image condition.
230
+ :param position_map: Position map rasterized in UV space.
+ :param normal_map: Normal map rasterized in UV space.
+ :param position_images: List of position images from different views.
231
+ :param normal_images: List of normal images from different views.
232
+ :param condition_image: Image condition generated from the selected view.
233
+ :param text_prompt: Text prompt for texture generation.
234
+ :param selected_view: The view selected for generating the image condition.
235
+ :return: Generated texture map, and multi-view frames in tensor.
236
+ """
237
+ progress(0, desc="Loading SeqTex pipeline...")
238
+ tex_pipe = lazy_get_seqtex_pipe()
239
+ progress(0.2, desc="SeqTex pipeline loaded successfully.")
240
+ view_id_map = {
241
+ "First View": 0,
242
+ "Second View": 1,
243
+ "Third View": 2,
244
+ "Fourth View": 3
245
+ }
246
+ view_id = view_id_map[selected_view]
247
+
248
+ progress(0.3, desc="Encoding position and normal images...")
249
+ nat_seq = torch.cat([position_images.unsqueeze(0), normal_images.unsqueeze(0)], dim=0) # 1 F H W C
250
+ uv_seq = torch.cat([position_map.unsqueeze(0), normal_map.unsqueeze(0)], dim=0)
251
+ nat_latents = encode_images(nat_seq, encode_as_first=True) # B C F H W
252
+ uv_latents = encode_images(uv_seq, encode_as_first=True) # B C F' H' W'
253
+ nat_pos_latents, nat_norm_latents = torch.chunk(nat_latents, 2, dim=0)
254
+ uv_pos_latents, uv_norm_latents = torch.chunk(uv_latents, 2, dim=0)
255
+ nat_geo_latents = torch.cat([nat_pos_latents, nat_norm_latents], dim=1)
256
+ uv_geo_latents = torch.cat([uv_pos_latents, uv_norm_latents], dim=1)
257
+ cond_model_latents = (nat_geo_latents, uv_geo_latents)
258
+
259
+ num_frames = cfg.num_views * (2 ** sum(VAE.config.temperal_downsample))
260
+ uv_num_frames = cfg.uv_num_views * (2 ** sum(VAE.config.temperal_downsample))
261
+
262
+ progress(0.4, desc="Encoding condition image...")
263
+ if isinstance(condition_image, Image.Image):
264
+ condition_image = condition_image.resize((cfg.mv_width, cfg.mv_height), Image.LANCZOS)
265
+ # Convert PIL Image to tensor
266
+ condition_image = convert_img_to_tensor(condition_image, device=device)
267
+ condition_image = condition_image.unsqueeze(0).unsqueeze(0)
268
+ gt_latents = (encode_images(condition_image, encode_as_first=True), None)
269
+
270
+ progress(0.5, desc="Generating texture with SeqTex...")
271
+ latents = tex_pipe(
272
+ prompt=text_prompt,
273
+ negative_prompt=negative_prompt,
274
+ num_frames=num_frames,
275
+ generator=torch.Generator(device=device).manual_seed(cfg.eval_seed),
276
+ num_inference_steps=cfg.eval_num_inference_steps,
277
+ guidance_scale=cfg.eval_guidance_scale,
278
+ height=cfg.mv_height,
279
+ width=cfg.mv_width,
280
+ output_type="latent",
281
+
282
+ cond_model_latents=cond_model_latents,
283
+ # mask_indices=test_mask_indices,
284
+ uv_height=cfg.uv_height,
285
+ uv_width=cfg.uv_width,
286
+ uv_num_frames=uv_num_frames,
287
+ treat_as_first=True,
288
+ gt_condition=gt_latents,
289
+ inference_img_cond_frame=view_id,
290
+ use_qk_geometry=True,
291
+ task_type="img2tex", # img2tex
292
+ progress=progress,
293
+ ).frames
294
+
295
+ mv_latents, uv_latents = latents
296
+
297
+ progress(0.9, desc="Decoding generated latents to images...")
298
+ mv_frames = decode_images(mv_latents, decode_as_first=True) # B C 4 H W
299
+ uv_frames = decode_images(uv_latents, decode_as_first=True) # B C 1 H W
300
+
301
+ uv_map_pred = uv_frames[:, :, -1, ...]
302
+ uv_map_pred.squeeze_(0)
303
+ mv_out = rearrange(mv_frames[:, :, :cfg.num_views, ...], "B C (F N) H W -> N C (B H) (F W)", N=1)[0]
304
+
305
+ mv_out = torch.clamp(mv_out, 0.0, 1.0)
306
+ uv_map_pred = torch.clamp(uv_map_pred, 0.0, 1.0)
307
+
308
+ progress(1, desc="Texture generated successfully.")
309
+ return uv_map_pred.float(), mv_out.float(), "Step 3: Texture generated successfully."
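A small illustrative calculation of the frame bookkeeping used above; the temporal-downsample configuration is an assumption about the Wan 2.1 VAE config, not something defined in this file:

# Each temporal-downsample stage halves the time axis, so 4 views must be announced
# as 4 * 2**2 = 16 "frames" for prepare_latents(..., treat_as_first=True) to end up
# with 16 // 4 = 4 latent frames, one per view (plus 1 * 4 = 4 frames for the UV branch).
temperal_downsample = [False, True, True]        # assumed VAE config value
vae_scale_factor_temporal = 2 ** sum(temperal_downsample)
num_views = 4
num_frames = num_views * vae_scale_factor_temporal
assert num_frames // vae_scale_factor_temporal == num_views
print(num_frames, vae_scale_factor_temporal)     # 16 4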
wan/__init__.py ADDED
File without changes
wan/pipeline_wan_t2tex_extra.py ADDED
@@ -0,0 +1,366 @@
1
+ import copy
2
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
3
+
4
+ from einops import rearrange
5
+ import regex as re
6
+ import torch
7
+ from diffusers.pipelines.wan.pipeline_wan import WanPipeline
8
+ from diffusers.pipelines.wan.pipeline_output import WanPipelineOutput
9
+ from diffusers.callbacks import PipelineCallback, MultiPipelineCallbacks
10
+ from diffusers.utils.torch_utils import randn_tensor
11
+ from torch import Tensor
12
+ from transformers import AutoTokenizer, UMT5EncoderModel
13
+ from jaxtyping import Float
14
+ import gradio as gr
15
+
16
+ def get_sigmas(scheduler, timesteps, dtype=torch.float32, device="cuda"):
17
+ sigmas = scheduler.sigmas.to(device=device, dtype=dtype)
18
+ schedule_timesteps = scheduler.timesteps.to(device)
19
+ timesteps = timesteps.to(device)
20
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
21
+
22
+ sigma = sigmas[step_indices].flatten()
23
+ return sigma
24
+
25
+ class WanT2TexPipeline(WanPipeline):
26
+ def __init__(self, tokenizer, text_encoder, transformer, vae, scheduler):
27
+ super().__init__(tokenizer, text_encoder, transformer, vae, scheduler)
28
+ self.uv_scheduler = copy.deepcopy(scheduler)
29
+
30
+ def prepare_latents(
31
+ self,
32
+ batch_size: int,
33
+ num_channels_latents: int = 16,
34
+ height: int = 480,
35
+ width: int = 832,
36
+ num_frames: int = 81,
37
+ dtype: Optional[torch.dtype] = None,
38
+ device: Optional[torch.device] = None,
39
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
40
+ latents: Optional[torch.Tensor] = None,
41
+ treat_as_first: Optional[bool] = True,
42
+ ) -> torch.Tensor:
43
+ if latents is not None:
44
+ return latents.to(device=device, dtype=dtype)
45
+
46
+ ####################
47
+ if treat_as_first:
48
+ num_latent_frames = num_frames // self.vae_scale_factor_temporal
49
+ else:
50
+ num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
51
+ ####################
52
+
53
+ shape = (
54
+ batch_size,
55
+ num_channels_latents,
56
+ num_latent_frames,
57
+ int(height) // self.vae_scale_factor_spatial,
58
+ int(width) // self.vae_scale_factor_spatial,
59
+ )
60
+ if isinstance(generator, list) and len(generator) != batch_size:
61
+ raise ValueError(
62
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
63
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
64
+ )
65
+
66
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
67
+ return latents
68
+
69
+ @torch.no_grad()
70
+ def __call__(
71
+ self,
72
+ prompt: Union[str, List[str]] = None,
73
+ negative_prompt: Union[str, List[str]] = None,
74
+ height: int = 480,
75
+ width: int = 832,
76
+ num_frames: int = 81,
77
+ num_inference_steps: int = 50,
78
+ guidance_scale: float = 5.0,
79
+ num_videos_per_prompt: Optional[int] = 1,
80
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
81
+ latents: Optional[torch.Tensor] = None,
82
+ prompt_embeds: Optional[torch.Tensor] = None,
83
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
84
+ output_type: Optional[str] = "np",
85
+ return_dict: bool = True,
86
+ attention_kwargs: Optional[Dict[str, Any]] = None,
87
+ callback_on_step_end: Optional[
88
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
89
+ ] = None,
90
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
91
+ max_sequence_length: int = 512,
92
+ cond_model_latents: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
93
+ uv_height=None,
94
+ uv_width=None,
95
+ uv_num_frames=None,
96
+ # multi_task_cond=None,
97
+ treat_as_first=True,
98
+ gt_condition:Tuple[Optional[Float[Tensor, "B C F H W"]], Optional[Float[Tensor, "B C F H W"]]]=None,
99
+ inference_img_cond_frame=None,
100
+ use_qk_geometry=False,
101
+ task_type="all",
102
+ progress=gr.Progress()
103
+ ):
104
+ r"""
105
+ The call function to the pipeline for generation.
106
+
107
+ Args:
108
+ prompt (`str` or `List[str]`, *optional*):
109
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
110
+ instead.
111
+ height (`int`, defaults to `480`):
112
+ The height in pixels of the generated image.
113
+ width (`int`, defaults to `832`):
114
+ The width in pixels of the generated image.
115
+ num_frames (`int`, defaults to `81`):
116
+ The number of frames in the generated video.
117
+ num_inference_steps (`int`, defaults to `50`):
118
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
119
+ expense of slower inference.
120
+ guidance_scale (`float`, defaults to `5.0`):
121
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
122
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
123
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
124
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
125
+ usually at the expense of lower image quality.
126
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
127
+ The number of images to generate per prompt.
128
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
129
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
130
+ generation deterministic.
131
+ latents (`torch.Tensor`, *optional*):
132
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
133
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
134
+ tensor is generated by sampling using the supplied random `generator`.
135
+ prompt_embeds (`torch.Tensor`, *optional*):
136
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
137
+ provided, text embeddings are generated from the `prompt` input argument.
138
+ output_type (`str`, *optional*, defaults to `"pil"`):
139
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
140
+ return_dict (`bool`, *optional*, defaults to `True`):
141
+ Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
142
+ attention_kwargs (`dict`, *optional*):
143
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
144
+ `self.processor` in
145
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
146
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
147
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
148
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
149
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
150
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
151
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
152
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
153
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
154
+ `._callback_tensor_inputs` attribute of your pipeline class.
155
+ autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
156
+ The dtype to use for the torch.amp.autocast.
157
+
158
+ Examples:
159
+
160
+ Returns:
161
+ [`~WanPipelineOutput`] or `tuple`:
162
+ If `return_dict` is `True`, [`WanPipelineOutput`] is returned, otherwise a `tuple` is returned where
163
+ the first element is a list with the generated images and the second element is a list of `bool`s
164
+ indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
165
+ """
166
+
167
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
168
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
169
+
170
+ # 1. Check inputs. Raise error if not correct
171
+ self.check_inputs(
172
+ prompt,
173
+ negative_prompt,
174
+ height,
175
+ width,
176
+ prompt_embeds,
177
+ negative_prompt_embeds,
178
+ callback_on_step_end_tensor_inputs,
179
+ )
180
+
181
+ # ATTENTION: My inputs are images, so the num_frames is 5, without time dimension compression.
182
+ # if num_frames % self.vae_scale_factor_temporal != 1:
183
+ # raise ValueError(
184
+ # f"num_frames should be divisible by {self.vae_scale_factor_temporal} + 1, but got {num_frames}."
185
+ # )
186
+ # num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
187
+ # num_frames = max(num_frames, 1)
188
+
189
+ self._guidance_scale = guidance_scale
190
+ self._attention_kwargs = attention_kwargs
191
+ self._current_timestep = None
192
+ self._interrupt = False
193
+
194
+ device = self._execution_device
195
+
196
+ # 2. Define call parameters
197
+ if prompt is not None and isinstance(prompt, str):
198
+ batch_size = 1
199
+ elif prompt is not None and isinstance(prompt, list):
200
+ batch_size = len(prompt)
201
+ else:
202
+ batch_size = prompt_embeds.shape[0]
203
+
204
+ # 3. Encode input prompt
205
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
206
+ prompt=prompt,
207
+ negative_prompt=negative_prompt,
208
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
209
+ num_videos_per_prompt=num_videos_per_prompt,
210
+ prompt_embeds=prompt_embeds,
211
+ negative_prompt_embeds=negative_prompt_embeds,
212
+ max_sequence_length=max_sequence_length,
213
+ device=device,
214
+ )
215
+
216
+ transformer_dtype = self.transformer.dtype
217
+ prompt_embeds = prompt_embeds.to(transformer_dtype)
218
+ if self.do_classifier_free_guidance:
219
+ if negative_prompt_embeds is not None:
220
+ negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
221
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
222
+
223
+ # 4. Prepare timesteps
224
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
225
+ self.uv_scheduler.set_timesteps(num_inference_steps, device=device)
226
+ timesteps = self.scheduler.timesteps
227
+
228
+ # 5. Prepare latent variables
229
+ num_channels_latents = self.transformer.config.in_channels
230
+ mv_latents = self.prepare_latents(
231
+ batch_size * num_videos_per_prompt,
232
+ num_channels_latents,
233
+ height,
234
+ width,
235
+ num_frames,
236
+ torch.float32,
237
+ device,
238
+ generator,
239
+ treat_as_first=treat_as_first,
240
+ )
241
+ uv_latents = self.prepare_latents(
242
+ batch_size * num_videos_per_prompt,
243
+ num_channels_latents,
244
+ uv_height,
245
+ uv_width,
246
+ uv_num_frames,
247
+ torch.float32,
248
+ device,
249
+ generator,
250
+ treat_as_first=True # UV latents are always different from the others, so treat as the first frame
251
+ )
252
+
253
+ # 6. Denoising loop
254
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
255
+ self._num_timesteps = len(timesteps)
256
+
257
+ # with progress.tqdm(total=num_inference_steps, desc="Diffusing...") as progress_bar:
258
+ for i, t in progress.tqdm(enumerate(timesteps), desc="Diffusing..."):
259
+ if self.interrupt:
260
+ continue
261
+
262
+ # set conditions
263
+ timestep_df = torch.ones((batch_size, num_frames // self.vae_scale_factor_temporal + 1)).to(device) * t
264
+ sigmas = get_sigmas(self.scheduler, rearrange(timestep_df, "B F -> (B F)"), dtype=transformer_dtype, device=device)
265
+ sigmas = rearrange(sigmas, "(B F) -> B 1 F 1 1", B=batch_size)
266
+ match task_type:
267
+ case "geo+mv2tex":
268
+ timestep_df[:, :num_frames // self.vae_scale_factor_temporal] = self.min_noise_level_timestep
269
+ sigmas[:, :, :num_frames // self.vae_scale_factor_temporal, ...] = self.min_noise_level_sigma
270
+ mv_noise = torch.randn_like(mv_latents) # B C 4 H W
271
+ mv_latents = (1.0 - sigmas[:, :, :-1, ...]) * gt_condition[0] + sigmas[:, :, :-1, ...] * mv_noise
272
+ case "img2tex":
273
+ assert inference_img_cond_frame is not None, "inference_img_cond_frame should be specified for img2tex task"
274
+ # Use specified frame index as condition instead of just first frame
275
+ timestep_df[:, inference_img_cond_frame: inference_img_cond_frame + 1] = self.min_noise_level_timestep
276
+ sigmas[:, :, inference_img_cond_frame: inference_img_cond_frame + 1, ...] = self.min_noise_level_sigma
277
+ mv_noise = randn_tensor(mv_latents[:, :, inference_img_cond_frame: inference_img_cond_frame + 1].shape, generator=generator, device=device, dtype=self.dtype)
278
+ # mv_noise = torch.randn_like(mv_latents[:, :, inference_img_cond_frame: inference_img_cond_frame + 1], generator=generator) # B C selected_frames H W
279
+ mv_latents[:, :, inference_img_cond_frame: inference_img_cond_frame + 1, ...] = (1.0 - sigmas[:, :, inference_img_cond_frame: inference_img_cond_frame + 1, ...]) * gt_condition[0] + sigmas[:, :, inference_img_cond_frame: inference_img_cond_frame + 1, ...] * mv_noise
280
+ case "soft_render":
281
+ timestep_df[:, -1:] = self.min_noise_level_timestep
282
+ sigmas[:, :, -1:, ...] = self.min_noise_level_sigma
283
+ uv_noise = torch.randn_like(uv_latents) # B C 1 H W
284
+ uv_latents = (1.0 - sigmas[:, :, -1:, ...]) * gt_condition[1] + sigmas[:, :, -1:, ...] * uv_noise
285
+ case "geo2mv":
286
+ timestep_df[:, -1:] = 1000.
287
+ sigmas[:, :, -1:, ...] = 1.
288
+ case _:
289
+ pass
290
+
291
+ # add geometry information to channel C
292
+ mv_latents_input = torch.cat([mv_latents, cond_model_latents[0]], dim=1)
293
+ uv_latents_input = torch.cat([uv_latents, cond_model_latents[1]], dim=1)
294
+ if self.do_classifier_free_guidance:
295
+ mv_latents_input = torch.cat([mv_latents_input, mv_latents_input], dim=0)
296
+ uv_latents_input = torch.cat([uv_latents_input, uv_latents_input], dim=0)
297
+
298
+ self._current_timestep = t
299
+ latent_model_input = (mv_latents_input.to(transformer_dtype), uv_latents_input.to(transformer_dtype))
300
+ # timestep = t.expand(mv_latents.shape[0])
301
+
302
+ noise_out = self.transformer(
303
+ hidden_states=latent_model_input,
304
+ timestep=timestep_df,
305
+ encoder_hidden_states=prompt_embeds,
306
+ attention_kwargs=attention_kwargs,
307
+ # task_cond=multi_task_cond,
308
+ return_dict=False,
309
+ use_qk_geometry=use_qk_geometry
310
+ )[0]
311
+ mv_noise_out, uv_noise_out = noise_out
312
+
313
+ if self.do_classifier_free_guidance:
314
+ mv_noise_uncond, mv_noise_pred = mv_noise_out.chunk(2)
315
+ uv_noise_uncond, uv_noise_pred = uv_noise_out.chunk(2)
316
+ mv_noise_pred = mv_noise_uncond + guidance_scale * (mv_noise_pred - mv_noise_uncond)
317
+ uv_noise_pred = uv_noise_uncond + guidance_scale * (uv_noise_pred - uv_noise_uncond)
318
+ else:
319
+ mv_noise_pred = mv_noise_out
320
+ uv_noise_pred = uv_noise_out
321
+
322
+ # compute the previous noisy sample x_t -> x_t-1
323
+ # The conditions will be replaced anyway, so perhaps we don't need to step frames seperately
324
+ mv_latents = self.scheduler.step(mv_noise_pred, t, mv_latents, return_dict=False)[0]
325
+ uv_latents = self.uv_scheduler.step(uv_noise_pred, t, uv_latents, return_dict=False)[0]
326
+
327
+ if callback_on_step_end is not None:
328
+ raise NotImplementedError()
329
+ callback_kwargs = {}
330
+ for k in callback_on_step_end_tensor_inputs:
331
+ callback_kwargs[k] = locals()[k]
332
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
333
+
334
+ latents = callback_outputs.pop("latents", latents)
335
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
336
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
337
+
338
+ # # call the callback, if provided
339
+ # if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
340
+ # progress_bar.update()
341
+
342
+ self._current_timestep = None
343
+
344
+ if not output_type == "latent":
345
+ latents = latents.to(self.vae.dtype)
346
+ latents_mean = (
347
+ torch.tensor(self.vae.config.latents_mean)
348
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
349
+ .to(latents.device, latents.dtype)
350
+ )
351
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
352
+ latents.device, latents.dtype
353
+ )
354
+ latents = latents / latents_std + latents_mean
355
+ video = self.vae.decode(latents, return_dict=False)[0]
356
+ # video = self.video_processor.postprocess_video(video, output_type=output_type)
357
+ else:
358
+ video = (mv_latents, uv_latents)
359
+
360
+ # Offload all models
361
+ self.maybe_free_model_hooks()
362
+
363
+ if not return_dict:
364
+ return (video,)
365
+
366
+ return WanPipelineOutput(frames=video)
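The condition frames in the denoising loop above are re-noised to a fixed, small sigma at every step instead of following the sampling schedule, which keeps them close to the ground-truth latents while staying in-distribution for the model; a stand-alone illustration of that flow-matching interpolation (the shapes and the 0.015 value are placeholders, not the pipeline's exact numbers):

import torch

sigma_min = 0.015                             # roughly min_noise_level_index / num_train_timesteps
x_gt = torch.randn(1, 16, 1, 64, 64)          # clean condition latent, B C F H W (made-up shape)
noise = torch.randn_like(x_gt)

# mostly the clean latent, plus a small amount of noise
x_cond = (1.0 - sigma_min) * x_gt + sigma_min * noise
print(x_cond.shape)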
wan/wan_t2tex_transformer_3d_extra.py ADDED
@@ -0,0 +1,634 @@
1
+ # Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import copy
16
+ import math
17
+ from typing import Any, Dict, Optional, Tuple, Union
18
+ from functools import cache
19
+
20
+ from einops import rearrange, repeat
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
25
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
26
+ from diffusers.models import WanTransformer3DModel
27
+ from diffusers.models.attention import FeedForward
28
+ from diffusers.models.attention_processor import Attention
29
+ from diffusers.models.cache_utils import CacheMixin
30
+ from diffusers.models.embeddings import (PixArtAlphaTextProjection,
31
+ TimestepEmbedding, Timesteps,
32
+ get_1d_rotary_pos_embed)
33
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
34
+ from diffusers.models.modeling_utils import ModelMixin
35
+ from diffusers.models.normalization import FP32LayerNorm
36
+ from diffusers.models.transformers.transformer_wan import \
37
+ WanTimeTextImageEmbedding
38
+ from diffusers.utils import (USE_PEFT_BACKEND, logging, scale_lora_layers,
39
+ unscale_lora_layers)
40
+
41
+
42
+ class WanT2TexAttnProcessor2_0:
43
+ def __init__(self):
44
+ if not hasattr(F, "scaled_dot_product_attention"):
45
+ raise ImportError("WanT2TexAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.")
46
+
47
+ def __call__(
48
+ self,
49
+ attn: Attention,
50
+ hidden_states: torch.Tensor,
51
+ encoder_hidden_states: Optional[torch.Tensor] = None,
52
+ attention_mask: Optional[torch.Tensor] = None,
53
+ rotary_emb: Optional[torch.Tensor] = None,
54
+ geometry_embedding: Optional[torch.Tensor] = None,
55
+ ) -> torch.Tensor:
56
+ encoder_hidden_states_img = None
57
+ if attn.add_k_proj is not None:
58
+ encoder_hidden_states_img = encoder_hidden_states[:, :257]
59
+ encoder_hidden_states = encoder_hidden_states[:, 257:]
60
+ if encoder_hidden_states is None:
61
+ encoder_hidden_states = hidden_states
62
+
63
+ query = attn.to_q(hidden_states)
64
+ key = attn.to_k(encoder_hidden_states)
65
+ value = attn.to_v(encoder_hidden_states)
66
+
67
+ if attn.norm_q is not None:
68
+ query = attn.norm_q(query)
69
+ if attn.norm_k is not None:
70
+ key = attn.norm_k(key)
71
+
72
+ if geometry_embedding is not None:
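+ # Fuse geometry features into Q/K before RoPE. `if True:` hard-selects the additive variant; the multiplicative branch below is kept for reference but never executed.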
73
+ # add-type geometry embedding
74
+ if True:
75
+ if isinstance(geometry_embedding, tuple):
76
+ query = query + geometry_embedding[0]
77
+ key = key + geometry_embedding[1]
78
+ else:
79
+ query = query + geometry_embedding
80
+ key = key + geometry_embedding
81
+ else:
82
+ # mul-type geometry embedding
83
+ if isinstance(geometry_embedding, tuple):
84
+ query = query * (1 + geometry_embedding[0])
85
+ key = key * (1 + geometry_embedding[1])
86
+ else:
87
+ query = query * (1 + geometry_embedding)
88
+ key = key * (1 + geometry_embedding)
89
+
90
+ query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2) # [B, seq, C] -> [B, heads, seq, C // heads]
91
+ key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
92
+ value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
93
+
94
+ if rotary_emb is not None:
95
+
96
+ def apply_rotary_emb(hidden_states: torch.Tensor, freqs: torch.Tensor):
97
+ x_rotated = torch.view_as_complex(hidden_states.to(torch.float64).unflatten(3, (-1, 2)))
98
+ x_out = torch.view_as_real(x_rotated * freqs).flatten(3, 4)
99
+ return x_out.type_as(hidden_states)
100
+
101
+ if isinstance(rotary_emb, tuple):
102
+ query = apply_rotary_emb(query, rotary_emb[0])
103
+ key = apply_rotary_emb(key, rotary_emb[1])
104
+ else:
105
+ query = apply_rotary_emb(query, rotary_emb)
106
+ key = apply_rotary_emb(key, rotary_emb)
107
+
108
+ # I2V task
109
+ hidden_states_img = None
110
+ if encoder_hidden_states_img is not None:
111
+ key_img = attn.add_k_proj(encoder_hidden_states_img)
112
+ key_img = attn.norm_added_k(key_img)
113
+ value_img = attn.add_v_proj(encoder_hidden_states_img)
114
+
115
+ key_img = key_img.unflatten(2, (attn.heads, -1)).transpose(1, 2)
116
+ value_img = value_img.unflatten(2, (attn.heads, -1)).transpose(1, 2)
117
+
118
+ hidden_states_img = F.scaled_dot_product_attention(
119
+ query, key_img, value_img, attn_mask=None, dropout_p=0.0, is_causal=False
120
+ )
121
+ hidden_states_img = hidden_states_img.transpose(1, 2).flatten(2, 3)
122
+ hidden_states_img = hidden_states_img.type_as(query)
123
+
124
+ hidden_states = F.scaled_dot_product_attention(
125
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
126
+ )
127
+ hidden_states = hidden_states.transpose(1, 2).flatten(2, 3)
128
+ hidden_states = hidden_states.type_as(query)
129
+
130
+ if hidden_states_img is not None:
131
+ hidden_states = hidden_states + hidden_states_img
132
+
133
+ hidden_states = attn.to_out[0](hidden_states)
134
+ hidden_states = attn.to_out[1](hidden_states)
135
+ return hidden_states
136
+
137
+
138
+ class WanTimeTaskTextImageEmbedding(WanTimeTextImageEmbedding):
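+ """Wan time/text/image embedder that takes a per-frame timestep of shape [B, F] (diffusion forcing) and, unless `randomly_init` is set, copies the weights of the original embedder."""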
139
+ def __init__(
140
+ self,
141
+ original_model,
142
+ dim: int,
143
+ time_freq_dim: int,
144
+ time_proj_dim: int,
145
+ text_embed_dim: int,
146
+ image_embed_dim: Optional[int] = None,
147
+ randomly_init: bool = False,
148
+ ):
149
+ super(WanTimeTaskTextImageEmbedding, self).__init__(dim, time_freq_dim, time_proj_dim, text_embed_dim, image_embed_dim)
150
+ if not randomly_init:
151
+ self.load_state_dict(original_model.state_dict(), strict=True)
152
+ # cond_proj = nn.Linear(512, original_model.timesteps_proj.num_channels, bias=False)
153
+ # setattr(self.time_embedder, "cond_proj", cond_proj)
154
+
155
+ def forward(
156
+ self,
157
+ timestep: torch.Tensor,
158
+ encoder_hidden_states: torch.Tensor,
159
+ encoder_hidden_states_image: Optional[torch.Tensor] = None,
160
+ # time_cond: Optional[torch.Tensor] = None,
161
+ ):
162
+ B = timestep.shape[0]
163
+ timestep = rearrange(timestep, "B F -> (B F)")
164
+ timestep = self.timesteps_proj(timestep)
165
+ timestep = rearrange(timestep, "(B F) D -> B F D", B=B)
166
+
167
+ time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
168
+ if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
169
+ timestep = timestep.to(time_embedder_dtype)
170
+ temb = self.time_embedder(timestep).type_as(encoder_hidden_states)
171
+ timestep_proj = self.time_proj(self.act_fn(temb))
172
+
173
+ encoder_hidden_states = self.text_embedder(encoder_hidden_states)
174
+ if encoder_hidden_states_image is not None:
175
+ encoder_hidden_states_image = self.image_embedder(encoder_hidden_states_image)
176
+
177
+ return temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image
178
+
179
+
180
+ class WanRotaryPosEmbed(nn.Module):
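+ """3D RoPE for the concatenated [MV, UV] token sequence: UV frames continue the temporal axis after the MV frames, while the spatial (height / width) frequencies are shared between the two branches."""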
181
+ def __init__(
182
+ self, attention_head_dim: int, patch_size: Tuple[int, int, int], max_seq_len: int, theta: float = 10000.0, addtional_qk_geo: bool = False
183
+ ):
184
+ super().__init__()
185
+
186
+ if addtional_qk_geo: # to add PE to geometry embedding
187
+ attention_head_dim = attention_head_dim * 2
188
+ self.attention_head_dim = attention_head_dim
189
+ self.patch_size = patch_size
190
+ self.max_seq_len = max_seq_len
191
+
192
+ h_dim = w_dim = 2 * (attention_head_dim // 6)
193
+ t_dim = attention_head_dim - h_dim - w_dim
194
+
195
+ freqs = []
196
+ for dim in [t_dim, h_dim, w_dim]:
197
+ freq = get_1d_rotary_pos_embed(
198
+ dim, max_seq_len, theta, use_real=False, repeat_interleave_real=False, freqs_dtype=torch.float64
199
+ )
200
+ freqs.append(freq)
201
+ self.freqs = torch.cat(freqs, dim=1)
202
+
203
+ def forward(self, hidden_states: torch.Tensor, uv_hidden_states: torch.Tensor) -> torch.Tensor:
204
+ batch_size, num_channels, num_frames, height, width = hidden_states.shape
205
+ _, _, uv_num_frames, uv_height, uv_width = uv_hidden_states.shape
206
+ p_t, p_h, p_w = self.patch_size
207
+ ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
208
+ uppf, upph, uppw = uv_num_frames // p_t, uv_height // p_h, uv_width // p_w
209
+
210
+ self.freqs = self.freqs.to(hidden_states.device)
211
+ freqs = self.freqs.split_with_sizes(
212
+ [
213
+ self.attention_head_dim // 2 - 2 * (self.attention_head_dim // 6),
214
+ self.attention_head_dim // 6,
215
+ self.attention_head_dim // 6,
216
+ ],
217
+ dim=1,
218
+ )
219
+
220
+ freqs_f = freqs[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
221
+ freqs_h = freqs[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
222
+ freqs_w = freqs[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
223
+
224
+ uv_freqs_f = freqs[0][ppf:ppf+uppf].view(uppf, 1, 1, -1).expand(uppf, upph, uppw, -1)
225
+ uv_freqs_h = freqs[1][:upph].view(1, upph, 1, -1).expand(uppf, upph, uppw, -1)
226
+ uv_freqs_w = freqs[2][:uppw].view(1, 1, uppw, -1).expand(uppf, upph, uppw, -1)
227
+ freqs = torch.cat([freqs_f, freqs_h, freqs_w], dim=-1).reshape(1, 1, ppf * pph * ppw, -1)
228
+ uv_freqs = torch.cat([uv_freqs_f, uv_freqs_h, uv_freqs_w], dim=-1).reshape(1, 1, uppf * upph * uppw, -1)
229
+ return torch.cat([freqs, uv_freqs], dim=-2)
230
+
231
+ # def pseudo_code(freqs, mv_tokens_shape, uv_tokens_shape, dimension):
232
+ # """
233
+ # Input:
234
+ # freqs: [S, D/2], S is the number of tokens, D is the dimension of tokens, 2 indicates Cos and Sin in original RoPE.
235
+ # mv_tokens_shape: (mv_num_frames, mv_height, mv_width)
236
+ # uv_tokens_shape: (uv_num_frames, uv_height, uv_width)
237
+ # dimension: the dimension of tokens
238
+ # Output: the 3D RoPE for the concatenated [MV, UV] token sequence, shape [S_mv + S_uv, D/2]
239
+ # """
240
+ # mpf, mph, mpw = mv_tokens_shape # mv_num_frames, mv_height, mv_width
241
+ # upf, uph, upw = uv_tokens_shape # uv_num_frames, uv_height, uv_width
242
+
243
+ # # 1. To evenly split the freqs into 3 parts
244
+ # freqs = freqs.split_with_sizes(
245
+ # [
246
+ # dimension // 2 - 2 * (dimension // 6),
247
+ # dimension // 6,
248
+ # dimension // 6,
249
+ # ],
250
+ # dim=1,
251
+ # )
252
+
253
+ # # 2. In time dimension, the freqs for UV are subsequent to the freqs for MV
254
+ # freqs_f = freqs[0][:mpf].view(mpf, 1, 1, -1).expand(mpf, mph, mpw, -1)
255
+ # uv_freqs_f = freqs[0][mpf:mpf+upf].view(upf, 1, 1, -1).expand(upf, uph, upw, -1)
256
+
257
+ # # 3. The freqs in height and width dimension are the same for mv and uv
258
+ # freqs_h = freqs[1][:mph].view(1, mph, 1, -1).expand(mpf, mph, mpw, -1)
259
+ # uv_freqs_h = freqs[1][:uph].view(1, uph, 1, -1).expand(upf, uph, upw, -1)
260
+ # freqs_w = freqs[2][:mpw].view(1, 1, mpw, -1).expand(mpf, mph, mpw, -1)
261
+ # uv_freqs_w = freqs[2][:upw].view(1, 1, upw, -1).expand(upf, uph, upw, -1)
262
+
263
+ # # 4. rearrange three 1D RoPEs into 3D RoPE in channel dimension
264
+ # mv_rope = torch.cat([freqs_f, freqs_h, freqs_w], dim=-1).reshape(mpf * mph * mpw, -1)
265
+ # uv_rope = torch.cat([uv_freqs_f, uv_freqs_h, uv_freqs_w], dim=-1).reshape(upf * uph * upw, -1)
266
+ # return torch.cat([mv_rope, uv_rope], dim=-2)
267
+
268
+ class WanT2TexTransformerBlock(nn.Module):
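+ """Wan transformer block with a parallel UV branch: MV tokens use the original self-attention (attn1), UV tokens attend over the concatenated [MV, UV] tokens (attnuv), both share the text cross-attention (attn2), and each branch has its own FFN and modulation table."""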
269
+ def __init__(
270
+ self,
271
+ dim: int,
272
+ ffn_dim: int,
273
+ num_heads: int,
274
+ qk_norm: str = "rms_norm_across_heads",
275
+ cross_attn_norm: bool = False,
276
+ eps: float = 1e-6,
277
+ added_kv_proj_dim: Optional[int] = None,
278
+ addtional_qk_geo: bool = False,
279
+ ):
280
+ super().__init__()
281
+
282
+ # 1. Self-attention
283
+ self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
284
+ self.attn1 = Attention(
285
+ query_dim=dim,
286
+ heads=num_heads,
287
+ kv_heads=num_heads,
288
+ dim_head=dim // num_heads,
289
+ qk_norm=qk_norm,
290
+ eps=eps,
291
+ bias=True,
292
+ cross_attention_dim=None,
293
+ out_bias=True,
294
+ processor=WanT2TexAttnProcessor2_0(),
295
+ )
296
+
297
+ # 2. Cross-attention
298
+ self.attn2 = Attention(
299
+ query_dim=dim,
300
+ heads=num_heads,
301
+ kv_heads=num_heads,
302
+ dim_head=dim // num_heads,
303
+ qk_norm=qk_norm,
304
+ eps=eps,
305
+ bias=True,
306
+ cross_attention_dim=None,
307
+ out_bias=True,
308
+ added_kv_proj_dim=added_kv_proj_dim,
309
+ added_proj_bias=True,
310
+ processor=WanT2TexAttnProcessor2_0(),
311
+ )
312
+ self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
313
+
314
+ # 3. Feed-forward
315
+ self.ffn = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
316
+ self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)
317
+
318
+ self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
319
+
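+ # Zero-initialised projection of the geometry embedding, so geometry conditioning starts as a no-op.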
320
+ self.geometry_caster = nn.Linear(dim, dim)
321
+ nn.init.zeros_(self.geometry_caster.weight.data)
322
+ nn.init.zeros_(self.geometry_caster.bias.data)
323
+
324
+ self.attnuv = Attention(
325
+ query_dim=dim,
326
+ heads=num_heads,
327
+ kv_heads=num_heads,
328
+ dim_head=dim // num_heads,
329
+ qk_norm=qk_norm,
330
+ eps=eps,
331
+ bias=True,
332
+ cross_attention_dim=None,
333
+ out_bias=True,
334
+ processor=WanT2TexAttnProcessor2_0(),
335
+ )
336
+ self.normuv2 = FP32LayerNorm(dim, eps, elementwise_affine=True)
337
+ self.scale_shift_table_uv = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
338
+ self.ffnuv = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
339
+
340
+ def forward(
341
+ self,
342
+ hidden_states: torch.Tensor,
343
+ encoder_hidden_states: torch.Tensor,
344
+ temb: torch.Tensor,
345
+ rotary_emb: torch.Tensor,
346
+ attn_bias: Optional[torch.Tensor] = None,
347
+ geometry_embedding: Optional[torch.Tensor] = None,
348
+ token_shape: Optional[Tuple[int, int, int, int, int, int]] = None,
349
+ ) -> torch.Tensor:
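+ # token_shape carries the patchified (frames, height, width) sizes of the MV and UV branches; the joint
+ # sequence is split back into MV / UV tokens, each modulated by its own per-frame temb and scale/shift table.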
350
+ post_patch_num_frames, post_patch_height, post_patch_width, post_uv_num_frames, post_uv_height, post_uv_width = token_shape
351
+ mv_temb, uv_temb = temb[:, :post_patch_num_frames], temb[:, post_patch_num_frames:]
352
+ mv_temb = repeat(mv_temb, "B F N D -> B N (F H W) D", H=post_patch_height, W=post_patch_width)
353
+ uv_temb = repeat(uv_temb, "B F N D -> B N (F H W) D", H=post_uv_height, W=post_uv_width)
354
+ dit_ssg = rearrange(self.scale_shift_table, "1 N D -> 1 N 1 D") + mv_temb.float()
355
+ shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = torch.unbind(dit_ssg, dim=1)
356
+ dit_ssg_uv = rearrange(self.scale_shift_table_uv, "1 N D -> 1 N 1 D") + uv_temb.float()
357
+ shift_msa_uv, scale_msa_uv, gate_msa_uv, c_shift_msa_uv, c_scale_msa_uv, c_gate_msa_uv = torch.unbind(dit_ssg_uv, dim=1)
358
+
359
+ geometry_embedding = self.geometry_caster(geometry_embedding)
360
+
361
+ n_mv, n_uv = post_patch_num_frames * post_patch_height * post_patch_width, post_uv_num_frames * post_uv_height * post_uv_width
362
+ assert hidden_states.shape[1] == n_mv + n_uv, f"hidden_states shape {hidden_states.shape} is not equal to {n_mv + n_uv}"
363
+ mv_hidden_states, uv_hidden_states = hidden_states[:, :n_mv], hidden_states[:, n_mv:]
364
+
365
+ # 1. Self-attention
366
+ mv_norm_hidden_states = (self.norm1(mv_hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(mv_hidden_states)
367
+ uv_norm_hidden_states = (self.norm1(uv_hidden_states.float()) * (1 + scale_msa_uv) + shift_msa_uv).type_as(uv_hidden_states)
368
+
369
+ mv_attn_output = self.attn1(hidden_states=mv_norm_hidden_states, rotary_emb=rotary_emb[:, :, :n_mv], attention_mask=attn_bias, geometry_embedding=geometry_embedding[:, :n_mv])
370
+ mv_hidden_states = (mv_hidden_states.float() + mv_attn_output * gate_msa).type_as(mv_hidden_states)
371
+ uv_attn_output = self.attnuv(hidden_states=uv_norm_hidden_states, encoder_hidden_states=torch.cat([mv_hidden_states, uv_norm_hidden_states], dim=1),
372
+ rotary_emb=(rotary_emb[:, :, n_mv:], rotary_emb), geometry_embedding=(geometry_embedding[:, n_mv:], geometry_embedding))
373
+ uv_hidden_states = (uv_hidden_states.float() + uv_attn_output * gate_msa_uv).type_as(uv_hidden_states)
374
+
375
+ # 2. Cross-attention
376
+ mv_norm_hidden_states = self.norm2(mv_hidden_states.float()).type_as(mv_hidden_states)
377
+ uv_norm_hidden_states = self.normuv2(uv_hidden_states.float()).type_as(uv_hidden_states)
378
+ attn_output = self.attn2(hidden_states=torch.cat([mv_norm_hidden_states, uv_norm_hidden_states], dim=1), encoder_hidden_states=encoder_hidden_states)
379
+ mv_attn_output, uv_attn_output = attn_output[:, :n_mv], attn_output[:, n_mv:]
380
+ mv_hidden_states.add_(mv_attn_output)
381
+ uv_hidden_states.add_(uv_attn_output)
382
+
383
+ # 3. Feed-forward
384
+ mv_norm_hidden_states = (self.norm3(mv_hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as(
385
+ mv_hidden_states
386
+ )
387
+ uv_norm_hidden_states = (self.norm3(uv_hidden_states.float()) * (1 + c_scale_msa_uv) + c_shift_msa_uv).type_as(
388
+ uv_hidden_states
389
+ )
390
+ ff_output = self.ffn(mv_norm_hidden_states)
391
+ mv_hidden_states = (mv_hidden_states.float() + ff_output.float() * c_gate_msa).type_as(mv_hidden_states)
392
+ ff_output_uv = self.ffnuv(uv_norm_hidden_states)
393
+ uv_hidden_states = (uv_hidden_states.float() + ff_output_uv.float() * c_gate_msa_uv).type_as(uv_hidden_states)
394
+ hidden_states = torch.cat([mv_hidden_states, uv_hidden_states], dim=1)
395
+
396
+ return hidden_states
397
+
398
+
399
+ class WanT2TexTransformer3DModel(WanTransformer3DModel):
400
+ """
401
+ 3D Transformer model for texture generation (T2Tex). It extends WanTransformer3DModel to jointly denoise multi-view (MV) frames and a UV texture map, conditioning attention on geometry (position / normal) embeddings.
402
+ """
403
+ def __init__(self, original_model, use_causal_mask=False, addtional_qk_geo=False, randomly_init=False, **kwargs):
404
+ super(WanT2TexTransformer3DModel, self).__init__(**original_model.config)
405
+ if not randomly_init:
406
+ self.load_state_dict(original_model.state_dict(), strict=True)
407
+ self.addtional_qk_geo = addtional_qk_geo
408
+ if addtional_qk_geo:
409
+ # Disabled: adding positional encoding to the geometry embedding drastically increases memory usage and slows down training without a significant performance gain.
410
+ raise ValueError("addtional_qk_geo is not supported.")
411
+
412
+ # 1. Patch & position embedding
413
+ self.rope = WanRotaryPosEmbed(self.rope.attention_head_dim, self.rope.patch_size, self.rope.max_seq_len, addtional_qk_geo=addtional_qk_geo)
414
+ self.use_normal, self.use_position = kwargs.get("use_normal", True), kwargs.get("use_position", True)
415
+ if self.use_normal:
416
+ self.norm_patch_embedding = copy.deepcopy(self.patch_embedding)
417
+ # torch.nn.init.zeros_(self.norm_patch_embedding.weight.data)
418
+ # torch.nn.init.zeros_(self.norm_patch_embedding.bias.data)
419
+ if self.use_position:
420
+ self.pos_patch_embedding = copy.deepcopy(self.patch_embedding)
421
+ # torch.nn.init.zeros_(self.pos_patch_embedding.weight.data)
422
+ # torch.nn.init.zeros_(self.pos_patch_embedding.bias.data)
423
+
424
+ # 2. Condition embeddings
425
+ inner_dim = original_model.config.num_attention_heads * original_model.config.attention_head_dim
426
+ self.condition_embedder = WanTimeTaskTextImageEmbedding(
427
+ original_model=self.condition_embedder,
428
+ dim=inner_dim,
429
+ time_freq_dim=original_model.config.freq_dim,
430
+ time_proj_dim=inner_dim * 6,
431
+ text_embed_dim=original_model.config.text_dim,
432
+ image_embed_dim=original_model.config.image_dim,
433
+ randomly_init=randomly_init,
434
+ )
435
+
436
+ # 3. Transformer blocks
437
+ self.use_causal_mask = use_causal_mask
438
+ self.num_attention_heads = original_model.config.num_attention_heads
439
+
440
+ block = WanT2TexTransformerBlock(
441
+ inner_dim,
442
+ original_model.config.ffn_dim,
443
+ original_model.config.num_attention_heads,
444
+ original_model.config.qk_norm,
445
+ original_model.config.cross_attn_norm,
446
+ original_model.config.eps,
447
+ original_model.config.added_kv_proj_dim,
448
+ )
449
+ # Rebuild the transformer blocks with the T2Tex block variant defined above.
450
+ self.blocks = nn.ModuleList(
451
+ [
452
+ copy.deepcopy(block)
453
+ for _ in range(original_model.config.num_layers)
454
+ ]
455
+ )
456
+ self.scale_shift_table_uv = nn.Parameter(torch.randn(1, 2, inner_dim) / inner_dim**0.5)
457
+ if not randomly_init:
458
+ self.scale_shift_table_uv.data.copy_(self.scale_shift_table.data)
459
+ self.blocks.load_state_dict(original_model.blocks.state_dict(), strict=False)
460
+ for block in self.blocks:
461
+ block.attnuv.load_state_dict(block.attn1.state_dict())
462
+ block.scale_shift_table_uv.data.copy_(block.scale_shift_table.data)
463
+ block.normuv2.load_state_dict(block.norm2.state_dict())
464
+ block.ffnuv.load_state_dict(block.ffn.state_dict())
465
+
466
+ # 4. Output norm & projection are reused unchanged from the base WanTransformer3DModel
467
+ # (norm_out, proj_out and scale_shift_table are created in super().__init__).
468
+
469
+ @cache
470
+ def get_attention_bias(self, mv_length, uv_length):
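+ # Additive attention bias that blocks MV tokens from attending to UV tokens (UV tokens may attend everywhere).
+ # Cached per (mv_length, uv_length); only used by the commented-out causal-mask path in forward, and placed on "cuda" unconditionally.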
471
+ total_len = mv_length + uv_length
472
+ attention_mask = torch.ones((total_len, total_len), dtype=torch.bool)
473
+ uv_start = mv_length
474
+ attention_mask[:uv_start, uv_start:] = False
475
+
476
+ attention_mask = repeat(attention_mask, "s l -> 1 h s l", h=self.num_attention_heads)
477
+ attention_bias = torch.zeros_like(attention_mask, dtype=torch.float32) # additive bias must be floating point for the -inf fill below
478
+ attention_bias.masked_fill_(attention_mask.logical_not(), float("-inf"))
479
+ attention_bias = attention_bias.to("cuda").contiguous()
480
+ return attention_bias
481
+
482
+ def forward(
483
+ self,
484
+ hidden_states: Tuple[torch.Tensor, torch.Tensor],
485
+ timestep: torch.LongTensor,
486
+ encoder_hidden_states: torch.Tensor,
487
+ encoder_hidden_states_image: Optional[torch.Tensor] = None,
488
+ # task_cond: Optional[torch.Tensor] = None,
489
+ return_dict: bool = True,
490
+ attention_kwargs: Optional[Dict[str, Any]] = None,
491
+ use_qk_geometry: Optional[bool] = False,
492
+ ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
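+ # `hidden_states` is a (mv_latents, uv_latents) pair of [B, C, F, H, W] tensors whose channel dim stacks the RGB
+ # latents with optional position / normal latents; `timestep` is per-frame with shape [B, F]. Returns ((mv_output, uv_output),).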
493
+ if attention_kwargs is not None:
494
+ attention_kwargs = attention_kwargs.copy()
495
+ lora_scale = attention_kwargs.pop("scale", 1.0)
496
+ else:
497
+ lora_scale = 1.0
498
+
499
+ if USE_PEFT_BACKEND:
500
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
501
+ scale_lora_layers(self, lora_scale)
502
+ else:
503
+ if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
504
+ raise NotImplementedError()
505
+
506
+ assert timestep.ndim == 2, "Use Diffusion Forcing to set a separate timestep for each frame."
507
+
508
+ mv_hidden_states, uv_hidden_states = hidden_states
509
+
510
+ batch_size, num_channels, num_frames, height, width = mv_hidden_states.shape
511
+ _, _, uv_num_frames, uv_height, uv_width = uv_hidden_states.shape
512
+
513
+ p_t, p_h, p_w = self.config.patch_size
514
+ post_patch_num_frames = num_frames // p_t
515
+ post_patch_height = height // p_h
516
+ post_patch_width = width // p_w
517
+ post_uv_num_frames = uv_num_frames // p_t
518
+ post_uv_height = uv_height // p_h
519
+ post_uv_width = uv_width // p_w
520
+
521
+ rotary_emb = self.rope(mv_hidden_states, uv_hidden_states)
522
+
523
+ # Patchify
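+ # Latent channels are chunked into RGB / position / normal groups; the geometry groups go through dedicated patch embeddings and are summed into a per-token geometry embedding.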
524
+ if self.use_normal and self.use_position:
525
+ mv_rgb_hidden_states, mv_pos_hidden_states, mv_norm_hidden_states = torch.chunk(mv_hidden_states, 3, dim=1)
526
+ uv_rgb_hidden_states, uv_pos_hidden_states, uv_norm_hidden_states = torch.chunk(uv_hidden_states, 3, dim=1)
527
+ mv_geometry_embedding = self.pos_patch_embedding(mv_pos_hidden_states) + self.norm_patch_embedding(mv_norm_hidden_states)
528
+ uv_geometry_embedding = self.pos_patch_embedding(uv_pos_hidden_states) + self.norm_patch_embedding(uv_norm_hidden_states)
529
+ elif self.use_normal:
530
+ mv_rgb_hidden_states, mv_norm_hidden_states = torch.chunk(mv_hidden_states, 2, dim=1)
531
+ uv_rgb_hidden_states, uv_norm_hidden_states = torch.chunk(uv_hidden_states, 2, dim=1)
532
+ mv_geometry_embedding = self.norm_patch_embedding(mv_norm_hidden_states)
533
+ uv_geometry_embedding = self.norm_patch_embedding(uv_norm_hidden_states)
534
+ elif self.use_position:
535
+ mv_rgb_hidden_states, mv_pos_hidden_states = torch.chunk(mv_hidden_states, 2, dim=1)
536
+ uv_rgb_hidden_states, uv_pos_hidden_states = torch.chunk(uv_hidden_states, 2, dim=1)
537
+ mv_geometry_embedding = self.pos_patch_embedding(mv_pos_hidden_states)
538
+ uv_geometry_embedding = self.pos_patch_embedding(uv_pos_hidden_states)
539
+ else:
540
+ raise ValueError("use_normal and use_position are both False, please set at least one of them to True.")
541
+
542
+ mv_hidden_states = self.patch_embedding(mv_rgb_hidden_states)
543
+ uv_hidden_states = self.patch_embedding(uv_rgb_hidden_states)
544
+ if use_qk_geometry:
545
+ mv_geometry_embedding = mv_geometry_embedding.flatten(2).transpose(1, 2)
546
+ uv_geometry_embedding = uv_geometry_embedding.flatten(2).transpose(1, 2) # [B, F*H*W, C]
547
+ geometry_embedding = torch.cat([mv_geometry_embedding, uv_geometry_embedding], dim=1)
548
+ else:
549
+ raise NotImplementedError("please set use_qk_geometry to True")
550
+ # geometry_embedding = None
551
+ # mv_hidden_states = mv_hidden_states + mv_geometry_embedding
552
+ # uv_hidden_states = uv_hidden_states + uv_geometry_embedding
553
+
554
+ mv_hidden_states = mv_hidden_states.flatten(2).transpose(1, 2)
555
+ uv_hidden_states = uv_hidden_states.flatten(2).transpose(1, 2) # [B, F*H*W, C]
556
+ hidden_states = torch.cat([mv_hidden_states, uv_hidden_states], dim=1) # [B, n_mv + n_uv, C]
557
+
558
+ temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
559
+ timestep, encoder_hidden_states, encoder_hidden_states_image
560
+ )
561
+ # temb [B, F, 6*D], timestep_proj [B, F, 6*D], used to be [B, 6*D]
562
+ timestep_proj = timestep_proj.unflatten(-1, (6, -1)) # [B, F, 6*D] -> [B, F, 6, D]
563
+
564
+ if encoder_hidden_states_image is not None:
565
+ encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
566
+
567
+ # # Get attention bias
568
+ # if self.use_causal_mask:
569
+ # # This may be gainless, because the patch embedding is not causal, which will leak information to MV
570
+ # attn_bias = self.get_attention_bias(post_patch_num_frames * post_patch_height * post_patch_width,
571
+ # post_uv_num_frames * post_uv_height * post_uv_width)
572
+ # else:
573
+ attn_bias = None
574
+
575
+ # 4. Transformer blocks
576
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
577
+ for block in self.blocks:
578
+ hidden_states = self._gradient_checkpointing_func(
579
+ block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb,
580
+ attn_bias, geometry_embedding, (post_patch_num_frames, post_patch_height, post_patch_width, post_uv_num_frames, post_uv_height, post_uv_width)
581
+ )
582
+ else:
583
+ for block in self.blocks:
584
+ hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb,
585
+ attn_bias=attn_bias, geometry_embedding=geometry_embedding,
586
+ token_shape=(post_patch_num_frames, post_patch_height, post_patch_width, post_uv_num_frames, post_uv_height, post_uv_width))
587
+
588
+ # 5. Output norm, projection & unpatchify
589
+ # scale_shift_table [1, 2, D] (D is 1536) is broadcast against the per-frame temb and chunked into shift / scale, each [B, 1, F*H*W, D]
590
+ inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
591
+ mv_temb, uv_temb = temb[:, :post_patch_num_frames], temb[:, post_patch_num_frames:]
592
+ mv_temb = repeat(mv_temb, "B F D -> B 1 (F H W) D", H=post_patch_height, W=post_patch_width)
593
+ uv_temb = repeat(uv_temb, "B F D -> B 1 (F H W) D", H=post_uv_height, W=post_uv_width)
594
+ shift, scale = (self.scale_shift_table.view(1, 2, 1, inner_dim) + mv_temb).chunk(2, dim=1)
595
+ shift_uv, scale_uv = (self.scale_shift_table_uv.view(1, 2, 1, inner_dim) + uv_temb).chunk(2, dim=1)
596
+
597
+ # Move the shift and scale tensors to the same device as hidden_states.
598
+ # When using multi-GPU inference via accelerate these will be on the
599
+ # first device rather than the last device, which hidden_states ends up
600
+ # on.
601
+ shift = shift.squeeze(1).to(hidden_states.device)
602
+ scale = scale.squeeze(1).to(hidden_states.device)
603
+ shift_uv = shift_uv.squeeze(1).to(hidden_states.device)
604
+ scale_uv = scale_uv.squeeze(1).to(hidden_states.device)
605
+
606
+ # Unpatchify
607
+ uv_token_length = post_uv_num_frames * post_uv_height * post_uv_width
608
+ mv_token_length = post_patch_num_frames * post_patch_height * post_patch_width
609
+ assert uv_token_length + mv_token_length == hidden_states.shape[1]
610
+ uv_hidden_states = hidden_states[:, mv_token_length:]
611
+ mv_hidden_states = hidden_states[:, :mv_token_length]
612
+
613
+ mv_hidden_states = (self.norm_out(mv_hidden_states.float()) * (1 + scale) + shift).type_as(mv_hidden_states)
614
+ uv_hidden_states = (self.norm_out(uv_hidden_states.float()) * (1 + scale_uv) + shift_uv).type_as(uv_hidden_states)
615
+ mv_hidden_states = self.proj_out(mv_hidden_states)
616
+ uv_hidden_states = self.proj_out(uv_hidden_states)
617
+
618
+ mv_hidden_states = mv_hidden_states.reshape(
619
+ batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1
620
+ )
621
+ mv_hidden_states = mv_hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
622
+ mv_output = mv_hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
623
+ uv_hidden_states = uv_hidden_states.reshape(
624
+ batch_size, post_uv_num_frames, post_uv_height, post_uv_width, p_t, p_h, p_w, -1
625
+ )
626
+ uv_hidden_states = uv_hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
627
+ uv_output = uv_hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
628
+
629
+ if USE_PEFT_BACKEND:
630
+ # remove `lora_scale` from each PEFT layer
631
+ unscale_lora_layers(self, lora_scale)
632
+
633
+ return ((mv_output, uv_output),)
634
+
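+
+ if __name__ == "__main__":
+ # Minimal shape-check sketch for the WanRotaryPosEmbed defined above; the latent sizes and head
+ # dimension below are illustrative assumptions, not values tied to any released checkpoint.
+ mv_latents = torch.randn(1, 16, 4, 32, 32) # [B, C, F, H, W] multi-view latents
+ uv_latents = torch.randn(1, 16, 1, 64, 64) # [B, C, F, H, W] UV-texture latents
+ rope = WanRotaryPosEmbed(attention_head_dim=128, patch_size=(1, 2, 2), max_seq_len=1024)
+ freqs = rope(mv_latents, uv_latents)
+ # 4*16*16 MV tokens + 1*32*32 UV tokens = 2048 tokens; 64 complex frequencies = attention_head_dim // 2
+ print(freqs.shape) # torch.Size([1, 1, 2048, 64])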