Baykon committed
Commit ee43d06
1 Parent(s): 54f5ad4

Delete magic quill

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. MagicQuill/.DS_Store +0 -0
  2. MagicQuill/brushnet/brushnet.json +0 -58
  3. MagicQuill/brushnet/brushnet.py +0 -949
  4. MagicQuill/brushnet/brushnet_ca.py +0 -983
  5. MagicQuill/brushnet/brushnet_xl.json +0 -63
  6. MagicQuill/brushnet/powerpaint.json +0 -57
  7. MagicQuill/brushnet/powerpaint_utils.py +0 -496
  8. MagicQuill/brushnet/unet_2d_blocks.py +0 -0
  9. MagicQuill/brushnet/unet_2d_condition.py +0 -1355
  10. MagicQuill/brushnet_nodes.py +0 -1094
  11. MagicQuill/comfy/.DS_Store +0 -0
  12. MagicQuill/comfy/checkpoint_pickle.py +0 -13
  13. MagicQuill/comfy/cldm/__pycache__/cldm.cpython-310.pyc +0 -0
  14. MagicQuill/comfy/cldm/cldm.py +0 -313
  15. MagicQuill/comfy/cli_args.py +0 -143
  16. MagicQuill/comfy/clip_config_bigg.json +0 -23
  17. MagicQuill/comfy/clip_model.py +0 -194
  18. MagicQuill/comfy/clip_vision.py +0 -117
  19. MagicQuill/comfy/clip_vision_config_g.json +0 -18
  20. MagicQuill/comfy/clip_vision_config_h.json +0 -18
  21. MagicQuill/comfy/clip_vision_config_vitl.json +0 -18
  22. MagicQuill/comfy/conds.py +0 -83
  23. MagicQuill/comfy/controlnet.py +0 -554
  24. MagicQuill/comfy/diffusers_convert.py +0 -281
  25. MagicQuill/comfy/diffusers_load.py +0 -36
  26. MagicQuill/comfy/extra_samplers/__pycache__/uni_pc.cpython-310.pyc +0 -0
  27. MagicQuill/comfy/extra_samplers/uni_pc.py +0 -875
  28. MagicQuill/comfy/gligen.py +0 -343
  29. MagicQuill/comfy/k_diffusion/__pycache__/sampling.cpython-310.pyc +0 -0
  30. MagicQuill/comfy/k_diffusion/__pycache__/utils.cpython-310.pyc +0 -0
  31. MagicQuill/comfy/k_diffusion/sampling.py +0 -843
  32. MagicQuill/comfy/k_diffusion/utils.py +0 -313
  33. MagicQuill/comfy/latent_formats.py +0 -141
  34. MagicQuill/comfy/ldm/.DS_Store +0 -0
  35. MagicQuill/comfy/ldm/__pycache__/util.cpython-310.pyc +0 -0
  36. MagicQuill/comfy/ldm/audio/__pycache__/autoencoder.cpython-310.pyc +0 -0
  37. MagicQuill/comfy/ldm/audio/__pycache__/dit.cpython-310.pyc +0 -0
  38. MagicQuill/comfy/ldm/audio/__pycache__/embedders.cpython-310.pyc +0 -0
  39. MagicQuill/comfy/ldm/audio/autoencoder.py +0 -282
  40. MagicQuill/comfy/ldm/audio/dit.py +0 -888
  41. MagicQuill/comfy/ldm/audio/embedders.py +0 -108
  42. MagicQuill/comfy/ldm/cascade/__pycache__/common.cpython-310.pyc +0 -0
  43. MagicQuill/comfy/ldm/cascade/__pycache__/controlnet.cpython-310.pyc +0 -0
  44. MagicQuill/comfy/ldm/cascade/__pycache__/stage_a.cpython-310.pyc +0 -0
  45. MagicQuill/comfy/ldm/cascade/__pycache__/stage_b.cpython-310.pyc +0 -0
  46. MagicQuill/comfy/ldm/cascade/__pycache__/stage_c.cpython-310.pyc +0 -0
  47. MagicQuill/comfy/ldm/cascade/__pycache__/stage_c_coder.cpython-310.pyc +0 -0
  48. MagicQuill/comfy/ldm/cascade/common.py +0 -161
  49. MagicQuill/comfy/ldm/cascade/controlnet.py +0 -93
  50. MagicQuill/comfy/ldm/cascade/stage_a.py +0 -255
MagicQuill/.DS_Store DELETED
Binary file (6.15 kB)
 
MagicQuill/brushnet/brushnet.json DELETED
@@ -1,58 +0,0 @@
- {
-   "_class_name": "BrushNetModel",
-   "_diffusers_version": "0.27.0.dev0",
-   "_name_or_path": "runs/logs/brushnet_randommask/checkpoint-100000",
-   "act_fn": "silu",
-   "addition_embed_type": null,
-   "addition_embed_type_num_heads": 64,
-   "addition_time_embed_dim": null,
-   "attention_head_dim": 8,
-   "block_out_channels": [
-     320,
-     640,
-     1280,
-     1280
-   ],
-   "brushnet_conditioning_channel_order": "rgb",
-   "class_embed_type": null,
-   "conditioning_channels": 5,
-   "conditioning_embedding_out_channels": [
-     16,
-     32,
-     96,
-     256
-   ],
-   "cross_attention_dim": 768,
-   "down_block_types": [
-     "DownBlock2D",
-     "DownBlock2D",
-     "DownBlock2D",
-     "DownBlock2D"
-   ],
-   "downsample_padding": 1,
-   "encoder_hid_dim": null,
-   "encoder_hid_dim_type": null,
-   "flip_sin_to_cos": true,
-   "freq_shift": 0,
-   "global_pool_conditions": false,
-   "in_channels": 4,
-   "layers_per_block": 2,
-   "mid_block_scale_factor": 1,
-   "mid_block_type": "MidBlock2D",
-   "norm_eps": 1e-05,
-   "norm_num_groups": 32,
-   "num_attention_heads": null,
-   "num_class_embeds": null,
-   "only_cross_attention": false,
-   "projection_class_embeddings_input_dim": null,
-   "resnet_time_scale_shift": "default",
-   "transformer_layers_per_block": 1,
-   "up_block_types": [
-     "UpBlock2D",
-     "UpBlock2D",
-     "UpBlock2D",
-     "UpBlock2D"
-   ],
-   "upcast_attention": false,
-   "use_linear_projection": false
- }
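Note: the deleted brushnet.json above follows the standard diffusers config layout, so it could be re-consumed through ConfigMixin. The following is only a minimal sketch, assuming the BrushNetModel class from the deleted MagicQuill/brushnet/brushnet.py is still importable (e.g. from the parent revision 54f5ad4) and that brushnet.json is in the working directory; it is not part of this commit.

```python
# Minimal sketch, not from this repo: assumes the deleted MagicQuill/brushnet/brushnet.py
# is still importable and brushnet.json sits in the current directory.
import json

from MagicQuill.brushnet.brushnet import BrushNetModel

with open("brushnet.json") as f:
    config = json.load(f)

# ConfigMixin.from_config builds the module from the keyword arguments in the dict;
# private bookkeeping keys such as "_class_name" and "_diffusers_version" are ignored.
model = BrushNetModel.from_config(config)
print(model.config.cross_attention_dim)  # 768, per the config above
```

In the repo itself these weights were loaded through the project's own ComfyUI-style nodes (see brushnet_nodes.py in the file list above), so this is only an illustration of the config format.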
 
MagicQuill/brushnet/brushnet.py DELETED
@@ -1,949 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Any, Dict, List, Optional, Tuple, Union
3
-
4
- import torch
5
- from torch import nn
6
- from torch.nn import functional as F
7
-
8
- from diffusers.configuration_utils import ConfigMixin, register_to_config
9
- from diffusers.utils import BaseOutput, logging
10
- from diffusers.models.attention_processor import (
11
- ADDED_KV_ATTENTION_PROCESSORS,
12
- CROSS_ATTENTION_PROCESSORS,
13
- AttentionProcessor,
14
- AttnAddedKVProcessor,
15
- AttnProcessor,
16
- )
17
- from diffusers.models.embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
18
- from diffusers.models.modeling_utils import ModelMixin
19
-
20
- from .unet_2d_blocks import (
21
- CrossAttnDownBlock2D,
22
- DownBlock2D,
23
- UNetMidBlock2D,
24
- UNetMidBlock2DCrossAttn,
25
- get_down_block,
26
- get_mid_block,
27
- get_up_block,
28
- MidBlock2D
29
- )
30
-
31
- from .unet_2d_condition import UNet2DConditionModel
32
-
33
-
34
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
-
36
-
37
- @dataclass
38
- class BrushNetOutput(BaseOutput):
39
- """
40
- The output of [`BrushNetModel`].
41
-
42
- Args:
43
- up_block_res_samples (`tuple[torch.Tensor]`):
44
- A tuple of upsample activations at different resolutions for each upsampling block. Each tensor should
45
- be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
46
- used to condition the original UNet's upsampling activations.
47
- down_block_res_samples (`tuple[torch.Tensor]`):
48
- A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
49
- be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
50
- used to condition the original UNet's downsampling activations.
51
- mid_down_block_re_sample (`torch.Tensor`):
52
- The activation of the midde block (the lowest sample resolution). Each tensor should be of shape
53
- `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
54
- Output can be used to condition the original UNet's middle block activation.
55
- """
56
-
57
- up_block_res_samples: Tuple[torch.Tensor]
58
- down_block_res_samples: Tuple[torch.Tensor]
59
- mid_block_res_sample: torch.Tensor
60
-
61
-
62
- class BrushNetModel(ModelMixin, ConfigMixin):
63
- """
64
- A BrushNet model.
65
-
66
- Args:
67
- in_channels (`int`, defaults to 4):
68
- The number of channels in the input sample.
69
- flip_sin_to_cos (`bool`, defaults to `True`):
70
- Whether to flip the sin to cos in the time embedding.
71
- freq_shift (`int`, defaults to 0):
72
- The frequency shift to apply to the time embedding.
73
- down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
74
- The tuple of downsample blocks to use.
75
- mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
76
- Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
77
- `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
78
- up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
79
- The tuple of upsample blocks to use.
80
- only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
81
- block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
82
- The tuple of output channels for each block.
83
- layers_per_block (`int`, defaults to 2):
84
- The number of layers per block.
85
- downsample_padding (`int`, defaults to 1):
86
- The padding to use for the downsampling convolution.
87
- mid_block_scale_factor (`float`, defaults to 1):
88
- The scale factor to use for the mid block.
89
- act_fn (`str`, defaults to "silu"):
90
- The activation function to use.
91
- norm_num_groups (`int`, *optional*, defaults to 32):
92
- The number of groups to use for the normalization. If None, normalization and activation layers is skipped
93
- in post-processing.
94
- norm_eps (`float`, defaults to 1e-5):
95
- The epsilon to use for the normalization.
96
- cross_attention_dim (`int`, defaults to 1280):
97
- The dimension of the cross attention features.
98
- transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
99
- The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
100
- [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
101
- [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
102
- encoder_hid_dim (`int`, *optional*, defaults to None):
103
- If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
104
- dimension to `cross_attention_dim`.
105
- encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
106
- If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
107
- embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
108
- attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
109
- The dimension of the attention heads.
110
- use_linear_projection (`bool`, defaults to `False`):
111
- class_embed_type (`str`, *optional*, defaults to `None`):
112
- The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
113
- `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
114
- addition_embed_type (`str`, *optional*, defaults to `None`):
115
- Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
116
- "text". "text" will use the `TextTimeEmbedding` layer.
117
- num_class_embeds (`int`, *optional*, defaults to 0):
118
- Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
119
- class conditioning with `class_embed_type` equal to `None`.
120
- upcast_attention (`bool`, defaults to `False`):
121
- resnet_time_scale_shift (`str`, defaults to `"default"`):
122
- Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
123
- projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
124
- The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
125
- `class_embed_type="projection"`.
126
- brushnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
127
- The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
128
- conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
129
- The tuple of output channel for each block in the `conditioning_embedding` layer.
130
- global_pool_conditions (`bool`, defaults to `False`):
131
- TODO(Patrick) - unused parameter.
132
- addition_embed_type_num_heads (`int`, defaults to 64):
133
- The number of heads to use for the `TextTimeEmbedding` layer.
134
- """
135
-
136
- _supports_gradient_checkpointing = True
137
-
138
- @register_to_config
139
- def __init__(
140
- self,
141
- in_channels: int = 4,
142
- conditioning_channels: int = 5,
143
- flip_sin_to_cos: bool = True,
144
- freq_shift: int = 0,
145
- down_block_types: Tuple[str, ...] = (
146
- "DownBlock2D",
147
- "DownBlock2D",
148
- "DownBlock2D",
149
- "DownBlock2D",
150
- ),
151
- mid_block_type: Optional[str] = "UNetMidBlock2D",
152
- up_block_types: Tuple[str, ...] = (
153
- "UpBlock2D",
154
- "UpBlock2D",
155
- "UpBlock2D",
156
- "UpBlock2D",
157
- ),
158
- only_cross_attention: Union[bool, Tuple[bool]] = False,
159
- block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
160
- layers_per_block: int = 2,
161
- downsample_padding: int = 1,
162
- mid_block_scale_factor: float = 1,
163
- act_fn: str = "silu",
164
- norm_num_groups: Optional[int] = 32,
165
- norm_eps: float = 1e-5,
166
- cross_attention_dim: int = 1280,
167
- transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
168
- encoder_hid_dim: Optional[int] = None,
169
- encoder_hid_dim_type: Optional[str] = None,
170
- attention_head_dim: Union[int, Tuple[int, ...]] = 8,
171
- num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
172
- use_linear_projection: bool = False,
173
- class_embed_type: Optional[str] = None,
174
- addition_embed_type: Optional[str] = None,
175
- addition_time_embed_dim: Optional[int] = None,
176
- num_class_embeds: Optional[int] = None,
177
- upcast_attention: bool = False,
178
- resnet_time_scale_shift: str = "default",
179
- projection_class_embeddings_input_dim: Optional[int] = None,
180
- brushnet_conditioning_channel_order: str = "rgb",
181
- conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
182
- global_pool_conditions: bool = False,
183
- addition_embed_type_num_heads: int = 64,
184
- ):
185
- super().__init__()
186
-
187
- # If `num_attention_heads` is not defined (which is the case for most models)
188
- # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
189
- # The reason for this behavior is to correct for incorrectly named variables that were introduced
190
- # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
191
- # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
192
- # which is why we correct for the naming here.
193
- num_attention_heads = num_attention_heads or attention_head_dim
194
-
195
- # Check inputs
196
- if len(down_block_types) != len(up_block_types):
197
- raise ValueError(
198
- f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
199
- )
200
-
201
- if len(block_out_channels) != len(down_block_types):
202
- raise ValueError(
203
- f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
204
- )
205
-
206
- if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
207
- raise ValueError(
208
- f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
209
- )
210
-
211
- if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
212
- raise ValueError(
213
- f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
214
- )
215
-
216
- if isinstance(transformer_layers_per_block, int):
217
- transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
218
-
219
- # input
220
- conv_in_kernel = 3
221
- conv_in_padding = (conv_in_kernel - 1) // 2
222
- self.conv_in_condition = nn.Conv2d(
223
- in_channels+conditioning_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
224
- )
225
-
226
- # time
227
- time_embed_dim = block_out_channels[0] * 4
228
- self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
229
- timestep_input_dim = block_out_channels[0]
230
- self.time_embedding = TimestepEmbedding(
231
- timestep_input_dim,
232
- time_embed_dim,
233
- act_fn=act_fn,
234
- )
235
-
236
- if encoder_hid_dim_type is None and encoder_hid_dim is not None:
237
- encoder_hid_dim_type = "text_proj"
238
- self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
239
- logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
240
-
241
- if encoder_hid_dim is None and encoder_hid_dim_type is not None:
242
- raise ValueError(
243
- f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
244
- )
245
-
246
- if encoder_hid_dim_type == "text_proj":
247
- self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
248
- elif encoder_hid_dim_type == "text_image_proj":
249
- # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
250
- # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
251
- # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
252
- self.encoder_hid_proj = TextImageProjection(
253
- text_embed_dim=encoder_hid_dim,
254
- image_embed_dim=cross_attention_dim,
255
- cross_attention_dim=cross_attention_dim,
256
- )
257
-
258
- elif encoder_hid_dim_type is not None:
259
- raise ValueError(
260
- f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
261
- )
262
- else:
263
- self.encoder_hid_proj = None
264
-
265
- # class embedding
266
- if class_embed_type is None and num_class_embeds is not None:
267
- self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
268
- elif class_embed_type == "timestep":
269
- self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
270
- elif class_embed_type == "identity":
271
- self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
272
- elif class_embed_type == "projection":
273
- if projection_class_embeddings_input_dim is None:
274
- raise ValueError(
275
- "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
276
- )
277
- # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
278
- # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
279
- # 2. it projects from an arbitrary input dimension.
280
- #
281
- # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
282
- # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
283
- # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
284
- self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
285
- else:
286
- self.class_embedding = None
287
-
288
- if addition_embed_type == "text":
289
- if encoder_hid_dim is not None:
290
- text_time_embedding_from_dim = encoder_hid_dim
291
- else:
292
- text_time_embedding_from_dim = cross_attention_dim
293
-
294
- self.add_embedding = TextTimeEmbedding(
295
- text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
296
- )
297
- elif addition_embed_type == "text_image":
298
- # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
299
- # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
300
- # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
301
- self.add_embedding = TextImageTimeEmbedding(
302
- text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
303
- )
304
- elif addition_embed_type == "text_time":
305
- self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
306
- self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
307
-
308
- elif addition_embed_type is not None:
309
- raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
310
-
311
- self.down_blocks = nn.ModuleList([])
312
- self.brushnet_down_blocks = nn.ModuleList([])
313
-
314
- if isinstance(only_cross_attention, bool):
315
- only_cross_attention = [only_cross_attention] * len(down_block_types)
316
-
317
- if isinstance(attention_head_dim, int):
318
- attention_head_dim = (attention_head_dim,) * len(down_block_types)
319
-
320
- if isinstance(num_attention_heads, int):
321
- num_attention_heads = (num_attention_heads,) * len(down_block_types)
322
-
323
- # down
324
- output_channel = block_out_channels[0]
325
-
326
- brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
327
- brushnet_block = zero_module(brushnet_block)
328
- self.brushnet_down_blocks.append(brushnet_block)
329
-
330
- for i, down_block_type in enumerate(down_block_types):
331
- input_channel = output_channel
332
- output_channel = block_out_channels[i]
333
- is_final_block = i == len(block_out_channels) - 1
334
-
335
- down_block = get_down_block(
336
- down_block_type,
337
- num_layers=layers_per_block,
338
- transformer_layers_per_block=transformer_layers_per_block[i],
339
- in_channels=input_channel,
340
- out_channels=output_channel,
341
- temb_channels=time_embed_dim,
342
- add_downsample=not is_final_block,
343
- resnet_eps=norm_eps,
344
- resnet_act_fn=act_fn,
345
- resnet_groups=norm_num_groups,
346
- cross_attention_dim=cross_attention_dim,
347
- num_attention_heads=num_attention_heads[i],
348
- attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
349
- downsample_padding=downsample_padding,
350
- use_linear_projection=use_linear_projection,
351
- only_cross_attention=only_cross_attention[i],
352
- upcast_attention=upcast_attention,
353
- resnet_time_scale_shift=resnet_time_scale_shift,
354
- )
355
- self.down_blocks.append(down_block)
356
-
357
- for _ in range(layers_per_block):
358
- brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
359
- brushnet_block = zero_module(brushnet_block)
360
- self.brushnet_down_blocks.append(brushnet_block)
361
-
362
- if not is_final_block:
363
- brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
364
- brushnet_block = zero_module(brushnet_block)
365
- self.brushnet_down_blocks.append(brushnet_block)
366
-
367
- # mid
368
- mid_block_channel = block_out_channels[-1]
369
-
370
- brushnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1)
371
- brushnet_block = zero_module(brushnet_block)
372
- self.brushnet_mid_block = brushnet_block
373
-
374
- self.mid_block = get_mid_block(
375
- mid_block_type,
376
- transformer_layers_per_block=transformer_layers_per_block[-1],
377
- in_channels=mid_block_channel,
378
- temb_channels=time_embed_dim,
379
- resnet_eps=norm_eps,
380
- resnet_act_fn=act_fn,
381
- output_scale_factor=mid_block_scale_factor,
382
- resnet_time_scale_shift=resnet_time_scale_shift,
383
- cross_attention_dim=cross_attention_dim,
384
- num_attention_heads=num_attention_heads[-1],
385
- resnet_groups=norm_num_groups,
386
- use_linear_projection=use_linear_projection,
387
- upcast_attention=upcast_attention,
388
- )
389
-
390
- # count how many layers upsample the images
391
- self.num_upsamplers = 0
392
-
393
- # up
394
- reversed_block_out_channels = list(reversed(block_out_channels))
395
- reversed_num_attention_heads = list(reversed(num_attention_heads))
396
- reversed_transformer_layers_per_block = (list(reversed(transformer_layers_per_block)))
397
- only_cross_attention = list(reversed(only_cross_attention))
398
-
399
- output_channel = reversed_block_out_channels[0]
400
-
401
- self.up_blocks = nn.ModuleList([])
402
- self.brushnet_up_blocks = nn.ModuleList([])
403
-
404
- for i, up_block_type in enumerate(up_block_types):
405
- is_final_block = i == len(block_out_channels) - 1
406
-
407
- prev_output_channel = output_channel
408
- output_channel = reversed_block_out_channels[i]
409
- input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
410
-
411
- # add upsample block for all BUT final layer
412
- if not is_final_block:
413
- add_upsample = True
414
- self.num_upsamplers += 1
415
- else:
416
- add_upsample = False
417
-
418
- up_block = get_up_block(
419
- up_block_type,
420
- num_layers=layers_per_block+1,
421
- transformer_layers_per_block=reversed_transformer_layers_per_block[i],
422
- in_channels=input_channel,
423
- out_channels=output_channel,
424
- prev_output_channel=prev_output_channel,
425
- temb_channels=time_embed_dim,
426
- add_upsample=add_upsample,
427
- resnet_eps=norm_eps,
428
- resnet_act_fn=act_fn,
429
- resolution_idx=i,
430
- resnet_groups=norm_num_groups,
431
- cross_attention_dim=cross_attention_dim,
432
- num_attention_heads=reversed_num_attention_heads[i],
433
- use_linear_projection=use_linear_projection,
434
- only_cross_attention=only_cross_attention[i],
435
- upcast_attention=upcast_attention,
436
- resnet_time_scale_shift=resnet_time_scale_shift,
437
- attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
438
- )
439
- self.up_blocks.append(up_block)
440
- prev_output_channel = output_channel
441
-
442
- for _ in range(layers_per_block+1):
443
- brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
444
- brushnet_block = zero_module(brushnet_block)
445
- self.brushnet_up_blocks.append(brushnet_block)
446
-
447
- if not is_final_block:
448
- brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
449
- brushnet_block = zero_module(brushnet_block)
450
- self.brushnet_up_blocks.append(brushnet_block)
451
-
452
-
453
- @classmethod
454
- def from_unet(
455
- cls,
456
- unet: UNet2DConditionModel,
457
- brushnet_conditioning_channel_order: str = "rgb",
458
- conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
459
- load_weights_from_unet: bool = True,
460
- conditioning_channels: int = 5,
461
- ):
462
- r"""
463
- Instantiate a [`BrushNetModel`] from [`UNet2DConditionModel`].
464
-
465
- Parameters:
466
- unet (`UNet2DConditionModel`):
467
- The UNet model weights to copy to the [`BrushNetModel`]. All configuration options are also copied
468
- where applicable.
469
- """
470
- transformer_layers_per_block = (
471
- unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
472
- )
473
- encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None
474
- encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None
475
- addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None
476
- addition_time_embed_dim = (
477
- unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
478
- )
479
-
480
- brushnet = cls(
481
- in_channels=unet.config.in_channels,
482
- conditioning_channels=conditioning_channels,
483
- flip_sin_to_cos=unet.config.flip_sin_to_cos,
484
- freq_shift=unet.config.freq_shift,
485
- down_block_types=["DownBlock2D" for block_name in unet.config.down_block_types],
486
- mid_block_type='MidBlock2D',
487
- up_block_types=["UpBlock2D" for block_name in unet.config.down_block_types],
488
- only_cross_attention=unet.config.only_cross_attention,
489
- block_out_channels=unet.config.block_out_channels,
490
- layers_per_block=unet.config.layers_per_block,
491
- downsample_padding=unet.config.downsample_padding,
492
- mid_block_scale_factor=unet.config.mid_block_scale_factor,
493
- act_fn=unet.config.act_fn,
494
- norm_num_groups=unet.config.norm_num_groups,
495
- norm_eps=unet.config.norm_eps,
496
- cross_attention_dim=unet.config.cross_attention_dim,
497
- transformer_layers_per_block=transformer_layers_per_block,
498
- encoder_hid_dim=encoder_hid_dim,
499
- encoder_hid_dim_type=encoder_hid_dim_type,
500
- attention_head_dim=unet.config.attention_head_dim,
501
- num_attention_heads=unet.config.num_attention_heads,
502
- use_linear_projection=unet.config.use_linear_projection,
503
- class_embed_type=unet.config.class_embed_type,
504
- addition_embed_type=addition_embed_type,
505
- addition_time_embed_dim=addition_time_embed_dim,
506
- num_class_embeds=unet.config.num_class_embeds,
507
- upcast_attention=unet.config.upcast_attention,
508
- resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
509
- projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
510
- brushnet_conditioning_channel_order=brushnet_conditioning_channel_order,
511
- conditioning_embedding_out_channels=conditioning_embedding_out_channels,
512
- )
513
-
514
- if load_weights_from_unet:
515
- conv_in_condition_weight=torch.zeros_like(brushnet.conv_in_condition.weight)
516
- conv_in_condition_weight[:,:4,...]=unet.conv_in.weight
517
- conv_in_condition_weight[:,4:8,...]=unet.conv_in.weight
518
- brushnet.conv_in_condition.weight=torch.nn.Parameter(conv_in_condition_weight)
519
- brushnet.conv_in_condition.bias=unet.conv_in.bias
520
-
521
- brushnet.time_proj.load_state_dict(unet.time_proj.state_dict())
522
- brushnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())
523
-
524
- if brushnet.class_embedding:
525
- brushnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())
526
-
527
- brushnet.down_blocks.load_state_dict(unet.down_blocks.state_dict(),strict=False)
528
- brushnet.mid_block.load_state_dict(unet.mid_block.state_dict(),strict=False)
529
- brushnet.up_blocks.load_state_dict(unet.up_blocks.state_dict(),strict=False)
530
-
531
- return brushnet
532
-
533
- @property
534
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
535
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
536
- r"""
537
- Returns:
538
- `dict` of attention processors: A dictionary containing all attention processors used in the model with
539
- indexed by its weight name.
540
- """
541
- # set recursively
542
- processors = {}
543
-
544
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
545
- if hasattr(module, "get_processor"):
546
- processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
547
-
548
- for sub_name, child in module.named_children():
549
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
550
-
551
- return processors
552
-
553
- for name, module in self.named_children():
554
- fn_recursive_add_processors(name, module, processors)
555
-
556
- return processors
557
-
558
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
559
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
560
- r"""
561
- Sets the attention processor to use to compute attention.
562
-
563
- Parameters:
564
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
565
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
566
- for **all** `Attention` layers.
567
-
568
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
569
- processor. This is strongly recommended when setting trainable attention processors.
570
-
571
- """
572
- count = len(self.attn_processors.keys())
573
-
574
- if isinstance(processor, dict) and len(processor) != count:
575
- raise ValueError(
576
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
577
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
578
- )
579
-
580
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
581
- if hasattr(module, "set_processor"):
582
- if not isinstance(processor, dict):
583
- module.set_processor(processor)
584
- else:
585
- module.set_processor(processor.pop(f"{name}.processor"))
586
-
587
- for sub_name, child in module.named_children():
588
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
589
-
590
- for name, module in self.named_children():
591
- fn_recursive_attn_processor(name, module, processor)
592
-
593
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
594
- def set_default_attn_processor(self):
595
- """
596
- Disables custom attention processors and sets the default attention implementation.
597
- """
598
- if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
599
- processor = AttnAddedKVProcessor()
600
- elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
601
- processor = AttnProcessor()
602
- else:
603
- raise ValueError(
604
- f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
605
- )
606
-
607
- self.set_attn_processor(processor)
608
-
609
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
610
- def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
611
- r"""
612
- Enable sliced attention computation.
613
-
614
- When this option is enabled, the attention module splits the input tensor in slices to compute attention in
615
- several steps. This is useful for saving some memory in exchange for a small decrease in speed.
616
-
617
- Args:
618
- slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
619
- When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
620
- `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
621
- provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
622
- must be a multiple of `slice_size`.
623
- """
624
- sliceable_head_dims = []
625
-
626
- def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
627
- if hasattr(module, "set_attention_slice"):
628
- sliceable_head_dims.append(module.sliceable_head_dim)
629
-
630
- for child in module.children():
631
- fn_recursive_retrieve_sliceable_dims(child)
632
-
633
- # retrieve number of attention layers
634
- for module in self.children():
635
- fn_recursive_retrieve_sliceable_dims(module)
636
-
637
- num_sliceable_layers = len(sliceable_head_dims)
638
-
639
- if slice_size == "auto":
640
- # half the attention head size is usually a good trade-off between
641
- # speed and memory
642
- slice_size = [dim // 2 for dim in sliceable_head_dims]
643
- elif slice_size == "max":
644
- # make smallest slice possible
645
- slice_size = num_sliceable_layers * [1]
646
-
647
- slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
648
-
649
- if len(slice_size) != len(sliceable_head_dims):
650
- raise ValueError(
651
- f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
652
- f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
653
- )
654
-
655
- for i in range(len(slice_size)):
656
- size = slice_size[i]
657
- dim = sliceable_head_dims[i]
658
- if size is not None and size > dim:
659
- raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
660
-
661
- # Recursively walk through all the children.
662
- # Any children which exposes the set_attention_slice method
663
- # gets the message
664
- def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
665
- if hasattr(module, "set_attention_slice"):
666
- module.set_attention_slice(slice_size.pop())
667
-
668
- for child in module.children():
669
- fn_recursive_set_attention_slice(child, slice_size)
670
-
671
- reversed_slice_size = list(reversed(slice_size))
672
- for module in self.children():
673
- fn_recursive_set_attention_slice(module, reversed_slice_size)
674
-
675
- def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
676
- if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
677
- module.gradient_checkpointing = value
678
-
679
- def forward(
680
- self,
681
- sample: torch.FloatTensor,
682
- encoder_hidden_states: torch.Tensor,
683
- brushnet_cond: torch.FloatTensor,
684
- timestep = None,
685
- time_emb = None,
686
- conditioning_scale: float = 1.0,
687
- class_labels: Optional[torch.Tensor] = None,
688
- timestep_cond: Optional[torch.Tensor] = None,
689
- attention_mask: Optional[torch.Tensor] = None,
690
- added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
691
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
692
- guess_mode: bool = False,
693
- return_dict: bool = True,
694
- debug = False,
695
- ) -> Union[BrushNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]:
696
- """
697
- The [`BrushNetModel`] forward method.
698
-
699
- Args:
700
- sample (`torch.FloatTensor`):
701
- The noisy input tensor.
702
- timestep (`Union[torch.Tensor, float, int]`):
703
- The number of timesteps to denoise an input.
704
- encoder_hidden_states (`torch.Tensor`):
705
- The encoder hidden states.
706
- brushnet_cond (`torch.FloatTensor`):
707
- The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
708
- conditioning_scale (`float`, defaults to `1.0`):
709
- The scale factor for BrushNet outputs.
710
- class_labels (`torch.Tensor`, *optional*, defaults to `None`):
711
- Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
712
- timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
713
- Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
714
- timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
715
- embeddings.
716
- attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
717
- An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
718
- is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
719
- negative values to the attention scores corresponding to "discard" tokens.
720
- added_cond_kwargs (`dict`):
721
- Additional conditions for the Stable Diffusion XL UNet.
722
- cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
723
- A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
724
- guess_mode (`bool`, defaults to `False`):
725
- In this mode, the BrushNet encoder tries its best to recognize the input content of the input even if
726
- you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
727
- return_dict (`bool`, defaults to `True`):
728
- Whether or not to return a [`~models.brushnet.BrushNetOutput`] instead of a plain tuple.
729
-
730
- Returns:
731
- [`~models.brushnet.BrushNetOutput`] **or** `tuple`:
732
- If `return_dict` is `True`, a [`~models.brushnet.BrushNetOutput`] is returned, otherwise a tuple is
733
- returned where the first element is the sample tensor.
734
- """
735
-
736
- # check channel order
737
- channel_order = self.config.brushnet_conditioning_channel_order
738
-
739
- if channel_order == "rgb":
740
- # in rgb order by default
741
- ...
742
- elif channel_order == "bgr":
743
- brushnet_cond = torch.flip(brushnet_cond, dims=[1])
744
- else:
745
- raise ValueError(f"unknown `brushnet_conditioning_channel_order`: {channel_order}")
746
-
747
- # prepare attention_mask
748
- if attention_mask is not None:
749
- attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
750
- attention_mask = attention_mask.unsqueeze(1)
751
-
752
- if timestep is None and time_emb is None:
753
- raise ValueError(f"`timestep` and `emb` are both None")
754
-
755
- #print("BN: sample.device", sample.device)
756
- #print("BN: TE.device", self.time_embedding.linear_1.weight.device)
757
-
758
- if timestep is not None:
759
- # 1. time
760
- timesteps = timestep
761
- if not torch.is_tensor(timesteps):
762
- # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
763
- # This would be a good case for the `match` statement (Python 3.10+)
764
- is_mps = sample.device.type == "mps"
765
- if isinstance(timestep, float):
766
- dtype = torch.float32 if is_mps else torch.float64
767
- else:
768
- dtype = torch.int32 if is_mps else torch.int64
769
- timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
770
- elif len(timesteps.shape) == 0:
771
- timesteps = timesteps[None].to(sample.device)
772
-
773
- # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
774
- timesteps = timesteps.expand(sample.shape[0])
775
-
776
- t_emb = self.time_proj(timesteps)
777
-
778
- # timesteps does not contain any weights and will always return f32 tensors
779
- # but time_embedding might actually be running in fp16. so we need to cast here.
780
- # there might be better ways to encapsulate this.
781
- t_emb = t_emb.to(dtype=sample.dtype)
782
-
783
- #print("t_emb.device =",t_emb.device)
784
-
785
- emb = self.time_embedding(t_emb, timestep_cond)
786
- aug_emb = None
787
-
788
- #print('emb.shape', emb.shape)
789
-
790
- if self.class_embedding is not None:
791
- if class_labels is None:
792
- raise ValueError("class_labels should be provided when num_class_embeds > 0")
793
-
794
- if self.config.class_embed_type == "timestep":
795
- class_labels = self.time_proj(class_labels)
796
-
797
- class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
798
- emb = emb + class_emb
799
-
800
- if self.config.addition_embed_type is not None:
801
- if self.config.addition_embed_type == "text":
802
- aug_emb = self.add_embedding(encoder_hidden_states)
803
-
804
- elif self.config.addition_embed_type == "text_time":
805
- if "text_embeds" not in added_cond_kwargs:
806
- raise ValueError(
807
- f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
808
- )
809
- text_embeds = added_cond_kwargs.get("text_embeds")
810
- if "time_ids" not in added_cond_kwargs:
811
- raise ValueError(
812
- f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
813
- )
814
- time_ids = added_cond_kwargs.get("time_ids")
815
- time_embeds = self.add_time_proj(time_ids.flatten())
816
- time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
817
-
818
- add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
819
- add_embeds = add_embeds.to(emb.dtype)
820
- aug_emb = self.add_embedding(add_embeds)
821
-
822
- #print('text_embeds', text_embeds.shape, 'time_ids', time_ids.shape, 'time_embeds', time_embeds.shape, 'add__embeds', add_embeds.shape, 'aug_emb', aug_emb.shape)
823
-
824
- emb = emb + aug_emb if aug_emb is not None else emb
825
- else:
826
- emb = time_emb
827
-
828
- # 2. pre-process
829
-
830
- brushnet_cond=torch.concat([sample,brushnet_cond],1)
831
- sample = self.conv_in_condition(brushnet_cond)
832
-
833
- # 3. down
834
- down_block_res_samples = (sample,)
835
- for downsample_block in self.down_blocks:
836
- if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
837
- sample, res_samples = downsample_block(
838
- hidden_states=sample,
839
- temb=emb,
840
- encoder_hidden_states=encoder_hidden_states,
841
- attention_mask=attention_mask,
842
- cross_attention_kwargs=cross_attention_kwargs,
843
- )
844
- else:
845
- sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
846
-
847
- down_block_res_samples += res_samples
848
-
849
- # 4. PaintingNet down blocks
850
- brushnet_down_block_res_samples = ()
851
- for down_block_res_sample, brushnet_down_block in zip(down_block_res_samples, self.brushnet_down_blocks):
852
- down_block_res_sample = brushnet_down_block(down_block_res_sample)
853
- brushnet_down_block_res_samples = brushnet_down_block_res_samples + (down_block_res_sample,)
854
-
855
-
856
- # 5. mid
857
- if self.mid_block is not None:
858
- if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
859
- sample = self.mid_block(
860
- sample,
861
- emb,
862
- encoder_hidden_states=encoder_hidden_states,
863
- attention_mask=attention_mask,
864
- cross_attention_kwargs=cross_attention_kwargs,
865
- )
866
- else:
867
- sample = self.mid_block(sample, emb)
868
-
869
- # 6. BrushNet mid blocks
870
- brushnet_mid_block_res_sample = self.brushnet_mid_block(sample)
871
-
872
- # 7. up
873
- up_block_res_samples = ()
874
- for i, upsample_block in enumerate(self.up_blocks):
875
- is_final_block = i == len(self.up_blocks) - 1
876
-
877
- res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
878
- down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
879
-
880
- # if we have not reached the final block and need to forward the
881
- # upsample size, we do it here
882
- if not is_final_block:
883
- upsample_size = down_block_res_samples[-1].shape[2:]
884
-
885
- if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
886
- sample, up_res_samples = upsample_block(
887
- hidden_states=sample,
888
- temb=emb,
889
- res_hidden_states_tuple=res_samples,
890
- encoder_hidden_states=encoder_hidden_states,
891
- cross_attention_kwargs=cross_attention_kwargs,
892
- upsample_size=upsample_size,
893
- attention_mask=attention_mask,
894
- return_res_samples=True
895
- )
896
- else:
897
- sample, up_res_samples = upsample_block(
898
- hidden_states=sample,
899
- temb=emb,
900
- res_hidden_states_tuple=res_samples,
901
- upsample_size=upsample_size,
902
- return_res_samples=True
903
- )
904
-
905
- up_block_res_samples += up_res_samples
906
-
907
- # 8. BrushNet up blocks
908
- brushnet_up_block_res_samples = ()
909
- for up_block_res_sample, brushnet_up_block in zip(up_block_res_samples, self.brushnet_up_blocks):
910
- up_block_res_sample = brushnet_up_block(up_block_res_sample)
911
- brushnet_up_block_res_samples = brushnet_up_block_res_samples + (up_block_res_sample,)
912
-
913
- # 6. scaling
914
- if guess_mode and not self.config.global_pool_conditions:
915
- scales = torch.logspace(-1, 0, len(brushnet_down_block_res_samples) + 1 + len(brushnet_up_block_res_samples), device=sample.device) # 0.1 to 1.0
916
- scales = scales * conditioning_scale
917
-
918
- brushnet_down_block_res_samples = [sample * scale for sample, scale in zip(brushnet_down_block_res_samples, scales[:len(brushnet_down_block_res_samples)])]
919
- brushnet_mid_block_res_sample = brushnet_mid_block_res_sample * scales[len(brushnet_down_block_res_samples)]
920
- brushnet_up_block_res_samples = [sample * scale for sample, scale in zip(brushnet_up_block_res_samples, scales[len(brushnet_down_block_res_samples)+1:])]
921
- else:
922
- brushnet_down_block_res_samples = [sample * conditioning_scale for sample in brushnet_down_block_res_samples]
923
- brushnet_mid_block_res_sample = brushnet_mid_block_res_sample * conditioning_scale
924
- brushnet_up_block_res_samples = [sample * conditioning_scale for sample in brushnet_up_block_res_samples]
925
-
926
-
927
- if self.config.global_pool_conditions:
928
- brushnet_down_block_res_samples = [
929
- torch.mean(sample, dim=(2, 3), keepdim=True) for sample in brushnet_down_block_res_samples
930
- ]
931
- brushnet_mid_block_res_sample = torch.mean(brushnet_mid_block_res_sample, dim=(2, 3), keepdim=True)
932
- brushnet_up_block_res_samples = [
933
- torch.mean(sample, dim=(2, 3), keepdim=True) for sample in brushnet_up_block_res_samples
934
- ]
935
-
936
- if not return_dict:
937
- return (brushnet_down_block_res_samples, brushnet_mid_block_res_sample, brushnet_up_block_res_samples)
938
-
939
- return BrushNetOutput(
940
- down_block_res_samples=brushnet_down_block_res_samples,
941
- mid_block_res_sample=brushnet_mid_block_res_sample,
942
- up_block_res_samples=brushnet_up_block_res_samples
943
- )
944
-
945
-
946
- def zero_module(module):
947
- for p in module.parameters():
948
- nn.init.zeros_(p)
949
- return module
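The deleted brushnet.py above also exposes a from_unet classmethod that clones a UNet2DConditionModel's weights into the BrushNet branch (copying conv_in weights into the first 8 of the 9 input channels, as the code shows). Below is a hedged usage sketch only: the checkpoint path and tensor shapes are assumptions, and it presumes the deleted module (including its modified unet_2d_blocks with return_res_samples support) is still importable.

```python
# Usage sketch; checkpoint path and shapes are placeholders, not part of this repo.
import torch
from diffusers import UNet2DConditionModel

from MagicQuill.brushnet.brushnet import BrushNetModel

# Any SD-1.5-style UNet (cross_attention_dim=768); the path is a placeholder.
unet = UNet2DConditionModel.from_pretrained("path/to/sd15", subfolder="unet")
brushnet = BrushNetModel.from_unet(unet, conditioning_channels=5)

# forward() concatenates `sample` (4 latent channels) with `brushnet_cond`
# (5 conditioning channels; in BrushNet typically masked-image latents plus a mask)
# before feeding conv_in_condition.
sample = torch.randn(1, 4, 64, 64)
encoder_hidden_states = torch.randn(1, 77, 768)
brushnet_cond = torch.randn(1, 5, 64, 64)

with torch.no_grad():
    out = brushnet(
        sample,
        encoder_hidden_states=encoder_hidden_states,
        brushnet_cond=brushnet_cond,
        timestep=torch.tensor([999]),
    )

# The returned residuals are what a caller adds into the paired UNet's
# down, mid, and up blocks, per the BrushNetOutput docstring above.
print(len(out.down_block_res_samples), len(out.up_block_res_samples))
```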
 
MagicQuill/brushnet/brushnet_ca.py DELETED
@@ -1,983 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Any, Dict, List, Optional, Tuple, Union
3
-
4
- import torch
5
- from torch import nn
6
-
7
- from diffusers.configuration_utils import ConfigMixin, register_to_config
8
- from diffusers.utils import BaseOutput, logging
9
- from diffusers.models.attention_processor import (
10
- ADDED_KV_ATTENTION_PROCESSORS,
11
- CROSS_ATTENTION_PROCESSORS,
12
- AttentionProcessor,
13
- AttnAddedKVProcessor,
14
- AttnProcessor,
15
- )
16
- from diffusers.models.embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
17
- from diffusers.models.modeling_utils import ModelMixin
18
-
19
- from .unet_2d_blocks import (
20
- CrossAttnDownBlock2D,
21
- DownBlock2D,
22
- UNetMidBlock2D,
23
- UNetMidBlock2DCrossAttn,
24
- get_down_block,
25
- get_mid_block,
26
- get_up_block,
27
- MidBlock2D
28
- )
29
-
30
- from .unet_2d_condition import UNet2DConditionModel
31
-
32
-
33
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
34
-
35
-
36
- @dataclass
37
- class BrushNetOutput(BaseOutput):
38
- """
39
- The output of [`BrushNetModel`].
40
-
41
- Args:
42
- up_block_res_samples (`tuple[torch.Tensor]`):
43
- A tuple of upsample activations at different resolutions for each upsampling block. Each tensor should
44
- be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
45
- used to condition the original UNet's upsampling activations.
46
- down_block_res_samples (`tuple[torch.Tensor]`):
47
- A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
48
- be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
49
- used to condition the original UNet's downsampling activations.
50
- mid_down_block_re_sample (`torch.Tensor`):
51
- The activation of the midde block (the lowest sample resolution). Each tensor should be of shape
52
- `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
53
- Output can be used to condition the original UNet's middle block activation.
54
- """
55
-
56
- up_block_res_samples: Tuple[torch.Tensor]
57
- down_block_res_samples: Tuple[torch.Tensor]
58
- mid_block_res_sample: torch.Tensor
59
-
60
-
61
- class BrushNetModel(ModelMixin, ConfigMixin):
62
- """
63
- A BrushNet model.
64
-
65
- Args:
66
- in_channels (`int`, defaults to 4):
67
- The number of channels in the input sample.
68
- flip_sin_to_cos (`bool`, defaults to `True`):
69
- Whether to flip the sin to cos in the time embedding.
70
- freq_shift (`int`, defaults to 0):
71
- The frequency shift to apply to the time embedding.
72
- down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
73
- The tuple of downsample blocks to use.
74
- mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
75
- Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
76
- `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
77
- up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
78
- The tuple of upsample blocks to use.
79
- only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
80
- block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
81
- The tuple of output channels for each block.
82
- layers_per_block (`int`, defaults to 2):
83
- The number of layers per block.
84
- downsample_padding (`int`, defaults to 1):
85
- The padding to use for the downsampling convolution.
86
- mid_block_scale_factor (`float`, defaults to 1):
87
- The scale factor to use for the mid block.
88
- act_fn (`str`, defaults to "silu"):
89
- The activation function to use.
90
- norm_num_groups (`int`, *optional*, defaults to 32):
91
- The number of groups to use for the normalization. If None, normalization and activation layers is skipped
92
- in post-processing.
93
- norm_eps (`float`, defaults to 1e-5):
94
- The epsilon to use for the normalization.
95
- cross_attention_dim (`int`, defaults to 1280):
96
- The dimension of the cross attention features.
97
- transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
98
- The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
99
- [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
100
- [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
101
- encoder_hid_dim (`int`, *optional*, defaults to None):
102
- If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
103
- dimension to `cross_attention_dim`.
104
- encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
105
- If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
106
- embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
107
- attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
108
- The dimension of the attention heads.
109
- use_linear_projection (`bool`, defaults to `False`):
110
- class_embed_type (`str`, *optional*, defaults to `None`):
111
- The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
112
- `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
113
- addition_embed_type (`str`, *optional*, defaults to `None`):
114
- Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
115
- "text". "text" will use the `TextTimeEmbedding` layer.
116
- num_class_embeds (`int`, *optional*, defaults to 0):
117
- Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
118
- class conditioning with `class_embed_type` equal to `None`.
119
- upcast_attention (`bool`, defaults to `False`):
120
- resnet_time_scale_shift (`str`, defaults to `"default"`):
121
- Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
122
- projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
123
- The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
124
- `class_embed_type="projection"`.
125
- brushnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
126
- The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
127
- conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
128
- The tuple of output channel for each block in the `conditioning_embedding` layer.
129
- global_pool_conditions (`bool`, defaults to `False`):
130
- TODO(Patrick) - unused parameter.
131
- addition_embed_type_num_heads (`int`, defaults to 64):
132
- The number of heads to use for the `TextTimeEmbedding` layer.
133
- """
134
-
135
- _supports_gradient_checkpointing = True
136
-
137
- @register_to_config
138
- def __init__(
139
- self,
140
- in_channels: int = 4,
141
- conditioning_channels: int = 5,
142
- flip_sin_to_cos: bool = True,
143
- freq_shift: int = 0,
144
- down_block_types: Tuple[str, ...] = (
145
- "CrossAttnDownBlock2D",
146
- "CrossAttnDownBlock2D",
147
- "CrossAttnDownBlock2D",
148
- "DownBlock2D",
149
- ),
150
- mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
151
- up_block_types: Tuple[str, ...] = (
152
- "UpBlock2D",
153
- "CrossAttnUpBlock2D",
154
- "CrossAttnUpBlock2D",
155
- "CrossAttnUpBlock2D",
156
- ),
157
- only_cross_attention: Union[bool, Tuple[bool]] = False,
158
- block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
159
- layers_per_block: int = 2,
160
- downsample_padding: int = 1,
161
- mid_block_scale_factor: float = 1,
162
- act_fn: str = "silu",
163
- norm_num_groups: Optional[int] = 32,
164
- norm_eps: float = 1e-5,
165
- cross_attention_dim: int = 1280,
166
- transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
167
- encoder_hid_dim: Optional[int] = None,
168
- encoder_hid_dim_type: Optional[str] = None,
169
- attention_head_dim: Union[int, Tuple[int, ...]] = 8,
170
- num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
171
- use_linear_projection: bool = False,
172
- class_embed_type: Optional[str] = None,
173
- addition_embed_type: Optional[str] = None,
174
- addition_time_embed_dim: Optional[int] = None,
175
- num_class_embeds: Optional[int] = None,
176
- upcast_attention: bool = False,
177
- resnet_time_scale_shift: str = "default",
178
- projection_class_embeddings_input_dim: Optional[int] = None,
179
- brushnet_conditioning_channel_order: str = "rgb",
180
- conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
181
- global_pool_conditions: bool = False,
182
- addition_embed_type_num_heads: int = 64,
183
- ):
184
- super().__init__()
185
-
186
- # If `num_attention_heads` is not defined (which is the case for most models)
187
- # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
188
- # The reason for this behavior is to correct for incorrectly named variables that were introduced
189
- # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
190
- # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
191
- # which is why we correct for the naming here.
192
- num_attention_heads = num_attention_heads or attention_head_dim
193
-
194
- # Check inputs
195
- if len(down_block_types) != len(up_block_types):
196
- raise ValueError(
197
- f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
198
- )
199
-
200
- if len(block_out_channels) != len(down_block_types):
201
- raise ValueError(
202
- f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
203
- )
204
-
205
- if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
206
- raise ValueError(
207
- f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
208
- )
209
-
210
- if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
211
- raise ValueError(
212
- f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
213
- )
214
-
215
- if isinstance(transformer_layers_per_block, int):
216
- transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
217
-
218
- # input
219
- conv_in_kernel = 3
220
- conv_in_padding = (conv_in_kernel - 1) // 2
221
- self.conv_in_condition = nn.Conv2d(
222
- in_channels + conditioning_channels,
223
- block_out_channels[0],
224
- kernel_size=conv_in_kernel,
225
- padding=conv_in_padding,
226
- )
227
-
228
- # time
229
- time_embed_dim = block_out_channels[0] * 4
230
- self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
231
- timestep_input_dim = block_out_channels[0]
232
- self.time_embedding = TimestepEmbedding(
233
- timestep_input_dim,
234
- time_embed_dim,
235
- act_fn=act_fn,
236
- )
237
-
238
- if encoder_hid_dim_type is None and encoder_hid_dim is not None:
239
- encoder_hid_dim_type = "text_proj"
240
- self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
241
- logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
242
-
243
- if encoder_hid_dim is None and encoder_hid_dim_type is not None:
244
- raise ValueError(
245
- f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
246
- )
247
-
248
- if encoder_hid_dim_type == "text_proj":
249
- self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
250
- elif encoder_hid_dim_type == "text_image_proj":
251
- # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
252
- # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
253
- # case when `encoder_hid_dim_type == "text_image_proj"` (Kandinsky 2.1)
254
- self.encoder_hid_proj = TextImageProjection(
255
- text_embed_dim=encoder_hid_dim,
256
- image_embed_dim=cross_attention_dim,
257
- cross_attention_dim=cross_attention_dim,
258
- )
259
-
260
- elif encoder_hid_dim_type is not None:
261
- raise ValueError(
262
- f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
263
- )
264
- else:
265
- self.encoder_hid_proj = None
266
-
267
- # class embedding
268
- if class_embed_type is None and num_class_embeds is not None:
269
- self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
270
- elif class_embed_type == "timestep":
271
- self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
272
- elif class_embed_type == "identity":
273
- self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
274
- elif class_embed_type == "projection":
275
- if projection_class_embeddings_input_dim is None:
276
- raise ValueError(
277
- "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
278
- )
279
- # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
280
- # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
281
- # 2. it projects from an arbitrary input dimension.
282
- #
283
- # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
284
- # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
285
- # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
286
- self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
287
- else:
288
- self.class_embedding = None
289
-
290
- if addition_embed_type == "text":
291
- if encoder_hid_dim is not None:
292
- text_time_embedding_from_dim = encoder_hid_dim
293
- else:
294
- text_time_embedding_from_dim = cross_attention_dim
295
-
296
- self.add_embedding = TextTimeEmbedding(
297
- text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
298
- )
299
- elif addition_embed_type == "text_image":
300
- # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
301
- # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
302
- # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
303
- self.add_embedding = TextImageTimeEmbedding(
304
- text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
305
- )
306
- elif addition_embed_type == "text_time":
307
- self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
308
- self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
309
-
310
- elif addition_embed_type is not None:
311
- raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
312
-
313
- self.down_blocks = nn.ModuleList([])
314
- self.brushnet_down_blocks = nn.ModuleList([])
315
-
316
- if isinstance(only_cross_attention, bool):
317
- only_cross_attention = [only_cross_attention] * len(down_block_types)
318
-
319
- if isinstance(attention_head_dim, int):
320
- attention_head_dim = (attention_head_dim,) * len(down_block_types)
321
-
322
- if isinstance(num_attention_heads, int):
323
- num_attention_heads = (num_attention_heads,) * len(down_block_types)
324
-
325
- # down
326
- output_channel = block_out_channels[0]
327
-
328
- brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
329
- brushnet_block = zero_module(brushnet_block)
330
- self.brushnet_down_blocks.append(brushnet_block)
331
-
332
- for i, down_block_type in enumerate(down_block_types):
333
- input_channel = output_channel
334
- output_channel = block_out_channels[i]
335
- is_final_block = i == len(block_out_channels) - 1
336
-
337
- down_block = get_down_block(
338
- down_block_type,
339
- num_layers=layers_per_block,
340
- transformer_layers_per_block=transformer_layers_per_block[i],
341
- in_channels=input_channel,
342
- out_channels=output_channel,
343
- temb_channels=time_embed_dim,
344
- add_downsample=not is_final_block,
345
- resnet_eps=norm_eps,
346
- resnet_act_fn=act_fn,
347
- resnet_groups=norm_num_groups,
348
- cross_attention_dim=cross_attention_dim,
349
- num_attention_heads=num_attention_heads[i],
350
- attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
351
- downsample_padding=downsample_padding,
352
- use_linear_projection=use_linear_projection,
353
- only_cross_attention=only_cross_attention[i],
354
- upcast_attention=upcast_attention,
355
- resnet_time_scale_shift=resnet_time_scale_shift,
356
- )
357
- self.down_blocks.append(down_block)
358
-
359
- for _ in range(layers_per_block):
360
- brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
361
- brushnet_block = zero_module(brushnet_block)
362
- self.brushnet_down_blocks.append(brushnet_block)
363
-
364
- if not is_final_block:
365
- brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
366
- brushnet_block = zero_module(brushnet_block)
367
- self.brushnet_down_blocks.append(brushnet_block)
368
-
369
- # mid
370
- mid_block_channel = block_out_channels[-1]
371
-
372
- brushnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1)
373
- brushnet_block = zero_module(brushnet_block)
374
- self.brushnet_mid_block = brushnet_block
375
-
376
- self.mid_block = get_mid_block(
377
- mid_block_type,
378
- transformer_layers_per_block=transformer_layers_per_block[-1],
379
- in_channels=mid_block_channel,
380
- temb_channels=time_embed_dim,
381
- resnet_eps=norm_eps,
382
- resnet_act_fn=act_fn,
383
- output_scale_factor=mid_block_scale_factor,
384
- resnet_time_scale_shift=resnet_time_scale_shift,
385
- cross_attention_dim=cross_attention_dim,
386
- num_attention_heads=num_attention_heads[-1],
387
- resnet_groups=norm_num_groups,
388
- use_linear_projection=use_linear_projection,
389
- upcast_attention=upcast_attention,
390
- )
391
-
392
- # count how many layers upsample the images
393
- self.num_upsamplers = 0
394
-
395
- # up
396
- reversed_block_out_channels = list(reversed(block_out_channels))
397
- reversed_num_attention_heads = list(reversed(num_attention_heads))
398
- reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
399
- only_cross_attention = list(reversed(only_cross_attention))
400
-
401
- output_channel = reversed_block_out_channels[0]
402
-
403
- self.up_blocks = nn.ModuleList([])
404
- self.brushnet_up_blocks = nn.ModuleList([])
405
-
406
- for i, up_block_type in enumerate(up_block_types):
407
- is_final_block = i == len(block_out_channels) - 1
408
-
409
- prev_output_channel = output_channel
410
- output_channel = reversed_block_out_channels[i]
411
- input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
412
-
413
- # add upsample block for all BUT final layer
414
- if not is_final_block:
415
- add_upsample = True
416
- self.num_upsamplers += 1
417
- else:
418
- add_upsample = False
419
-
420
- up_block = get_up_block(
421
- up_block_type,
422
- num_layers=layers_per_block + 1,
423
- transformer_layers_per_block=reversed_transformer_layers_per_block[i],
424
- in_channels=input_channel,
425
- out_channels=output_channel,
426
- prev_output_channel=prev_output_channel,
427
- temb_channels=time_embed_dim,
428
- add_upsample=add_upsample,
429
- resnet_eps=norm_eps,
430
- resnet_act_fn=act_fn,
431
- resolution_idx=i,
432
- resnet_groups=norm_num_groups,
433
- cross_attention_dim=cross_attention_dim,
434
- num_attention_heads=reversed_num_attention_heads[i],
435
- use_linear_projection=use_linear_projection,
436
- only_cross_attention=only_cross_attention[i],
437
- upcast_attention=upcast_attention,
438
- resnet_time_scale_shift=resnet_time_scale_shift,
439
- attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
440
- )
441
- self.up_blocks.append(up_block)
442
- prev_output_channel = output_channel
443
-
444
- for _ in range(layers_per_block + 1):
445
- brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
446
- brushnet_block = zero_module(brushnet_block)
447
- self.brushnet_up_blocks.append(brushnet_block)
448
-
449
- if not is_final_block:
450
- brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
451
- brushnet_block = zero_module(brushnet_block)
452
- self.brushnet_up_blocks.append(brushnet_block)
453
-
454
- @classmethod
455
- def from_unet(
456
- cls,
457
- unet: UNet2DConditionModel,
458
- brushnet_conditioning_channel_order: str = "rgb",
459
- conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
460
- load_weights_from_unet: bool = True,
461
- conditioning_channels: int = 5,
462
- ):
463
- r"""
464
- Instantiate a [`BrushNetModel`] from [`UNet2DConditionModel`].
465
-
466
- Parameters:
467
- unet (`UNet2DConditionModel`):
468
- The UNet model weights to copy to the [`BrushNetModel`]. All configuration options are also copied
469
- where applicable.
470
- """
471
- transformer_layers_per_block = (
472
- unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
473
- )
474
- encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None
475
- encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None
476
- addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None
477
- addition_time_embed_dim = (
478
- unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
479
- )
480
-
481
- brushnet = cls(
482
- in_channels=unet.config.in_channels,
483
- conditioning_channels=conditioning_channels,
484
- flip_sin_to_cos=unet.config.flip_sin_to_cos,
485
- freq_shift=unet.config.freq_shift,
486
- # down_block_types=['DownBlock2D','DownBlock2D','DownBlock2D','DownBlock2D'],
487
- down_block_types=[
488
- "CrossAttnDownBlock2D",
489
- "CrossAttnDownBlock2D",
490
- "CrossAttnDownBlock2D",
491
- "DownBlock2D",
492
- ],
493
- # mid_block_type='MidBlock2D',
494
- mid_block_type="UNetMidBlock2DCrossAttn",
495
- # up_block_types=['UpBlock2D','UpBlock2D','UpBlock2D','UpBlock2D'],
496
- up_block_types=["UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"],
497
- only_cross_attention=unet.config.only_cross_attention,
498
- block_out_channels=unet.config.block_out_channels,
499
- layers_per_block=unet.config.layers_per_block,
500
- downsample_padding=unet.config.downsample_padding,
501
- mid_block_scale_factor=unet.config.mid_block_scale_factor,
502
- act_fn=unet.config.act_fn,
503
- norm_num_groups=unet.config.norm_num_groups,
504
- norm_eps=unet.config.norm_eps,
505
- cross_attention_dim=unet.config.cross_attention_dim,
506
- transformer_layers_per_block=transformer_layers_per_block,
507
- encoder_hid_dim=encoder_hid_dim,
508
- encoder_hid_dim_type=encoder_hid_dim_type,
509
- attention_head_dim=unet.config.attention_head_dim,
510
- num_attention_heads=unet.config.num_attention_heads,
511
- use_linear_projection=unet.config.use_linear_projection,
512
- class_embed_type=unet.config.class_embed_type,
513
- addition_embed_type=addition_embed_type,
514
- addition_time_embed_dim=addition_time_embed_dim,
515
- num_class_embeds=unet.config.num_class_embeds,
516
- upcast_attention=unet.config.upcast_attention,
517
- resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
518
- projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
519
- brushnet_conditioning_channel_order=brushnet_conditioning_channel_order,
520
- conditioning_embedding_out_channels=conditioning_embedding_out_channels,
521
- )
522
-
523
- if load_weights_from_unet:
524
- conv_in_condition_weight = torch.zeros_like(brushnet.conv_in_condition.weight)
525
- conv_in_condition_weight[:, :4, ...] = unet.conv_in.weight
526
- conv_in_condition_weight[:, 4:8, ...] = unet.conv_in.weight
527
- brushnet.conv_in_condition.weight = torch.nn.Parameter(conv_in_condition_weight)
528
- brushnet.conv_in_condition.bias = unet.conv_in.bias
529
-
530
- brushnet.time_proj.load_state_dict(unet.time_proj.state_dict())
531
- brushnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())
532
-
533
- if brushnet.class_embedding:
534
- brushnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())
535
-
536
- brushnet.down_blocks.load_state_dict(unet.down_blocks.state_dict(), strict=False)
537
- brushnet.mid_block.load_state_dict(unet.mid_block.state_dict(), strict=False)
538
- brushnet.up_blocks.load_state_dict(unet.up_blocks.state_dict(), strict=False)
539
-
540
- return brushnet.to(unet.dtype)
541
-
542
- @property
543
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
544
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
545
- r"""
546
- Returns:
547
- `dict` of attention processors: A dictionary containing all attention processors used in the model,
548
- indexed by its weight name.
549
- """
550
- # set recursively
551
- processors = {}
552
-
553
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
554
- if hasattr(module, "get_processor"):
555
- processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
556
-
557
- for sub_name, child in module.named_children():
558
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
559
-
560
- return processors
561
-
562
- for name, module in self.named_children():
563
- fn_recursive_add_processors(name, module, processors)
564
-
565
- return processors
566
-
567
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
568
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
569
- r"""
570
- Sets the attention processor to use to compute attention.
571
-
572
- Parameters:
573
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
574
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
575
- for **all** `Attention` layers.
576
-
577
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
578
- processor. This is strongly recommended when setting trainable attention processors.
579
-
580
- """
581
- count = len(self.attn_processors.keys())
582
-
583
- if isinstance(processor, dict) and len(processor) != count:
584
- raise ValueError(
585
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
586
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
587
- )
588
-
589
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
590
- if hasattr(module, "set_processor"):
591
- if not isinstance(processor, dict):
592
- module.set_processor(processor)
593
- else:
594
- module.set_processor(processor.pop(f"{name}.processor"))
595
-
596
- for sub_name, child in module.named_children():
597
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
598
-
599
- for name, module in self.named_children():
600
- fn_recursive_attn_processor(name, module, processor)
601
-
602
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
603
- def set_default_attn_processor(self):
604
- """
605
- Disables custom attention processors and sets the default attention implementation.
606
- """
607
- if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
608
- processor = AttnAddedKVProcessor()
609
- elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
610
- processor = AttnProcessor()
611
- else:
612
- raise ValueError(
613
- f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
614
- )
615
-
616
- self.set_attn_processor(processor)
617
-
618
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
619
- def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
620
- r"""
621
- Enable sliced attention computation.
622
-
623
- When this option is enabled, the attention module splits the input tensor in slices to compute attention in
624
- several steps. This is useful for saving some memory in exchange for a small decrease in speed.
625
-
626
- Args:
627
- slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
628
- When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
629
- `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
630
- provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
631
- must be a multiple of `slice_size`.
632
- """
633
- sliceable_head_dims = []
634
-
635
- def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
636
- if hasattr(module, "set_attention_slice"):
637
- sliceable_head_dims.append(module.sliceable_head_dim)
638
-
639
- for child in module.children():
640
- fn_recursive_retrieve_sliceable_dims(child)
641
-
642
- # retrieve number of attention layers
643
- for module in self.children():
644
- fn_recursive_retrieve_sliceable_dims(module)
645
-
646
- num_sliceable_layers = len(sliceable_head_dims)
647
-
648
- if slice_size == "auto":
649
- # half the attention head size is usually a good trade-off between
650
- # speed and memory
651
- slice_size = [dim // 2 for dim in sliceable_head_dims]
652
- elif slice_size == "max":
653
- # make smallest slice possible
654
- slice_size = num_sliceable_layers * [1]
655
-
656
- slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
657
-
658
- if len(slice_size) != len(sliceable_head_dims):
659
- raise ValueError(
660
- f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
661
- f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
662
- )
663
-
664
- for i in range(len(slice_size)):
665
- size = slice_size[i]
666
- dim = sliceable_head_dims[i]
667
- if size is not None and size > dim:
668
- raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
669
-
670
- # Recursively walk through all the children.
671
- # Any children which exposes the set_attention_slice method
672
- # gets the message
673
- def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
674
- if hasattr(module, "set_attention_slice"):
675
- module.set_attention_slice(slice_size.pop())
676
-
677
- for child in module.children():
678
- fn_recursive_set_attention_slice(child, slice_size)
679
-
680
- reversed_slice_size = list(reversed(slice_size))
681
- for module in self.children():
682
- fn_recursive_set_attention_slice(module, reversed_slice_size)
683
-
684
- def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
685
- if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
686
- module.gradient_checkpointing = value
687
-
688
- def forward(
689
- self,
690
- sample: torch.FloatTensor,
691
- timestep: Union[torch.Tensor, float, int],
692
- encoder_hidden_states: torch.Tensor,
693
- brushnet_cond: torch.FloatTensor,
694
- conditioning_scale: float = 1.0,
695
- class_labels: Optional[torch.Tensor] = None,
696
- timestep_cond: Optional[torch.Tensor] = None,
697
- attention_mask: Optional[torch.Tensor] = None,
698
- added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
699
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
700
- guess_mode: bool = False,
701
- return_dict: bool = True,
702
- debug=False,
703
- ) -> Union[BrushNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]:
704
- """
705
- The [`BrushNetModel`] forward method.
706
-
707
- Args:
708
- sample (`torch.FloatTensor`):
709
- The noisy input tensor.
710
- timestep (`Union[torch.Tensor, float, int]`):
711
- The number of timesteps to denoise an input.
712
- encoder_hidden_states (`torch.Tensor`):
713
- The encoder hidden states.
714
- brushnet_cond (`torch.FloatTensor`):
715
- The conditional input tensor of shape `(batch_size, conditioning_channels, height, width)`.
716
- conditioning_scale (`float`, defaults to `1.0`):
717
- The scale factor for BrushNet outputs.
718
- class_labels (`torch.Tensor`, *optional*, defaults to `None`):
719
- Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
720
- timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
721
- Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
722
- timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
723
- embeddings.
724
- attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
725
- An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
726
- is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
727
- negative values to the attention scores corresponding to "discard" tokens.
728
- added_cond_kwargs (`dict`):
729
- Additional conditions for the Stable Diffusion XL UNet.
730
- cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
731
- A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
732
- guess_mode (`bool`, defaults to `False`):
733
- In this mode, the BrushNet encoder tries its best to recognize the input content of the input even if
734
- you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
735
- return_dict (`bool`, defaults to `True`):
736
- Whether or not to return a [`~models.brushnet.BrushNetOutput`] instead of a plain tuple.
737
-
738
- Returns:
739
- [`~models.brushnet.BrushNetOutput`] **or** `tuple`:
740
- If `return_dict` is `True`, a [`~models.brushnet.BrushNetOutput`] is returned, otherwise a tuple is
741
- returned where the first element is the sample tensor.
742
- """
743
- # check channel order
744
- channel_order = self.config.brushnet_conditioning_channel_order
745
-
746
- if channel_order == "rgb":
747
- # in rgb order by default
748
- ...
749
- elif channel_order == "bgr":
750
- brushnet_cond = torch.flip(brushnet_cond, dims=[1])
751
- else:
752
- raise ValueError(f"unknown `brushnet_conditioning_channel_order`: {channel_order}")
753
-
754
- if debug: print('BrushNet CA: attn mask')
755
-
756
- # prepare attention_mask
757
- if attention_mask is not None:
758
- attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
759
- attention_mask = attention_mask.unsqueeze(1)
760
-
761
- if debug: print('BrushNet CA: time')
762
-
763
- # 1. time
764
- timesteps = timestep
765
- if not torch.is_tensor(timesteps):
766
- # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
767
- # This would be a good case for the `match` statement (Python 3.10+)
768
- is_mps = sample.device.type == "mps"
769
- if isinstance(timestep, float):
770
- dtype = torch.float32 if is_mps else torch.float64
771
- else:
772
- dtype = torch.int32 if is_mps else torch.int64
773
- timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
774
- elif len(timesteps.shape) == 0:
775
- timesteps = timesteps[None].to(sample.device)
776
-
777
- # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
778
- timesteps = timesteps.expand(sample.shape[0])
779
-
780
- t_emb = self.time_proj(timesteps)
781
-
782
- # timesteps does not contain any weights and will always return f32 tensors
783
- # but time_embedding might actually be running in fp16. so we need to cast here.
784
- # there might be better ways to encapsulate this.
785
- t_emb = t_emb.to(dtype=sample.dtype)
786
-
787
- emb = self.time_embedding(t_emb, timestep_cond)
788
- aug_emb = None
789
-
790
- if self.class_embedding is not None:
791
- if class_labels is None:
792
- raise ValueError("class_labels should be provided when num_class_embeds > 0")
793
-
794
- if self.config.class_embed_type == "timestep":
795
- class_labels = self.time_proj(class_labels)
796
-
797
- class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
798
- emb = emb + class_emb
799
-
800
- if self.config.addition_embed_type is not None:
801
- if self.config.addition_embed_type == "text":
802
- aug_emb = self.add_embedding(encoder_hidden_states)
803
-
804
- elif self.config.addition_embed_type == "text_time":
805
- if "text_embeds" not in added_cond_kwargs:
806
- raise ValueError(
807
- f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
808
- )
809
- text_embeds = added_cond_kwargs.get("text_embeds")
810
- if "time_ids" not in added_cond_kwargs:
811
- raise ValueError(
812
- f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
813
- )
814
- time_ids = added_cond_kwargs.get("time_ids")
815
- time_embeds = self.add_time_proj(time_ids.flatten())
816
- time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
817
-
818
- add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
819
- add_embeds = add_embeds.to(emb.dtype)
820
- aug_emb = self.add_embedding(add_embeds)
821
-
822
- emb = emb + aug_emb if aug_emb is not None else emb
823
-
824
- if debug: print('BrushNet CA: pre-process')
825
-
826
-
827
- # 2. pre-process
828
- brushnet_cond = torch.concat([sample, brushnet_cond], 1)
829
- sample = self.conv_in_condition(brushnet_cond)
830
-
831
- if debug: print('BrushNet CA: down')
832
-
833
- # 3. down
834
- down_block_res_samples = (sample,)
835
- for downsample_block in self.down_blocks:
836
- if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
837
- if debug: print('BrushNet CA (down block with XA): ', type(downsample_block))
838
- sample, res_samples = downsample_block(
839
- hidden_states=sample,
840
- temb=emb,
841
- encoder_hidden_states=encoder_hidden_states,
842
- attention_mask=attention_mask,
843
- cross_attention_kwargs=cross_attention_kwargs,
844
- debug=debug,
845
- )
846
- else:
847
- if debug: print('BrushNet CA (down block): ', type(downsample_block))
848
- sample, res_samples = downsample_block(hidden_states=sample, temb=emb, debug=debug)
849
-
850
- down_block_res_samples += res_samples
851
-
852
- if debug: print('BrushNet CA: PP down')
853
-
854
- # 4. PaintingNet down blocks
855
- brushnet_down_block_res_samples = ()
856
- for down_block_res_sample, brushnet_down_block in zip(down_block_res_samples, self.brushnet_down_blocks):
857
- down_block_res_sample = brushnet_down_block(down_block_res_sample)
858
- brushnet_down_block_res_samples = brushnet_down_block_res_samples + (down_block_res_sample,)
859
-
860
- if debug: print('BrushNet CA: PP mid')
861
-
862
- # 5. mid
863
- if self.mid_block is not None:
864
- if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
865
- sample = self.mid_block(
866
- sample,
867
- emb,
868
- encoder_hidden_states=encoder_hidden_states,
869
- attention_mask=attention_mask,
870
- cross_attention_kwargs=cross_attention_kwargs,
871
- )
872
- else:
873
- sample = self.mid_block(sample, emb)
874
-
875
- if debug: print('BrushNet CA: mid')
876
-
877
- # 6. BrushNet mid blocks
878
- brushnet_mid_block_res_sample = self.brushnet_mid_block(sample)
879
-
880
- if debug: print('BrushNet CA: PP up')
881
-
882
- # 7. up
883
- up_block_res_samples = ()
884
- for i, upsample_block in enumerate(self.up_blocks):
885
- is_final_block = i == len(self.up_blocks) - 1
886
-
887
- res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
888
- down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
889
-
890
- # if we have not reached the final block and need to forward the
891
- # upsample size, we do it here
892
- if not is_final_block:
893
- upsample_size = down_block_res_samples[-1].shape[2:]
894
-
895
- if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
896
- sample, up_res_samples = upsample_block(
897
- hidden_states=sample,
898
- temb=emb,
899
- res_hidden_states_tuple=res_samples,
900
- encoder_hidden_states=encoder_hidden_states,
901
- cross_attention_kwargs=cross_attention_kwargs,
902
- upsample_size=upsample_size,
903
- attention_mask=attention_mask,
904
- return_res_samples=True,
905
- )
906
- else:
907
- sample, up_res_samples = upsample_block(
908
- hidden_states=sample,
909
- temb=emb,
910
- res_hidden_states_tuple=res_samples,
911
- upsample_size=upsample_size,
912
- return_res_samples=True,
913
- )
914
-
915
- up_block_res_samples += up_res_samples
916
-
917
- if debug: print('BrushNet CA: up')
918
-
919
- # 8. BrushNet up blocks
920
- brushnet_up_block_res_samples = ()
921
- for up_block_res_sample, brushnet_up_block in zip(up_block_res_samples, self.brushnet_up_blocks):
922
- up_block_res_sample = brushnet_up_block(up_block_res_sample)
923
- brushnet_up_block_res_samples = brushnet_up_block_res_samples + (up_block_res_sample,)
924
-
925
- if debug: print('BrushNet CA: scaling')
926
-
927
- # 9. scaling
928
- if guess_mode and not self.config.global_pool_conditions:
929
- scales = torch.logspace(
930
- -1,
931
- 0,
932
- len(brushnet_down_block_res_samples) + 1 + len(brushnet_up_block_res_samples),
933
- device=sample.device,
934
- ) # 0.1 to 1.0
935
- scales = scales * conditioning_scale
936
-
937
- brushnet_down_block_res_samples = [
938
- sample * scale
939
- for sample, scale in zip(
940
- brushnet_down_block_res_samples, scales[: len(brushnet_down_block_res_samples)]
941
- )
942
- ]
943
- brushnet_mid_block_res_sample = (
944
- brushnet_mid_block_res_sample * scales[len(brushnet_down_block_res_samples)]
945
- )
946
- brushnet_up_block_res_samples = [
947
- sample * scale
948
- for sample, scale in zip(
949
- brushnet_up_block_res_samples, scales[len(brushnet_down_block_res_samples) + 1 :]
950
- )
951
- ]
952
- else:
953
- brushnet_down_block_res_samples = [
954
- sample * conditioning_scale for sample in brushnet_down_block_res_samples
955
- ]
956
- brushnet_mid_block_res_sample = brushnet_mid_block_res_sample * conditioning_scale
957
- brushnet_up_block_res_samples = [sample * conditioning_scale for sample in brushnet_up_block_res_samples]
958
-
959
- if self.config.global_pool_conditions:
960
- brushnet_down_block_res_samples = [
961
- torch.mean(sample, dim=(2, 3), keepdim=True) for sample in brushnet_down_block_res_samples
962
- ]
963
- brushnet_mid_block_res_sample = torch.mean(brushnet_mid_block_res_sample, dim=(2, 3), keepdim=True)
964
- brushnet_up_block_res_samples = [
965
- torch.mean(sample, dim=(2, 3), keepdim=True) for sample in brushnet_up_block_res_samples
966
- ]
967
-
968
- if debug: print('BrushNet CA: finish')
969
-
970
- if not return_dict:
971
- return (brushnet_down_block_res_samples, brushnet_mid_block_res_sample, brushnet_up_block_res_samples)
972
-
973
- return BrushNetOutput(
974
- down_block_res_samples=brushnet_down_block_res_samples,
975
- mid_block_res_sample=brushnet_mid_block_res_sample,
976
- up_block_res_samples=brushnet_up_block_res_samples,
977
- )
978
-
979
-
980
- def zero_module(module):
981
- for p in module.parameters():
982
- nn.init.zeros_(p)
983
- return module
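A minimal usage sketch (not part of the deleted repository) of how the BrushNetModel removed in the brushnet.py diff above would typically be driven. It assumes the class is importable from the deleted module path, uses an SD1.5-style UNet, and the checkpoint id and tensor shapes are illustrative only.

import torch
from diffusers import UNet2DConditionModel
# Assumed import path, based on the deleted file MagicQuill/brushnet/brushnet.py
from MagicQuill.brushnet.brushnet import BrushNetModel

# Any SD1.5-style UNet works; this checkpoint id is only an example.
unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
brushnet = BrushNetModel.from_unet(unet, conditioning_channels=5)

latents = torch.randn(1, 4, 64, 64)        # noisy latent sample (in_channels=4)
brushnet_cond = torch.randn(1, 5, 64, 64)  # masked-image latent (4 ch) + mask (1 ch)
text_emb = torch.randn(1, 77, 768)         # encoder_hidden_states, cross_attention_dim=768 for SD1.5

out = brushnet(
    sample=latents,
    timestep=999,
    encoder_hidden_states=text_emb,
    brushnet_cond=brushnet_cond,
    conditioning_scale=1.0,
)
# out.down_block_res_samples, out.mid_block_res_sample and out.up_block_res_samples
# are the zero-conv residuals that the caller adds into the main UNet while denoising.

Note that `forward` concatenates `sample` and `brushnet_cond` along the channel axis, so the two must share spatial size and together provide the `in_channels + conditioning_channels` (4 + 5 = 9) channels expected by `conv_in_condition`.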
MagicQuill/brushnet/brushnet_xl.json DELETED
@@ -1,63 +0,0 @@
1
- {
2
- "_class_name": "BrushNetModel",
3
- "_diffusers_version": "0.27.0.dev0",
4
- "_name_or_path": "runs/logs/brushnetsdxl_randommask/checkpoint-80000",
5
- "act_fn": "silu",
6
- "addition_embed_type": "text_time",
7
- "addition_embed_type_num_heads": 64,
8
- "addition_time_embed_dim": 256,
9
- "attention_head_dim": [
10
- 5,
11
- 10,
12
- 20
13
- ],
14
- "block_out_channels": [
15
- 320,
16
- 640,
17
- 1280
18
- ],
19
- "brushnet_conditioning_channel_order": "rgb",
20
- "class_embed_type": null,
21
- "conditioning_channels": 5,
22
- "conditioning_embedding_out_channels": [
23
- 16,
24
- 32,
25
- 96,
26
- 256
27
- ],
28
- "cross_attention_dim": 2048,
29
- "down_block_types": [
30
- "DownBlock2D",
31
- "DownBlock2D",
32
- "DownBlock2D"
33
- ],
34
- "downsample_padding": 1,
35
- "encoder_hid_dim": null,
36
- "encoder_hid_dim_type": null,
37
- "flip_sin_to_cos": true,
38
- "freq_shift": 0,
39
- "global_pool_conditions": false,
40
- "in_channels": 4,
41
- "layers_per_block": 2,
42
- "mid_block_scale_factor": 1,
43
- "mid_block_type": "MidBlock2D",
44
- "norm_eps": 1e-05,
45
- "norm_num_groups": 32,
46
- "num_attention_heads": null,
47
- "num_class_embeds": null,
48
- "only_cross_attention": false,
49
- "projection_class_embeddings_input_dim": 2816,
50
- "resnet_time_scale_shift": "default",
51
- "transformer_layers_per_block": [
52
- 1,
53
- 2,
54
- 10
55
- ],
56
- "up_block_types": [
57
- "UpBlock2D",
58
- "UpBlock2D",
59
- "UpBlock2D"
60
- ],
61
- "upcast_attention": null,
62
- "use_linear_projection": true
63
- }
MagicQuill/brushnet/powerpaint.json DELETED
@@ -1,57 +0,0 @@
1
- {
2
- "_class_name": "BrushNetModel",
3
- "_diffusers_version": "0.27.2",
4
- "act_fn": "silu",
5
- "addition_embed_type": null,
6
- "addition_embed_type_num_heads": 64,
7
- "addition_time_embed_dim": null,
8
- "attention_head_dim": 8,
9
- "block_out_channels": [
10
- 320,
11
- 640,
12
- 1280,
13
- 1280
14
- ],
15
- "brushnet_conditioning_channel_order": "rgb",
16
- "class_embed_type": null,
17
- "conditioning_channels": 5,
18
- "conditioning_embedding_out_channels": [
19
- 16,
20
- 32,
21
- 96,
22
- 256
23
- ],
24
- "cross_attention_dim": 768,
25
- "down_block_types": [
26
- "CrossAttnDownBlock2D",
27
- "CrossAttnDownBlock2D",
28
- "CrossAttnDownBlock2D",
29
- "DownBlock2D"
30
- ],
31
- "downsample_padding": 1,
32
- "encoder_hid_dim": null,
33
- "encoder_hid_dim_type": null,
34
- "flip_sin_to_cos": true,
35
- "freq_shift": 0,
36
- "global_pool_conditions": false,
37
- "in_channels": 4,
38
- "layers_per_block": 2,
39
- "mid_block_scale_factor": 1,
40
- "mid_block_type": "UNetMidBlock2DCrossAttn",
41
- "norm_eps": 1e-05,
42
- "norm_num_groups": 32,
43
- "num_attention_heads": null,
44
- "num_class_embeds": null,
45
- "only_cross_attention": false,
46
- "projection_class_embeddings_input_dim": null,
47
- "resnet_time_scale_shift": "default",
48
- "transformer_layers_per_block": 1,
49
- "up_block_types": [
50
- "UpBlock2D",
51
- "CrossAttnUpBlock2D",
52
- "CrossAttnUpBlock2D",
53
- "CrossAttnUpBlock2D"
54
- ],
55
- "upcast_attention": false,
56
- "use_linear_projection": false
57
- }
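A hedged sketch (not from the original repository) of how a config such as powerpaint.json above could be materialized into a randomly initialized BrushNetModel, assuming the class follows the usual diffusers ConfigMixin/ModelMixin conventions implied by its use of @register_to_config in the deleted brushnet.py.

import json
# Assumed import path, based on the deleted file layout.
from MagicQuill.brushnet.brushnet import BrushNetModel

with open("MagicQuill/brushnet/powerpaint.json") as f:
    config = json.load(f)

# from_config comes from diffusers' ConfigMixin; private keys such as
# "_class_name" and "_diffusers_version" should be ignored, and the model
# is built with randomly initialized weights.
model = BrushNetModel.from_config(config)
print(model.config.cross_attention_dim)  # 768 for this SD1.5-based PowerPaint config

Trained weights would still need to be loaded separately, e.g. from the corresponding PowerPaint/BrushNet state dict.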
MagicQuill/brushnet/powerpaint_utils.py DELETED
@@ -1,496 +0,0 @@
1
- import copy
2
- import random
3
-
4
- import torch
5
- import torch.nn as nn
6
- from transformers import CLIPTokenizer
7
- from typing import Any, List, Optional, Union
8
-
9
- class TokenizerWrapper:
10
- """Tokenizer wrapper for CLIPTokenizer. Only support CLIPTokenizer
11
- currently. This wrapper is modified from https://github.com/huggingface/dif
12
- fusers/blob/e51f19aee82c8dd874b715a09dbc521d88835d68/src/diffusers/loaders.
13
- py#L358 # noqa.
14
-
15
- Args:
16
- from_pretrained (Union[str, os.PathLike], optional): The *model id*
17
- of a pretrained model or a path to a *directory* containing
18
- model weights and config. Defaults to None.
19
- from_config (Union[str, os.PathLike], optional): The *model id*
20
- of a pretrained model or a path to a *directory* containing
21
- model weights and config. Defaults to None.
22
-
23
- *args, **kwargs: If `from_pretrained` is passed, *args and **kwargs
24
- will be passed to `from_pretrained` function. Otherwise, *args
25
- and **kwargs will be used to initialize the model by
26
- `self._module_cls(*args, **kwargs)`.
27
- """
28
-
29
- def __init__(self, tokenizer: CLIPTokenizer):
30
- self.wrapped = tokenizer
31
- self.token_map = {}
32
-
33
- def __getattr__(self, name: str) -> Any:
34
- if name in self.__dict__:
35
- return getattr(self, name)
36
- #if name == "wrapped":
37
- # return getattr(self, 'wrapped')#super().__getattr__("wrapped")
38
-
39
- try:
40
- return getattr(self.wrapped, name)
41
- except AttributeError:
42
- raise AttributeError(
43
- "'name' cannot be found in both "
44
- f"'{self.__class__.__name__}' and "
45
- f"'{self.__class__.__name__}.tokenizer'."
46
- )
47
-
48
- def try_adding_tokens(self, tokens: Union[str, List[str]], *args, **kwargs):
49
- """Attempt to add tokens to the tokenizer.
50
-
51
- Args:
52
- tokens (Union[str, List[str]]): The tokens to be added.
53
- """
54
- num_added_tokens = self.wrapped.add_tokens(tokens, *args, **kwargs)
55
- assert num_added_tokens != 0, (
56
- f"The tokenizer already contains the token {tokens}. Please pass "
57
- "a different `placeholder_token` that is not already in the "
58
- "tokenizer."
59
- )
60
-
61
- def get_token_info(self, token: str) -> dict:
62
- """Get the information of a token, including its start and end index in
63
- the current tokenizer.
64
-
65
- Args:
66
- token (str): The token to be queried.
67
-
68
- Returns:
69
- dict: The information of the token, including its start and end
70
- index in current tokenizer.
71
- """
72
- token_ids = self.__call__(token).input_ids
73
- start, end = token_ids[1], token_ids[-2] + 1
74
- return {"name": token, "start": start, "end": end}
75
-
76
- def add_placeholder_token(self, placeholder_token: str, *args, num_vec_per_token: int = 1, **kwargs):
77
- """Add placeholder tokens to the tokenizer.
78
-
79
- Args:
80
- placeholder_token (str): The placeholder token to be added.
81
- num_vec_per_token (int, optional): The number of vectors of
82
- the added placeholder token.
83
- *args, **kwargs: The arguments for `self.wrapped.add_tokens`.
84
- """
85
- output = []
86
- if num_vec_per_token == 1:
87
- self.try_adding_tokens(placeholder_token, *args, **kwargs)
88
- output.append(placeholder_token)
89
- else:
90
- output = []
91
- for i in range(num_vec_per_token):
92
- ith_token = placeholder_token + f"_{i}"
93
- self.try_adding_tokens(ith_token, *args, **kwargs)
94
- output.append(ith_token)
95
-
96
- for token in self.token_map:
97
- if token in placeholder_token:
98
- raise ValueError(
99
- f"The tokenizer already has placeholder token {token} "
100
- f"that can get confused with {placeholder_token} "
101
- "keep placeholder tokens independent"
102
- )
103
- self.token_map[placeholder_token] = output
104
-
105
- def replace_placeholder_tokens_in_text(
106
- self, text: Union[str, List[str]], vector_shuffle: bool = False, prop_tokens_to_load: float = 1.0
107
- ) -> Union[str, List[str]]:
108
- """Replace the keywords in text with placeholder tokens. This function
109
- will be called in `self.__call__` and `self.encode`.
110
-
111
- Args:
112
- text (Union[str, List[str]]): The text to be processed.
113
- vector_shuffle (bool, optional): Whether to shuffle the vectors.
114
- Defaults to False.
115
- prop_tokens_to_load (float, optional): The proportion of tokens to
116
- be loaded. If 1.0, all tokens will be loaded. Defaults to 1.0.
117
-
118
- Returns:
119
- Union[str, List[str]]: The processed text.
120
- """
121
- if isinstance(text, list):
122
- output = []
123
- for i in range(len(text)):
124
- output.append(self.replace_placeholder_tokens_in_text(text[i], vector_shuffle=vector_shuffle))
125
- return output
126
-
127
- for placeholder_token in self.token_map:
128
- if placeholder_token in text:
129
- tokens = self.token_map[placeholder_token]
130
- tokens = tokens[: 1 + int(len(tokens) * prop_tokens_to_load)]
131
- if vector_shuffle:
132
- tokens = copy.copy(tokens)
133
- random.shuffle(tokens)
134
- text = text.replace(placeholder_token, " ".join(tokens))
135
- return text
136
-
137
- def replace_text_with_placeholder_tokens(self, text: Union[str, List[str]]) -> Union[str, List[str]]:
138
- """Replace the placeholder tokens in text with the original keywords.
139
- This function will be called in `self.decode`.
140
-
141
- Args:
142
- text (Union[str, List[str]]): The text to be processed.
143
-
144
- Returns:
145
- Union[str, List[str]]: The processed text.
146
- """
147
- if isinstance(text, list):
148
- output = []
149
- for i in range(len(text)):
150
- output.append(self.replace_text_with_placeholder_tokens(text[i]))
151
- return output
152
-
153
- for placeholder_token, tokens in self.token_map.items():
154
- merged_tokens = " ".join(tokens)
155
- if merged_tokens in text:
156
- text = text.replace(merged_tokens, placeholder_token)
157
- return text
158
-
159
- def __call__(
160
- self,
161
- text: Union[str, List[str]],
162
- *args,
163
- vector_shuffle: bool = False,
164
- prop_tokens_to_load: float = 1.0,
165
- **kwargs,
166
- ):
167
- """The call function of the wrapper.
168
-
169
- Args:
170
- text (Union[str, List[str]]): The text to be tokenized.
171
- vector_shuffle (bool, optional): Whether to shuffle the vectors.
172
- Defaults to False.
173
- prop_tokens_to_load (float, optional): The proportion of tokens to
174
- be loaded. If 1.0, all tokens will be loaded. Defaults to 1.0
175
- *args, **kwargs: The arguments for `self.wrapped.__call__`.
176
- """
177
- replaced_text = self.replace_placeholder_tokens_in_text(
178
- text, vector_shuffle=vector_shuffle, prop_tokens_to_load=prop_tokens_to_load
179
- )
180
-
181
- return self.wrapped.__call__(replaced_text, *args, **kwargs)
182
-
183
- def encode(self, text: Union[str, List[str]], *args, **kwargs):
184
- """Encode the passed text to token index.
185
-
186
- Args:
187
- text (Union[str, List[str]]): The text to be encode.
188
- *args, **kwargs: The arguments for `self.wrapped.__call__`.
189
- """
190
- replaced_text = self.replace_placeholder_tokens_in_text(text)
191
- return self.wrapped(replaced_text, *args, **kwargs)
192
-
193
- def decode(self, token_ids, return_raw: bool = False, *args, **kwargs) -> Union[str, List[str]]:
194
- """Decode the token index to text.
195
-
196
- Args:
197
- token_ids: The token index to be decoded.
198
- return_raw: Whether keep the placeholder token in the text.
199
- Defaults to False.
200
- *args, **kwargs: The arguments for `self.wrapped.decode`.
201
-
202
- Returns:
203
- Union[str, List[str]]: The decoded text.
204
- """
205
- text = self.wrapped.decode(token_ids, *args, **kwargs)
206
- if return_raw:
207
- return text
208
- replaced_text = self.replace_text_with_placeholder_tokens(text)
209
- return replaced_text
210
-
211
- def __repr__(self):
212
- """The representation of the wrapper."""
213
- s = super().__repr__()
214
- prefix = f"Wrapped Module Class: {self._module_cls}\n"
215
- prefix += f"Wrapped Module Name: {self._module_name}\n"
216
- if self._from_pretrained:
217
- prefix += f"From Pretrained: {self._from_pretrained}\n"
218
- s = prefix + s
219
- return s
220
-
221
-
222
- class EmbeddingLayerWithFixes(nn.Module):
223
- """The revised embedding layer to support external embeddings. This design
224
- of this class is inspired by https://github.com/AUTOMATIC1111/stable-
225
- diffusion-webui/blob/22bcc7be428c94e9408f589966c2040187245d81/modules/sd_hi
226
- jack.py#L224 # noqa.
227
-
228
- Args:
229
- wrapped (nn.Embedding): The embedding layer to be wrapped.
230
- external_embeddings (Union[dict, List[dict]], optional): The external
231
- embeddings added to this layer. Defaults to None.
232
- """
233
-
234
- def __init__(self, wrapped: nn.Embedding, external_embeddings: Optional[Union[dict, List[dict]]] = None):
235
- super().__init__()
236
- self.wrapped = wrapped
237
- self.num_embeddings = wrapped.weight.shape[0]
238
-
239
- self.external_embeddings = []
240
- if external_embeddings:
241
- self.add_embeddings(external_embeddings)
242
-
243
- self.trainable_embeddings = nn.ParameterDict()
244
-
245
- @property
246
- def weight(self):
247
- """Get the weight of wrapped embedding layer."""
248
- return self.wrapped.weight
249
-
250
- def check_duplicate_names(self, embeddings: List[dict]):
251
- """Check whether duplicate names exist in list of 'external
252
- embeddings'.
253
-
254
- Args:
255
- embeddings (List[dict]): A list of embedding to be check.
256
- """
257
- names = [emb["name"] for emb in embeddings]
258
- assert len(names) == len(set(names)), (
259
- "Found duplicated names in 'external_embeddings'. Name list: " f"'{names}'"
260
- )
261
-
262
- def check_ids_overlap(self, embeddings):
263
- """Check whether overlap exist in token ids of 'external_embeddings'.
264
-
265
- Args:
266
- embeddings (List[dict]): A list of embedding to be check.
267
- """
268
- ids_range = [[emb["start"], emb["end"], emb["name"]] for emb in embeddings]
269
- ids_range.sort() # sort by 'start'
270
- # check if 'end' has overlapping
271
- for idx in range(len(ids_range) - 1):
272
- name1, name2 = ids_range[idx][-1], ids_range[idx + 1][-1]
273
- assert ids_range[idx][1] <= ids_range[idx + 1][0], (
274
- f"Found ids overlapping between embeddings '{name1}' " f"and '{name2}'."
275
- )
276
-
277
- def add_embeddings(self, embeddings: Optional[Union[dict, List[dict]]]):
278
- """Add external embeddings to this layer.
279
-
280
- Use case:
281
-
282
- >>> 1. Add token to tokenizer and get the token id.
283
- >>> tokenizer = TokenizerWrapper('openai/clip-vit-base-patch32')
284
- >>> # 'how much' in kiswahili
285
- >>> tokenizer.add_placeholder_tokens('ngapi', num_vec_per_token=4)
286
- >>>
287
- >>> 2. Add external embeddings to the model.
288
- >>> new_embedding = {
289
- >>> 'name': 'ngapi', # 'how much' in kiswahili
290
- >>> 'embedding': torch.ones(1, 15) * 4,
291
- >>> 'start': tokenizer.get_token_info('ngapi')['start'],
292
- >>> 'end': tokenizer.get_token_info('ngapi')['end'],
293
- >>> 'trainable': False # if True, will registry as a parameter
294
- >>> }
295
- >>> embedding_layer = nn.Embedding(10, 15)
296
- >>> embedding_layer_wrapper = EmbeddingLayerWithFixes(embedding_layer)
297
- >>> embedding_layer_wrapper.add_embeddings(new_embedding)
298
- >>>
299
- >>> 3. Forward tokenizer and embedding layer!
300
- >>> input_text = ['hello, ngapi!', 'hello my friend, ngapi?']
301
- >>> input_ids = tokenizer(
302
- >>> input_text, padding='max_length', truncation=True,
303
- >>> return_tensors='pt')['input_ids']
304
- >>> out_feat = embedding_layer_wrapper(input_ids)
305
- >>>
306
- >>> 4. Let's validate the result!
307
- >>> assert (out_feat[0, 3: 7] == 4).all()
308
- >>> assert (out_feat[1, 5: 9] == 4).all()
309
-
310
- Args:
311
- embeddings (Union[dict, list[dict]]): The external embeddings to
312
- be added. Each dict must contain the following 4 fields: 'name'
313
- (the name of this embedding), 'embedding' (the embedding
314
- tensor), 'start' (the start token id of this embedding), 'end'
315
- (the end token id of this embedding). For example:
316
- `{name: NAME, start: START, end: END, embedding: torch.Tensor}`
317
- """
318
- if isinstance(embeddings, dict):
319
- embeddings = [embeddings]
320
-
321
- self.external_embeddings += embeddings
322
- self.check_duplicate_names(self.external_embeddings)
323
- self.check_ids_overlap(self.external_embeddings)
324
-
325
- # set for trainable
326
- added_trainable_emb_info = []
327
- for embedding in embeddings:
328
- trainable = embedding.get("trainable", False)
329
- if trainable:
330
- name = embedding["name"]
331
- embedding["embedding"] = torch.nn.Parameter(embedding["embedding"])
332
- self.trainable_embeddings[name] = embedding["embedding"]
333
- added_trainable_emb_info.append(name)
334
-
335
- added_emb_info = [emb["name"] for emb in embeddings]
336
- added_emb_info = ", ".join(added_emb_info)
337
- print(f"Successfully add external embeddings: {added_emb_info}.", "current")
338
-
339
- if added_trainable_emb_info:
340
- added_trainable_emb_info = ", ".join(added_trainable_emb_info)
341
- print("Successfully add trainable external embeddings: " f"{added_trainable_emb_info}", "current")
342
-
343
- def replace_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
344
- """Replace external input ids to 0.
345
-
346
- Args:
347
- input_ids (torch.Tensor): The input ids to be replaced.
348
-
349
- Returns:
350
- torch.Tensor: The replaced input ids.
351
- """
352
- input_ids_fwd = input_ids.clone()
353
- input_ids_fwd[input_ids_fwd >= self.num_embeddings] = 0
354
- return input_ids_fwd
355
-
356
- def replace_embeddings(
357
- self, input_ids: torch.Tensor, embedding: torch.Tensor, external_embedding: dict
358
- ) -> torch.Tensor:
359
- """Replace external embedding to the embedding layer. Noted that, in
360
- this function we use `torch.cat` to avoid inplace modification.
361
-
362
- Args:
363
- input_ids (torch.Tensor): The original token ids. Shape like
364
- [LENGTH, ].
365
- embedding (torch.Tensor): The embedding of token ids after
366
- `replace_input_ids` function.
367
- external_embedding (dict): The external embedding to be replaced.
368
-
369
- Returns:
370
- torch.Tensor: The replaced embedding.
371
- """
372
- new_embedding = []
373
-
374
- name = external_embedding["name"]
375
- start = external_embedding["start"]
376
- end = external_embedding["end"]
377
- target_ids_to_replace = [i for i in range(start, end)]
378
- ext_emb = external_embedding["embedding"]
379
-
380
- # do not need to replace
381
- if not (input_ids == start).any():
382
- return embedding
383
-
384
- # start replace
385
- s_idx, e_idx = 0, 0
386
- while e_idx < len(input_ids):
387
- if input_ids[e_idx] == start:
388
- if e_idx != 0:
389
- # keep the embeddings that do not need replacement
390
- new_embedding.append(embedding[s_idx:e_idx])
391
-
392
- # check that the run of ids to be replaced is valid
393
- actually_ids_to_replace = [int(i) for i in input_ids[e_idx : e_idx + end - start]]
394
- assert actually_ids_to_replace == target_ids_to_replace, (
395
- f"Invalid 'input_ids' in position: {s_idx} to {e_idx}. "
396
- f"Expect '{target_ids_to_replace}' for embedding "
397
- f"'{name}' but found '{actually_ids_to_replace}'."
398
- )
399
-
400
- new_embedding.append(ext_emb)
401
-
402
- s_idx = e_idx + end - start
403
- e_idx = s_idx + 1
404
- else:
405
- e_idx += 1
406
-
407
- if e_idx == len(input_ids):
408
- new_embedding.append(embedding[s_idx:e_idx])
409
-
410
- return torch.cat(new_embedding, dim=0)
411
-
412
- def forward(self, input_ids: torch.Tensor, external_embeddings: Optional[List[dict]] = None):
413
- """The forward function.
414
-
415
- Args:
416
- input_ids (torch.Tensor): The token ids shape like [bz, LENGTH] or
417
- [LENGTH, ].
418
- external_embeddings (Optional[List[dict]]): The external
419
- embeddings. If not passed, only `self.external_embeddings`
420
- will be used. Defaults to None.
421
-
422
- input_ids: shape like [bz, LENGTH] or [LENGTH].
423
- """
424
- assert input_ids.ndim in [1, 2]
425
- if input_ids.ndim == 1:
426
- input_ids = input_ids.unsqueeze(0)
427
-
428
- if external_embeddings is None and not self.external_embeddings:
429
- return self.wrapped(input_ids)
430
-
431
- input_ids_fwd = self.replace_input_ids(input_ids)
432
- inputs_embeds = self.wrapped(input_ids_fwd)
433
-
434
- vecs = []
435
-
436
- if external_embeddings is None:
437
- external_embeddings = []
438
- elif isinstance(external_embeddings, dict):
439
- external_embeddings = [external_embeddings]
440
- embeddings = self.external_embeddings + external_embeddings
441
-
442
- for input_id, embedding in zip(input_ids, inputs_embeds):
443
- new_embedding = embedding
444
- for external_embedding in embeddings:
445
- new_embedding = self.replace_embeddings(input_id, new_embedding, external_embedding)
446
- vecs.append(new_embedding)
447
-
448
- return torch.stack(vecs)
449
-
450
-
451
-
452
- def add_tokens(
453
- tokenizer, text_encoder, placeholder_tokens: list, initialize_tokens: list = None, num_vectors_per_token: int = 1
454
- ):
455
- """Add token for training.
456
-
457
- # TODO: support add tokens as dict, then we can load pretrained tokens.
458
- """
459
- if initialize_tokens is not None:
460
- assert len(initialize_tokens) == len(
461
- placeholder_tokens
462
- ), "placeholder_token should be the same length as initialize_token"
463
- for ii in range(len(placeholder_tokens)):
464
- tokenizer.add_placeholder_token(placeholder_tokens[ii], num_vec_per_token=num_vectors_per_token)
465
-
466
- # text_encoder.set_embedding_layer()
467
- embedding_layer = text_encoder.text_model.embeddings.token_embedding
468
- text_encoder.text_model.embeddings.token_embedding = EmbeddingLayerWithFixes(embedding_layer)
469
- embedding_layer = text_encoder.text_model.embeddings.token_embedding
470
-
471
- assert embedding_layer is not None, (
472
- "Do not support get embedding layer for current text encoder. " "Please check your configuration."
473
- )
474
- initialize_embedding = []
475
- if initialize_tokens is not None:
476
- for ii in range(len(placeholder_tokens)):
477
- init_id = tokenizer(initialize_tokens[ii]).input_ids[1]
478
- temp_embedding = embedding_layer.weight[init_id]
479
- initialize_embedding.append(temp_embedding[None, ...].repeat(num_vectors_per_token, 1))
480
- else:
481
- for ii in range(len(placeholder_tokens)):
482
- init_id = tokenizer("a").input_ids[1]
483
- temp_embedding = embedding_layer.weight[init_id]
484
- len_emb = temp_embedding.shape[0]
485
- init_weight = (torch.rand(num_vectors_per_token, len_emb) - 0.5) / 2.0
486
- initialize_embedding.append(init_weight)
487
-
488
- # initialize_embedding = torch.cat(initialize_embedding,dim=0)
489
-
490
- token_info_all = []
491
- for ii in range(len(placeholder_tokens)):
492
- token_info = tokenizer.get_token_info(placeholder_tokens[ii])
493
- token_info["embedding"] = initialize_embedding[ii]
494
- token_info["trainable"] = True
495
- token_info_all.append(token_info)
496
- embedding_layer.add_embeddings(token_info_all)
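
A quick orientation on the class deleted above: `EmbeddingLayerWithFixes` zeroes out any token id that falls outside the wrapped table (`replace_input_ids`) and then splices the externally supplied vectors back into the output (`replace_embeddings`). The sketch below is illustrative only; it assumes the class constructor (defined earlier in this file, outside this excerpt) sets up the `wrapped`, `num_embeddings`, `external_embeddings`, and `trainable_embeddings` attributes that the methods shown here rely on.

import torch
import torch.nn as nn

# Tiny stand-in table; ids >= num_embeddings (10 here) act as placeholder ids.
embedding_layer = nn.Embedding(10, 15)
wrapper = EmbeddingLayerWithFixes(embedding_layer)

wrapper.add_embeddings({
    "name": "ngapi",
    "embedding": torch.ones(4, 15) * 4,  # one row per placeholder position
    "start": 10,
    "end": 14,
    "trainable": False,
})

# ids 10..13 form the placeholder run: they are zeroed before the lookup and
# their rows are swapped for the external vectors afterwards.
input_ids = torch.tensor([[1, 2, 10, 11, 12, 13, 3]])
out = wrapper(input_ids)
assert out.shape == (1, 7, 15)
assert (out[0, 2:6] == 4).all()

The `add_tokens` helper at the end of the file automates the same wiring for a CLIP text encoder: it registers the placeholder tokens on the tokenizer, wraps `text_encoder.text_model.embeddings.token_embedding`, and adds one trainable embedding per placeholder token.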
 
MagicQuill/brushnet/unet_2d_blocks.py DELETED
The diff for this file is too large to render. See raw diff
 
MagicQuill/brushnet/unet_2d_condition.py DELETED
@@ -1,1355 +0,0 @@
1
- # Copyright 2024 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from dataclasses import dataclass
15
- from typing import Any, Dict, List, Optional, Tuple, Union
16
-
17
- import torch
18
- import torch.nn as nn
19
- import torch.utils.checkpoint
20
-
21
- from diffusers.configuration_utils import ConfigMixin, register_to_config
22
- from diffusers.loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin
23
- from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
24
- from diffusers.models.activations import get_activation
25
- from diffusers.models.attention_processor import (
26
- ADDED_KV_ATTENTION_PROCESSORS,
27
- CROSS_ATTENTION_PROCESSORS,
28
- Attention,
29
- AttentionProcessor,
30
- AttnAddedKVProcessor,
31
- AttnProcessor,
32
- )
33
- from diffusers.models.embeddings import (
34
- GaussianFourierProjection,
35
- GLIGENTextBoundingboxProjection,
36
- ImageHintTimeEmbedding,
37
- ImageProjection,
38
- ImageTimeEmbedding,
39
- TextImageProjection,
40
- TextImageTimeEmbedding,
41
- TextTimeEmbedding,
42
- TimestepEmbedding,
43
- Timesteps,
44
- )
45
- from diffusers.models.modeling_utils import ModelMixin
46
- from .unet_2d_blocks import (
47
- get_down_block,
48
- get_mid_block,
49
- get_up_block,
50
- )
51
-
52
-
53
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
54
-
55
-
56
- @dataclass
57
- class UNet2DConditionOutput(BaseOutput):
58
- """
59
- The output of [`UNet2DConditionModel`].
60
-
61
- Args:
62
- sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
63
- The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
64
- """
65
-
66
- sample: torch.FloatTensor = None
67
-
68
-
69
- class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
70
- r"""
71
- A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
72
- shaped output.
73
-
74
- This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
75
- for all models (such as downloading or saving).
76
-
77
- Parameters:
78
- sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
79
- Height and width of input/output sample.
80
- in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
81
- out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
82
- center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
83
- flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
84
- Whether to flip the sin to cos in the time embedding.
85
- freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
86
- down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
87
- The tuple of downsample blocks to use.
88
- mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
89
- Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
90
- `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
91
- up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
92
- The tuple of upsample blocks to use.
93
- only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
94
- Whether to include self-attention in the basic transformer blocks, see
95
- [`~models.attention.BasicTransformerBlock`].
96
- block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
97
- The tuple of output channels for each block.
98
- layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
99
- downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
100
- mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
101
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
102
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
103
- norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
104
- If `None`, normalization and activation layers are skipped in post-processing.
105
- norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
106
- cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
107
- The dimension of the cross attention features.
108
- transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
109
- The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
110
- [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
111
- [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
112
- reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
113
- The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
114
- blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
115
- [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
116
- [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
117
- encoder_hid_dim (`int`, *optional*, defaults to None):
118
- If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
119
- dimension to `cross_attention_dim`.
120
- encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
121
- If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
122
- embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
123
- attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
124
- num_attention_heads (`int`, *optional*):
125
- The number of attention heads. If not defined, defaults to `attention_head_dim`
126
- resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
127
- for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
128
- class_embed_type (`str`, *optional*, defaults to `None`):
129
- The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
130
- `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
131
- addition_embed_type (`str`, *optional*, defaults to `None`):
132
- Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
133
- "text". "text" will use the `TextTimeEmbedding` layer.
134
- addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
135
- Dimension for the timestep embeddings.
136
- num_class_embeds (`int`, *optional*, defaults to `None`):
137
- Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
138
- class conditioning with `class_embed_type` equal to `None`.
139
- time_embedding_type (`str`, *optional*, defaults to `positional`):
140
- The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
141
- time_embedding_dim (`int`, *optional*, defaults to `None`):
142
- An optional override for the dimension of the projected time embedding.
143
- time_embedding_act_fn (`str`, *optional*, defaults to `None`):
144
- Optional activation function to use only once on the time embeddings before they are passed to the rest of
145
- the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
146
- timestep_post_act (`str`, *optional*, defaults to `None`):
147
- The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
148
- time_cond_proj_dim (`int`, *optional*, defaults to `None`):
149
- The dimension of `cond_proj` layer in the timestep embedding.
150
- conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
151
- conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
152
- projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
153
- `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
154
- class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
155
- embeddings with the class embeddings.
156
- mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
157
- Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
158
- `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
159
- `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
160
- otherwise.
161
- """
162
-
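
The class defined below is adapted from diffusers' `UNet2DConditionModel`, extended with extra `down_block_add_samples` / `mid_block_add_sample` / `up_block_add_samples` inputs, presumably used to inject the BrushNet feature residuals. A minimal instantiation sketch with illustrative SD-1.5-style settings (an assumption for the example, not the repository's actual configuration):

import torch

# num_attention_heads must stay None (see the explicit check in __init__);
# the heads are controlled through attention_head_dim instead.
unet = UNet2DConditionModel(
    sample_size=64,
    in_channels=4,
    out_channels=4,
    block_out_channels=(320, 640, 1280, 1280),
    layers_per_block=2,
    cross_attention_dim=768,
    attention_head_dim=8,
)

latents = torch.randn(1, 4, 64, 64)      # noisy latent sample
text_states = torch.randn(1, 77, 768)    # encoder hidden states
noise_pred = unet(latents, timestep=10, encoder_hidden_states=text_states).sample
assert noise_pred.shape == latents.shape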
163
- _supports_gradient_checkpointing = True
164
-
165
- @register_to_config
166
- def __init__(
167
- self,
168
- sample_size: Optional[int] = None,
169
- in_channels: int = 4,
170
- out_channels: int = 4,
171
- center_input_sample: bool = False,
172
- flip_sin_to_cos: bool = True,
173
- freq_shift: int = 0,
174
- down_block_types: Tuple[str] = (
175
- "CrossAttnDownBlock2D",
176
- "CrossAttnDownBlock2D",
177
- "CrossAttnDownBlock2D",
178
- "DownBlock2D",
179
- ),
180
- mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
181
- up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
182
- only_cross_attention: Union[bool, Tuple[bool]] = False,
183
- block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
184
- layers_per_block: Union[int, Tuple[int]] = 2,
185
- downsample_padding: int = 1,
186
- mid_block_scale_factor: float = 1,
187
- dropout: float = 0.0,
188
- act_fn: str = "silu",
189
- norm_num_groups: Optional[int] = 32,
190
- norm_eps: float = 1e-5,
191
- cross_attention_dim: Union[int, Tuple[int]] = 1280,
192
- transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
193
- reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
194
- encoder_hid_dim: Optional[int] = None,
195
- encoder_hid_dim_type: Optional[str] = None,
196
- attention_head_dim: Union[int, Tuple[int]] = 8,
197
- num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
198
- dual_cross_attention: bool = False,
199
- use_linear_projection: bool = False,
200
- class_embed_type: Optional[str] = None,
201
- addition_embed_type: Optional[str] = None,
202
- addition_time_embed_dim: Optional[int] = None,
203
- num_class_embeds: Optional[int] = None,
204
- upcast_attention: bool = False,
205
- resnet_time_scale_shift: str = "default",
206
- resnet_skip_time_act: bool = False,
207
- resnet_out_scale_factor: float = 1.0,
208
- time_embedding_type: str = "positional",
209
- time_embedding_dim: Optional[int] = None,
210
- time_embedding_act_fn: Optional[str] = None,
211
- timestep_post_act: Optional[str] = None,
212
- time_cond_proj_dim: Optional[int] = None,
213
- conv_in_kernel: int = 3,
214
- conv_out_kernel: int = 3,
215
- projection_class_embeddings_input_dim: Optional[int] = None,
216
- attention_type: str = "default",
217
- class_embeddings_concat: bool = False,
218
- mid_block_only_cross_attention: Optional[bool] = None,
219
- cross_attention_norm: Optional[str] = None,
220
- addition_embed_type_num_heads: int = 64,
221
- ):
222
- super().__init__()
223
-
224
- self.sample_size = sample_size
225
-
226
- if num_attention_heads is not None:
227
- raise ValueError(
228
- "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
229
- )
230
-
231
- # If `num_attention_heads` is not defined (which is the case for most models)
232
- # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
233
- # The reason for this behavior is to correct for incorrectly named variables that were introduced
234
- # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
235
- # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
236
- # which is why we correct for the naming here.
237
- num_attention_heads = num_attention_heads or attention_head_dim
238
-
239
- # Check inputs
240
- self._check_config(
241
- down_block_types=down_block_types,
242
- up_block_types=up_block_types,
243
- only_cross_attention=only_cross_attention,
244
- block_out_channels=block_out_channels,
245
- layers_per_block=layers_per_block,
246
- cross_attention_dim=cross_attention_dim,
247
- transformer_layers_per_block=transformer_layers_per_block,
248
- reverse_transformer_layers_per_block=reverse_transformer_layers_per_block,
249
- attention_head_dim=attention_head_dim,
250
- num_attention_heads=num_attention_heads,
251
- )
252
-
253
- # input
254
- conv_in_padding = (conv_in_kernel - 1) // 2
255
- self.conv_in = nn.Conv2d(
256
- in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
257
- )
258
-
259
- # time
260
- time_embed_dim, timestep_input_dim = self._set_time_proj(
261
- time_embedding_type,
262
- block_out_channels=block_out_channels,
263
- flip_sin_to_cos=flip_sin_to_cos,
264
- freq_shift=freq_shift,
265
- time_embedding_dim=time_embedding_dim,
266
- )
267
-
268
- self.time_embedding = TimestepEmbedding(
269
- timestep_input_dim,
270
- time_embed_dim,
271
- act_fn=act_fn,
272
- post_act_fn=timestep_post_act,
273
- cond_proj_dim=time_cond_proj_dim,
274
- )
275
-
276
- self._set_encoder_hid_proj(
277
- encoder_hid_dim_type,
278
- cross_attention_dim=cross_attention_dim,
279
- encoder_hid_dim=encoder_hid_dim,
280
- )
281
-
282
- # class embedding
283
- self._set_class_embedding(
284
- class_embed_type,
285
- act_fn=act_fn,
286
- num_class_embeds=num_class_embeds,
287
- projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
288
- time_embed_dim=time_embed_dim,
289
- timestep_input_dim=timestep_input_dim,
290
- )
291
-
292
- self._set_add_embedding(
293
- addition_embed_type,
294
- addition_embed_type_num_heads=addition_embed_type_num_heads,
295
- addition_time_embed_dim=addition_time_embed_dim,
296
- cross_attention_dim=cross_attention_dim,
297
- encoder_hid_dim=encoder_hid_dim,
298
- flip_sin_to_cos=flip_sin_to_cos,
299
- freq_shift=freq_shift,
300
- projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
301
- time_embed_dim=time_embed_dim,
302
- )
303
-
304
- if time_embedding_act_fn is None:
305
- self.time_embed_act = None
306
- else:
307
- self.time_embed_act = get_activation(time_embedding_act_fn)
308
-
309
- self.down_blocks = nn.ModuleList([])
310
- self.up_blocks = nn.ModuleList([])
311
-
312
- if isinstance(only_cross_attention, bool):
313
- if mid_block_only_cross_attention is None:
314
- mid_block_only_cross_attention = only_cross_attention
315
-
316
- only_cross_attention = [only_cross_attention] * len(down_block_types)
317
-
318
- if mid_block_only_cross_attention is None:
319
- mid_block_only_cross_attention = False
320
-
321
- if isinstance(num_attention_heads, int):
322
- num_attention_heads = (num_attention_heads,) * len(down_block_types)
323
-
324
- if isinstance(attention_head_dim, int):
325
- attention_head_dim = (attention_head_dim,) * len(down_block_types)
326
-
327
- if isinstance(cross_attention_dim, int):
328
- cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
329
-
330
- if isinstance(layers_per_block, int):
331
- layers_per_block = [layers_per_block] * len(down_block_types)
332
-
333
- if isinstance(transformer_layers_per_block, int):
334
- transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
335
-
336
- if class_embeddings_concat:
337
- # The time embeddings are concatenated with the class embeddings. The dimension of the
338
- # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
339
- # regular time embeddings
340
- blocks_time_embed_dim = time_embed_dim * 2
341
- else:
342
- blocks_time_embed_dim = time_embed_dim
343
-
344
- # down
345
- output_channel = block_out_channels[0]
346
- for i, down_block_type in enumerate(down_block_types):
347
- input_channel = output_channel
348
- output_channel = block_out_channels[i]
349
- is_final_block = i == len(block_out_channels) - 1
350
-
351
- down_block = get_down_block(
352
- down_block_type,
353
- num_layers=layers_per_block[i],
354
- transformer_layers_per_block=transformer_layers_per_block[i],
355
- in_channels=input_channel,
356
- out_channels=output_channel,
357
- temb_channels=blocks_time_embed_dim,
358
- add_downsample=not is_final_block,
359
- resnet_eps=norm_eps,
360
- resnet_act_fn=act_fn,
361
- resnet_groups=norm_num_groups,
362
- cross_attention_dim=cross_attention_dim[i],
363
- num_attention_heads=num_attention_heads[i],
364
- downsample_padding=downsample_padding,
365
- dual_cross_attention=dual_cross_attention,
366
- use_linear_projection=use_linear_projection,
367
- only_cross_attention=only_cross_attention[i],
368
- upcast_attention=upcast_attention,
369
- resnet_time_scale_shift=resnet_time_scale_shift,
370
- attention_type=attention_type,
371
- resnet_skip_time_act=resnet_skip_time_act,
372
- resnet_out_scale_factor=resnet_out_scale_factor,
373
- cross_attention_norm=cross_attention_norm,
374
- attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
375
- dropout=dropout,
376
- )
377
- self.down_blocks.append(down_block)
378
-
379
- # mid
380
- self.mid_block = get_mid_block(
381
- mid_block_type,
382
- temb_channels=blocks_time_embed_dim,
383
- in_channels=block_out_channels[-1],
384
- resnet_eps=norm_eps,
385
- resnet_act_fn=act_fn,
386
- resnet_groups=norm_num_groups,
387
- output_scale_factor=mid_block_scale_factor,
388
- transformer_layers_per_block=transformer_layers_per_block[-1],
389
- num_attention_heads=num_attention_heads[-1],
390
- cross_attention_dim=cross_attention_dim[-1],
391
- dual_cross_attention=dual_cross_attention,
392
- use_linear_projection=use_linear_projection,
393
- mid_block_only_cross_attention=mid_block_only_cross_attention,
394
- upcast_attention=upcast_attention,
395
- resnet_time_scale_shift=resnet_time_scale_shift,
396
- attention_type=attention_type,
397
- resnet_skip_time_act=resnet_skip_time_act,
398
- cross_attention_norm=cross_attention_norm,
399
- attention_head_dim=attention_head_dim[-1],
400
- dropout=dropout,
401
- )
402
-
403
- # count how many layers upsample the images
404
- self.num_upsamplers = 0
405
-
406
- # up
407
- reversed_block_out_channels = list(reversed(block_out_channels))
408
- reversed_num_attention_heads = list(reversed(num_attention_heads))
409
- reversed_layers_per_block = list(reversed(layers_per_block))
410
- reversed_cross_attention_dim = list(reversed(cross_attention_dim))
411
- reversed_transformer_layers_per_block = (
412
- list(reversed(transformer_layers_per_block))
413
- if reverse_transformer_layers_per_block is None
414
- else reverse_transformer_layers_per_block
415
- )
416
- only_cross_attention = list(reversed(only_cross_attention))
417
-
418
- output_channel = reversed_block_out_channels[0]
419
- for i, up_block_type in enumerate(up_block_types):
420
- is_final_block = i == len(block_out_channels) - 1
421
-
422
- prev_output_channel = output_channel
423
- output_channel = reversed_block_out_channels[i]
424
- input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
425
-
426
- # add upsample block for all BUT final layer
427
- if not is_final_block:
428
- add_upsample = True
429
- self.num_upsamplers += 1
430
- else:
431
- add_upsample = False
432
-
433
- up_block = get_up_block(
434
- up_block_type,
435
- num_layers=reversed_layers_per_block[i] + 1,
436
- transformer_layers_per_block=reversed_transformer_layers_per_block[i],
437
- in_channels=input_channel,
438
- out_channels=output_channel,
439
- prev_output_channel=prev_output_channel,
440
- temb_channels=blocks_time_embed_dim,
441
- add_upsample=add_upsample,
442
- resnet_eps=norm_eps,
443
- resnet_act_fn=act_fn,
444
- resolution_idx=i,
445
- resnet_groups=norm_num_groups,
446
- cross_attention_dim=reversed_cross_attention_dim[i],
447
- num_attention_heads=reversed_num_attention_heads[i],
448
- dual_cross_attention=dual_cross_attention,
449
- use_linear_projection=use_linear_projection,
450
- only_cross_attention=only_cross_attention[i],
451
- upcast_attention=upcast_attention,
452
- resnet_time_scale_shift=resnet_time_scale_shift,
453
- attention_type=attention_type,
454
- resnet_skip_time_act=resnet_skip_time_act,
455
- resnet_out_scale_factor=resnet_out_scale_factor,
456
- cross_attention_norm=cross_attention_norm,
457
- attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
458
- dropout=dropout,
459
- )
460
- self.up_blocks.append(up_block)
461
- prev_output_channel = output_channel
462
-
463
- # out
464
- if norm_num_groups is not None:
465
- self.conv_norm_out = nn.GroupNorm(
466
- num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
467
- )
468
-
469
- self.conv_act = get_activation(act_fn)
470
-
471
- else:
472
- self.conv_norm_out = None
473
- self.conv_act = None
474
-
475
- conv_out_padding = (conv_out_kernel - 1) // 2
476
- self.conv_out = nn.Conv2d(
477
- block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
478
- )
479
-
480
- self._set_pos_net_if_use_gligen(attention_type=attention_type, cross_attention_dim=cross_attention_dim)
481
-
482
- def _check_config(
483
- self,
484
- down_block_types: Tuple[str],
485
- up_block_types: Tuple[str],
486
- only_cross_attention: Union[bool, Tuple[bool]],
487
- block_out_channels: Tuple[int],
488
- layers_per_block: Union[int, Tuple[int]],
489
- cross_attention_dim: Union[int, Tuple[int]],
490
- transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
491
- reverse_transformer_layers_per_block: bool,
492
- attention_head_dim: int,
493
- num_attention_heads: Optional[Union[int, Tuple[int]]],
494
- ):
495
- if len(down_block_types) != len(up_block_types):
496
- raise ValueError(
497
- f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
498
- )
499
-
500
- if len(block_out_channels) != len(down_block_types):
501
- raise ValueError(
502
- f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
503
- )
504
-
505
- if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
506
- raise ValueError(
507
- f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
508
- )
509
-
510
- if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
511
- raise ValueError(
512
- f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
513
- )
514
-
515
- if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
516
- raise ValueError(
517
- f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
518
- )
519
-
520
- if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
521
- raise ValueError(
522
- f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
523
- )
524
-
525
- if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
526
- raise ValueError(
527
- f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
528
- )
529
- if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
530
- for layer_number_per_block in transformer_layers_per_block:
531
- if isinstance(layer_number_per_block, list):
532
- raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
533
-
534
- def _set_time_proj(
535
- self,
536
- time_embedding_type: str,
537
- block_out_channels: int,
538
- flip_sin_to_cos: bool,
539
- freq_shift: float,
540
- time_embedding_dim: int,
541
- ) -> Tuple[int, int]:
542
- if time_embedding_type == "fourier":
543
- time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
544
- if time_embed_dim % 2 != 0:
545
- raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
546
- self.time_proj = GaussianFourierProjection(
547
- time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
548
- )
549
- timestep_input_dim = time_embed_dim
550
- elif time_embedding_type == "positional":
551
- time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
552
-
553
- self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
554
- timestep_input_dim = block_out_channels[0]
555
- else:
556
- raise ValueError(
557
- f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
558
- )
559
-
560
- return time_embed_dim, timestep_input_dim
561
-
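
`_set_time_proj` above only selects the projection module: with the default `"positional"` setting, `Timesteps` turns the scalar timestep into sinusoidal features of width `block_out_channels[0]`, which `TimestepEmbedding` then maps to `time_embed_dim`. The standalone sketch below approximates that sinusoidal projection (a rough re-derivation of the standard formulation, not the library call itself):

import math
import torch

def sinusoidal_timestep_embedding(timesteps, dim, flip_sin_to_cos=True, freq_shift=0):
    # half of the channels carry sin, the other half cos, over log-spaced frequencies
    half = dim // 2
    exponent = -math.log(10000.0) * torch.arange(half, dtype=torch.float32)
    freqs = torch.exp(exponent / (half - freq_shift))
    args = timesteps.float()[:, None] * freqs[None, :]
    emb = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
    if flip_sin_to_cos:
        emb = torch.cat([emb[:, half:], emb[:, :half]], dim=-1)
    return emb  # shape (batch, dim), dim assumed even

emb = sinusoidal_timestep_embedding(torch.tensor([0, 500, 999]), dim=320)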
562
- def _set_encoder_hid_proj(
563
- self,
564
- encoder_hid_dim_type: Optional[str],
565
- cross_attention_dim: Union[int, Tuple[int]],
566
- encoder_hid_dim: Optional[int],
567
- ):
568
- if encoder_hid_dim_type is None and encoder_hid_dim is not None:
569
- encoder_hid_dim_type = "text_proj"
570
- self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
571
- logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
572
-
573
- if encoder_hid_dim is None and encoder_hid_dim_type is not None:
574
- raise ValueError(
575
- f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
576
- )
577
-
578
- if encoder_hid_dim_type == "text_proj":
579
- self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
580
- elif encoder_hid_dim_type == "text_image_proj":
581
- # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
582
- # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
583
- # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
584
- self.encoder_hid_proj = TextImageProjection(
585
- text_embed_dim=encoder_hid_dim,
586
- image_embed_dim=cross_attention_dim,
587
- cross_attention_dim=cross_attention_dim,
588
- )
589
- elif encoder_hid_dim_type == "image_proj":
590
- # Kandinsky 2.2
591
- self.encoder_hid_proj = ImageProjection(
592
- image_embed_dim=encoder_hid_dim,
593
- cross_attention_dim=cross_attention_dim,
594
- )
595
- elif encoder_hid_dim_type is not None:
596
- raise ValueError(
597
- f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
598
- )
599
- else:
600
- self.encoder_hid_proj = None
601
-
602
- def _set_class_embedding(
603
- self,
604
- class_embed_type: Optional[str],
605
- act_fn: str,
606
- num_class_embeds: Optional[int],
607
- projection_class_embeddings_input_dim: Optional[int],
608
- time_embed_dim: int,
609
- timestep_input_dim: int,
610
- ):
611
- if class_embed_type is None and num_class_embeds is not None:
612
- self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
613
- elif class_embed_type == "timestep":
614
- self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
615
- elif class_embed_type == "identity":
616
- self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
617
- elif class_embed_type == "projection":
618
- if projection_class_embeddings_input_dim is None:
619
- raise ValueError(
620
- "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
621
- )
622
- # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
623
- # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
624
- # 2. it projects from an arbitrary input dimension.
625
- #
626
- # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
627
- # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
628
- # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
629
- self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
630
- elif class_embed_type == "simple_projection":
631
- if projection_class_embeddings_input_dim is None:
632
- raise ValueError(
633
- "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
634
- )
635
- self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
636
- else:
637
- self.class_embedding = None
638
-
639
- def _set_add_embedding(
640
- self,
641
- addition_embed_type: str,
642
- addition_embed_type_num_heads: int,
643
- addition_time_embed_dim: Optional[int],
644
- flip_sin_to_cos: bool,
645
- freq_shift: float,
646
- cross_attention_dim: Optional[int],
647
- encoder_hid_dim: Optional[int],
648
- projection_class_embeddings_input_dim: Optional[int],
649
- time_embed_dim: int,
650
- ):
651
- if addition_embed_type == "text":
652
- if encoder_hid_dim is not None:
653
- text_time_embedding_from_dim = encoder_hid_dim
654
- else:
655
- text_time_embedding_from_dim = cross_attention_dim
656
-
657
- self.add_embedding = TextTimeEmbedding(
658
- text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
659
- )
660
- elif addition_embed_type == "text_image":
661
- # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
662
- # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
663
- # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
664
- self.add_embedding = TextImageTimeEmbedding(
665
- text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
666
- )
667
- elif addition_embed_type == "text_time":
668
- self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
669
- self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
670
- elif addition_embed_type == "image":
671
- # Kandinsky 2.2
672
- self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
673
- elif addition_embed_type == "image_hint":
674
- # Kandinsky 2.2 ControlNet
675
- self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
676
- elif addition_embed_type is not None:
677
- raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
678
-
679
- def _set_pos_net_if_use_gligen(self, attention_type: str, cross_attention_dim: int):
680
- if attention_type in ["gated", "gated-text-image"]:
681
- positive_len = 768
682
- if isinstance(cross_attention_dim, int):
683
- positive_len = cross_attention_dim
684
- elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
685
- positive_len = cross_attention_dim[0]
686
-
687
- feature_type = "text-only" if attention_type == "gated" else "text-image"
688
- self.position_net = GLIGENTextBoundingboxProjection(
689
- positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
690
- )
691
-
692
- @property
693
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
694
- r"""
695
- Returns:
696
- `dict` of attention processors: A dictionary containing all attention processors used in the model,
697
- indexed by its weight name.
698
- """
699
- # set recursively
700
- processors = {}
701
-
702
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
703
- if hasattr(module, "get_processor"):
704
- processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
705
-
706
- for sub_name, child in module.named_children():
707
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
708
-
709
- return processors
710
-
711
- for name, module in self.named_children():
712
- fn_recursive_add_processors(name, module, processors)
713
-
714
- return processors
715
-
716
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
717
- r"""
718
- Sets the attention processor to use to compute attention.
719
-
720
- Parameters:
721
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
722
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
723
- for **all** `Attention` layers.
724
-
725
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
726
- processor. This is strongly recommended when setting trainable attention processors.
727
-
728
- """
729
- count = len(self.attn_processors.keys())
730
-
731
- if isinstance(processor, dict) and len(processor) != count:
732
- raise ValueError(
733
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
734
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
735
- )
736
-
737
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
738
- if hasattr(module, "set_processor"):
739
- if not isinstance(processor, dict):
740
- module.set_processor(processor)
741
- else:
742
- module.set_processor(processor.pop(f"{name}.processor"))
743
-
744
- for sub_name, child in module.named_children():
745
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
746
-
747
- for name, module in self.named_children():
748
- fn_recursive_attn_processor(name, module, processor)
749
-
750
- def set_default_attn_processor(self):
751
- """
752
- Disables custom attention processors and sets the default attention implementation.
753
- """
754
- if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
755
- processor = AttnAddedKVProcessor()
756
- elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
757
- processor = AttnProcessor()
758
- else:
759
- raise ValueError(
760
- f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
761
- )
762
-
763
- self.set_attn_processor(processor)
764
-
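
`attn_processors` and `set_attn_processor` are symmetric: the property walks the module tree and returns a dict keyed by `"<module path>.processor"`, and the setter accepts either a single processor instance applied to every attention layer or a dict with exactly those keys. An illustrative call, reusing the `unet` instance from the earlier sketch:

from diffusers.models.attention_processor import AttnProcessor

unet.set_attn_processor(AttnProcessor())  # one processor shared by all attention layers

# or per layer, reusing the keys reported by the property
per_layer = {name: AttnProcessor() for name in unet.attn_processors}
unet.set_attn_processor(per_layer)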
765
- def set_attention_slice(self, slice_size: Union[str, int, List[int]] = "auto"):
766
- r"""
767
- Enable sliced attention computation.
768
-
769
- When this option is enabled, the attention module splits the input tensor in slices to compute attention in
770
- several steps. This is useful for saving some memory in exchange for a small decrease in speed.
771
-
772
- Args:
773
- slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
774
- When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
775
- `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
776
- provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
777
- must be a multiple of `slice_size`.
778
- """
779
- sliceable_head_dims = []
780
-
781
- def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
782
- if hasattr(module, "set_attention_slice"):
783
- sliceable_head_dims.append(module.sliceable_head_dim)
784
-
785
- for child in module.children():
786
- fn_recursive_retrieve_sliceable_dims(child)
787
-
788
- # retrieve number of attention layers
789
- for module in self.children():
790
- fn_recursive_retrieve_sliceable_dims(module)
791
-
792
- num_sliceable_layers = len(sliceable_head_dims)
793
-
794
- if slice_size == "auto":
795
- # half the attention head size is usually a good trade-off between
796
- # speed and memory
797
- slice_size = [dim // 2 for dim in sliceable_head_dims]
798
- elif slice_size == "max":
799
- # make smallest slice possible
800
- slice_size = num_sliceable_layers * [1]
801
-
802
- slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
803
-
804
- if len(slice_size) != len(sliceable_head_dims):
805
- raise ValueError(
806
- f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
807
- f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
808
- )
809
-
810
- for i in range(len(slice_size)):
811
- size = slice_size[i]
812
- dim = sliceable_head_dims[i]
813
- if size is not None and size > dim:
814
- raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
815
-
816
- # Recursively walk through all the children.
817
- # Any children which exposes the set_attention_slice method
818
- # gets the message
819
- def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
820
- if hasattr(module, "set_attention_slice"):
821
- module.set_attention_slice(slice_size.pop())
822
-
823
- for child in module.children():
824
- fn_recursive_set_attention_slice(child, slice_size)
825
-
826
- reversed_slice_size = list(reversed(slice_size))
827
- for module in self.children():
828
- fn_recursive_set_attention_slice(module, reversed_slice_size)
829
-
830
- def _set_gradient_checkpointing(self, module, value=False):
831
- if hasattr(module, "gradient_checkpointing"):
832
- module.gradient_checkpointing = value
833
-
834
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
835
- r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
836
-
837
- The suffixes after the scaling factors represent the stage blocks where they are being applied.
838
-
839
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
840
- are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
841
-
842
- Args:
843
- s1 (`float`):
844
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
845
- mitigate the "oversmoothing effect" in the enhanced denoising process.
846
- s2 (`float`):
847
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
848
- mitigate the "oversmoothing effect" in the enhanced denoising process.
849
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
850
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
851
- """
852
- for i, upsample_block in enumerate(self.up_blocks):
853
- setattr(upsample_block, "s1", s1)
854
- setattr(upsample_block, "s2", s2)
855
- setattr(upsample_block, "b1", b1)
856
- setattr(upsample_block, "b2", b2)
857
-
858
- def disable_freeu(self):
859
- """Disables the FreeU mechanism."""
860
- freeu_keys = {"s1", "s2", "b1", "b2"}
861
- for i, upsample_block in enumerate(self.up_blocks):
862
- for k in freeu_keys:
863
- if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
864
- setattr(upsample_block, k, None)
865
-
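
`enable_freeu` simply stamps the four scaling factors onto every up block and `disable_freeu` clears them again; good values depend on the base model and come from the FreeU paper and repository. A purely illustrative call (the factors here are placeholders, not tuned values):

unet.enable_freeu(s1=0.9, s2=0.2, b1=1.3, b2=1.4)  # placeholder factors
unet.disable_freeu()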
866
- def fuse_qkv_projections(self):
867
- """
868
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
869
- are fused. For cross-attention modules, key and value projection matrices are fused.
870
-
871
- <Tip warning={true}>
872
-
873
- This API is 🧪 experimental.
874
-
875
- </Tip>
876
- """
877
- self.original_attn_processors = None
878
-
879
- for _, attn_processor in self.attn_processors.items():
880
- if "Added" in str(attn_processor.__class__.__name__):
881
- raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
882
-
883
- self.original_attn_processors = self.attn_processors
884
-
885
- for module in self.modules():
886
- if isinstance(module, Attention):
887
- module.fuse_projections(fuse=True)
888
-
889
- def unfuse_qkv_projections(self):
890
- """Disables the fused QKV projection if enabled.
891
-
892
- <Tip warning={true}>
893
-
894
- This API is 🧪 experimental.
895
-
896
- </Tip>
897
-
898
- """
899
- if self.original_attn_processors is not None:
900
- self.set_attn_processor(self.original_attn_processors)
901
-
902
- def unload_lora(self):
903
- """Unloads LoRA weights."""
904
- deprecate(
905
- "unload_lora",
906
- "0.28.0",
907
- "Calling `unload_lora()` is deprecated and will be removed in a future version. Please install `peft` and then call `disable_adapters().",
908
- )
909
- for module in self.modules():
910
- if hasattr(module, "set_lora_layer"):
911
- module.set_lora_layer(None)
912
-
913
- def get_time_embed(
914
- self, sample: torch.Tensor, timestep: Union[torch.Tensor, float, int]
915
- ) -> Optional[torch.Tensor]:
916
- timesteps = timestep
917
- if not torch.is_tensor(timesteps):
918
- # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
919
- # This would be a good case for the `match` statement (Python 3.10+)
920
- is_mps = sample.device.type == "mps"
921
- if isinstance(timestep, float):
922
- dtype = torch.float32 if is_mps else torch.float64
923
- else:
924
- dtype = torch.int32 if is_mps else torch.int64
925
- timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
926
- elif len(timesteps.shape) == 0:
927
- timesteps = timesteps[None].to(sample.device)
928
-
929
- # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
930
- timesteps = timesteps.expand(sample.shape[0])
931
-
932
- t_emb = self.time_proj(timesteps)
933
- # `Timesteps` does not contain any weights and will always return f32 tensors
934
- # but time_embedding might actually be running in fp16. so we need to cast here.
935
- # there might be better ways to encapsulate this.
936
- t_emb = t_emb.to(dtype=sample.dtype)
937
- return t_emb
938
-
939
- def get_class_embed(self, sample: torch.Tensor, class_labels: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
940
- class_emb = None
941
- if self.class_embedding is not None:
942
- if class_labels is None:
943
- raise ValueError("class_labels should be provided when num_class_embeds > 0")
944
-
945
- if self.config.class_embed_type == "timestep":
946
- class_labels = self.time_proj(class_labels)
947
-
948
- # `Timesteps` does not contain any weights and will always return f32 tensors
949
- # there might be better ways to encapsulate this.
950
- class_labels = class_labels.to(dtype=sample.dtype)
951
-
952
- class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
953
- return class_emb
954
-
955
- def get_aug_embed(
956
- self, emb: torch.Tensor, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any]
957
- ) -> Optional[torch.Tensor]:
958
- aug_emb = None
959
- if self.config.addition_embed_type == "text":
960
- aug_emb = self.add_embedding(encoder_hidden_states)
961
- elif self.config.addition_embed_type == "text_image":
962
- # Kandinsky 2.1 - style
963
- if "image_embeds" not in added_cond_kwargs:
964
- raise ValueError(
965
- f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
966
- )
967
-
968
- image_embs = added_cond_kwargs.get("image_embeds")
969
- text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
970
- aug_emb = self.add_embedding(text_embs, image_embs)
971
- elif self.config.addition_embed_type == "text_time":
972
- # SDXL - style
973
- if "text_embeds" not in added_cond_kwargs:
974
- raise ValueError(
975
- f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
976
- )
977
- text_embeds = added_cond_kwargs.get("text_embeds")
978
- if "time_ids" not in added_cond_kwargs:
979
- raise ValueError(
980
- f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
981
- )
982
- time_ids = added_cond_kwargs.get("time_ids")
983
- time_embeds = self.add_time_proj(time_ids.flatten())
984
- time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
985
- add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
986
- add_embeds = add_embeds.to(emb.dtype)
987
- aug_emb = self.add_embedding(add_embeds)
988
- elif self.config.addition_embed_type == "image":
989
- # Kandinsky 2.2 - style
990
- if "image_embeds" not in added_cond_kwargs:
991
- raise ValueError(
992
- f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
993
- )
994
- image_embs = added_cond_kwargs.get("image_embeds")
995
- aug_emb = self.add_embedding(image_embs)
996
- elif self.config.addition_embed_type == "image_hint":
997
- # Kandinsky 2.2 - style
998
- if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
999
- raise ValueError(
1000
- f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
1001
- )
1002
- image_embs = added_cond_kwargs.get("image_embeds")
1003
- hint = added_cond_kwargs.get("hint")
1004
- aug_emb = self.add_embedding(image_embs, hint)
1005
- return aug_emb
1006
-
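
`get_aug_embed` dispatches on `addition_embed_type`; for the SDXL-style `"text_time"` branch the caller has to pass the pooled text embedding and the size/crop `time_ids` through `added_cond_kwargs`. A hypothetical, shape-only example (the dimensions are assumptions and depend on the model config):

added_cond_kwargs = {
    "text_embeds": torch.randn(1, 1280),  # pooled prompt embedding (illustrative size)
    # commonly (orig_h, orig_w, crop_top, crop_left, target_h, target_w) in SDXL pipelines
    "time_ids": torch.tensor([[1024, 1024, 0, 0, 1024, 1024]], dtype=torch.float32),
}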
1007
- def process_encoder_hidden_states(
1008
- self, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any]
1009
- ) -> torch.Tensor:
1010
- if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
1011
- encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
1012
- elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
1013
- # Kandinsky 2.1 - style
1014
- if "image_embeds" not in added_cond_kwargs:
1015
- raise ValueError(
1016
- f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
1017
- )
1018
-
1019
- image_embeds = added_cond_kwargs.get("image_embeds")
1020
- encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
1021
- elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
1022
- # Kandinsky 2.2 - style
1023
- if "image_embeds" not in added_cond_kwargs:
1024
- raise ValueError(
1025
- f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
1026
- )
1027
- image_embeds = added_cond_kwargs.get("image_embeds")
1028
- encoder_hidden_states = self.encoder_hid_proj(image_embeds)
1029
- elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
1030
- if "image_embeds" not in added_cond_kwargs:
1031
- raise ValueError(
1032
- f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
1033
- )
1034
- image_embeds = added_cond_kwargs.get("image_embeds")
1035
- image_embeds = self.encoder_hid_proj(image_embeds)
1036
- encoder_hidden_states = (encoder_hidden_states, image_embeds)
1037
- return encoder_hidden_states
1038
-
1039
- def forward(
1040
- self,
1041
- sample: torch.FloatTensor,
1042
- timestep: Union[torch.Tensor, float, int],
1043
- encoder_hidden_states: torch.Tensor,
1044
- class_labels: Optional[torch.Tensor] = None,
1045
- timestep_cond: Optional[torch.Tensor] = None,
1046
- attention_mask: Optional[torch.Tensor] = None,
1047
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1048
- added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
1049
- down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
1050
- mid_block_additional_residual: Optional[torch.Tensor] = None,
1051
- down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
1052
- encoder_attention_mask: Optional[torch.Tensor] = None,
1053
- return_dict: bool = True,
1054
- down_block_add_samples: Optional[Tuple[torch.Tensor]] = None,
1055
- mid_block_add_sample: Optional[Tuple[torch.Tensor]] = None,
1056
- up_block_add_samples: Optional[Tuple[torch.Tensor]] = None,
1057
- ) -> Union[UNet2DConditionOutput, Tuple]:
1058
- r"""
1059
- The [`UNet2DConditionModel`] forward method.
1060
-
1061
- Args:
1062
- sample (`torch.FloatTensor`):
1063
- The noisy input tensor with the following shape `(batch, channel, height, width)`.
1064
- timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
1065
- encoder_hidden_states (`torch.FloatTensor`):
1066
- The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
1067
- class_labels (`torch.Tensor`, *optional*, defaults to `None`):
1068
- Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
1069
- timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
1070
- Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
1071
- through the `self.time_embedding` layer to obtain the timestep embeddings.
1072
- attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
1073
- An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
1074
- is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
1075
- negative values to the attention scores corresponding to "discard" tokens.
1076
- cross_attention_kwargs (`dict`, *optional*):
1077
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
1078
- `self.processor` in
1079
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1080
- added_cond_kwargs: (`dict`, *optional*):
1081
- A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
1082
- are passed along to the UNet blocks.
1083
- down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
1084
- A tuple of tensors that if specified are added to the residuals of down unet blocks.
1085
- mid_block_additional_residual: (`torch.Tensor`, *optional*):
1086
- A tensor that if specified is added to the residual of the middle unet block.
1087
- down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
1088
- additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
1089
- encoder_attention_mask (`torch.Tensor`):
1090
- A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
1091
- `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
1092
- which adds large negative values to the attention scores corresponding to "discard" tokens.
1093
- return_dict (`bool`, *optional*, defaults to `True`):
1094
- Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
1095
- tuple.
1096
-
1097
- Returns:
1098
- [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
1099
- If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
1100
- otherwise a `tuple` is returned where the first element is the sample tensor.
1101
- """
1102
- # By default samples have to be AT least a multiple of the overall upsampling factor.
1103
- # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
1104
- # However, the upsampling interpolation output size can be forced to fit any upsampling size
1105
- # on the fly if necessary.
1106
- default_overall_up_factor = 2**self.num_upsamplers
1107
-
1108
- # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
1109
- forward_upsample_size = False
1110
- upsample_size = None
1111
-
1112
- for dim in sample.shape[-2:]:
1113
- if dim % default_overall_up_factor != 0:
1114
- # Forward upsample size to force interpolation output size.
1115
- forward_upsample_size = True
1116
- break
1117
-
1118
- # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
1119
- # expects mask of shape:
1120
- # [batch, key_tokens]
1121
- # adds singleton query_tokens dimension:
1122
- # [batch, 1, key_tokens]
1123
- # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
1124
- # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
1125
- # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
1126
- if attention_mask is not None:
1127
- # assume that mask is expressed as:
1128
- # (1 = keep, 0 = discard)
1129
- # convert mask into a bias that can be added to attention scores:
1130
- # (keep = +0, discard = -10000.0)
1131
- attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
1132
- attention_mask = attention_mask.unsqueeze(1)
1133
-
1134
- # convert encoder_attention_mask to a bias the same way we do for attention_mask
1135
- if encoder_attention_mask is not None:
1136
- encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
1137
- encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
1138
-
1139
- # 0. center input if necessary
1140
- if self.config.center_input_sample:
1141
- sample = 2 * sample - 1.0
1142
-
1143
- # 1. time
1144
- t_emb = self.get_time_embed(sample=sample, timestep=timestep)
1145
- emb = self.time_embedding(t_emb, timestep_cond)
1146
- aug_emb = None
1147
-
1148
- class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
1149
- if class_emb is not None:
1150
- if self.config.class_embeddings_concat:
1151
- emb = torch.cat([emb, class_emb], dim=-1)
1152
- else:
1153
- emb = emb + class_emb
1154
-
1155
- aug_emb = self.get_aug_embed(
1156
- emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
1157
- )
1158
- if self.config.addition_embed_type == "image_hint":
1159
- aug_emb, hint = aug_emb
1160
- sample = torch.cat([sample, hint], dim=1)
1161
-
1162
- emb = emb + aug_emb if aug_emb is not None else emb
1163
-
1164
- if self.time_embed_act is not None:
1165
- emb = self.time_embed_act(emb)
1166
-
1167
- encoder_hidden_states = self.process_encoder_hidden_states(
1168
- encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
1169
- )
1170
-
1171
- # 2. pre-process
1172
- sample = self.conv_in(sample)
1173
-
1174
- # 2.5 GLIGEN position net
1175
- if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
1176
- cross_attention_kwargs = cross_attention_kwargs.copy()
1177
- gligen_args = cross_attention_kwargs.pop("gligen")
1178
- cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
1179
-
1180
- # 3. down
1181
- # we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
1182
- # to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
1183
- if cross_attention_kwargs is not None:
1184
- cross_attention_kwargs = cross_attention_kwargs.copy()
1185
- lora_scale = cross_attention_kwargs.pop("scale", 1.0)
1186
- else:
1187
- lora_scale = 1.0
1188
-
1189
- if USE_PEFT_BACKEND:
1190
- # weight the lora layers by setting `lora_scale` for each PEFT layer
1191
- scale_lora_layers(self, lora_scale)
1192
-
1193
- is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
1194
- # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
1195
- is_adapter = down_intrablock_additional_residuals is not None
1196
- # maintain backward compatibility for legacy usage, where
1197
- # T2I-Adapter and ControlNet both use down_block_additional_residuals arg
1198
- # but can only use one or the other
1199
- is_brushnet = down_block_add_samples is not None and mid_block_add_sample is not None and up_block_add_samples is not None
1200
- if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
1201
- deprecate(
1202
- "T2I should not use down_block_additional_residuals",
1203
- "1.3.0",
1204
- "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
1205
- and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
1206
- for ControlNet. Please make sure to use `down_intrablock_additional_residuals` instead. ",
1207
- standard_warn=False,
1208
- )
1209
- down_intrablock_additional_residuals = down_block_additional_residuals
1210
- is_adapter = True
1211
-
1212
- down_block_res_samples = (sample,)
1213
-
1214
- if is_brushnet:
1215
- sample = sample + down_block_add_samples.pop(0)
1216
-
1217
- for downsample_block in self.down_blocks:
1218
- if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
1219
- # For t2i-adapter CrossAttnDownBlock2D
1220
- additional_residuals = {}
1221
- if is_adapter and len(down_intrablock_additional_residuals) > 0:
1222
- additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
1223
-
1224
- i = len(down_block_add_samples)
1225
-
1226
- if is_brushnet and len(down_block_add_samples)>0:
1227
- additional_residuals["down_block_add_samples"] = [down_block_add_samples.pop(0)
1228
- for _ in range(len(downsample_block.resnets)+(downsample_block.downsamplers !=None))]
1229
-
1230
- sample, res_samples = downsample_block(
1231
- hidden_states=sample,
1232
- temb=emb,
1233
- encoder_hidden_states=encoder_hidden_states,
1234
- attention_mask=attention_mask,
1235
- cross_attention_kwargs=cross_attention_kwargs,
1236
- encoder_attention_mask=encoder_attention_mask,
1237
- **additional_residuals,
1238
- )
1239
- else:
1240
- additional_residuals = {}
1241
-
1242
- i = len(down_block_add_samples)
1243
-
1244
- if is_brushnet and len(down_block_add_samples)>0:
1245
- additional_residuals["down_block_add_samples"] = [down_block_add_samples.pop(0)
1246
- for _ in range(len(downsample_block.resnets)+(downsample_block.downsamplers !=None))]
1247
-
1248
- sample, res_samples = downsample_block(hidden_states=sample, temb=emb, **additional_residuals)
1249
- if is_adapter and len(down_intrablock_additional_residuals) > 0:
1250
- sample += down_intrablock_additional_residuals.pop(0)
1251
-
1252
- down_block_res_samples += res_samples
1253
-
1254
- if is_controlnet:
1255
- new_down_block_res_samples = ()
1256
-
1257
- for down_block_res_sample, down_block_additional_residual in zip(
1258
- down_block_res_samples, down_block_additional_residuals
1259
- ):
1260
- down_block_res_sample = down_block_res_sample + down_block_additional_residual
1261
- new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
1262
-
1263
- down_block_res_samples = new_down_block_res_samples
1264
-
1265
- # 4. mid
1266
- if self.mid_block is not None:
1267
- if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
1268
- sample = self.mid_block(
1269
- sample,
1270
- emb,
1271
- encoder_hidden_states=encoder_hidden_states,
1272
- attention_mask=attention_mask,
1273
- cross_attention_kwargs=cross_attention_kwargs,
1274
- encoder_attention_mask=encoder_attention_mask,
1275
- )
1276
- else:
1277
- sample = self.mid_block(sample, emb)
1278
-
1279
- # To support T2I-Adapter-XL
1280
- if (
1281
- is_adapter
1282
- and len(down_intrablock_additional_residuals) > 0
1283
- and sample.shape == down_intrablock_additional_residuals[0].shape
1284
- ):
1285
- sample += down_intrablock_additional_residuals.pop(0)
1286
-
1287
- if is_controlnet:
1288
- sample = sample + mid_block_additional_residual
1289
-
1290
- if is_brushnet:
1291
- sample = sample + mid_block_add_sample
1292
-
1293
- # 5. up
1294
- for i, upsample_block in enumerate(self.up_blocks):
1295
- is_final_block = i == len(self.up_blocks) - 1
1296
-
1297
- res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1298
- down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
1299
-
1300
- # if we have not reached the final block and need to forward the
1301
- # upsample size, we do it here
1302
- if not is_final_block and forward_upsample_size:
1303
- upsample_size = down_block_res_samples[-1].shape[2:]
1304
-
1305
- if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
1306
- additional_residuals = {}
1307
-
1308
- i = len(up_block_add_samples)
1309
-
1310
- if is_brushnet and len(up_block_add_samples)>0:
1311
- additional_residuals["up_block_add_samples"] = [up_block_add_samples.pop(0)
1312
- for _ in range(len(upsample_block.resnets)+(upsample_block.upsamplers !=None))]
1313
-
1314
- sample = upsample_block(
1315
- hidden_states=sample,
1316
- temb=emb,
1317
- res_hidden_states_tuple=res_samples,
1318
- encoder_hidden_states=encoder_hidden_states,
1319
- cross_attention_kwargs=cross_attention_kwargs,
1320
- upsample_size=upsample_size,
1321
- attention_mask=attention_mask,
1322
- encoder_attention_mask=encoder_attention_mask,
1323
- **additional_residuals,
1324
- )
1325
- else:
1326
- additional_residuals = {}
1327
-
1328
- i = len(up_block_add_samples)
1329
-
1330
- if is_brushnet and len(up_block_add_samples)>0:
1331
- additional_residuals["up_block_add_samples"] = [up_block_add_samples.pop(0)
1332
- for _ in range(len(upsample_block.resnets)+(upsample_block.upsamplers !=None))]
1333
-
1334
- sample = upsample_block(
1335
- hidden_states=sample,
1336
- temb=emb,
1337
- res_hidden_states_tuple=res_samples,
1338
- upsample_size=upsample_size,
1339
- **additional_residuals,
1340
- )
1341
-
1342
- # 6. post-process
1343
- if self.conv_norm_out:
1344
- sample = self.conv_norm_out(sample)
1345
- sample = self.conv_act(sample)
1346
- sample = self.conv_out(sample)
1347
-
1348
- if USE_PEFT_BACKEND:
1349
- # remove `lora_scale` from each PEFT layer
1350
- unscale_lora_layers(self, lora_scale)
1351
-
1352
- if not return_dict:
1353
- return (sample,)
1354
-
1355
- return UNet2DConditionOutput(sample=sample)
MagicQuill/brushnet_nodes.py DELETED
@@ -1,1094 +0,0 @@
1
- import os
2
- import types
3
- from typing import Tuple
4
-
5
- import torch
6
- import torchvision.transforms as T
7
- import torch.nn.functional as F
8
- from accelerate import init_empty_weights, load_checkpoint_and_dispatch
9
- import sys
10
-
11
- import comfy.sd
12
- import comfy.utils
13
- import comfy.model_management
14
- import comfy.sd1_clip
15
- import comfy.ldm.models.autoencoder
16
- import comfy.supported_models
17
-
18
- import folder_paths
19
-
20
- from .model_patch import add_model_patch_option, patch_model_function_wrapper
21
- from .brushnet.brushnet import BrushNetModel
22
- from .brushnet.brushnet_ca import BrushNetModel as PowerPaintModel
23
- from .brushnet.powerpaint_utils import TokenizerWrapper, add_tokens
24
-
25
- current_directory = os.path.dirname(os.path.abspath(__file__))
26
- brushnet_config_file = os.path.join(current_directory, 'brushnet', 'brushnet.json')
27
- brushnet_xl_config_file = os.path.join(current_directory, 'brushnet', 'brushnet_xl.json')
28
- powerpaint_config_file = os.path.join(current_directory,'brushnet', 'powerpaint.json')
29
-
30
- sd15_scaling_factor = 0.18215
31
- sdxl_scaling_factor = 0.13025
32
-
33
- print(sys.path)
34
- ModelsToUnload = [comfy.sd1_clip.SD1ClipModel,
35
- comfy.ldm.models.autoencoder.AutoencoderKL
36
- ]
37
-
38
-
39
- class BrushNetLoader:
40
- @classmethod
41
- def INPUT_TYPES(self):
42
- self.inpaint_files = get_files_with_extension('inpaint')
43
- return {"required":
44
- {
45
- "brushnet": ([file for file in self.inpaint_files], ),
46
- "dtype": (['float16', 'bfloat16', 'float32', 'float64'], ),
47
- },
48
- }
49
-
50
- CATEGORY = "inpaint"
51
- RETURN_TYPES = ("BRMODEL",)
52
- RETURN_NAMES = ("brushnet",)
53
-
54
- FUNCTION = "brushnet_loading"
55
-
56
- def brushnet_loading(self, brushnet, dtype):
57
- brushnet_file = os.path.join(self.inpaint_files[brushnet], brushnet)
58
- print('BrushNet model file:', brushnet_file)
59
- is_SDXL = False
60
- is_PP = False
61
- sd = comfy.utils.load_torch_file(brushnet_file)
62
- brushnet_down_block, brushnet_mid_block, brushnet_up_block, keys = brushnet_blocks(sd)
63
- del sd
64
- if brushnet_down_block == 24 and brushnet_mid_block == 2 and brushnet_up_block == 30:
65
- is_SDXL = False
66
- if keys == 322:
67
- is_PP = False
68
- print('BrushNet model type: SD1.5')
69
- else:
70
- is_PP = True
71
- print('PowerPaint model type: SD1.5')
72
- elif brushnet_down_block == 18 and brushnet_mid_block == 2 and brushnet_up_block == 22:
73
- print('BrushNet model type: SDXL')
74
- is_SDXL = True
75
- is_PP = False
76
- else:
77
- raise Exception("Unknown BrushNet model")
78
-
79
- with init_empty_weights():
80
- if is_SDXL:
81
- brushnet_config = BrushNetModel.load_config(brushnet_xl_config_file)
82
- brushnet_model = BrushNetModel.from_config(brushnet_config)
83
- elif is_PP:
84
- brushnet_config = PowerPaintModel.load_config(powerpaint_config_file)
85
- brushnet_model = PowerPaintModel.from_config(brushnet_config)
86
- else:
87
- brushnet_config = BrushNetModel.load_config(brushnet_config_file)
88
- brushnet_model = BrushNetModel.from_config(brushnet_config)
89
-
90
- if is_PP:
91
- print("PowerPaint model file:", brushnet_file)
92
- else:
93
- print("BrushNet model file:", brushnet_file)
94
-
95
- if dtype == 'float16':
96
- torch_dtype = torch.float16
97
- elif dtype == 'bfloat16':
98
- torch_dtype = torch.bfloat16
99
- elif dtype == 'float32':
100
- torch_dtype = torch.float32
101
- else:
102
- torch_dtype = torch.float64
103
-
104
- brushnet_model = load_checkpoint_and_dispatch(
105
- brushnet_model,
106
- brushnet_file,
107
- device_map="sequential",
108
- max_memory=None,
109
- offload_folder=None,
110
- offload_state_dict=False,
111
- dtype=torch_dtype,
112
- force_hooks=False,
113
- )
114
-
115
- if is_PP:
116
- print("PowerPaint model is loaded")
117
- elif is_SDXL:
118
- print("BrushNet SDXL model is loaded")
119
- else:
120
- print("BrushNet SD1.5 model is loaded")
121
-
122
- return ({"brushnet": brushnet_model, "SDXL": is_SDXL, "PP": is_PP, "dtype": torch_dtype}, )
123
-
124
-
125
- class PowerPaintCLIPLoader:
126
-
127
- @classmethod
128
- def INPUT_TYPES(self):
129
- self.inpaint_files = get_files_with_extension('inpaint', ['.bin'])
130
- self.clip_files = get_files_with_extension('clip')
131
- return {"required":
132
- {
133
- "base": ([file for file in self.clip_files], ),
134
- "powerpaint": ([file for file in self.inpaint_files], ),
135
- },
136
- }
137
-
138
- CATEGORY = "inpaint"
139
- RETURN_TYPES = ("CLIP",)
140
- RETURN_NAMES = ("clip",)
141
-
142
- FUNCTION = "ppclip_loading"
143
-
144
- def ppclip_loading(self, base, powerpaint):
145
- base_CLIP_file = os.path.join(self.clip_files[base], base)
146
- pp_CLIP_file = os.path.join(self.inpaint_files[powerpaint], powerpaint)
147
-
148
- pp_clip = comfy.sd.load_clip(ckpt_paths=[base_CLIP_file])
149
-
150
- print('PowerPaint base CLIP file: ', base_CLIP_file)
151
-
152
- pp_tokenizer = TokenizerWrapper(pp_clip.tokenizer.clip_l.tokenizer)
153
- pp_text_encoder = pp_clip.patcher.model.clip_l.transformer
154
-
155
- add_tokens(
156
- tokenizer = pp_tokenizer,
157
- text_encoder = pp_text_encoder,
158
- placeholder_tokens = ["P_ctxt", "P_shape", "P_obj"],
159
- initialize_tokens = ["a", "a", "a"],
160
- num_vectors_per_token = 10,
161
- )
162
-
163
- pp_text_encoder.load_state_dict(comfy.utils.load_torch_file(pp_CLIP_file), strict=False)
164
-
165
- print('PowerPaint CLIP file: ', pp_CLIP_file)
166
-
167
- pp_clip.tokenizer.clip_l.tokenizer = pp_tokenizer
168
- pp_clip.patcher.model.clip_l.transformer = pp_text_encoder
169
-
170
- return (pp_clip,)
171
-
172
-
173
- class PowerPaint:
174
-
175
- @classmethod
176
- def INPUT_TYPES(s):
177
- return {"required":
178
- {
179
- "model": ("MODEL",),
180
- "vae": ("VAE", ),
181
- "image": ("IMAGE",),
182
- "mask": ("MASK",),
183
- "powerpaint": ("BRMODEL", ),
184
- "clip": ("CLIP", ),
185
- "positive": ("CONDITIONING", ),
186
- "negative": ("CONDITIONING", ),
187
- "fitting" : ("FLOAT", {"default": 1.0, "min": 0.3, "max": 1.0}),
188
- "function": (['text guided', 'shape guided', 'object removal', 'context aware', 'image outpainting'], ),
189
- "scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0}),
190
- "start_at": ("INT", {"default": 0, "min": 0, "max": 10000}),
191
- "end_at": ("INT", {"default": 10000, "min": 0, "max": 10000}),
192
- "save_memory": (['none', 'auto', 'max'], ),
193
- },
194
- }
195
-
196
- CATEGORY = "inpaint"
197
- RETURN_TYPES = ("MODEL","CONDITIONING","CONDITIONING","LATENT",)
198
- RETURN_NAMES = ("model","positive","negative","latent",)
199
-
200
- FUNCTION = "model_update"
201
-
202
- def model_update(self, model, vae, image, mask, powerpaint, clip, positive, negative, fitting, function, scale, start_at, end_at, save_memory):
203
-
204
- is_SDXL, is_PP = check_compatibilty(model, powerpaint)
205
- if not is_PP:
206
- raise Exception("BrushNet model was loaded, please use BrushNet node")
207
-
208
- # Make a copy of the model so that we're not patching it everywhere in the workflow.
209
- model = model.clone()
210
-
211
- # prepare image and mask
212
- # no batches for original image and mask
213
- masked_image, mask = prepare_image(image, mask)
214
-
215
- batch = masked_image.shape[0]
216
- #width = masked_image.shape[2]
217
- #height = masked_image.shape[1]
218
-
219
- if hasattr(model.model.model_config, 'latent_format') and hasattr(model.model.model_config.latent_format, 'scale_factor'):
220
- scaling_factor = model.model.model_config.latent_format.scale_factor
221
- else:
222
- scaling_factor = sd15_scaling_factor
223
-
224
- torch_dtype = powerpaint['dtype']
225
-
226
- # prepare conditioning latents
227
- conditioning_latents = get_image_latents(masked_image, mask, vae, scaling_factor)
228
- conditioning_latents[0] = conditioning_latents[0].to(dtype=torch_dtype).to(powerpaint['brushnet'].device)
229
- conditioning_latents[1] = conditioning_latents[1].to(dtype=torch_dtype).to(powerpaint['brushnet'].device)
230
-
231
- # prepare embeddings
232
-
233
- if function == "object removal":
234
- promptA = "P_ctxt"
235
- promptB = "P_ctxt"
236
- negative_promptA = "P_obj"
237
- negative_promptB = "P_obj"
238
- print('You should add to positive prompt: "empty scene blur"')
239
- #positive = positive + " empty scene blur"
240
- elif function == "context aware":
241
- promptA = "P_ctxt"
242
- promptB = "P_ctxt"
243
- negative_promptA = ""
244
- negative_promptB = ""
245
- #positive = positive + " empty scene"
246
- print('You should add to positive prompt: "empty scene"')
247
- elif function == "shape guided":
248
- promptA = "P_shape"
249
- promptB = "P_ctxt"
250
- negative_promptA = "P_shape"
251
- negative_promptB = "P_ctxt"
252
- elif function == "image outpainting":
253
- promptA = "P_ctxt"
254
- promptB = "P_ctxt"
255
- negative_promptA = "P_obj"
256
- negative_promptB = "P_obj"
257
- #positive = positive + " empty scene"
258
- print('You should add to positive prompt: "empty scene"')
259
- else:
260
- promptA = "P_obj"
261
- promptB = "P_obj"
262
- negative_promptA = "P_obj"
263
- negative_promptB = "P_obj"
264
-
265
- tokens = clip.tokenize(promptA)
266
- prompt_embedsA = clip.encode_from_tokens(tokens, return_pooled=False)
267
-
268
- tokens = clip.tokenize(negative_promptA)
269
- negative_prompt_embedsA = clip.encode_from_tokens(tokens, return_pooled=False)
270
-
271
- tokens = clip.tokenize(promptB)
272
- prompt_embedsB = clip.encode_from_tokens(tokens, return_pooled=False)
273
-
274
- tokens = clip.tokenize(negative_promptB)
275
- negative_prompt_embedsB = clip.encode_from_tokens(tokens, return_pooled=False)
276
-
277
- prompt_embeds_pp = (prompt_embedsA * fitting + (1.0 - fitting) * prompt_embedsB).to(dtype=torch_dtype).to(powerpaint['brushnet'].device)
278
- negative_prompt_embeds_pp = (negative_prompt_embedsA * fitting + (1.0 - fitting) * negative_prompt_embedsB).to(dtype=torch_dtype).to(powerpaint['brushnet'].device)
279
-
280
- # unload vae and CLIPs
281
- del vae
282
- del clip
283
- for loaded_model in comfy.model_management.current_loaded_models:
284
- if type(loaded_model.model.model) in ModelsToUnload:
285
- comfy.model_management.current_loaded_models.remove(loaded_model)
286
- loaded_model.model_unload()
287
- del loaded_model
288
-
289
- # apply patch to model
290
-
291
- brushnet_conditioning_scale = scale
292
- control_guidance_start = start_at
293
- control_guidance_end = end_at
294
-
295
- if save_memory != 'none':
296
- powerpaint['brushnet'].set_attention_slice(save_memory)
297
-
298
- add_brushnet_patch(model,
299
- powerpaint['brushnet'],
300
- torch_dtype,
301
- conditioning_latents,
302
- (brushnet_conditioning_scale, control_guidance_start, control_guidance_end),
303
- negative_prompt_embeds_pp, prompt_embeds_pp,
304
- None, None, None,
305
- False)
306
-
307
- latent = torch.zeros([batch, 4, conditioning_latents[0].shape[2], conditioning_latents[0].shape[3]], device=powerpaint['brushnet'].device)
308
-
309
- return (model, positive, negative, {"samples":latent},)
310
-
311
-
312
- class BrushNet:
313
-
314
- @classmethod
315
- def INPUT_TYPES(s):
316
- return {"required":
317
- {
318
- "model": ("MODEL",),
319
- "vae": ("VAE", ),
320
- "image": ("IMAGE",),
321
- "mask": ("MASK",),
322
- "brushnet": ("BRMODEL", ),
323
- "positive": ("CONDITIONING", ),
324
- "negative": ("CONDITIONING", ),
325
- "scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0}),
326
- "start_at": ("INT", {"default": 0, "min": 0, "max": 10000}),
327
- "end_at": ("INT", {"default": 10000, "min": 0, "max": 10000}),
328
- },
329
- }
330
-
331
- CATEGORY = "inpaint"
332
- RETURN_TYPES = ("MODEL","CONDITIONING","CONDITIONING","LATENT",)
333
- RETURN_NAMES = ("model","positive","negative","latent",)
334
-
335
- FUNCTION = "model_update"
336
-
337
- def model_update(self, model, vae, image, mask, brushnet, positive, negative, scale, start_at, end_at):
338
-
339
- is_SDXL, is_PP = check_compatibilty(model, brushnet)
340
-
341
- if is_PP:
342
- raise Exception("PowerPaint model was loaded, please use PowerPaint node")
343
-
344
- # Make a copy of the model so that we're not patching it everywhere in the workflow.
345
- model = model.clone()
346
-
347
- # prepare image and mask
348
- # no batches for original image and mask
349
- masked_image, mask = prepare_image(image, mask)
350
-
351
- batch = masked_image.shape[0]
352
- width = masked_image.shape[2]
353
- height = masked_image.shape[1]
354
-
355
- if hasattr(model.model.model_config, 'latent_format') and hasattr(model.model.model_config.latent_format, 'scale_factor'):
356
- scaling_factor = model.model.model_config.latent_format.scale_factor
357
- elif is_SDXL:
358
- scaling_factor = sdxl_scaling_factor
359
- else:
360
- scaling_factor = sd15_scaling_factor
361
-
362
- torch_dtype = brushnet['dtype']
363
-
364
- # prepare conditioning latents
365
- conditioning_latents = get_image_latents(masked_image, mask, vae, scaling_factor)
366
- conditioning_latents[0] = conditioning_latents[0].to(dtype=torch_dtype).to(brushnet['brushnet'].device)
367
- conditioning_latents[1] = conditioning_latents[1].to(dtype=torch_dtype).to(brushnet['brushnet'].device)
368
-
369
- # unload vae
370
- del vae
371
- for loaded_model in comfy.model_management.current_loaded_models:
372
- if type(loaded_model.model.model) in ModelsToUnload:
373
- comfy.model_management.current_loaded_models.remove(loaded_model)
374
- loaded_model.model_unload()
375
- del loaded_model
376
-
377
- # prepare embeddings
378
-
379
- prompt_embeds = positive[0][0].to(dtype=torch_dtype).to(brushnet['brushnet'].device)
380
- negative_prompt_embeds = negative[0][0].to(dtype=torch_dtype).to(brushnet['brushnet'].device)
381
-
382
- max_tokens = max(prompt_embeds.shape[1], negative_prompt_embeds.shape[1])
383
- if prompt_embeds.shape[1] < max_tokens:
384
- multiplier = max_tokens // 77 - prompt_embeds.shape[1] // 77
385
- prompt_embeds = torch.concat([prompt_embeds] + [prompt_embeds[:,-77:,:]] * multiplier, dim=1)
386
- print('BrushNet: negative prompt more than 75 tokens:', negative_prompt_embeds.shape, 'multiplying prompt_embeds')
387
- if negative_prompt_embeds.shape[1] < max_tokens:
388
- multiplier = max_tokens // 77 - negative_prompt_embeds.shape[1] // 77
389
- negative_prompt_embeds = torch.concat([negative_prompt_embeds] + [negative_prompt_embeds[:,-77:,:]] * multiplier, dim=1)
390
- print('BrushNet: positive prompt more than 75 tokens:', prompt_embeds.shape, 'multiplying negative_prompt_embeds')
391
-
392
- if len(positive[0]) > 1 and 'pooled_output' in positive[0][1] and positive[0][1]['pooled_output'] is not None:
393
- pooled_prompt_embeds = positive[0][1]['pooled_output'].to(dtype=torch_dtype).to(brushnet['brushnet'].device)
394
- else:
395
- print('BrushNet: positive conditioning has no pooled_output')
396
- if is_SDXL:
397
- print('BrushNet will not produce correct results')
398
- pooled_prompt_embeds = torch.empty([2, 1280], device=brushnet['brushnet'].device).to(dtype=torch_dtype)
399
-
400
- if len(negative[0]) > 1 and 'pooled_output' in negative[0][1] and negative[0][1]['pooled_output'] is not None:
401
- negative_pooled_prompt_embeds = negative[0][1]['pooled_output'].to(dtype=torch_dtype).to(brushnet['brushnet'].device)
402
- else:
403
- print('BrushNet: negative conditioning has no pooled_output')
404
- if is_SDXL:
405
- print('BrushNet will not produce correct results')
406
- negative_pooled_prompt_embeds = torch.empty([1, pooled_prompt_embeds.shape[1]], device=brushnet['brushnet'].device).to(dtype=torch_dtype)
407
-
408
- time_ids = torch.FloatTensor([[height, width, 0., 0., height, width]]).to(dtype=torch_dtype).to(brushnet['brushnet'].device)
409
-
410
- if not is_SDXL:
411
- pooled_prompt_embeds = None
412
- negative_pooled_prompt_embeds = None
413
- time_ids = None
414
-
415
- # apply patch to model
416
-
417
- brushnet_conditioning_scale = scale
418
- control_guidance_start = start_at
419
- control_guidance_end = end_at
420
-
421
- add_brushnet_patch(model,
422
- brushnet['brushnet'],
423
- torch_dtype,
424
- conditioning_latents,
425
- (brushnet_conditioning_scale, control_guidance_start, control_guidance_end),
426
- prompt_embeds, negative_prompt_embeds,
427
- pooled_prompt_embeds, negative_pooled_prompt_embeds, time_ids,
428
- False)
429
-
430
- latent = torch.zeros([batch, 4, conditioning_latents[0].shape[2], conditioning_latents[0].shape[3]], device=brushnet['brushnet'].device)
431
-
432
- return (model, positive, negative, {"samples":latent},)
433
-
434
-
435
- class BlendInpaint:
436
-
437
- @classmethod
438
- def INPUT_TYPES(s):
439
- return {"required":
440
- {
441
- "inpaint": ("IMAGE",),
442
- "original": ("IMAGE",),
443
- "mask": ("MASK",),
444
- "kernel": ("INT", {"default": 10, "min": 1, "max": 1000}),
445
- "sigma": ("FLOAT", {"default": 10.0, "min": 0.01, "max": 1000}),
446
- },
447
- "optional":
448
- {
449
- "origin": ("VECTOR",),
450
- },
451
- }
452
-
453
- CATEGORY = "inpaint"
454
- RETURN_TYPES = ("IMAGE","MASK",)
455
- RETURN_NAMES = ("image","MASK",)
456
-
457
- FUNCTION = "blend_inpaint"
458
-
459
- def blend_inpaint(self, inpaint: torch.Tensor, original: torch.Tensor, mask, kernel: int, sigma: float, origin=None) -> Tuple[torch.Tensor]:
460
-
461
- original, mask = check_image_mask(original, mask, 'Blend Inpaint')
462
-
463
- if len(inpaint.shape) < 4:
464
- # image tensor shape should be [B, H, W, C], but the batch dimension is missing
465
- inpaint = inpaint[None,:,:,:]
466
-
467
- if inpaint.shape[0] < original.shape[0]:
468
- print("Blend Inpaint gets batch of original images (%d) but only (%d) inpaint images" % (original.shape[0], inpaint.shape[0]))
469
- original = original[:inpaint.shape[0],:,:]
470
- mask = mask[:inpaint.shape[0],:,:]
471
-
472
- if inpaint.shape[0] > original.shape[0]:
473
- # batch over inpaint
474
- count = 0
475
- original_list = []
476
- mask_list = []
477
- origin_list = []
478
- while (count < inpaint.shape[0]):
479
- for i in range(original.shape[0]):
480
- original_list.append(original[i][None,:,:,:])
481
- mask_list.append(mask[i][None,:,:])
482
- if origin is not None:
483
- origin_list.append(origin[i][None,:])
484
- count += 1
485
- if count >= inpaint.shape[0]:
486
- break
487
- original = torch.concat(original_list, dim=0)
488
- mask = torch.concat(mask_list, dim=0)
489
- if origin is not None:
490
- origin = torch.concat(origin_list, dim=0)
491
-
492
- if kernel % 2 == 0:
493
- kernel += 1
494
- transform = T.GaussianBlur(kernel_size=(kernel, kernel), sigma=(sigma, sigma))
495
-
496
- ret = []
497
- blurred = []
498
- for i in range(inpaint.shape[0]):
499
- if origin is None:
500
- blurred_mask = transform(mask[i][None,None,:,:]).to(original.device).to(original.dtype)
501
- blurred.append(blurred_mask[0])
502
-
503
- result = torch.nn.functional.interpolate(
504
- inpaint[i][None,:,:,:].permute(0, 3, 1, 2),
505
- size=(
506
- original[i].shape[0],
507
- original[i].shape[1],
508
- )
509
- ).permute(0, 2, 3, 1).to(original.device).to(original.dtype)
510
- else:
511
- # got mask from CutForInpaint
512
- height, width, _ = original[i].shape
513
- x0 = origin[i][0].item()
514
- y0 = origin[i][1].item()
515
-
516
- if mask[i].shape[0] < height or mask[i].shape[1] < width:
517
- padded_mask = F.pad(input=mask[i], pad=(x0, width-x0-mask[i].shape[1],
518
- y0, height-y0-mask[i].shape[0]), mode='constant', value=0)
519
- else:
520
- padded_mask = mask[i]
521
- blurred_mask = transform(padded_mask[None,None,:,:]).to(original.device).to(original.dtype)
522
- blurred.append(blurred_mask[0][0])
523
-
524
- result = F.pad(input=inpaint[i], pad=(0, 0, x0, width-x0-inpaint[i].shape[1],
525
- y0, height-y0-inpaint[i].shape[0]), mode='constant', value=0)
526
- result = result[None,:,:,:].to(original.device).to(original.dtype)
527
-
528
- ret.append(original[i] * (1.0 - blurred_mask[0][0][:,:,None]) + result[0] * blurred_mask[0][0][:,:,None])
529
-
530
- return (torch.stack(ret), torch.stack(blurred), )
531
-
532
-
533
- class CutForInpaint:
534
-
535
- @classmethod
536
- def INPUT_TYPES(s):
537
- return {"required":
538
- {
539
- "image": ("IMAGE",),
540
- "mask": ("MASK",),
541
- "width": ("INT", {"default": 512, "min": 64, "max": 2048}),
542
- "height": ("INT", {"default": 512, "min": 64, "max": 2048}),
543
- },
544
- }
545
-
546
- CATEGORY = "inpaint"
547
- RETURN_TYPES = ("IMAGE","MASK","VECTOR",)
548
- RETURN_NAMES = ("image","mask","origin",)
549
-
550
- FUNCTION = "cut_for_inpaint"
551
-
552
- def cut_for_inpaint(self, image: torch.Tensor, mask: torch.Tensor, width: int, height: int):
553
-
554
- image, mask = check_image_mask(image, mask, 'BrushNet')
555
-
556
- ret = []
557
- msk = []
558
- org = []
559
- for i in range(image.shape[0]):
560
- x0, y0, w, h = cut_with_mask(mask[i], width, height)
561
- ret.append((image[i][y0:y0+h,x0:x0+w,:]))
562
- msk.append((mask[i][y0:y0+h,x0:x0+w]))
563
- org.append(torch.IntTensor([x0,y0]))
564
-
565
- return (torch.stack(ret), torch.stack(msk), torch.stack(org), )
566
-
567
-
568
- #### Utility function
569
-
570
- def get_files_with_extension(folder_name, extension=['.safetensors']):
571
-
572
- try:
573
- folders = folder_paths.get_folder_paths(folder_name)
574
- except:
575
- folders = []
576
-
577
- if not folders:
578
- folders = [os.path.join(folder_paths.models_dir, folder_name)]
579
- if not os.path.isdir(folders[0]):
580
- folders = [os.path.join(folder_paths.base_path, folder_name)]
581
- if not os.path.isdir(folders[0]):
582
- return {}
583
-
584
- filtered_folders = []
585
- for x in folders:
586
- if not os.path.isdir(x):
587
- continue
588
- the_same = False
589
- for y in filtered_folders:
590
- if os.path.samefile(x, y):
591
- the_same = True
592
- break
593
- if not the_same:
594
- filtered_folders.append(x)
595
-
596
- if not filtered_folders:
597
- return {}
598
-
599
- output = {}
600
- for x in filtered_folders:
601
- files, folders_all = folder_paths.recursive_search(x, excluded_dir_names=[".git"])
602
- filtered_files = folder_paths.filter_files_extensions(files, extension)
603
-
604
- for f in filtered_files:
605
- output[f] = x
606
-
607
- return output
608
-
609
-
610
- # count block keys in the state_dict so we can tell which model this is
611
- def brushnet_blocks(sd):
612
- brushnet_down_block = 0
613
- brushnet_mid_block = 0
614
- brushnet_up_block = 0
615
- for key in sd:
616
- if 'brushnet_down_block' in key:
617
- brushnet_down_block += 1
618
- if 'brushnet_mid_block' in key:
619
- brushnet_mid_block += 1
620
- if 'brushnet_up_block' in key:
621
- brushnet_up_block += 1
622
- return (brushnet_down_block, brushnet_mid_block, brushnet_up_block, len(sd))
623
-
624
-
625
- # Check models compatibility
626
- def check_compatibilty(model, brushnet):
627
- is_SDXL = False
628
- is_PP = False
629
- if isinstance(model.model.model_config, comfy.supported_models.SD15):
630
- print('Base model type: SD1.5')
631
- is_SDXL = False
632
- if brushnet["SDXL"]:
633
- raise Exception("Base model is SD15, but BrushNet is SDXL type")
634
- if brushnet["PP"]:
635
- is_PP = True
636
- elif isinstance(model.model.model_config, comfy.supported_models.SDXL):
637
- print('Base model type: SDXL')
638
- is_SDXL = True
639
- if not brushnet["SDXL"]:
640
- raise Exception("Base model is SDXL, but BrushNet is SD15 type")
641
- else:
642
- print('Base model type: ', type(model.model.model_config))
643
- raise Exception("Unsupported model type: " + str(type(model.model.model_config)))
644
-
645
- return (is_SDXL, is_PP)
646
-
647
-
648
- def check_image_mask(image, mask, name):
649
- if len(image.shape) < 4:
650
- # image tensor shape should be [B, H, W, C], but the batch dimension is missing
651
- image = image[None,:,:,:]
652
-
653
- if len(mask.shape) > 3:
654
- # mask tensor shape should be [B, H, W] but we got [B, H, W, C]; this may be an image
655
- # take first mask, red channel
656
- mask = (mask[:,:,:,0])[:,:,:]
657
- elif len(mask.shape) < 3:
658
- # mask tensor shape should be [B, H, W], but the batch dimension is missing
659
- mask = mask[None,:,:]
660
-
661
- if image.shape[0] > mask.shape[0]:
662
- print(name, "gets batch of images (%d) but only %d masks" % (image.shape[0], mask.shape[0]))
663
- if mask.shape[0] == 1:
664
- print(name, "will copy the mask to fill batch")
665
- mask = torch.cat([mask] * image.shape[0], dim=0)
666
- else:
667
- print(name, "will add empty masks to fill batch")
668
- empty_mask = torch.zeros([image.shape[0] - mask.shape[0], mask.shape[1], mask.shape[2]])
669
- mask = torch.cat([mask, empty_mask], dim=0)
670
- elif image.shape[0] < mask.shape[0]:
671
- print(name, "gets batch of images (%d) but too many (%d) masks" % (image.shape[0], mask.shape[0]))
672
- mask = mask[:image.shape[0],:,:]
673
-
674
- return (image, mask)
675
-
676
-
677
- # Prepare image and mask
678
- def prepare_image(image, mask):
679
-
680
- image, mask = check_image_mask(image, mask, 'BrushNet')
681
-
682
- print("BrushNet image.shape =", image.shape, "mask.shape =", mask.shape)
683
-
684
- if mask.shape[2] != image.shape[2] or mask.shape[1] != image.shape[1]:
685
- raise Exception("Image and mask should be the same size")
686
-
687
- # As a suggestion of inferno46n2 (https://github.com/nullquant/ComfyUI-BrushNet/issues/64)
688
- mask = mask.round()
689
-
690
- masked_image = image * (1.0 - mask[:,:,:,None])
691
-
692
- return (masked_image, mask)
693
-
694
-
695
- # Get origin of the mask
696
- def cut_with_mask(mask, width, height):
697
- iy, ix = (mask == 1).nonzero(as_tuple=True)
698
-
699
- h0, w0 = mask.shape
700
-
701
- if iy.numel() == 0:
702
- x_c = w0 / 2.0
703
- y_c = h0 / 2.0
704
- else:
705
- x_min = ix.min().item()
706
- x_max = ix.max().item()
707
- y_min = iy.min().item()
708
- y_max = iy.max().item()
709
-
710
- if x_max - x_min > width or y_max - y_min > height:
711
- raise Exception("Masked area is bigger than provided dimensions")
712
-
713
- x_c = (x_min + x_max) / 2.0
714
- y_c = (y_min + y_max) / 2.0
715
-
716
- width2 = width / 2.0
717
- height2 = height / 2.0
718
-
719
- if w0 <= width:
720
- x0 = 0
721
- w = w0
722
- else:
723
- x0 = max(0, x_c - width2)
724
- w = width
725
- if x0 + width > w0:
726
- x0 = w0 - width
727
-
728
- if h0 <= height:
729
- y0 = 0
730
- h = h0
731
- else:
732
- y0 = max(0, y_c - height2)
733
- h = height
734
- if y0 + height > h0:
735
- y0 = h0 - height
736
-
737
- return (int(x0), int(y0), int(w), int(h))
738
-
739
-
740
- # Prepare conditioning_latents
741
- @torch.inference_mode()
742
- def get_image_latents(masked_image, mask, vae, scaling_factor):
743
- processed_image = masked_image.to(vae.device)
744
- image_latents = vae.encode(processed_image[:,:,:,:3]) * scaling_factor
745
- processed_mask = 1. - mask[:,None,:,:]
746
- interpolated_mask = torch.nn.functional.interpolate(
747
- processed_mask,
748
- size=(
749
- image_latents.shape[-2],
750
- image_latents.shape[-1]
751
- )
752
- )
753
- interpolated_mask = interpolated_mask.to(image_latents.device)
754
-
755
- conditioning_latents = [image_latents, interpolated_mask]
756
-
757
- print('BrushNet CL: image_latents shape =', image_latents.shape, 'interpolated_mask shape =', interpolated_mask.shape)
758
-
759
- return conditioning_latents
760
-
761
-
762
- # Main function where magic happens
763
- @torch.inference_mode()
764
- def brushnet_inference(x, timesteps, transformer_options, debug):
765
- if 'model_patch' not in transformer_options:
766
- print('BrushNet inference: there is no model_patch key in transformer_options')
767
- return ([], 0, [])
768
- mp = transformer_options['model_patch']
769
- if 'brushnet' not in mp:
770
- print('BrushNet inference: there is no brushnet key in model_patch')
771
- return ([], 0, [])
772
- bo = mp['brushnet']
773
- if 'model' not in bo:
774
- print('BrushNet inference: there is no model key in brushnet')
775
- return ([], 0, [])
776
- brushnet = bo['model']
777
- if not (isinstance(brushnet, BrushNetModel) or isinstance(brushnet, PowerPaintModel)):
778
- print('BrushNet model is not a BrushNetModel class')
779
- return ([], 0, [])
780
-
781
- torch_dtype = bo['dtype']
782
- cl_list = bo['latents']
783
- brushnet_conditioning_scale, control_guidance_start, control_guidance_end = bo['controls']
784
- pe = bo['prompt_embeds']
785
- npe = bo['negative_prompt_embeds']
786
- ppe, nppe, time_ids = bo['add_embeds']
787
-
788
- #do_classifier_free_guidance = mp['free_guidance']
789
- do_classifier_free_guidance = len(transformer_options['cond_or_uncond']) > 1
790
-
791
- x = x.detach().clone()
792
- x = x.to(torch_dtype).to(brushnet.device)
793
-
794
- timesteps = timesteps.detach().clone()
795
- timesteps = timesteps.to(torch_dtype).to(brushnet.device)
796
-
797
- total_steps = mp['total_steps']
798
- step = mp['step']
799
-
800
- added_cond_kwargs = {}
801
-
802
- if do_classifier_free_guidance and step == 0:
803
- print('BrushNet inference: do_classifier_free_guidance is True')
804
-
805
- sub_idx = None
806
- if 'ad_params' in transformer_options and 'sub_idxs' in transformer_options['ad_params']:
807
- sub_idx = transformer_options['ad_params']['sub_idxs']
808
-
809
- # we have batch input images
810
- batch = cl_list[0].shape[0]
811
- # we have incoming latents
812
- latents_incoming = x.shape[0]
813
- # and we already got some
814
- latents_got = bo['latent_id']
815
- if step == 0 or batch > 1:
816
- print('BrushNet inference, step = %d: image batch = %d, got %d latents, starting from %d' \
817
- % (step, batch, latents_incoming, latents_got))
818
-
819
- image_latents = []
820
- masks = []
821
- prompt_embeds = []
822
- negative_prompt_embeds = []
823
- pooled_prompt_embeds = []
824
- negative_pooled_prompt_embeds = []
825
- if sub_idx:
826
- # AnimateDiff indexes detected
827
- if step == 0:
828
- print('BrushNet inference: AnimateDiff indexes detected and applied')
829
-
830
- batch = len(sub_idx)
831
-
832
- if do_classifier_free_guidance:
833
- for i in sub_idx:
834
- image_latents.append(cl_list[0][i][None,:,:,:])
835
- masks.append(cl_list[1][i][None,:,:,:])
836
- prompt_embeds.append(pe)
837
- negative_prompt_embeds.append(npe)
838
- pooled_prompt_embeds.append(ppe)
839
- negative_pooled_prompt_embeds.append(nppe)
840
- for i in sub_idx:
841
- image_latents.append(cl_list[0][i][None,:,:,:])
842
- masks.append(cl_list[1][i][None,:,:,:])
843
- else:
844
- for i in sub_idx:
845
- image_latents.append(cl_list[0][i][None,:,:,:])
846
- masks.append(cl_list[1][i][None,:,:,:])
847
- prompt_embeds.append(pe)
848
- pooled_prompt_embeds.append(ppe)
849
- else:
850
- # do_classifier_free_guidance = 2 passes, 1st pass is cond, 2nd is uncond
851
- continue_batch = True
852
- for i in range(latents_incoming):
853
- number = latents_got + i
854
- if number < batch:
855
- # 1st pass, cond
856
- image_latents.append(cl_list[0][number][None,:,:,:])
857
- masks.append(cl_list[1][number][None,:,:,:])
858
- prompt_embeds.append(pe)
859
- pooled_prompt_embeds.append(ppe)
860
- elif do_classifier_free_guidance and number < batch * 2:
861
- # 2nd pass, uncond
862
- image_latents.append(cl_list[0][number-batch][None,:,:,:])
863
- masks.append(cl_list[1][number-batch][None,:,:,:])
864
- negative_prompt_embeds.append(npe)
865
- negative_pooled_prompt_embeds.append(nppe)
866
- else:
867
- # latent batch
868
- image_latents.append(cl_list[0][0][None,:,:,:])
869
- masks.append(cl_list[1][0][None,:,:,:])
870
- prompt_embeds.append(pe)
871
- pooled_prompt_embeds.append(ppe)
872
- latents_got = -i
873
- continue_batch = False
874
-
875
- if continue_batch:
876
- # we don't have full batch yet
877
- if do_classifier_free_guidance:
878
- if number < batch * 2 - 1:
879
- bo['latent_id'] = number + 1
880
- else:
881
- bo['latent_id'] = 0
882
- else:
883
- if number < batch - 1:
884
- bo['latent_id'] = number + 1
885
- else:
886
- bo['latent_id'] = 0
887
- else:
888
- bo['latent_id'] = 0
889
-
890
- cl = []
891
- for il, m in zip(image_latents, masks):
892
- cl.append(torch.concat([il, m], dim=1))
893
- cl2apply = torch.concat(cl, dim=0)
894
-
895
- conditioning_latents = cl2apply.to(torch_dtype).to(brushnet.device)
896
-
897
- # print("BrushNet CL: conditioning_latents shape =", conditioning_latents.shape)
898
- # print("BrushNet CL: x shape =", x.shape)
899
-
900
- prompt_embeds.extend(negative_prompt_embeds)
901
- prompt_embeds = torch.concat(prompt_embeds, dim=0).to(torch_dtype).to(brushnet.device)
902
-
903
- if ppe is not None:
904
- added_cond_kwargs = {}
905
- added_cond_kwargs['time_ids'] = torch.concat([time_ids] * latents_incoming, dim = 0).to(torch_dtype).to(brushnet.device)
906
-
907
- pooled_prompt_embeds.extend(negative_pooled_prompt_embeds)
908
- pooled_prompt_embeds = torch.concat(pooled_prompt_embeds, dim=0).to(torch_dtype).to(brushnet.device)
909
- added_cond_kwargs['text_embeds'] = pooled_prompt_embeds
910
- else:
911
- added_cond_kwargs = None
912
-
913
- if x.shape[2] != conditioning_latents.shape[2] or x.shape[3] != conditioning_latents.shape[3]:
914
- if step == 0:
915
- print('BrushNet inference: image', conditioning_latents.shape, 'and latent', x.shape, 'have different size, resizing image')
916
- conditioning_latents = torch.nn.functional.interpolate(
917
- conditioning_latents, size=(
918
- x.shape[2],
919
- x.shape[3],
920
- ), mode='bicubic',
921
- ).to(torch_dtype).to(brushnet.device)
922
-
923
- if step == 0:
924
- print('BrushNet inference: sample', x.shape, ', CL', conditioning_latents.shape, 'dtype', torch_dtype)
925
-
926
- if debug: print('BrushNet: step =', step)
927
-
928
- if step < control_guidance_start or step > control_guidance_end:
929
- cond_scale = 0.0
930
- else:
931
- cond_scale = brushnet_conditioning_scale
932
-
933
- return brushnet(x,
934
- encoder_hidden_states=prompt_embeds,
935
- brushnet_cond=conditioning_latents,
936
- timestep = timesteps,
937
- conditioning_scale=cond_scale,
938
- guess_mode=False,
939
- added_cond_kwargs=added_cond_kwargs,
940
- return_dict=False,
941
- debug=debug,
942
- )
943
-
944
-
945
- # This is main patch function
946
- def add_brushnet_patch(model, brushnet, torch_dtype, conditioning_latents,
947
- controls,
948
- prompt_embeds, negative_prompt_embeds,
949
- pooled_prompt_embeds, negative_pooled_prompt_embeds, time_ids,
950
- debug):
951
-
952
- is_SDXL = isinstance(model.model.model_config, comfy.supported_models.SDXL)
953
-
954
- if is_SDXL:
955
- input_blocks = [[0, comfy.ops.disable_weight_init.Conv2d],
956
- [1, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock],
957
- [2, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock],
958
- [3, comfy.ldm.modules.diffusionmodules.openaimodel.Downsample],
959
- [4, comfy.ldm.modules.attention.SpatialTransformer],
960
- [5, comfy.ldm.modules.attention.SpatialTransformer],
961
- [6, comfy.ldm.modules.diffusionmodules.openaimodel.Downsample],
962
- [7, comfy.ldm.modules.attention.SpatialTransformer],
963
- [8, comfy.ldm.modules.attention.SpatialTransformer]]
964
- middle_block = [0, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock]
965
- output_blocks = [[0, comfy.ldm.modules.attention.SpatialTransformer],
966
- [1, comfy.ldm.modules.attention.SpatialTransformer],
967
- [2, comfy.ldm.modules.attention.SpatialTransformer],
968
- [2, comfy.ldm.modules.diffusionmodules.openaimodel.Upsample],
969
- [3, comfy.ldm.modules.attention.SpatialTransformer],
970
- [4, comfy.ldm.modules.attention.SpatialTransformer],
971
- [5, comfy.ldm.modules.attention.SpatialTransformer],
972
- [5, comfy.ldm.modules.diffusionmodules.openaimodel.Upsample],
973
- [6, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock],
974
- [7, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock],
975
- [8, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock]]
976
- else:
977
- input_blocks = [[0, comfy.ops.disable_weight_init.Conv2d],
978
- [1, comfy.ldm.modules.attention.SpatialTransformer],
979
- [2, comfy.ldm.modules.attention.SpatialTransformer],
980
- [3, comfy.ldm.modules.diffusionmodules.openaimodel.Downsample],
981
- [4, comfy.ldm.modules.attention.SpatialTransformer],
982
- [5, comfy.ldm.modules.attention.SpatialTransformer],
983
- [6, comfy.ldm.modules.diffusionmodules.openaimodel.Downsample],
984
- [7, comfy.ldm.modules.attention.SpatialTransformer],
985
- [8, comfy.ldm.modules.attention.SpatialTransformer],
986
- [9, comfy.ldm.modules.diffusionmodules.openaimodel.Downsample],
987
- [10, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock],
988
- [11, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock]]
989
- middle_block = [0, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock]
990
- output_blocks = [[0, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock],
991
- [1, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock],
992
- [2, comfy.ldm.modules.diffusionmodules.openaimodel.ResBlock],
993
- [2, comfy.ldm.modules.diffusionmodules.openaimodel.Upsample],
994
- [3, comfy.ldm.modules.attention.SpatialTransformer],
995
- [4, comfy.ldm.modules.attention.SpatialTransformer],
996
- [5, comfy.ldm.modules.attention.SpatialTransformer],
997
- [5, comfy.ldm.modules.diffusionmodules.openaimodel.Upsample],
998
- [6, comfy.ldm.modules.attention.SpatialTransformer],
999
- [7, comfy.ldm.modules.attention.SpatialTransformer],
1000
- [8, comfy.ldm.modules.attention.SpatialTransformer],
1001
- [8, comfy.ldm.modules.diffusionmodules.openaimodel.Upsample],
1002
- [9, comfy.ldm.modules.attention.SpatialTransformer],
1003
- [10, comfy.ldm.modules.attention.SpatialTransformer],
1004
- [11, comfy.ldm.modules.attention.SpatialTransformer]]
1005
-
1006
- def last_layer_index(block, tp):
1007
- layer_list = []
1008
- for layer in block:
1009
- layer_list.append(type(layer))
1010
- layer_list.reverse()
1011
- if tp not in layer_list:
1012
- return -1, layer_list  # list.reverse() returns None, so return the list itself
1013
- return len(layer_list) - 1 - layer_list.index(tp), layer_list
1014
-
1015
- def brushnet_forward(model, x, timesteps, transformer_options, control):
1016
- if 'brushnet' not in transformer_options['model_patch']:
1017
- input_samples = []
1018
- mid_sample = 0
1019
- output_samples = []
1020
- else:
1021
- # brushnet inference
1022
- input_samples, mid_sample, output_samples = brushnet_inference(x, timesteps, transformer_options, debug)
1023
-
1024
- # give additional samples to blocks
1025
- for i, tp in input_blocks:
1026
- idx, layer_list = last_layer_index(model.input_blocks[i], tp)
1027
- if idx < 0:
1028
- print("BrushNet can't find", tp, "layer in", i,"input block:", layer_list)
1029
- continue
1030
- model.input_blocks[i][idx].add_sample_after = input_samples.pop(0) if input_samples else 0
1031
-
1032
- idx, layer_list = last_layer_index(model.middle_block, middle_block[1])
1033
- if idx < 0:
1034
- print("BrushNet can't find", middle_block[1], "layer in middle block", layer_list)
1035
- model.middle_block[idx].add_sample_after = mid_sample
1036
-
1037
- for i, tp in output_blocks:
1038
- idx, layer_list = last_layer_index(model.output_blocks[i], tp)
1039
- if idx < 0:
1040
- print("BrushNet can't find", tp, "layer in", i,"outnput block:", layer_list)
1041
- continue
1042
- model.output_blocks[i][idx].add_sample_after = output_samples.pop(0) if output_samples else 0
1043
-
1044
- patch_model_function_wrapper(model, brushnet_forward)
1045
-
1046
- to = add_model_patch_option(model)
1047
- mp = to['model_patch']
1048
- if 'brushnet' not in mp:
1049
- mp['brushnet'] = {}
1050
- bo = mp['brushnet']
1051
-
1052
- bo['model'] = brushnet
1053
- bo['dtype'] = torch_dtype
1054
- bo['latents'] = conditioning_latents
1055
- bo['controls'] = controls
1056
- bo['prompt_embeds'] = prompt_embeds
1057
- bo['negative_prompt_embeds'] = negative_prompt_embeds
1058
- bo['add_embeds'] = (pooled_prompt_embeds, negative_pooled_prompt_embeds, time_ids)
1059
- bo['latent_id'] = 0
1060
-
1061
- # patch layers `forward` so we can apply brushnet
1062
- def forward_patched_by_brushnet(self, x, *args, **kwargs):
1063
- h = self.original_forward(x, *args, **kwargs)
1064
- if hasattr(self, 'add_sample_after') and type(self):
1065
- to_add = self.add_sample_after
1066
- if torch.is_tensor(to_add):
1067
- # interpolate due to RAUNet
1068
- if h.shape[2] != to_add.shape[2] or h.shape[3] != to_add.shape[3]:
1069
- to_add = torch.nn.functional.interpolate(to_add, size=(h.shape[2], h.shape[3]), mode='bicubic')
1070
- h += to_add.to(h.dtype).to(h.device)
1071
- else:
1072
- h += self.add_sample_after
1073
- self.add_sample_after = 0
1074
- return h
1075
-
1076
- for i, block in enumerate(model.model.diffusion_model.input_blocks):
1077
- for j, layer in enumerate(block):
1078
- if not hasattr(layer, 'original_forward'):
1079
- layer.original_forward = layer.forward
1080
- layer.forward = types.MethodType(forward_patched_by_brushnet, layer)
1081
- layer.add_sample_after = 0
1082
-
1083
- for j, layer in enumerate(model.model.diffusion_model.middle_block):
1084
- if not hasattr(layer, 'original_forward'):
1085
- layer.original_forward = layer.forward
1086
- layer.forward = types.MethodType(forward_patched_by_brushnet, layer)
1087
- layer.add_sample_after = 0
1088
-
1089
- for i, block in enumerate(model.model.diffusion_model.output_blocks):
1090
- for j, layer in enumerate(block):
1091
- if not hasattr(layer, 'original_forward'):
1092
- layer.original_forward = layer.forward
1093
- layer.forward = types.MethodType(forward_patched_by_brushnet, layer)
1094
- layer.add_sample_after = 0
 
MagicQuill/comfy/.DS_Store DELETED
Binary file (6.15 kB)
 
MagicQuill/comfy/checkpoint_pickle.py DELETED
@@ -1,13 +0,0 @@
1
- import pickle
2
-
3
- load = pickle.load
4
-
5
- class Empty:
6
- pass
7
-
8
- class Unpickler(pickle.Unpickler):
9
- def find_class(self, module, name):
10
- #TODO: safe unpickle
11
- if module.startswith("pytorch_lightning"):
12
- return Empty
13
- return super().find_class(module, name)
 
MagicQuill/comfy/cldm/__pycache__/cldm.cpython-310.pyc DELETED
Binary file (6.11 kB)
 
MagicQuill/comfy/cldm/cldm.py DELETED
@@ -1,313 +0,0 @@
1
- #taken from: https://github.com/lllyasviel/ControlNet
2
- #and modified
3
-
4
- import torch
5
- import torch as th
6
- import torch.nn as nn
7
-
8
- from ..ldm.modules.diffusionmodules.util import (
9
- zero_module,
10
- timestep_embedding,
11
- )
12
-
13
- from ..ldm.modules.attention import SpatialTransformer
14
- from ..ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample
15
- from ..ldm.util import exists
16
- import comfy.ops
17
-
18
- class ControlledUnetModel(UNetModel):
19
- #implemented in the ldm unet
20
- pass
21
-
22
- class ControlNet(nn.Module):
23
- def __init__(
24
- self,
25
- image_size,
26
- in_channels,
27
- model_channels,
28
- hint_channels,
29
- num_res_blocks,
30
- dropout=0,
31
- channel_mult=(1, 2, 4, 8),
32
- conv_resample=True,
33
- dims=2,
34
- num_classes=None,
35
- use_checkpoint=False,
36
- dtype=torch.float32,
37
- num_heads=-1,
38
- num_head_channels=-1,
39
- num_heads_upsample=-1,
40
- use_scale_shift_norm=False,
41
- resblock_updown=False,
42
- use_new_attention_order=False,
43
- use_spatial_transformer=False, # custom transformer support
44
- transformer_depth=1, # custom transformer support
45
- context_dim=None, # custom transformer support
46
- n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
47
- legacy=True,
48
- disable_self_attentions=None,
49
- num_attention_blocks=None,
50
- disable_middle_self_attn=False,
51
- use_linear_in_transformer=False,
52
- adm_in_channels=None,
53
- transformer_depth_middle=None,
54
- transformer_depth_output=None,
55
- attn_precision=None,
56
- device=None,
57
- operations=comfy.ops.disable_weight_init,
58
- **kwargs,
59
- ):
60
- super().__init__()
61
- assert use_spatial_transformer == True, "use_spatial_transformer has to be true"
62
- if use_spatial_transformer:
63
- assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
64
-
65
- if context_dim is not None:
66
- assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
67
- # from omegaconf.listconfig import ListConfig
68
- # if type(context_dim) == ListConfig:
69
- # context_dim = list(context_dim)
70
-
71
- if num_heads_upsample == -1:
72
- num_heads_upsample = num_heads
73
-
74
- if num_heads == -1:
75
- assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
76
-
77
- if num_head_channels == -1:
78
- assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
79
-
80
- self.dims = dims
81
- self.image_size = image_size
82
- self.in_channels = in_channels
83
- self.model_channels = model_channels
84
-
85
- if isinstance(num_res_blocks, int):
86
- self.num_res_blocks = len(channel_mult) * [num_res_blocks]
87
- else:
88
- if len(num_res_blocks) != len(channel_mult):
89
- raise ValueError("provide num_res_blocks either as an int (globally constant) or "
90
- "as a list/tuple (per-level) with the same length as channel_mult")
91
- self.num_res_blocks = num_res_blocks
92
-
93
- if disable_self_attentions is not None:
94
- # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
95
- assert len(disable_self_attentions) == len(channel_mult)
96
- if num_attention_blocks is not None:
97
- assert len(num_attention_blocks) == len(self.num_res_blocks)
98
- assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
99
-
100
- transformer_depth = transformer_depth[:]
101
-
102
- self.dropout = dropout
103
- self.channel_mult = channel_mult
104
- self.conv_resample = conv_resample
105
- self.num_classes = num_classes
106
- self.use_checkpoint = use_checkpoint
107
- self.dtype = dtype
108
- self.num_heads = num_heads
109
- self.num_head_channels = num_head_channels
110
- self.num_heads_upsample = num_heads_upsample
111
- self.predict_codebook_ids = n_embed is not None
112
-
113
- time_embed_dim = model_channels * 4
114
- self.time_embed = nn.Sequential(
115
- operations.Linear(model_channels, time_embed_dim, dtype=self.dtype, device=device),
116
- nn.SiLU(),
117
- operations.Linear(time_embed_dim, time_embed_dim, dtype=self.dtype, device=device),
118
- )
119
-
120
- if self.num_classes is not None:
121
- if isinstance(self.num_classes, int):
122
- self.label_emb = nn.Embedding(num_classes, time_embed_dim)
123
- elif self.num_classes == "continuous":
124
- print("setting up linear c_adm embedding layer")
125
- self.label_emb = nn.Linear(1, time_embed_dim)
126
- elif self.num_classes == "sequential":
127
- assert adm_in_channels is not None
128
- self.label_emb = nn.Sequential(
129
- nn.Sequential(
130
- operations.Linear(adm_in_channels, time_embed_dim, dtype=self.dtype, device=device),
131
- nn.SiLU(),
132
- operations.Linear(time_embed_dim, time_embed_dim, dtype=self.dtype, device=device),
133
- )
134
- )
135
- else:
136
- raise ValueError()
137
-
138
- self.input_blocks = nn.ModuleList(
139
- [
140
- TimestepEmbedSequential(
141
- operations.conv_nd(dims, in_channels, model_channels, 3, padding=1, dtype=self.dtype, device=device)
142
- )
143
- ]
144
- )
145
- self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels, operations=operations, dtype=self.dtype, device=device)])
146
-
147
- self.input_hint_block = TimestepEmbedSequential(
148
- operations.conv_nd(dims, hint_channels, 16, 3, padding=1, dtype=self.dtype, device=device),
149
- nn.SiLU(),
150
- operations.conv_nd(dims, 16, 16, 3, padding=1, dtype=self.dtype, device=device),
151
- nn.SiLU(),
152
- operations.conv_nd(dims, 16, 32, 3, padding=1, stride=2, dtype=self.dtype, device=device),
153
- nn.SiLU(),
154
- operations.conv_nd(dims, 32, 32, 3, padding=1, dtype=self.dtype, device=device),
155
- nn.SiLU(),
156
- operations.conv_nd(dims, 32, 96, 3, padding=1, stride=2, dtype=self.dtype, device=device),
157
- nn.SiLU(),
158
- operations.conv_nd(dims, 96, 96, 3, padding=1, dtype=self.dtype, device=device),
159
- nn.SiLU(),
160
- operations.conv_nd(dims, 96, 256, 3, padding=1, stride=2, dtype=self.dtype, device=device),
161
- nn.SiLU(),
162
- operations.conv_nd(dims, 256, model_channels, 3, padding=1, dtype=self.dtype, device=device)
163
- )
164
-
165
- self._feature_size = model_channels
166
- input_block_chans = [model_channels]
167
- ch = model_channels
168
- ds = 1
169
- for level, mult in enumerate(channel_mult):
170
- for nr in range(self.num_res_blocks[level]):
171
- layers = [
172
- ResBlock(
173
- ch,
174
- time_embed_dim,
175
- dropout,
176
- out_channels=mult * model_channels,
177
- dims=dims,
178
- use_checkpoint=use_checkpoint,
179
- use_scale_shift_norm=use_scale_shift_norm,
180
- dtype=self.dtype,
181
- device=device,
182
- operations=operations,
183
- )
184
- ]
185
- ch = mult * model_channels
186
- num_transformers = transformer_depth.pop(0)
187
- if num_transformers > 0:
188
- if num_head_channels == -1:
189
- dim_head = ch // num_heads
190
- else:
191
- num_heads = ch // num_head_channels
192
- dim_head = num_head_channels
193
- if legacy:
194
- #num_heads = 1
195
- dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
196
- if exists(disable_self_attentions):
197
- disabled_sa = disable_self_attentions[level]
198
- else:
199
- disabled_sa = False
200
-
201
- if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
202
- layers.append(
203
- SpatialTransformer(
204
- ch, num_heads, dim_head, depth=num_transformers, context_dim=context_dim,
205
- disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
206
- use_checkpoint=use_checkpoint, attn_precision=attn_precision, dtype=self.dtype, device=device, operations=operations
207
- )
208
- )
209
- self.input_blocks.append(TimestepEmbedSequential(*layers))
210
- self.zero_convs.append(self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device))
211
- self._feature_size += ch
212
- input_block_chans.append(ch)
213
- if level != len(channel_mult) - 1:
214
- out_ch = ch
215
- self.input_blocks.append(
216
- TimestepEmbedSequential(
217
- ResBlock(
218
- ch,
219
- time_embed_dim,
220
- dropout,
221
- out_channels=out_ch,
222
- dims=dims,
223
- use_checkpoint=use_checkpoint,
224
- use_scale_shift_norm=use_scale_shift_norm,
225
- down=True,
226
- dtype=self.dtype,
227
- device=device,
228
- operations=operations
229
- )
230
- if resblock_updown
231
- else Downsample(
232
- ch, conv_resample, dims=dims, out_channels=out_ch, dtype=self.dtype, device=device, operations=operations
233
- )
234
- )
235
- )
236
- ch = out_ch
237
- input_block_chans.append(ch)
238
- self.zero_convs.append(self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device))
239
- ds *= 2
240
- self._feature_size += ch
241
-
242
- if num_head_channels == -1:
243
- dim_head = ch // num_heads
244
- else:
245
- num_heads = ch // num_head_channels
246
- dim_head = num_head_channels
247
- if legacy:
248
- #num_heads = 1
249
- dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
250
- mid_block = [
251
- ResBlock(
252
- ch,
253
- time_embed_dim,
254
- dropout,
255
- dims=dims,
256
- use_checkpoint=use_checkpoint,
257
- use_scale_shift_norm=use_scale_shift_norm,
258
- dtype=self.dtype,
259
- device=device,
260
- operations=operations
261
- )]
262
- if transformer_depth_middle >= 0:
263
- mid_block += [SpatialTransformer( # always uses a self-attn
264
- ch, num_heads, dim_head, depth=transformer_depth_middle, context_dim=context_dim,
265
- disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
266
- use_checkpoint=use_checkpoint, attn_precision=attn_precision, dtype=self.dtype, device=device, operations=operations
267
- ),
268
- ResBlock(
269
- ch,
270
- time_embed_dim,
271
- dropout,
272
- dims=dims,
273
- use_checkpoint=use_checkpoint,
274
- use_scale_shift_norm=use_scale_shift_norm,
275
- dtype=self.dtype,
276
- device=device,
277
- operations=operations
278
- )]
279
- self.middle_block = TimestepEmbedSequential(*mid_block)
280
- self.middle_block_out = self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device)
281
- self._feature_size += ch
282
-
283
- def make_zero_conv(self, channels, operations=None, dtype=None, device=None):
284
- return TimestepEmbedSequential(operations.conv_nd(self.dims, channels, channels, 1, padding=0, dtype=dtype, device=device))
285
-
286
- def forward(self, x, hint, timesteps, context, y=None, **kwargs):
287
- t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
288
- emb = self.time_embed(t_emb)
289
-
290
- guided_hint = self.input_hint_block(hint, emb, context)
291
-
292
- outs = []
293
-
294
- hs = []
295
- if self.num_classes is not None:
296
- assert y.shape[0] == x.shape[0]
297
- emb = emb + self.label_emb(y)
298
-
299
- h = x
300
- for module, zero_conv in zip(self.input_blocks, self.zero_convs):
301
- if guided_hint is not None:
302
- h = module(h, emb, context)
303
- h += guided_hint
304
- guided_hint = None
305
- else:
306
- h = module(h, emb, context)
307
- outs.append(zero_conv(h, emb, context))
308
-
309
- h = self.middle_block(h, emb, context)
310
- outs.append(self.middle_block_out(h, emb, context))
311
-
312
- return outs
313
-
 
MagicQuill/comfy/cli_args.py DELETED
@@ -1,143 +0,0 @@
1
- import argparse
2
- import enum
3
- import comfy.options
4
-
5
- class EnumAction(argparse.Action):
6
- """
7
- Argparse action for handling Enums
8
- """
9
- def __init__(self, **kwargs):
10
- # Pop off the type value
11
- enum_type = kwargs.pop("type", None)
12
-
13
- # Ensure an Enum subclass is provided
14
- if enum_type is None:
15
- raise ValueError("type must be assigned an Enum when using EnumAction")
16
- if not issubclass(enum_type, enum.Enum):
17
- raise TypeError("type must be an Enum when using EnumAction")
18
-
19
- # Generate choices from the Enum
20
- choices = tuple(e.value for e in enum_type)
21
- kwargs.setdefault("choices", choices)
22
- kwargs.setdefault("metavar", f"[{','.join(list(choices))}]")
23
-
24
- super(EnumAction, self).__init__(**kwargs)
25
-
26
- self._enum = enum_type
27
-
28
- def __call__(self, parser, namespace, values, option_string=None):
29
- # Convert value back into an Enum
30
- value = self._enum(values)
31
- setattr(namespace, self.dest, value)
32
-
33
-
34
- parser = argparse.ArgumentParser()
35
-
36
- parser.add_argument("--listen", type=str, default="127.0.0.1", metavar="IP", nargs="?", const="0.0.0.0", help="Specify the IP address to listen on (default: 127.0.0.1). If --listen is provided without an argument, it defaults to 0.0.0.0. (listens on all)")
37
- parser.add_argument("--port", type=int, default=8188, help="Set the listen port.")
38
- parser.add_argument("--tls-keyfile", type=str, help="Path to TLS (SSL) key file. Enables TLS, makes app accessible at https://... requires --tls-certfile to function")
39
- parser.add_argument("--tls-certfile", type=str, help="Path to TLS (SSL) certificate file. Enables TLS, makes app accessible at https://... requires --tls-keyfile to function")
40
- parser.add_argument("--enable-cors-header", type=str, default=None, metavar="ORIGIN", nargs="?", const="*", help="Enable CORS (Cross-Origin Resource Sharing) with optional origin or allow all with default '*'.")
41
- parser.add_argument("--max-upload-size", type=float, default=100, help="Set the maximum upload size in MB.")
42
-
43
- parser.add_argument("--extra-model-paths-config", type=str, default=None, metavar="PATH", nargs='+', action='append', help="Load one or more extra_model_paths.yaml files.")
44
- parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory.")
45
- parser.add_argument("--temp-directory", type=str, default=None, help="Set the ComfyUI temp directory (default is in the ComfyUI directory).")
46
- parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory.")
47
- parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
48
- parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
49
- parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
50
- cm_group = parser.add_mutually_exclusive_group()
51
- cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
52
- cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.")
53
-
54
-
55
- fp_group = parser.add_mutually_exclusive_group()
56
- fp_group.add_argument("--force-fp32", action="store_true", help="Force fp32 (If this makes your GPU work better please report it).")
57
- fp_group.add_argument("--force-fp16", action="store_true", help="Force fp16.")
58
-
59
- fpunet_group = parser.add_mutually_exclusive_group()
60
- fpunet_group.add_argument("--bf16-unet", action="store_true", help="Run the UNET in bf16. This should only be used for testing stuff.")
61
- fpunet_group.add_argument("--fp16-unet", action="store_true", help="Store unet weights in fp16.")
62
- fpunet_group.add_argument("--fp8_e4m3fn-unet", action="store_true", help="Store unet weights in fp8_e4m3fn.")
63
- fpunet_group.add_argument("--fp8_e5m2-unet", action="store_true", help="Store unet weights in fp8_e5m2.")
64
-
65
- fpvae_group = parser.add_mutually_exclusive_group()
66
- fpvae_group.add_argument("--fp16-vae", action="store_true", help="Run the VAE in fp16, might cause black images.")
67
- fpvae_group.add_argument("--fp32-vae", action="store_true", help="Run the VAE in full precision fp32.")
68
- fpvae_group.add_argument("--bf16-vae", action="store_true", help="Run the VAE in bf16.")
69
-
70
- parser.add_argument("--cpu-vae", action="store_true", help="Run the VAE on the CPU.")
71
-
72
- fpte_group = parser.add_mutually_exclusive_group()
73
- fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Store text encoder weights in fp8 (e4m3fn variant).")
74
- fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).")
75
- fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
76
- fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
77
-
78
- parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")
79
-
80
- parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")
81
-
82
- parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize when loading models with Intel GPUs.")
83
-
84
- class LatentPreviewMethod(enum.Enum):
85
- NoPreviews = "none"
86
- Auto = "auto"
87
- Latent2RGB = "latent2rgb"
88
- TAESD = "taesd"
89
-
90
- parser.add_argument("--preview-method", type=LatentPreviewMethod, default=LatentPreviewMethod.NoPreviews, help="Default preview method for sampler nodes.", action=EnumAction)
91
-
92
- attn_group = parser.add_mutually_exclusive_group()
93
- attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
94
- attn_group.add_argument("--use-quad-cross-attention", action="store_true", help="Use the sub-quadratic cross attention optimization . Ignored when xformers is used.")
95
- attn_group.add_argument("--use-pytorch-cross-attention", action="store_true", help="Use the new pytorch 2.0 cross attention function.")
96
-
97
- parser.add_argument("--disable-xformers", action="store_true", help="Disable xformers.")
98
-
99
- upcast = parser.add_mutually_exclusive_group()
100
- upcast.add_argument("--force-upcast-attention", action="store_true", help="Force enable attention upcasting, please report if it fixes black images.")
101
- upcast.add_argument("--dont-upcast-attention", action="store_true", help="Disable all upcasting of attention. Should be unnecessary except for debugging.")
102
-
103
-
104
- vram_group = parser.add_mutually_exclusive_group()
105
- vram_group.add_argument("--gpu-only", action="store_true", help="Store and run everything (text encoders/CLIP models, etc... on the GPU).")
106
- vram_group.add_argument("--highvram", action="store_true", help="By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory.")
107
- vram_group.add_argument("--normalvram", action="store_true", help="Used to force normal vram use if lowvram gets automatically enabled.")
108
- vram_group.add_argument("--lowvram", action="store_true", help="Split the unet in parts to use less vram.")
109
- vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
110
- vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")
111
-
112
-
113
- parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
114
- parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
115
-
116
- parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
117
- parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
118
- parser.add_argument("--windows-standalone-build", action="store_true", help="Windows standalone build: Enable convenient things that most people using the standalone windows build will probably enjoy (like auto opening the page on startup).")
119
-
120
- parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.")
121
-
122
- parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")
123
-
124
- parser.add_argument("--verbose", action="store_true", help="Enables more debug prints.")
125
-
126
-
127
- if comfy.options.args_parsing:
128
- args = parser.parse_args()
129
- else:
130
- args = parser.parse_args([])
131
-
132
- if args.windows_standalone_build:
133
- args.auto_launch = True
134
-
135
- if args.disable_auto_launch:
136
- args.auto_launch = False
137
-
138
- import logging
139
- logging_level = logging.INFO
140
- if args.verbose:
141
- logging_level = logging.DEBUG
142
-
143
- logging.basicConfig(format="%(message)s", level=logging_level)
 
MagicQuill/comfy/clip_config_bigg.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "architectures": [
3
- "CLIPTextModel"
4
- ],
5
- "attention_dropout": 0.0,
6
- "bos_token_id": 0,
7
- "dropout": 0.0,
8
- "eos_token_id": 2,
9
- "hidden_act": "gelu",
10
- "hidden_size": 1280,
11
- "initializer_factor": 1.0,
12
- "initializer_range": 0.02,
13
- "intermediate_size": 5120,
14
- "layer_norm_eps": 1e-05,
15
- "max_position_embeddings": 77,
16
- "model_type": "clip_text_model",
17
- "num_attention_heads": 20,
18
- "num_hidden_layers": 32,
19
- "pad_token_id": 1,
20
- "projection_dim": 1280,
21
- "torch_dtype": "float32",
22
- "vocab_size": 49408
23
- }
 
MagicQuill/comfy/clip_model.py DELETED
@@ -1,194 +0,0 @@
1
- import torch
2
- from comfy.ldm.modules.attention import optimized_attention_for_device
3
-
4
- class CLIPAttention(torch.nn.Module):
5
- def __init__(self, embed_dim, heads, dtype, device, operations):
6
- super().__init__()
7
-
8
- self.heads = heads
9
- self.q_proj = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
10
- self.k_proj = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
11
- self.v_proj = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
12
-
13
- self.out_proj = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
14
-
15
- def forward(self, x, mask=None, optimized_attention=None):
16
- q = self.q_proj(x)
17
- k = self.k_proj(x)
18
- v = self.v_proj(x)
19
-
20
- out = optimized_attention(q, k, v, self.heads, mask)
21
- return self.out_proj(out)
22
-
23
- ACTIVATIONS = {"quick_gelu": lambda a: a * torch.sigmoid(1.702 * a),
24
- "gelu": torch.nn.functional.gelu,
25
- }
26
-
27
- class CLIPMLP(torch.nn.Module):
28
- def __init__(self, embed_dim, intermediate_size, activation, dtype, device, operations):
29
- super().__init__()
30
- self.fc1 = operations.Linear(embed_dim, intermediate_size, bias=True, dtype=dtype, device=device)
31
- self.activation = ACTIVATIONS[activation]
32
- self.fc2 = operations.Linear(intermediate_size, embed_dim, bias=True, dtype=dtype, device=device)
33
-
34
- def forward(self, x):
35
- x = self.fc1(x)
36
- x = self.activation(x)
37
- x = self.fc2(x)
38
- return x
39
-
40
- class CLIPLayer(torch.nn.Module):
41
- def __init__(self, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations):
42
- super().__init__()
43
- self.layer_norm1 = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
44
- self.self_attn = CLIPAttention(embed_dim, heads, dtype, device, operations)
45
- self.layer_norm2 = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
46
- self.mlp = CLIPMLP(embed_dim, intermediate_size, intermediate_activation, dtype, device, operations)
47
-
48
- def forward(self, x, mask=None, optimized_attention=None):
49
- x += self.self_attn(self.layer_norm1(x), mask, optimized_attention)
50
- x += self.mlp(self.layer_norm2(x))
51
- return x
52
-
53
-
54
- class CLIPEncoder(torch.nn.Module):
55
- def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations):
56
- super().__init__()
57
- self.layers = torch.nn.ModuleList([CLIPLayer(embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations) for i in range(num_layers)])
58
-
59
- def forward(self, x, mask=None, intermediate_output=None):
60
- optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)
61
-
62
- if intermediate_output is not None:
63
- if intermediate_output < 0:
64
- intermediate_output = len(self.layers) + intermediate_output
65
-
66
- intermediate = None
67
- for i, l in enumerate(self.layers):
68
- x = l(x, mask, optimized_attention)
69
- if i == intermediate_output:
70
- intermediate = x.clone()
71
- return x, intermediate
72
-
73
- class CLIPEmbeddings(torch.nn.Module):
74
- def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None):
75
- super().__init__()
76
- self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
77
- self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
78
-
79
- def forward(self, input_tokens):
80
- return self.token_embedding(input_tokens) + self.position_embedding.weight
81
-
82
-
83
- class CLIPTextModel_(torch.nn.Module):
84
- def __init__(self, config_dict, dtype, device, operations):
85
- num_layers = config_dict["num_hidden_layers"]
86
- embed_dim = config_dict["hidden_size"]
87
- heads = config_dict["num_attention_heads"]
88
- intermediate_size = config_dict["intermediate_size"]
89
- intermediate_activation = config_dict["hidden_act"]
90
-
91
- super().__init__()
92
- self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device)
93
- self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
94
- self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
95
-
96
- def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True):
97
- x = self.embeddings(input_tokens)
98
- mask = None
99
- if attention_mask is not None:
100
- mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
101
- mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
102
-
103
- causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
104
- if mask is not None:
105
- mask += causal_mask
106
- else:
107
- mask = causal_mask
108
-
109
- x, i = self.encoder(x, mask=mask, intermediate_output=intermediate_output)
110
- x = self.final_layer_norm(x)
111
- if i is not None and final_layer_norm_intermediate:
112
- i = self.final_layer_norm(i)
113
-
114
- pooled_output = x[torch.arange(x.shape[0], device=x.device), input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),]
115
- return x, i, pooled_output
116
-
117
- class CLIPTextModel(torch.nn.Module):
118
- def __init__(self, config_dict, dtype, device, operations):
119
- super().__init__()
120
- self.num_layers = config_dict["num_hidden_layers"]
121
- self.text_model = CLIPTextModel_(config_dict, dtype, device, operations)
122
- embed_dim = config_dict["hidden_size"]
123
- self.text_projection = operations.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device)
124
- self.text_projection.weight.copy_(torch.eye(embed_dim))
125
- self.dtype = dtype
126
-
127
- def get_input_embeddings(self):
128
- return self.text_model.embeddings.token_embedding
129
-
130
- def set_input_embeddings(self, embeddings):
131
- self.text_model.embeddings.token_embedding = embeddings
132
-
133
- def forward(self, *args, **kwargs):
134
- x = self.text_model(*args, **kwargs)
135
- out = self.text_projection(x[2])
136
- return (x[0], x[1], out, x[2])
137
-
138
-
139
- class CLIPVisionEmbeddings(torch.nn.Module):
140
- def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, dtype=None, device=None, operations=None):
141
- super().__init__()
142
- self.class_embedding = torch.nn.Parameter(torch.empty(embed_dim, dtype=dtype, device=device))
143
-
144
- self.patch_embedding = operations.Conv2d(
145
- in_channels=num_channels,
146
- out_channels=embed_dim,
147
- kernel_size=patch_size,
148
- stride=patch_size,
149
- bias=False,
150
- dtype=dtype,
151
- device=device
152
- )
153
-
154
- num_patches = (image_size // patch_size) ** 2
155
- num_positions = num_patches + 1
156
- self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
157
-
158
- def forward(self, pixel_values):
159
- embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
160
- return torch.cat([self.class_embedding.to(embeds.device).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + self.position_embedding.weight.to(embeds.device)
161
-
162
-
163
- class CLIPVision(torch.nn.Module):
164
- def __init__(self, config_dict, dtype, device, operations):
165
- super().__init__()
166
- num_layers = config_dict["num_hidden_layers"]
167
- embed_dim = config_dict["hidden_size"]
168
- heads = config_dict["num_attention_heads"]
169
- intermediate_size = config_dict["intermediate_size"]
170
- intermediate_activation = config_dict["hidden_act"]
171
-
172
- self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=torch.float32, device=device, operations=operations)
173
- self.pre_layrnorm = operations.LayerNorm(embed_dim)
174
- self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
175
- self.post_layernorm = operations.LayerNorm(embed_dim)
176
-
177
- def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
178
- x = self.embeddings(pixel_values)
179
- x = self.pre_layrnorm(x)
180
- #TODO: attention_mask?
181
- x, i = self.encoder(x, mask=None, intermediate_output=intermediate_output)
182
- pooled_output = self.post_layernorm(x[:, 0, :])
183
- return x, i, pooled_output
184
-
185
- class CLIPVisionModelProjection(torch.nn.Module):
186
- def __init__(self, config_dict, dtype, device, operations):
187
- super().__init__()
188
- self.vision_model = CLIPVision(config_dict, dtype, device, operations)
189
- self.visual_projection = operations.Linear(config_dict["hidden_size"], config_dict["projection_dim"], bias=False)
190
-
191
- def forward(self, *args, **kwargs):
192
- x = self.vision_model(*args, **kwargs)
193
- out = self.visual_projection(x[2])
194
- return (x[0], x[1], out)
 
MagicQuill/comfy/clip_vision.py DELETED
@@ -1,117 +0,0 @@
1
- from .utils import load_torch_file, transformers_convert, state_dict_prefix_replace
2
- import os
3
- import torch
4
- import json
5
- import logging
6
-
7
- import comfy.ops
8
- import comfy.model_patcher
9
- import comfy.model_management
10
- import comfy.utils
11
- import comfy.clip_model
12
-
13
- class Output:
14
- def __getitem__(self, key):
15
- return getattr(self, key)
16
- def __setitem__(self, key, item):
17
- setattr(self, key, item)
18
-
19
- def clip_preprocess(image, size=224):
20
- mean = torch.tensor([ 0.48145466,0.4578275,0.40821073], device=image.device, dtype=image.dtype)
21
- std = torch.tensor([0.26862954,0.26130258,0.27577711], device=image.device, dtype=image.dtype)
22
- image = image.movedim(-1, 1)
23
- if not (image.shape[2] == size and image.shape[3] == size):
24
- scale = (size / min(image.shape[2], image.shape[3]))
25
- image = torch.nn.functional.interpolate(image, size=(round(scale * image.shape[2]), round(scale * image.shape[3])), mode="bicubic", antialias=True)
26
- h = (image.shape[2] - size)//2
27
- w = (image.shape[3] - size)//2
28
- image = image[:,:,h:h+size,w:w+size]
29
- image = torch.clip((255. * image), 0, 255).round() / 255.0
30
- return (image - mean.view([3,1,1])) / std.view([3,1,1])
31
-
32
- class ClipVisionModel():
33
- def __init__(self, json_config):
34
- with open(json_config) as f:
35
- config = json.load(f)
36
-
37
- self.load_device = comfy.model_management.text_encoder_device()
38
- offload_device = comfy.model_management.text_encoder_offload_device()
39
- self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
40
- self.model = comfy.clip_model.CLIPVisionModelProjection(config, self.dtype, offload_device, comfy.ops.manual_cast)
41
- self.model.eval()
42
-
43
- self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
44
-
45
- def load_sd(self, sd):
46
- return self.model.load_state_dict(sd, strict=False)
47
-
48
- def get_sd(self):
49
- return self.model.state_dict()
50
-
51
- def encode_image(self, image):
52
- comfy.model_management.load_model_gpu(self.patcher)
53
- pixel_values = clip_preprocess(image.to(self.load_device)).float()
54
- out = self.model(pixel_values=pixel_values, intermediate_output=-2)
55
-
56
- outputs = Output()
57
- outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
58
- outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
59
- outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
60
- return outputs
61
-
62
- def convert_to_transformers(sd, prefix):
63
- sd_k = sd.keys()
64
- if "{}transformer.resblocks.0.attn.in_proj_weight".format(prefix) in sd_k:
65
- keys_to_replace = {
66
- "{}class_embedding".format(prefix): "vision_model.embeddings.class_embedding",
67
- "{}conv1.weight".format(prefix): "vision_model.embeddings.patch_embedding.weight",
68
- "{}positional_embedding".format(prefix): "vision_model.embeddings.position_embedding.weight",
69
- "{}ln_post.bias".format(prefix): "vision_model.post_layernorm.bias",
70
- "{}ln_post.weight".format(prefix): "vision_model.post_layernorm.weight",
71
- "{}ln_pre.bias".format(prefix): "vision_model.pre_layrnorm.bias",
72
- "{}ln_pre.weight".format(prefix): "vision_model.pre_layrnorm.weight",
73
- }
74
-
75
- for x in keys_to_replace:
76
- if x in sd_k:
77
- sd[keys_to_replace[x]] = sd.pop(x)
78
-
79
- if "{}proj".format(prefix) in sd_k:
80
- sd['visual_projection.weight'] = sd.pop("{}proj".format(prefix)).transpose(0, 1)
81
-
82
- sd = transformers_convert(sd, prefix, "vision_model.", 48)
83
- else:
84
- replace_prefix = {prefix: ""}
85
- sd = state_dict_prefix_replace(sd, replace_prefix)
86
- return sd
87
-
88
- def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
89
- if convert_keys:
90
- sd = convert_to_transformers(sd, prefix)
91
- if "vision_model.encoder.layers.47.layer_norm1.weight" in sd:
92
- json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json")
93
- elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
94
- json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
95
- elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
96
- json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
97
- else:
98
- return None
99
-
100
- clip = ClipVisionModel(json_config)
101
- m, u = clip.load_sd(sd)
102
- if len(m) > 0:
103
- logging.warning("missing clip vision: {}".format(m))
104
- u = set(u)
105
- keys = list(sd.keys())
106
- for k in keys:
107
- if k not in u:
108
- t = sd.pop(k)
109
- del t
110
- return clip
111
-
112
- def load(ckpt_path):
113
- sd = load_torch_file(ckpt_path)
114
- if "visual.transformer.resblocks.0.attn.in_proj_weight" in sd:
115
- return load_clipvision_from_sd(sd, prefix="visual.", convert_keys=True)
116
- else:
117
- return load_clipvision_from_sd(sd)
 
MagicQuill/comfy/clip_vision_config_g.json DELETED
@@ -1,18 +0,0 @@
1
- {
2
- "attention_dropout": 0.0,
3
- "dropout": 0.0,
4
- "hidden_act": "gelu",
5
- "hidden_size": 1664,
6
- "image_size": 224,
7
- "initializer_factor": 1.0,
8
- "initializer_range": 0.02,
9
- "intermediate_size": 8192,
10
- "layer_norm_eps": 1e-05,
11
- "model_type": "clip_vision_model",
12
- "num_attention_heads": 16,
13
- "num_channels": 3,
14
- "num_hidden_layers": 48,
15
- "patch_size": 14,
16
- "projection_dim": 1280,
17
- "torch_dtype": "float32"
18
- }
 
MagicQuill/comfy/clip_vision_config_h.json DELETED
@@ -1,18 +0,0 @@
1
- {
2
- "attention_dropout": 0.0,
3
- "dropout": 0.0,
4
- "hidden_act": "gelu",
5
- "hidden_size": 1280,
6
- "image_size": 224,
7
- "initializer_factor": 1.0,
8
- "initializer_range": 0.02,
9
- "intermediate_size": 5120,
10
- "layer_norm_eps": 1e-05,
11
- "model_type": "clip_vision_model",
12
- "num_attention_heads": 16,
13
- "num_channels": 3,
14
- "num_hidden_layers": 32,
15
- "patch_size": 14,
16
- "projection_dim": 1024,
17
- "torch_dtype": "float32"
18
- }
 
MagicQuill/comfy/clip_vision_config_vitl.json DELETED
@@ -1,18 +0,0 @@
1
- {
2
- "attention_dropout": 0.0,
3
- "dropout": 0.0,
4
- "hidden_act": "quick_gelu",
5
- "hidden_size": 1024,
6
- "image_size": 224,
7
- "initializer_factor": 1.0,
8
- "initializer_range": 0.02,
9
- "intermediate_size": 4096,
10
- "layer_norm_eps": 1e-05,
11
- "model_type": "clip_vision_model",
12
- "num_attention_heads": 16,
13
- "num_channels": 3,
14
- "num_hidden_layers": 24,
15
- "patch_size": 14,
16
- "projection_dim": 768,
17
- "torch_dtype": "float32"
18
- }
 
MagicQuill/comfy/conds.py DELETED
@@ -1,83 +0,0 @@
1
- import torch
2
- import math
3
- import comfy.utils
4
-
5
-
6
- def lcm(a, b): #TODO: eventually replace by math.lcm (added in python3.9)
7
- return abs(a*b) // math.gcd(a, b)
8
-
9
- class CONDRegular:
10
- def __init__(self, cond):
11
- self.cond = cond
12
-
13
- def _copy_with(self, cond):
14
- return self.__class__(cond)
15
-
16
- def process_cond(self, batch_size, device, **kwargs):
17
- return self._copy_with(comfy.utils.repeat_to_batch_size(self.cond, batch_size).to(device))
18
-
19
- def can_concat(self, other):
20
- if self.cond.shape != other.cond.shape:
21
- return False
22
- return True
23
-
24
- def concat(self, others):
25
- conds = [self.cond]
26
- for x in others:
27
- conds.append(x.cond)
28
- return torch.cat(conds)
29
-
30
- class CONDNoiseShape(CONDRegular):
31
- def process_cond(self, batch_size, device, area, **kwargs):
32
- data = self.cond
33
- if area is not None:
34
- dims = len(area) // 2
35
- for i in range(dims):
36
- data = data.narrow(i + 2, area[i + dims], area[i])
37
-
38
- return self._copy_with(comfy.utils.repeat_to_batch_size(data, batch_size).to(device))
39
-
40
-
41
- class CONDCrossAttn(CONDRegular):
42
- def can_concat(self, other):
43
- s1 = self.cond.shape
44
- s2 = other.cond.shape
45
- if s1 != s2:
46
- if s1[0] != s2[0] or s1[2] != s2[2]: #these 2 cases should not happen
47
- return False
48
-
49
- mult_min = lcm(s1[1], s2[1])
50
- diff = mult_min // min(s1[1], s2[1])
51
- if diff > 4: #arbitrary limit on the padding because it's probably going to impact performance negatively if it's too much
52
- return False
53
- return True
54
-
55
- def concat(self, others):
56
- conds = [self.cond]
57
- crossattn_max_len = self.cond.shape[1]
58
- for x in others:
59
- c = x.cond
60
- crossattn_max_len = lcm(crossattn_max_len, c.shape[1])
61
- conds.append(c)
62
-
63
- out = []
64
- for c in conds:
65
- if c.shape[1] < crossattn_max_len:
66
- c = c.repeat(1, crossattn_max_len // c.shape[1], 1) #padding with repeat doesn't change result
67
- out.append(c)
68
- return torch.cat(out)
69
-
70
- class CONDConstant(CONDRegular):
71
- def __init__(self, cond):
72
- self.cond = cond
73
-
74
- def process_cond(self, batch_size, device, **kwargs):
75
- return self._copy_with(self.cond)
76
-
77
- def can_concat(self, other):
78
- if self.cond != other.cond:
79
- return False
80
- return True
81
-
82
- def concat(self, others):
83
- return self.cond
 
MagicQuill/comfy/controlnet.py DELETED
@@ -1,554 +0,0 @@
1
- import torch
2
- import math
3
- import os
4
- import logging
5
- import comfy.utils
6
- import comfy.model_management
7
- import comfy.model_detection
8
- import comfy.model_patcher
9
- import comfy.ops
10
-
11
- import comfy.cldm.cldm
12
- import comfy.t2i_adapter.adapter
13
- import comfy.ldm.cascade.controlnet
14
-
15
-
16
- def broadcast_image_to(tensor, target_batch_size, batched_number):
17
- current_batch_size = tensor.shape[0]
18
- #print(current_batch_size, target_batch_size)
19
- if current_batch_size == 1:
20
- return tensor
21
-
22
- per_batch = target_batch_size // batched_number
23
- tensor = tensor[:per_batch]
24
-
25
- if per_batch > tensor.shape[0]:
26
- tensor = torch.cat([tensor] * (per_batch // tensor.shape[0]) + [tensor[:(per_batch % tensor.shape[0])]], dim=0)
27
-
28
- current_batch_size = tensor.shape[0]
29
- if current_batch_size == target_batch_size:
30
- return tensor
31
- else:
32
- return torch.cat([tensor] * batched_number, dim=0)
33
-
34
- class ControlBase:
35
- def __init__(self, device=None):
36
- self.cond_hint_original = None
37
- self.cond_hint = None
38
- self.strength = 1.0
39
- self.timestep_percent_range = (0.0, 1.0)
40
- self.global_average_pooling = False
41
- self.timestep_range = None
42
- self.compression_ratio = 8
43
- self.upscale_algorithm = 'nearest-exact'
44
-
45
- if device is None:
46
- device = comfy.model_management.get_torch_device()
47
- self.device = device
48
- self.previous_controlnet = None
49
-
50
- def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0)):
51
- self.cond_hint_original = cond_hint
52
- self.strength = strength
53
- self.timestep_percent_range = timestep_percent_range
54
- return self
55
-
56
- def pre_run(self, model, percent_to_timestep_function):
57
- self.timestep_range = (percent_to_timestep_function(self.timestep_percent_range[0]), percent_to_timestep_function(self.timestep_percent_range[1]))
58
- if self.previous_controlnet is not None:
59
- self.previous_controlnet.pre_run(model, percent_to_timestep_function)
60
-
61
- def set_previous_controlnet(self, controlnet):
62
- self.previous_controlnet = controlnet
63
- return self
64
-
65
- def cleanup(self):
66
- if self.previous_controlnet is not None:
67
- self.previous_controlnet.cleanup()
68
- if self.cond_hint is not None:
69
- del self.cond_hint
70
- self.cond_hint = None
71
- self.timestep_range = None
72
-
73
- def get_models(self):
74
- out = []
75
- if self.previous_controlnet is not None:
76
- out += self.previous_controlnet.get_models()
77
- return out
78
-
79
- def copy_to(self, c):
80
- c.cond_hint_original = self.cond_hint_original
81
- c.strength = self.strength
82
- c.timestep_percent_range = self.timestep_percent_range
83
- c.global_average_pooling = self.global_average_pooling
84
- c.compression_ratio = self.compression_ratio
85
- c.upscale_algorithm = self.upscale_algorithm
86
-
87
- def inference_memory_requirements(self, dtype):
88
- if self.previous_controlnet is not None:
89
- return self.previous_controlnet.inference_memory_requirements(dtype)
90
- return 0
91
-
92
- def control_merge(self, control_input, control_output, control_prev, output_dtype):
93
- out = {'input':[], 'middle':[], 'output': []}
94
-
95
- if control_input is not None:
96
- for i in range(len(control_input)):
97
- key = 'input'
98
- x = control_input[i]
99
- if x is not None:
100
- x *= self.strength
101
- if x.dtype != output_dtype:
102
- x = x.to(output_dtype)
103
- out[key].insert(0, x)
104
-
105
- if control_output is not None:
106
- for i in range(len(control_output)):
107
- if i == (len(control_output) - 1):
108
- key = 'middle'
109
- index = 0
110
- else:
111
- key = 'output'
112
- index = i
113
- x = control_output[i]
114
- if x is not None:
115
- if self.global_average_pooling:
116
- x = torch.mean(x, dim=(2, 3), keepdim=True).repeat(1, 1, x.shape[2], x.shape[3])
117
-
118
- x *= self.strength
119
- if x.dtype != output_dtype:
120
- x = x.to(output_dtype)
121
-
122
- out[key].append(x)
123
- if control_prev is not None:
124
- for x in ['input', 'middle', 'output']:
125
- o = out[x]
126
- for i in range(len(control_prev[x])):
127
- prev_val = control_prev[x][i]
128
- if i >= len(o):
129
- o.append(prev_val)
130
- elif prev_val is not None:
131
- if o[i] is None:
132
- o[i] = prev_val
133
- else:
134
- if o[i].shape[0] < prev_val.shape[0]:
135
- o[i] = prev_val + o[i]
136
- else:
137
- o[i] += prev_val
138
- return out
139
-
140
- class ControlNet(ControlBase):
141
- def __init__(self, control_model=None, global_average_pooling=False, device=None, load_device=None, manual_cast_dtype=None):
142
- super().__init__(device)
143
- self.control_model = control_model
144
- self.load_device = load_device
145
- if control_model is not None:
146
- self.control_model_wrapped = comfy.model_patcher.ModelPatcher(self.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device())
147
-
148
- self.global_average_pooling = global_average_pooling
149
- self.model_sampling_current = None
150
- self.manual_cast_dtype = manual_cast_dtype
151
-
152
- def get_control(self, x_noisy, t, cond, batched_number):
153
- control_prev = None
154
- if self.previous_controlnet is not None:
155
- control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number)
156
-
157
- if self.timestep_range is not None:
158
- if t[0] > self.timestep_range[0] or t[0] < self.timestep_range[1]:
159
- if control_prev is not None:
160
- return control_prev
161
- else:
162
- return None
163
-
164
- dtype = self.control_model.dtype
165
- if self.manual_cast_dtype is not None:
166
- dtype = self.manual_cast_dtype
167
-
168
- output_dtype = x_noisy.dtype
169
- if self.cond_hint is None or x_noisy.shape[2] * self.compression_ratio != self.cond_hint.shape[2] or x_noisy.shape[3] * self.compression_ratio != self.cond_hint.shape[3]:
170
- if self.cond_hint is not None:
171
- del self.cond_hint
172
- self.cond_hint = None
173
- self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * self.compression_ratio, x_noisy.shape[2] * self.compression_ratio, self.upscale_algorithm, "center").to(dtype).to(self.device)
174
- if x_noisy.shape[0] != self.cond_hint.shape[0]:
175
- self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
176
-
177
- context = cond.get('crossattn_controlnet', cond['c_crossattn'])
178
- y = cond.get('y', None)
179
- if y is not None:
180
- y = y.to(dtype)
181
- timestep = self.model_sampling_current.timestep(t)
182
- x_noisy = self.model_sampling_current.calculate_input(t, x_noisy)
183
-
184
- control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.float(), context=context.to(dtype), y=y)
185
- return self.control_merge(None, control, control_prev, output_dtype)
186
-
187
- def copy(self):
188
- c = ControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
189
- c.control_model = self.control_model
190
- c.control_model_wrapped = self.control_model_wrapped
191
- self.copy_to(c)
192
- return c
193
-
194
- def get_models(self):
195
- out = super().get_models()
196
- out.append(self.control_model_wrapped)
197
- return out
198
-
199
- def pre_run(self, model, percent_to_timestep_function):
200
- super().pre_run(model, percent_to_timestep_function)
201
- self.model_sampling_current = model.model_sampling
202
-
203
- def cleanup(self):
204
- self.model_sampling_current = None
205
- super().cleanup()
206
-
207
- class ControlLoraOps:
208
- class Linear(torch.nn.Module, comfy.ops.CastWeightBiasOp):
209
- def __init__(self, in_features: int, out_features: int, bias: bool = True,
210
- device=None, dtype=None) -> None:
211
- factory_kwargs = {'device': device, 'dtype': dtype}
212
- super().__init__()
213
- self.in_features = in_features
214
- self.out_features = out_features
215
- self.weight = None
216
- self.up = None
217
- self.down = None
218
- self.bias = None
219
-
220
- def forward(self, input):
221
- weight, bias = comfy.ops.cast_bias_weight(self, input)
222
- if self.up is not None:
223
- return torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias)
224
- else:
225
- return torch.nn.functional.linear(input, weight, bias)
226
-
227
- class Conv2d(torch.nn.Module, comfy.ops.CastWeightBiasOp):
228
- def __init__(
229
- self,
230
- in_channels,
231
- out_channels,
232
- kernel_size,
233
- stride=1,
234
- padding=0,
235
- dilation=1,
236
- groups=1,
237
- bias=True,
238
- padding_mode='zeros',
239
- device=None,
240
- dtype=None
241
- ):
242
- super().__init__()
243
- self.in_channels = in_channels
244
- self.out_channels = out_channels
245
- self.kernel_size = kernel_size
246
- self.stride = stride
247
- self.padding = padding
248
- self.dilation = dilation
249
- self.transposed = False
250
- self.output_padding = 0
251
- self.groups = groups
252
- self.padding_mode = padding_mode
253
-
254
- self.weight = None
255
- self.bias = None
256
- self.up = None
257
- self.down = None
258
-
259
-
260
- def forward(self, input):
261
- weight, bias = comfy.ops.cast_bias_weight(self, input)
262
- if self.up is not None:
263
- return torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups)
264
- else:
265
- return torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
266
-
267
-
268
- class ControlLora(ControlNet):
269
- def __init__(self, control_weights, global_average_pooling=False, device=None):
270
- ControlBase.__init__(self, device)
271
- self.control_weights = control_weights
272
- self.global_average_pooling = global_average_pooling
273
-
274
- def pre_run(self, model, percent_to_timestep_function):
275
- super().pre_run(model, percent_to_timestep_function)
276
- controlnet_config = model.model_config.unet_config.copy()
277
- controlnet_config.pop("out_channels")
278
- controlnet_config["hint_channels"] = self.control_weights["input_hint_block.0.weight"].shape[1]
279
- self.manual_cast_dtype = model.manual_cast_dtype
280
- dtype = model.get_dtype()
281
- if self.manual_cast_dtype is None:
282
- class control_lora_ops(ControlLoraOps, comfy.ops.disable_weight_init):
283
- pass
284
- else:
285
- class control_lora_ops(ControlLoraOps, comfy.ops.manual_cast):
286
- pass
287
- dtype = self.manual_cast_dtype
288
-
289
- controlnet_config["operations"] = control_lora_ops
290
- controlnet_config["dtype"] = dtype
291
- self.control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
292
- self.control_model.to(comfy.model_management.get_torch_device())
293
- diffusion_model = model.diffusion_model
294
- sd = diffusion_model.state_dict()
295
- cm = self.control_model.state_dict()
296
-
297
- for k in sd:
298
- weight = sd[k]
299
- try:
300
- comfy.utils.set_attr_param(self.control_model, k, weight)
301
- except:
302
- pass
303
-
304
- for k in self.control_weights:
305
- if k not in {"lora_controlnet"}:
306
- comfy.utils.set_attr_param(self.control_model, k, self.control_weights[k].to(dtype).to(comfy.model_management.get_torch_device()))
307
-
308
- def copy(self):
309
- c = ControlLora(self.control_weights, global_average_pooling=self.global_average_pooling)
310
- self.copy_to(c)
311
- return c
312
-
313
- def cleanup(self):
314
- del self.control_model
315
- self.control_model = None
316
- super().cleanup()
317
-
318
- def get_models(self):
319
- out = ControlBase.get_models(self)
320
- return out
321
-
322
- def inference_memory_requirements(self, dtype):
323
- return comfy.utils.calculate_parameters(self.control_weights) * comfy.model_management.dtype_size(dtype) + ControlBase.inference_memory_requirements(self, dtype)
324
-
325
- def load_controlnet(ckpt_path, model=None):
326
- controlnet_data = comfy.utils.load_torch_file(ckpt_path, safe_load=True)
327
- if "lora_controlnet" in controlnet_data:
328
- return ControlLora(controlnet_data)
329
-
330
- controlnet_config = None
331
- supported_inference_dtypes = None
332
-
333
- if "controlnet_cond_embedding.conv_in.weight" in controlnet_data: #diffusers format
334
- controlnet_config = comfy.model_detection.unet_config_from_diffusers_unet(controlnet_data)
335
- diffusers_keys = comfy.utils.unet_to_diffusers(controlnet_config)
336
- diffusers_keys["controlnet_mid_block.weight"] = "middle_block_out.0.weight"
337
- diffusers_keys["controlnet_mid_block.bias"] = "middle_block_out.0.bias"
338
-
339
- count = 0
340
- loop = True
341
- while loop:
342
- suffix = [".weight", ".bias"]
343
- for s in suffix:
344
- k_in = "controlnet_down_blocks.{}{}".format(count, s)
345
- k_out = "zero_convs.{}.0{}".format(count, s)
346
- if k_in not in controlnet_data:
347
- loop = False
348
- break
349
- diffusers_keys[k_in] = k_out
350
- count += 1
351
-
352
- count = 0
353
- loop = True
354
- while loop:
355
- suffix = [".weight", ".bias"]
356
- for s in suffix:
357
- if count == 0:
358
- k_in = "controlnet_cond_embedding.conv_in{}".format(s)
359
- else:
360
- k_in = "controlnet_cond_embedding.blocks.{}{}".format(count - 1, s)
361
- k_out = "input_hint_block.{}{}".format(count * 2, s)
362
- if k_in not in controlnet_data:
363
- k_in = "controlnet_cond_embedding.conv_out{}".format(s)
364
- loop = False
365
- diffusers_keys[k_in] = k_out
366
- count += 1
367
-
368
- new_sd = {}
369
- for k in diffusers_keys:
370
- if k in controlnet_data:
371
- new_sd[diffusers_keys[k]] = controlnet_data.pop(k)
372
-
373
- leftover_keys = controlnet_data.keys()
374
- if len(leftover_keys) > 0:
375
- logging.warning("leftover keys: {}".format(leftover_keys))
376
- controlnet_data = new_sd
377
-
378
- pth_key = 'control_model.zero_convs.0.0.weight'
379
- pth = False
380
- key = 'zero_convs.0.0.weight'
381
- if pth_key in controlnet_data:
382
- pth = True
383
- key = pth_key
384
- prefix = "control_model."
385
- elif key in controlnet_data:
386
- prefix = ""
387
- else:
388
- net = load_t2i_adapter(controlnet_data)
389
- if net is None:
390
- logging.error("error checkpoint does not contain controlnet or t2i adapter data {}".format(ckpt_path))
391
- return net
392
-
393
- if controlnet_config is None:
394
- model_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, True)
395
- supported_inference_dtypes = model_config.supported_inference_dtypes
396
- controlnet_config = model_config.unet_config
397
-
398
- load_device = comfy.model_management.get_torch_device()
399
- if supported_inference_dtypes is None:
400
- unet_dtype = comfy.model_management.unet_dtype()
401
- else:
402
- unet_dtype = comfy.model_management.unet_dtype(supported_dtypes=supported_inference_dtypes)
403
-
404
- manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
405
- if manual_cast_dtype is not None:
406
- controlnet_config["operations"] = comfy.ops.manual_cast
407
- controlnet_config["dtype"] = unet_dtype
408
- controlnet_config.pop("out_channels")
409
- controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1]
410
- control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
411
-
412
- if pth:
413
- if 'difference' in controlnet_data:
414
- if model is not None:
415
- comfy.model_management.load_models_gpu([model])
416
- model_sd = model.model_state_dict()
417
- for x in controlnet_data:
418
- c_m = "control_model."
419
- if x.startswith(c_m):
420
- sd_key = "diffusion_model.{}".format(x[len(c_m):])
421
- if sd_key in model_sd:
422
- cd = controlnet_data[x]
423
- cd += model_sd[sd_key].type(cd.dtype).to(cd.device)
424
- else:
425
- logging.warning("WARNING: Loaded a diff controlnet without a model. It will very likely not work.")
426
-
427
- class WeightsLoader(torch.nn.Module):
428
- pass
429
- w = WeightsLoader()
430
- w.control_model = control_model
431
- missing, unexpected = w.load_state_dict(controlnet_data, strict=False)
432
- else:
433
- missing, unexpected = control_model.load_state_dict(controlnet_data, strict=False)
434
-
435
- if len(missing) > 0:
436
- logging.warning("missing controlnet keys: {}".format(missing))
437
-
438
- if len(unexpected) > 0:
439
- logging.debug("unexpected controlnet keys: {}".format(unexpected))
440
-
441
- global_average_pooling = False
442
- filename = os.path.splitext(ckpt_path)[0]
443
- if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
444
- global_average_pooling = True
445
-
446
- control = ControlNet(control_model, global_average_pooling=global_average_pooling, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
447
- return control
448
-
449
- class T2IAdapter(ControlBase):
450
- def __init__(self, t2i_model, channels_in, compression_ratio, upscale_algorithm, device=None):
451
- super().__init__(device)
452
- self.t2i_model = t2i_model
453
- self.channels_in = channels_in
454
- self.control_input = None
455
- self.compression_ratio = compression_ratio
456
- self.upscale_algorithm = upscale_algorithm
457
-
458
- def scale_image_to(self, width, height):
459
- unshuffle_amount = self.t2i_model.unshuffle_amount
460
- width = math.ceil(width / unshuffle_amount) * unshuffle_amount
461
- height = math.ceil(height / unshuffle_amount) * unshuffle_amount
462
- return width, height
463
-
464
- def get_control(self, x_noisy, t, cond, batched_number):
465
- control_prev = None
466
- if self.previous_controlnet is not None:
467
- control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number)
468
-
469
- if self.timestep_range is not None:
470
- if t[0] > self.timestep_range[0] or t[0] < self.timestep_range[1]:
471
- if control_prev is not None:
472
- return control_prev
473
- else:
474
- return None
475
-
476
- if self.cond_hint is None or x_noisy.shape[2] * self.compression_ratio != self.cond_hint.shape[2] or x_noisy.shape[3] * self.compression_ratio != self.cond_hint.shape[3]:
477
- if self.cond_hint is not None:
478
- del self.cond_hint
479
- self.control_input = None
480
- self.cond_hint = None
481
- width, height = self.scale_image_to(x_noisy.shape[3] * self.compression_ratio, x_noisy.shape[2] * self.compression_ratio)
482
- self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, width, height, self.upscale_algorithm, "center").float().to(self.device)
483
- if self.channels_in == 1 and self.cond_hint.shape[1] > 1:
484
- self.cond_hint = torch.mean(self.cond_hint, 1, keepdim=True)
485
- if x_noisy.shape[0] != self.cond_hint.shape[0]:
486
- self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
487
- if self.control_input is None:
488
- self.t2i_model.to(x_noisy.dtype)
489
- self.t2i_model.to(self.device)
490
- self.control_input = self.t2i_model(self.cond_hint.to(x_noisy.dtype))
491
- self.t2i_model.cpu()
492
-
493
- control_input = list(map(lambda a: None if a is None else a.clone(), self.control_input))
494
- mid = None
495
- if self.t2i_model.xl == True:
496
- mid = control_input[-1:]
497
- control_input = control_input[:-1]
498
- return self.control_merge(control_input, mid, control_prev, x_noisy.dtype)
499
-
500
- def copy(self):
501
- c = T2IAdapter(self.t2i_model, self.channels_in, self.compression_ratio, self.upscale_algorithm)
502
- self.copy_to(c)
503
- return c
504
-
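`scale_image_to` above rounds the hint resolution up to a multiple of the adapter's pixel-unshuffle factor before the hint is resized. A minimal standalone sketch of that rounding (the function name here is illustrative):

```python
import math

def scale_to_multiple(width, height, unshuffle_amount):
    # Round both dimensions up to the next multiple of unshuffle_amount.
    width = math.ceil(width / unshuffle_amount) * unshuffle_amount
    height = math.ceil(height / unshuffle_amount) * unshuffle_amount
    return width, height

print(scale_to_multiple(514, 766, 8))  # (520, 768)
```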
505
- def load_t2i_adapter(t2i_data):
506
- compression_ratio = 8
507
- upscale_algorithm = 'nearest-exact'
508
-
509
- if 'adapter' in t2i_data:
510
- t2i_data = t2i_data['adapter']
511
- if 'adapter.body.0.resnets.0.block1.weight' in t2i_data: #diffusers format
512
- prefix_replace = {}
513
- for i in range(4):
514
- for j in range(2):
515
- prefix_replace["adapter.body.{}.resnets.{}.".format(i, j)] = "body.{}.".format(i * 2 + j)
516
- prefix_replace["adapter.body.{}.".format(i, j)] = "body.{}.".format(i * 2)
517
- prefix_replace["adapter."] = ""
518
- t2i_data = comfy.utils.state_dict_prefix_replace(t2i_data, prefix_replace)
519
- keys = t2i_data.keys()
520
-
521
- if "body.0.in_conv.weight" in keys:
522
- cin = t2i_data['body.0.in_conv.weight'].shape[1]
523
- model_ad = comfy.t2i_adapter.adapter.Adapter_light(cin=cin, channels=[320, 640, 1280, 1280], nums_rb=4)
524
- elif 'conv_in.weight' in keys:
525
- cin = t2i_data['conv_in.weight'].shape[1]
526
- channel = t2i_data['conv_in.weight'].shape[0]
527
- ksize = t2i_data['body.0.block2.weight'].shape[2]
528
- use_conv = False
529
- down_opts = list(filter(lambda a: a.endswith("down_opt.op.weight"), keys))
530
- if len(down_opts) > 0:
531
- use_conv = True
532
- xl = False
533
- if cin == 256 or cin == 768:
534
- xl = True
535
- model_ad = comfy.t2i_adapter.adapter.Adapter(cin=cin, channels=[channel, channel*2, channel*4, channel*4][:4], nums_rb=2, ksize=ksize, sk=True, use_conv=use_conv, xl=xl)
536
- elif "backbone.0.0.weight" in keys:
537
- model_ad = comfy.ldm.cascade.controlnet.ControlNet(c_in=t2i_data['backbone.0.0.weight'].shape[1], proj_blocks=[0, 4, 8, 12, 51, 55, 59, 63])
538
- compression_ratio = 32
539
- upscale_algorithm = 'bilinear'
540
- elif "backbone.10.blocks.0.weight" in keys:
541
- model_ad = comfy.ldm.cascade.controlnet.ControlNet(c_in=t2i_data['backbone.0.weight'].shape[1], bottleneck_mode="large", proj_blocks=[0, 4, 8, 12, 51, 55, 59, 63])
542
- compression_ratio = 1
543
- upscale_algorithm = 'nearest-exact'
544
- else:
545
- return None
546
-
547
- missing, unexpected = model_ad.load_state_dict(t2i_data)
548
- if len(missing) > 0:
549
- logging.warning("t2i missing {}".format(missing))
550
-
551
- if len(unexpected) > 0:
552
- logging.debug("t2i unexpected {}".format(unexpected))
553
-
554
- return T2IAdapter(model_ad, model_ad.input_channels, compression_ratio, upscale_algorithm)
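Both `load_controlnet` and `load_t2i_adapter` above handle diffusers-format checkpoints by renaming state-dict keys before loading them into the internal model. A minimal sketch of that renaming step on a toy state dict (the helper and tensors below are illustrative, not the deleted code):

```python
import torch

def remap_state_dict(sd, key_map):
    # Rename keys according to key_map; keys without a mapping are kept as-is.
    return {key_map.get(k, k): v for k, v in sd.items()}

toy_sd = {
    "controlnet_down_blocks.0.weight": torch.zeros(4),
    "controlnet_down_blocks.0.bias": torch.zeros(4),
    "controlnet_mid_block.weight": torch.zeros(4),
}
# Mirrors the loops in load_controlnet: controlnet_down_blocks.N -> zero_convs.N.0,
# controlnet_mid_block -> middle_block_out.0.
key_map = {
    "controlnet_down_blocks.0.weight": "zero_convs.0.0.weight",
    "controlnet_down_blocks.0.bias": "zero_convs.0.0.bias",
    "controlnet_mid_block.weight": "middle_block_out.0.weight",
}
print(sorted(remap_state_dict(toy_sd, key_map)))
```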
MagicQuill/comfy/diffusers_convert.py DELETED
@@ -1,281 +0,0 @@
1
- import re
2
- import torch
3
- import logging
4
-
5
- # conversion code from https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
6
-
7
- # =================#
8
- # UNet Conversion #
9
- # =================#
10
-
11
- unet_conversion_map = [
12
- # (stable-diffusion, HF Diffusers)
13
- ("time_embed.0.weight", "time_embedding.linear_1.weight"),
14
- ("time_embed.0.bias", "time_embedding.linear_1.bias"),
15
- ("time_embed.2.weight", "time_embedding.linear_2.weight"),
16
- ("time_embed.2.bias", "time_embedding.linear_2.bias"),
17
- ("input_blocks.0.0.weight", "conv_in.weight"),
18
- ("input_blocks.0.0.bias", "conv_in.bias"),
19
- ("out.0.weight", "conv_norm_out.weight"),
20
- ("out.0.bias", "conv_norm_out.bias"),
21
- ("out.2.weight", "conv_out.weight"),
22
- ("out.2.bias", "conv_out.bias"),
23
- ]
24
-
25
- unet_conversion_map_resnet = [
26
- # (stable-diffusion, HF Diffusers)
27
- ("in_layers.0", "norm1"),
28
- ("in_layers.2", "conv1"),
29
- ("out_layers.0", "norm2"),
30
- ("out_layers.3", "conv2"),
31
- ("emb_layers.1", "time_emb_proj"),
32
- ("skip_connection", "conv_shortcut"),
33
- ]
34
-
35
- unet_conversion_map_layer = []
36
- # hardcoded number of downblocks and resnets/attentions...
37
- # would need smarter logic for other networks.
38
- for i in range(4):
39
- # loop over downblocks/upblocks
40
-
41
- for j in range(2):
42
- # loop over resnets/attentions for downblocks
43
- hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
44
- sd_down_res_prefix = f"input_blocks.{3 * i + j + 1}.0."
45
- unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
46
-
47
- if i < 3:
48
- # no attention layers in down_blocks.3
49
- hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
50
- sd_down_atn_prefix = f"input_blocks.{3 * i + j + 1}.1."
51
- unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
52
-
53
- for j in range(3):
54
- # loop over resnets/attentions for upblocks
55
- hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
56
- sd_up_res_prefix = f"output_blocks.{3 * i + j}.0."
57
- unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
58
-
59
- if i > 0:
60
- # no attention layers in up_blocks.0
61
- hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
62
- sd_up_atn_prefix = f"output_blocks.{3 * i + j}.1."
63
- unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
64
-
65
- if i < 3:
66
- # no downsample in down_blocks.3
67
- hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
68
- sd_downsample_prefix = f"input_blocks.{3 * (i + 1)}.0.op."
69
- unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
70
-
71
- # no upsample in up_blocks.3
72
- hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
73
- sd_upsample_prefix = f"output_blocks.{3 * i + 2}.{1 if i == 0 else 2}."
74
- unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
75
-
76
- hf_mid_atn_prefix = "mid_block.attentions.0."
77
- sd_mid_atn_prefix = "middle_block.1."
78
- unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
79
-
80
- for j in range(2):
81
- hf_mid_res_prefix = f"mid_block.resnets.{j}."
82
- sd_mid_res_prefix = f"middle_block.{2 * j}."
83
- unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
84
-
85
-
86
- def convert_unet_state_dict(unet_state_dict):
87
- # buyer beware: this is a *brittle* function,
88
- # and correct output requires that all of these pieces interact in
89
- # the exact order in which I have arranged them.
90
- mapping = {k: k for k in unet_state_dict.keys()}
91
- for sd_name, hf_name in unet_conversion_map:
92
- mapping[hf_name] = sd_name
93
- for k, v in mapping.items():
94
- if "resnets" in k:
95
- for sd_part, hf_part in unet_conversion_map_resnet:
96
- v = v.replace(hf_part, sd_part)
97
- mapping[k] = v
98
- for k, v in mapping.items():
99
- for sd_part, hf_part in unet_conversion_map_layer:
100
- v = v.replace(hf_part, sd_part)
101
- mapping[k] = v
102
- new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
103
- return new_state_dict
104
-
105
-
106
- # ================#
107
- # VAE Conversion #
108
- # ================#
109
-
110
- vae_conversion_map = [
111
- # (stable-diffusion, HF Diffusers)
112
- ("nin_shortcut", "conv_shortcut"),
113
- ("norm_out", "conv_norm_out"),
114
- ("mid.attn_1.", "mid_block.attentions.0."),
115
- ]
116
-
117
- for i in range(4):
118
- # down_blocks have two resnets
119
- for j in range(2):
120
- hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
121
- sd_down_prefix = f"encoder.down.{i}.block.{j}."
122
- vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
123
-
124
- if i < 3:
125
- hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
126
- sd_downsample_prefix = f"down.{i}.downsample."
127
- vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
128
-
129
- hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
130
- sd_upsample_prefix = f"up.{3 - i}.upsample."
131
- vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
132
-
133
- # up_blocks have three resnets
134
- # also, up blocks in hf are numbered in reverse from sd
135
- for j in range(3):
136
- hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
137
- sd_up_prefix = f"decoder.up.{3 - i}.block.{j}."
138
- vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
139
-
140
- # this part accounts for mid blocks in both the encoder and the decoder
141
- for i in range(2):
142
- hf_mid_res_prefix = f"mid_block.resnets.{i}."
143
- sd_mid_res_prefix = f"mid.block_{i + 1}."
144
- vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
145
-
146
- vae_conversion_map_attn = [
147
- # (stable-diffusion, HF Diffusers)
148
- ("norm.", "group_norm."),
149
- ("q.", "query."),
150
- ("k.", "key."),
151
- ("v.", "value."),
152
- ("q.", "to_q."),
153
- ("k.", "to_k."),
154
- ("v.", "to_v."),
155
- ("proj_out.", "to_out.0."),
156
- ("proj_out.", "proj_attn."),
157
- ]
158
-
159
-
160
- def reshape_weight_for_sd(w):
161
- # convert HF linear weights to SD conv2d weights
162
- return w.reshape(*w.shape, 1, 1)
163
-
164
-
165
- def convert_vae_state_dict(vae_state_dict):
166
- mapping = {k: k for k in vae_state_dict.keys()}
167
- for k, v in mapping.items():
168
- for sd_part, hf_part in vae_conversion_map:
169
- v = v.replace(hf_part, sd_part)
170
- mapping[k] = v
171
- for k, v in mapping.items():
172
- if "attentions" in k:
173
- for sd_part, hf_part in vae_conversion_map_attn:
174
- v = v.replace(hf_part, sd_part)
175
- mapping[k] = v
176
- new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
177
- weights_to_convert = ["q", "k", "v", "proj_out"]
178
- for k, v in new_state_dict.items():
179
- for weight_name in weights_to_convert:
180
- if f"mid.attn_1.{weight_name}.weight" in k:
181
- logging.debug(f"Reshaping {k} for SD format")
182
- new_state_dict[k] = reshape_weight_for_sd(v)
183
- return new_state_dict
184
-
185
-
186
- # =========================#
187
- # Text Encoder Conversion #
188
- # =========================#
189
-
190
-
191
- textenc_conversion_lst = [
192
- # (stable-diffusion, HF Diffusers)
193
- ("resblocks.", "text_model.encoder.layers."),
194
- ("ln_1", "layer_norm1"),
195
- ("ln_2", "layer_norm2"),
196
- (".c_fc.", ".fc1."),
197
- (".c_proj.", ".fc2."),
198
- (".attn", ".self_attn"),
199
- ("ln_final.", "transformer.text_model.final_layer_norm."),
200
- ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
201
- ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
202
- ]
203
- protected = {re.escape(x[1]): x[0] for x in textenc_conversion_lst}
204
- textenc_pattern = re.compile("|".join(protected.keys()))
205
-
206
- # Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
207
- code2idx = {"q": 0, "k": 1, "v": 2}
208
-
209
- # This function exists because at the time of writing torch.cat can't do fp8 with cuda
210
- def cat_tensors(tensors):
211
- x = 0
212
- for t in tensors:
213
- x += t.shape[0]
214
-
215
- shape = [x] + list(tensors[0].shape)[1:]
216
- out = torch.empty(shape, device=tensors[0].device, dtype=tensors[0].dtype)
217
-
218
- x = 0
219
- for t in tensors:
220
- out[x:x + t.shape[0]] = t
221
- x += t.shape[0]
222
-
223
- return out
224
-
225
- def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):
226
- new_state_dict = {}
227
- capture_qkv_weight = {}
228
- capture_qkv_bias = {}
229
- for k, v in text_enc_dict.items():
230
- if not k.startswith(prefix):
231
- continue
232
- if (
233
- k.endswith(".self_attn.q_proj.weight")
234
- or k.endswith(".self_attn.k_proj.weight")
235
- or k.endswith(".self_attn.v_proj.weight")
236
- ):
237
- k_pre = k[: -len(".q_proj.weight")]
238
- k_code = k[-len("q_proj.weight")]
239
- if k_pre not in capture_qkv_weight:
240
- capture_qkv_weight[k_pre] = [None, None, None]
241
- capture_qkv_weight[k_pre][code2idx[k_code]] = v
242
- continue
243
-
244
- if (
245
- k.endswith(".self_attn.q_proj.bias")
246
- or k.endswith(".self_attn.k_proj.bias")
247
- or k.endswith(".self_attn.v_proj.bias")
248
- ):
249
- k_pre = k[: -len(".q_proj.bias")]
250
- k_code = k[-len("q_proj.bias")]
251
- if k_pre not in capture_qkv_bias:
252
- capture_qkv_bias[k_pre] = [None, None, None]
253
- capture_qkv_bias[k_pre][code2idx[k_code]] = v
254
- continue
255
-
256
- text_proj = "transformer.text_projection.weight"
257
- if k.endswith(text_proj):
258
- new_state_dict[k.replace(text_proj, "text_projection")] = v.transpose(0, 1).contiguous()
259
- else:
260
- relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k)
261
- new_state_dict[relabelled_key] = v
262
-
263
- for k_pre, tensors in capture_qkv_weight.items():
264
- if None in tensors:
265
- raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
266
- relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k_pre)
267
- new_state_dict[relabelled_key + ".in_proj_weight"] = cat_tensors(tensors)
268
-
269
- for k_pre, tensors in capture_qkv_bias.items():
270
- if None in tensors:
271
- raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
272
- relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k_pre)
273
- new_state_dict[relabelled_key + ".in_proj_bias"] = cat_tensors(tensors)
274
-
275
- return new_state_dict
276
-
277
-
278
- def convert_text_enc_state_dict(text_enc_dict):
279
- return text_enc_dict
280
-
281
-
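The `cat_tensors` helper above concatenates along dim 0 by pre-allocating and copying, as a workaround for dtypes `torch.cat` could not handle on CUDA at the time. A minimal self-contained check that, for ordinary dtypes, it behaves like `torch.cat` (the function is restated here only so the snippet runs on its own):

```python
import torch

def cat_tensors(tensors):
    # Pre-allocate the output and copy each chunk in, as in the helper above.
    total = sum(t.shape[0] for t in tensors)
    out = torch.empty([total] + list(tensors[0].shape[1:]),
                      device=tensors[0].device, dtype=tensors[0].dtype)
    offset = 0
    for t in tensors:
        out[offset:offset + t.shape[0]] = t
        offset += t.shape[0]
    return out

q, k, v = (torch.randn(3, 2) for _ in range(3))
print(torch.equal(cat_tensors([q, k, v]), torch.cat([q, k, v], dim=0)))  # True
```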
MagicQuill/comfy/diffusers_load.py DELETED
@@ -1,36 +0,0 @@
1
- import os
2
-
3
- import comfy.sd
4
-
5
- def first_file(path, filenames):
6
- for f in filenames:
7
- p = os.path.join(path, f)
8
- if os.path.exists(p):
9
- return p
10
- return None
11
-
12
- def load_diffusers(model_path, output_vae=True, output_clip=True, embedding_directory=None):
13
- diffusion_model_names = ["diffusion_pytorch_model.fp16.safetensors", "diffusion_pytorch_model.safetensors", "diffusion_pytorch_model.fp16.bin", "diffusion_pytorch_model.bin"]
14
- unet_path = first_file(os.path.join(model_path, "unet"), diffusion_model_names)
15
- vae_path = first_file(os.path.join(model_path, "vae"), diffusion_model_names)
16
-
17
- text_encoder_model_names = ["model.fp16.safetensors", "model.safetensors", "pytorch_model.fp16.bin", "pytorch_model.bin"]
18
- text_encoder1_path = first_file(os.path.join(model_path, "text_encoder"), text_encoder_model_names)
19
- text_encoder2_path = first_file(os.path.join(model_path, "text_encoder_2"), text_encoder_model_names)
20
-
21
- text_encoder_paths = [text_encoder1_path]
22
- if text_encoder2_path is not None:
23
- text_encoder_paths.append(text_encoder2_path)
24
-
25
- unet = comfy.sd.load_unet(unet_path)
26
-
27
- clip = None
28
- if output_clip:
29
- clip = comfy.sd.load_clip(text_encoder_paths, embedding_directory=embedding_directory)
30
-
31
- vae = None
32
- if output_vae:
33
- sd = comfy.utils.load_torch_file(vae_path)
34
- vae = comfy.sd.VAE(sd=sd)
35
-
36
MagicQuill/comfy/extra_samplers/__pycache__/uni_pc.cpython-310.pyc DELETED
Binary file (28.5 kB)
 
MagicQuill/comfy/extra_samplers/uni_pc.py DELETED
@@ -1,875 +0,0 @@
1
- #code taken from: https://github.com/wl-zhao/UniPC and modified
2
-
3
- import torch
4
- import torch.nn.functional as F
5
- import math
6
-
7
- from tqdm.auto import trange, tqdm
8
-
9
-
10
- class NoiseScheduleVP:
11
- def __init__(
12
- self,
13
- schedule='discrete',
14
- betas=None,
15
- alphas_cumprod=None,
16
- continuous_beta_0=0.1,
17
- continuous_beta_1=20.,
18
- ):
19
- """Create a wrapper class for the forward SDE (VP type).
20
-
21
- ***
22
- Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
23
- We recommend using schedule='discrete' for discrete-time diffusion models, especially for high-resolution images.
24
- ***
25
-
26
- The forward SDE ensures that the conditional distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
27
- We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
28
- Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
29
-
30
- log_alpha_t = self.marginal_log_mean_coeff(t)
31
- sigma_t = self.marginal_std(t)
32
- lambda_t = self.marginal_lambda(t)
33
-
34
- Moreover, as lambda(t) is an invertible function, we also support its inverse function:
35
-
36
- t = self.inverse_lambda(lambda_t)
37
-
38
- ===============================================================
39
-
40
- We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
41
-
42
- 1. For discrete-time DPMs:
43
-
44
- For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
45
- t_i = (i + 1) / N
46
- e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
47
- We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
48
-
49
- Args:
50
- betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
51
- alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
52
-
53
- Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
54
-
55
- **Important**: Please pay special attention to the arg `alphas_cumprod`:
56
- The `alphas_cumprod` is the \hat{alpha_n} array in the notation of DDPM. Specifically, DDPMs assume that
57
- q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
58
- Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
59
- alpha_{t_n} = \sqrt{\hat{alpha_n}},
60
- and
61
- log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
62
-
63
-
64
- 2. For continuous-time DPMs:
65
-
66
- We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
67
- schedule are the default settings in DDPM and improved-DDPM:
68
-
69
- Args:
70
- beta_min: A `float` number. The smallest beta for the linear schedule.
71
- beta_max: A `float` number. The largest beta for the linear schedule.
72
- cosine_s: A `float` number. The hyperparameter in the cosine schedule.
73
- cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
74
- T: A `float` number. The ending time of the forward process.
75
-
76
- ===============================================================
77
-
78
- Args:
79
- schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
80
- 'linear' or 'cosine' for continuous-time DPMs.
81
- Returns:
82
- A wrapper object of the forward SDE (VP type).
83
-
84
- ===============================================================
85
-
86
- Example:
87
-
88
- # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
89
- >>> ns = NoiseScheduleVP('discrete', betas=betas)
90
-
91
- # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
92
- >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
93
-
94
- # For continuous-time DPMs (VPSDE), linear schedule:
95
- >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
96
-
97
- """
98
-
99
- if schedule not in ['discrete', 'linear', 'cosine']:
100
- raise ValueError("Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(schedule))
101
-
102
- self.schedule = schedule
103
- if schedule == 'discrete':
104
- if betas is not None:
105
- log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
106
- else:
107
- assert alphas_cumprod is not None
108
- log_alphas = 0.5 * torch.log(alphas_cumprod)
109
- self.total_N = len(log_alphas)
110
- self.T = 1.
111
- self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1))
112
- self.log_alpha_array = log_alphas.reshape((1, -1,))
113
- else:
114
- self.total_N = 1000
115
- self.beta_0 = continuous_beta_0
116
- self.beta_1 = continuous_beta_1
117
- self.cosine_s = 0.008
118
- self.cosine_beta_max = 999.
119
- self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
120
- self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
121
- self.schedule = schedule
122
- if schedule == 'cosine':
123
- # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T.
124
- # Note that T = 0.9946 may not be the optimal setting. However, we find it works well.
125
- self.T = 0.9946
126
- else:
127
- self.T = 1.
128
-
129
- def marginal_log_mean_coeff(self, t):
130
- """
131
- Compute log(alpha_t) of a given continuous-time label t in [0, T].
132
- """
133
- if self.schedule == 'discrete':
134
- return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)).reshape((-1))
135
- elif self.schedule == 'linear':
136
- return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
137
- elif self.schedule == 'cosine':
138
- log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
139
- log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
140
- return log_alpha_t
141
-
142
- def marginal_alpha(self, t):
143
- """
144
- Compute alpha_t of a given continuous-time label t in [0, T].
145
- """
146
- return torch.exp(self.marginal_log_mean_coeff(t))
147
-
148
- def marginal_std(self, t):
149
- """
150
- Compute sigma_t of a given continuous-time label t in [0, T].
151
- """
152
- return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))
153
-
154
- def marginal_lambda(self, t):
155
- """
156
- Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
157
- """
158
- log_mean_coeff = self.marginal_log_mean_coeff(t)
159
- log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
160
- return log_mean_coeff - log_std
161
-
162
- def inverse_lambda(self, lamb):
163
- """
164
- Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
165
- """
166
- if self.schedule == 'linear':
167
- tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
168
- Delta = self.beta_0**2 + tmp
169
- return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
170
- elif self.schedule == 'discrete':
171
- log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
172
- t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), torch.flip(self.t_array.to(lamb.device), [1]))
173
- return t.reshape((-1,))
174
- else:
175
- log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
176
- t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
177
- t = t_fn(log_alpha)
178
- return t
179
-
180
-
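For the 'discrete' schedule, the class above reduces to a few closed-form quantities: log(alpha_t) = 0.5 * log(alphas_cumprod), sigma_t = sqrt(1 - alpha_t^2), and the half-logSNR lambda_t = log(alpha_t) - log(sigma_t). A minimal standalone numeric sketch of those formulas on a toy 10-step schedule (not using the deleted class):

```python
import torch

betas = torch.linspace(1e-4, 2e-2, 10)            # toy beta schedule
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

log_alpha = 0.5 * torch.log(alphas_cumprod)       # marginal_log_mean_coeff
alpha = torch.exp(log_alpha)                      # marginal_alpha
sigma = torch.sqrt(1.0 - alpha ** 2)              # marginal_std
lam = log_alpha - torch.log(sigma)                # marginal_lambda (half-logSNR)

print(bool(lam[0] > lam[-1]))  # True: the logSNR decreases as more noise is added
```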
181
- def model_wrapper(
182
- model,
183
- noise_schedule,
184
- model_type="noise",
185
- model_kwargs={},
186
- guidance_type="uncond",
187
- condition=None,
188
- unconditional_condition=None,
189
- guidance_scale=1.,
190
- classifier_fn=None,
191
- classifier_kwargs={},
192
- ):
193
- """Create a wrapper function for the noise prediction model.
194
-
195
- DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
196
- first wrap the model function into a noise prediction model that accepts the continuous time as input.
197
-
198
- We support four types of the diffusion model by setting `model_type`:
199
-
200
- 1. "noise": noise prediction model. (Trained by predicting noise).
201
-
202
- 2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
203
-
204
- 3. "v": velocity prediction model. (Trained by predicting the velocity).
205
- The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2].
206
-
207
- [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
208
- arXiv preprint arXiv:2202.00512 (2022).
209
- [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
210
- arXiv preprint arXiv:2210.02303 (2022).
211
-
212
- 4. "score": marginal score function. (Trained by denoising score matching).
213
- Note that the score function and the noise prediction model follows a simple relationship:
214
- ```
215
- noise(x_t, t) = -sigma_t * score(x_t, t)
216
- ```
217
-
218
- We support three types of guided sampling by DPMs by setting `guidance_type`:
219
- 1. "uncond": unconditional sampling by DPMs.
220
- The input `model` has the following format:
221
- ``
222
- model(x, t_input, **model_kwargs) -> noise | x_start | v | score
223
- ``
224
-
225
- 2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
226
- The input `model` has the following format:
227
- ``
228
- model(x, t_input, **model_kwargs) -> noise | x_start | v | score
229
- ``
230
-
231
- The input `classifier_fn` has the following format:
232
- ``
233
- classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
234
- ``
235
-
236
- [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
237
- in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
238
-
239
- 3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
240
- The input `model` has the following format:
241
- ``
242
- model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
243
- ``
244
- And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
245
-
246
- [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
247
- arXiv preprint arXiv:2207.12598 (2022).
248
-
249
-
250
- The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
251
- or continuous-time labels (i.e. epsilon to T).
252
-
253
- We wrap the model function to accept only `x` and `t_continuous` as inputs, and output the predicted noise:
254
- ``
255
- def model_fn(x, t_continuous) -> noise:
256
- t_input = get_model_input_time(t_continuous)
257
- return noise_pred(model, x, t_input, **model_kwargs)
258
- ``
259
- where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver.
260
-
261
- ===============================================================
262
-
263
- Args:
264
- model: A diffusion model with the corresponding format described above.
265
- noise_schedule: A noise schedule object, such as NoiseScheduleVP.
266
- model_type: A `str`. The parameterization type of the diffusion model.
267
- "noise" or "x_start" or "v" or "score".
268
- model_kwargs: A `dict`. A dict for the other inputs of the model function.
269
- guidance_type: A `str`. The type of the guidance for sampling.
270
- "uncond" or "classifier" or "classifier-free".
271
- condition: A pytorch tensor. The condition for the guided sampling.
272
- Only used for "classifier" or "classifier-free" guidance type.
273
- unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
274
- Only used for "classifier-free" guidance type.
275
- guidance_scale: A `float`. The scale for the guided sampling.
276
- classifier_fn: A classifier function. Only used for the classifier guidance.
277
- classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
278
- Returns:
279
- A noise prediction model that accepts the noised data and the continuous time as the inputs.
280
- """
281
-
282
- def get_model_input_time(t_continuous):
283
- """
284
- Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
285
- For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
286
- For continuous-time DPMs, we just use `t_continuous`.
287
- """
288
- if noise_schedule.schedule == 'discrete':
289
- return (t_continuous - 1. / noise_schedule.total_N) * 1000.
290
- else:
291
- return t_continuous
292
-
293
- def noise_pred_fn(x, t_continuous, cond=None):
294
- if t_continuous.reshape((-1,)).shape[0] == 1:
295
- t_continuous = t_continuous.expand((x.shape[0]))
296
- t_input = get_model_input_time(t_continuous)
297
- output = model(x, t_input, **model_kwargs)
298
- if model_type == "noise":
299
- return output
300
- elif model_type == "x_start":
301
- alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
302
- dims = x.dim()
303
- return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims)
304
- elif model_type == "v":
305
- alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
306
- dims = x.dim()
307
- return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x
308
- elif model_type == "score":
309
- sigma_t = noise_schedule.marginal_std(t_continuous)
310
- dims = x.dim()
311
- return -expand_dims(sigma_t, dims) * output
312
-
313
- def cond_grad_fn(x, t_input):
314
- """
315
- Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
316
- """
317
- with torch.enable_grad():
318
- x_in = x.detach().requires_grad_(True)
319
- log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
320
- return torch.autograd.grad(log_prob.sum(), x_in)[0]
321
-
322
- def model_fn(x, t_continuous):
323
- """
324
- The noise prediction model function that is used for DPM-Solver.
325
- """
326
- if t_continuous.reshape((-1,)).shape[0] == 1:
327
- t_continuous = t_continuous.expand((x.shape[0]))
328
- if guidance_type == "uncond":
329
- return noise_pred_fn(x, t_continuous)
330
- elif guidance_type == "classifier":
331
- assert classifier_fn is not None
332
- t_input = get_model_input_time(t_continuous)
333
- cond_grad = cond_grad_fn(x, t_input)
334
- sigma_t = noise_schedule.marginal_std(t_continuous)
335
- noise = noise_pred_fn(x, t_continuous)
336
- return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad
337
- elif guidance_type == "classifier-free":
338
- if guidance_scale == 1. or unconditional_condition is None:
339
- return noise_pred_fn(x, t_continuous, cond=condition)
340
- else:
341
- x_in = torch.cat([x] * 2)
342
- t_in = torch.cat([t_continuous] * 2)
343
- c_in = torch.cat([unconditional_condition, condition])
344
- noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
345
- return noise_uncond + guidance_scale * (noise - noise_uncond)
346
-
347
- assert model_type in ["noise", "x_start", "v"]
348
- assert guidance_type in ["uncond", "classifier", "classifier-free"]
349
- return model_fn
350
-
351
-
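In the 'classifier-free' branch of `model_fn` above, the conditional and unconditional noise predictions are blended as noise_uncond + s * (noise_cond - noise_uncond). A minimal sketch of just that combination, with stand-in tensors instead of a real diffusion model:

```python
import torch

def cfg_noise(noise_uncond, noise_cond, guidance_scale):
    # noise_uncond + s * (noise_cond - noise_uncond), as in the classifier-free branch above.
    return noise_uncond + guidance_scale * (noise_cond - noise_uncond)

nu = torch.zeros(1, 4, 8, 8)   # stand-in unconditional prediction
nc = torch.ones(1, 4, 8, 8)    # stand-in conditional prediction
print(cfg_noise(nu, nc, 7.5).mean().item())  # 7.5: pushed beyond the conditional prediction
```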
352
- class UniPC:
353
- def __init__(
354
- self,
355
- model_fn,
356
- noise_schedule,
357
- predict_x0=True,
358
- thresholding=False,
359
- max_val=1.,
360
- variant='bh1',
361
- ):
362
- """Construct a UniPC.
363
-
364
- We support both data_prediction and noise_prediction.
365
- """
366
- self.model = model_fn
367
- self.noise_schedule = noise_schedule
368
- self.variant = variant
369
- self.predict_x0 = predict_x0
370
- self.thresholding = thresholding
371
- self.max_val = max_val
372
-
373
- def dynamic_thresholding_fn(self, x0, t=None):
374
- """
375
- The dynamic thresholding method.
376
- """
377
- dims = x0.dim()
378
- p = self.dynamic_thresholding_ratio
379
- s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
380
- s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims)
381
- x0 = torch.clamp(x0, -s, s) / s
382
- return x0
383
-
384
- def noise_prediction_fn(self, x, t):
385
- """
386
- Return the noise prediction model.
387
- """
388
- return self.model(x, t)
389
-
390
- def data_prediction_fn(self, x, t):
391
- """
392
- Return the data prediction model (with thresholding).
393
- """
394
- noise = self.noise_prediction_fn(x, t)
395
- dims = x.dim()
396
- alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
397
- x0 = (x - expand_dims(sigma_t, dims) * noise) / expand_dims(alpha_t, dims)
398
- if self.thresholding:
399
- p = 0.995 # A hyperparameter from the "Imagen" paper [1].
400
- s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
401
- s = expand_dims(torch.maximum(s, self.max_val * torch.ones_like(s).to(s.device)), dims)
402
- x0 = torch.clamp(x0, -s, s) / s
403
- return x0
404
-
405
- def model_fn(self, x, t):
406
- """
407
- Convert the model to the noise prediction model or the data prediction model.
408
- """
409
- if self.predict_x0:
410
- return self.data_prediction_fn(x, t)
411
- else:
412
- return self.noise_prediction_fn(x, t)
413
-
414
- def get_time_steps(self, skip_type, t_T, t_0, N, device):
415
- """Compute the intermediate time steps for sampling.
416
- """
417
- if skip_type == 'logSNR':
418
- lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
419
- lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
420
- logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
421
- return self.noise_schedule.inverse_lambda(logSNR_steps)
422
- elif skip_type == 'time_uniform':
423
- return torch.linspace(t_T, t_0, N + 1).to(device)
424
- elif skip_type == 'time_quadratic':
425
- t_order = 2
426
- t = torch.linspace(t_T**(1. / t_order), t_0**(1. / t_order), N + 1).pow(t_order).to(device)
427
- return t
428
- else:
429
- raise ValueError("Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
430
-
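Of the three skip types handled by `get_time_steps` above, 'time_uniform' and 'time_quadratic' depend only on the endpoints, while 'logSNR' additionally needs the noise schedule. A minimal standalone sketch of the first two on toy endpoints:

```python
import torch

t_T, t_0, N = 1.0, 1e-3, 5
uniform = torch.linspace(t_T, t_0, N + 1)                        # 'time_uniform'
quadratic = torch.linspace(t_T ** 0.5, t_0 ** 0.5, N + 1) ** 2   # 'time_quadratic' (t_order = 2)
print(uniform)
print(quadratic)  # steps cluster near t_0 compared with the uniform schedule
```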
431
- def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
432
- """
433
- Get the order of each step for sampling by the singlestep DPM-Solver.
434
- """
435
- if order == 3:
436
- K = steps // 3 + 1
437
- if steps % 3 == 0:
438
- orders = [3,] * (K - 2) + [2, 1]
439
- elif steps % 3 == 1:
440
- orders = [3,] * (K - 1) + [1]
441
- else:
442
- orders = [3,] * (K - 1) + [2]
443
- elif order == 2:
444
- if steps % 2 == 0:
445
- K = steps // 2
446
- orders = [2,] * K
447
- else:
448
- K = steps // 2 + 1
449
- orders = [2,] * (K - 1) + [1]
450
- elif order == 1:
451
- K = steps
452
- orders = [1,] * steps
453
- else:
454
- raise ValueError("'order' must be '1' or '2' or '3'.")
455
- if skip_type == 'logSNR':
456
- # To reproduce the results in DPM-Solver paper
457
- timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
458
- else:
459
- timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[torch.cumsum(torch.tensor([0,] + orders), 0).to(device)]
460
- return timesteps_outer, orders
461
-
462
- def denoise_to_zero_fn(self, x, s):
463
- """
464
- Denoise at the final step, which is equivalent to solving the ODE from lambda_s to infinity by first-order discretization.
465
- """
466
- return self.data_prediction_fn(x, s)
467
-
468
- def multistep_uni_pc_update(self, x, model_prev_list, t_prev_list, t, order, **kwargs):
469
- if len(t.shape) == 0:
470
- t = t.view(-1)
471
- if 'bh' in self.variant:
472
- return self.multistep_uni_pc_bh_update(x, model_prev_list, t_prev_list, t, order, **kwargs)
473
- else:
474
- assert self.variant == 'vary_coeff'
475
- return self.multistep_uni_pc_vary_update(x, model_prev_list, t_prev_list, t, order, **kwargs)
476
-
477
- def multistep_uni_pc_vary_update(self, x, model_prev_list, t_prev_list, t, order, use_corrector=True):
478
- print(f'using unified predictor-corrector with order {order} (solver type: vary coeff)')
479
- ns = self.noise_schedule
480
- assert order <= len(model_prev_list)
481
-
482
- # first compute rks
483
- t_prev_0 = t_prev_list[-1]
484
- lambda_prev_0 = ns.marginal_lambda(t_prev_0)
485
- lambda_t = ns.marginal_lambda(t)
486
- model_prev_0 = model_prev_list[-1]
487
- sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
488
- log_alpha_t = ns.marginal_log_mean_coeff(t)
489
- alpha_t = torch.exp(log_alpha_t)
490
-
491
- h = lambda_t - lambda_prev_0
492
-
493
- rks = []
494
- D1s = []
495
- for i in range(1, order):
496
- t_prev_i = t_prev_list[-(i + 1)]
497
- model_prev_i = model_prev_list[-(i + 1)]
498
- lambda_prev_i = ns.marginal_lambda(t_prev_i)
499
- rk = (lambda_prev_i - lambda_prev_0) / h
500
- rks.append(rk)
501
- D1s.append((model_prev_i - model_prev_0) / rk)
502
-
503
- rks.append(1.)
504
- rks = torch.tensor(rks, device=x.device)
505
-
506
- K = len(rks)
507
- # build C matrix
508
- C = []
509
-
510
- col = torch.ones_like(rks)
511
- for k in range(1, K + 1):
512
- C.append(col)
513
- col = col * rks / (k + 1)
514
- C = torch.stack(C, dim=1)
515
-
516
- if len(D1s) > 0:
517
- D1s = torch.stack(D1s, dim=1) # (B, K)
518
- C_inv_p = torch.linalg.inv(C[:-1, :-1])
519
- A_p = C_inv_p
520
-
521
- if use_corrector:
522
- print('using corrector')
523
- C_inv = torch.linalg.inv(C)
524
- A_c = C_inv
525
-
526
- hh = -h if self.predict_x0 else h
527
- h_phi_1 = torch.expm1(hh)
528
- h_phi_ks = []
529
- factorial_k = 1
530
- h_phi_k = h_phi_1
531
- for k in range(1, K + 2):
532
- h_phi_ks.append(h_phi_k)
533
- h_phi_k = h_phi_k / hh - 1 / factorial_k
534
- factorial_k *= (k + 1)
535
-
536
- model_t = None
537
- if self.predict_x0:
538
- x_t_ = (
539
- sigma_t / sigma_prev_0 * x
540
- - alpha_t * h_phi_1 * model_prev_0
541
- )
542
- # now predictor
543
- x_t = x_t_
544
- if len(D1s) > 0:
545
- # compute the residuals for predictor
546
- for k in range(K - 1):
547
- x_t = x_t - alpha_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_p[k])
548
- # now corrector
549
- if use_corrector:
550
- model_t = self.model_fn(x_t, t)
551
- D1_t = (model_t - model_prev_0)
552
- x_t = x_t_
553
- k = 0
554
- for k in range(K - 1):
555
- x_t = x_t - alpha_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1])
556
- x_t = x_t - alpha_t * h_phi_ks[K] * (D1_t * A_c[k][-1])
557
- else:
558
- log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
559
- x_t_ = (
560
- (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
561
- - (sigma_t * h_phi_1) * model_prev_0
562
- )
563
- # now predictor
564
- x_t = x_t_
565
- if len(D1s) > 0:
566
- # compute the residuals for predictor
567
- for k in range(K - 1):
568
- x_t = x_t - sigma_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_p[k])
569
- # now corrector
570
- if use_corrector:
571
- model_t = self.model_fn(x_t, t)
572
- D1_t = (model_t - model_prev_0)
573
- x_t = x_t_
574
- k = 0
575
- for k in range(K - 1):
576
- x_t = x_t - sigma_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1])
577
- x_t = x_t - sigma_t * h_phi_ks[K] * (D1_t * A_c[k][-1])
578
- return x_t, model_t
579
-
580
- def multistep_uni_pc_bh_update(self, x, model_prev_list, t_prev_list, t, order, x_t=None, use_corrector=True):
581
- # print(f'using unified predictor-corrector with order {order} (solver type: B(h))')
582
- ns = self.noise_schedule
583
- assert order <= len(model_prev_list)
584
- dims = x.dim()
585
-
586
- # first compute rks
587
- t_prev_0 = t_prev_list[-1]
588
- lambda_prev_0 = ns.marginal_lambda(t_prev_0)
589
- lambda_t = ns.marginal_lambda(t)
590
- model_prev_0 = model_prev_list[-1]
591
- sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
592
- log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
593
- alpha_t = torch.exp(log_alpha_t)
594
-
595
- h = lambda_t - lambda_prev_0
596
-
597
- rks = []
598
- D1s = []
599
- for i in range(1, order):
600
- t_prev_i = t_prev_list[-(i + 1)]
601
- model_prev_i = model_prev_list[-(i + 1)]
602
- lambda_prev_i = ns.marginal_lambda(t_prev_i)
603
- rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
604
- rks.append(rk)
605
- D1s.append((model_prev_i - model_prev_0) / rk)
606
-
607
- rks.append(1.)
608
- rks = torch.tensor(rks, device=x.device)
609
-
610
- R = []
611
- b = []
612
-
613
- hh = -h[0] if self.predict_x0 else h[0]
614
- h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1
615
- h_phi_k = h_phi_1 / hh - 1
616
-
617
- factorial_i = 1
618
-
619
- if self.variant == 'bh1':
620
- B_h = hh
621
- elif self.variant == 'bh2':
622
- B_h = torch.expm1(hh)
623
- else:
624
- raise NotImplementedError()
625
-
626
- for i in range(1, order + 1):
627
- R.append(torch.pow(rks, i - 1))
628
- b.append(h_phi_k * factorial_i / B_h)
629
- factorial_i *= (i + 1)
630
- h_phi_k = h_phi_k / hh - 1 / factorial_i
631
-
632
- R = torch.stack(R)
633
- b = torch.tensor(b, device=x.device)
634
-
635
- # now predictor
636
- use_predictor = len(D1s) > 0 and x_t is None
637
- if len(D1s) > 0:
638
- D1s = torch.stack(D1s, dim=1) # (B, K)
639
- if x_t is None:
640
- # for order 2, we use a simplified version
641
- if order == 2:
642
- rhos_p = torch.tensor([0.5], device=b.device)
643
- else:
644
- rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
645
- else:
646
- D1s = None
647
-
648
- if use_corrector:
649
- # print('using corrector')
650
- # for order 1, we use a simplified version
651
- if order == 1:
652
- rhos_c = torch.tensor([0.5], device=b.device)
653
- else:
654
- rhos_c = torch.linalg.solve(R, b)
655
-
656
- model_t = None
657
- if self.predict_x0:
658
- x_t_ = (
659
- expand_dims(sigma_t / sigma_prev_0, dims) * x
660
- - expand_dims(alpha_t * h_phi_1, dims)* model_prev_0
661
- )
662
-
663
- if x_t is None:
664
- if use_predictor:
665
- pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
666
- else:
667
- pred_res = 0
668
- x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * pred_res
669
-
670
- if use_corrector:
671
- model_t = self.model_fn(x_t, t)
672
- if D1s is not None:
673
- corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
674
- else:
675
- corr_res = 0
676
- D1_t = (model_t - model_prev_0)
677
- x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
678
- else:
679
- x_t_ = (
680
- expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
681
- - expand_dims(sigma_t * h_phi_1, dims) * model_prev_0
682
- )
683
- if x_t is None:
684
- if use_predictor:
685
- pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
686
- else:
687
- pred_res = 0
688
- x_t = x_t_ - expand_dims(sigma_t * B_h, dims) * pred_res
689
-
690
- if use_corrector:
691
- model_t = self.model_fn(x_t, t)
692
- if D1s is not None:
693
- corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
694
- else:
695
- corr_res = 0
696
- D1_t = (model_t - model_prev_0)
697
- x_t = x_t_ - expand_dims(sigma_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
698
- return x_t, model_t
699
-
700
-
701
- def sample(self, x, timesteps, t_start=None, t_end=None, order=3, skip_type='time_uniform',
702
- method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver',
703
- atol=0.0078, rtol=0.05, corrector=False, callback=None, disable_pbar=False
704
- ):
705
- # t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
706
- # t_T = self.noise_schedule.T if t_start is None else t_start
707
- device = x.device
708
- steps = len(timesteps) - 1
709
- if method == 'multistep':
710
- assert steps >= order
711
- # timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
712
- assert timesteps.shape[0] - 1 == steps
713
- # with torch.no_grad():
714
- for step_index in trange(steps, disable=disable_pbar):
715
- if step_index == 0:
716
- vec_t = timesteps[0].expand((x.shape[0]))
717
- model_prev_list = [self.model_fn(x, vec_t)]
718
- t_prev_list = [vec_t]
719
- elif step_index < order:
720
- init_order = step_index
721
- # Init the first `order` values by lower order multistep DPM-Solver.
722
- # for init_order in range(1, order):
723
- vec_t = timesteps[init_order].expand(x.shape[0])
724
- x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, vec_t, init_order, use_corrector=True)
725
- if model_x is None:
726
- model_x = self.model_fn(x, vec_t)
727
- model_prev_list.append(model_x)
728
- t_prev_list.append(vec_t)
729
- else:
730
- extra_final_step = 0
731
- if step_index == (steps - 1):
732
- extra_final_step = 1
733
- for step in range(step_index, step_index + 1 + extra_final_step):
734
- vec_t = timesteps[step].expand(x.shape[0])
735
- if lower_order_final:
736
- step_order = min(order, steps + 1 - step)
737
- else:
738
- step_order = order
739
- # print('this step order:', step_order)
740
- if step == steps:
741
- # print('do not run corrector at the last step')
742
- use_corrector = False
743
- else:
744
- use_corrector = True
745
- x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, vec_t, step_order, use_corrector=use_corrector)
746
- for i in range(order - 1):
747
- t_prev_list[i] = t_prev_list[i + 1]
748
- model_prev_list[i] = model_prev_list[i + 1]
749
- t_prev_list[-1] = vec_t
750
- # We do not need to evaluate the final model value.
751
- if step < steps:
752
- if model_x is None:
753
- model_x = self.model_fn(x, vec_t)
754
- model_prev_list[-1] = model_x
755
- if callback is not None:
756
- callback({'x': x, 'i': step_index, 'denoised': model_prev_list[-1]})
757
- else:
758
- raise NotImplementedError()
759
- # if denoise_to_zero:
760
- # x = self.denoise_to_zero_fn(x, torch.ones((x.shape[0],)).to(device) * t_0)
761
- return x
762
-
763
-
764
- #############################################################
765
- # other utility functions
766
- #############################################################
767
-
768
- def interpolate_fn(x, xp, yp):
769
- """
770
- A piecewise linear function y = f(x), using xp and yp as keypoints.
771
- We implement f(x) in a differentiable way (i.e. applicable for autograd).
772
- The function f(x) is well-defined for all x. (For x beyond the bounds of xp, we use the outermost points of xp to define the linear function.)
773
-
774
- Args:
775
- x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
776
- xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
777
- yp: PyTorch tensor with shape [C, K].
778
- Returns:
779
- The function values f(x), with shape [N, C].
780
- """
781
- N, K = x.shape[0], xp.shape[1]
782
- all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
783
- sorted_all_x, x_indices = torch.sort(all_x, dim=2)
784
- x_idx = torch.argmin(x_indices, dim=2)
785
- cand_start_idx = x_idx - 1
786
- start_idx = torch.where(
787
- torch.eq(x_idx, 0),
788
- torch.tensor(1, device=x.device),
789
- torch.where(
790
- torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
791
- ),
792
- )
793
- end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
794
- start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
795
- end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
796
- start_idx2 = torch.where(
797
- torch.eq(x_idx, 0),
798
- torch.tensor(0, device=x.device),
799
- torch.where(
800
- torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
801
- ),
802
- )
803
- y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
804
- start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
805
- end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
806
- cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
807
- return cand
808
-
809
-
810
- def expand_dims(v, dims):
811
- """
812
- Expand the tensor `v` so that it has `dims` dimensions.
813
-
814
- Args:
815
- `v`: a PyTorch tensor with shape [N].
816
- `dims`: an `int`.
817
- Returns:
818
- a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
819
- """
820
- return v[(...,) + (None,)*(dims - 1)]
821
-
822
-
823
- class SigmaConvert:
824
- schedule = ""
825
- def marginal_log_mean_coeff(self, sigma):
826
- return 0.5 * torch.log(1 / ((sigma * sigma) + 1))
827
-
828
- def marginal_alpha(self, t):
829
- return torch.exp(self.marginal_log_mean_coeff(t))
830
-
831
- def marginal_std(self, t):
832
- return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))
833
-
834
- def marginal_lambda(self, t):
835
- """
836
- Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
837
- """
838
- log_mean_coeff = self.marginal_log_mean_coeff(t)
839
- log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
840
- return log_mean_coeff - log_std
841
-
842
- def predict_eps_sigma(model, input, sigma_in, **kwargs):
843
- sigma = sigma_in.view(sigma_in.shape[:1] + (1,) * (input.ndim - 1))
844
- input = input * ((sigma ** 2 + 1.0) ** 0.5)
845
- return (input - model(input, sigma_in, **kwargs)) / sigma
846
-
847
-
848
- def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=False, variant='bh1'):
849
- timesteps = sigmas.clone()
850
- if sigmas[-1] == 0:
851
- timesteps = sigmas[:]
852
- timesteps[-1] = 0.001
853
- else:
854
- timesteps = sigmas.clone()
855
- ns = SigmaConvert()
856
-
857
- noise = noise / torch.sqrt(1.0 + timesteps[0] ** 2.0)
858
- model_type = "noise"
859
-
860
- model_fn = model_wrapper(
861
- lambda input, sigma, **kwargs: predict_eps_sigma(model, input, sigma, **kwargs),
862
- ns,
863
- model_type=model_type,
864
- guidance_type="uncond",
865
- model_kwargs=extra_args,
866
- )
867
-
868
- order = min(3, len(timesteps) - 2)
869
- uni_pc = UniPC(model_fn, ns, predict_x0=True, thresholding=False, variant=variant)
870
- x = uni_pc.sample(noise, timesteps=timesteps, skip_type="time_uniform", method="multistep", order=order, lower_order_final=True, callback=callback, disable_pbar=disable)
871
- x /= ns.marginal_alpha(timesteps[-1])
872
- return x
873
-
874
- def sample_unipc_bh2(model, noise, sigmas, extra_args=None, callback=None, disable=False):
875
- return sample_unipc(model, noise, sigmas, extra_args, callback, disable, variant='bh2')
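The deleted `sample_unipc` entry point above wraps a k-diffusion style denoiser and runs the multistep UniPC loop defined earlier in this file. A minimal usage sketch follows; it is not part of the original repository, the toy denoiser, tensor shapes and sigma values are assumptions for illustration, and it presumes the module were still importable:

import torch

# Placeholder denoiser following the convention used by predict_eps_sigma above:
# model(x, sigma, **kwargs) returns the denoised prediction of x at noise level sigma.
def toy_denoiser(x, sigma, **kwargs):
    return torch.zeros_like(x)

# A short descending sigma schedule ending in 0, as the callers in this repo provide.
sigmas = torch.tensor([14.6, 7.0, 3.0, 1.0, 0.0])
# Starting latent at the highest noise level (an assumed convention for this sketch).
noise = torch.randn(1, 4, 64, 64) * sigmas[0]

# sample_unipc wraps the model with predict_eps_sigma, builds the UniPC solver
# (variant 'bh1' by default) and runs the multistep sample() loop shown above.
latents = sample_unipc(toy_denoiser, noise, sigmas, extra_args={}, disable=True)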
MagicQuill/comfy/gligen.py DELETED
@@ -1,343 +0,0 @@
1
- import math  # needed by GatedSelfAttentionDense2 (math.sqrt)
- import torch
2
- from torch import nn
3
- from .ldm.modules.attention import CrossAttention
4
- from inspect import isfunction
5
- import comfy.ops
6
- ops = comfy.ops.manual_cast
7
-
8
- def exists(val):
9
- return val is not None
10
-
11
-
12
- def uniq(arr):
13
- return{el: True for el in arr}.keys()
14
-
15
-
16
- def default(val, d):
17
- if exists(val):
18
- return val
19
- return d() if isfunction(d) else d
20
-
21
-
22
- # feedforward
23
- class GEGLU(nn.Module):
24
- def __init__(self, dim_in, dim_out):
25
- super().__init__()
26
- self.proj = ops.Linear(dim_in, dim_out * 2)
27
-
28
- def forward(self, x):
29
- x, gate = self.proj(x).chunk(2, dim=-1)
30
- return x * torch.nn.functional.gelu(gate)
31
-
32
-
33
- class FeedForward(nn.Module):
34
- def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
35
- super().__init__()
36
- inner_dim = int(dim * mult)
37
- dim_out = default(dim_out, dim)
38
- project_in = nn.Sequential(
39
- ops.Linear(dim, inner_dim),
40
- nn.GELU()
41
- ) if not glu else GEGLU(dim, inner_dim)
42
-
43
- self.net = nn.Sequential(
44
- project_in,
45
- nn.Dropout(dropout),
46
- ops.Linear(inner_dim, dim_out)
47
- )
48
-
49
- def forward(self, x):
50
- return self.net(x)
51
-
52
-
53
- class GatedCrossAttentionDense(nn.Module):
54
- def __init__(self, query_dim, context_dim, n_heads, d_head):
55
- super().__init__()
56
-
57
- self.attn = CrossAttention(
58
- query_dim=query_dim,
59
- context_dim=context_dim,
60
- heads=n_heads,
61
- dim_head=d_head,
62
- operations=ops)
63
- self.ff = FeedForward(query_dim, glu=True)
64
-
65
- self.norm1 = ops.LayerNorm(query_dim)
66
- self.norm2 = ops.LayerNorm(query_dim)
67
-
68
- self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
69
- self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
70
-
71
- # this can be useful: we can externally change the magnitude of tanh(alpha);
72
- # for example, when it is set to 0, the entire model behaves the same as
73
- # the original one
74
- self.scale = 1
75
-
76
- def forward(self, x, objs):
77
-
78
- x = x + self.scale * \
79
- torch.tanh(self.alpha_attn) * self.attn(self.norm1(x), objs, objs)
80
- x = x + self.scale * \
81
- torch.tanh(self.alpha_dense) * self.ff(self.norm2(x))
82
-
83
- return x
84
-
85
-
86
- class GatedSelfAttentionDense(nn.Module):
87
- def __init__(self, query_dim, context_dim, n_heads, d_head):
88
- super().__init__()
89
-
90
- # we need a linear projection since we concatenate the visual features
91
- # with the object features
92
- self.linear = ops.Linear(context_dim, query_dim)
93
-
94
- self.attn = CrossAttention(
95
- query_dim=query_dim,
96
- context_dim=query_dim,
97
- heads=n_heads,
98
- dim_head=d_head,
99
- operations=ops)
100
- self.ff = FeedForward(query_dim, glu=True)
101
-
102
- self.norm1 = ops.LayerNorm(query_dim)
103
- self.norm2 = ops.LayerNorm(query_dim)
104
-
105
- self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
106
- self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
107
-
108
- # this can be useful: we can externally change the magnitude of tanh(alpha);
109
- # for example, when it is set to 0, the entire model behaves the same as
110
- # the original one
111
- self.scale = 1
112
-
113
- def forward(self, x, objs):
114
-
115
- N_visual = x.shape[1]
116
- objs = self.linear(objs)
117
-
118
- x = x + self.scale * torch.tanh(self.alpha_attn) * self.attn(
119
- self.norm1(torch.cat([x, objs], dim=1)))[:, 0:N_visual, :]
120
- x = x + self.scale * \
121
- torch.tanh(self.alpha_dense) * self.ff(self.norm2(x))
122
-
123
- return x
124
-
125
-
126
- class GatedSelfAttentionDense2(nn.Module):
127
- def __init__(self, query_dim, context_dim, n_heads, d_head):
128
- super().__init__()
129
-
130
- # we need a linear projection since we concatenate the visual features
131
- # with the object features
132
- self.linear = ops.Linear(context_dim, query_dim)
133
-
134
- self.attn = CrossAttention(
135
- query_dim=query_dim, context_dim=query_dim, dim_head=d_head, operations=ops)
136
- self.ff = FeedForward(query_dim, glu=True)
137
-
138
- self.norm1 = ops.LayerNorm(query_dim)
139
- self.norm2 = ops.LayerNorm(query_dim)
140
-
141
- self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
142
- self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
143
-
144
- # this can be useful: we can externally change the magnitude of tanh(alpha);
145
- # for example, when it is set to 0, the entire model behaves the same as
146
- # the original one
147
- self.scale = 1
148
-
149
- def forward(self, x, objs):
150
-
151
- B, N_visual, _ = x.shape
152
- B, N_ground, _ = objs.shape
153
-
154
- objs = self.linear(objs)
155
-
156
- # sanity check
157
- size_v = math.sqrt(N_visual)
158
- size_g = math.sqrt(N_ground)
159
- assert int(size_v) == size_v, "Visual tokens must be square rootable"
160
- assert int(size_g) == size_g, "Grounding tokens must be square rootable"
161
- size_v = int(size_v)
162
- size_g = int(size_g)
163
-
164
- # select grounding token and resize it to visual token size as residual
165
- out = self.attn(self.norm1(torch.cat([x, objs], dim=1)))[
166
- :, N_visual:, :]
167
- out = out.permute(0, 2, 1).reshape(B, -1, size_g, size_g)
168
- out = torch.nn.functional.interpolate(
169
- out, (size_v, size_v), mode='bicubic')
170
- residual = out.reshape(B, -1, N_visual).permute(0, 2, 1)
171
-
172
- # add residual to visual feature
173
- x = x + self.scale * torch.tanh(self.alpha_attn) * residual
174
- x = x + self.scale * \
175
- torch.tanh(self.alpha_dense) * self.ff(self.norm2(x))
176
-
177
- return x
178
-
179
-
180
- class FourierEmbedder():
181
- def __init__(self, num_freqs=64, temperature=100):
182
-
183
- self.num_freqs = num_freqs
184
- self.temperature = temperature
185
- self.freq_bands = temperature ** (torch.arange(num_freqs) / num_freqs)
186
-
187
- @torch.no_grad()
188
- def __call__(self, x, cat_dim=-1):
189
- "x: arbitrary shape of tensor. dim: cat dim"
190
- out = []
191
- for freq in self.freq_bands:
192
- out.append(torch.sin(freq * x))
193
- out.append(torch.cos(freq * x))
194
- return torch.cat(out, cat_dim)
195
-
196
-
197
- class PositionNet(nn.Module):
198
- def __init__(self, in_dim, out_dim, fourier_freqs=8):
199
- super().__init__()
200
- self.in_dim = in_dim
201
- self.out_dim = out_dim
202
-
203
- self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs)
204
- self.position_dim = fourier_freqs * 2 * 4 # 2 is sin&cos, 4 is xyxy
205
-
206
- self.linears = nn.Sequential(
207
- ops.Linear(self.in_dim + self.position_dim, 512),
208
- nn.SiLU(),
209
- ops.Linear(512, 512),
210
- nn.SiLU(),
211
- ops.Linear(512, out_dim),
212
- )
213
-
214
- self.null_positive_feature = torch.nn.Parameter(
215
- torch.zeros([self.in_dim]))
216
- self.null_position_feature = torch.nn.Parameter(
217
- torch.zeros([self.position_dim]))
218
-
219
- def forward(self, boxes, masks, positive_embeddings):
220
- B, N, _ = boxes.shape
221
- masks = masks.unsqueeze(-1)
222
- positive_embeddings = positive_embeddings
223
-
224
- # embed the box positions (the input may include padding as a placeholder)
225
- xyxy_embedding = self.fourier_embedder(boxes) # B*N*4 --> B*N*C
226
-
227
- # learnable null embedding
228
- positive_null = self.null_positive_feature.to(device=boxes.device, dtype=boxes.dtype).view(1, 1, -1)
229
- xyxy_null = self.null_position_feature.to(device=boxes.device, dtype=boxes.dtype).view(1, 1, -1)
230
-
231
- # replace padding with learnable null embedding
232
- positive_embeddings = positive_embeddings * \
233
- masks + (1 - masks) * positive_null
234
- xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null
235
-
236
- objs = self.linears(
237
- torch.cat([positive_embeddings, xyxy_embedding], dim=-1))
238
- assert objs.shape == torch.Size([B, N, self.out_dim])
239
- return objs
240
-
241
-
242
- class Gligen(nn.Module):
243
- def __init__(self, modules, position_net, key_dim):
244
- super().__init__()
245
- self.module_list = nn.ModuleList(modules)
246
- self.position_net = position_net
247
- self.key_dim = key_dim
248
- self.max_objs = 30
249
- self.current_device = torch.device("cpu")
250
-
251
- def _set_position(self, boxes, masks, positive_embeddings):
252
- objs = self.position_net(boxes, masks, positive_embeddings)
253
- def func(x, extra_options):
254
- key = extra_options["transformer_index"]
255
- module = self.module_list[key]
256
- return module(x, objs.to(device=x.device, dtype=x.dtype))
257
- return func
258
-
259
- def set_position(self, latent_image_shape, position_params, device):
260
- batch, c, h, w = latent_image_shape
261
- masks = torch.zeros([self.max_objs], device="cpu")
262
- boxes = []
263
- positive_embeddings = []
264
- for p in position_params:
265
- x1 = (p[4]) / w
266
- y1 = (p[3]) / h
267
- x2 = (p[4] + p[2]) / w
268
- y2 = (p[3] + p[1]) / h
269
- masks[len(boxes)] = 1.0
270
- boxes += [torch.tensor((x1, y1, x2, y2)).unsqueeze(0)]
271
- positive_embeddings += [p[0]]
272
- append_boxes = []
273
- append_conds = []
274
- if len(boxes) < self.max_objs:
275
- append_boxes = [torch.zeros(
276
- [self.max_objs - len(boxes), 4], device="cpu")]
277
- append_conds = [torch.zeros(
278
- [self.max_objs - len(boxes), self.key_dim], device="cpu")]
279
-
280
- box_out = torch.cat(
281
- boxes + append_boxes).unsqueeze(0).repeat(batch, 1, 1)
282
- masks = masks.unsqueeze(0).repeat(batch, 1)
283
- conds = torch.cat(positive_embeddings +
284
- append_conds).unsqueeze(0).repeat(batch, 1, 1)
285
- return self._set_position(
286
- box_out.to(device),
287
- masks.to(device),
288
- conds.to(device))
289
-
290
- def set_empty(self, latent_image_shape, device):
291
- batch, c, h, w = latent_image_shape
292
- masks = torch.zeros([self.max_objs], device="cpu").repeat(batch, 1)
293
- box_out = torch.zeros([self.max_objs, 4],
294
- device="cpu").repeat(batch, 1, 1)
295
- conds = torch.zeros([self.max_objs, self.key_dim],
296
- device="cpu").repeat(batch, 1, 1)
297
- return self._set_position(
298
- box_out.to(device),
299
- masks.to(device),
300
- conds.to(device))
301
-
302
-
303
- def load_gligen(sd):
304
- sd_k = sd.keys()
305
- output_list = []
306
- key_dim = 768
307
- for a in ["input_blocks", "middle_block", "output_blocks"]:
308
- for b in range(20):
309
- k_temp = filter(lambda k: "{}.{}.".format(a, b)
310
- in k and ".fuser." in k, sd_k)
311
- k_temp = map(lambda k: (k, k.split(".fuser.")[-1]), k_temp)
312
-
313
- n_sd = {}
314
- for k in k_temp:
315
- n_sd[k[1]] = sd[k[0]]
316
- if len(n_sd) > 0:
317
- query_dim = n_sd["linear.weight"].shape[0]
318
- key_dim = n_sd["linear.weight"].shape[1]
319
-
320
- if key_dim == 768: # SD1.x
321
- n_heads = 8
322
- d_head = query_dim // n_heads
323
- else:
324
- d_head = 64
325
- n_heads = query_dim // d_head
326
-
327
- gated = GatedSelfAttentionDense(
328
- query_dim, key_dim, n_heads, d_head)
329
- gated.load_state_dict(n_sd, strict=False)
330
- output_list.append(gated)
331
-
332
- if "position_net.null_positive_feature" in sd_k:
333
- in_dim = sd["position_net.null_positive_feature"].shape[0]
334
- out_dim = sd["position_net.linears.4.weight"].shape[0]
335
-
336
- class WeightsLoader(torch.nn.Module):
337
- pass
338
- w = WeightsLoader()
339
- w.position_net = PositionNet(in_dim, out_dim)
340
- w.load_state_dict(sd, strict=False)
341
-
342
- gligen = Gligen(output_list, w.position_net, key_dim)
343
- return gligen
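For reference, the deleted `load_gligen` above rebuilds the gated self-attention modules from a GLIGEN state dict, and `Gligen.set_position` returns a patch function keyed by `transformer_index`. A rough usage sketch follows; the checkpoint path is hypothetical, and the position tuple layout `(embedding, height, width, y, x)` in latent units is read off the code above:

import torch

# Hypothetical checkpoint path; load_gligen only needs the raw state dict.
sd = torch.load("gligen_checkpoint.pth", map_location="cpu")
gligen = load_gligen(sd)

# One grounded phrase: a 768-dim text embedding (SD1.x key_dim) placed in a
# 16x16 region of a 64x64 latent, starting at latent coordinates (y=8, x=8).
phrase_emb = torch.randn(1, 768)
position_params = [(phrase_emb, 16, 16, 8, 8)]

patch = gligen.set_position((1, 4, 64, 64), position_params, torch.device("cpu"))
# The caller attaches `patch` to the UNet's transformer blocks; it is then invoked
# as patch(x, {"transformer_index": i}) during attention, per _set_position above.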
MagicQuill/comfy/k_diffusion/__pycache__/sampling.cpython-310.pyc DELETED
Binary file (28.2 kB)
 
MagicQuill/comfy/k_diffusion/__pycache__/utils.cpython-310.pyc DELETED
Binary file (14 kB)
 
MagicQuill/comfy/k_diffusion/sampling.py DELETED
@@ -1,843 +0,0 @@
1
- import math
2
-
3
- from scipy import integrate
4
- import torch
5
- from torch import nn
6
- import torchsde
7
- from tqdm.auto import trange, tqdm
8
-
9
- from . import utils
10
-
11
-
12
- def append_zero(x):
13
- return torch.cat([x, x.new_zeros([1])])
14
-
15
-
16
- def get_sigmas_karras(n, sigma_min, sigma_max, rho=7., device='cpu'):
17
- """Constructs the noise schedule of Karras et al. (2022)."""
18
- ramp = torch.linspace(0, 1, n, device=device)
19
- min_inv_rho = sigma_min ** (1 / rho)
20
- max_inv_rho = sigma_max ** (1 / rho)
21
- sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
22
- return append_zero(sigmas).to(device)
23
-
24
-
25
- def get_sigmas_exponential(n, sigma_min, sigma_max, device='cpu'):
26
- """Constructs an exponential noise schedule."""
27
- sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), n, device=device).exp()
28
- return append_zero(sigmas)
29
-
30
-
31
- def get_sigmas_polyexponential(n, sigma_min, sigma_max, rho=1., device='cpu'):
32
- """Constructs an polynomial in log sigma noise schedule."""
33
- ramp = torch.linspace(1, 0, n, device=device) ** rho
34
- sigmas = torch.exp(ramp * (math.log(sigma_max) - math.log(sigma_min)) + math.log(sigma_min))
35
- return append_zero(sigmas)
36
-
37
-
38
- def get_sigmas_vp(n, beta_d=19.9, beta_min=0.1, eps_s=1e-3, device='cpu'):
39
- """Constructs a continuous VP noise schedule."""
40
- t = torch.linspace(1, eps_s, n, device=device)
41
- sigmas = torch.sqrt(torch.exp(beta_d * t ** 2 / 2 + beta_min * t) - 1)
42
- return append_zero(sigmas)
43
-
44
-
45
- def to_d(x, sigma, denoised):
46
- """Converts a denoiser output to a Karras ODE derivative."""
47
- return (x - denoised) / utils.append_dims(sigma, x.ndim)
48
-
49
-
50
- def get_ancestral_step(sigma_from, sigma_to, eta=1.):
51
- """Calculates the noise level (sigma_down) to step down to and the amount
52
- of noise to add (sigma_up) when doing an ancestral sampling step."""
53
- if not eta:
54
- return sigma_to, 0.
55
- sigma_up = min(sigma_to, eta * (sigma_to ** 2 * (sigma_from ** 2 - sigma_to ** 2) / sigma_from ** 2) ** 0.5)
56
- sigma_down = (sigma_to ** 2 - sigma_up ** 2) ** 0.5
57
- return sigma_down, sigma_up
58
-
59
-
60
- def default_noise_sampler(x):
61
- return lambda sigma, sigma_next: torch.randn_like(x)
62
-
63
-
64
- class BatchedBrownianTree:
65
- """A wrapper around torchsde.BrownianTree that enables batches of entropy."""
66
-
67
- def __init__(self, x, t0, t1, seed=None, **kwargs):
68
- self.cpu_tree = True
69
- if "cpu" in kwargs:
70
- self.cpu_tree = kwargs.pop("cpu")
71
- t0, t1, self.sign = self.sort(t0, t1)
72
- w0 = kwargs.get('w0', torch.zeros_like(x))
73
- if seed is None:
74
- seed = torch.randint(0, 2 ** 63 - 1, []).item()
75
- self.batched = True
76
- try:
77
- assert len(seed) == x.shape[0]
78
- w0 = w0[0]
79
- except TypeError:
80
- seed = [seed]
81
- self.batched = False
82
- if self.cpu_tree:
83
- self.trees = [torchsde.BrownianTree(t0.cpu(), w0.cpu(), t1.cpu(), entropy=s, **kwargs) for s in seed]
84
- else:
85
- self.trees = [torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed]
86
-
87
- @staticmethod
88
- def sort(a, b):
89
- return (a, b, 1) if a < b else (b, a, -1)
90
-
91
- def __call__(self, t0, t1):
92
- t0, t1, sign = self.sort(t0, t1)
93
- if self.cpu_tree:
94
- w = torch.stack([tree(t0.cpu().float(), t1.cpu().float()).to(t0.dtype).to(t0.device) for tree in self.trees]) * (self.sign * sign)
95
- else:
96
- w = torch.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign)
97
-
98
- return w if self.batched else w[0]
99
-
100
-
101
- class BrownianTreeNoiseSampler:
102
- """A noise sampler backed by a torchsde.BrownianTree.
103
-
104
- Args:
105
- x (Tensor): The tensor whose shape, device and dtype to use to generate
106
- random samples.
107
- sigma_min (float): The low end of the valid interval.
108
- sigma_max (float): The high end of the valid interval.
109
- seed (int or List[int]): The random seed. If a list of seeds is
110
- supplied instead of a single integer, then the noise sampler will
111
- use one BrownianTree per batch item, each with its own seed.
112
- transform (callable): A function that maps sigma to the sampler's
113
- internal timestep.
114
- """
115
-
116
- def __init__(self, x, sigma_min, sigma_max, seed=None, transform=lambda x: x, cpu=False):
117
- self.transform = transform
118
- t0, t1 = self.transform(torch.as_tensor(sigma_min)), self.transform(torch.as_tensor(sigma_max))
119
- self.tree = BatchedBrownianTree(x, t0, t1, seed, cpu=cpu)
120
-
121
- def __call__(self, sigma, sigma_next):
122
- t0, t1 = self.transform(torch.as_tensor(sigma)), self.transform(torch.as_tensor(sigma_next))
123
- return self.tree(t0, t1) / (t1 - t0).abs().sqrt()
124
-
125
-
126
- @torch.no_grad()
127
- def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
128
- """Implements Algorithm 2 (Euler steps) from Karras et al. (2022)."""
129
- extra_args = {} if extra_args is None else extra_args
130
- s_in = x.new_ones([x.shape[0]])
131
- for i in trange(len(sigmas) - 1, disable=disable):
132
- if s_churn > 0:
133
- gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
134
- sigma_hat = sigmas[i] * (gamma + 1)
135
- else:
136
- gamma = 0
137
- sigma_hat = sigmas[i]
138
-
139
- if gamma > 0:
140
- eps = torch.randn_like(x) * s_noise
141
- x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
142
- denoised = model(x, sigma_hat * s_in, **extra_args)
143
- d = to_d(x, sigma_hat, denoised)
144
- if callback is not None:
145
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
146
- dt = sigmas[i + 1] - sigma_hat
147
- # Euler method
148
- x = x + d * dt
149
- return x
150
-
151
-
152
- @torch.no_grad()
153
- def sample_euler_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
154
- """Ancestral sampling with Euler method steps."""
155
- extra_args = {} if extra_args is None else extra_args
156
- noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
157
- s_in = x.new_ones([x.shape[0]])
158
- for i in trange(len(sigmas) - 1, disable=disable):
159
- denoised = model(x, sigmas[i] * s_in, **extra_args)
160
- sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
161
- if callback is not None:
162
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
163
- d = to_d(x, sigmas[i], denoised)
164
- # Euler method
165
- dt = sigma_down - sigmas[i]
166
- x = x + d * dt
167
- if sigmas[i + 1] > 0:
168
- x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
169
- return x
170
-
171
-
172
- @torch.no_grad()
173
- def sample_heun(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
174
- """Implements Algorithm 2 (Heun steps) from Karras et al. (2022)."""
175
- extra_args = {} if extra_args is None else extra_args
176
- s_in = x.new_ones([x.shape[0]])
177
- for i in trange(len(sigmas) - 1, disable=disable):
178
- if s_churn > 0:
179
- gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
180
- sigma_hat = sigmas[i] * (gamma + 1)
181
- else:
182
- gamma = 0
183
- sigma_hat = sigmas[i]
184
-
185
- sigma_hat = sigmas[i] * (gamma + 1)
186
- if gamma > 0:
187
- eps = torch.randn_like(x) * s_noise
188
- x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
189
- denoised = model(x, sigma_hat * s_in, **extra_args)
190
- d = to_d(x, sigma_hat, denoised)
191
- if callback is not None:
192
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
193
- dt = sigmas[i + 1] - sigma_hat
194
- if sigmas[i + 1] == 0:
195
- # Euler method
196
- x = x + d * dt
197
- else:
198
- # Heun's method
199
- x_2 = x + d * dt
200
- denoised_2 = model(x_2, sigmas[i + 1] * s_in, **extra_args)
201
- d_2 = to_d(x_2, sigmas[i + 1], denoised_2)
202
- d_prime = (d + d_2) / 2
203
- x = x + d_prime * dt
204
- return x
205
-
206
-
207
- @torch.no_grad()
208
- def sample_dpm_2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
209
- """A sampler inspired by DPM-Solver-2 and Algorithm 2 from Karras et al. (2022)."""
210
- extra_args = {} if extra_args is None else extra_args
211
- s_in = x.new_ones([x.shape[0]])
212
- for i in trange(len(sigmas) - 1, disable=disable):
213
- if s_churn > 0:
214
- gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
215
- sigma_hat = sigmas[i] * (gamma + 1)
216
- else:
217
- gamma = 0
218
- sigma_hat = sigmas[i]
219
-
220
- if gamma > 0:
221
- eps = torch.randn_like(x) * s_noise
222
- x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
223
- denoised = model(x, sigma_hat * s_in, **extra_args)
224
- d = to_d(x, sigma_hat, denoised)
225
- if callback is not None:
226
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
227
- if sigmas[i + 1] == 0:
228
- # Euler method
229
- dt = sigmas[i + 1] - sigma_hat
230
- x = x + d * dt
231
- else:
232
- # DPM-Solver-2
233
- sigma_mid = sigma_hat.log().lerp(sigmas[i + 1].log(), 0.5).exp()
234
- dt_1 = sigma_mid - sigma_hat
235
- dt_2 = sigmas[i + 1] - sigma_hat
236
- x_2 = x + d * dt_1
237
- denoised_2 = model(x_2, sigma_mid * s_in, **extra_args)
238
- d_2 = to_d(x_2, sigma_mid, denoised_2)
239
- x = x + d_2 * dt_2
240
- return x
241
-
242
-
243
- @torch.no_grad()
244
- def sample_dpm_2_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
245
- """Ancestral sampling with DPM-Solver second-order steps."""
246
- extra_args = {} if extra_args is None else extra_args
247
- noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
248
- s_in = x.new_ones([x.shape[0]])
249
- for i in trange(len(sigmas) - 1, disable=disable):
250
- denoised = model(x, sigmas[i] * s_in, **extra_args)
251
- sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
252
- if callback is not None:
253
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
254
- d = to_d(x, sigmas[i], denoised)
255
- if sigma_down == 0:
256
- # Euler method
257
- dt = sigma_down - sigmas[i]
258
- x = x + d * dt
259
- else:
260
- # DPM-Solver-2
261
- sigma_mid = sigmas[i].log().lerp(sigma_down.log(), 0.5).exp()
262
- dt_1 = sigma_mid - sigmas[i]
263
- dt_2 = sigma_down - sigmas[i]
264
- x_2 = x + d * dt_1
265
- denoised_2 = model(x_2, sigma_mid * s_in, **extra_args)
266
- d_2 = to_d(x_2, sigma_mid, denoised_2)
267
- x = x + d_2 * dt_2
268
- x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
269
- return x
270
-
271
-
272
- def linear_multistep_coeff(order, t, i, j):
273
- if order - 1 > i:
274
- raise ValueError(f'Order {order} too high for step {i}')
275
- def fn(tau):
276
- prod = 1.
277
- for k in range(order):
278
- if j == k:
279
- continue
280
- prod *= (tau - t[i - k]) / (t[i - j] - t[i - k])
281
- return prod
282
- return integrate.quad(fn, t[i], t[i + 1], epsrel=1e-4)[0]
283
-
284
-
285
- @torch.no_grad()
286
- def sample_lms(model, x, sigmas, extra_args=None, callback=None, disable=None, order=4):
287
- extra_args = {} if extra_args is None else extra_args
288
- s_in = x.new_ones([x.shape[0]])
289
- sigmas_cpu = sigmas.detach().cpu().numpy()
290
- ds = []
291
- for i in trange(len(sigmas) - 1, disable=disable):
292
- denoised = model(x, sigmas[i] * s_in, **extra_args)
293
- d = to_d(x, sigmas[i], denoised)
294
- ds.append(d)
295
- if len(ds) > order:
296
- ds.pop(0)
297
- if callback is not None:
298
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
299
- cur_order = min(i + 1, order)
300
- coeffs = [linear_multistep_coeff(cur_order, sigmas_cpu, i, j) for j in range(cur_order)]
301
- x = x + sum(coeff * d for coeff, d in zip(coeffs, reversed(ds)))
302
- return x
303
-
304
-
305
- class PIDStepSizeController:
306
- """A PID controller for ODE adaptive step size control."""
307
- def __init__(self, h, pcoeff, icoeff, dcoeff, order=1, accept_safety=0.81, eps=1e-8):
308
- self.h = h
309
- self.b1 = (pcoeff + icoeff + dcoeff) / order
310
- self.b2 = -(pcoeff + 2 * dcoeff) / order
311
- self.b3 = dcoeff / order
312
- self.accept_safety = accept_safety
313
- self.eps = eps
314
- self.errs = []
315
-
316
- def limiter(self, x):
317
- return 1 + math.atan(x - 1)
318
-
319
- def propose_step(self, error):
320
- inv_error = 1 / (float(error) + self.eps)
321
- if not self.errs:
322
- self.errs = [inv_error, inv_error, inv_error]
323
- self.errs[0] = inv_error
324
- factor = self.errs[0] ** self.b1 * self.errs[1] ** self.b2 * self.errs[2] ** self.b3
325
- factor = self.limiter(factor)
326
- accept = factor >= self.accept_safety
327
- if accept:
328
- self.errs[2] = self.errs[1]
329
- self.errs[1] = self.errs[0]
330
- self.h *= factor
331
- return accept
332
-
333
-
334
- class DPMSolver(nn.Module):
335
- """DPM-Solver. See https://arxiv.org/abs/2206.00927."""
336
-
337
- def __init__(self, model, extra_args=None, eps_callback=None, info_callback=None):
338
- super().__init__()
339
- self.model = model
340
- self.extra_args = {} if extra_args is None else extra_args
341
- self.eps_callback = eps_callback
342
- self.info_callback = info_callback
343
-
344
- def t(self, sigma):
345
- return -sigma.log()
346
-
347
- def sigma(self, t):
348
- return t.neg().exp()
349
-
350
- def eps(self, eps_cache, key, x, t, *args, **kwargs):
351
- if key in eps_cache:
352
- return eps_cache[key], eps_cache
353
- sigma = self.sigma(t) * x.new_ones([x.shape[0]])
354
- eps = (x - self.model(x, sigma, *args, **self.extra_args, **kwargs)) / self.sigma(t)
355
- if self.eps_callback is not None:
356
- self.eps_callback()
357
- return eps, {key: eps, **eps_cache}
358
-
359
- def dpm_solver_1_step(self, x, t, t_next, eps_cache=None):
360
- eps_cache = {} if eps_cache is None else eps_cache
361
- h = t_next - t
362
- eps, eps_cache = self.eps(eps_cache, 'eps', x, t)
363
- x_1 = x - self.sigma(t_next) * h.expm1() * eps
364
- return x_1, eps_cache
365
-
366
- def dpm_solver_2_step(self, x, t, t_next, r1=1 / 2, eps_cache=None):
367
- eps_cache = {} if eps_cache is None else eps_cache
368
- h = t_next - t
369
- eps, eps_cache = self.eps(eps_cache, 'eps', x, t)
370
- s1 = t + r1 * h
371
- u1 = x - self.sigma(s1) * (r1 * h).expm1() * eps
372
- eps_r1, eps_cache = self.eps(eps_cache, 'eps_r1', u1, s1)
373
- x_2 = x - self.sigma(t_next) * h.expm1() * eps - self.sigma(t_next) / (2 * r1) * h.expm1() * (eps_r1 - eps)
374
- return x_2, eps_cache
375
-
376
- def dpm_solver_3_step(self, x, t, t_next, r1=1 / 3, r2=2 / 3, eps_cache=None):
377
- eps_cache = {} if eps_cache is None else eps_cache
378
- h = t_next - t
379
- eps, eps_cache = self.eps(eps_cache, 'eps', x, t)
380
- s1 = t + r1 * h
381
- s2 = t + r2 * h
382
- u1 = x - self.sigma(s1) * (r1 * h).expm1() * eps
383
- eps_r1, eps_cache = self.eps(eps_cache, 'eps_r1', u1, s1)
384
- u2 = x - self.sigma(s2) * (r2 * h).expm1() * eps - self.sigma(s2) * (r2 / r1) * ((r2 * h).expm1() / (r2 * h) - 1) * (eps_r1 - eps)
385
- eps_r2, eps_cache = self.eps(eps_cache, 'eps_r2', u2, s2)
386
- x_3 = x - self.sigma(t_next) * h.expm1() * eps - self.sigma(t_next) / r2 * (h.expm1() / h - 1) * (eps_r2 - eps)
387
- return x_3, eps_cache
388
-
389
- def dpm_solver_fast(self, x, t_start, t_end, nfe, eta=0., s_noise=1., noise_sampler=None):
390
- noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
391
- if not t_end > t_start and eta:
392
- raise ValueError('eta must be 0 for reverse sampling')
393
-
394
- m = math.floor(nfe / 3) + 1
395
- ts = torch.linspace(t_start, t_end, m + 1, device=x.device)
396
-
397
- if nfe % 3 == 0:
398
- orders = [3] * (m - 2) + [2, 1]
399
- else:
400
- orders = [3] * (m - 1) + [nfe % 3]
401
-
402
- for i in range(len(orders)):
403
- eps_cache = {}
404
- t, t_next = ts[i], ts[i + 1]
405
- if eta:
406
- sd, su = get_ancestral_step(self.sigma(t), self.sigma(t_next), eta)
407
- t_next_ = torch.minimum(t_end, self.t(sd))
408
- su = (self.sigma(t_next) ** 2 - self.sigma(t_next_) ** 2) ** 0.5
409
- else:
410
- t_next_, su = t_next, 0.
411
-
412
- eps, eps_cache = self.eps(eps_cache, 'eps', x, t)
413
- denoised = x - self.sigma(t) * eps
414
- if self.info_callback is not None:
415
- self.info_callback({'x': x, 'i': i, 't': ts[i], 't_up': t, 'denoised': denoised})
416
-
417
- if orders[i] == 1:
418
- x, eps_cache = self.dpm_solver_1_step(x, t, t_next_, eps_cache=eps_cache)
419
- elif orders[i] == 2:
420
- x, eps_cache = self.dpm_solver_2_step(x, t, t_next_, eps_cache=eps_cache)
421
- else:
422
- x, eps_cache = self.dpm_solver_3_step(x, t, t_next_, eps_cache=eps_cache)
423
-
424
- x = x + su * s_noise * noise_sampler(self.sigma(t), self.sigma(t_next))
425
-
426
- return x
427
-
428
- def dpm_solver_adaptive(self, x, t_start, t_end, order=3, rtol=0.05, atol=0.0078, h_init=0.05, pcoeff=0., icoeff=1., dcoeff=0., accept_safety=0.81, eta=0., s_noise=1., noise_sampler=None):
429
- noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
430
- if order not in {2, 3}:
431
- raise ValueError('order should be 2 or 3')
432
- forward = t_end > t_start
433
- if not forward and eta:
434
- raise ValueError('eta must be 0 for reverse sampling')
435
- h_init = abs(h_init) * (1 if forward else -1)
436
- atol = torch.tensor(atol)
437
- rtol = torch.tensor(rtol)
438
- s = t_start
439
- x_prev = x
440
- accept = True
441
- pid = PIDStepSizeController(h_init, pcoeff, icoeff, dcoeff, 1.5 if eta else order, accept_safety)
442
- info = {'steps': 0, 'nfe': 0, 'n_accept': 0, 'n_reject': 0}
443
-
444
- while s < t_end - 1e-5 if forward else s > t_end + 1e-5:
445
- eps_cache = {}
446
- t = torch.minimum(t_end, s + pid.h) if forward else torch.maximum(t_end, s + pid.h)
447
- if eta:
448
- sd, su = get_ancestral_step(self.sigma(s), self.sigma(t), eta)
449
- t_ = torch.minimum(t_end, self.t(sd))
450
- su = (self.sigma(t) ** 2 - self.sigma(t_) ** 2) ** 0.5
451
- else:
452
- t_, su = t, 0.
453
-
454
- eps, eps_cache = self.eps(eps_cache, 'eps', x, s)
455
- denoised = x - self.sigma(s) * eps
456
-
457
- if order == 2:
458
- x_low, eps_cache = self.dpm_solver_1_step(x, s, t_, eps_cache=eps_cache)
459
- x_high, eps_cache = self.dpm_solver_2_step(x, s, t_, eps_cache=eps_cache)
460
- else:
461
- x_low, eps_cache = self.dpm_solver_2_step(x, s, t_, r1=1 / 3, eps_cache=eps_cache)
462
- x_high, eps_cache = self.dpm_solver_3_step(x, s, t_, eps_cache=eps_cache)
463
- delta = torch.maximum(atol, rtol * torch.maximum(x_low.abs(), x_prev.abs()))
464
- error = torch.linalg.norm((x_low - x_high) / delta) / x.numel() ** 0.5
465
- accept = pid.propose_step(error)
466
- if accept:
467
- x_prev = x_low
468
- x = x_high + su * s_noise * noise_sampler(self.sigma(s), self.sigma(t))
469
- s = t
470
- info['n_accept'] += 1
471
- else:
472
- info['n_reject'] += 1
473
- info['nfe'] += order
474
- info['steps'] += 1
475
-
476
- if self.info_callback is not None:
477
- self.info_callback({'x': x, 'i': info['steps'] - 1, 't': s, 't_up': s, 'denoised': denoised, 'error': error, 'h': pid.h, **info})
478
-
479
- return x, info
480
-
481
-
482
- @torch.no_grad()
483
- def sample_dpm_fast(model, x, sigma_min, sigma_max, n, extra_args=None, callback=None, disable=None, eta=0., s_noise=1., noise_sampler=None):
484
- """DPM-Solver-Fast (fixed step size). See https://arxiv.org/abs/2206.00927."""
485
- if sigma_min <= 0 or sigma_max <= 0:
486
- raise ValueError('sigma_min and sigma_max must not be 0')
487
- with tqdm(total=n, disable=disable) as pbar:
488
- dpm_solver = DPMSolver(model, extra_args, eps_callback=pbar.update)
489
- if callback is not None:
490
- dpm_solver.info_callback = lambda info: callback({'sigma': dpm_solver.sigma(info['t']), 'sigma_hat': dpm_solver.sigma(info['t_up']), **info})
491
- return dpm_solver.dpm_solver_fast(x, dpm_solver.t(torch.tensor(sigma_max)), dpm_solver.t(torch.tensor(sigma_min)), n, eta, s_noise, noise_sampler)
492
-
493
-
494
- @torch.no_grad()
495
- def sample_dpm_adaptive(model, x, sigma_min, sigma_max, extra_args=None, callback=None, disable=None, order=3, rtol=0.05, atol=0.0078, h_init=0.05, pcoeff=0., icoeff=1., dcoeff=0., accept_safety=0.81, eta=0., s_noise=1., noise_sampler=None, return_info=False):
496
- """DPM-Solver-12 and 23 (adaptive step size). See https://arxiv.org/abs/2206.00927."""
497
- if sigma_min <= 0 or sigma_max <= 0:
498
- raise ValueError('sigma_min and sigma_max must not be 0')
499
- with tqdm(disable=disable) as pbar:
500
- dpm_solver = DPMSolver(model, extra_args, eps_callback=pbar.update)
501
- if callback is not None:
502
- dpm_solver.info_callback = lambda info: callback({'sigma': dpm_solver.sigma(info['t']), 'sigma_hat': dpm_solver.sigma(info['t_up']), **info})
503
- x, info = dpm_solver.dpm_solver_adaptive(x, dpm_solver.t(torch.tensor(sigma_max)), dpm_solver.t(torch.tensor(sigma_min)), order, rtol, atol, h_init, pcoeff, icoeff, dcoeff, accept_safety, eta, s_noise, noise_sampler)
504
- if return_info:
505
- return x, info
506
- return x
507
-
508
-
509
- @torch.no_grad()
510
- def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
511
- """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
512
- extra_args = {} if extra_args is None else extra_args
513
- noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
514
- s_in = x.new_ones([x.shape[0]])
515
- sigma_fn = lambda t: t.neg().exp()
516
- t_fn = lambda sigma: sigma.log().neg()
517
-
518
- for i in trange(len(sigmas) - 1, disable=disable):
519
- denoised = model(x, sigmas[i] * s_in, **extra_args)
520
- sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
521
- if callback is not None:
522
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
523
- if sigma_down == 0:
524
- # Euler method
525
- d = to_d(x, sigmas[i], denoised)
526
- dt = sigma_down - sigmas[i]
527
- x = x + d * dt
528
- else:
529
- # DPM-Solver++(2S)
530
- t, t_next = t_fn(sigmas[i]), t_fn(sigma_down)
531
- r = 1 / 2
532
- h = t_next - t
533
- s = t + r * h
534
- x_2 = (sigma_fn(s) / sigma_fn(t)) * x - (-h * r).expm1() * denoised
535
- denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args)
536
- x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised_2
537
- # Noise addition
538
- if sigmas[i + 1] > 0:
539
- x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
540
- return x
541
-
542
-
543
- @torch.no_grad()
544
- def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
545
- """DPM-Solver++ (stochastic)."""
546
- if len(sigmas) <= 1:
547
- return x
548
-
549
- sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
550
- seed = extra_args.get("seed", None)
551
- noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
552
- extra_args = {} if extra_args is None else extra_args
553
- s_in = x.new_ones([x.shape[0]])
554
- sigma_fn = lambda t: t.neg().exp()
555
- t_fn = lambda sigma: sigma.log().neg()
556
-
557
- for i in trange(len(sigmas) - 1, disable=disable):
558
- denoised = model(x, sigmas[i] * s_in, **extra_args)
559
- if callback is not None:
560
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
561
- if sigmas[i + 1] == 0:
562
- # Euler method
563
- d = to_d(x, sigmas[i], denoised)
564
- dt = sigmas[i + 1] - sigmas[i]
565
- x = x + d * dt
566
- else:
567
- # DPM-Solver++
568
- t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
569
- h = t_next - t
570
- s = t + h * r
571
- fac = 1 / (2 * r)
572
-
573
- # Step 1
574
- sd, su = get_ancestral_step(sigma_fn(t), sigma_fn(s), eta)
575
- s_ = t_fn(sd)
576
- x_2 = (sigma_fn(s_) / sigma_fn(t)) * x - (t - s_).expm1() * denoised
577
- x_2 = x_2 + noise_sampler(sigma_fn(t), sigma_fn(s)) * s_noise * su
578
- denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args)
579
-
580
- # Step 2
581
- sd, su = get_ancestral_step(sigma_fn(t), sigma_fn(t_next), eta)
582
- t_next_ = t_fn(sd)
583
- denoised_d = (1 - fac) * denoised + fac * denoised_2
584
- x = (sigma_fn(t_next_) / sigma_fn(t)) * x - (t - t_next_).expm1() * denoised_d
585
- x = x + noise_sampler(sigma_fn(t), sigma_fn(t_next)) * s_noise * su
586
- return x
587
-
588
-
589
- @torch.no_grad()
590
- def sample_dpmpp_2m(model, x, sigmas, extra_args=None, callback=None, disable=None):
591
- """DPM-Solver++(2M)."""
592
- extra_args = {} if extra_args is None else extra_args
593
- s_in = x.new_ones([x.shape[0]])
594
- sigma_fn = lambda t: t.neg().exp()
595
- t_fn = lambda sigma: sigma.log().neg()
596
- old_denoised = None
597
-
598
- for i in trange(len(sigmas) - 1, disable=disable):
599
- denoised = model(x, sigmas[i] * s_in, **extra_args)
600
- if callback is not None:
601
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
602
- t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
603
- h = t_next - t
604
- if old_denoised is None or sigmas[i + 1] == 0:
605
- x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised
606
- else:
607
- h_last = t - t_fn(sigmas[i - 1])
608
- r = h_last / h
609
- denoised_d = (1 + 1 / (2 * r)) * denoised - (1 / (2 * r)) * old_denoised
610
- x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised_d
611
- old_denoised = denoised
612
- return x
613
-
614
- @torch.no_grad()
615
- def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
616
- """DPM-Solver++(2M) SDE."""
617
- if len(sigmas) <= 1:
618
- return x
619
-
620
- if solver_type not in {'heun', 'midpoint'}:
621
- raise ValueError('solver_type must be \'heun\' or \'midpoint\'')
622
-
623
- seed = extra_args.get("seed", None)
624
- sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
625
- noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
626
- extra_args = {} if extra_args is None else extra_args
627
- s_in = x.new_ones([x.shape[0]])
628
-
629
- old_denoised = None
630
- h_last = None
631
- h = None
632
-
633
- for i in trange(len(sigmas) - 1, disable=disable):
634
- denoised = model(x, sigmas[i] * s_in, **extra_args)
635
- if callback is not None:
636
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
637
- if sigmas[i + 1] == 0:
638
- # Denoising step
639
- x = denoised
640
- else:
641
- # DPM-Solver++(2M) SDE
642
- t, s = -sigmas[i].log(), -sigmas[i + 1].log()
643
- h = s - t
644
- eta_h = eta * h
645
-
646
- x = sigmas[i + 1] / sigmas[i] * (-eta_h).exp() * x + (-h - eta_h).expm1().neg() * denoised
647
-
648
- if old_denoised is not None:
649
- r = h_last / h
650
- if solver_type == 'heun':
651
- x = x + ((-h - eta_h).expm1().neg() / (-h - eta_h) + 1) * (1 / r) * (denoised - old_denoised)
652
- elif solver_type == 'midpoint':
653
- x = x + 0.5 * (-h - eta_h).expm1().neg() * (1 / r) * (denoised - old_denoised)
654
-
655
- if eta:
656
- x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * eta_h).expm1().neg().sqrt() * s_noise
657
-
658
- old_denoised = denoised
659
- h_last = h
660
- return x
661
-
662
- @torch.no_grad()
663
- def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
664
- """DPM-Solver++(3M) SDE."""
665
-
666
- if len(sigmas) <= 1:
667
- return x
668
-
669
- seed = extra_args.get("seed", None)
670
- sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
671
- noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
672
- extra_args = {} if extra_args is None else extra_args
673
- s_in = x.new_ones([x.shape[0]])
674
-
675
- denoised_1, denoised_2 = None, None
676
- h, h_1, h_2 = None, None, None
677
-
678
- for i in trange(len(sigmas) - 1, disable=disable):
679
- denoised = model(x, sigmas[i] * s_in, **extra_args)
680
- if callback is not None:
681
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
682
- if sigmas[i + 1] == 0:
683
- # Denoising step
684
- x = denoised
685
- else:
686
- t, s = -sigmas[i].log(), -sigmas[i + 1].log()
687
- h = s - t
688
- h_eta = h * (eta + 1)
689
-
690
- x = torch.exp(-h_eta) * x + (-h_eta).expm1().neg() * denoised
691
-
692
- if h_2 is not None:
693
- r0 = h_1 / h
694
- r1 = h_2 / h
695
- d1_0 = (denoised - denoised_1) / r0
696
- d1_1 = (denoised_1 - denoised_2) / r1
697
- d1 = d1_0 + (d1_0 - d1_1) * r0 / (r0 + r1)
698
- d2 = (d1_0 - d1_1) / (r0 + r1)
699
- phi_2 = h_eta.neg().expm1() / h_eta + 1
700
- phi_3 = phi_2 / h_eta - 0.5
701
- x = x + phi_2 * d1 - phi_3 * d2
702
- elif h_1 is not None:
703
- r = h_1 / h
704
- d = (denoised - denoised_1) / r
705
- phi_2 = h_eta.neg().expm1() / h_eta + 1
706
- x = x + phi_2 * d
707
-
708
- if eta:
709
- x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * h * eta).expm1().neg().sqrt() * s_noise
710
-
711
- denoised_1, denoised_2 = denoised, denoised_1
712
- h_1, h_2 = h, h_1
713
- return x
714
-
715
- @torch.no_grad()
716
- def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
717
- if len(sigmas) <= 1:
718
- return x
719
-
720
- sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
721
- noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
722
- return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler)
723
-
724
- @torch.no_grad()
725
- def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
726
- if len(sigmas) <= 1:
727
- return x
728
-
729
- sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
730
- noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
731
- return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
732
-
733
- @torch.no_grad()
734
- def sample_dpmpp_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
735
- if len(sigmas) <= 1:
736
- return x
737
-
738
- sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
739
- noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
740
- return sample_dpmpp_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=r)
741
-
742
-
743
- def DDPMSampler_step(x, sigma, sigma_prev, noise, noise_sampler):
744
- alpha_cumprod = 1 / ((sigma * sigma) + 1)
745
- alpha_cumprod_prev = 1 / ((sigma_prev * sigma_prev) + 1)
746
- alpha = (alpha_cumprod / alpha_cumprod_prev)
747
-
748
- mu = (1.0 / alpha).sqrt() * (x - (1 - alpha) * noise / (1 - alpha_cumprod).sqrt())
749
- if sigma_prev > 0:
750
- mu += ((1 - alpha) * (1. - alpha_cumprod_prev) / (1. - alpha_cumprod)).sqrt() * noise_sampler(sigma, sigma_prev)
751
- return mu
752
-
753
- def generic_step_sampler(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None, step_function=None):
754
- extra_args = {} if extra_args is None else extra_args
755
- noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
756
- s_in = x.new_ones([x.shape[0]])
757
-
758
- for i in trange(len(sigmas) - 1, disable=disable):
759
- denoised = model(x, sigmas[i] * s_in, **extra_args)
760
- if callback is not None:
761
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
762
- x = step_function(x / torch.sqrt(1.0 + sigmas[i] ** 2.0), sigmas[i], sigmas[i + 1], (x - denoised) / sigmas[i], noise_sampler)
763
- if sigmas[i + 1] != 0:
764
- x *= torch.sqrt(1.0 + sigmas[i + 1] ** 2.0)
765
- return x
766
-
767
-
768
- @torch.no_grad()
769
- def sample_ddpm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
770
- return generic_step_sampler(model, x, sigmas, extra_args, callback, disable, noise_sampler, DDPMSampler_step)
771
-
772
- @torch.no_grad()
773
- def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
774
- extra_args = {} if extra_args is None else extra_args
775
- noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
776
- s_in = x.new_ones([x.shape[0]])
777
- for i in trange(len(sigmas) - 1, disable=disable):
778
- denoised = model(x, sigmas[i] * s_in, **extra_args)
779
- if callback is not None:
780
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
781
-
782
- x = denoised
783
- if sigmas[i + 1] > 0:
784
- x = model.inner_model.inner_model.model_sampling.noise_scaling(sigmas[i + 1], noise_sampler(sigmas[i], sigmas[i + 1]), x)
785
- return x
786
-
787
-
788
-
789
- @torch.no_grad()
790
- def sample_heunpp2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
791
- # From MIT licensed: https://github.com/Carzit/sd-webui-samplers-scheduler/
792
- extra_args = {} if extra_args is None else extra_args
793
- s_in = x.new_ones([x.shape[0]])
794
- s_end = sigmas[-1]
795
- for i in trange(len(sigmas) - 1, disable=disable):
796
- gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
797
- eps = torch.randn_like(x) * s_noise
798
- sigma_hat = sigmas[i] * (gamma + 1)
799
- if gamma > 0:
800
- x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
801
- denoised = model(x, sigma_hat * s_in, **extra_args)
802
- d = to_d(x, sigma_hat, denoised)
803
- if callback is not None:
804
- callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
805
- dt = sigmas[i + 1] - sigma_hat
806
- if sigmas[i + 1] == s_end:
807
- # Euler method
808
- x = x + d * dt
809
- elif sigmas[i + 2] == s_end:
810
-
811
- # Heun's method
812
- x_2 = x + d * dt
813
- denoised_2 = model(x_2, sigmas[i + 1] * s_in, **extra_args)
814
- d_2 = to_d(x_2, sigmas[i + 1], denoised_2)
815
-
816
- w = 2 * sigmas[0]
817
- w2 = sigmas[i+1]/w
818
- w1 = 1 - w2
819
-
820
- d_prime = d * w1 + d_2 * w2
821
-
822
-
823
- x = x + d_prime * dt
824
-
825
- else:
826
- # Heun++
827
- x_2 = x + d * dt
828
- denoised_2 = model(x_2, sigmas[i + 1] * s_in, **extra_args)
829
- d_2 = to_d(x_2, sigmas[i + 1], denoised_2)
830
- dt_2 = sigmas[i + 2] - sigmas[i + 1]
831
-
832
- x_3 = x_2 + d_2 * dt_2
833
- denoised_3 = model(x_3, sigmas[i + 2] * s_in, **extra_args)
834
- d_3 = to_d(x_3, sigmas[i + 2], denoised_3)
835
-
836
- w = 3 * sigmas[0]
837
- w2 = sigmas[i + 1] / w
838
- w3 = sigmas[i + 2] / w
839
- w1 = 1 - w2 - w3
840
-
841
- d_prime = w1 * d + w2 * d_2 + w3 * d_3
842
- x = x + d_prime * dt
843
- return x
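All of the deleted samplers above share the k-diffusion calling convention: a denoiser invoked as model(x, sigma * s_in, **extra_args), a latent scaled by the first sigma, and a descending sigma schedule that ends in 0. A minimal sketch using `sample_euler` with `get_sigmas_karras`; the toy denoiser and tensor shapes are assumptions, not part of the original file:

import torch

def toy_denoiser(x, sigma, **extra_args):
    # Placeholder for a real model wrapper; must return the denoised
    # prediction of x at noise level sigma.
    return torch.zeros_like(x)

sigmas = get_sigmas_karras(n=20, sigma_min=0.03, sigma_max=14.6)  # append_zero adds the trailing 0
x = torch.randn(1, 4, 64, 64) * sigmas[0]                         # start from pure noise

samples = sample_euler(toy_denoiser, x, sigmas, disable=True)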
MagicQuill/comfy/k_diffusion/utils.py DELETED
@@ -1,313 +0,0 @@
1
- from contextlib import contextmanager
2
- import hashlib
3
- import math
4
- from pathlib import Path
5
- import shutil
6
- import urllib
7
- import warnings
8
-
9
- from PIL import Image
10
- import torch
11
- from torch import nn, optim
12
- from torch.utils import data
13
-
14
-
15
- def hf_datasets_augs_helper(examples, transform, image_key, mode='RGB'):
16
- """Apply passed in transforms for HuggingFace Datasets."""
17
- images = [transform(image.convert(mode)) for image in examples[image_key]]
18
- return {image_key: images}
19
-
20
-
21
- def append_dims(x, target_dims):
22
- """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
23
- dims_to_append = target_dims - x.ndim
24
- if dims_to_append < 0:
25
- raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less')
26
- expanded = x[(...,) + (None,) * dims_to_append]
27
- # MPS will get inf values if it tries to index into the new axes, but detaching fixes this.
28
- # https://github.com/pytorch/pytorch/issues/84364
29
- return expanded.detach().clone() if expanded.device.type == 'mps' else expanded
30
-
31
-
32
- def n_params(module):
33
- """Returns the number of trainable parameters in a module."""
34
- return sum(p.numel() for p in module.parameters())
35
-
36
-
37
- def download_file(path, url, digest=None):
38
- """Downloads a file if it does not exist, optionally checking its SHA-256 hash."""
39
- path = Path(path)
40
- path.parent.mkdir(parents=True, exist_ok=True)
41
- if not path.exists():
42
- with urllib.request.urlopen(url) as response, open(path, 'wb') as f:
43
- shutil.copyfileobj(response, f)
44
- if digest is not None:
45
- file_digest = hashlib.sha256(open(path, 'rb').read()).hexdigest()
46
- if digest != file_digest:
47
- raise OSError(f'hash of {path} (url: {url}) failed to validate')
48
- return path
49
-
50
-
51
- @contextmanager
52
- def train_mode(model, mode=True):
53
- """A context manager that places a model into training mode and restores
54
- the previous mode on exit."""
55
- modes = [module.training for module in model.modules()]
56
- try:
57
- yield model.train(mode)
58
- finally:
59
- for i, module in enumerate(model.modules()):
60
- module.training = modes[i]
61
-
62
-
63
- def eval_mode(model):
64
- """A context manager that places a model into evaluation mode and restores
65
- the previous mode on exit."""
66
- return train_mode(model, False)
67
-
68
-
69
- @torch.no_grad()
70
- def ema_update(model, averaged_model, decay):
71
- """Incorporates updated model parameters into an exponential moving averaged
72
- version of a model. It should be called after each optimizer step."""
73
- model_params = dict(model.named_parameters())
74
- averaged_params = dict(averaged_model.named_parameters())
75
- assert model_params.keys() == averaged_params.keys()
76
-
77
- for name, param in model_params.items():
78
- averaged_params[name].mul_(decay).add_(param, alpha=1 - decay)
79
-
80
- model_buffers = dict(model.named_buffers())
81
- averaged_buffers = dict(averaged_model.named_buffers())
82
- assert model_buffers.keys() == averaged_buffers.keys()
83
-
84
- for name, buf in model_buffers.items():
85
- averaged_buffers[name].copy_(buf)
86
-
87
-
88
- class EMAWarmup:
89
- """Implements an EMA warmup using an inverse decay schedule.
90
- If inv_gamma=1 and power=1, implements a simple average. inv_gamma=1, power=2/3 are
91
- good values for models you plan to train for a million or more steps (reaches decay
92
- factor 0.999 at 31.6K steps, 0.9999 at 1M steps), inv_gamma=1, power=3/4 for models
93
- you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999 at
94
- 215.4k steps).
95
- Args:
96
- inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
97
- power (float): Exponential factor of EMA warmup. Default: 1.
98
- min_value (float): The minimum EMA decay rate. Default: 0.
99
- max_value (float): The maximum EMA decay rate. Default: 1.
100
- start_at (int): The epoch to start averaging at. Default: 0.
101
- last_epoch (int): The index of last epoch. Default: 0.
102
- """
103
-
104
- def __init__(self, inv_gamma=1., power=1., min_value=0., max_value=1., start_at=0,
105
- last_epoch=0):
106
- self.inv_gamma = inv_gamma
107
- self.power = power
108
- self.min_value = min_value
109
- self.max_value = max_value
110
- self.start_at = start_at
111
- self.last_epoch = last_epoch
112
-
113
- def state_dict(self):
114
- """Returns the state of the class as a :class:`dict`."""
115
- return dict(self.__dict__.items())
116
-
117
- def load_state_dict(self, state_dict):
118
- """Loads the class's state.
119
- Args:
120
- state_dict (dict): scaler state. Should be an object returned
121
- from a call to :meth:`state_dict`.
122
- """
123
- self.__dict__.update(state_dict)
124
-
125
- def get_value(self):
126
- """Gets the current EMA decay rate."""
127
- epoch = max(0, self.last_epoch - self.start_at)
128
- value = 1 - (1 + epoch / self.inv_gamma) ** -self.power
129
- return 0. if epoch < 0 else min(self.max_value, max(self.min_value, value))
130
-
131
- def step(self):
132
- """Updates the step count."""
133
- self.last_epoch += 1
134
-
135
-
136
- class InverseLR(optim.lr_scheduler._LRScheduler):
137
- """Implements an inverse decay learning rate schedule with an optional exponential
138
- warmup. When last_epoch=-1, sets initial lr as lr.
139
- inv_gamma is the number of steps/epochs required for the learning rate to decay to
140
- (1 / 2)**power of its original value.
141
- Args:
142
- optimizer (Optimizer): Wrapped optimizer.
143
- inv_gamma (float): Inverse multiplicative factor of learning rate decay. Default: 1.
144
- power (float): Exponential factor of learning rate decay. Default: 1.
145
- warmup (float): Exponential warmup factor (0 <= warmup < 1, 0 to disable)
146
- Default: 0.
147
- min_lr (float): The minimum learning rate. Default: 0.
148
- last_epoch (int): The index of last epoch. Default: -1.
149
- verbose (bool): If ``True``, prints a message to stdout for
150
- each update. Default: ``False``.
151
- """
152
-
153
- def __init__(self, optimizer, inv_gamma=1., power=1., warmup=0., min_lr=0.,
154
- last_epoch=-1, verbose=False):
155
- self.inv_gamma = inv_gamma
156
- self.power = power
157
- if not 0. <= warmup < 1:
158
- raise ValueError('Invalid value for warmup')
159
- self.warmup = warmup
160
- self.min_lr = min_lr
161
- super().__init__(optimizer, last_epoch, verbose)
162
-
163
- def get_lr(self):
164
- if not self._get_lr_called_within_step:
165
- warnings.warn("To get the last learning rate computed by the scheduler, "
166
- "please use `get_last_lr()`.")
167
-
168
- return self._get_closed_form_lr()
169
-
170
- def _get_closed_form_lr(self):
171
- warmup = 1 - self.warmup ** (self.last_epoch + 1)
172
- lr_mult = (1 + self.last_epoch / self.inv_gamma) ** -self.power
173
- return [warmup * max(self.min_lr, base_lr * lr_mult)
174
- for base_lr in self.base_lrs]
175
-
176
-
177
- class ExponentialLR(optim.lr_scheduler._LRScheduler):
178
- """Implements an exponential learning rate schedule with an optional exponential
179
- warmup. When last_epoch=-1, sets initial lr as lr. Decays the learning rate
180
- continuously by decay (default 0.5) every num_steps steps.
181
- Args:
182
- optimizer (Optimizer): Wrapped optimizer.
183
- num_steps (float): The number of steps to decay the learning rate by decay in.
184
- decay (float): The factor by which to decay the learning rate every num_steps
185
- steps. Default: 0.5.
186
- warmup (float): Exponential warmup factor (0 <= warmup < 1, 0 to disable)
187
- Default: 0.
188
- min_lr (float): The minimum learning rate. Default: 0.
189
- last_epoch (int): The index of last epoch. Default: -1.
190
- verbose (bool): If ``True``, prints a message to stdout for
191
- each update. Default: ``False``.
192
- """
193
-
194
- def __init__(self, optimizer, num_steps, decay=0.5, warmup=0., min_lr=0.,
195
- last_epoch=-1, verbose=False):
196
- self.num_steps = num_steps
197
- self.decay = decay
198
- if not 0. <= warmup < 1:
199
- raise ValueError('Invalid value for warmup')
200
- self.warmup = warmup
201
- self.min_lr = min_lr
202
- super().__init__(optimizer, last_epoch, verbose)
203
-
204
- def get_lr(self):
205
- if not self._get_lr_called_within_step:
206
- warnings.warn("To get the last learning rate computed by the scheduler, "
207
- "please use `get_last_lr()`.")
208
-
209
- return self._get_closed_form_lr()
210
-
211
- def _get_closed_form_lr(self):
212
- warmup = 1 - self.warmup ** (self.last_epoch + 1)
213
- lr_mult = (self.decay ** (1 / self.num_steps)) ** self.last_epoch
214
- return [warmup * max(self.min_lr, base_lr * lr_mult)
215
- for base_lr in self.base_lrs]
216
-
217
-
218
- def rand_log_normal(shape, loc=0., scale=1., device='cpu', dtype=torch.float32):
219
- """Draws samples from an lognormal distribution."""
220
- return (torch.randn(shape, device=device, dtype=dtype) * scale + loc).exp()
221
-
222
-
223
- def rand_log_logistic(shape, loc=0., scale=1., min_value=0., max_value=float('inf'), device='cpu', dtype=torch.float32):
224
- """Draws samples from an optionally truncated log-logistic distribution."""
225
- min_value = torch.as_tensor(min_value, device=device, dtype=torch.float64)
226
- max_value = torch.as_tensor(max_value, device=device, dtype=torch.float64)
227
- min_cdf = min_value.log().sub(loc).div(scale).sigmoid()
228
- max_cdf = max_value.log().sub(loc).div(scale).sigmoid()
229
- u = torch.rand(shape, device=device, dtype=torch.float64) * (max_cdf - min_cdf) + min_cdf
230
- return u.logit().mul(scale).add(loc).exp().to(dtype)
231
-
232
-
233
- def rand_log_uniform(shape, min_value, max_value, device='cpu', dtype=torch.float32):
234
- """Draws samples from an log-uniform distribution."""
235
- min_value = math.log(min_value)
236
- max_value = math.log(max_value)
237
- return (torch.rand(shape, device=device, dtype=dtype) * (max_value - min_value) + min_value).exp()
238
-
239
-
240
- def rand_v_diffusion(shape, sigma_data=1., min_value=0., max_value=float('inf'), device='cpu', dtype=torch.float32):
241
- """Draws samples from a truncated v-diffusion training timestep distribution."""
242
- min_cdf = math.atan(min_value / sigma_data) * 2 / math.pi
243
- max_cdf = math.atan(max_value / sigma_data) * 2 / math.pi
244
- u = torch.rand(shape, device=device, dtype=dtype) * (max_cdf - min_cdf) + min_cdf
245
- return torch.tan(u * math.pi / 2) * sigma_data
246
-
247
-
248
- def rand_split_log_normal(shape, loc, scale_1, scale_2, device='cpu', dtype=torch.float32):
249
- """Draws samples from a split lognormal distribution."""
250
- n = torch.randn(shape, device=device, dtype=dtype).abs()
251
- u = torch.rand(shape, device=device, dtype=dtype)
252
- n_left = n * -scale_1 + loc
253
- n_right = n * scale_2 + loc
254
- ratio = scale_1 / (scale_1 + scale_2)
255
- return torch.where(u < ratio, n_left, n_right).exp()
256
-
257
-
258
- class FolderOfImages(data.Dataset):
259
- """Recursively finds all images in a directory. It does not support
260
- classes/targets."""
261
-
262
- IMG_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp'}
263
-
264
- def __init__(self, root, transform=None):
265
- super().__init__()
266
- self.root = Path(root)
267
- self.transform = nn.Identity() if transform is None else transform
268
- self.paths = sorted(path for path in self.root.rglob('*') if path.suffix.lower() in self.IMG_EXTENSIONS)
269
-
270
- def __repr__(self):
271
- return f'FolderOfImages(root="{self.root}", len: {len(self)})'
272
-
273
- def __len__(self):
274
- return len(self.paths)
275
-
276
- def __getitem__(self, key):
277
- path = self.paths[key]
278
- with open(path, 'rb') as f:
279
- image = Image.open(f).convert('RGB')
280
- image = self.transform(image)
281
- return image,
282
-
283
-
284
- class CSVLogger:
285
- def __init__(self, filename, columns):
286
- self.filename = Path(filename)
287
- self.columns = columns
288
- if self.filename.exists():
289
- self.file = open(self.filename, 'a')
290
- else:
291
- self.file = open(self.filename, 'w')
292
- self.write(*self.columns)
293
-
294
- def write(self, *args):
295
- print(*args, sep=',', file=self.file, flush=True)
296
-
297
-
298
- @contextmanager
299
- def tf32_mode(cudnn=None, matmul=None):
300
- """A context manager that sets whether TF32 is allowed on cuDNN or matmul."""
301
- cudnn_old = torch.backends.cudnn.allow_tf32
302
- matmul_old = torch.backends.cuda.matmul.allow_tf32
303
- try:
304
- if cudnn is not None:
305
- torch.backends.cudnn.allow_tf32 = cudnn
306
- if matmul is not None:
307
- torch.backends.cuda.matmul.allow_tf32 = matmul
308
- yield
309
- finally:
310
- if cudnn is not None:
311
- torch.backends.cudnn.allow_tf32 = cudnn_old
312
- if matmul is not None:
313
- torch.backends.cuda.matmul.allow_tf32 = matmul_old
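
As a quick reference for the EMAWarmup schedule defined in the deleted utils.py, the decay it produces can be recomputed with a few lines of plain Python (the helper below is a re-derivation for illustration, not the removed class):

# decay(epoch) = 1 - (1 + epoch / inv_gamma) ** -power, clamped to [min_value, max_value]
def ema_decay(epoch, inv_gamma=1.0, power=2 / 3, min_value=0.0, max_value=0.9999):
    value = 1 - (1 + epoch / inv_gamma) ** -power
    return min(max_value, max(min_value, value))

print(round(ema_decay(31_600), 4))     # ~0.999 at 31.6K steps, matching the docstring
print(round(ema_decay(1_000_000), 4))  # ~0.9999 at 1M steps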
 
MagicQuill/comfy/latent_formats.py DELETED
@@ -1,141 +0,0 @@
1
- import torch
2
-
3
- class LatentFormat:
4
- scale_factor = 1.0
5
- latent_channels = 4
6
- latent_rgb_factors = None
7
- taesd_decoder_name = None
8
-
9
- def process_in(self, latent):
10
- return latent * self.scale_factor
11
-
12
- def process_out(self, latent):
13
- return latent / self.scale_factor
14
-
15
- class SD15(LatentFormat):
16
- def __init__(self, scale_factor=0.18215):
17
- self.scale_factor = scale_factor
18
- self.latent_rgb_factors = [
19
- # R G B
20
- [ 0.3512, 0.2297, 0.3227],
21
- [ 0.3250, 0.4974, 0.2350],
22
- [-0.2829, 0.1762, 0.2721],
23
- [-0.2120, -0.2616, -0.7177]
24
- ]
25
- self.taesd_decoder_name = "taesd_decoder"
26
-
27
- class SDXL(LatentFormat):
28
- scale_factor = 0.13025
29
-
30
- def __init__(self):
31
- self.latent_rgb_factors = [
32
- # R G B
33
- [ 0.3920, 0.4054, 0.4549],
34
- [-0.2634, -0.0196, 0.0653],
35
- [ 0.0568, 0.1687, -0.0755],
36
- [-0.3112, -0.2359, -0.2076]
37
- ]
38
- self.taesd_decoder_name = "taesdxl_decoder"
39
-
40
- class SDXL_Playground_2_5(LatentFormat):
41
- def __init__(self):
42
- self.scale_factor = 0.5
43
- self.latents_mean = torch.tensor([-1.6574, 1.886, -1.383, 2.5155]).view(1, 4, 1, 1)
44
- self.latents_std = torch.tensor([8.4927, 5.9022, 6.5498, 5.2299]).view(1, 4, 1, 1)
45
-
46
- self.latent_rgb_factors = [
47
- # R G B
48
- [ 0.3920, 0.4054, 0.4549],
49
- [-0.2634, -0.0196, 0.0653],
50
- [ 0.0568, 0.1687, -0.0755],
51
- [-0.3112, -0.2359, -0.2076]
52
- ]
53
- self.taesd_decoder_name = "taesdxl_decoder"
54
-
55
- def process_in(self, latent):
56
- latents_mean = self.latents_mean.to(latent.device, latent.dtype)
57
- latents_std = self.latents_std.to(latent.device, latent.dtype)
58
- return (latent - latents_mean) * self.scale_factor / latents_std
59
-
60
- def process_out(self, latent):
61
- latents_mean = self.latents_mean.to(latent.device, latent.dtype)
62
- latents_std = self.latents_std.to(latent.device, latent.dtype)
63
- return latent * latents_std / self.scale_factor + latents_mean
64
-
65
-
66
- class SD_X4(LatentFormat):
67
- def __init__(self):
68
- self.scale_factor = 0.08333
69
- self.latent_rgb_factors = [
70
- [-0.2340, -0.3863, -0.3257],
71
- [ 0.0994, 0.0885, -0.0908],
72
- [-0.2833, -0.2349, -0.3741],
73
- [ 0.2523, -0.0055, -0.1651]
74
- ]
75
-
76
- class SC_Prior(LatentFormat):
77
- latent_channels = 16
78
- def __init__(self):
79
- self.scale_factor = 1.0
80
- self.latent_rgb_factors = [
81
- [-0.0326, -0.0204, -0.0127],
82
- [-0.1592, -0.0427, 0.0216],
83
- [ 0.0873, 0.0638, -0.0020],
84
- [-0.0602, 0.0442, 0.1304],
85
- [ 0.0800, -0.0313, -0.1796],
86
- [-0.0810, -0.0638, -0.1581],
87
- [ 0.1791, 0.1180, 0.0967],
88
- [ 0.0740, 0.1416, 0.0432],
89
- [-0.1745, -0.1888, -0.1373],
90
- [ 0.2412, 0.1577, 0.0928],
91
- [ 0.1908, 0.0998, 0.0682],
92
- [ 0.0209, 0.0365, -0.0092],
93
- [ 0.0448, -0.0650, -0.1728],
94
- [-0.1658, -0.1045, -0.1308],
95
- [ 0.0542, 0.1545, 0.1325],
96
- [-0.0352, -0.1672, -0.2541]
97
- ]
98
-
99
- class SC_B(LatentFormat):
100
- def __init__(self):
101
- self.scale_factor = 1.0 / 0.43
102
- self.latent_rgb_factors = [
103
- [ 0.1121, 0.2006, 0.1023],
104
- [-0.2093, -0.0222, -0.0195],
105
- [-0.3087, -0.1535, 0.0366],
106
- [ 0.0290, -0.1574, -0.4078]
107
- ]
108
-
109
- class SD3(LatentFormat):
110
- latent_channels = 16
111
- def __init__(self):
112
- self.scale_factor = 1.5305
113
- self.shift_factor = 0.0609
114
- self.latent_rgb_factors = [
115
- [-0.0645, 0.0177, 0.1052],
116
- [ 0.0028, 0.0312, 0.0650],
117
- [ 0.1848, 0.0762, 0.0360],
118
- [ 0.0944, 0.0360, 0.0889],
119
- [ 0.0897, 0.0506, -0.0364],
120
- [-0.0020, 0.1203, 0.0284],
121
- [ 0.0855, 0.0118, 0.0283],
122
- [-0.0539, 0.0658, 0.1047],
123
- [-0.0057, 0.0116, 0.0700],
124
- [-0.0412, 0.0281, -0.0039],
125
- [ 0.1106, 0.1171, 0.1220],
126
- [-0.0248, 0.0682, -0.0481],
127
- [ 0.0815, 0.0846, 0.1207],
128
- [-0.0120, -0.0055, -0.0867],
129
- [-0.0749, -0.0634, -0.0456],
130
- [-0.1418, -0.1457, -0.1259]
131
- ]
132
- self.taesd_decoder_name = "taesd3_decoder"
133
-
134
- def process_in(self, latent):
135
- return (latent - self.shift_factor) * self.scale_factor
136
-
137
- def process_out(self, latent):
138
- return (latent / self.scale_factor) + self.shift_factor
139
-
140
- class StableAudio1(LatentFormat):
141
- latent_channels = 64
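
The removed latent_formats.py classes all follow the same pattern: process_in scales (and optionally shifts) a latent before sampling, and process_out inverts the transform before decoding. A minimal round-trip check using the SD3 constants shown above:

import torch

scale_factor, shift_factor = 1.5305, 0.0609        # constants from the deleted SD3 class
latent = torch.randn(1, 16, 32, 32)
encoded = (latent - shift_factor) * scale_factor   # process_in
decoded = encoded / scale_factor + shift_factor    # process_out
print(torch.allclose(decoded, latent, atol=1e-5))  # True: the two calls are inverses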
 
MagicQuill/comfy/ldm/.DS_Store DELETED
Binary file (6.15 kB)
 
MagicQuill/comfy/ldm/__pycache__/util.cpython-310.pyc DELETED
Binary file (6.19 kB)
 
MagicQuill/comfy/ldm/audio/__pycache__/autoencoder.cpython-310.pyc DELETED
Binary file (8.08 kB)
 
MagicQuill/comfy/ldm/audio/__pycache__/dit.cpython-310.pyc DELETED
Binary file (18.7 kB)
 
MagicQuill/comfy/ldm/audio/__pycache__/embedders.cpython-310.pyc DELETED
Binary file (4.34 kB)
 
MagicQuill/comfy/ldm/audio/autoencoder.py DELETED
@@ -1,282 +0,0 @@
1
- # code adapted from: https://github.com/Stability-AI/stable-audio-tools
2
-
3
- import torch
4
- from torch import nn
5
- from typing import Literal, Dict, Any
6
- import math
7
- import comfy.ops
8
- ops = comfy.ops.disable_weight_init
9
-
10
- def vae_sample(mean, scale):
11
- stdev = nn.functional.softplus(scale) + 1e-4
12
- var = stdev * stdev
13
- logvar = torch.log(var)
14
- latents = torch.randn_like(mean) * stdev + mean
15
-
16
- kl = (mean * mean + var - logvar - 1).sum(1).mean()
17
-
18
- return latents, kl
19
-
20
- class VAEBottleneck(nn.Module):
21
- def __init__(self):
22
- super().__init__()
23
- self.is_discrete = False
24
-
25
- def encode(self, x, return_info=False, **kwargs):
26
- info = {}
27
-
28
- mean, scale = x.chunk(2, dim=1)
29
-
30
- x, kl = vae_sample(mean, scale)
31
-
32
- info["kl"] = kl
33
-
34
- if return_info:
35
- return x, info
36
- else:
37
- return x
38
-
39
- def decode(self, x):
40
- return x
41
-
42
-
43
- def snake_beta(x, alpha, beta):
44
- return x + (1.0 / (beta + 0.000000001)) * pow(torch.sin(x * alpha), 2)
45
-
46
- # Adapted from https://github.com/NVIDIA/BigVGAN/blob/main/activations.py under MIT license
47
- class SnakeBeta(nn.Module):
48
-
49
- def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True):
50
- super(SnakeBeta, self).__init__()
51
- self.in_features = in_features
52
-
53
- # initialize alpha
54
- self.alpha_logscale = alpha_logscale
55
- if self.alpha_logscale: # log scale alphas initialized to zeros
56
- self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
57
- self.beta = nn.Parameter(torch.zeros(in_features) * alpha)
58
- else: # linear scale alphas initialized to ones
59
- self.alpha = nn.Parameter(torch.ones(in_features) * alpha)
60
- self.beta = nn.Parameter(torch.ones(in_features) * alpha)
61
-
62
- # self.alpha.requires_grad = alpha_trainable
63
- # self.beta.requires_grad = alpha_trainable
64
-
65
- self.no_div_by_zero = 0.000000001
66
-
67
- def forward(self, x):
68
- alpha = self.alpha.unsqueeze(0).unsqueeze(-1).to(x.device) # line up with x to [B, C, T]
69
- beta = self.beta.unsqueeze(0).unsqueeze(-1).to(x.device)
70
- if self.alpha_logscale:
71
- alpha = torch.exp(alpha)
72
- beta = torch.exp(beta)
73
- x = snake_beta(x, alpha, beta)
74
-
75
- return x
76
-
77
- def WNConv1d(*args, **kwargs):
78
- try:
79
- return torch.nn.utils.parametrizations.weight_norm(ops.Conv1d(*args, **kwargs))
80
- except:
81
- return torch.nn.utils.weight_norm(ops.Conv1d(*args, **kwargs)) #support pytorch 2.1 and older
82
-
83
- def WNConvTranspose1d(*args, **kwargs):
84
- try:
85
- return torch.nn.utils.parametrizations.weight_norm(ops.ConvTranspose1d(*args, **kwargs))
86
- except:
87
- return torch.nn.utils.weight_norm(ops.ConvTranspose1d(*args, **kwargs)) #support pytorch 2.1 and older
88
-
89
- def get_activation(activation: Literal["elu", "snake", "none"], antialias=False, channels=None) -> nn.Module:
90
- if activation == "elu":
91
- act = torch.nn.ELU()
92
- elif activation == "snake":
93
- act = SnakeBeta(channels)
94
- elif activation == "none":
95
- act = torch.nn.Identity()
96
- else:
97
- raise ValueError(f"Unknown activation {activation}")
98
-
99
- if antialias:
100
- act = Activation1d(act)
101
-
102
- return act
103
-
104
-
105
- class ResidualUnit(nn.Module):
106
- def __init__(self, in_channels, out_channels, dilation, use_snake=False, antialias_activation=False):
107
- super().__init__()
108
-
109
- self.dilation = dilation
110
-
111
- padding = (dilation * (7-1)) // 2
112
-
113
- self.layers = nn.Sequential(
114
- get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=out_channels),
115
- WNConv1d(in_channels=in_channels, out_channels=out_channels,
116
- kernel_size=7, dilation=dilation, padding=padding),
117
- get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=out_channels),
118
- WNConv1d(in_channels=out_channels, out_channels=out_channels,
119
- kernel_size=1)
120
- )
121
-
122
- def forward(self, x):
123
- res = x
124
-
125
- #x = checkpoint(self.layers, x)
126
- x = self.layers(x)
127
-
128
- return x + res
129
-
130
- class EncoderBlock(nn.Module):
131
- def __init__(self, in_channels, out_channels, stride, use_snake=False, antialias_activation=False):
132
- super().__init__()
133
-
134
- self.layers = nn.Sequential(
135
- ResidualUnit(in_channels=in_channels,
136
- out_channels=in_channels, dilation=1, use_snake=use_snake),
137
- ResidualUnit(in_channels=in_channels,
138
- out_channels=in_channels, dilation=3, use_snake=use_snake),
139
- ResidualUnit(in_channels=in_channels,
140
- out_channels=in_channels, dilation=9, use_snake=use_snake),
141
- get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=in_channels),
142
- WNConv1d(in_channels=in_channels, out_channels=out_channels,
143
- kernel_size=2*stride, stride=stride, padding=math.ceil(stride/2)),
144
- )
145
-
146
- def forward(self, x):
147
- return self.layers(x)
148
-
149
- class DecoderBlock(nn.Module):
150
- def __init__(self, in_channels, out_channels, stride, use_snake=False, antialias_activation=False, use_nearest_upsample=False):
151
- super().__init__()
152
-
153
- if use_nearest_upsample:
154
- upsample_layer = nn.Sequential(
155
- nn.Upsample(scale_factor=stride, mode="nearest"),
156
- WNConv1d(in_channels=in_channels,
157
- out_channels=out_channels,
158
- kernel_size=2*stride,
159
- stride=1,
160
- bias=False,
161
- padding='same')
162
- )
163
- else:
164
- upsample_layer = WNConvTranspose1d(in_channels=in_channels,
165
- out_channels=out_channels,
166
- kernel_size=2*stride, stride=stride, padding=math.ceil(stride/2))
167
-
168
- self.layers = nn.Sequential(
169
- get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=in_channels),
170
- upsample_layer,
171
- ResidualUnit(in_channels=out_channels, out_channels=out_channels,
172
- dilation=1, use_snake=use_snake),
173
- ResidualUnit(in_channels=out_channels, out_channels=out_channels,
174
- dilation=3, use_snake=use_snake),
175
- ResidualUnit(in_channels=out_channels, out_channels=out_channels,
176
- dilation=9, use_snake=use_snake),
177
- )
178
-
179
- def forward(self, x):
180
- return self.layers(x)
181
-
182
- class OobleckEncoder(nn.Module):
183
- def __init__(self,
184
- in_channels=2,
185
- channels=128,
186
- latent_dim=32,
187
- c_mults = [1, 2, 4, 8],
188
- strides = [2, 4, 8, 8],
189
- use_snake=False,
190
- antialias_activation=False
191
- ):
192
- super().__init__()
193
-
194
- c_mults = [1] + c_mults
195
-
196
- self.depth = len(c_mults)
197
-
198
- layers = [
199
- WNConv1d(in_channels=in_channels, out_channels=c_mults[0] * channels, kernel_size=7, padding=3)
200
- ]
201
-
202
- for i in range(self.depth-1):
203
- layers += [EncoderBlock(in_channels=c_mults[i]*channels, out_channels=c_mults[i+1]*channels, stride=strides[i], use_snake=use_snake)]
204
-
205
- layers += [
206
- get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=c_mults[-1] * channels),
207
- WNConv1d(in_channels=c_mults[-1]*channels, out_channels=latent_dim, kernel_size=3, padding=1)
208
- ]
209
-
210
- self.layers = nn.Sequential(*layers)
211
-
212
- def forward(self, x):
213
- return self.layers(x)
214
-
215
-
216
- class OobleckDecoder(nn.Module):
217
- def __init__(self,
218
- out_channels=2,
219
- channels=128,
220
- latent_dim=32,
221
- c_mults = [1, 2, 4, 8],
222
- strides = [2, 4, 8, 8],
223
- use_snake=False,
224
- antialias_activation=False,
225
- use_nearest_upsample=False,
226
- final_tanh=True):
227
- super().__init__()
228
-
229
- c_mults = [1] + c_mults
230
-
231
- self.depth = len(c_mults)
232
-
233
- layers = [
234
- WNConv1d(in_channels=latent_dim, out_channels=c_mults[-1]*channels, kernel_size=7, padding=3),
235
- ]
236
-
237
- for i in range(self.depth-1, 0, -1):
238
- layers += [DecoderBlock(
239
- in_channels=c_mults[i]*channels,
240
- out_channels=c_mults[i-1]*channels,
241
- stride=strides[i-1],
242
- use_snake=use_snake,
243
- antialias_activation=antialias_activation,
244
- use_nearest_upsample=use_nearest_upsample
245
- )
246
- ]
247
-
248
- layers += [
249
- get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=c_mults[0] * channels),
250
- WNConv1d(in_channels=c_mults[0] * channels, out_channels=out_channels, kernel_size=7, padding=3, bias=False),
251
- nn.Tanh() if final_tanh else nn.Identity()
252
- ]
253
-
254
- self.layers = nn.Sequential(*layers)
255
-
256
- def forward(self, x):
257
- return self.layers(x)
258
-
259
-
260
- class AudioOobleckVAE(nn.Module):
261
- def __init__(self,
262
- in_channels=2,
263
- channels=128,
264
- latent_dim=64,
265
- c_mults = [1, 2, 4, 8, 16],
266
- strides = [2, 4, 4, 8, 8],
267
- use_snake=True,
268
- antialias_activation=False,
269
- use_nearest_upsample=False,
270
- final_tanh=False):
271
- super().__init__()
272
- self.encoder = OobleckEncoder(in_channels, channels, latent_dim * 2, c_mults, strides, use_snake, antialias_activation)
273
- self.decoder = OobleckDecoder(in_channels, channels, latent_dim, c_mults, strides, use_snake, antialias_activation,
274
- use_nearest_upsample=use_nearest_upsample, final_tanh=final_tanh)
275
- self.bottleneck = VAEBottleneck()
276
-
277
- def encode(self, x):
278
- return self.bottleneck.encode(self.encoder(x))
279
-
280
- def decode(self, x):
281
- return self.decoder(self.bottleneck.decode(x))
282
-
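
For context on the deleted audio VAE: its bottleneck samples latents with the reparameterization trick and an explicit KL penalty (the vae_sample helper above). A self-contained sketch of that computation on random tensors:

import torch
import torch.nn.functional as F

mean = torch.randn(2, 64, 100)                     # stand-in encoder outputs
scale = torch.randn(2, 64, 100)
stdev = F.softplus(scale) + 1e-4                   # positive standard deviation
var = stdev * stdev
logvar = torch.log(var)
latents = torch.randn_like(mean) * stdev + mean    # reparameterized sample
kl = (mean * mean + var - logvar - 1).sum(1).mean()
print(latents.shape, bool(kl >= 0))                # KL against N(0, I) is non-negative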
 
MagicQuill/comfy/ldm/audio/dit.py DELETED
@@ -1,888 +0,0 @@
1
- # code adapted from: https://github.com/Stability-AI/stable-audio-tools
2
-
3
- from comfy.ldm.modules.attention import optimized_attention
4
- import typing as tp
5
-
6
- import torch
7
-
8
- from einops import rearrange
9
- from torch import nn
10
- from torch.nn import functional as F
11
- import math
12
-
13
- class FourierFeatures(nn.Module):
14
- def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
15
- super().__init__()
16
- assert out_features % 2 == 0
17
- self.weight = nn.Parameter(torch.empty(
18
- [out_features // 2, in_features], dtype=dtype, device=device))
19
-
20
- def forward(self, input):
21
- f = 2 * math.pi * input @ self.weight.T.to(dtype=input.dtype, device=input.device)
22
- return torch.cat([f.cos(), f.sin()], dim=-1)
23
-
24
- # norms
25
- class LayerNorm(nn.Module):
26
- def __init__(self, dim, bias=False, fix_scale=False, dtype=None, device=None):
27
- """
28
- bias-less layernorm has been shown to be more stable. most newer models have moved towards rmsnorm, also bias-less
29
- """
30
- super().__init__()
31
-
32
- self.gamma = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
33
-
34
- if bias:
35
- self.beta = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
36
- else:
37
- self.beta = None
38
-
39
- def forward(self, x):
40
- beta = self.beta
41
- if self.beta is not None:
42
- beta = beta.to(dtype=x.dtype, device=x.device)
43
- return F.layer_norm(x, x.shape[-1:], weight=self.gamma.to(dtype=x.dtype, device=x.device), bias=beta)
44
-
45
- class GLU(nn.Module):
46
- def __init__(
47
- self,
48
- dim_in,
49
- dim_out,
50
- activation,
51
- use_conv = False,
52
- conv_kernel_size = 3,
53
- dtype=None,
54
- device=None,
55
- operations=None,
56
- ):
57
- super().__init__()
58
- self.act = activation
59
- self.proj = operations.Linear(dim_in, dim_out * 2, dtype=dtype, device=device) if not use_conv else operations.Conv1d(dim_in, dim_out * 2, conv_kernel_size, padding = (conv_kernel_size // 2), dtype=dtype, device=device)
60
- self.use_conv = use_conv
61
-
62
- def forward(self, x):
63
- if self.use_conv:
64
- x = rearrange(x, 'b n d -> b d n')
65
- x = self.proj(x)
66
- x = rearrange(x, 'b d n -> b n d')
67
- else:
68
- x = self.proj(x)
69
-
70
- x, gate = x.chunk(2, dim = -1)
71
- return x * self.act(gate)
72
-
73
- class AbsolutePositionalEmbedding(nn.Module):
74
- def __init__(self, dim, max_seq_len):
75
- super().__init__()
76
- self.scale = dim ** -0.5
77
- self.max_seq_len = max_seq_len
78
- self.emb = nn.Embedding(max_seq_len, dim)
79
-
80
- def forward(self, x, pos = None, seq_start_pos = None):
81
- seq_len, device = x.shape[1], x.device
82
- assert seq_len <= self.max_seq_len, f'you are passing in a sequence length of {seq_len} but your absolute positional embedding has a max sequence length of {self.max_seq_len}'
83
-
84
- if pos is None:
85
- pos = torch.arange(seq_len, device = device)
86
-
87
- if seq_start_pos is not None:
88
- pos = (pos - seq_start_pos[..., None]).clamp(min = 0)
89
-
90
- pos_emb = self.emb(pos)
91
- pos_emb = pos_emb * self.scale
92
- return pos_emb
93
-
94
- class ScaledSinusoidalEmbedding(nn.Module):
95
- def __init__(self, dim, theta = 10000):
96
- super().__init__()
97
- assert (dim % 2) == 0, 'dimension must be divisible by 2'
98
- self.scale = nn.Parameter(torch.ones(1) * dim ** -0.5)
99
-
100
- half_dim = dim // 2
101
- freq_seq = torch.arange(half_dim).float() / half_dim
102
- inv_freq = theta ** -freq_seq
103
- self.register_buffer('inv_freq', inv_freq, persistent = False)
104
-
105
- def forward(self, x, pos = None, seq_start_pos = None):
106
- seq_len, device = x.shape[1], x.device
107
-
108
- if pos is None:
109
- pos = torch.arange(seq_len, device = device)
110
-
111
- if seq_start_pos is not None:
112
- pos = pos - seq_start_pos[..., None]
113
-
114
- emb = torch.einsum('i, j -> i j', pos, self.inv_freq)
115
- emb = torch.cat((emb.sin(), emb.cos()), dim = -1)
116
- return emb * self.scale
117
-
118
- class RotaryEmbedding(nn.Module):
119
- def __init__(
120
- self,
121
- dim,
122
- use_xpos = False,
123
- scale_base = 512,
124
- interpolation_factor = 1.,
125
- base = 10000,
126
- base_rescale_factor = 1.
127
- ):
128
- super().__init__()
129
- # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
130
- # has some connection to NTK literature
131
- # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
132
- base *= base_rescale_factor ** (dim / (dim - 2))
133
-
134
- inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
135
- self.register_buffer('inv_freq', inv_freq)
136
-
137
- assert interpolation_factor >= 1.
138
- self.interpolation_factor = interpolation_factor
139
-
140
- if not use_xpos:
141
- self.register_buffer('scale', None)
142
- return
143
-
144
- scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
145
-
146
- self.scale_base = scale_base
147
- self.register_buffer('scale', scale)
148
-
149
- def forward_from_seq_len(self, seq_len, device, dtype):
150
- # device = self.inv_freq.device
151
-
152
- t = torch.arange(seq_len, device=device, dtype=dtype)
153
- return self.forward(t)
154
-
155
- def forward(self, t):
156
- # device = self.inv_freq.device
157
- device = t.device
158
- dtype = t.dtype
159
-
160
- # t = t.to(torch.float32)
161
-
162
- t = t / self.interpolation_factor
163
-
164
- freqs = torch.einsum('i , j -> i j', t, self.inv_freq.to(dtype=dtype, device=device))
165
- freqs = torch.cat((freqs, freqs), dim = -1)
166
-
167
- if self.scale is None:
168
- return freqs, 1.
169
-
170
- power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base
171
- scale = self.scale.to(dtype=dtype, device=device) ** rearrange(power, 'n -> n 1')
172
- scale = torch.cat((scale, scale), dim = -1)
173
-
174
- return freqs, scale
175
-
176
- def rotate_half(x):
177
- x = rearrange(x, '... (j d) -> ... j d', j = 2)
178
- x1, x2 = x.unbind(dim = -2)
179
- return torch.cat((-x2, x1), dim = -1)
180
-
181
- def apply_rotary_pos_emb(t, freqs, scale = 1):
182
- out_dtype = t.dtype
183
-
184
- # cast to float32 if necessary for numerical stability
185
- dtype = t.dtype #reduce(torch.promote_types, (t.dtype, freqs.dtype, torch.float32))
186
- rot_dim, seq_len = freqs.shape[-1], t.shape[-2]
187
- freqs, t = freqs.to(dtype), t.to(dtype)
188
- freqs = freqs[-seq_len:, :]
189
-
190
- if t.ndim == 4 and freqs.ndim == 3:
191
- freqs = rearrange(freqs, 'b n d -> b 1 n d')
192
-
193
- # partial rotary embeddings, Wang et al. GPT-J
194
- t, t_unrotated = t[..., :rot_dim], t[..., rot_dim:]
195
- t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
196
-
197
- t, t_unrotated = t.to(out_dtype), t_unrotated.to(out_dtype)
198
-
199
- return torch.cat((t, t_unrotated), dim = -1)
200
-
201
- class FeedForward(nn.Module):
202
- def __init__(
203
- self,
204
- dim,
205
- dim_out = None,
206
- mult = 4,
207
- no_bias = False,
208
- glu = True,
209
- use_conv = False,
210
- conv_kernel_size = 3,
211
- zero_init_output = True,
212
- dtype=None,
213
- device=None,
214
- operations=None,
215
- ):
216
- super().__init__()
217
- inner_dim = int(dim * mult)
218
-
219
- # Default to SwiGLU
220
-
221
- activation = nn.SiLU()
222
-
223
- dim_out = dim if dim_out is None else dim_out
224
-
225
- if glu:
226
- linear_in = GLU(dim, inner_dim, activation, dtype=dtype, device=device, operations=operations)
227
- else:
228
- linear_in = nn.Sequential(
229
- Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
230
- operations.Linear(dim, inner_dim, bias = not no_bias, dtype=dtype, device=device) if not use_conv else operations.Conv1d(dim, inner_dim, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias, dtype=dtype, device=device),
231
- Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
232
- activation
233
- )
234
-
235
- linear_out = operations.Linear(inner_dim, dim_out, bias = not no_bias, dtype=dtype, device=device) if not use_conv else operations.Conv1d(inner_dim, dim_out, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias, dtype=dtype, device=device)
236
-
237
- # # init last linear layer to 0
238
- # if zero_init_output:
239
- # nn.init.zeros_(linear_out.weight)
240
- # if not no_bias:
241
- # nn.init.zeros_(linear_out.bias)
242
-
243
-
244
- self.ff = nn.Sequential(
245
- linear_in,
246
- Rearrange('b d n -> b n d') if use_conv else nn.Identity(),
247
- linear_out,
248
- Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
249
- )
250
-
251
- def forward(self, x):
252
- return self.ff(x)
253
-
254
- class Attention(nn.Module):
255
- def __init__(
256
- self,
257
- dim,
258
- dim_heads = 64,
259
- dim_context = None,
260
- causal = False,
261
- zero_init_output=True,
262
- qk_norm = False,
263
- natten_kernel_size = None,
264
- dtype=None,
265
- device=None,
266
- operations=None,
267
- ):
268
- super().__init__()
269
- self.dim = dim
270
- self.dim_heads = dim_heads
271
- self.causal = causal
272
-
273
- dim_kv = dim_context if dim_context is not None else dim
274
-
275
- self.num_heads = dim // dim_heads
276
- self.kv_heads = dim_kv // dim_heads
277
-
278
- if dim_context is not None:
279
- self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
280
- self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device)
281
- else:
282
- self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device)
283
-
284
- self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
285
-
286
- # if zero_init_output:
287
- # nn.init.zeros_(self.to_out.weight)
288
-
289
- self.qk_norm = qk_norm
290
-
291
-
292
- def forward(
293
- self,
294
- x,
295
- context = None,
296
- mask = None,
297
- context_mask = None,
298
- rotary_pos_emb = None,
299
- causal = None
300
- ):
301
- h, kv_h, has_context = self.num_heads, self.kv_heads, context is not None
302
-
303
- kv_input = context if has_context else x
304
-
305
- if hasattr(self, 'to_q'):
306
- # Use separate linear projections for q and k/v
307
- q = self.to_q(x)
308
- q = rearrange(q, 'b n (h d) -> b h n d', h = h)
309
-
310
- k, v = self.to_kv(kv_input).chunk(2, dim=-1)
311
-
312
- k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v))
313
- else:
314
- # Use fused linear projection
315
- q, k, v = self.to_qkv(x).chunk(3, dim=-1)
316
- q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
317
-
318
- # Normalize q and k for cosine sim attention
319
- if self.qk_norm:
320
- q = F.normalize(q, dim=-1)
321
- k = F.normalize(k, dim=-1)
322
-
323
- if rotary_pos_emb is not None and not has_context:
324
- freqs, _ = rotary_pos_emb
325
-
326
- q_dtype = q.dtype
327
- k_dtype = k.dtype
328
-
329
- q = q.to(torch.float32)
330
- k = k.to(torch.float32)
331
- freqs = freqs.to(torch.float32)
332
-
333
- q = apply_rotary_pos_emb(q, freqs)
334
- k = apply_rotary_pos_emb(k, freqs)
335
-
336
- q = q.to(q_dtype)
337
- k = k.to(k_dtype)
338
-
339
- input_mask = context_mask
340
-
341
- if input_mask is None and not has_context:
342
- input_mask = mask
343
-
344
- # determine masking
345
- masks = []
346
- final_attn_mask = None # The mask that will be applied to the attention matrix, taking all masks into account
347
-
348
- if input_mask is not None:
349
- input_mask = rearrange(input_mask, 'b j -> b 1 1 j')
350
- masks.append(~input_mask)
351
-
352
- # Other masks will be added here later
353
-
354
- if len(masks) > 0:
355
- final_attn_mask = ~or_reduce(masks)
356
-
357
- n, device = q.shape[-2], q.device
358
-
359
- causal = self.causal if causal is None else causal
360
-
361
- if n == 1 and causal:
362
- causal = False
363
-
364
- if h != kv_h:
365
- # Repeat interleave kv_heads to match q_heads
366
- heads_per_kv_head = h // kv_h
367
- k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v))
368
-
369
- out = optimized_attention(q, k, v, h, skip_reshape=True)
370
- out = self.to_out(out)
371
-
372
- if mask is not None:
373
- mask = rearrange(mask, 'b n -> b n 1')
374
- out = out.masked_fill(~mask, 0.)
375
-
376
- return out
377
-
378
- class ConformerModule(nn.Module):
379
- def __init__(
380
- self,
381
- dim,
382
- norm_kwargs = {},
383
- ):
384
-
385
- super().__init__()
386
-
387
- self.dim = dim
388
-
389
- self.in_norm = LayerNorm(dim, **norm_kwargs)
390
- self.pointwise_conv = nn.Conv1d(dim, dim, kernel_size=1, bias=False)
391
- self.glu = GLU(dim, dim, nn.SiLU())
392
- self.depthwise_conv = nn.Conv1d(dim, dim, kernel_size=17, groups=dim, padding=8, bias=False)
393
- self.mid_norm = LayerNorm(dim, **norm_kwargs) # This is a batch norm in the original but I don't like batch norm
394
- self.swish = nn.SiLU()
395
- self.pointwise_conv_2 = nn.Conv1d(dim, dim, kernel_size=1, bias=False)
396
-
397
- def forward(self, x):
398
- x = self.in_norm(x)
399
- x = rearrange(x, 'b n d -> b d n')
400
- x = self.pointwise_conv(x)
401
- x = rearrange(x, 'b d n -> b n d')
402
- x = self.glu(x)
403
- x = rearrange(x, 'b n d -> b d n')
404
- x = self.depthwise_conv(x)
405
- x = rearrange(x, 'b d n -> b n d')
406
- x = self.mid_norm(x)
407
- x = self.swish(x)
408
- x = rearrange(x, 'b n d -> b d n')
409
- x = self.pointwise_conv_2(x)
410
- x = rearrange(x, 'b d n -> b n d')
411
-
412
- return x
413
-
414
- class TransformerBlock(nn.Module):
415
- def __init__(
416
- self,
417
- dim,
418
- dim_heads = 64,
419
- cross_attend = False,
420
- dim_context = None,
421
- global_cond_dim = None,
422
- causal = False,
423
- zero_init_branch_outputs = True,
424
- conformer = False,
425
- layer_ix = -1,
426
- remove_norms = False,
427
- attn_kwargs = {},
428
- ff_kwargs = {},
429
- norm_kwargs = {},
430
- dtype=None,
431
- device=None,
432
- operations=None,
433
- ):
434
-
435
- super().__init__()
436
- self.dim = dim
437
- self.dim_heads = dim_heads
438
- self.cross_attend = cross_attend
439
- self.dim_context = dim_context
440
- self.causal = causal
441
-
442
- self.pre_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
443
-
444
- self.self_attn = Attention(
445
- dim,
446
- dim_heads = dim_heads,
447
- causal = causal,
448
- zero_init_output=zero_init_branch_outputs,
449
- dtype=dtype,
450
- device=device,
451
- operations=operations,
452
- **attn_kwargs
453
- )
454
-
455
- if cross_attend:
456
- self.cross_attend_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
457
- self.cross_attn = Attention(
458
- dim,
459
- dim_heads = dim_heads,
460
- dim_context=dim_context,
461
- causal = causal,
462
- zero_init_output=zero_init_branch_outputs,
463
- dtype=dtype,
464
- device=device,
465
- operations=operations,
466
- **attn_kwargs
467
- )
468
-
469
- self.ff_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
470
- self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations,**ff_kwargs)
471
-
472
- self.layer_ix = layer_ix
473
-
474
- self.conformer = ConformerModule(dim, norm_kwargs=norm_kwargs) if conformer else None
475
-
476
- self.global_cond_dim = global_cond_dim
477
-
478
- if global_cond_dim is not None:
479
- self.to_scale_shift_gate = nn.Sequential(
480
- nn.SiLU(),
481
- nn.Linear(global_cond_dim, dim * 6, bias=False)
482
- )
483
-
484
- nn.init.zeros_(self.to_scale_shift_gate[1].weight)
485
- #nn.init.zeros_(self.to_scale_shift_gate_self[1].bias)
486
-
487
- def forward(
488
- self,
489
- x,
490
- context = None,
491
- global_cond=None,
492
- mask = None,
493
- context_mask = None,
494
- rotary_pos_emb = None
495
- ):
496
- if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None:
497
-
498
- scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = self.to_scale_shift_gate(global_cond).unsqueeze(1).chunk(6, dim = -1)
499
-
500
- # self-attention with adaLN
501
- residual = x
502
- x = self.pre_norm(x)
503
- x = x * (1 + scale_self) + shift_self
504
- x = self.self_attn(x, mask = mask, rotary_pos_emb = rotary_pos_emb)
505
- x = x * torch.sigmoid(1 - gate_self)
506
- x = x + residual
507
-
508
- if context is not None:
509
- x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask)
510
-
511
- if self.conformer is not None:
512
- x = x + self.conformer(x)
513
-
514
- # feedforward with adaLN
515
- residual = x
516
- x = self.ff_norm(x)
517
- x = x * (1 + scale_ff) + shift_ff
518
- x = self.ff(x)
519
- x = x * torch.sigmoid(1 - gate_ff)
520
- x = x + residual
521
-
522
- else:
523
- x = x + self.self_attn(self.pre_norm(x), mask = mask, rotary_pos_emb = rotary_pos_emb)
524
-
525
- if context is not None:
526
- x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask)
527
-
528
- if self.conformer is not None:
529
- x = x + self.conformer(x)
530
-
531
- x = x + self.ff(self.ff_norm(x))
532
-
533
- return x
534
-
535
- class ContinuousTransformer(nn.Module):
536
- def __init__(
537
- self,
538
- dim,
539
- depth,
540
- *,
541
- dim_in = None,
542
- dim_out = None,
543
- dim_heads = 64,
544
- cross_attend=False,
545
- cond_token_dim=None,
546
- global_cond_dim=None,
547
- causal=False,
548
- rotary_pos_emb=True,
549
- zero_init_branch_outputs=True,
550
- conformer=False,
551
- use_sinusoidal_emb=False,
552
- use_abs_pos_emb=False,
553
- abs_pos_emb_max_length=10000,
554
- dtype=None,
555
- device=None,
556
- operations=None,
557
- **kwargs
558
- ):
559
-
560
- super().__init__()
561
-
562
- self.dim = dim
563
- self.depth = depth
564
- self.causal = causal
565
- self.layers = nn.ModuleList([])
566
-
567
- self.project_in = operations.Linear(dim_in, dim, bias=False, dtype=dtype, device=device) if dim_in is not None else nn.Identity()
568
- self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity()
569
-
570
- if rotary_pos_emb:
571
- self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32))
572
- else:
573
- self.rotary_pos_emb = None
574
-
575
- self.use_sinusoidal_emb = use_sinusoidal_emb
576
- if use_sinusoidal_emb:
577
- self.pos_emb = ScaledSinusoidalEmbedding(dim)
578
-
579
- self.use_abs_pos_emb = use_abs_pos_emb
580
- if use_abs_pos_emb:
581
- self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length)
582
-
583
- for i in range(depth):
584
- self.layers.append(
585
- TransformerBlock(
586
- dim,
587
- dim_heads = dim_heads,
588
- cross_attend = cross_attend,
589
- dim_context = cond_token_dim,
590
- global_cond_dim = global_cond_dim,
591
- causal = causal,
592
- zero_init_branch_outputs = zero_init_branch_outputs,
593
- conformer=conformer,
594
- layer_ix=i,
595
- dtype=dtype,
596
- device=device,
597
- operations=operations,
598
- **kwargs
599
- )
600
- )
601
-
602
- def forward(
603
- self,
604
- x,
605
- mask = None,
606
- prepend_embeds = None,
607
- prepend_mask = None,
608
- global_cond = None,
609
- return_info = False,
610
- **kwargs
611
- ):
612
- batch, seq, device = *x.shape[:2], x.device
613
-
614
- info = {
615
- "hidden_states": [],
616
- }
617
-
618
- x = self.project_in(x)
619
-
620
- if prepend_embeds is not None:
621
- prepend_length, prepend_dim = prepend_embeds.shape[1:]
622
-
623
- assert prepend_dim == x.shape[-1], 'prepend dimension must match sequence dimension'
624
-
625
- x = torch.cat((prepend_embeds, x), dim = -2)
626
-
627
- if prepend_mask is not None or mask is not None:
628
- mask = mask if mask is not None else torch.ones((batch, seq), device = device, dtype = torch.bool)
629
- prepend_mask = prepend_mask if prepend_mask is not None else torch.ones((batch, prepend_length), device = device, dtype = torch.bool)
630
-
631
- mask = torch.cat((prepend_mask, mask), dim = -1)
632
-
633
- # Attention layers
634
-
635
- if self.rotary_pos_emb is not None:
636
- rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1], dtype=x.dtype, device=x.device)
637
- else:
638
- rotary_pos_emb = None
639
-
640
- if self.use_sinusoidal_emb or self.use_abs_pos_emb:
641
- x = x + self.pos_emb(x)
642
-
643
- # Iterate over the transformer layers
644
- for layer in self.layers:
645
- x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
646
- # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
647
-
648
- if return_info:
649
- info["hidden_states"].append(x)
650
-
651
- x = self.project_out(x)
652
-
653
- if return_info:
654
- return x, info
655
-
656
- return x
657
-
658
- class AudioDiffusionTransformer(nn.Module):
659
- def __init__(self,
660
- io_channels=64,
661
- patch_size=1,
662
- embed_dim=1536,
663
- cond_token_dim=768,
664
- project_cond_tokens=False,
665
- global_cond_dim=1536,
666
- project_global_cond=True,
667
- input_concat_dim=0,
668
- prepend_cond_dim=0,
669
- depth=24,
670
- num_heads=24,
671
- transformer_type: tp.Literal["continuous_transformer"] = "continuous_transformer",
672
- global_cond_type: tp.Literal["prepend", "adaLN"] = "prepend",
673
- audio_model="",
674
- dtype=None,
675
- device=None,
676
- operations=None,
677
- **kwargs):
678
-
679
- super().__init__()
680
-
681
- self.dtype = dtype
682
- self.cond_token_dim = cond_token_dim
683
-
684
- # Timestep embeddings
685
- timestep_features_dim = 256
686
-
687
- self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device)
688
-
689
- self.to_timestep_embed = nn.Sequential(
690
- operations.Linear(timestep_features_dim, embed_dim, bias=True, dtype=dtype, device=device),
691
- nn.SiLU(),
692
- operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device),
693
- )
694
-
695
- if cond_token_dim > 0:
696
- # Conditioning tokens
697
-
698
- cond_embed_dim = cond_token_dim if not project_cond_tokens else embed_dim
699
- self.to_cond_embed = nn.Sequential(
700
- operations.Linear(cond_token_dim, cond_embed_dim, bias=False, dtype=dtype, device=device),
701
- nn.SiLU(),
702
- operations.Linear(cond_embed_dim, cond_embed_dim, bias=False, dtype=dtype, device=device)
703
- )
704
- else:
705
- cond_embed_dim = 0
706
-
707
- if global_cond_dim > 0:
708
- # Global conditioning
709
- global_embed_dim = global_cond_dim if not project_global_cond else embed_dim
710
- self.to_global_embed = nn.Sequential(
711
- operations.Linear(global_cond_dim, global_embed_dim, bias=False, dtype=dtype, device=device),
712
- nn.SiLU(),
713
- operations.Linear(global_embed_dim, global_embed_dim, bias=False, dtype=dtype, device=device)
714
- )
715
-
716
- if prepend_cond_dim > 0:
717
- # Prepend conditioning
718
- self.to_prepend_embed = nn.Sequential(
719
- operations.Linear(prepend_cond_dim, embed_dim, bias=False, dtype=dtype, device=device),
720
- nn.SiLU(),
721
- operations.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device)
722
- )
723
-
724
- self.input_concat_dim = input_concat_dim
725
-
726
- dim_in = io_channels + self.input_concat_dim
727
-
728
- self.patch_size = patch_size
729
-
730
- # Transformer
731
-
732
- self.transformer_type = transformer_type
733
-
734
- self.global_cond_type = global_cond_type
735
-
736
- if self.transformer_type == "continuous_transformer":
737
-
738
- global_dim = None
739
-
740
- if self.global_cond_type == "adaLN":
741
- # The global conditioning is projected to the embed_dim already at this point
742
- global_dim = embed_dim
743
-
744
- self.transformer = ContinuousTransformer(
745
- dim=embed_dim,
746
- depth=depth,
747
- dim_heads=embed_dim // num_heads,
748
- dim_in=dim_in * patch_size,
749
- dim_out=io_channels * patch_size,
750
- cross_attend = cond_token_dim > 0,
751
- cond_token_dim = cond_embed_dim,
752
- global_cond_dim=global_dim,
753
- dtype=dtype,
754
- device=device,
755
- operations=operations,
756
- **kwargs
757
- )
758
- else:
759
- raise ValueError(f"Unknown transformer type: {self.transformer_type}")
760
-
761
- self.preprocess_conv = operations.Conv1d(dim_in, dim_in, 1, bias=False, dtype=dtype, device=device)
762
- self.postprocess_conv = operations.Conv1d(io_channels, io_channels, 1, bias=False, dtype=dtype, device=device)
763
-
764
- def _forward(
765
- self,
766
- x,
767
- t,
768
- mask=None,
769
- cross_attn_cond=None,
770
- cross_attn_cond_mask=None,
771
- input_concat_cond=None,
772
- global_embed=None,
773
- prepend_cond=None,
774
- prepend_cond_mask=None,
775
- return_info=False,
776
- **kwargs):
777
-
778
- if cross_attn_cond is not None:
779
- cross_attn_cond = self.to_cond_embed(cross_attn_cond)
780
-
781
- if global_embed is not None:
782
- # Project the global conditioning to the embedding dimension
783
- global_embed = self.to_global_embed(global_embed)
784
-
785
- prepend_inputs = None
786
- prepend_mask = None
787
- prepend_length = 0
788
- if prepend_cond is not None:
789
- # Project the prepend conditioning to the embedding dimension
790
- prepend_cond = self.to_prepend_embed(prepend_cond)
791
-
792
- prepend_inputs = prepend_cond
793
- if prepend_cond_mask is not None:
794
- prepend_mask = prepend_cond_mask
795
-
796
- if input_concat_cond is not None:
797
-
798
- # Interpolate input_concat_cond to the same length as x
799
- if input_concat_cond.shape[2] != x.shape[2]:
800
- input_concat_cond = F.interpolate(input_concat_cond, (x.shape[2], ), mode='nearest')
801
-
802
- x = torch.cat([x, input_concat_cond], dim=1)
803
-
804
- # Get the batch of timestep embeddings
805
- timestep_embed = self.to_timestep_embed(self.timestep_features(t[:, None]).to(x.dtype)) # (b, embed_dim)
806
-
807
- # Timestep embedding is considered a global embedding. Add to the global conditioning if it exists
808
- if global_embed is not None:
809
- global_embed = global_embed + timestep_embed
810
- else:
811
- global_embed = timestep_embed
812
-
813
- # Add the global_embed to the prepend inputs if there is no global conditioning support in the transformer
814
- if self.global_cond_type == "prepend":
815
- if prepend_inputs is None:
816
- # Prepend inputs are just the global embed, and the mask is all ones
817
- prepend_inputs = global_embed.unsqueeze(1)
818
- prepend_mask = torch.ones((x.shape[0], 1), device=x.device, dtype=torch.bool)
819
- else:
820
- # Prepend inputs are the prepend conditioning + the global embed
821
- prepend_inputs = torch.cat([prepend_inputs, global_embed.unsqueeze(1)], dim=1)
822
- prepend_mask = torch.cat([prepend_mask, torch.ones((x.shape[0], 1), device=x.device, dtype=torch.bool)], dim=1)
823
-
824
- prepend_length = prepend_inputs.shape[1]
825
-
826
- x = self.preprocess_conv(x) + x
827
-
828
- x = rearrange(x, "b c t -> b t c")
829
-
830
- extra_args = {}
831
-
832
- if self.global_cond_type == "adaLN":
833
- extra_args["global_cond"] = global_embed
834
-
835
- if self.patch_size > 1:
836
- x = rearrange(x, "b (t p) c -> b t (c p)", p=self.patch_size)
837
-
838
- if self.transformer_type == "x-transformers":
839
- output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, **extra_args, **kwargs)
840
- elif self.transformer_type == "continuous_transformer":
841
- output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, **extra_args, **kwargs)
842
-
843
- if return_info:
844
- output, info = output
845
- elif self.transformer_type == "mm_transformer":
846
- output = self.transformer(x, context=cross_attn_cond, mask=mask, context_mask=cross_attn_cond_mask, **extra_args, **kwargs)
847
-
848
- output = rearrange(output, "b t c -> b c t")[:,:,prepend_length:]
849
-
850
- if self.patch_size > 1:
851
- output = rearrange(output, "b (c p) t -> b c (t p)", p=self.patch_size)
852
-
853
- output = self.postprocess_conv(output) + output
854
-
855
- if return_info:
856
- return output, info
857
-
858
- return output
859
-
860
- def forward(
861
- self,
862
- x,
863
- timestep,
864
- context=None,
865
- context_mask=None,
866
- input_concat_cond=None,
867
- global_embed=None,
868
- negative_global_embed=None,
869
- prepend_cond=None,
870
- prepend_cond_mask=None,
871
- mask=None,
872
- return_info=False,
873
- control=None,
874
- transformer_options={},
875
- **kwargs):
876
- return self._forward(
877
- x,
878
- timestep,
879
- cross_attn_cond=context,
880
- cross_attn_cond_mask=context_mask,
881
- input_concat_cond=input_concat_cond,
882
- global_embed=global_embed,
883
- prepend_cond=prepend_cond,
884
- prepend_cond_mask=prepend_cond_mask,
885
- mask=mask,
886
- return_info=return_info,
887
- **kwargs
888
- )
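For orientation, the block above (the tail of the deleted audio DiT adapted from stable-audio-tools) handles global conditioning by prepending the timestep/global embedding as one extra sequence token and slicing it off again after the transformer, and only falls back to adaLN when global_cond_type is set that way. A minimal, self-contained sketch of that prepend-and-strip pattern, using a stock nn.TransformerEncoder as a stand-in for the real transformer (toy names and shapes, not the ComfyUI module):

    import torch
    import torch.nn as nn

    class PrependCondDemo(nn.Module):
        def __init__(self, dim=64):
            super().__init__()
            layer = nn.TransformerEncoderLayer(d_model=dim, nhead=4, batch_first=True)
            self.transformer = nn.TransformerEncoder(layer, num_layers=1)

        def forward(self, x, global_embed):
            # x: (b, c, t) latent sequence, global_embed: (b, c), e.g. a timestep embedding
            x = x.transpose(1, 2)                      # (b, t, c)
            prepend = global_embed.unsqueeze(1)        # (b, 1, c): global cond as an extra token
            h = self.transformer(torch.cat([prepend, x], dim=1))
            h = h[:, prepend.shape[1]:]                # strip the prepended token(s), as in the code above
            return h.transpose(1, 2)                   # back to (b, c, t)

    demo = PrependCondDemo()
    out = demo(torch.randn(2, 64, 100), torch.randn(2, 64))   # out.shape == (2, 64, 100)
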
MagicQuill/comfy/ldm/audio/embedders.py DELETED
@@ -1,108 +0,0 @@
1
- # code adapted from: https://github.com/Stability-AI/stable-audio-tools
2
-
3
- import torch
4
- import torch.nn as nn
5
- from torch import Tensor, einsum
6
- from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar, Union
7
- from einops import rearrange
8
- import math
9
- import comfy.ops
10
-
11
- class LearnedPositionalEmbedding(nn.Module):
12
- """Used for continuous time"""
13
-
14
- def __init__(self, dim: int):
15
- super().__init__()
16
- assert (dim % 2) == 0
17
- half_dim = dim // 2
18
- self.weights = nn.Parameter(torch.empty(half_dim))
19
-
20
- def forward(self, x: Tensor) -> Tensor:
21
- x = rearrange(x, "b -> b 1")
22
- freqs = x * rearrange(self.weights, "d -> 1 d") * 2 * math.pi
23
- fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
24
- fouriered = torch.cat((x, fouriered), dim=-1)
25
- return fouriered
26
-
27
- def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
28
- return nn.Sequential(
29
- LearnedPositionalEmbedding(dim),
30
- comfy.ops.manual_cast.Linear(in_features=dim + 1, out_features=out_features),
31
- )
32
-
33
-
34
- class NumberEmbedder(nn.Module):
35
- def __init__(
36
- self,
37
- features: int,
38
- dim: int = 256,
39
- ):
40
- super().__init__()
41
- self.features = features
42
- self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)
43
-
44
- def forward(self, x: Union[List[float], Tensor]) -> Tensor:
45
- if not torch.is_tensor(x):
46
- device = next(self.embedding.parameters()).device
47
- x = torch.tensor(x, device=device)
48
- assert isinstance(x, Tensor)
49
- shape = x.shape
50
- x = rearrange(x, "... -> (...)")
51
- embedding = self.embedding(x)
52
- x = embedding.view(*shape, self.features)
53
- return x # type: ignore
54
-
55
-
56
- class Conditioner(nn.Module):
57
- def __init__(
58
- self,
59
- dim: int,
60
- output_dim: int,
61
- project_out: bool = False
62
- ):
63
-
64
- super().__init__()
65
-
66
- self.dim = dim
67
- self.output_dim = output_dim
68
- self.proj_out = nn.Linear(dim, output_dim) if (dim != output_dim or project_out) else nn.Identity()
69
-
70
- def forward(self, x):
71
- raise NotImplementedError()
72
-
73
- class NumberConditioner(Conditioner):
74
- '''
75
- Conditioner that takes a list of floats, normalizes them for a given range, and returns a list of embeddings
76
- '''
77
- def __init__(self,
78
- output_dim: int,
79
- min_val: float=0,
80
- max_val: float=1
81
- ):
82
- super().__init__(output_dim, output_dim)
83
-
84
- self.min_val = min_val
85
- self.max_val = max_val
86
-
87
- self.embedder = NumberEmbedder(features=output_dim)
88
-
89
- def forward(self, floats, device=None):
90
- # Cast the inputs to floats
91
- floats = [float(x) for x in floats]
92
-
93
- if device is None:
94
- device = next(self.embedder.parameters()).device
95
-
96
- floats = torch.tensor(floats).to(device)
97
-
98
- floats = floats.clamp(self.min_val, self.max_val)
99
-
100
- normalized_floats = (floats - self.min_val) / (self.max_val - self.min_val)
101
-
102
- # Cast floats to same type as embedder
103
- embedder_dtype = next(self.embedder.parameters()).dtype
104
- normalized_floats = normalized_floats.to(embedder_dtype)
105
-
106
- float_embeds = self.embedder(normalized_floats).unsqueeze(1)
107
-
108
- return [float_embeds, torch.ones(float_embeds.shape[0], 1).to(device)]
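The NumberConditioner above normalises a list of floats into [0, 1] and feeds them through a learned Fourier-feature embedder. A self-contained sketch of that normalise-then-embed idea, with a plain nn.Linear standing in for comfy.ops.manual_cast.Linear and randomly initialised frequencies (toy dimensions, not the shipped module):

    import math
    import torch
    import torch.nn as nn

    class TinyNumberEmbedder(nn.Module):
        def __init__(self, dim=256, out_features=768):
            super().__init__()
            assert dim % 2 == 0
            self.weights = nn.Parameter(torch.randn(dim // 2))   # learned frequencies
            self.proj = nn.Linear(dim + 1, out_features)

        def forward(self, x):                                     # x: (b,) floats in [0, 1]
            x = x[:, None]                                        # (b, 1)
            freqs = x * self.weights[None, :] * 2 * math.pi
            feats = torch.cat([x, freqs.sin(), freqs.cos()], dim=-1)   # (b, dim + 1)
            return self.proj(feats)

    # e.g. conditioning on a hypothetical "seconds_total" value with min_val=0, max_val=60:
    embedder = TinyNumberEmbedder()
    seconds = torch.tensor([3.0, 47.5]).clamp(0, 60)
    cond = embedder((seconds - 0) / (60 - 0)).unsqueeze(1)        # (b, 1, out_features) conditioning tokens
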
MagicQuill/comfy/ldm/cascade/__pycache__/common.cpython-310.pyc DELETED
Binary file (7.69 kB)
 
MagicQuill/comfy/ldm/cascade/__pycache__/controlnet.cpython-310.pyc DELETED
Binary file (3.77 kB)
 
MagicQuill/comfy/ldm/cascade/__pycache__/stage_a.cpython-310.pyc DELETED
Binary file (9.41 kB)
 
MagicQuill/comfy/ldm/cascade/__pycache__/stage_b.cpython-310.pyc DELETED
Binary file (7.77 kB)
 
MagicQuill/comfy/ldm/cascade/__pycache__/stage_c.cpython-310.pyc DELETED
Binary file (8.58 kB)
 
MagicQuill/comfy/ldm/cascade/__pycache__/stage_c_coder.cpython-310.pyc DELETED
Binary file (3.5 kB)
 
MagicQuill/comfy/ldm/cascade/common.py DELETED
@@ -1,161 +0,0 @@
1
- """
2
- This file is part of ComfyUI.
3
- Copyright (C) 2024 Stability AI
4
-
5
- This program is free software: you can redistribute it and/or modify
6
- it under the terms of the GNU General Public License as published by
7
- the Free Software Foundation, either version 3 of the License, or
8
- (at your option) any later version.
9
-
10
- This program is distributed in the hope that it will be useful,
11
- but WITHOUT ANY WARRANTY; without even the implied warranty of
12
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
- GNU General Public License for more details.
14
-
15
- You should have received a copy of the GNU General Public License
16
- along with this program. If not, see <https://www.gnu.org/licenses/>.
17
- """
18
-
19
- import torch
20
- import torch.nn as nn
21
- from comfy.ldm.modules.attention import optimized_attention
22
-
23
- class Linear(torch.nn.Linear):
24
- def reset_parameters(self):
25
- return None
26
-
27
- class Conv2d(torch.nn.Conv2d):
28
- def reset_parameters(self):
29
- return None
30
-
31
- class OptimizedAttention(nn.Module):
32
- def __init__(self, c, nhead, dropout=0.0, dtype=None, device=None, operations=None):
33
- super().__init__()
34
- self.heads = nhead
35
-
36
- self.to_q = operations.Linear(c, c, bias=True, dtype=dtype, device=device)
37
- self.to_k = operations.Linear(c, c, bias=True, dtype=dtype, device=device)
38
- self.to_v = operations.Linear(c, c, bias=True, dtype=dtype, device=device)
39
-
40
- self.out_proj = operations.Linear(c, c, bias=True, dtype=dtype, device=device)
41
-
42
- def forward(self, q, k, v):
43
- q = self.to_q(q)
44
- k = self.to_k(k)
45
- v = self.to_v(v)
46
-
47
- out = optimized_attention(q, k, v, self.heads)
48
-
49
- return self.out_proj(out)
50
-
51
- class Attention2D(nn.Module):
52
- def __init__(self, c, nhead, dropout=0.0, dtype=None, device=None, operations=None):
53
- super().__init__()
54
- self.attn = OptimizedAttention(c, nhead, dtype=dtype, device=device, operations=operations)
55
- # self.attn = nn.MultiheadAttention(c, nhead, dropout=dropout, bias=True, batch_first=True, dtype=dtype, device=device)
56
-
57
- def forward(self, x, kv, self_attn=False):
58
- orig_shape = x.shape
59
- x = x.view(x.size(0), x.size(1), -1).permute(0, 2, 1) # Bx4xHxW -> Bx(HxW)x4
60
- if self_attn:
61
- kv = torch.cat([x, kv], dim=1)
62
- # x = self.attn(x, kv, kv, need_weights=False)[0]
63
- x = self.attn(x, kv, kv)
64
- x = x.permute(0, 2, 1).view(*orig_shape)
65
- return x
66
-
67
-
68
- def LayerNorm2d_op(operations):
69
- class LayerNorm2d(operations.LayerNorm):
70
- def __init__(self, *args, **kwargs):
71
- super().__init__(*args, **kwargs)
72
-
73
- def forward(self, x):
74
- return super().forward(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
75
- return LayerNorm2d
76
-
77
- class GlobalResponseNorm(nn.Module):
78
- "from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105"
79
- def __init__(self, dim, dtype=None, device=None):
80
- super().__init__()
81
- self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim, dtype=dtype, device=device))
82
- self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim, dtype=dtype, device=device))
83
-
84
- def forward(self, x):
85
- Gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
86
- Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
87
- return self.gamma.to(device=x.device, dtype=x.dtype) * (x * Nx) + self.beta.to(device=x.device, dtype=x.dtype) + x
88
-
89
-
90
- class ResBlock(nn.Module):
91
- def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0, dtype=None, device=None, operations=None): # , num_heads=4, expansion=2):
92
- super().__init__()
93
- self.depthwise = operations.Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c, dtype=dtype, device=device)
94
- # self.depthwise = SAMBlock(c, num_heads, expansion)
95
- self.norm = LayerNorm2d_op(operations)(c, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
96
- self.channelwise = nn.Sequential(
97
- operations.Linear(c + c_skip, c * 4, dtype=dtype, device=device),
98
- nn.GELU(),
99
- GlobalResponseNorm(c * 4, dtype=dtype, device=device),
100
- nn.Dropout(dropout),
101
- operations.Linear(c * 4, c, dtype=dtype, device=device)
102
- )
103
-
104
- def forward(self, x, x_skip=None):
105
- x_res = x
106
- x = self.norm(self.depthwise(x))
107
- if x_skip is not None:
108
- x = torch.cat([x, x_skip], dim=1)
109
- x = self.channelwise(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
110
- return x + x_res
111
-
112
-
113
- class AttnBlock(nn.Module):
114
- def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0, dtype=None, device=None, operations=None):
115
- super().__init__()
116
- self.self_attn = self_attn
117
- self.norm = LayerNorm2d_op(operations)(c, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
118
- self.attention = Attention2D(c, nhead, dropout, dtype=dtype, device=device, operations=operations)
119
- self.kv_mapper = nn.Sequential(
120
- nn.SiLU(),
121
- operations.Linear(c_cond, c, dtype=dtype, device=device)
122
- )
123
-
124
- def forward(self, x, kv):
125
- kv = self.kv_mapper(kv)
126
- x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn)
127
- return x
128
-
129
-
130
- class FeedForwardBlock(nn.Module):
131
- def __init__(self, c, dropout=0.0, dtype=None, device=None, operations=None):
132
- super().__init__()
133
- self.norm = LayerNorm2d_op(operations)(c, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
134
- self.channelwise = nn.Sequential(
135
- operations.Linear(c, c * 4, dtype=dtype, device=device),
136
- nn.GELU(),
137
- GlobalResponseNorm(c * 4, dtype=dtype, device=device),
138
- nn.Dropout(dropout),
139
- operations.Linear(c * 4, c, dtype=dtype, device=device)
140
- )
141
-
142
- def forward(self, x):
143
- x = x + self.channelwise(self.norm(x).permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
144
- return x
145
-
146
-
147
- class TimestepBlock(nn.Module):
148
- def __init__(self, c, c_timestep, conds=['sca'], dtype=None, device=None, operations=None):
149
- super().__init__()
150
- self.mapper = operations.Linear(c_timestep, c * 2, dtype=dtype, device=device)
151
- self.conds = conds
152
- for cname in conds:
153
- setattr(self, f"mapper_{cname}", operations.Linear(c_timestep, c * 2, dtype=dtype, device=device))
154
-
155
- def forward(self, x, t):
156
- t = t.chunk(len(self.conds) + 1, dim=1)
157
- a, b = self.mapper(t[0])[:, :, None, None].chunk(2, dim=1)
158
- for i, c in enumerate(self.conds):
159
- ac, bc = getattr(self, f"mapper_{c}")(t[i + 1])[:, :, None, None].chunk(2, dim=1)
160
- a, b = a + ac, b + bc
161
- return x * (1 + a) + b
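The TimestepBlock at the end of the file above is an adaLN-style modulation: the timestep embedding and each extra condition in conds are mapped to a scale/shift pair, the pairs are summed, and the feature map is transformed as x * (1 + a) + b. A minimal sketch with plain nn.Linear layers (toy channel counts; the real block takes its Linear from the operations argument):

    import torch
    import torch.nn as nn

    class TinyTimestepBlock(nn.Module):
        def __init__(self, c, c_timestep, conds=('sca',)):
            super().__init__()
            self.conds = conds
            self.mapper = nn.Linear(c_timestep, c * 2)
            self.extra = nn.ModuleDict({name: nn.Linear(c_timestep, c * 2) for name in conds})

        def forward(self, x, t):
            # t packs the timestep embedding plus one embedding per extra condition along dim=1
            chunks = t.chunk(len(self.conds) + 1, dim=1)
            a, b = self.mapper(chunks[0])[:, :, None, None].chunk(2, dim=1)
            for name, tc in zip(self.conds, chunks[1:]):
                ac, bc = self.extra[name](tc)[:, :, None, None].chunk(2, dim=1)
                a, b = a + ac, b + bc
            return x * (1 + a) + b

    block = TinyTimestepBlock(c=64, c_timestep=128)
    y = block(torch.randn(2, 64, 8, 8), torch.randn(2, 128 * 2))   # timestep + 'sca' embeddings packed
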
MagicQuill/comfy/ldm/cascade/controlnet.py DELETED
@@ -1,93 +0,0 @@
1
- """
2
- This file is part of ComfyUI.
3
- Copyright (C) 2024 Stability AI
4
-
5
- This program is free software: you can redistribute it and/or modify
6
- it under the terms of the GNU General Public License as published by
7
- the Free Software Foundation, either version 3 of the License, or
8
- (at your option) any later version.
9
-
10
- This program is distributed in the hope that it will be useful,
11
- but WITHOUT ANY WARRANTY; without even the implied warranty of
12
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
- GNU General Public License for more details.
14
-
15
- You should have received a copy of the GNU General Public License
16
- along with this program. If not, see <https://www.gnu.org/licenses/>.
17
- """
18
-
19
- import torch
20
- import torchvision
21
- from torch import nn
22
- from .common import LayerNorm2d_op
23
-
24
-
25
- class CNetResBlock(nn.Module):
26
- def __init__(self, c, dtype=None, device=None, operations=None):
27
- super().__init__()
28
- self.blocks = nn.Sequential(
29
- LayerNorm2d_op(operations)(c, dtype=dtype, device=device),
30
- nn.GELU(),
31
- operations.Conv2d(c, c, kernel_size=3, padding=1),
32
- LayerNorm2d_op(operations)(c, dtype=dtype, device=device),
33
- nn.GELU(),
34
- operations.Conv2d(c, c, kernel_size=3, padding=1),
35
- )
36
-
37
- def forward(self, x):
38
- return x + self.blocks(x)
39
-
40
-
41
- class ControlNet(nn.Module):
42
- def __init__(self, c_in=3, c_proj=2048, proj_blocks=None, bottleneck_mode=None, dtype=None, device=None, operations=nn):
43
- super().__init__()
44
- if bottleneck_mode is None:
45
- bottleneck_mode = 'effnet'
46
- self.proj_blocks = proj_blocks
47
- if bottleneck_mode == 'effnet':
48
- embd_channels = 1280
49
- self.backbone = torchvision.models.efficientnet_v2_s().features.eval()
50
- if c_in != 3:
51
- in_weights = self.backbone[0][0].weight.data
52
- self.backbone[0][0] = operations.Conv2d(c_in, 24, kernel_size=3, stride=2, bias=False, dtype=dtype, device=device)
53
- if c_in > 3:
54
- # nn.init.constant_(self.backbone[0][0].weight, 0)
55
- self.backbone[0][0].weight.data[:, :3] = in_weights[:, :3].clone()
56
- else:
57
- self.backbone[0][0].weight.data = in_weights[:, :c_in].clone()
58
- elif bottleneck_mode == 'simple':
59
- embd_channels = c_in
60
- self.backbone = nn.Sequential(
61
- operations.Conv2d(embd_channels, embd_channels * 4, kernel_size=3, padding=1, dtype=dtype, device=device),
62
- nn.LeakyReLU(0.2, inplace=True),
63
- operations.Conv2d(embd_channels * 4, embd_channels, kernel_size=3, padding=1, dtype=dtype, device=device),
64
- )
65
- elif bottleneck_mode == 'large':
66
- self.backbone = nn.Sequential(
67
- operations.Conv2d(c_in, 4096 * 4, kernel_size=1, dtype=dtype, device=device),
68
- nn.LeakyReLU(0.2, inplace=True),
69
- operations.Conv2d(4096 * 4, 1024, kernel_size=1, dtype=dtype, device=device),
70
- *[CNetResBlock(1024, dtype=dtype, device=device, operations=operations) for _ in range(8)],
71
- operations.Conv2d(1024, 1280, kernel_size=1, dtype=dtype, device=device),
72
- )
73
- embd_channels = 1280
74
- else:
75
- raise ValueError(f'Unknown bottleneck mode: {bottleneck_mode}')
76
- self.projections = nn.ModuleList()
77
- for _ in range(len(proj_blocks)):
78
- self.projections.append(nn.Sequential(
79
- operations.Conv2d(embd_channels, embd_channels, kernel_size=1, bias=False, dtype=dtype, device=device),
80
- nn.LeakyReLU(0.2, inplace=True),
81
- operations.Conv2d(embd_channels, c_proj, kernel_size=1, bias=False, dtype=dtype, device=device),
82
- ))
83
- # nn.init.constant_(self.projections[-1][-1].weight, 0) # zero output projection
84
- self.xl = False
85
- self.input_channels = c_in
86
- self.unshuffle_amount = 8
87
-
88
- def forward(self, x):
89
- x = self.backbone(x)
90
- proj_outputs = [None for _ in range(max(self.proj_blocks) + 1)]
91
- for i, idx in enumerate(self.proj_blocks):
92
- proj_outputs[idx] = self.projections[i](x)
93
- return proj_outputs
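The deleted ControlNet above runs a single backbone over the conditioning input and then scatters projected copies of that one feature map into the positions listed in proj_blocks, leaving every other slot as None. A minimal sketch of that scatter-by-index pattern (toy shapes, a plain Conv2d standing in for the efficientnet_v2_s backbone):

    import torch
    import torch.nn as nn

    proj_blocks = [0, 4, 8]                 # which downstream blocks receive control features
    c_feat, c_proj = 1280, 2048

    backbone = nn.Conv2d(3, c_feat, kernel_size=3, stride=8, padding=1)   # stand-in backbone
    projections = nn.ModuleList([nn.Conv2d(c_feat, c_proj, kernel_size=1, bias=False)
                                 for _ in proj_blocks])

    x = backbone(torch.randn(1, 3, 256, 256))
    proj_outputs = [None] * (max(proj_blocks) + 1)
    for i, idx in enumerate(proj_blocks):
        proj_outputs[idx] = projections[i](x)   # blocks not listed get no control signal
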
MagicQuill/comfy/ldm/cascade/stage_a.py DELETED
@@ -1,255 +0,0 @@
1
- """
2
- This file is part of ComfyUI.
3
- Copyright (C) 2024 Stability AI
4
-
5
- This program is free software: you can redistribute it and/or modify
6
- it under the terms of the GNU General Public License as published by
7
- the Free Software Foundation, either version 3 of the License, or
8
- (at your option) any later version.
9
-
10
- This program is distributed in the hope that it will be useful,
11
- but WITHOUT ANY WARRANTY; without even the implied warranty of
12
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
- GNU General Public License for more details.
14
-
15
- You should have received a copy of the GNU General Public License
16
- along with this program. If not, see <https://www.gnu.org/licenses/>.
17
- """
18
-
19
- import torch
20
- from torch import nn
21
- from torch.autograd import Function
22
-
23
- class vector_quantize(Function):
24
- @staticmethod
25
- def forward(ctx, x, codebook):
26
- with torch.no_grad():
27
- codebook_sqr = torch.sum(codebook ** 2, dim=1)
28
- x_sqr = torch.sum(x ** 2, dim=1, keepdim=True)
29
-
30
- dist = torch.addmm(codebook_sqr + x_sqr, x, codebook.t(), alpha=-2.0, beta=1.0)
31
- _, indices = dist.min(dim=1)
32
-
33
- ctx.save_for_backward(indices, codebook)
34
- ctx.mark_non_differentiable(indices)
35
-
36
- nn = torch.index_select(codebook, 0, indices)
37
- return nn, indices
38
-
39
- @staticmethod
40
- def backward(ctx, grad_output, grad_indices):
41
- grad_inputs, grad_codebook = None, None
42
-
43
- if ctx.needs_input_grad[0]:
44
- grad_inputs = grad_output.clone()
45
- if ctx.needs_input_grad[1]:
46
- # Gradient wrt. the codebook
47
- indices, codebook = ctx.saved_tensors
48
-
49
- grad_codebook = torch.zeros_like(codebook)
50
- grad_codebook.index_add_(0, indices, grad_output)
51
-
52
- return (grad_inputs, grad_codebook)
53
-
54
-
55
- class VectorQuantize(nn.Module):
56
- def __init__(self, embedding_size, k, ema_decay=0.99, ema_loss=False):
57
- """
58
- Takes an input of variable size (as long as the last dimension matches the embedding size).
59
- Returns one tensor containing the nearest neighbour embeddings to each of the inputs,
60
- with the same size as the input, vq and commitment components for the loss as a tuple
61
- in the second output and the indices of the quantized vectors in the third:
62
- quantized, (vq_loss, commit_loss), indices
63
- """
64
- super(VectorQuantize, self).__init__()
65
-
66
- self.codebook = nn.Embedding(k, embedding_size)
67
- self.codebook.weight.data.uniform_(-1./k, 1./k)
68
- self.vq = vector_quantize.apply
69
-
70
- self.ema_decay = ema_decay
71
- self.ema_loss = ema_loss
72
- if ema_loss:
73
- self.register_buffer('ema_element_count', torch.ones(k))
74
- self.register_buffer('ema_weight_sum', torch.zeros_like(self.codebook.weight))
75
-
76
- def _laplace_smoothing(self, x, epsilon):
77
- n = torch.sum(x)
78
- return ((x + epsilon) / (n + x.size(0) * epsilon) * n)
79
-
80
- def _updateEMA(self, z_e_x, indices):
81
- mask = nn.functional.one_hot(indices, self.ema_element_count.size(0)).float()
82
- elem_count = mask.sum(dim=0)
83
- weight_sum = torch.mm(mask.t(), z_e_x)
84
-
85
- self.ema_element_count = (self.ema_decay * self.ema_element_count) + ((1-self.ema_decay) * elem_count)
86
- self.ema_element_count = self._laplace_smoothing(self.ema_element_count, 1e-5)
87
- self.ema_weight_sum = (self.ema_decay * self.ema_weight_sum) + ((1-self.ema_decay) * weight_sum)
88
-
89
- self.codebook.weight.data = self.ema_weight_sum / self.ema_element_count.unsqueeze(-1)
90
-
91
- def idx2vq(self, idx, dim=-1):
92
- q_idx = self.codebook(idx)
93
- if dim != -1:
94
- q_idx = q_idx.movedim(-1, dim)
95
- return q_idx
96
-
97
- def forward(self, x, get_losses=True, dim=-1):
98
- if dim != -1:
99
- x = x.movedim(dim, -1)
100
- z_e_x = x.contiguous().view(-1, x.size(-1)) if len(x.shape) > 2 else x
101
- z_q_x, indices = self.vq(z_e_x, self.codebook.weight.detach())
102
- vq_loss, commit_loss = None, None
103
- if self.ema_loss and self.training:
104
- self._updateEMA(z_e_x.detach(), indices.detach())
105
- # pick the graded embeddings after updating the codebook in order to have a more accurate commitment loss
106
- z_q_x_grd = torch.index_select(self.codebook.weight, dim=0, index=indices)
107
- if get_losses:
108
- vq_loss = (z_q_x_grd - z_e_x.detach()).pow(2).mean()
109
- commit_loss = (z_e_x - z_q_x_grd.detach()).pow(2).mean()
110
-
111
- z_q_x = z_q_x.view(x.shape)
112
- if dim != -1:
113
- z_q_x = z_q_x.movedim(-1, dim)
114
- return z_q_x, (vq_loss, commit_loss), indices.view(x.shape[:-1])
115
-
116
-
117
- class ResBlock(nn.Module):
118
- def __init__(self, c, c_hidden):
119
- super().__init__()
120
- # depthwise/attention
121
- self.norm1 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
122
- self.depthwise = nn.Sequential(
123
- nn.ReplicationPad2d(1),
124
- nn.Conv2d(c, c, kernel_size=3, groups=c)
125
- )
126
-
127
- # channelwise
128
- self.norm2 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
129
- self.channelwise = nn.Sequential(
130
- nn.Linear(c, c_hidden),
131
- nn.GELU(),
132
- nn.Linear(c_hidden, c),
133
- )
134
-
135
- self.gammas = nn.Parameter(torch.zeros(6), requires_grad=True)
136
-
137
- # Init weights
138
- def _basic_init(module):
139
- if isinstance(module, nn.Linear) or isinstance(module, nn.Conv2d):
140
- torch.nn.init.xavier_uniform_(module.weight)
141
- if module.bias is not None:
142
- nn.init.constant_(module.bias, 0)
143
-
144
- self.apply(_basic_init)
145
-
146
- def _norm(self, x, norm):
147
- return norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
148
-
149
- def forward(self, x):
150
- mods = self.gammas
151
-
152
- x_temp = self._norm(x, self.norm1) * (1 + mods[0]) + mods[1]
153
- try:
154
- x = x + self.depthwise(x_temp) * mods[2]
155
- except: #operation not implemented for bf16
156
- x_temp = self.depthwise[0](x_temp.float()).to(x.dtype)
157
- x = x + self.depthwise[1](x_temp) * mods[2]
158
-
159
- x_temp = self._norm(x, self.norm2) * (1 + mods[3]) + mods[4]
160
- x = x + self.channelwise(x_temp.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) * mods[5]
161
-
162
- return x
163
-
164
-
165
- class StageA(nn.Module):
166
- def __init__(self, levels=2, bottleneck_blocks=12, c_hidden=384, c_latent=4, codebook_size=8192):
167
- super().__init__()
168
- self.c_latent = c_latent
169
- c_levels = [c_hidden // (2 ** i) for i in reversed(range(levels))]
170
-
171
- # Encoder blocks
172
- self.in_block = nn.Sequential(
173
- nn.PixelUnshuffle(2),
174
- nn.Conv2d(3 * 4, c_levels[0], kernel_size=1)
175
- )
176
- down_blocks = []
177
- for i in range(levels):
178
- if i > 0:
179
- down_blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
180
- block = ResBlock(c_levels[i], c_levels[i] * 4)
181
- down_blocks.append(block)
182
- down_blocks.append(nn.Sequential(
183
- nn.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
184
- nn.BatchNorm2d(c_latent), # then normalize them to have mean 0 and std 1
185
- ))
186
- self.down_blocks = nn.Sequential(*down_blocks)
187
- self.down_blocks[0]
188
-
189
- self.codebook_size = codebook_size
190
- self.vquantizer = VectorQuantize(c_latent, k=codebook_size)
191
-
192
- # Decoder blocks
193
- up_blocks = [nn.Sequential(
194
- nn.Conv2d(c_latent, c_levels[-1], kernel_size=1)
195
- )]
196
- for i in range(levels):
197
- for j in range(bottleneck_blocks if i == 0 else 1):
198
- block = ResBlock(c_levels[levels - 1 - i], c_levels[levels - 1 - i] * 4)
199
- up_blocks.append(block)
200
- if i < levels - 1:
201
- up_blocks.append(
202
- nn.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
203
- padding=1))
204
- self.up_blocks = nn.Sequential(*up_blocks)
205
- self.out_block = nn.Sequential(
206
- nn.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
207
- nn.PixelShuffle(2),
208
- )
209
-
210
- def encode(self, x, quantize=False):
211
- x = self.in_block(x)
212
- x = self.down_blocks(x)
213
- if quantize:
214
- qe, (vq_loss, commit_loss), indices = self.vquantizer.forward(x, dim=1)
215
- return qe, x, indices, vq_loss + commit_loss * 0.25
216
- else:
217
- return x
218
-
219
- def decode(self, x):
220
- x = self.up_blocks(x)
221
- x = self.out_block(x)
222
- return x
223
-
224
- def forward(self, x, quantize=False):
225
- qe, x, _, vq_loss = self.encode(x, quantize)
226
- x = self.decode(qe)
227
- return x, vq_loss
228
-
229
-
230
- class Discriminator(nn.Module):
231
- def __init__(self, c_in=3, c_cond=0, c_hidden=512, depth=6):
232
- super().__init__()
233
- d = max(depth - 3, 3)
234
- layers = [
235
- nn.utils.spectral_norm(nn.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
236
- nn.LeakyReLU(0.2),
237
- ]
238
- for i in range(depth - 1):
239
- c_in = c_hidden // (2 ** max((d - i), 0))
240
- c_out = c_hidden // (2 ** max((d - 1 - i), 0))
241
- layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
242
- layers.append(nn.InstanceNorm2d(c_out))
243
- layers.append(nn.LeakyReLU(0.2))
244
- self.encoder = nn.Sequential(*layers)
245
- self.shuffle = nn.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
246
- self.logits = nn.Sigmoid()
247
-
248
- def forward(self, x, cond=None):
249
- x = self.encoder(x)
250
- if cond is not None:
251
- cond = cond.view(cond.size(0), cond.size(1), 1, 1, ).expand(-1, -1, x.size(-2), x.size(-1))
252
- x = torch.cat([x, cond], dim=1)
253
- x = self.shuffle(x)
254
- x = self.logits(x)
255
- return x
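For reference, the vector_quantize Function near the top of the deleted stage_a.py finds each input's nearest codebook entry via the expanded squared distance ||x - c||^2 = ||x||^2 + ||c||^2 - 2 x.c, evaluated in a single torch.addmm call. A standalone sketch of just that lookup (the straight-through gradient and EMA codebook update are omitted):

    import torch

    def nearest_codebook_entries(x, codebook):
        # x: (n, d) inputs, codebook: (k, d) embeddings
        codebook_sqr = torch.sum(codebook ** 2, dim=1)           # (k,)
        x_sqr = torch.sum(x ** 2, dim=1, keepdim=True)           # (n, 1)
        # dist[i, j] = ||x_i||^2 + ||c_j||^2 - 2 * x_i . c_j  ==  ||x_i - c_j||^2
        dist = torch.addmm(codebook_sqr + x_sqr, x, codebook.t(), alpha=-2.0, beta=1.0)
        indices = dist.argmin(dim=1)
        return codebook[indices], indices

    codebook = torch.randn(8192, 4)
    quantized, idx = nearest_codebook_entries(torch.randn(16, 4), codebook)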