Spaces:

TempoFunk
/

makeavid-sd-jax

Runtime error

File size: 11,011 Bytes

149cc2d

from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

from einops import rearrange

from diffusers.models.attention_processor import Attention as CrossAttention
#from torch_cross_attention import CrossAttention


class TransformerPseudo3DModelOutput:
    """Result wrapper that exposes the produced tensor as the `sample` attribute."""

    def __init__(self, sample: torch.FloatTensor) -> None:
        # Stored by reference; no copy or conversion is performed.
        self.sample = sample


class TransformerPseudo3DModel(nn.Module):
    """
    Transformer over the spatial positions of an image or video feature map.

    The input is group-normalised, projected to `num_attention_heads * attention_head_dim` channels with a
    1x1 convolution, flattened to a sequence of `height * width` tokens, run through `num_layers`
    `BasicTransformerBlock`s, projected back to `in_channels` with another 1x1 convolution and finally added
    to the untouched input (residual connection).

    Parameters:
        num_attention_heads (`int`, defaults to 16): Number of attention heads in every block.
        attention_head_dim (`int`, defaults to 88): Number of channels per attention head.
        in_channels (`int`): Number of channels of the input (and of the output). Required in practice even
            though the signature defaults to `None`.
        num_layers (`int`, defaults to 1): Number of transformer blocks.
        dropout (`float`, defaults to 0.0): Dropout probability forwarded to every block.
        norm_num_groups (`int`, defaults to 32): Number of groups of the input `GroupNorm`.
        cross_attention_dim (`int`, *optional*): Forwarded to every block as the cross-attention context size.
        attention_bias (`bool`, defaults to `False`): Forwarded to every block's attention layers.

    Raises:
        TypeError: If `in_channels` is `None`.
    """

    def __init__(self,
            num_attention_heads: int = 16,
            attention_head_dim: int = 88,
            in_channels: Optional[int] = None,
            num_layers: int = 1,
            dropout: float = 0.0,
            norm_num_groups: int = 32,
            cross_attention_dim: Optional[int] = None,
            attention_bias: bool = False
    ) -> None:
        # Fail early with a readable message; otherwise GroupNorm fails on `None % int`
        # with an error that does not mention the offending argument.
        if in_channels is None:
            raise TypeError(
                "TransformerPseudo3DModel requires `in_channels` to be an int, got None"
            )
        super().__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        inner_dim = num_attention_heads * attention_head_dim

        # 1. Input layers (the input is always a continuous feature map)
        self.in_channels = in_channels

        self.norm = torch.nn.GroupNorm(
            num_groups=norm_num_groups,
            num_channels=in_channels,
            eps=1e-6,
            affine=True,
        )
        self.proj_in = nn.Conv2d(
            in_channels,
            inner_dim,
            kernel_size=1,
            stride=1,
            padding=0,
        )

        # 2. Transformer blocks
        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
                    attention_bias=attention_bias,
                )
                for _ in range(num_layers)
            ]
        )

        # 3. Output layers
        self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)

    def forward(self,
            hidden_states: torch.Tensor,
            encoder_hidden_states: Optional[torch.Tensor] = None,
            timestep: Optional[torch.Tensor] = None
    ) -> "TransformerPseudo3DModelOutput":
        """
        Args:
            hidden_states (`torch.Tensor`): Either an image batch of shape
                `(batch, channel, height, width)` or a video batch of shape
                `(batch, channel, frames, height, width)`.
            encoder_hidden_states (`torch.Tensor`, *optional*): Conditioning passed to every block as
                `context`. For video input it is given per video, not per frame; the blocks expand it.
            timestep (*optional*): Not interpreted here; forwarded unchanged to every block.

        Returns:
            `TransformerPseudo3DModelOutput` whose `sample` has the same shape as `hidden_states`.

        Raises:
            ValueError: If `hidden_states` is neither 4D nor 5D.
        """
        if hidden_states.ndim not in (4, 5):
            raise ValueError(
                f"hidden_states must be 4D (b, c, h, w) or 5D (b, c, f, h, w), got {hidden_states.ndim}D"
            )

        is_video = hidden_states.ndim == 5
        video_batch = hidden_states.shape[0]
        frames = None
        if is_video:
            frames = hidden_states.shape[2]
            # Fold the frame axis into the batch axis so the 2D layers see one image per frame.
            hidden_states = rearrange(hidden_states, 'b c f h w -> (b f) c h w')

        # 1. Input
        batch, _, height, width = hidden_states.shape
        residual = hidden_states
        hidden_states = self.norm(hidden_states)
        hidden_states = self.proj_in(hidden_states)
        inner_dim = hidden_states.shape[1]
        # (batch, inner_dim, h, w) -> (batch, h * w, inner_dim): one token per spatial position.
        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)

        # 2. Blocks
        for block in self.transformer_blocks:
            hidden_states = block(
                hidden_states,
                context=encoder_hidden_states,
                timestep=timestep,
                frames_length=frames,
                height=height,
                weight=width,  # the block's keyword for the width is spelled `weight`
            )

        # 3. Output
        hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2)
        hidden_states = self.proj_out(hidden_states)
        output = hidden_states + residual

        if is_video:
            # Restore the separate frame axis that was folded into the batch above.
            output = rearrange(output, '(b f) c h w -> b c f h w', b=video_batch)

        return TransformerPseudo3DModelOutput(sample=output)



class BasicTransformerBlock(nn.Module):
    r"""
    Transformer block made of self-attention, cross-attention, an optional temporal self-attention and a
    feed-forward layer. Every stage is preceded by its own `LayerNorm` and wrapped in a residual connection.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the context vector for cross attention.
        attention_bias (`bool`, *optional*, defaults to `False`): Passed as `bias` to every attention layer.
    """

    def __init__(self,
            dim: int,
            num_attention_heads: int,
            attention_head_dim: int,
            dropout: float = 0.0,
            cross_attention_dim: Optional[int] = None,
            attention_bias: bool = False,
    ) -> None:
        super().__init__()
        # Settings shared by the three attention layers.
        shared_attention_args = dict(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
        )

        # Creation order is kept (attn1, ff, attn2, attn_temporal, norms) so parameter
        # registration and seeded initialisation match the previous layout.
        self.attn1 = CrossAttention(**shared_attention_args)  # spatial self-attention
        self.ff = FeedForward(dim, dropout=dropout)
        # Attends to `context`; per the original author's note it acts as self-attention without one.
        self.attn2 = CrossAttention(
            cross_attention_dim=cross_attention_dim,
            **shared_attention_args,
        )
        self.attn_temporal = CrossAttention(**shared_attention_args)  # self-attention across frames

        # One LayerNorm in front of each stage.
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm_temporal = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)

    def forward(self,
            hidden_states: torch.Tensor,
            context: Optional[torch.Tensor] = None,
            timestep: torch.int64 = None,
            frames_length: Optional[int] = None,
            height: Optional[int] = None,
            weight: Optional[int] = None
    ) -> torch.Tensor:
        """
        Run the block on a token sequence.

        Args:
            hidden_states: Token sequence. When `frames_length` is given its layout must be
                `(batch * frames, height * width, channels)`.
            context: Optional conditioning for the cross-attention stage. When `frames_length` is given it
                is repeated `frames_length` times along dim 0.
            timestep: Accepted for interface compatibility; not used by this block.
            frames_length: Number of frames folded into the batch axis; `None` skips temporal attention.
            height: Spatial height, needed to unfold the token axis for temporal attention.
            weight: Spatial width (sic), needed to unfold the token axis for temporal attention.

        Returns:
            Tensor with the same layout as `hidden_states`.
        """
        has_frames = frames_length is not None
        if has_frames and context is not None:
            # One context row per video becomes one row per frame.
            context = context.repeat_interleave(frames_length, 0)

        # 1. Self-Attention
        self_attended = self.attn1(self.norm1(hidden_states))
        hidden_states = self_attended + hidden_states

        # 2. Cross-Attention
        cross_attended = self.attn2(
            self.norm2(hidden_states),
            encoder_hidden_states=context,
        )
        hidden_states = cross_attended + hidden_states

        # Temporal attention: every spatial position attends over its own frames.
        if has_frames:
            per_position = rearrange(
                hidden_states,
                '(b f) (h w) c -> (b h w) f c',
                f=frames_length,
                h=height,
                w=weight,
            )
            temporally_attended = self.attn_temporal(self.norm_temporal(per_position))
            per_position = temporally_attended + per_position
            hidden_states = rearrange(
                per_position,
                '(b h w) f c -> (b f) (h w) c',
                f=frames_length,
                h=height,
                w=weight,
            )

        # 3. Feed-forward
        fed_forward = self.ff(self.norm3(hidden_states))
        return fed_forward + hidden_states


class FeedForward(nn.Module):
    r"""
    Feed-forward network: a GEGLU input projection, dropout, then a linear output projection.

    Parameters:
        dim (`int`): The number of channels in the input.
        dim_out (`int`, *optional*): The number of channels in the output; `dim` is used when omitted.
        mult (`int`, *optional*, defaults to 4): Expansion factor; the hidden width is `int(dim * mult)`.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    """

    def __init__(self,
            dim: int,
            dim_out: Optional[int] = None,
            mult: int = 4,
            dropout: float = 0.0
    ) -> None:
        super().__init__()
        hidden_width = int(dim * mult)
        if dim_out is None:
            dim_out = dim

        # The list order fixes both the execution order in `forward` and the
        # sub-module indices (net.0, net.1, net.2).
        self.net = nn.ModuleList(
            [
                GEGLU(dim, hidden_width),          # project in
                nn.Dropout(dropout),               # project dropout
                nn.Linear(hidden_width, dim_out),  # project out
            ]
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the sub-modules of `self.net` one after another and return the result."""
        result = hidden_states
        for layer in self.net:
            result = layer(result)
        return result


# Gated activation used as the input projection of FeedForward
class GEGLU(nn.Module):
    r"""
    GELU-gated linear unit, a variant of the gated linear unit from https://arxiv.org/abs/2002.05202.

    A single linear layer produces `2 * dim_out` features; the first half is the value and the second half,
    after GELU, is the gate that multiplies it.

    Parameters:
        dim_in (`int`): The number of channels in the input.
        dim_out (`int`): The number of channels in the output.
    """

    def __init__(self, dim_in: int, dim_out: int) -> None:
        super().__init__()
        # Value and gate share one projection and are split apart in `forward`.
        self.proj = nn.Linear(dim_in, 2 * dim_out)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project the input, split it in two along the last axis and return `value * gelu(gate)`."""
        projected = self.proj(hidden_states)
        value, gate = projected.chunk(2, dim=-1)
        return value * F.gelu(gate)