from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn
from einops import rearrange
from diffusers.models.attention_processor import Attention as CrossAttention
#from torch_cross_attention import CrossAttention


class TransformerPseudo3DModelOutput:
    def __init__(self, sample: torch.FloatTensor) -> None:
        self.sample = sample


class TransformerPseudo3DModel(nn.Module):
    def __init__(self,
            num_attention_heads: int = 16,
            attention_head_dim: int = 88,
            in_channels: Optional[int] = None,
            num_layers: int = 1,
            dropout: float = 0.0,
            norm_num_groups: int = 32,
            cross_attention_dim: Optional[int] = None,
            attention_bias: bool = False
    ) -> None:
        super().__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        inner_dim = num_attention_heads * attention_head_dim
        # 1. Transformer2DModel can process both standard continuous images of shape
        #    `(batch_size, num_channels, height, width)` and quantized image embeddings of
        #    shape `(batch_size, num_image_vectors)`; here the input is always continuous.
        # 2. Define input layers
        self.in_channels = in_channels
        self.norm = torch.nn.GroupNorm(
            num_groups = norm_num_groups,
            num_channels = in_channels,
            eps = 1e-6,
            affine = True
        )
        self.proj_in = nn.Conv2d(
            in_channels,
            inner_dim,
            kernel_size = 1,
            stride = 1,
            padding = 0
        )
        # 3. Define transformer blocks
        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout = dropout,
                    cross_attention_dim = cross_attention_dim,
                    attention_bias = attention_bias,
                )
                for _ in range(num_layers)
            ]
        )
        # 4. Define output layers
        self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size = 1, stride = 1, padding = 0)

    def forward(self,
            hidden_states: torch.Tensor,
            encoder_hidden_states: Optional[torch.Tensor] = None,
            timestep: Optional[torch.Tensor] = None
    ) -> TransformerPseudo3DModelOutput:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, channel, height, width)` for images,
                or `(batch_size, channel, frames, height, width)` for video): Input hidden_states.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, context_dim)`,
                *optional*): Conditional embeddings for the cross-attention layers. If not given,
                cross-attention defaults to self-attention.
            timestep (`torch.Tensor`, *optional*):
                Optional timestep, passed through to the transformer blocks. Used to indicate the denoising step.
        Returns:
            [`TransformerPseudo3DModelOutput`]: the transformed `sample` tensor with the same shape as the input.
        """
        b, c, *_, h, w = hidden_states.shape
        is_video = hidden_states.ndim == 5
        f = None
        if is_video:
            b, c, f, h, w = hidden_states.shape
            hidden_states = rearrange(hidden_states, 'b c f h w -> (b f) c h w')
            #encoder_hidden_states = encoder_hidden_states.repeat_interleave(f, 0)
        # 1. Input
        batch, channel, height, width = hidden_states.shape
        residual = hidden_states
        hidden_states = self.norm(hidden_states)
        hidden_states = self.proj_in(hidden_states)
        inner_dim = hidden_states.shape[1]
        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
        # 2. Blocks
        for block in self.transformer_blocks:
            hidden_states = block(
                hidden_states,
                context = encoder_hidden_states,
                timestep = timestep,
                frames_length = f,
                height = height,
                width = width
            )
        # 3. Output
        hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2)
        hidden_states = self.proj_out(hidden_states)
        output = hidden_states + residual
        if is_video:
            output = rearrange(output, '(b f) c h w -> b c f h w', b = b)
        return TransformerPseudo3DModelOutput(sample = output)
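

# Note on the pseudo-3D layout: video inputs are flattened so that each attention in
# BasicTransformerBlock sees a different token arrangement. Spatial self-/cross-attention
# runs on tokens of shape (batch * frames, height * width, dim), while temporal attention
# runs on (batch * height * width, frames, dim), i.e. it attends across frames at a fixed
# spatial location.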
class BasicTransformerBlock(nn.Module):
    r"""
    A basic Transformer block with an additional temporal self-attention layer.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the context vector for cross attention.
        attention_bias (`bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
    """
    def __init__(self,
            dim: int,
            num_attention_heads: int,
            attention_head_dim: int,
            dropout: float = 0.0,
            cross_attention_dim: Optional[int] = None,
            attention_bias: bool = False,
    ) -> None:
        super().__init__()
        self.attn1 = CrossAttention(
            query_dim = dim,
            heads = num_attention_heads,
            dim_head = attention_head_dim,
            dropout = dropout,
            bias = attention_bias
        )  # is a self-attention
        self.ff = FeedForward(dim, dropout = dropout)
        self.attn2 = CrossAttention(
            query_dim = dim,
            cross_attention_dim = cross_attention_dim,
            heads = num_attention_heads,
            dim_head = attention_head_dim,
            dropout = dropout,
            bias = attention_bias
        )  # is self-attn if context is none
        self.attn_temporal = CrossAttention(
            query_dim = dim,
            heads = num_attention_heads,
            dim_head = attention_head_dim,
            dropout = dropout,
            bias = attention_bias
        )  # is a self-attention
        # layer norms
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm_temporal = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)

    def forward(self,
            hidden_states: torch.Tensor,
            context: Optional[torch.Tensor] = None,
            timestep: Optional[torch.Tensor] = None,
            frames_length: Optional[int] = None,
            height: Optional[int] = None,
            width: Optional[int] = None
    ) -> torch.Tensor:
        if context is not None and frames_length is not None:
            context = context.repeat_interleave(frames_length, 0)
        # 1. Self-Attention
        norm_hidden_states = self.norm1(hidden_states)
        hidden_states = self.attn1(norm_hidden_states) + hidden_states
        # 2. Cross-Attention
        norm_hidden_states = self.norm2(hidden_states)
        hidden_states = self.attn2(
            norm_hidden_states,
            encoder_hidden_states = context
        ) + hidden_states
        # 3. Temporal attention (applied only for video inputs)
        if frames_length is not None:
            hidden_states = rearrange(
                hidden_states,
                '(b f) (h w) c -> (b h w) f c',
                f = frames_length,
                h = height,
                w = width
            )
            norm_hidden_states = self.norm_temporal(hidden_states)
            hidden_states = self.attn_temporal(norm_hidden_states) + hidden_states
            hidden_states = rearrange(
                hidden_states,
                '(b h w) f c -> (b f) (h w) c',
                f = frames_length,
                h = height,
                w = width
            )
        # 4. Feed-forward
        hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
        return hidden_states


class FeedForward(nn.Module):
    r"""
    A feed-forward layer.

    Parameters:
        dim (`int`): The number of channels in the input.
        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    """
    def __init__(self,
            dim: int,
            dim_out: Optional[int] = None,
            mult: int = 4,
            dropout: float = 0.0
    ) -> None:
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = dim_out if dim_out is not None else dim
        geglu = GEGLU(dim, inner_dim)
        self.net = nn.ModuleList([])
        # project in
        self.net.append(geglu)
        # project dropout
        self.net.append(nn.Dropout(dropout))
        # project out
        self.net.append(nn.Linear(inner_dim, dim_out))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for module in self.net:
            hidden_states = module(hidden_states)
        return hidden_states


# GEGLU activation used by FeedForward
class GEGLU(nn.Module):
    r"""
    A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.

    Parameters:
        dim_in (`int`): The number of channels in the input.
        dim_out (`int`): The number of channels in the output.
    """
    def __init__(self, dim_in: int, dim_out: int) -> None:
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states, gate = self.proj(hidden_states).chunk(2, dim = -1)
        return hidden_states * F.gelu(gate)
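

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the model definition).
# Assumes `diffusers` and `einops` are installed; the parameter values below
# are small, made-up settings for a quick CPU smoke test, not the ones used
# by any released checkpoint.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    model = TransformerPseudo3DModel(
        num_attention_heads = 2,
        attention_head_dim = 8,
        in_channels = 32,
        num_layers = 1,
        cross_attention_dim = 64
    )
    # Fake video latents: (batch, channels, frames, height, width).
    video = torch.randn(1, 32, 4, 8, 8)
    # Fake conditioning sequence: (batch, sequence_length, cross_attention_dim).
    context = torch.randn(1, 7, 64)
    out = model(video, encoder_hidden_states = context).sample
    print(out.shape)  # expected: torch.Size([1, 32, 4, 8, 8])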