from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn
from einops import rearrange

from diffusers.models.attention_processor import Attention as CrossAttention
# from torch_cross_attention import CrossAttention
class TransformerPseudo3DModelOutput:
    """Output wrapper holding the transformed sample tensor."""

    def __init__(self, sample: torch.FloatTensor) -> None:
        self.sample = sample

class TransformerPseudo3DModel(nn.Module):
    """
    A transformer model for continuous image-like inputs that also handles video: spatial
    attention is applied per frame and, for 5D video inputs, an additional temporal attention
    is applied per spatial location.
    """

    def __init__(self,
                 num_attention_heads: int = 16,
                 attention_head_dim: int = 88,
                 in_channels: Optional[int] = None,
                 num_layers: int = 1,
                 dropout: float = 0.0,
                 norm_num_groups: int = 32,
                 cross_attention_dim: Optional[int] = None,
                 attention_bias: bool = False
                 ) -> None:
        super().__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        inner_dim = num_attention_heads * attention_head_dim

        # 1. Unlike diffusers' Transformer2DModel, which can also process discrete (vector-quantized)
        #    inputs of shape `(batch_size, num_image_vectors)`, this model only handles continuous
        #    inputs of shape `(batch_size, num_channels, height, width)` (optionally with a frame axis).

        # 2. Define input layers
        self.in_channels = in_channels
        self.norm = torch.nn.GroupNorm(
            num_groups = norm_num_groups,
            num_channels = in_channels,
            eps = 1e-6,
            affine = True
        )
        self.proj_in = nn.Conv2d(
            in_channels,
            inner_dim,
            kernel_size = 1,
            stride = 1,
            padding = 0
        )

        # 3. Define transformer blocks
        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout = dropout,
                    cross_attention_dim = cross_attention_dim,
                    attention_bias = attention_bias,
                )
                for _ in range(num_layers)
            ]
        )

        # 4. Define output layers
        self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size = 1, stride = 1, padding = 0)
    def forward(self,
                hidden_states: torch.Tensor,
                encoder_hidden_states: Optional[torch.Tensor] = None,
                timestep: Optional[torch.Tensor] = None
                ) -> TransformerPseudo3DModelOutput:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, channel, height, width)`, or
                `(batch_size, channel, frames, height, width)` for video): Input hidden states.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, cross_attention_dim)`, *optional*):
                Conditional embeddings for the cross-attention layers. If not given, cross-attention
                defaults to self-attention.
            timestep (`torch.Tensor`, *optional*):
                Optional denoising timestep. Kept for interface compatibility; the blocks here use
                plain LayerNorm rather than AdaLayerNorm, so it is currently unused.

        Returns:
            [`TransformerPseudo3DModelOutput`]: wrapper whose `sample` attribute has the same shape
            as the input `hidden_states`.
        """
        b, c, *_, h, w = hidden_states.shape
        is_video = hidden_states.ndim == 5
        f = None
        if is_video:
            b, c, f, h, w = hidden_states.shape
            # fold the frame axis into the batch so the spatial layers see 4D tensors
            hidden_states = rearrange(hidden_states, 'b c f h w -> (b f) c h w')
            # encoder_hidden_states = encoder_hidden_states.repeat_interleave(f, 0)

        # 1. Input
        batch, channel, height, width = hidden_states.shape
        residual = hidden_states
        hidden_states = self.norm(hidden_states)
        hidden_states = self.proj_in(hidden_states)
        inner_dim = hidden_states.shape[1]
        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)

        # 2. Blocks
        for block in self.transformer_blocks:
            hidden_states = block(
                hidden_states,
                context = encoder_hidden_states,
                timestep = timestep,
                frames_length = f,
                height = height,
                width = width
            )

        # 3. Output
        hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2)
        hidden_states = self.proj_out(hidden_states)
        output = hidden_states + residual

        if is_video:
            # restore the frame axis
            output = rearrange(output, '(b f) c h w -> b c f h w', b = b)
        return TransformerPseudo3DModelOutput(sample = output)

class BasicTransformerBlock(nn.Module):
    r"""
    A basic Transformer block with spatial self-attention, cross-attention, temporal
    self-attention, and a feed-forward layer.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the context vector for cross-attention.
        attention_bias (`bool`, *optional*, defaults to `False`): Whether the attention layers should contain a bias parameter.
    """

    def __init__(self,
                 dim: int,
                 num_attention_heads: int,
                 attention_head_dim: int,
                 dropout: float = 0.0,
                 cross_attention_dim: Optional[int] = None,
                 attention_bias: bool = False,
                 ) -> None:
        super().__init__()
        self.attn1 = CrossAttention(
            query_dim = dim,
            heads = num_attention_heads,
            dim_head = attention_head_dim,
            dropout = dropout,
            bias = attention_bias
        )  # spatial self-attention
        self.ff = FeedForward(dim, dropout = dropout)
        self.attn2 = CrossAttention(
            query_dim = dim,
            cross_attention_dim = cross_attention_dim,
            heads = num_attention_heads,
            dim_head = attention_head_dim,
            dropout = dropout,
            bias = attention_bias
        )  # cross-attention; acts as self-attention if context is None
        self.attn_temporal = CrossAttention(
            query_dim = dim,
            heads = num_attention_heads,
            dim_head = attention_head_dim,
            dropout = dropout,
            bias = attention_bias
        )  # temporal self-attention

        # layer norms
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm_temporal = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
    def forward(self,
                hidden_states: torch.Tensor,
                context: Optional[torch.Tensor] = None,
                timestep: Optional[torch.Tensor] = None,
                frames_length: Optional[int] = None,
                height: Optional[int] = None,
                width: Optional[int] = None
                ) -> torch.Tensor:
        if context is not None and frames_length is not None:
            # repeat the conditioning once per frame so it matches the (batch * frames) layout
            context = context.repeat_interleave(frames_length, 0)

        # 1. Self-Attention
        norm_hidden_states = self.norm1(hidden_states)
        hidden_states = self.attn1(norm_hidden_states) + hidden_states

        # 2. Cross-Attention
        norm_hidden_states = self.norm2(hidden_states)
        hidden_states = self.attn2(
            norm_hidden_states,
            encoder_hidden_states = context
        ) + hidden_states

        # 3. Temporal attention (video inputs only): attend across frames at each spatial location
        if frames_length is not None:
            hidden_states = rearrange(
                hidden_states,
                '(b f) (h w) c -> (b h w) f c',
                f = frames_length,
                h = height,
                w = width
            )
            norm_hidden_states = self.norm_temporal(hidden_states)
            hidden_states = self.attn_temporal(norm_hidden_states) + hidden_states
            hidden_states = rearrange(
                hidden_states,
                '(b h w) f c -> (b f) (h w) c',
                f = frames_length,
                h = height,
                w = width
            )

        # 4. Feed-forward
        hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
        return hidden_states

class FeedForward(nn.Module):
    r"""
    A feed-forward layer.

    Parameters:
        dim (`int`): The number of channels in the input.
        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    """

    def __init__(self,
                 dim: int,
                 dim_out: Optional[int] = None,
                 mult: int = 4,
                 dropout: float = 0.0
                 ) -> None:
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = dim_out if dim_out is not None else dim
        geglu = GEGLU(dim, inner_dim)

        self.net = nn.ModuleList([])
        # project in
        self.net.append(geglu)
        # project dropout
        self.net.append(nn.Dropout(dropout))
        # project out
        self.net.append(nn.Linear(inner_dim, dim_out))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for module in self.net:
            hidden_states = module(hidden_states)
        return hidden_states

class GEGLU(nn.Module):
    r"""
    A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.

    Parameters:
        dim_in (`int`): The number of channels in the input.
        dim_out (`int`): The number of channels in the output.
    """

    def __init__(self, dim_in: int, dim_out: int) -> None:
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # split the projection into a value and a gate, then gate the value with GELU
        hidden_states, gate = self.proj(hidden_states).chunk(2, dim = -1)
        return hidden_states * F.gelu(gate)
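

# Minimal smoke test (a sketch, not part of the original Space): the dimensions below are
# illustrative assumptions chosen so that `in_channels` is divisible by `norm_num_groups`
# and the dummy context matches `cross_attention_dim`. It builds a small model and runs a
# forward pass on a dummy 5D video latent with text-encoder-style conditioning.
if __name__ == '__main__':
    model = TransformerPseudo3DModel(
        num_attention_heads = 2,
        attention_head_dim = 8,
        in_channels = 16,
        num_layers = 1,
        norm_num_groups = 8,
        cross_attention_dim = 32
    )
    video = torch.randn(1, 16, 4, 8, 8)   # (batch, channels, frames, height, width)
    context = torch.randn(1, 77, 32)      # (batch, sequence_length, cross_attention_dim)
    out = model(video, encoder_hidden_states = context).sample
    print(out.shape)                      # expected: torch.Size([1, 16, 4, 8, 8])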