# Web file-viewer residue captured with the source (not part of the program):
# bmarci's picture
# edit
# f2a451c
# raw
# history blame
# 9.17 kB
import math
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint
from transformers.activations import ACT2FN
from models.config import LlamaConfig
from utils.misc import LargeInt
from utils.model_utils import expand_t, randn_tensor
from utils.compile_utils import smart_compile
class LlamaMLP(nn.Module):
    """Gated MLP: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``.

    Layer widths, bias usage and the activation are read from ``config``
    (``hidden_size``, ``intermediate_size``, ``mlp_bias``, ``hidden_act``).
    """

    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        use_bias = config.mlp_bias
        # Creation order (gate, up, down) is kept so seeded initialisation
        # and state_dict key order are unchanged.
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=use_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=use_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=use_bias)
        # Activation is looked up by name in the ACT2FN table.
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Return the gated projection of ``x`` (last dim ``hidden_size`` in and out)."""
        gate = self.act_fn(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(gate * up)
def modulate(x, shift, scale=None):
    """Apply the affine modulation ``x * (1 + scale) + shift``.

    Either term may be omitted by passing ``None``:

    * ``scale is None``  -> no scaling is applied (equivalent to ``scale == 0``).
    * ``shift is None``  -> no offset is added.

    The original implementation evaluated ``1 + scale`` unconditionally, so
    relying on the ``scale=None`` default raised ``TypeError``; that default
    is now usable. Results for non-``None`` arguments are unchanged.

    :param x: tensor to modulate.
    :param shift: additive term broadcastable to ``x``, or ``None``.
    :param scale: multiplicative term broadcastable to ``x``, or ``None``.
    :return: the modulated tensor (``x`` itself when both terms are ``None``).
    """
    out = x if scale is None else x * (1 + scale)
    if shift is None:
        return out
    return out + shift
class ResBlock(nn.Module):
    """Residual MLP block whose normalised input is modulated by a condition.

    The condition ``y`` is projected to three ``channels``-wide vectors
    (shift, scale, gate). Shift/scale modulate the LayerNorm output before
    the MLP; the gate scales the MLP output before the residual addition.

    :param channels: feature width of ``x`` and ``y``.
    :param mlp_ratio: hidden width of the MLP as a multiple of ``channels``
        (truncated with ``int``).
    """

    def __init__(self, channels, mlp_ratio=1.0):
        super().__init__()
        self.channels = channels
        self.intermediate_size = int(channels * mlp_ratio)

        # Modules are created in the same order as before (norm, MLP in,
        # MLP out, modulation) so seeded initialisation is unchanged.
        self.in_ln = nn.LayerNorm(self.channels, eps=1e-6)
        fc_in = nn.Linear(self.channels, self.intermediate_size)
        fc_out = nn.Linear(self.intermediate_size, self.channels)
        self.mlp = nn.Sequential(fc_in, nn.SiLU(), fc_out)

        to_modulation = nn.Linear(channels, 3 * channels, bias=True)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), to_modulation)

    def forward(self, x, y):
        """Return ``x + gate * mlp(modulate(LayerNorm(x), shift, scale))``."""
        shift, scale, gate = self.adaLN_modulation(y).chunk(3, dim=-1)
        normed = self.in_ln(x)
        update = self.mlp(modulate(normed, shift, scale))
        return x + gate * update
class FinalLayer(nn.Module):
    """Output head: conditioned LayerNorm followed by a linear projection.

    The condition ``c`` is projected to a shift and a scale (each
    ``model_channels`` wide) that modulate the parameter-free LayerNorm
    output before the final ``model_channels -> out_channels`` projection.
    """

    def __init__(self, model_channels, out_channels):
        super().__init__()
        # No learnable affine here: shift/scale come from the condition.
        self.norm_final = nn.LayerNorm(model_channels, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(model_channels, out_channels, bias=True)

        to_modulation = nn.Linear(model_channels, 2 * model_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), to_modulation)

    def forward(self, x, c):
        """Return ``linear(modulate(LayerNorm(x), shift, scale))``."""
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
        modulated = modulate(self.norm_final(x), shift, scale)
        return self.linear(modulated)
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.

    A fixed sinusoidal encoding of width ``frequency_embedding_size`` is fed
    through a two-layer SiLU MLP producing ``hidden_size`` features.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t: torch.Tensor, dim: int, max_period: float = 10000.0):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element. These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) float32 Tensor of positional embeddings, laid out as
                 ``[cos | sin]`` plus one trailing zero column when ``dim`` is odd.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
            device=t.device
        )
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            # Pad with an explicit (N, 1) zero column. Slicing the embedding
            # (``embedding[:, :1]``) yields zero columns when ``half == 0``,
            # which made ``dim == 1`` return shape (N, 0) instead of (N, 1).
            pad = embedding.new_zeros(embedding.shape[0], 1)
            embedding = torch.cat([embedding, pad], dim=-1)
        return embedding

    def forward(self, t):
        """Embed timesteps ``t`` (1-D tensor) into ``(N, hidden_size)`` features."""
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        # Match the MLP's parameter dtype (the sinusoids are computed in float32).
        t_emb = self.mlp(t_freq.to(self.mlp[0].weight.dtype))
        return t_emb
class SimpleMLPAdaLN(nn.Module):
    """Stack of conditioned residual MLP blocks mapping ``input_dim -> input_dim``.

    The input is projected to ``dim`` features, passed through ``layers``
    ``ResBlock`` modules and a ``FinalLayer``; every block is conditioned on
    the sum of the timestep embedding and the projected condition.

    :param input_dim: feature width of the input and of the output.
    :param cond_dim: feature width of the conditioning vector.
    :param dim: internal feature width.
    :param layers: number of residual blocks.
    :param mlp_ratio: passed to each ``ResBlock``.
    """

    def __init__(self, input_dim, cond_dim, dim=1536, layers=12, mlp_ratio=1.0):
        super().__init__()
        self.input_dim = input_dim
        self.cond_dim = cond_dim
        self.dim = dim
        self.layers = layers
        self.mlp_ratio = mlp_ratio

        # Submodule creation order is unchanged (seeded init, state_dict keys).
        self.time_embed = TimestepEmbedder(dim)
        self.cond_embed = nn.Linear(cond_dim, dim)
        self.input_proj = nn.Linear(input_dim, dim)
        self.res_blocks = nn.ModuleList([ResBlock(dim, mlp_ratio) for _ in range(layers)])
        self.final_layer = FinalLayer(dim, input_dim)

        # Off by default; set to True to checkpoint each block during training.
        self.grad_checkpointing = False

    @smart_compile()
    def forward(self, x, t, c):
        """
        x.shape = (bsz, input_dim)
        t.shape = (bsz,)
        c.shape = (bsz, cond_dim)

        Returns the final layer's output for ``x`` conditioned on ``t`` and ``c``.
        """
        hidden = self.input_proj(x)
        cond = self.time_embed(t) + self.cond_embed(c)

        # Checkpointing only applies in training mode with the flag enabled.
        use_checkpoint = self.grad_checkpointing and self.training
        for block in self.res_blocks:
            if use_checkpoint:
                # NOTE(review): PyTorch recommends use_reentrant=False; the
                # original setting is kept here — confirm before switching.
                hidden = checkpoint(block, hidden, cond, use_reentrant=True)
            else:
                hidden = block(hidden, cond)
        return self.final_layer(hidden, cond)
class FlowMatchingHead(nn.Module):
    """Velocity-prediction head with a stochastic sampler.

    Wraps a ``SimpleMLPAdaLN`` network (``self.net``) that predicts a velocity
    from ``(x, t, c)`` and exposes ``sample`` to integrate from noise at
    ``t = 0`` towards ``t = 1``.
    """

    def __init__(self, input_dim, cond_dim, dim=1536, layers=12, mlp_ratio=1.0):
        super().__init__()
        self.input_dim = input_dim
        self.net = SimpleMLPAdaLN(input_dim=input_dim, cond_dim=cond_dim, dim=dim, layers=layers, mlp_ratio=mlp_ratio)

    @property
    def dtype(self):
        """dtype of the network's input projection weight."""
        return self.net.input_proj.weight.dtype

    @property
    def device(self):
        """Device of the network's input projection weight."""
        return self.net.input_proj.weight.device

    @property
    def trainable_params(self) -> float:
        """Count of elements in parameters with ``requires_grad``, wrapped in ``LargeInt``."""
        return LargeInt(sum(p.numel() for p in self.parameters() if p.requires_grad))

    def get_score_from_velocity(self, velocity, x, t):
        """Wrapper function: transform velocity prediction model to score
        Args:
            velocity: [bsz, ...] shaped tensor; velocity model output
            x: [bsz, ...] shaped tensor; x_t data point
            t: [bsz,] time tensor
        """
        t = expand_t(t, x)
        # Path coefficients and their time derivatives:
        # alpha_t = t (derivative 1), sigma_t = 1 - t (derivative -1).
        alpha, alpha_dot = t, 1
        sigma, sigma_dot = 1 - t, -1
        alpha_ratio = alpha / alpha_dot
        variance = sigma**2 - alpha_ratio * sigma_dot * sigma
        return (alpha_ratio * velocity - x) / variance

    def get_velocity_from_cfg(self, velocity, cfg, cfg_img, cfg_mult):
        """Combine stacked velocity predictions using guidance weights.

        ``velocity`` is split along dim 0 into ``cfg_mult`` equal groups:

        * ``cfg_mult == 2``: ``(cond, uncond)`` ->
          ``uncond + cfg * (cond - uncond)``
        * ``cfg_mult == 3``: ``(cond, uncond1, uncond2)`` ->
          ``uncond2 + cfg_img * (uncond1 - uncond2) + cfg * (cond - uncond1)``
        * any other value: ``velocity`` is returned unchanged.
        """
        if cfg_mult == 2:
            cond_v, uncond_v = torch.chunk(velocity, 2, dim=0)
            return uncond_v + cfg * (cond_v - uncond_v)
        if cfg_mult == 3:
            cond_v, uncond_v1, uncond_v2 = torch.chunk(velocity, 3, dim=0)
            return uncond_v2 + cfg_img * (uncond_v1 - uncond_v2) + cfg * (cond_v - uncond_v1)
        return velocity

    @smart_compile(options={"triton.cudagraphs": True}, fullgraph=True)
    @torch.no_grad()
    def sample(
        self,
        c: torch.Tensor,
        cfg: float = 1.0,
        cfg_img: float = 1.0,
        timesteps_shift: float = 1.0,
        num_sampling_steps: int = 20,
        last_step_size: float = 0.0,
        noise_repeat: int = 1,
    ):
        """c.shape = (bsz, cond_dim)

        Draw samples by stepping from ``t = 0`` to ``t = 1``.

        Args:
            c: conditioning; its first dimension holds one group per active
               guidance term (1, 2 or 3 groups, see ``cfg`` / ``cfg_img``).
            cfg: guidance weight; values > 1.0 add one stacked group.
            cfg_img: second guidance weight; values > 1.0 add one stacked group.
            timesteps_shift: warps the uniform time grid via
               ``t / (shift - (shift - 1) * t)`` (identity for 1.0).
            num_sampling_steps: number of integration steps.
            last_step_size: accepted but not read by this implementation.
            noise_repeat: forwarded to ``randn_tensor``.
        Returns:
            The state after the final step.
        Raises:
            ValueError: if the number of collected states differs from
               ``num_sampling_steps``.
        """
        # One extra stacked group per guidance weight above 1.0.
        cfg_mult = 1 + int(cfg > 1.0) + int(cfg_img > 1.0)

        device, dtype = c.device, c.dtype
        sample_shape = (c.shape[0] // cfg_mult, self.input_dim)
        x = randn_tensor(sample_shape, noise_repeat, device, dtype)

        # Uniform grid on [0, 1), warped by timesteps_shift, with 1.0 appended.
        grid = torch.linspace(0, 1, num_sampling_steps + 1, device=device)[:-1]
        grid = grid / (timesteps_shift - (timesteps_shift - 1) * grid)
        grid = torch.cat([grid, torch.ones(1, device=device)])

        xs = []
        for t_cur, t_next in zip(grid[:-1], grid[1:]):
            dt = t_next - t_cur

            # Replicate the state for every guidance group and predict velocity.
            net_input = torch.cat([x] * cfg_mult, dim=0)
            velocity = self.net(net_input.to(c.dtype), t_cur.expand(c.shape[0]).to(c), c)
            velocity = velocity.to(torch.float32)
            velocity = self.get_velocity_from_cfg(velocity, cfg, cfg_img, cfg_mult)

            # Per-sample time, matched to x's current dtype/device.
            t_x = t_cur.expand(x.shape[0]).to(x)
            score = self.get_score_from_velocity(velocity, x, t_x)
            one_minus_t = 1 - expand_t(t_x, x)
            drift = velocity + one_minus_t * score

            # Fresh noise each step, scaled by sqrt(dt) and sqrt(2 * (1 - t)).
            w_cur = randn_tensor(sample_shape, noise_repeat, device, dtype)
            dw = w_cur * torch.sqrt(dt)
            mean_x = x + drift * dt
            x = mean_x + torch.sqrt(2 * one_minus_t) * dw
            xs.append(x)

        if len(xs) != num_sampling_steps:
            raise ValueError(f"Samples ({len(xs)}) does not match the number of steps ({num_sampling_steps})")
        # Only the final state is returned; earlier ones are kept for the check above.
        return xs[-1]