1inkusFace committed
Commit 73d5421 · verified · 1 Parent(s): f22e445

Upload 18 files
skyreels_v2_infer/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .pipelines import DiffusionForcingPipeline
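
A minimal import sketch for the public entry point exported above, assuming the repository root is on the Python path (illustrative, not part of the commit):

# Sketch: the package __init__ re-exports the pipeline class.
from skyreels_v2_infer import DiffusionForcingPipeline
print(DiffusionForcingPipeline)
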
skyreels_v2_infer/distributed/__init__.py ADDED
File without changes
skyreels_v2_infer/distributed/xdit_context_parallel.py ADDED
@@ -0,0 +1,286 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.amp as amp
4
+ from torch.backends.cuda import sdp_kernel
5
+ from xfuser.core.distributed import get_sequence_parallel_rank
6
+ from xfuser.core.distributed import get_sequence_parallel_world_size
7
+ from xfuser.core.distributed import get_sp_group
8
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
9
+
10
+ from ..modules.transformer import sinusoidal_embedding_1d
11
+
12
+
13
+ def pad_freqs(original_tensor, target_len):
14
+ seq_len, s1, s2 = original_tensor.shape
15
+ pad_size = target_len - seq_len
16
+ padding_tensor = torch.ones(pad_size, s1, s2, dtype=original_tensor.dtype, device=original_tensor.device)
17
+ padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
18
+ return padded_tensor
19
+
20
+
21
+ @amp.autocast("cuda", enabled=False)
22
+ def rope_apply(x, grid_sizes, freqs):
23
+ """
24
+ x: [B, L, N, C].
25
+ grid_sizes: [B, 3].
26
+ freqs: [M, C // 2].
27
+ """
28
+ s, n, c = x.size(1), x.size(2), x.size(3) // 2
29
+ # split freqs
30
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
31
+
32
+ # loop over samples
33
+ output = []
34
+ grid = [grid_sizes.tolist()] * x.size(0)
35
+ for i, (f, h, w) in enumerate(grid):
36
+ seq_len = f * h * w
37
+
38
+ # precompute multipliers
39
+ x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(s, n, -1, 2))
40
+ freqs_i = torch.cat(
41
+ [
42
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
43
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
44
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
45
+ ],
46
+ dim=-1,
47
+ ).reshape(seq_len, 1, -1)
48
+
49
+ # apply rotary embedding
50
+ sp_size = get_sequence_parallel_world_size()
51
+ sp_rank = get_sequence_parallel_rank()
52
+ freqs_i = pad_freqs(freqs_i, s * sp_size)
53
+ s_per_rank = s
54
+ freqs_i_rank = freqs_i[(sp_rank * s_per_rank) : ((sp_rank + 1) * s_per_rank), :, :]
55
+ x_i = torch.view_as_real(x_i * freqs_i_rank.cuda()).flatten(2)
56
+ x_i = torch.cat([x_i, x[i, s:]])
57
+
58
+ # append to collection
59
+ output.append(x_i)
60
+ return torch.stack(output).float()
61
+
62
+
63
+ def broadcast_should_calc(should_calc: bool) -> bool:
64
+ import torch.distributed as dist
65
+
66
+ device = torch.cuda.current_device()
67
+ int_should_calc = 1 if should_calc else 0
68
+ tensor = torch.tensor([int_should_calc], device=device, dtype=torch.int8)
69
+ dist.broadcast(tensor, src=0)
70
+ should_calc = tensor.item() == 1
71
+ return should_calc
72
+
73
+
74
+ def usp_dit_forward(self, x, t, context, clip_fea=None, y=None, fps=None):
75
+ """
76
+ x: A list of videos each with shape [C, T, H, W].
77
+ t: [B].
78
+ context: A list of text embeddings each with shape [L, C].
79
+ """
80
+ if self.model_type == "i2v":
81
+ assert clip_fea is not None and y is not None
82
+ # params
83
+ device = self.patch_embedding.weight.device
84
+ if self.freqs.device != device:
85
+ self.freqs = self.freqs.to(device)
86
+
87
+ if y is not None:
88
+ x = torch.cat([x, y], dim=1)
89
+
90
+ # embeddings
91
+ x = self.patch_embedding(x)
92
+ grid_sizes = torch.tensor(x.shape[2:], dtype=torch.long)
93
+ x = x.flatten(2).transpose(1, 2)
94
+
95
+ if self.flag_causal_attention:
96
+ frame_num = grid_sizes[0]
97
+ height = grid_sizes[1]
98
+ width = grid_sizes[2]
99
+ block_num = frame_num // self.num_frame_per_block
100
+ range_tensor = torch.arange(block_num).view(-1, 1)
101
+ range_tensor = range_tensor.repeat(1, self.num_frame_per_block).flatten()
102
+ causal_mask = range_tensor.unsqueeze(0) <= range_tensor.unsqueeze(1) # f, f
103
+ causal_mask = causal_mask.view(frame_num, 1, 1, frame_num, 1, 1).to(x.device)
104
+ causal_mask = causal_mask.repeat(1, height, width, 1, height, width)
105
+ causal_mask = causal_mask.reshape(frame_num * height * width, frame_num * height * width)
106
+ self.block_mask = causal_mask.unsqueeze(0).unsqueeze(0)
107
+
108
+ # time embeddings
109
+ with amp.autocast("cuda", dtype=torch.float32):
110
+ if t.dim() == 2:
111
+ b, f = t.shape
112
+ _flag_df = True
113
+ else:
114
+ _flag_df = False
115
+ e = self.time_embedding(
116
+ sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(self.patch_embedding.weight.dtype)
117
+ ) # b, dim
118
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim)) # b, 6, dim
119
+
120
+ if self.inject_sample_info:
121
+ fps = torch.tensor(fps, dtype=torch.long, device=device)
122
+
123
+ fps_emb = self.fps_embedding(fps).float()
124
+ if _flag_df:
125
+ e0 = e0 + self.fps_projection(fps_emb).unflatten(1, (6, self.dim)).repeat(t.shape[1], 1, 1)
126
+ else:
127
+ e0 = e0 + self.fps_projection(fps_emb).unflatten(1, (6, self.dim))
128
+
129
+ if _flag_df:
130
+ e = e.view(b, f, 1, 1, self.dim)
131
+ e0 = e0.view(b, f, 1, 1, 6, self.dim)
132
+ e = e.repeat(1, 1, grid_sizes[1], grid_sizes[2], 1).flatten(1, 3)
133
+ e0 = e0.repeat(1, 1, grid_sizes[1], grid_sizes[2], 1, 1).flatten(1, 3)
134
+ e0 = e0.transpose(1, 2).contiguous()
135
+
136
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
137
+
138
+ # context
139
+ context = self.text_embedding(context)
140
+
141
+ if clip_fea is not None:
142
+ context_clip = self.img_emb(clip_fea) # bs x 257 x dim
143
+ context = torch.concat([context_clip, context], dim=1)
144
+
145
+ # arguments
146
+ if e0.ndim == 4:
147
+ e0 = torch.chunk(e0, get_sequence_parallel_world_size(), dim=2)[get_sequence_parallel_rank()]
148
+ kwargs = dict(e=e0, grid_sizes=grid_sizes, freqs=self.freqs, context=context, block_mask=self.block_mask)
149
+
150
+ if self.enable_teacache:
151
+ modulated_inp = e0 if self.use_ref_steps else e
152
+ # teacache
153
+ if self.cnt % 2 == 0: # even -> condition
154
+ self.is_even = True
155
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
156
+ should_calc_even = True
157
+ self.accumulated_rel_l1_distance_even = 0
158
+ else:
159
+ rescale_func = np.poly1d(self.coefficients)
160
+ self.accumulated_rel_l1_distance_even += rescale_func(
161
+ ((modulated_inp - self.previous_e0_even).abs().mean() / self.previous_e0_even.abs().mean())
162
+ .cpu()
163
+ .item()
164
+ )
165
+ if self.accumulated_rel_l1_distance_even < self.teacache_thresh:
166
+ should_calc_even = False
167
+ else:
168
+ should_calc_even = True
169
+ self.accumulated_rel_l1_distance_even = 0
170
+ self.previous_e0_even = modulated_inp.clone()
171
+ else: # odd -> uncondition
172
+ self.is_even = False
173
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
174
+ should_calc_odd = True
175
+ self.accumulated_rel_l1_distance_odd = 0
176
+ else:
177
+ rescale_func = np.poly1d(self.coefficients)
178
+ self.accumulated_rel_l1_distance_odd += rescale_func(
179
+ ((modulated_inp - self.previous_e0_odd).abs().mean() / self.previous_e0_odd.abs().mean())
180
+ .cpu()
181
+ .item()
182
+ )
183
+ if self.accumulated_rel_l1_distance_odd < self.teacache_thresh:
184
+ should_calc_odd = False
185
+ else:
186
+ should_calc_odd = True
187
+ self.accumulated_rel_l1_distance_odd = 0
188
+ self.previous_e0_odd = modulated_inp.clone()
189
+
190
+ x = torch.chunk(x, get_sequence_parallel_world_size(), dim=1)[get_sequence_parallel_rank()]
191
+ if self.enable_teacache:
192
+ if self.is_even:
193
+ should_calc_even = broadcast_should_calc(should_calc_even)
194
+ if not should_calc_even:
195
+ x += self.previous_residual_even
196
+ else:
197
+ ori_x = x.clone()
198
+ for block in self.blocks:
199
+ x = block(x, **kwargs)
200
+ ori_x.mul_(-1)
201
+ ori_x.add_(x)
202
+ self.previous_residual_even = ori_x
203
+ else:
204
+ should_calc_odd = broadcast_should_calc(should_calc_odd)
205
+ if not should_calc_odd:
206
+ x += self.previous_residual_odd
207
+ else:
208
+ ori_x = x.clone()
209
+ for block in self.blocks:
210
+ x = block(x, **kwargs)
211
+ ori_x.mul_(-1)
212
+ ori_x.add_(x)
213
+ self.previous_residual_odd = ori_x
214
+ self.cnt += 1
215
+ if self.cnt >= self.num_steps:
216
+ self.cnt = 0
217
+ else:
218
+ # Context Parallel
219
+ for block in self.blocks:
220
+ x = block(x, **kwargs)
221
+
222
+ # head
223
+ if e.ndim == 3:
224
+ e = torch.chunk(e, get_sequence_parallel_world_size(), dim=1)[get_sequence_parallel_rank()]
225
+ x = self.head(x, e)
226
+ # Context Parallel
227
+ x = get_sp_group().all_gather(x, dim=1)
228
+ # unpatchify
229
+ x = self.unpatchify(x, grid_sizes)
230
+ return x.float()
231
+
232
+
233
+ def usp_attn_forward(self, x, grid_sizes, freqs, block_mask):
234
+
235
+ r"""
236
+ Args:
237
+ x(Tensor): Shape [B, L, num_heads, C / num_heads]
238
+ block_mask(Tensor, optional): Boolean attention mask of shape [1, 1, S, S] built in usp_dit_forward for causal attention (S = F*H*W)
239
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
240
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
241
+ """
242
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
243
+ half_dtypes = (torch.float16, torch.bfloat16)
244
+
245
+ def half(x):
246
+ return x if x.dtype in half_dtypes else x.to(torch.bfloat16)
247
+
248
+ # query, key, value function
249
+ def qkv_fn(x):
250
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
251
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
252
+ v = self.v(x).view(b, s, n, d)
253
+ return q, k, v
254
+
255
+ x = x.to(self.q.weight.dtype)
256
+ q, k, v = qkv_fn(x)
257
+
258
+ if not self._flag_ar_attention:
259
+ q = rope_apply(q, grid_sizes, freqs)
260
+ k = rope_apply(k, grid_sizes, freqs)
261
+ else:
262
+
263
+ q = rope_apply(q, grid_sizes, freqs)
264
+ k = rope_apply(k, grid_sizes, freqs)
265
+ q = q.to(torch.bfloat16)
266
+ k = k.to(torch.bfloat16)
267
+ v = v.to(torch.bfloat16)
268
+ # x = torch.nn.functional.scaled_dot_product_attention(
269
+ # q.transpose(1, 2),
270
+ # k.transpose(1, 2),
271
+ # v.transpose(1, 2),
272
+ # ).transpose(1, 2).contiguous()
273
+ with sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
274
+ x = (
275
+ torch.nn.functional.scaled_dot_product_attention(
276
+ q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), attn_mask=block_mask
277
+ )
278
+ .transpose(1, 2)
279
+ .contiguous()
280
+ )
281
+ x = xFuserLongContextAttention()(None, query=half(q), key=half(k), value=half(v), window_size=self.window_size)
282
+
283
+ # output
284
+ x = x.flatten(2)
285
+ x = self.o(x)
286
+ return x
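
usp_dit_forward above splits the token sequence across sequence-parallel ranks with torch.chunk and reassembles it with all_gather, padding the RoPE table so every rank indexes a full-length slice. A single-process sketch of that pattern, assuming pure PyTorch (world_size and the per-shard loop are stand-ins for xfuser's sequence-parallel group):

# Single-process sketch of the chunk/all-gather pattern used in usp_dit_forward.
import torch

world_size = 4                              # hypothetical sequence-parallel size
x = torch.randn(1, 16, 8)                   # [B, L, C] token sequence

shards = torch.chunk(x, world_size, dim=1)  # each rank keeps one slice of the tokens
outputs = [s * 2.0 for s in shards]         # placeholder for the per-rank transformer blocks
gathered = torch.cat(outputs, dim=1)        # plays the role of get_sp_group().all_gather(dim=1)

assert gathered.shape == x.shape
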
skyreels_v2_infer/modules/__init__.py ADDED
@@ -0,0 +1,69 @@
1
+ import gc
2
+ import os
3
+
4
+ import torch
5
+ from safetensors.torch import load_file
6
+
7
+ from .clip import CLIPModel
8
+ from .t5 import T5EncoderModel
9
+ from .transformer import WanModel
10
+ from .vae import WanVAE
11
+
12
+
13
+ def download_model(model_id):
14
+ if not os.path.exists(model_id):
15
+ from huggingface_hub import snapshot_download
16
+
17
+ model_id = snapshot_download(repo_id=model_id)
18
+ return model_id
19
+
20
+
21
+ def get_vae(model_path, device="cuda", weight_dtype=torch.float32) -> WanVAE:
22
+ vae = WanVAE(model_path).to(device).to(weight_dtype)
23
+ vae.vae.requires_grad_(False)
24
+ vae.vae.eval()
25
+ gc.collect()
26
+ torch.cuda.empty_cache()
27
+ return vae
28
+
29
+
30
+ def get_transformer(model_path, device="cuda", weight_dtype=torch.bfloat16) -> WanModel:
31
+ config_path = os.path.join(model_path, "config.json")
32
+ transformer = WanModel.from_config(config_path).to(weight_dtype).to(device)
33
+
34
+ for file in os.listdir(model_path):
35
+ if file.endswith(".safetensors"):
36
+ file_path = os.path.join(model_path, file)
37
+ state_dict = load_file(file_path)
38
+ transformer.load_state_dict(state_dict, strict=False)
39
+ del state_dict
40
+ gc.collect()
41
+ torch.cuda.empty_cache()
42
+
43
+ transformer.requires_grad_(False)
44
+ transformer.eval()
45
+ gc.collect()
46
+ torch.cuda.empty_cache()
47
+ return transformer
48
+
49
+
50
+ def get_text_encoder(model_path, device="cuda", weight_dtype=torch.bfloat16) -> T5EncoderModel:
51
+ t5_model = os.path.join(model_path, "models_t5_umt5-xxl-enc-bf16.pth")
52
+ tokenizer_path = os.path.join(model_path, "google", "umt5-xxl")
53
+ text_encoder = T5EncoderModel(checkpoint_path=t5_model, tokenizer_path=tokenizer_path).to(device).to(weight_dtype)
54
+ text_encoder.requires_grad_(False)
55
+ text_encoder.eval()
56
+ gc.collect()
57
+ torch.cuda.empty_cache()
58
+ return text_encoder
59
+
60
+
61
+ def get_image_encoder(model_path, device="cuda", weight_dtype=torch.bfloat16) -> CLIPModel:
62
+ checkpoint_path = os.path.join(model_path, "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth")
63
+ tokenizer_path = os.path.join(model_path, "xlm-roberta-large")
64
+ image_enc = CLIPModel(checkpoint_path, tokenizer_path).to(weight_dtype).to(device)
65
+ image_enc.requires_grad_(False)
66
+ image_enc.eval()
67
+ gc.collect()
68
+ torch.cuda.empty_cache()
69
+ return image_enc
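
A sketch of how the loaders above compose; the repo id, device, and dtypes are illustrative assumptions, not taken from the commit:

# Sketch: wiring the helpers together (repo id / devices are illustrative).
import torch
from skyreels_v2_infer.modules import download_model, get_text_encoder, get_transformer, get_vae

model_path = download_model("Skywork/SkyReels-V2-DF-1.3B-540P")   # hypothetical repo id
transformer = get_transformer(model_path, device="cuda", weight_dtype=torch.bfloat16)
text_encoder = get_text_encoder(model_path, device="cuda", weight_dtype=torch.bfloat16)
vae = get_vae(model_path, device="cuda", weight_dtype=torch.float32)
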
skyreels_v2_infer/modules/attention.py ADDED
@@ -0,0 +1,179 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+
4
+ try:
5
+ import flash_attn_interface
6
+
7
+ FLASH_ATTN_3_AVAILABLE = True
8
+ except ModuleNotFoundError:
9
+ FLASH_ATTN_3_AVAILABLE = False
10
+
11
+ try:
12
+ import flash_attn
13
+
14
+ FLASH_ATTN_2_AVAILABLE = True
15
+ except ModuleNotFoundError:
16
+ FLASH_ATTN_2_AVAILABLE = False
17
+
18
+ import warnings
19
+
20
+ __all__ = [
21
+ "flash_attention",
22
+ "attention",
23
+ ]
24
+
25
+
26
+ def flash_attention(
27
+ q,
28
+ k,
29
+ v,
30
+ q_lens=None,
31
+ k_lens=None,
32
+ dropout_p=0.0,
33
+ softmax_scale=None,
34
+ q_scale=None,
35
+ causal=False,
36
+ window_size=(-1, -1),
37
+ deterministic=False,
38
+ dtype=torch.bfloat16,
39
+ version=None,
40
+ ):
41
+ """
42
+ q: [B, Lq, Nq, C1].
43
+ k: [B, Lk, Nk, C1].
44
+ v: [B, Lk, Nk, C2]. Nq must be divisible by Nk.
45
+ q_lens: [B].
46
+ k_lens: [B].
47
+ dropout_p: float. Dropout probability.
48
+ softmax_scale: float. The scaling of QK^T before applying softmax.
49
+ causal: bool. Whether to apply causal attention mask.
50
+ window_size: (left, right). If not (-1, -1), apply sliding-window local attention.
51
+ deterministic: bool. If True, slightly slower and uses more memory.
52
+ dtype: torch.dtype. Cast q/k/v to this dtype when they are not already float16/bfloat16.
53
+ """
54
+ half_dtypes = (torch.float16, torch.bfloat16)
55
+ assert dtype in half_dtypes
56
+ assert q.device.type == "cuda" and q.size(-1) <= 256
57
+
58
+ # params
59
+ b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
60
+
61
+ def half(x):
62
+ return x if x.dtype in half_dtypes else x.to(dtype)
63
+
64
+ # preprocess query
65
+
66
+ q = half(q.flatten(0, 1))
67
+ q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(device=q.device, non_blocking=True)
68
+
69
+ # preprocess key, value
70
+
71
+ k = half(k.flatten(0, 1))
72
+ v = half(v.flatten(0, 1))
73
+ k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(device=k.device, non_blocking=True)
74
+
75
+ q = q.to(v.dtype)
76
+ k = k.to(v.dtype)
77
+
78
+ if q_scale is not None:
79
+ q = q * q_scale
80
+
81
+ if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
82
+ warnings.warn("Flash attention 3 is not available, use flash attention 2 instead.")
83
+
84
+ torch.cuda.nvtx.range_push(f"{list(q.shape)}-{list(k.shape)}-{list(v.shape)}-{q.dtype}-{k.dtype}-{v.dtype}")
85
+ # apply attention
86
+ if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
87
+ # Note: dropout_p, window_size are not supported in FA3 now.
88
+ x = flash_attn_interface.flash_attn_varlen_func(
89
+ q=q,
90
+ k=k,
91
+ v=v,
92
+ cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens])
93
+ .cumsum(0, dtype=torch.int32)
94
+ .to(q.device, non_blocking=True),
95
+ cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens])
96
+ .cumsum(0, dtype=torch.int32)
97
+ .to(q.device, non_blocking=True),
98
+ seqused_q=None,
99
+ seqused_k=None,
100
+ max_seqlen_q=lq,
101
+ max_seqlen_k=lk,
102
+ softmax_scale=softmax_scale,
103
+ causal=causal,
104
+ deterministic=deterministic,
105
+ )[0].unflatten(0, (b, lq))
106
+ else:
107
+ assert FLASH_ATTN_2_AVAILABLE
108
+ x = flash_attn.flash_attn_varlen_func(
109
+ q=q,
110
+ k=k,
111
+ v=v,
112
+ cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens])
113
+ .cumsum(0, dtype=torch.int32)
114
+ .to(q.device, non_blocking=True),
115
+ cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens])
116
+ .cumsum(0, dtype=torch.int32)
117
+ .to(q.device, non_blocking=True),
118
+ max_seqlen_q=lq,
119
+ max_seqlen_k=lk,
120
+ dropout_p=dropout_p,
121
+ softmax_scale=softmax_scale,
122
+ causal=causal,
123
+ window_size=window_size,
124
+ deterministic=deterministic,
125
+ ).unflatten(0, (b, lq))
126
+ torch.cuda.nvtx.range_pop()
127
+
128
+ # output
129
+ return x
130
+
131
+
132
+ def attention(
133
+ q,
134
+ k,
135
+ v,
136
+ q_lens=None,
137
+ k_lens=None,
138
+ dropout_p=0.0,
139
+ softmax_scale=None,
140
+ q_scale=None,
141
+ causal=False,
142
+ window_size=(-1, -1),
143
+ deterministic=False,
144
+ dtype=torch.bfloat16,
145
+ fa_version=None,
146
+ ):
147
+ if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
148
+ return flash_attention(
149
+ q=q,
150
+ k=k,
151
+ v=v,
152
+ q_lens=q_lens,
153
+ k_lens=k_lens,
154
+ dropout_p=dropout_p,
155
+ softmax_scale=softmax_scale,
156
+ q_scale=q_scale,
157
+ causal=causal,
158
+ window_size=window_size,
159
+ deterministic=deterministic,
160
+ dtype=dtype,
161
+ version=fa_version,
162
+ )
163
+ else:
164
+ if q_lens is not None or k_lens is not None:
165
+ warnings.warn(
166
+ "Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance."
167
+ )
168
+ attn_mask = None
169
+
170
+ q = q.transpose(1, 2).to(dtype)
171
+ k = k.transpose(1, 2).to(dtype)
172
+ v = v.transpose(1, 2).to(dtype)
173
+
174
+ out = torch.nn.functional.scaled_dot_product_attention(
175
+ q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p
176
+ )
177
+
178
+ out = out.transpose(1, 2).contiguous()
179
+ return out
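
attention() expects the [B, L, N, C] layout and falls back to torch's scaled_dot_product_attention when neither flash-attn build is installed; the flash path additionally requires CUDA tensors and head_dim <= 256. A minimal shape-check sketch, assuming the package is importable:

# Sketch: shape contract of attention(); runs on the SDPA fallback when flash-attn is absent.
import torch
from skyreels_v2_infer.modules.attention import attention

b, l, n, c = 1, 128, 8, 64
q = torch.randn(b, l, n, c, dtype=torch.bfloat16)
k = torch.randn(b, l, n, c, dtype=torch.bfloat16)
v = torch.randn(b, l, n, c, dtype=torch.bfloat16)

out = attention(q, k, v, causal=False)      # returns [B, L, N, C]
print(out.shape)
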
skyreels_v2_infer/modules/clip.py ADDED
@@ -0,0 +1,525 @@
1
+ # Modified from ``https://github.com/openai/CLIP'' and ``https://github.com/mlfoundations/open_clip''
2
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
+ import logging
4
+ import math
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import torchvision.transforms as T
10
+ from diffusers.models import ModelMixin
11
+
12
+ from .attention import flash_attention
13
+ from .tokenizers import HuggingfaceTokenizer
14
+ from .xlm_roberta import XLMRoberta
15
+
16
+ __all__ = [
17
+ "XLMRobertaCLIP",
18
+ "clip_xlm_roberta_vit_h_14",
19
+ "CLIPModel",
20
+ ]
21
+
22
+
23
+ def pos_interpolate(pos, seq_len):
24
+ if pos.size(1) == seq_len:
25
+ return pos
26
+ else:
27
+ src_grid = int(math.sqrt(pos.size(1)))
28
+ tar_grid = int(math.sqrt(seq_len))
29
+ n = pos.size(1) - src_grid * src_grid
30
+ return torch.cat(
31
+ [
32
+ pos[:, :n],
33
+ F.interpolate(
34
+ pos[:, n:].float().reshape(1, src_grid, src_grid, -1).permute(0, 3, 1, 2),
35
+ size=(tar_grid, tar_grid),
36
+ mode="bicubic",
37
+ align_corners=False,
38
+ )
39
+ .flatten(2)
40
+ .transpose(1, 2),
41
+ ],
42
+ dim=1,
43
+ )
44
+
45
+
46
+ class QuickGELU(nn.Module):
47
+ def forward(self, x):
48
+ return x * torch.sigmoid(1.702 * x)
49
+
50
+
51
+ class LayerNorm(nn.LayerNorm):
52
+ def forward(self, x):
53
+ return super().forward(x.float()).type_as(x)
54
+
55
+
56
+ class SelfAttention(nn.Module):
57
+ def __init__(self, dim, num_heads, causal=False, attn_dropout=0.0, proj_dropout=0.0):
58
+ assert dim % num_heads == 0
59
+ super().__init__()
60
+ self.dim = dim
61
+ self.num_heads = num_heads
62
+ self.head_dim = dim // num_heads
63
+ self.causal = causal
64
+ self.attn_dropout = attn_dropout
65
+ self.proj_dropout = proj_dropout
66
+
67
+ # layers
68
+ self.to_qkv = nn.Linear(dim, dim * 3)
69
+ self.proj = nn.Linear(dim, dim)
70
+
71
+ def forward(self, x):
72
+ """
73
+ x: [B, L, C].
74
+ """
75
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
76
+
77
+ # compute query, key, value
78
+ q, k, v = self.to_qkv(x).view(b, s, 3, n, d).unbind(2)
79
+
80
+ # compute attention
81
+ p = self.attn_dropout if self.training else 0.0
82
+ x = flash_attention(q, k, v, dropout_p=p, causal=self.causal, version=2)
83
+ x = x.reshape(b, s, c)
84
+
85
+ # output
86
+ x = self.proj(x)
87
+ x = F.dropout(x, self.proj_dropout, self.training)
88
+ return x
89
+
90
+
91
+ class SwiGLU(nn.Module):
92
+ def __init__(self, dim, mid_dim):
93
+ super().__init__()
94
+ self.dim = dim
95
+ self.mid_dim = mid_dim
96
+
97
+ # layers
98
+ self.fc1 = nn.Linear(dim, mid_dim)
99
+ self.fc2 = nn.Linear(dim, mid_dim)
100
+ self.fc3 = nn.Linear(mid_dim, dim)
101
+
102
+ def forward(self, x):
103
+ x = F.silu(self.fc1(x)) * self.fc2(x)
104
+ x = self.fc3(x)
105
+ return x
106
+
107
+
108
+ class AttentionBlock(nn.Module):
109
+ def __init__(
110
+ self,
111
+ dim,
112
+ mlp_ratio,
113
+ num_heads,
114
+ post_norm=False,
115
+ causal=False,
116
+ activation="quick_gelu",
117
+ attn_dropout=0.0,
118
+ proj_dropout=0.0,
119
+ norm_eps=1e-5,
120
+ ):
121
+ assert activation in ["quick_gelu", "gelu", "swi_glu"]
122
+ super().__init__()
123
+ self.dim = dim
124
+ self.mlp_ratio = mlp_ratio
125
+ self.num_heads = num_heads
126
+ self.post_norm = post_norm
127
+ self.causal = causal
128
+ self.norm_eps = norm_eps
129
+
130
+ # layers
131
+ self.norm1 = LayerNorm(dim, eps=norm_eps)
132
+ self.attn = SelfAttention(dim, num_heads, causal, attn_dropout, proj_dropout)
133
+ self.norm2 = LayerNorm(dim, eps=norm_eps)
134
+ if activation == "swi_glu":
135
+ self.mlp = SwiGLU(dim, int(dim * mlp_ratio))
136
+ else:
137
+ self.mlp = nn.Sequential(
138
+ nn.Linear(dim, int(dim * mlp_ratio)),
139
+ QuickGELU() if activation == "quick_gelu" else nn.GELU(),
140
+ nn.Linear(int(dim * mlp_ratio), dim),
141
+ nn.Dropout(proj_dropout),
142
+ )
143
+
144
+ def forward(self, x):
145
+ if self.post_norm:
146
+ x = x + self.norm1(self.attn(x))
147
+ x = x + self.norm2(self.mlp(x))
148
+ else:
149
+ x = x + self.attn(self.norm1(x))
150
+ x = x + self.mlp(self.norm2(x))
151
+ return x
152
+
153
+
154
+ class AttentionPool(nn.Module):
155
+ def __init__(self, dim, mlp_ratio, num_heads, activation="gelu", proj_dropout=0.0, norm_eps=1e-5):
156
+ assert dim % num_heads == 0
157
+ super().__init__()
158
+ self.dim = dim
159
+ self.mlp_ratio = mlp_ratio
160
+ self.num_heads = num_heads
161
+ self.head_dim = dim // num_heads
162
+ self.proj_dropout = proj_dropout
163
+ self.norm_eps = norm_eps
164
+
165
+ # layers
166
+ gain = 1.0 / math.sqrt(dim)
167
+ self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
168
+ self.to_q = nn.Linear(dim, dim)
169
+ self.to_kv = nn.Linear(dim, dim * 2)
170
+ self.proj = nn.Linear(dim, dim)
171
+ self.norm = LayerNorm(dim, eps=norm_eps)
172
+ self.mlp = nn.Sequential(
173
+ nn.Linear(dim, int(dim * mlp_ratio)),
174
+ QuickGELU() if activation == "quick_gelu" else nn.GELU(),
175
+ nn.Linear(int(dim * mlp_ratio), dim),
176
+ nn.Dropout(proj_dropout),
177
+ )
178
+
179
+ def forward(self, x):
180
+ """
181
+ x: [B, L, C].
182
+ """
183
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
184
+
185
+ # compute query, key, value
186
+ q = self.to_q(self.cls_embedding).view(1, 1, n, d).expand(b, -1, -1, -1)
187
+ k, v = self.to_kv(x).view(b, s, 2, n, d).unbind(2)
188
+
189
+ # compute attention
190
+ x = flash_attention(q, k, v, version=2)
191
+ x = x.reshape(b, 1, c)
192
+
193
+ # output
194
+ x = self.proj(x)
195
+ x = F.dropout(x, self.proj_dropout, self.training)
196
+
197
+ # mlp
198
+ x = x + self.mlp(self.norm(x))
199
+ return x[:, 0]
200
+
201
+
202
+ class VisionTransformer(nn.Module):
203
+ def __init__(
204
+ self,
205
+ image_size=224,
206
+ patch_size=16,
207
+ dim=768,
208
+ mlp_ratio=4,
209
+ out_dim=512,
210
+ num_heads=12,
211
+ num_layers=12,
212
+ pool_type="token",
213
+ pre_norm=True,
214
+ post_norm=False,
215
+ activation="quick_gelu",
216
+ attn_dropout=0.0,
217
+ proj_dropout=0.0,
218
+ embedding_dropout=0.0,
219
+ norm_eps=1e-5,
220
+ ):
221
+ if image_size % patch_size != 0:
222
+ print("[WARNING] image_size is not divisible by patch_size", flush=True)
223
+ assert pool_type in ("token", "token_fc", "attn_pool")
224
+ out_dim = out_dim or dim
225
+ super().__init__()
226
+ self.image_size = image_size
227
+ self.patch_size = patch_size
228
+ self.num_patches = (image_size // patch_size) ** 2
229
+ self.dim = dim
230
+ self.mlp_ratio = mlp_ratio
231
+ self.out_dim = out_dim
232
+ self.num_heads = num_heads
233
+ self.num_layers = num_layers
234
+ self.pool_type = pool_type
235
+ self.post_norm = post_norm
236
+ self.norm_eps = norm_eps
237
+
238
+ # embeddings
239
+ gain = 1.0 / math.sqrt(dim)
240
+ self.patch_embedding = nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size, bias=not pre_norm)
241
+ if pool_type in ("token", "token_fc"):
242
+ self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
243
+ self.pos_embedding = nn.Parameter(
244
+ gain * torch.randn(1, self.num_patches + (1 if pool_type in ("token", "token_fc") else 0), dim)
245
+ )
246
+ self.dropout = nn.Dropout(embedding_dropout)
247
+
248
+ # transformer
249
+ self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
250
+ self.transformer = nn.Sequential(
251
+ *[
252
+ AttentionBlock(
253
+ dim, mlp_ratio, num_heads, post_norm, False, activation, attn_dropout, proj_dropout, norm_eps
254
+ )
255
+ for _ in range(num_layers)
256
+ ]
257
+ )
258
+ self.post_norm = LayerNorm(dim, eps=norm_eps)
259
+
260
+ # head
261
+ if pool_type == "token":
262
+ self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
263
+ elif pool_type == "token_fc":
264
+ self.head = nn.Linear(dim, out_dim)
265
+ elif pool_type == "attn_pool":
266
+ self.head = AttentionPool(dim, mlp_ratio, num_heads, activation, proj_dropout, norm_eps)
267
+
268
+ def forward(self, x, interpolation=False, use_31_block=False):
269
+ b = x.size(0)
270
+
271
+ # embeddings
272
+ x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
273
+ if self.pool_type in ("token", "token_fc"):
274
+ x = torch.cat([self.cls_embedding.expand(b, -1, -1), x], dim=1)
275
+ if interpolation:
276
+ e = pos_interpolate(self.pos_embedding, x.size(1))
277
+ else:
278
+ e = self.pos_embedding
279
+ x = self.dropout(x + e)
280
+ if self.pre_norm is not None:
281
+ x = self.pre_norm(x)
282
+
283
+ # transformer
284
+ if use_31_block:
285
+ x = self.transformer[:-1](x)
286
+ return x
287
+ else:
288
+ x = self.transformer(x)
289
+ return x
290
+
291
+
292
+ class XLMRobertaWithHead(XLMRoberta):
293
+ def __init__(self, **kwargs):
294
+ self.out_dim = kwargs.pop("out_dim")
295
+ super().__init__(**kwargs)
296
+
297
+ # head
298
+ mid_dim = (self.dim + self.out_dim) // 2
299
+ self.head = nn.Sequential(
300
+ nn.Linear(self.dim, mid_dim, bias=False), nn.GELU(), nn.Linear(mid_dim, self.out_dim, bias=False)
301
+ )
302
+
303
+ def forward(self, ids):
304
+ # xlm-roberta
305
+ x = super().forward(ids)
306
+
307
+ # average pooling
308
+ mask = ids.ne(self.pad_id).unsqueeze(-1).to(x)
309
+ x = (x * mask).sum(dim=1) / mask.sum(dim=1)
310
+
311
+ # head
312
+ x = self.head(x)
313
+ return x
314
+
315
+
316
+ class XLMRobertaCLIP(nn.Module):
317
+ def __init__(
318
+ self,
319
+ embed_dim=1024,
320
+ image_size=224,
321
+ patch_size=14,
322
+ vision_dim=1280,
323
+ vision_mlp_ratio=4,
324
+ vision_heads=16,
325
+ vision_layers=32,
326
+ vision_pool="token",
327
+ vision_pre_norm=True,
328
+ vision_post_norm=False,
329
+ activation="gelu",
330
+ vocab_size=250002,
331
+ max_text_len=514,
332
+ type_size=1,
333
+ pad_id=1,
334
+ text_dim=1024,
335
+ text_heads=16,
336
+ text_layers=24,
337
+ text_post_norm=True,
338
+ text_dropout=0.1,
339
+ attn_dropout=0.0,
340
+ proj_dropout=0.0,
341
+ embedding_dropout=0.0,
342
+ norm_eps=1e-5,
343
+ ):
344
+ super().__init__()
345
+ self.embed_dim = embed_dim
346
+ self.image_size = image_size
347
+ self.patch_size = patch_size
348
+ self.vision_dim = vision_dim
349
+ self.vision_mlp_ratio = vision_mlp_ratio
350
+ self.vision_heads = vision_heads
351
+ self.vision_layers = vision_layers
352
+ self.vision_pre_norm = vision_pre_norm
353
+ self.vision_post_norm = vision_post_norm
354
+ self.activation = activation
355
+ self.vocab_size = vocab_size
356
+ self.max_text_len = max_text_len
357
+ self.type_size = type_size
358
+ self.pad_id = pad_id
359
+ self.text_dim = text_dim
360
+ self.text_heads = text_heads
361
+ self.text_layers = text_layers
362
+ self.text_post_norm = text_post_norm
363
+ self.norm_eps = norm_eps
364
+
365
+ # models
366
+ self.visual = VisionTransformer(
367
+ image_size=image_size,
368
+ patch_size=patch_size,
369
+ dim=vision_dim,
370
+ mlp_ratio=vision_mlp_ratio,
371
+ out_dim=embed_dim,
372
+ num_heads=vision_heads,
373
+ num_layers=vision_layers,
374
+ pool_type=vision_pool,
375
+ pre_norm=vision_pre_norm,
376
+ post_norm=vision_post_norm,
377
+ activation=activation,
378
+ attn_dropout=attn_dropout,
379
+ proj_dropout=proj_dropout,
380
+ embedding_dropout=embedding_dropout,
381
+ norm_eps=norm_eps,
382
+ )
383
+ self.textual = XLMRobertaWithHead(
384
+ vocab_size=vocab_size,
385
+ max_seq_len=max_text_len,
386
+ type_size=type_size,
387
+ pad_id=pad_id,
388
+ dim=text_dim,
389
+ out_dim=embed_dim,
390
+ num_heads=text_heads,
391
+ num_layers=text_layers,
392
+ post_norm=text_post_norm,
393
+ dropout=text_dropout,
394
+ )
395
+ self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
396
+
397
+ def forward(self, imgs, txt_ids):
398
+ """
399
+ imgs: [B, 3, H, W] of torch.float32.
400
+ - mean: [0.48145466, 0.4578275, 0.40821073]
401
+ - std: [0.26862954, 0.26130258, 0.27577711]
402
+ txt_ids: [B, L] of torch.long.
403
+ Encoded by data.CLIPTokenizer.
404
+ """
405
+ xi = self.visual(imgs)
406
+ xt = self.textual(txt_ids)
407
+ return xi, xt
408
+
409
+ def param_groups(self):
410
+ groups = [
411
+ {
412
+ "params": [p for n, p in self.named_parameters() if "norm" in n or n.endswith("bias")],
413
+ "weight_decay": 0.0,
414
+ },
415
+ {"params": [p for n, p in self.named_parameters() if not ("norm" in n or n.endswith("bias"))]},
416
+ ]
417
+ return groups
418
+
419
+
420
+ def _clip(
421
+ pretrained=False,
422
+ pretrained_name=None,
423
+ model_cls=XLMRobertaCLIP,
424
+ return_transforms=False,
425
+ return_tokenizer=False,
426
+ tokenizer_padding="eos",
427
+ dtype=torch.float32,
428
+ device="cpu",
429
+ **kwargs,
430
+ ):
431
+ # init a model on device
432
+ with torch.device(device):
433
+ model = model_cls(**kwargs)
434
+
435
+ # set device
436
+ model = model.to(dtype=dtype, device=device)
437
+ output = (model,)
438
+
439
+ # init transforms
440
+ if return_transforms:
441
+ # mean and std
442
+ if "siglip" in pretrained_name.lower():
443
+ mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
444
+ else:
445
+ mean = [0.48145466, 0.4578275, 0.40821073]
446
+ std = [0.26862954, 0.26130258, 0.27577711]
447
+
448
+ # transforms
449
+ transforms = T.Compose(
450
+ [
451
+ T.Resize((model.image_size, model.image_size), interpolation=T.InterpolationMode.BICUBIC),
452
+ T.ToTensor(),
453
+ T.Normalize(mean=mean, std=std),
454
+ ]
455
+ )
456
+ output += (transforms,)
457
+ return output[0] if len(output) == 1 else output
458
+
459
+
460
+ def clip_xlm_roberta_vit_h_14(pretrained=False, pretrained_name="open-clip-xlm-roberta-large-vit-huge-14", **kwargs):
461
+ cfg = dict(
462
+ embed_dim=1024,
463
+ image_size=224,
464
+ patch_size=14,
465
+ vision_dim=1280,
466
+ vision_mlp_ratio=4,
467
+ vision_heads=16,
468
+ vision_layers=32,
469
+ vision_pool="token",
470
+ activation="gelu",
471
+ vocab_size=250002,
472
+ max_text_len=514,
473
+ type_size=1,
474
+ pad_id=1,
475
+ text_dim=1024,
476
+ text_heads=16,
477
+ text_layers=24,
478
+ text_post_norm=True,
479
+ text_dropout=0.1,
480
+ attn_dropout=0.0,
481
+ proj_dropout=0.0,
482
+ embedding_dropout=0.0,
483
+ )
484
+ cfg.update(**kwargs)
485
+ return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **cfg)
486
+
487
+
488
+ class CLIPModel(ModelMixin):
489
+ def __init__(self, checkpoint_path, tokenizer_path):
490
+ self.checkpoint_path = checkpoint_path
491
+ self.tokenizer_path = tokenizer_path
492
+
493
+ super().__init__()
494
+ # init model
495
+ self.model, self.transforms = clip_xlm_roberta_vit_h_14(
496
+ pretrained=False, return_transforms=True, return_tokenizer=False
497
+ )
498
+ self.model = self.model.eval().requires_grad_(False)
499
+ logging.info(f"loading {checkpoint_path}")
500
+ self.model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
501
+
502
+ # init tokenizer
503
+ self.tokenizer = HuggingfaceTokenizer(
504
+ name=tokenizer_path, seq_len=self.model.max_text_len - 2, clean="whitespace"
505
+ )
506
+
507
+ def encode_video(self, video):
508
+ # preprocess
509
+ b, c, t, h, w = video.shape
510
+ video = video.transpose(1, 2)
511
+ video = video.reshape(b * t, c, h, w)
512
+ size = (self.model.image_size,) * 2
513
+ video = F.interpolate(
514
+ video,
515
+ size=size,
516
+ mode='bicubic',
517
+ align_corners=False)
518
+
519
+ video = self.transforms.transforms[-1](video.mul_(0.5).add_(0.5))
520
+
521
+ # forward
522
+ with torch.amp.autocast(dtype=self.dtype, device_type=self.device.type):
523
+ out = self.model.visual(video, use_31_block=True)
524
+
525
+ return out
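
pos_interpolate above resizes the learned position table (one cls token plus a square patch grid) with bicubic interpolation so the ViT can accept a different input resolution. A toy shape check, assuming the full package (including its xlm_roberta and tokenizers modules) is importable:

# Sketch: resizing a 16x16 position grid (plus cls token) to 32x32.
import torch
from skyreels_v2_infer.modules.clip import pos_interpolate

pos = torch.randn(1, 1 + 16 * 16, 1280)      # cls + 16x16 patch positions
pos_big = pos_interpolate(pos, 1 + 32 * 32)  # bicubic resize of the grid part
print(pos_big.shape)                         # torch.Size([1, 1025, 1280])
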
skyreels_v2_infer/modules/t5.py ADDED
@@ -0,0 +1,454 @@
1
+ # Modified from transformers.models.t5.modeling_t5
2
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
+ import logging
4
+ import math
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from diffusers.models import ModelMixin
10
+
11
+ from .tokenizers import HuggingfaceTokenizer
12
+
13
+ __all__ = [
14
+ "T5Model",
15
+ "T5Encoder",
16
+ "T5Decoder",
17
+ "T5EncoderModel",
18
+ ]
19
+
20
+
21
+ def fp16_clamp(x):
22
+ if x.dtype == torch.float16 and torch.isinf(x).any():
23
+ clamp = torch.finfo(x.dtype).max - 1000
24
+ x = torch.clamp(x, min=-clamp, max=clamp)
25
+ return x
26
+
27
+
28
+ def init_weights(m):
29
+ if isinstance(m, T5LayerNorm):
30
+ nn.init.ones_(m.weight)
31
+ elif isinstance(m, T5Model):
32
+ nn.init.normal_(m.token_embedding.weight, std=1.0)
33
+ elif isinstance(m, T5FeedForward):
34
+ nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
35
+ nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
36
+ nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
37
+ elif isinstance(m, T5Attention):
38
+ nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn) ** -0.5)
39
+ nn.init.normal_(m.k.weight, std=m.dim**-0.5)
40
+ nn.init.normal_(m.v.weight, std=m.dim**-0.5)
41
+ nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn) ** -0.5)
42
+ elif isinstance(m, T5RelativeEmbedding):
43
+ nn.init.normal_(m.embedding.weight, std=(2 * m.num_buckets * m.num_heads) ** -0.5)
44
+
45
+
46
+ class GELU(nn.Module):
47
+ def forward(self, x):
48
+ return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
49
+
50
+
51
+ class T5LayerNorm(nn.Module):
52
+ def __init__(self, dim, eps=1e-6):
53
+ super(T5LayerNorm, self).__init__()
54
+ self.dim = dim
55
+ self.eps = eps
56
+ self.weight = nn.Parameter(torch.ones(dim))
57
+
58
+ def forward(self, x):
59
+ x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) + self.eps)
60
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
61
+ x = x.type_as(self.weight)
62
+ return self.weight * x
63
+
64
+
65
+ class T5Attention(nn.Module):
66
+ def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
67
+ assert dim_attn % num_heads == 0
68
+ super(T5Attention, self).__init__()
69
+ self.dim = dim
70
+ self.dim_attn = dim_attn
71
+ self.num_heads = num_heads
72
+ self.head_dim = dim_attn // num_heads
73
+
74
+ # layers
75
+ self.q = nn.Linear(dim, dim_attn, bias=False)
76
+ self.k = nn.Linear(dim, dim_attn, bias=False)
77
+ self.v = nn.Linear(dim, dim_attn, bias=False)
78
+ self.o = nn.Linear(dim_attn, dim, bias=False)
79
+ self.dropout = nn.Dropout(dropout)
80
+
81
+ def forward(self, x, context=None, mask=None, pos_bias=None):
82
+ """
83
+ x: [B, L1, C].
84
+ context: [B, L2, C] or None.
85
+ mask: [B, L2] or [B, L1, L2] or None.
86
+ """
87
+ # check inputs
88
+ context = x if context is None else context
89
+ b, n, c = x.size(0), self.num_heads, self.head_dim
90
+
91
+ # compute query, key, value
92
+ q = self.q(x).view(b, -1, n, c)
93
+ k = self.k(context).view(b, -1, n, c)
94
+ v = self.v(context).view(b, -1, n, c)
95
+
96
+ # attention bias
97
+ attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
98
+ if pos_bias is not None:
99
+ attn_bias += pos_bias
100
+ if mask is not None:
101
+ assert mask.ndim in [2, 3]
102
+ mask = mask.view(b, 1, 1, -1) if mask.ndim == 2 else mask.unsqueeze(1)
103
+ attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)
104
+
105
+ # compute attention (T5 does not use scaling)
106
+ attn = torch.einsum("binc,bjnc->bnij", q, k) + attn_bias
107
+ attn = F.softmax(attn.float(), dim=-1).type_as(attn)
108
+ x = torch.einsum("bnij,bjnc->binc", attn, v)
109
+
110
+ # output
111
+ x = x.reshape(b, -1, n * c)
112
+ x = self.o(x)
113
+ x = self.dropout(x)
114
+ return x
115
+
116
+
117
+ class T5FeedForward(nn.Module):
118
+ def __init__(self, dim, dim_ffn, dropout=0.1):
119
+ super(T5FeedForward, self).__init__()
120
+ self.dim = dim
121
+ self.dim_ffn = dim_ffn
122
+
123
+ # layers
124
+ self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
125
+ self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
126
+ self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
127
+ self.dropout = nn.Dropout(dropout)
128
+
129
+ def forward(self, x):
130
+ x = self.fc1(x) * self.gate(x)
131
+ x = self.dropout(x)
132
+ x = self.fc2(x)
133
+ x = self.dropout(x)
134
+ return x
135
+
136
+
137
+ class T5SelfAttention(nn.Module):
138
+ def __init__(self, dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos=True, dropout=0.1):
139
+ super(T5SelfAttention, self).__init__()
140
+ self.dim = dim
141
+ self.dim_attn = dim_attn
142
+ self.dim_ffn = dim_ffn
143
+ self.num_heads = num_heads
144
+ self.num_buckets = num_buckets
145
+ self.shared_pos = shared_pos
146
+
147
+ # layers
148
+ self.norm1 = T5LayerNorm(dim)
149
+ self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
150
+ self.norm2 = T5LayerNorm(dim)
151
+ self.ffn = T5FeedForward(dim, dim_ffn, dropout)
152
+ self.pos_embedding = None if shared_pos else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)
153
+
154
+ def forward(self, x, mask=None, pos_bias=None):
155
+ e = pos_bias if self.shared_pos else self.pos_embedding(x.size(1), x.size(1))
156
+ x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
157
+ x = fp16_clamp(x + self.ffn(self.norm2(x)))
158
+ return x
159
+
160
+
161
+ class T5CrossAttention(nn.Module):
162
+ def __init__(self, dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos=True, dropout=0.1):
163
+ super(T5CrossAttention, self).__init__()
164
+ self.dim = dim
165
+ self.dim_attn = dim_attn
166
+ self.dim_ffn = dim_ffn
167
+ self.num_heads = num_heads
168
+ self.num_buckets = num_buckets
169
+ self.shared_pos = shared_pos
170
+
171
+ # layers
172
+ self.norm1 = T5LayerNorm(dim)
173
+ self.self_attn = T5Attention(dim, dim_attn, num_heads, dropout)
174
+ self.norm2 = T5LayerNorm(dim)
175
+ self.cross_attn = T5Attention(dim, dim_attn, num_heads, dropout)
176
+ self.norm3 = T5LayerNorm(dim)
177
+ self.ffn = T5FeedForward(dim, dim_ffn, dropout)
178
+ self.pos_embedding = None if shared_pos else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=False)
179
+
180
+ def forward(self, x, mask=None, encoder_states=None, encoder_mask=None, pos_bias=None):
181
+ e = pos_bias if self.shared_pos else self.pos_embedding(x.size(1), x.size(1))
182
+ x = fp16_clamp(x + self.self_attn(self.norm1(x), mask=mask, pos_bias=e))
183
+ x = fp16_clamp(x + self.cross_attn(self.norm2(x), context=encoder_states, mask=encoder_mask))
184
+ x = fp16_clamp(x + self.ffn(self.norm3(x)))
185
+ return x
186
+
187
+
188
+ class T5RelativeEmbedding(nn.Module):
189
+ def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
190
+ super(T5RelativeEmbedding, self).__init__()
191
+ self.num_buckets = num_buckets
192
+ self.num_heads = num_heads
193
+ self.bidirectional = bidirectional
194
+ self.max_dist = max_dist
195
+
196
+ # layers
197
+ self.embedding = nn.Embedding(num_buckets, num_heads)
198
+
199
+ def forward(self, lq, lk):
200
+ device = self.embedding.weight.device
201
+ # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
202
+ # torch.arange(lq).unsqueeze(1).to(device)
203
+ rel_pos = torch.arange(lk, device=device).unsqueeze(0) - torch.arange(lq, device=device).unsqueeze(1)
204
+ rel_pos = self._relative_position_bucket(rel_pos)
205
+ rel_pos_embeds = self.embedding(rel_pos)
206
+ rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(0) # [1, N, Lq, Lk]
207
+ return rel_pos_embeds.contiguous()
208
+
209
+ def _relative_position_bucket(self, rel_pos):
210
+ # preprocess
211
+ if self.bidirectional:
212
+ num_buckets = self.num_buckets // 2
213
+ rel_buckets = (rel_pos > 0).long() * num_buckets
214
+ rel_pos = torch.abs(rel_pos)
215
+ else:
216
+ num_buckets = self.num_buckets
217
+ rel_buckets = 0
218
+ rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))
219
+
220
+ # embeddings for small and large positions
221
+ max_exact = num_buckets // 2
222
+ rel_pos_large = (
223
+ max_exact
224
+ + (
225
+ torch.log(rel_pos.float() / max_exact) / math.log(self.max_dist / max_exact) * (num_buckets - max_exact)
226
+ ).long()
227
+ )
228
+ rel_pos_large = torch.min(rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1))
229
+ rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
230
+ return rel_buckets
231
+
232
+
233
+ class T5Encoder(nn.Module):
234
+ def __init__(self, vocab, dim, dim_attn, dim_ffn, num_heads, num_layers, num_buckets, shared_pos=True, dropout=0.1):
235
+ super(T5Encoder, self).__init__()
236
+ self.dim = dim
237
+ self.dim_attn = dim_attn
238
+ self.dim_ffn = dim_ffn
239
+ self.num_heads = num_heads
240
+ self.num_layers = num_layers
241
+ self.num_buckets = num_buckets
242
+ self.shared_pos = shared_pos
243
+
244
+ # layers
245
+ self.token_embedding = vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
246
+ self.pos_embedding = T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True) if shared_pos else None
247
+ self.dropout = nn.Dropout(dropout)
248
+ self.blocks = nn.ModuleList(
249
+ [
250
+ T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout)
251
+ for _ in range(num_layers)
252
+ ]
253
+ )
254
+ self.norm = T5LayerNorm(dim)
255
+
256
+ # initialize weights
257
+ self.apply(init_weights)
258
+
259
+ def forward(self, ids, mask=None):
260
+ x = self.token_embedding(ids)
261
+ x = self.dropout(x)
262
+ e = self.pos_embedding(x.size(1), x.size(1)) if self.shared_pos else None
263
+ for block in self.blocks:
264
+ x = block(x, mask, pos_bias=e)
265
+ x = self.norm(x)
266
+ x = self.dropout(x)
267
+ return x
268
+
269
+
270
+ class T5Decoder(nn.Module):
271
+ def __init__(self, vocab, dim, dim_attn, dim_ffn, num_heads, num_layers, num_buckets, shared_pos=True, dropout=0.1):
272
+ super(T5Decoder, self).__init__()
273
+ self.dim = dim
274
+ self.dim_attn = dim_attn
275
+ self.dim_ffn = dim_ffn
276
+ self.num_heads = num_heads
277
+ self.num_layers = num_layers
278
+ self.num_buckets = num_buckets
279
+ self.shared_pos = shared_pos
280
+
281
+ # layers
282
+ self.token_embedding = vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
283
+ self.pos_embedding = T5RelativeEmbedding(num_buckets, num_heads, bidirectional=False) if shared_pos else None
284
+ self.dropout = nn.Dropout(dropout)
285
+ self.blocks = nn.ModuleList(
286
+ [
287
+ T5CrossAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout)
288
+ for _ in range(num_layers)
289
+ ]
290
+ )
291
+ self.norm = T5LayerNorm(dim)
292
+
293
+ # initialize weights
294
+ self.apply(init_weights)
295
+
296
+ def forward(self, ids, mask=None, encoder_states=None, encoder_mask=None):
297
+ b, s = ids.size()
298
+
299
+ # causal mask
300
+ if mask is None:
301
+ mask = torch.tril(torch.ones(1, s, s).to(ids.device))
302
+ elif mask.ndim == 2:
303
+ mask = torch.tril(mask.unsqueeze(1).expand(-1, s, -1))
304
+
305
+ # layers
306
+ x = self.token_embedding(ids)
307
+ x = self.dropout(x)
308
+ e = self.pos_embedding(x.size(1), x.size(1)) if self.shared_pos else None
309
+ for block in self.blocks:
310
+ x = block(x, mask, encoder_states, encoder_mask, pos_bias=e)
311
+ x = self.norm(x)
312
+ x = self.dropout(x)
313
+ return x
314
+
315
+
316
+ class T5Model(nn.Module):
317
+ def __init__(
318
+ self,
319
+ vocab_size,
320
+ dim,
321
+ dim_attn,
322
+ dim_ffn,
323
+ num_heads,
324
+ encoder_layers,
325
+ decoder_layers,
326
+ num_buckets,
327
+ shared_pos=True,
328
+ dropout=0.1,
329
+ ):
330
+ super(T5Model, self).__init__()
331
+ self.vocab_size = vocab_size
332
+ self.dim = dim
333
+ self.dim_attn = dim_attn
334
+ self.dim_ffn = dim_ffn
335
+ self.num_heads = num_heads
336
+ self.encoder_layers = encoder_layers
337
+ self.decoder_layers = decoder_layers
338
+ self.num_buckets = num_buckets
339
+
340
+ # layers
341
+ self.token_embedding = nn.Embedding(vocab_size, dim)
342
+ self.encoder = T5Encoder(
343
+ self.token_embedding, dim, dim_attn, dim_ffn, num_heads, encoder_layers, num_buckets, shared_pos, dropout
344
+ )
345
+ self.decoder = T5Decoder(
346
+ self.token_embedding, dim, dim_attn, dim_ffn, num_heads, decoder_layers, num_buckets, shared_pos, dropout
347
+ )
348
+ self.head = nn.Linear(dim, vocab_size, bias=False)
349
+
350
+ # initialize weights
351
+ self.apply(init_weights)
352
+
353
+ def forward(self, encoder_ids, encoder_mask, decoder_ids, decoder_mask):
354
+ x = self.encoder(encoder_ids, encoder_mask)
355
+ x = self.decoder(decoder_ids, decoder_mask, x, encoder_mask)
356
+ x = self.head(x)
357
+ return x
358
+
359
+
360
+ def _t5(
361
+ name,
362
+ encoder_only=False,
363
+ decoder_only=False,
364
+ return_tokenizer=False,
365
+ tokenizer_kwargs={},
366
+ dtype=torch.float32,
367
+ device="cpu",
368
+ **kwargs,
369
+ ):
370
+ # sanity check
371
+ assert not (encoder_only and decoder_only)
372
+
373
+ # params
374
+ if encoder_only:
375
+ model_cls = T5Encoder
376
+ kwargs["vocab"] = kwargs.pop("vocab_size")
377
+ kwargs["num_layers"] = kwargs.pop("encoder_layers")
378
+ _ = kwargs.pop("decoder_layers")
379
+ elif decoder_only:
380
+ model_cls = T5Decoder
381
+ kwargs["vocab"] = kwargs.pop("vocab_size")
382
+ kwargs["num_layers"] = kwargs.pop("decoder_layers")
383
+ _ = kwargs.pop("encoder_layers")
384
+ else:
385
+ model_cls = T5Model
386
+
387
+ # init model
388
+ with torch.device(device):
389
+ model = model_cls(**kwargs)
390
+
391
+ # set device
392
+ model = model.to(dtype=dtype, device=device)
393
+
394
+ # init tokenizer
395
+ if return_tokenizer:
396
+ from .tokenizers import HuggingfaceTokenizer
397
+
398
+ tokenizer = HuggingfaceTokenizer(f"google/{name}", **tokenizer_kwargs)
399
+ return model, tokenizer
400
+ else:
401
+ return model
402
+
403
+
404
+ def umt5_xxl(**kwargs):
405
+ cfg = dict(
406
+ vocab_size=256384,
407
+ dim=4096,
408
+ dim_attn=4096,
409
+ dim_ffn=10240,
410
+ num_heads=64,
411
+ encoder_layers=24,
412
+ decoder_layers=24,
413
+ num_buckets=32,
414
+ shared_pos=False,
415
+ dropout=0.1,
416
+ )
417
+ cfg.update(**kwargs)
418
+ return _t5("umt5-xxl", **cfg)
419
+
420
+
421
+ class T5EncoderModel(ModelMixin):
422
+ def __init__(
423
+ self,
424
+ checkpoint_path=None,
425
+ tokenizer_path=None,
426
+ text_len=512,
427
+ shard_fn=None,
428
+ ):
429
+ self.text_len = text_len
430
+ self.checkpoint_path = checkpoint_path
431
+ self.tokenizer_path = tokenizer_path
432
+
433
+ super().__init__()
434
+ # init model
435
+ model = umt5_xxl(encoder_only=True, return_tokenizer=False)
436
+ logging.info(f"loading {checkpoint_path}")
437
+ model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
438
+ self.model = model
439
+ if shard_fn is not None:
440
+ self.model = shard_fn(self.model, sync_module_states=False)
441
+ else:
442
+ self.model.eval().requires_grad_(False)
443
+ # init tokenizer
444
+ self.tokenizer = HuggingfaceTokenizer(name=tokenizer_path, seq_len=text_len, clean="whitespace")
445
+
446
+ def encode(self, texts):
447
+ ids, mask = self.tokenizer(texts, return_mask=True, add_special_tokens=True)
448
+ ids = ids.to(self.device)
449
+ mask = mask.to(self.device)
450
+ # seq_lens = mask.gt(0).sum(dim=1).long()
451
+ context = self.model(ids, mask)
452
+ context = context * mask.unsqueeze(-1).cuda()
453
+
454
+ return context
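
T5RelativeEmbedding maps signed token distances to a fixed number of buckets: small distances get exact buckets, larger ones are log-spaced up to max_dist. A standalone re-derivation of the bidirectional case, re-implemented here purely for illustration (num_buckets=32 and max_dist=128 as in the umt5_xxl config):

# Standalone sketch of the bidirectional bucketing in T5RelativeEmbedding.
import math
import torch

def bucket(rel_pos, num_buckets=32, max_dist=128):
    nb = num_buckets // 2                                  # one half of the buckets per sign
    out = (rel_pos > 0).long() * nb
    rel = rel_pos.abs()
    max_exact = nb // 2                                    # small distances kept exact
    large = max_exact + (
        torch.log(rel.float() / max_exact) / math.log(max_dist / max_exact) * (nb - max_exact)
    ).long()
    large = torch.min(large, torch.full_like(large, nb - 1))
    return out + torch.where(rel < max_exact, rel, large)

print(bucket(torch.tensor([-64, -8, -1, 0, 1, 8, 64])))
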
skyreels_v2_infer/modules/tokenizers.py ADDED
@@ -0,0 +1,78 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import html
3
+ import string
4
+
5
+ import ftfy
6
+ import regex as re
7
+ from transformers import AutoTokenizer
8
+
9
+ __all__ = ["HuggingfaceTokenizer"]
10
+
11
+
12
+ def basic_clean(text):
13
+ text = ftfy.fix_text(text)
14
+ text = html.unescape(html.unescape(text))
15
+ return text.strip()
16
+
17
+
18
+ def whitespace_clean(text):
19
+ text = re.sub(r"\s+", " ", text)
20
+ text = text.strip()
21
+ return text
22
+
23
+
24
+ def canonicalize(text, keep_punctuation_exact_string=None):
25
+ text = text.replace("_", " ")
26
+ if keep_punctuation_exact_string:
27
+ text = keep_punctuation_exact_string.join(
28
+ part.translate(str.maketrans("", "", string.punctuation))
29
+ for part in text.split(keep_punctuation_exact_string)
30
+ )
31
+ else:
32
+ text = text.translate(str.maketrans("", "", string.punctuation))
33
+ text = text.lower()
34
+ text = re.sub(r"\s+", " ", text)
35
+ return text.strip()
36
+
37
+
38
+ class HuggingfaceTokenizer:
39
+ def __init__(self, name, seq_len=None, clean=None, **kwargs):
40
+ assert clean in (None, "whitespace", "lower", "canonicalize")
41
+ self.name = name
42
+ self.seq_len = seq_len
43
+ self.clean = clean
44
+
45
+ # init tokenizer
46
+ self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
47
+ self.vocab_size = self.tokenizer.vocab_size
48
+
49
+ def __call__(self, sequence, **kwargs):
50
+ return_mask = kwargs.pop("return_mask", False)
51
+
52
+ # arguments
53
+ _kwargs = {"return_tensors": "pt"}
54
+ if self.seq_len is not None:
55
+ _kwargs.update({"padding": "max_length", "truncation": True, "max_length": self.seq_len})
56
+ _kwargs.update(**kwargs)
57
+
58
+ # tokenization
59
+ if isinstance(sequence, str):
60
+ sequence = [sequence]
61
+ if self.clean:
62
+ sequence = [self._clean(u) for u in sequence]
63
+ ids = self.tokenizer(sequence, **_kwargs)
64
+
65
+ # output
66
+ if return_mask:
67
+ return ids.input_ids, ids.attention_mask
68
+ else:
69
+ return ids.input_ids
70
+
71
+ def _clean(self, text):
72
+ if self.clean == "whitespace":
73
+ text = whitespace_clean(basic_clean(text))
74
+ elif self.clean == "lower":
75
+ text = whitespace_clean(basic_clean(text)).lower()
76
+ elif self.clean == "canonicalize":
77
+ text = canonicalize(basic_clean(text))
78
+ return text
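
HuggingfaceTokenizer wraps AutoTokenizer with optional text cleaning and fixed-length padding. A usage sketch mirroring how T5EncoderModel calls it; the tokenizer name and seq_len are illustrative (the pipeline points it at a local copy of google/umt5-xxl):

# Sketch: tokenizing a prompt the way T5EncoderModel does (name / seq_len are assumptions).
from skyreels_v2_infer.modules.tokenizers import HuggingfaceTokenizer

tok = HuggingfaceTokenizer(name="google/umt5-xxl", seq_len=512, clean="whitespace")
ids, mask = tok(["A cinematic drone shot over a rocky coastline."], return_mask=True)
print(ids.shape, int(mask.sum()))   # padded ids of shape [1, 512] and the real token count
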
skyreels_v2_infer/modules/transformer.py ADDED
@@ -0,0 +1,839 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ import torch.amp as amp
6
+ import torch.nn as nn
7
+ from diffusers.configuration_utils import ConfigMixin
8
+ from diffusers.configuration_utils import register_to_config
9
+ from diffusers.loaders import PeftAdapterMixin
10
+ from diffusers.models.modeling_utils import ModelMixin
11
+ from torch.backends.cuda import sdp_kernel
12
+ from torch.nn.attention.flex_attention import BlockMask
13
+ from torch.nn.attention.flex_attention import create_block_mask
14
+ from torch.nn.attention.flex_attention import flex_attention
15
+
16
+ from .attention import flash_attention
17
+
18
+
19
+ flex_attention = torch.compile(flex_attention, dynamic=False, mode="max-autotune")
20
+
21
+ DISABLE_COMPILE = False # could be driven by an environment variable to turn off torch.compile
22
+
23
+ __all__ = ["WanModel"]
24
+
25
+
26
+ def sinusoidal_embedding_1d(dim, position):
27
+ # preprocess
28
+ assert dim % 2 == 0
29
+ half = dim // 2
30
+ position = position.type(torch.float64)
31
+
32
+ # calculation
33
+ sinusoid = torch.outer(position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
34
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
35
+ return x
36
+
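# Editor's illustration (not part of the uploaded file): shape check for the sinusoidal
# timestep embedding. For dim=256 and four timesteps the output is [4, 256] in float64,
# cosine terms in the first 128 channels and sine terms in the last 128.
_t = torch.tensor([0, 250, 500, 999])
_emb = sinusoidal_embedding_1d(256, _t)
assert _emb.shape == (4, 256) and _emb.dtype == torch.float64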
37
+
38
+ @amp.autocast("cuda", enabled=False)
39
+ def rope_params(max_seq_len, dim, theta=10000):
40
+ assert dim % 2 == 0
41
+ freqs = torch.outer(
42
+ torch.arange(max_seq_len), 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float32).div(dim))
43
+ )
44
+ freqs = torch.polar(torch.ones_like(freqs), freqs)
45
+ return freqs
46
+
47
+
48
+ @amp.autocast("cuda", enabled=False)
49
+ def rope_apply(x, grid_sizes, freqs):
50
+ n, c = x.size(2), x.size(3) // 2
51
+ bs = x.size(0)
52
+
53
+ # split freqs
54
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
55
+
56
+ # loop over samples
57
+ f, h, w = grid_sizes.tolist()
58
+ seq_len = f * h * w
59
+
60
+ # precompute multipliers
61
+
62
+ x = torch.view_as_complex(x.to(torch.float32).reshape(bs, seq_len, n, -1, 2))
63
+ freqs_i = torch.cat(
64
+ [
65
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
66
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
67
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
68
+ ],
69
+ dim=-1,
70
+ ).reshape(seq_len, 1, -1)
71
+
72
+ # apply rotary embedding
73
+ x = torch.view_as_real(x * freqs_i).flatten(3)
74
+
75
+ return x
76
+
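# Editor's illustration (not part of the uploaded file): shape walk-through for rope_apply.
# With head_dim=64 the frequency table, built the same way as in WanModel.__init__, has
# half-dim 32, split 12/10/10 over the frame/height/width axes.
_d = 64
_freqs = torch.cat(
    [rope_params(1024, _d - 4 * (_d // 6)), rope_params(1024, 2 * (_d // 6)), rope_params(1024, 2 * (_d // 6))],
    dim=1,
)
_x = torch.randn(1, 4 * 6 * 6, 8, _d)  # [B, F*H*W, num_heads, head_dim]
_out = rope_apply(_x, torch.tensor([4, 6, 6]), _freqs)
assert _out.shape == _x.shape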
77
+
78
+ @torch.compile(dynamic=True, disable=DISABLE_COMPILE)
79
+ def fast_rms_norm(x, weight, eps):
80
+ x = x.float()
81
+ x = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
82
+ x = x.type_as(weight) * weight # cast back from fp32 to the module dtype before applying the learned gain
83
+ return x
84
+
85
+
86
+ class WanRMSNorm(nn.Module):
87
+ def __init__(self, dim, eps=1e-5):
88
+ super().__init__()
89
+ self.dim = dim
90
+ self.eps = eps
91
+ self.weight = nn.Parameter(torch.ones(dim))
92
+
93
+ def forward(self, x):
94
+ r"""
95
+ Args:
96
+ x(Tensor): Shape [B, L, C]
97
+ """
98
+ return fast_rms_norm(x, self.weight, self.eps)
99
+
100
+ def _norm(self, x):
101
+ return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
102
+
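# Editor's illustration (not part of the uploaded file): WanRMSNorm follows the textbook
# RMSNorm, y = x * rsqrt(mean(x^2) + eps) * gain; the check below uses the un-compiled
# _norm helper so it runs without a torch.compile backend.
_norm_layer = WanRMSNorm(dim=16)
_xn = torch.randn(2, 5, 16)
_ref = _xn * torch.rsqrt(_xn.pow(2).mean(dim=-1, keepdim=True) + _norm_layer.eps)
assert torch.allclose(_norm_layer._norm(_xn), _ref)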
103
+
104
+ class WanLayerNorm(nn.LayerNorm):
105
+ def __init__(self, dim, eps=1e-6, elementwise_affine=False):
106
+ super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)
107
+
108
+ def forward(self, x):
109
+ r"""
110
+ Args:
111
+ x(Tensor): Shape [B, L, C]
112
+ """
113
+ return super().forward(x)
114
+
115
+
116
+ class WanSelfAttention(nn.Module):
117
+ def __init__(self, dim, num_heads, window_size=(-1, -1), qk_norm=True, eps=1e-6):
118
+ assert dim % num_heads == 0
119
+ super().__init__()
120
+ self.dim = dim
121
+ self.num_heads = num_heads
122
+ self.head_dim = dim // num_heads
123
+ self.window_size = window_size
124
+ self.qk_norm = qk_norm
125
+ self.eps = eps
126
+
127
+ # layers
128
+ self.q = nn.Linear(dim, dim)
129
+ self.k = nn.Linear(dim, dim)
130
+ self.v = nn.Linear(dim, dim)
131
+ self.o = nn.Linear(dim, dim)
132
+ self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
133
+ self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
134
+
135
+ self._flag_ar_attention = False
136
+
137
+ def set_ar_attention(self):
138
+ self._flag_ar_attention = True
139
+
140
+ def forward(self, x, grid_sizes, freqs, block_mask):
141
+ r"""
142
+ Args:
143
+ x(Tensor): Shape [B, L, C]
144
+ block_mask: optional attention mask, used when autoregressive (AR) attention is enabled
145
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
146
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
147
+ """
148
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
149
+
150
+ # query, key, value function
151
+ def qkv_fn(x):
152
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
153
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
154
+ v = self.v(x).view(b, s, n, d)
155
+ return q, k, v
156
+
157
+ x = x.to(self.q.weight.dtype)
158
+ q, k, v = qkv_fn(x)
159
+
160
+ if not self._flag_ar_attention:
161
+ q = rope_apply(q, grid_sizes, freqs)
162
+ k = rope_apply(k, grid_sizes, freqs)
163
+ x = flash_attention(q=q, k=k, v=v, window_size=self.window_size)
164
+ else:
165
+ q = rope_apply(q, grid_sizes, freqs)
166
+ k = rope_apply(k, grid_sizes, freqs)
167
+ q = q.to(torch.bfloat16)
168
+ k = k.to(torch.bfloat16)
169
+ v = v.to(torch.bfloat16)
170
+
171
+ with sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
172
+ x = (
173
+ torch.nn.functional.scaled_dot_product_attention(
174
+ q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), attn_mask=block_mask
175
+ )
176
+ .transpose(1, 2)
177
+ .contiguous()
178
+ )
179
+
180
+ # output
181
+ x = x.flatten(2)
182
+ x = self.o(x)
183
+ return x
184
+
185
+
186
+ class WanT2VCrossAttention(WanSelfAttention):
187
+ def forward(self, x, context):
188
+ r"""
189
+ Args:
190
+ x(Tensor): Shape [B, L1, C]
191
+ context(Tensor): Shape [B, L2, C]
192
+ context_lens(Tensor): Shape [B]
193
+ """
194
+ b, n, d = x.size(0), self.num_heads, self.head_dim
195
+
196
+ # compute query, key, value
197
+ q = self.norm_q(self.q(x)).view(b, -1, n, d)
198
+ k = self.norm_k(self.k(context)).view(b, -1, n, d)
199
+ v = self.v(context).view(b, -1, n, d)
200
+
201
+ # compute attention
202
+ x = flash_attention(q, k, v)
203
+
204
+ # output
205
+ x = x.flatten(2)
206
+ x = self.o(x)
207
+ return x
208
+
209
+
210
+ class WanI2VCrossAttention(WanSelfAttention):
211
+ def __init__(self, dim, num_heads, window_size=(-1, -1), qk_norm=True, eps=1e-6):
212
+ super().__init__(dim, num_heads, window_size, qk_norm, eps)
213
+
214
+ self.k_img = nn.Linear(dim, dim)
215
+ self.v_img = nn.Linear(dim, dim)
216
+ # self.alpha = nn.Parameter(torch.zeros((1, )))
217
+ self.norm_k_img = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
218
+
219
+ def forward(self, x, context):
220
+ r"""
221
+ Args:
222
+ x(Tensor): Shape [B, L1, C]
223
+ context(Tensor): Shape [B, L2, C]
224
+ context_lens(Tensor): Shape [B]
225
+ """
226
+ context_img = context[:, :257]
227
+ context = context[:, 257:]
228
+ b, n, d = x.size(0), self.num_heads, self.head_dim
229
+
230
+ # compute query, key, value
231
+ q = self.norm_q(self.q(x)).view(b, -1, n, d)
232
+ k = self.norm_k(self.k(context)).view(b, -1, n, d)
233
+ v = self.v(context).view(b, -1, n, d)
234
+ k_img = self.norm_k_img(self.k_img(context_img)).view(b, -1, n, d)
235
+ v_img = self.v_img(context_img).view(b, -1, n, d)
236
+ img_x = flash_attention(q, k_img, v_img)
237
+ # compute attention
238
+ x = flash_attention(q, k, v)
239
+
240
+ # output
241
+ x = x.flatten(2)
242
+ img_x = img_x.flatten(2)
243
+ x = x + img_x
244
+ x = self.o(x)
245
+ return x
246
+
247
+
248
+ WAN_CROSSATTENTION_CLASSES = {
249
+ "t2v_cross_attn": WanT2VCrossAttention,
250
+ "i2v_cross_attn": WanI2VCrossAttention,
251
+ }
252
+
253
+
254
+ def mul_add(x, y, z):
255
+ return x.float() + y.float() * z.float()
256
+
257
+
258
+ def mul_add_add(x, y, z):
259
+ return x.float() * (1 + y) + z
260
+
261
+
262
+ mul_add_compile = torch.compile(mul_add, dynamic=True, disable=DISABLE_COMPILE)
263
+ mul_add_add_compile = torch.compile(mul_add_add, dynamic=True, disable=DISABLE_COMPILE)
264
+
265
+
266
+ class WanAttentionBlock(nn.Module):
267
+ def __init__(
268
+ self,
269
+ cross_attn_type,
270
+ dim,
271
+ ffn_dim,
272
+ num_heads,
273
+ window_size=(-1, -1),
274
+ qk_norm=True,
275
+ cross_attn_norm=False,
276
+ eps=1e-6,
277
+ ):
278
+ super().__init__()
279
+ self.dim = dim
280
+ self.ffn_dim = ffn_dim
281
+ self.num_heads = num_heads
282
+ self.window_size = window_size
283
+ self.qk_norm = qk_norm
284
+ self.cross_attn_norm = cross_attn_norm
285
+ self.eps = eps
286
+
287
+ # layers
288
+ self.norm1 = WanLayerNorm(dim, eps)
289
+ self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm, eps)
290
+ self.norm3 = WanLayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
291
+ self.cross_attn = WAN_CROSSATTENTION_CLASSES[cross_attn_type](dim, num_heads, (-1, -1), qk_norm, eps)
292
+ self.norm2 = WanLayerNorm(dim, eps)
293
+ self.ffn = nn.Sequential(nn.Linear(dim, ffn_dim), nn.GELU(approximate="tanh"), nn.Linear(ffn_dim, dim))
294
+
295
+ # modulation
296
+ self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
297
+
298
+ def set_ar_attention(self):
299
+ self.self_attn.set_ar_attention()
300
+
301
+ def forward(
302
+ self,
303
+ x,
304
+ e,
305
+ grid_sizes,
306
+ freqs,
307
+ context,
308
+ block_mask,
309
+ ):
310
+ r"""
311
+ Args:
312
+ x(Tensor): Shape [B, L, C]
313
+ e(Tensor): Shape [B, 6, C]
314
+ context(Tensor): Shape [B, L2, C], text (and optional image) conditioning tokens
315
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
316
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
317
+ """
318
+ if e.dim() == 3:
319
+ modulation = self.modulation # 1, 6, dim
320
+ with amp.autocast("cuda", dtype=torch.float32):
321
+ e = (modulation + e).chunk(6, dim=1)
322
+ elif e.dim() == 4:
323
+ modulation = self.modulation.unsqueeze(2) # 1, 6, 1, dim
324
+ with amp.autocast("cuda", dtype=torch.float32):
325
+ e = (modulation + e).chunk(6, dim=1)
326
+ e = [ei.squeeze(1) for ei in e]
327
+
328
+ # self-attention
329
+ out = mul_add_add_compile(self.norm1(x), e[1], e[0])
330
+ y = self.self_attn(out, grid_sizes, freqs, block_mask)
331
+ with amp.autocast("cuda", dtype=torch.float32):
332
+ x = mul_add_compile(x, y, e[2])
333
+
334
+ # cross-attention & ffn function
335
+ def cross_attn_ffn(x, context, e):
336
+ dtype = context.dtype
337
+ x = x + self.cross_attn(self.norm3(x.to(dtype)), context)
338
+ y = self.ffn(mul_add_add_compile(self.norm2(x), e[4], e[3]).to(dtype))
339
+ with amp.autocast("cuda", dtype=torch.float32):
340
+ x = mul_add_compile(x, y, e[5])
341
+ return x
342
+
343
+ x = cross_attn_ffn(x, context, e)
344
+ return x.to(torch.bfloat16)
345
+
346
+
347
+ class Head(nn.Module):
348
+ def __init__(self, dim, out_dim, patch_size, eps=1e-6):
349
+ super().__init__()
350
+ self.dim = dim
351
+ self.out_dim = out_dim
352
+ self.patch_size = patch_size
353
+ self.eps = eps
354
+
355
+ # layers
356
+ out_dim = math.prod(patch_size) * out_dim
357
+ self.norm = WanLayerNorm(dim, eps)
358
+ self.head = nn.Linear(dim, out_dim)
359
+
360
+ # modulation
361
+ self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
362
+
363
+ def forward(self, x, e):
364
+ r"""
365
+ Args:
366
+ x(Tensor): Shape [B, L1, C]
367
+ e(Tensor): Shape [B, C]
368
+ """
369
+ with amp.autocast("cuda", dtype=torch.float32):
370
+ if e.dim() == 2:
371
+ modulation = self.modulation # 1, 2, dim
372
+ e = (modulation + e.unsqueeze(1)).chunk(2, dim=1)
373
+
374
+ elif e.dim() == 3:
375
+ modulation = self.modulation.unsqueeze(2) # 1, 2, seq, dim
376
+ e = (modulation + e.unsqueeze(1)).chunk(2, dim=1)
377
+ e = [ei.squeeze(1) for ei in e]
378
+ x = self.head(self.norm(x) * (1 + e[1]) + e[0])
379
+ return x
380
+
381
+
382
+ class MLPProj(torch.nn.Module):
383
+ def __init__(self, in_dim, out_dim):
384
+ super().__init__()
385
+
386
+ self.proj = torch.nn.Sequential(
387
+ torch.nn.LayerNorm(in_dim),
388
+ torch.nn.Linear(in_dim, in_dim),
389
+ torch.nn.GELU(),
390
+ torch.nn.Linear(in_dim, out_dim),
391
+ torch.nn.LayerNorm(out_dim),
392
+ )
393
+
394
+ def forward(self, image_embeds):
395
+ clip_extra_context_tokens = self.proj(image_embeds)
396
+ return clip_extra_context_tokens
397
+
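# Editor's illustration (not part of the uploaded file): the projector maps the 257 CLIP
# image tokens (width 1280) to the transformer width for i2v conditioning.
_proj = MLPProj(in_dim=1280, out_dim=2048)
_clip_fea = torch.randn(2, 257, 1280)
assert _proj(_clip_fea).shape == (2, 257, 2048)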
398
+
399
+ class WanModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
400
+ r"""
401
+ Wan diffusion backbone supporting both text-to-video and image-to-video.
402
+ """
403
+
404
+ ignore_for_config = ["patch_size", "cross_attn_norm", "qk_norm", "text_dim", "window_size"]
405
+ _no_split_modules = ["WanAttentionBlock"]
406
+
407
+ _supports_gradient_checkpointing = True
408
+
409
+ @register_to_config
410
+ def __init__(
411
+ self,
412
+ model_type="t2v",
413
+ patch_size=(1, 2, 2),
414
+ text_len=512,
415
+ in_dim=16,
416
+ dim=2048,
417
+ ffn_dim=8192,
418
+ freq_dim=256,
419
+ text_dim=4096,
420
+ out_dim=16,
421
+ num_heads=16,
422
+ num_layers=32,
423
+ window_size=(-1, -1),
424
+ qk_norm=True,
425
+ cross_attn_norm=True,
426
+ inject_sample_info=False,
427
+ eps=1e-6,
428
+ ):
429
+ r"""
430
+ Initialize the diffusion model backbone.
431
+
432
+ Args:
433
+ model_type (`str`, *optional*, defaults to 't2v'):
434
+ Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
435
+ patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
436
+ 3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
437
+ text_len (`int`, *optional*, defaults to 512):
438
+ Fixed length for text embeddings
439
+ in_dim (`int`, *optional*, defaults to 16):
440
+ Input video channels (C_in)
441
+ dim (`int`, *optional*, defaults to 2048):
442
+ Hidden dimension of the transformer
443
+ ffn_dim (`int`, *optional*, defaults to 8192):
444
+ Intermediate dimension in feed-forward network
445
+ freq_dim (`int`, *optional*, defaults to 256):
446
+ Dimension for sinusoidal time embeddings
447
+ text_dim (`int`, *optional*, defaults to 4096):
448
+ Input dimension for text embeddings
449
+ out_dim (`int`, *optional*, defaults to 16):
450
+ Output video channels (C_out)
451
+ num_heads (`int`, *optional*, defaults to 16):
452
+ Number of attention heads
453
+ num_layers (`int`, *optional*, defaults to 32):
454
+ Number of transformer blocks
455
+ window_size (`tuple`, *optional*, defaults to (-1, -1)):
456
+ Window size for local attention (-1 indicates global attention)
457
+ qk_norm (`bool`, *optional*, defaults to True):
458
+ Enable query/key normalization
459
+ cross_attn_norm (`bool`, *optional*, defaults to True):
460
+ Enable cross-attention normalization
461
+ eps (`float`, *optional*, defaults to 1e-6):
462
+ Epsilon value for normalization layers
463
+ """
464
+
465
+ super().__init__()
466
+
467
+ assert model_type in ["t2v", "i2v"]
468
+ self.model_type = model_type
469
+
470
+ self.patch_size = patch_size
471
+ self.text_len = text_len
472
+ self.in_dim = in_dim
473
+ self.dim = dim
474
+ self.ffn_dim = ffn_dim
475
+ self.freq_dim = freq_dim
476
+ self.text_dim = text_dim
477
+ self.out_dim = out_dim
478
+ self.num_heads = num_heads
479
+ self.num_layers = num_layers
480
+ self.window_size = window_size
481
+ self.qk_norm = qk_norm
482
+ self.cross_attn_norm = cross_attn_norm
483
+ self.eps = eps
484
+ self.num_frame_per_block = 1
485
+ self.flag_causal_attention = False
486
+ self.block_mask = None
487
+ self.enable_teacache = False
488
+
489
+ # embeddings
490
+ self.patch_embedding = nn.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size)
491
+ self.text_embedding = nn.Sequential(nn.Linear(text_dim, dim), nn.GELU(approximate="tanh"), nn.Linear(dim, dim))
492
+
493
+ self.time_embedding = nn.Sequential(nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
494
+ self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
495
+
496
+ if inject_sample_info:
497
+ self.fps_embedding = nn.Embedding(2, dim)
498
+ self.fps_projection = nn.Sequential(nn.Linear(dim, dim), nn.SiLU(), nn.Linear(dim, dim * 6))
499
+
500
+ # blocks
501
+ cross_attn_type = "t2v_cross_attn" if model_type == "t2v" else "i2v_cross_attn"
502
+ self.blocks = nn.ModuleList(
503
+ [
504
+ WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps)
505
+ for _ in range(num_layers)
506
+ ]
507
+ )
508
+
509
+ # head
510
+ self.head = Head(dim, out_dim, patch_size, eps)
511
+
512
+ # buffers (don't use register_buffer otherwise dtype will be changed in to())
513
+ assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
514
+ d = dim // num_heads
515
+ self.freqs = torch.cat(
516
+ [rope_params(1024, d - 4 * (d // 6)), rope_params(1024, 2 * (d // 6)), rope_params(1024, 2 * (d // 6))],
517
+ dim=1,
518
+ )
519
+
520
+ if model_type == "i2v":
521
+ self.img_emb = MLPProj(1280, dim)
522
+
523
+ self.gradient_checkpointing = False
524
+
525
+ self.cpu_offloading = False
526
+
527
+ self.inject_sample_info = inject_sample_info
528
+ # initialize weights
529
+ self.init_weights()
530
+
531
+ def _set_gradient_checkpointing(self, module, value=False):
532
+ self.gradient_checkpointing = value
533
+
534
+ def zero_init_i2v_cross_attn(self):
535
+ print("zero init i2v cross attn")
536
+ for i in range(self.num_layers):
537
+ self.blocks[i].cross_attn.v_img.weight.data.zero_()
538
+ self.blocks[i].cross_attn.v_img.bias.data.zero_()
539
+
540
+ @staticmethod
541
+ def _prepare_blockwise_causal_attn_mask(
542
+ device: torch.device | str, num_frames: int = 21, frame_seqlen: int = 1560, num_frame_per_block=1
543
+ ) -> BlockMask:
544
+ """
545
+ we will divide the token sequence into the following format
546
+ [1 latent frame] [1 latent frame] ... [1 latent frame]
547
+ We use flexattention to construct the attention mask
548
+ """
549
+ total_length = num_frames * frame_seqlen
550
+
551
+ # we do right padding to get to a multiple of 128
552
+ padded_length = math.ceil(total_length / 128) * 128 - total_length
553
+
554
+ ends = torch.zeros(total_length + padded_length, device=device, dtype=torch.long)
555
+
556
+ # Block-wise causal mask will attend to all elements that are before the end of the current chunk
557
+ frame_indices = torch.arange(start=0, end=total_length, step=frame_seqlen * num_frame_per_block, device=device)
558
+
559
+ for tmp in frame_indices:
560
+ ends[tmp : tmp + frame_seqlen * num_frame_per_block] = tmp + frame_seqlen * num_frame_per_block
561
+
562
+ def attention_mask(b, h, q_idx, kv_idx):
563
+ return (kv_idx < ends[q_idx]) | (q_idx == kv_idx)
564
+ # return ((kv_idx < total_length) & (q_idx < total_length)) | (q_idx == kv_idx) # bidirectional mask
565
+
566
+ block_mask = create_block_mask(
567
+ attention_mask,
568
+ B=None,
569
+ H=None,
570
+ Q_LEN=total_length + padded_length,
571
+ KV_LEN=total_length + padded_length,
572
+ _compile=False,
573
+ device=device,
574
+ )
575
+
576
+ return block_mask
577
+
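# Editor's illustration (not part of the uploaded file): the "ends" bookkeeping used above,
# on a toy scale. With 4 latent frames of 2 tokens each and 2 frames per causal block, every
# query may attend to all keys with kv_idx < ends[q_idx], i.e. the mask is lower
# block-triangular at the granularity of 2-frame chunks.
_frame_seqlen, _num_frames, _frames_per_block = 2, 4, 2
_ends = torch.zeros(_num_frames * _frame_seqlen, dtype=torch.long)
for _start in range(0, _num_frames * _frame_seqlen, _frame_seqlen * _frames_per_block):
    _ends[_start : _start + _frame_seqlen * _frames_per_block] = _start + _frame_seqlen * _frames_per_block
print(_ends.tolist())  # [4, 4, 4, 4, 8, 8, 8, 8]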
578
+ def initialize_teacache(self, enable_teacache=True, num_steps=25, teacache_thresh=0.15, use_ret_steps=False, ckpt_dir=''):
579
+ self.enable_teacache = enable_teacache
580
+ print('using teacache')
581
+ self.cnt = 0
582
+ self.num_steps = num_steps
583
+ self.teacache_thresh = teacache_thresh
584
+ self.accumulated_rel_l1_distance_even = 0
585
+ self.accumulated_rel_l1_distance_odd = 0
586
+ self.previous_e0_even = None
587
+ self.previous_e0_odd = None
588
+ self.previous_residual_even = None
589
+ self.previous_residual_odd = None
590
+ self.use_ref_steps = use_ret_steps
591
+ if "I2V" in ckpt_dir:
592
+ if use_ret_steps:
593
+ if '540P' in ckpt_dir:
594
+ self.coefficients = [ 2.57151496e+05, -3.54229917e+04, 1.40286849e+03, -1.35890334e+01, 1.32517977e-01]
595
+ if '720P' in ckpt_dir:
596
+ self.coefficients = [ 8.10705460e+03, 2.13393892e+03, -3.72934672e+02, 1.66203073e+01, -4.17769401e-02]
597
+ self.ret_steps = 5*2
598
+ self.cutoff_steps = num_steps*2
599
+ else:
600
+ if '540P' in ckpt_dir:
601
+ self.coefficients = [-3.02331670e+02, 2.23948934e+02, -5.25463970e+01, 5.87348440e+00, -2.01973289e-01]
602
+ if '720P' in ckpt_dir:
603
+ self.coefficients = [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
604
+ self.ret_steps = 1*2
605
+ self.cutoff_steps = num_steps*2 - 2
606
+ else:
607
+ if use_ret_steps:
608
+ if '1.3B' in ckpt_dir:
609
+ self.coefficients = [-5.21862437e+04, 9.23041404e+03, -5.28275948e+02, 1.36987616e+01, -4.99875664e-02]
610
+ if '14B' in ckpt_dir:
611
+ self.coefficients = [-3.03318725e+05, 4.90537029e+04, -2.65530556e+03, 5.87365115e+01, -3.15583525e-01]
612
+ self.ret_steps = 5*2
613
+ self.cutoff_steps = num_steps*2
614
+ else:
615
+ if '1.3B' in ckpt_dir:
616
+ self.coefficients = [2.39676752e+03, -1.31110545e+03, 2.01331979e+02, -8.29855975e+00, 1.37887774e-01]
617
+ if '14B' in ckpt_dir:
618
+ self.coefficients = [-5784.54975374, 5449.50911966, -1811.16591783, 256.27178429, -13.02252404]
619
+ self.ret_steps = 1*2
620
+ self.cutoff_steps = num_steps*2 - 2
621
+
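# Editor's illustration (not part of the uploaded file): the TeaCache skip rule. The
# accumulator adds a polynomial rescaling of the relative L1 change of the modulation input
# between steps, and the transformer blocks are skipped (reusing the cached residual) while
# the accumulator stays below teacache_thresh. The coefficients below are the 1.3B
# non-ret-steps set from initialize_teacache.
_rescale = np.poly1d([2.39676752e+03, -1.31110545e+03, 2.01331979e+02, -8.29855975e+00, 1.37887774e-01])
_rel_l1 = 0.01  # example relative change between consecutive conditional steps
print(_rescale(_rel_l1))  # contribution added to accumulated_rel_l1_distance_even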
622
+ def forward(self, x, t, context, clip_fea=None, y=None, fps=None):
623
+ r"""
624
+ Forward pass through the diffusion model
625
+
626
+ Args:
627
+ x (List[Tensor]):
628
+ List of input video tensors, each with shape [C_in, F, H, W]
629
+ t (Tensor):
630
+ Diffusion timesteps tensor of shape [B]
631
+ context (List[Tensor]):
632
+ List of text embeddings each with shape [L, C]
633
+ fps (`int`, *optional*):
634
+ Frame-rate indicator, used only when inject_sample_info is enabled
635
+ clip_fea (Tensor, *optional*):
636
+ CLIP image features for image-to-video mode
637
+ y (List[Tensor], *optional*):
638
+ Conditional video inputs for image-to-video mode, same shape as x
639
+
640
+ Returns:
641
+ List[Tensor]:
642
+ List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
643
+ """
644
+ if self.model_type == "i2v":
645
+ assert clip_fea is not None and y is not None
646
+ # params
647
+ device = self.patch_embedding.weight.device
648
+ if self.freqs.device != device:
649
+ self.freqs = self.freqs.to(device)
650
+
651
+ if y is not None:
652
+ x = torch.cat([x, y], dim=1)
653
+
654
+ # embeddings
655
+ x = self.patch_embedding(x)
656
+ grid_sizes = torch.tensor(x.shape[2:], dtype=torch.long)
657
+ x = x.flatten(2).transpose(1, 2)
658
+
659
+ if self.flag_causal_attention:
660
+ frame_num = grid_sizes[0]
661
+ height = grid_sizes[1]
662
+ width = grid_sizes[2]
663
+ block_num = frame_num // self.num_frame_per_block
664
+ range_tensor = torch.arange(block_num).view(-1, 1)
665
+ range_tensor = range_tensor.repeat(1, self.num_frame_per_block).flatten()
666
+ casual_mask = range_tensor.unsqueeze(0) <= range_tensor.unsqueeze(1) # f, f
667
+ casual_mask = casual_mask.view(frame_num, 1, 1, frame_num, 1, 1).to(x.device)
668
+ casual_mask = casual_mask.repeat(1, height, width, 1, height, width)
669
+ casual_mask = casual_mask.reshape(frame_num * height * width, frame_num * height * width)
670
+ self.block_mask = casual_mask.unsqueeze(0).unsqueeze(0)
671
+
672
+ # time embeddings
673
+ with amp.autocast("cuda", dtype=torch.float32):
674
+ if t.dim() == 2:
675
+ b, f = t.shape
676
+ _flag_df = True
677
+ else:
678
+ _flag_df = False
679
+
680
+ e = self.time_embedding(
681
+ sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(self.patch_embedding.weight.dtype)
682
+ ) # b, dim
683
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim)) # b, 6, dim
684
+
685
+ if self.inject_sample_info:
686
+ fps = torch.tensor(fps, dtype=torch.long, device=device)
687
+
688
+ fps_emb = self.fps_embedding(fps).float()
689
+ if _flag_df:
690
+ e0 = e0 + self.fps_projection(fps_emb).unflatten(1, (6, self.dim)).repeat(t.shape[1], 1, 1)
691
+ else:
692
+ e0 = e0 + self.fps_projection(fps_emb).unflatten(1, (6, self.dim))
693
+
694
+ if _flag_df:
695
+ e = e.view(b, f, 1, 1, self.dim)
696
+ e0 = e0.view(b, f, 1, 1, 6, self.dim)
697
+ e = e.repeat(1, 1, grid_sizes[1], grid_sizes[2], 1).flatten(1, 3)
698
+ e0 = e0.repeat(1, 1, grid_sizes[1], grid_sizes[2], 1, 1).flatten(1, 3)
699
+ e0 = e0.transpose(1, 2).contiguous()
700
+
701
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
702
+
703
+ # context
704
+ context = self.text_embedding(context)
705
+
706
+ if clip_fea is not None:
707
+ context_clip = self.img_emb(clip_fea) # bs x 257 x dim
708
+ context = torch.concat([context_clip, context], dim=1)
709
+
710
+ # arguments
711
+ kwargs = dict(e=e0, grid_sizes=grid_sizes, freqs=self.freqs, context=context, block_mask=self.block_mask)
712
+ if self.enable_teacache:
713
+ modulated_inp = e0 if self.use_ref_steps else e
714
+ # teacache
715
+ if self.cnt % 2 == 0: # even step -> conditional branch
716
+ self.is_even = True
717
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
718
+ should_calc_even = True
719
+ self.accumulated_rel_l1_distance_even = 0
720
+ else:
721
+ rescale_func = np.poly1d(self.coefficients)
722
+ self.accumulated_rel_l1_distance_even += rescale_func(((modulated_inp-self.previous_e0_even).abs().mean() / self.previous_e0_even.abs().mean()).cpu().item())
723
+ if self.accumulated_rel_l1_distance_even < self.teacache_thresh:
724
+ should_calc_even = False
725
+ else:
726
+ should_calc_even = True
727
+ self.accumulated_rel_l1_distance_even = 0
728
+ self.previous_e0_even = modulated_inp.clone()
729
+
730
+ else: # odd step -> unconditional branch
731
+ self.is_even = False
732
+ if self.cnt < self.ret_steps or self.cnt >= self.cutoff_steps:
733
+ should_calc_odd = True
734
+ self.accumulated_rel_l1_distance_odd = 0
735
+ else:
736
+ rescale_func = np.poly1d(self.coefficients)
737
+ self.accumulated_rel_l1_distance_odd += rescale_func(((modulated_inp-self.previous_e0_odd).abs().mean() / self.previous_e0_odd.abs().mean()).cpu().item())
738
+ if self.accumulated_rel_l1_distance_odd < self.teacache_thresh:
739
+ should_calc_odd = False
740
+ else:
741
+ should_calc_odd = True
742
+ self.accumulated_rel_l1_distance_odd = 0
743
+ self.previous_e0_odd = modulated_inp.clone()
744
+
745
+ if self.enable_teacache:
746
+ if self.is_even:
747
+ if not should_calc_even:
748
+ x += self.previous_residual_even
749
+ else:
750
+ ori_x = x.clone()
751
+ for block in self.blocks:
752
+ x = block(x, **kwargs)
753
+ self.previous_residual_even = x - ori_x
754
+ else:
755
+ if not should_calc_odd:
756
+ x += self.previous_residual_odd
757
+ else:
758
+ ori_x = x.clone()
759
+ for block in self.blocks:
760
+ x = block(x, **kwargs)
761
+ self.previous_residual_odd = x - ori_x
762
+
763
+ self.cnt += 1
764
+ if self.cnt >= self.num_steps:
765
+ self.cnt = 0
766
+ else:
767
+ for block in self.blocks:
768
+ x = block(x, **kwargs)
769
+
770
+ x = self.head(x, e)
771
+
772
+ # unpatchify
773
+ x = self.unpatchify(x, grid_sizes)
774
+
775
+ return x.float()
776
+
777
+ def unpatchify(self, x, grid_sizes):
778
+ r"""
779
+ Reconstruct video tensors from patch embeddings.
780
+
781
+ Args:
782
+ x (List[Tensor]):
783
+ List of patchified features, each with shape [L, C_out * prod(patch_size)]
784
+ grid_sizes (Tensor):
785
+ Original spatial-temporal grid dimensions before patching,
786
+ shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
787
+
788
+ Returns:
789
+ List[Tensor]:
790
+ Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
791
+ """
792
+
793
+ c = self.out_dim
794
+ bs = x.shape[0]
795
+ x = x.view(bs, *grid_sizes, *self.patch_size, c)
796
+ x = torch.einsum("bfhwpqrc->bcfphqwr", x)
797
+ x = x.reshape(bs, c, *[i * j for i, j in zip(grid_sizes, self.patch_size)])
798
+
799
+ return x
800
+
801
+ def set_ar_attention(self, causal_block_size):
802
+ self.num_frame_per_block = causal_block_size
803
+ self.flag_causal_attention = True
804
+ for block in self.blocks:
805
+ block.set_ar_attention()
806
+
807
+ def init_weights(self):
808
+ r"""
809
+ Initialize model parameters using Xavier initialization.
810
+ """
811
+
812
+ # basic init
813
+ for m in self.modules():
814
+ if isinstance(m, nn.Linear):
815
+ nn.init.xavier_uniform_(m.weight)
816
+ if m.bias is not None:
817
+ nn.init.zeros_(m.bias)
818
+
819
+ # init embeddings
820
+ nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
821
+ for m in self.text_embedding.modules():
822
+ if isinstance(m, nn.Linear):
823
+ nn.init.normal_(m.weight, std=0.02)
824
+ for m in self.time_embedding.modules():
825
+ if isinstance(m, nn.Linear):
826
+ nn.init.normal_(m.weight, std=0.02)
827
+
828
+ if self.inject_sample_info:
829
+ nn.init.normal_(self.fps_embedding.weight, std=0.02)
830
+
831
+ for m in self.fps_projection.modules():
832
+ if isinstance(m, nn.Linear):
833
+ nn.init.normal_(m.weight, std=0.02)
834
+
835
+ nn.init.zeros_(self.fps_projection[-1].weight)
836
+ nn.init.zeros_(self.fps_projection[-1].bias)
837
+
838
+ # init output layer
839
+ nn.init.zeros_(self.head.head.weight)
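A construction sketch from the editor (not part of the upload): the released checkpoints are loaded through the repository's get_transformer helper, but a tiny, randomly initialized variant is enough to sanity-check the architecture on CPU.

tiny = WanModel(model_type="t2v", dim=128, ffn_dim=256, num_heads=8, num_layers=1, text_dim=32)
print(sum(p.numel() for p in tiny.parameters()), "parameters in the toy configuration")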
skyreels_v2_infer/modules/vae.py ADDED
@@ -0,0 +1,639 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import logging
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+
9
+
10
+ __all__ = [
11
+ "WanVAE",
12
+ ]
13
+
14
+ CACHE_T = 2
15
+
16
+
17
+ class CausalConv3d(nn.Conv3d):
18
+ """
19
+ Causal 3D convolution.
20
+ """
21
+
22
+ def __init__(self, *args, **kwargs):
23
+ super().__init__(*args, **kwargs)
24
+ self._padding = (self.padding[2], self.padding[2], self.padding[1], self.padding[1], 2 * self.padding[0], 0)
25
+ self.padding = (0, 0, 0)
26
+
27
+ def forward(self, x, cache_x=None):
28
+ padding = list(self._padding)
29
+ if cache_x is not None and self._padding[4] > 0:
30
+ cache_x = cache_x.to(x.device)
31
+ x = torch.cat([cache_x, x], dim=2)
32
+ padding[4] -= cache_x.shape[2]
33
+ x = F.pad(x, padding)
34
+
35
+ return super().forward(x)
36
+
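# Editor's illustration (not part of the uploaded file): the causal conv pads only on the
# left of the time axis (2 * padding[0] frames), so the output has as many frames as the
# input and frame t never depends on frames after t.
_cconv = CausalConv3d(4, 8, kernel_size=3, padding=1)
_vx = torch.randn(1, 4, 5, 16, 16)  # [B, C, T, H, W]
assert _cconv(_vx).shape == (1, 8, 5, 16, 16)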
37
+
38
+ class RMS_norm(nn.Module):
39
+ def __init__(self, dim, channel_first=True, images=True, bias=False):
40
+ super().__init__()
41
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
42
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
43
+
44
+ self.channel_first = channel_first
45
+ self.scale = dim**0.5
46
+ self.gamma = nn.Parameter(torch.ones(shape))
47
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
48
+
49
+ def forward(self, x):
50
+ return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
51
+
52
+
53
+ class Upsample(nn.Upsample):
54
+ def forward(self, x):
55
+ """
56
+ Fix bfloat16 support for nearest neighbor interpolation.
57
+ """
58
+ return super().forward(x.float()).type_as(x)
59
+
60
+
61
+ class Resample(nn.Module):
62
+ def __init__(self, dim, mode):
63
+ assert mode in ("none", "upsample2d", "upsample3d", "downsample2d", "downsample3d")
64
+ super().__init__()
65
+ self.dim = dim
66
+ self.mode = mode
67
+
68
+ # layers
69
+ if mode == "upsample2d":
70
+ self.resample = nn.Sequential(
71
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, dim // 2, 3, padding=1)
72
+ )
73
+ elif mode == "upsample3d":
74
+ self.resample = nn.Sequential(
75
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, dim // 2, 3, padding=1)
76
+ )
77
+ self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
78
+
79
+ elif mode == "downsample2d":
80
+ self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
81
+ elif mode == "downsample3d":
82
+ self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
83
+ self.time_conv = CausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
84
+
85
+ else:
86
+ self.resample = nn.Identity()
87
+
88
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
89
+ b, c, t, h, w = x.size()
90
+ if self.mode == "upsample3d":
91
+ if feat_cache is not None:
92
+ idx = feat_idx[0]
93
+ if feat_cache[idx] is None:
94
+ feat_cache[idx] = "Rep"
95
+ feat_idx[0] += 1
96
+ else:
97
+
98
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
99
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
100
+ # cache last frame of last two chunk
101
+ cache_x = torch.cat(
102
+ [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2
103
+ )
104
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
105
+ cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
106
+ if feat_cache[idx] == "Rep":
107
+ x = self.time_conv(x)
108
+ else:
109
+ x = self.time_conv(x, feat_cache[idx])
110
+ feat_cache[idx] = cache_x
111
+ feat_idx[0] += 1
112
+
113
+ x = x.reshape(b, 2, c, t, h, w)
114
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
115
+ x = x.reshape(b, c, t * 2, h, w)
116
+ t = x.shape[2]
117
+ x = rearrange(x, "b c t h w -> (b t) c h w")
118
+ x = self.resample(x)
119
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
120
+
121
+ if self.mode == "downsample3d":
122
+ if feat_cache is not None:
123
+ idx = feat_idx[0]
124
+ if feat_cache[idx] is None:
125
+ feat_cache[idx] = x.clone()
126
+ feat_idx[0] += 1
127
+ else:
128
+
129
+ cache_x = x[:, :, -1:, :, :].clone()
130
+ # if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx]!='Rep':
131
+ # # cache last frame of last two chunk
132
+ # cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
133
+
134
+ x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
135
+ feat_cache[idx] = cache_x
136
+ feat_idx[0] += 1
137
+ return x
138
+
139
+ def init_weight(self, conv):
140
+ conv_weight = conv.weight
141
+ nn.init.zeros_(conv_weight)
142
+ c1, c2, t, h, w = conv_weight.size()
143
+ one_matrix = torch.eye(c1, c2)
144
+ init_matrix = one_matrix
145
+ nn.init.zeros_(conv_weight)
146
+ # conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
147
+ conv_weight.data[:, :, 1, 0, 0] = init_matrix # * 0.5
148
+ conv.weight.data.copy_(conv_weight)
149
+ nn.init.zeros_(conv.bias.data)
150
+
151
+ def init_weight2(self, conv):
152
+ conv_weight = conv.weight.data
153
+ nn.init.zeros_(conv_weight)
154
+ c1, c2, t, h, w = conv_weight.size()
155
+ init_matrix = torch.eye(c1 // 2, c2)
156
+ # init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
157
+ conv_weight[: c1 // 2, :, -1, 0, 0] = init_matrix
158
+ conv_weight[c1 // 2 :, :, -1, 0, 0] = init_matrix
159
+ conv.weight.data.copy_(conv_weight)
160
+ nn.init.zeros_(conv.bias.data)
161
+
162
+
163
+ class ResidualBlock(nn.Module):
164
+ def __init__(self, in_dim, out_dim, dropout=0.0):
165
+ super().__init__()
166
+ self.in_dim = in_dim
167
+ self.out_dim = out_dim
168
+
169
+ # layers
170
+ self.residual = nn.Sequential(
171
+ RMS_norm(in_dim, images=False),
172
+ nn.SiLU(),
173
+ CausalConv3d(in_dim, out_dim, 3, padding=1),
174
+ RMS_norm(out_dim, images=False),
175
+ nn.SiLU(),
176
+ nn.Dropout(dropout),
177
+ CausalConv3d(out_dim, out_dim, 3, padding=1),
178
+ )
179
+ self.shortcut = CausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
180
+
181
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
182
+ h = self.shortcut(x)
183
+ for layer in self.residual:
184
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
185
+ idx = feat_idx[0]
186
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
187
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
188
+ # cache last frame of last two chunk
189
+ cache_x = torch.cat(
190
+ [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2
191
+ )
192
+ x = layer(x, feat_cache[idx])
193
+ feat_cache[idx] = cache_x
194
+ feat_idx[0] += 1
195
+ else:
196
+ x = layer(x)
197
+ return x + h
198
+
199
+
200
+ class AttentionBlock(nn.Module):
201
+ """
202
+ Causal self-attention with a single head.
203
+ """
204
+
205
+ def __init__(self, dim):
206
+ super().__init__()
207
+ self.dim = dim
208
+
209
+ # layers
210
+ self.norm = RMS_norm(dim)
211
+ self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
212
+ self.proj = nn.Conv2d(dim, dim, 1)
213
+
214
+ # zero out the last layer params
215
+ nn.init.zeros_(self.proj.weight)
216
+
217
+ def forward(self, x):
218
+ identity = x
219
+ b, c, t, h, w = x.size()
220
+ x = rearrange(x, "b c t h w -> (b t) c h w")
221
+ x = self.norm(x)
222
+ # compute query, key, value
223
+ q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3, -1).permute(0, 1, 3, 2).contiguous().chunk(3, dim=-1)
224
+
225
+ # apply attention
226
+ x = F.scaled_dot_product_attention(
227
+ q,
228
+ k,
229
+ v,
230
+ )
231
+ x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
232
+
233
+ # output
234
+ x = self.proj(x)
235
+ x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
236
+ return x + identity
237
+
238
+
239
+ class Encoder3d(nn.Module):
240
+ def __init__(
241
+ self,
242
+ dim=128,
243
+ z_dim=4,
244
+ dim_mult=[1, 2, 4, 4],
245
+ num_res_blocks=2,
246
+ attn_scales=[],
247
+ temperal_downsample=[True, True, False],
248
+ dropout=0.0,
249
+ ):
250
+ super().__init__()
251
+ self.dim = dim
252
+ self.z_dim = z_dim
253
+ self.dim_mult = dim_mult
254
+ self.num_res_blocks = num_res_blocks
255
+ self.attn_scales = attn_scales
256
+ self.temperal_downsample = temperal_downsample
257
+
258
+ # dimensions
259
+ dims = [dim * u for u in [1] + dim_mult]
260
+ scale = 1.0
261
+
262
+ # init block
263
+ self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
264
+
265
+ # downsample blocks
266
+ downsamples = []
267
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
268
+ # residual (+attention) blocks
269
+ for _ in range(num_res_blocks):
270
+ downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
271
+ if scale in attn_scales:
272
+ downsamples.append(AttentionBlock(out_dim))
273
+ in_dim = out_dim
274
+
275
+ # downsample block
276
+ if i != len(dim_mult) - 1:
277
+ mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
278
+ downsamples.append(Resample(out_dim, mode=mode))
279
+ scale /= 2.0
280
+ self.downsamples = nn.Sequential(*downsamples)
281
+
282
+ # middle blocks
283
+ self.middle = nn.Sequential(
284
+ ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim), ResidualBlock(out_dim, out_dim, dropout)
285
+ )
286
+
287
+ # output blocks
288
+ self.head = nn.Sequential(
289
+ RMS_norm(out_dim, images=False), nn.SiLU(), CausalConv3d(out_dim, z_dim, 3, padding=1)
290
+ )
291
+
292
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
293
+ if feat_cache is not None:
294
+ idx = feat_idx[0]
295
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
296
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
297
+ # cache last frame of last two chunk
298
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
299
+ x = self.conv1(x, feat_cache[idx])
300
+ feat_cache[idx] = cache_x
301
+ feat_idx[0] += 1
302
+ else:
303
+ x = self.conv1(x)
304
+
305
+ ## downsamples
306
+ for layer in self.downsamples:
307
+ if feat_cache is not None:
308
+ x = layer(x, feat_cache, feat_idx)
309
+ else:
310
+ x = layer(x)
311
+
312
+ ## middle
313
+ for layer in self.middle:
314
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
315
+ x = layer(x, feat_cache, feat_idx)
316
+ else:
317
+ x = layer(x)
318
+
319
+ ## head
320
+ for layer in self.head:
321
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
322
+ idx = feat_idx[0]
323
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
324
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
325
+ # cache last frame of last two chunk
326
+ cache_x = torch.cat(
327
+ [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2
328
+ )
329
+ x = layer(x, feat_cache[idx])
330
+ feat_cache[idx] = cache_x
331
+ feat_idx[0] += 1
332
+ else:
333
+ x = layer(x)
334
+ return x
335
+
336
+
337
+ class Decoder3d(nn.Module):
338
+ def __init__(
339
+ self,
340
+ dim=128,
341
+ z_dim=4,
342
+ dim_mult=[1, 2, 4, 4],
343
+ num_res_blocks=2,
344
+ attn_scales=[],
345
+ temperal_upsample=[False, True, True],
346
+ dropout=0.0,
347
+ ):
348
+ super().__init__()
349
+ self.dim = dim
350
+ self.z_dim = z_dim
351
+ self.dim_mult = dim_mult
352
+ self.num_res_blocks = num_res_blocks
353
+ self.attn_scales = attn_scales
354
+ self.temperal_upsample = temperal_upsample
355
+
356
+ # dimensions
357
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
358
+ scale = 1.0 / 2 ** (len(dim_mult) - 2)
359
+
360
+ # init block
361
+ self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
362
+
363
+ # middle blocks
364
+ self.middle = nn.Sequential(
365
+ ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]), ResidualBlock(dims[0], dims[0], dropout)
366
+ )
367
+
368
+ # upsample blocks
369
+ upsamples = []
370
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
371
+ # residual (+attention) blocks
372
+ if i == 1 or i == 2 or i == 3:
373
+ in_dim = in_dim // 2
374
+ for _ in range(num_res_blocks + 1):
375
+ upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
376
+ if scale in attn_scales:
377
+ upsamples.append(AttentionBlock(out_dim))
378
+ in_dim = out_dim
379
+
380
+ # upsample block
381
+ if i != len(dim_mult) - 1:
382
+ mode = "upsample3d" if temperal_upsample[i] else "upsample2d"
383
+ upsamples.append(Resample(out_dim, mode=mode))
384
+ scale *= 2.0
385
+ self.upsamples = nn.Sequential(*upsamples)
386
+
387
+ # output blocks
388
+ self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(), CausalConv3d(out_dim, 3, 3, padding=1))
389
+
390
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
391
+ ## conv1
392
+ if feat_cache is not None:
393
+ idx = feat_idx[0]
394
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
395
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
396
+ # cache last frame of last two chunk
397
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
398
+ x = self.conv1(x, feat_cache[idx])
399
+ feat_cache[idx] = cache_x
400
+ feat_idx[0] += 1
401
+ else:
402
+ x = self.conv1(x)
403
+
404
+ ## middle
405
+ for layer in self.middle:
406
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
407
+ x = layer(x, feat_cache, feat_idx)
408
+ else:
409
+ x = layer(x)
410
+
411
+ ## upsamples
412
+ for layer in self.upsamples:
413
+ if feat_cache is not None:
414
+ x = layer(x, feat_cache, feat_idx)
415
+ else:
416
+ x = layer(x)
417
+
418
+ ## head
419
+ for layer in self.head:
420
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
421
+ idx = feat_idx[0]
422
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
423
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
424
+ # cache last frame of last two chunk
425
+ cache_x = torch.cat(
426
+ [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2
427
+ )
428
+ x = layer(x, feat_cache[idx])
429
+ feat_cache[idx] = cache_x
430
+ feat_idx[0] += 1
431
+ else:
432
+ x = layer(x)
433
+ return x
434
+
435
+
436
+ def count_conv3d(model):
437
+ count = 0
438
+ for m in model.modules():
439
+ if isinstance(m, CausalConv3d):
440
+ count += 1
441
+ return count
442
+
443
+
444
+ class WanVAE_(nn.Module):
445
+ def __init__(
446
+ self,
447
+ dim=128,
448
+ z_dim=4,
449
+ dim_mult=[1, 2, 4, 4],
450
+ num_res_blocks=2,
451
+ attn_scales=[],
452
+ temperal_downsample=[True, True, False],
453
+ dropout=0.0,
454
+ ):
455
+ super().__init__()
456
+ self.dim = dim
457
+ self.z_dim = z_dim
458
+ self.dim_mult = dim_mult
459
+ self.num_res_blocks = num_res_blocks
460
+ self.attn_scales = attn_scales
461
+ self.temperal_downsample = temperal_downsample
462
+ self.temperal_upsample = temperal_downsample[::-1]
463
+
464
+ # modules
465
+ self.encoder = Encoder3d(
466
+ dim, z_dim * 2, dim_mult, num_res_blocks, attn_scales, self.temperal_downsample, dropout
467
+ )
468
+ self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
469
+ self.conv2 = CausalConv3d(z_dim, z_dim, 1)
470
+ self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout)
471
+
472
+ def forward(self, x):
473
+ mu, log_var = self.encode(x)
474
+ z = self.reparameterize(mu, log_var)
475
+ x_recon = self.decode(z)
476
+ return x_recon, mu, log_var
477
+
478
+ def encode(self, x, scale):
479
+ self.clear_cache()
480
+ ## cache
481
+ t = x.shape[2]
482
+ iter_ = 1 + (t - 1) // 4
483
+ ## split the encoder input x along the time axis into chunks of 1, 4, 4, 4, ... frames
484
+ for i in range(iter_):
485
+ self._enc_conv_idx = [0]
486
+ if i == 0:
487
+ out = self.encoder(x[:, :, :1, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
488
+ else:
489
+ out_ = self.encoder(
490
+ x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
491
+ feat_cache=self._enc_feat_map,
492
+ feat_idx=self._enc_conv_idx,
493
+ )
494
+ out = torch.cat([out, out_], 2)
495
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
496
+ if isinstance(scale[0], torch.Tensor):
497
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(1, self.z_dim, 1, 1, 1)
498
+ else:
499
+ mu = (mu - scale[0]) * scale[1]
500
+ self.clear_cache()
501
+ return mu
502
+
503
+ def decode(self, z, scale):
504
+ self.clear_cache()
505
+ # z: [b,c,t,h,w]
506
+ if isinstance(scale[0], torch.Tensor):
507
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(1, self.z_dim, 1, 1, 1)
508
+ else:
509
+ z = z / scale[1] + scale[0]
510
+ iter_ = z.shape[2]
511
+ x = self.conv2(z)
512
+ for i in range(iter_):
513
+ self._conv_idx = [0]
514
+ if i == 0:
515
+ out = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
516
+ else:
517
+ out_ = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
518
+ out = torch.cat([out, out_], 2)
519
+ self.clear_cache()
520
+ return out
521
+
522
+ def reparameterize(self, mu, log_var):
523
+ std = torch.exp(0.5 * log_var)
524
+ eps = torch.randn_like(std)
525
+ return eps * std + mu
526
+
527
+ def sample(self, imgs, deterministic=False):
528
+ mu, log_var = self.encode(imgs)
529
+ if deterministic:
530
+ return mu
531
+ std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
532
+ return mu + std * torch.randn_like(std)
533
+
534
+ def clear_cache(self):
535
+ self._conv_num = count_conv3d(self.decoder)
536
+ self._conv_idx = [0]
537
+ self._feat_map = [None] * self._conv_num
538
+ # cache encode
539
+ self._enc_conv_num = count_conv3d(self.encoder)
540
+ self._enc_conv_idx = [0]
541
+ self._enc_feat_map = [None] * self._enc_conv_num
542
+
543
+
544
+ def _video_vae(pretrained_path=None, z_dim=None, device="cpu", **kwargs):
545
+ """
546
+ Autoencoder3d adapted from Stable Diffusion 1.x, 2.x and XL.
547
+ """
548
+ # params
549
+ cfg = dict(
550
+ dim=96,
551
+ z_dim=z_dim,
552
+ dim_mult=[1, 2, 4, 4],
553
+ num_res_blocks=2,
554
+ attn_scales=[],
555
+ temperal_downsample=[False, True, True],
556
+ dropout=0.0,
557
+ )
558
+ cfg.update(**kwargs)
559
+
560
+ # init model
561
+ with torch.device("meta"):
562
+ model = WanVAE_(**cfg)
563
+
564
+ # load checkpoint
565
+ logging.info(f"loading {pretrained_path}")
566
+ model.load_state_dict(torch.load(pretrained_path, map_location=device), assign=True)
567
+
568
+ return model
569
+
570
+
571
+ class WanVAE:
572
+ def __init__(self, vae_pth="cache/vae_step_411000.pth", z_dim=16):
573
+
574
+ mean = [
575
+ -0.7571,
576
+ -0.7089,
577
+ -0.9113,
578
+ 0.1075,
579
+ -0.1745,
580
+ 0.9653,
581
+ -0.1517,
582
+ 1.5508,
583
+ 0.4134,
584
+ -0.0715,
585
+ 0.5517,
586
+ -0.3632,
587
+ -0.1922,
588
+ -0.9497,
589
+ 0.2503,
590
+ -0.2921,
591
+ ]
592
+ std = [
593
+ 2.8184,
594
+ 1.4541,
595
+ 2.3275,
596
+ 2.6558,
597
+ 1.2196,
598
+ 1.7708,
599
+ 2.6052,
600
+ 2.0743,
601
+ 3.2687,
602
+ 2.1526,
603
+ 2.8652,
604
+ 1.5579,
605
+ 1.6382,
606
+ 1.1253,
607
+ 2.8251,
608
+ 1.9160,
609
+ ]
610
+ self.vae_stride = (4, 8, 8)
611
+ self.mean = torch.tensor(mean)
612
+ self.std = torch.tensor(std)
613
+ self.scale = [self.mean, 1.0 / self.std]
614
+
615
+ # init model
616
+ self.vae = (
617
+ _video_vae(
618
+ pretrained_path=vae_pth,
619
+ z_dim=z_dim,
620
+ )
621
+ .eval()
622
+ .requires_grad_(False)
623
+ )
624
+
625
+ def encode(self, video):
626
+ """
627
+ video: A tensor with shape [B, C, T, H, W], values expected in [-1, 1].
628
+ """
629
+ return self.vae.encode(video, self.scale).float()
630
+
631
+ def to(self, *args, **kwargs):
632
+ self.mean = self.mean.to(*args, **kwargs)
633
+ self.std = self.std.to(*args, **kwargs)
634
+ self.scale = [self.mean, 1.0 / self.std]
635
+ self.vae = self.vae.to(*args, **kwargs)
636
+ return self
637
+
638
+ def decode(self, z):
639
+ return self.vae.decode(z, self.scale).float().clamp_(-1, 1)
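A usage sketch from the editor (checkpoint path and device are placeholders): with the (4, 8, 8) stride declared above, a 17-frame 480x832 clip in [-1, 1] encodes to a [1, 16, 5, 60, 104] latent.

vae = WanVAE(vae_pth="path/to/Wan2.1_VAE.pth").to("cuda", dtype=torch.float32)
video = torch.rand(1, 3, 17, 480, 832, device="cuda") * 2 - 1  # [B, C, T, H, W] in [-1, 1]
latents = vae.encode(video)   # [1, 16, 5, 60, 104]
frames = vae.decode(latents)  # back to [1, 3, 17, 480, 832], clamped to [-1, 1]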
skyreels_v2_infer/modules/xlm_roberta.py ADDED
@@ -0,0 +1,165 @@
1
+ # Modified from transformers.models.xlm_roberta.modeling_xlm_roberta
2
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ __all__ = ["XLMRoberta", "xlm_roberta_large"]
8
+
9
+
10
+ class SelfAttention(nn.Module):
11
+ def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
12
+ assert dim % num_heads == 0
13
+ super().__init__()
14
+ self.dim = dim
15
+ self.num_heads = num_heads
16
+ self.head_dim = dim // num_heads
17
+ self.eps = eps
18
+
19
+ # layers
20
+ self.q = nn.Linear(dim, dim)
21
+ self.k = nn.Linear(dim, dim)
22
+ self.v = nn.Linear(dim, dim)
23
+ self.o = nn.Linear(dim, dim)
24
+ self.dropout = nn.Dropout(dropout)
25
+
26
+ def forward(self, x, mask):
27
+ """
28
+ x: [B, L, C].
29
+ """
30
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
31
+
32
+ # compute query, key, value
33
+ q = self.q(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
34
+ k = self.k(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
35
+ v = self.v(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
36
+
37
+ # compute attention
38
+ p = self.dropout.p if self.training else 0.0
39
+ x = F.scaled_dot_product_attention(q, k, v, mask, p)
40
+ x = x.permute(0, 2, 1, 3).reshape(b, s, c)
41
+
42
+ # output
43
+ x = self.o(x)
44
+ x = self.dropout(x)
45
+ return x
46
+
47
+
48
+ class AttentionBlock(nn.Module):
49
+ def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
50
+ super().__init__()
51
+ self.dim = dim
52
+ self.num_heads = num_heads
53
+ self.post_norm = post_norm
54
+ self.eps = eps
55
+
56
+ # layers
57
+ self.attn = SelfAttention(dim, num_heads, dropout, eps)
58
+ self.norm1 = nn.LayerNorm(dim, eps=eps)
59
+ self.ffn = nn.Sequential(nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim), nn.Dropout(dropout))
60
+ self.norm2 = nn.LayerNorm(dim, eps=eps)
61
+
62
+ def forward(self, x, mask):
63
+ if self.post_norm:
64
+ x = self.norm1(x + self.attn(x, mask))
65
+ x = self.norm2(x + self.ffn(x))
66
+ else:
67
+ x = x + self.attn(self.norm1(x), mask)
68
+ x = x + self.ffn(self.norm2(x))
69
+ return x
70
+
71
+
72
+ class XLMRoberta(nn.Module):
73
+ """
74
+ XLMRobertaModel with no pooler and no LM head.
75
+ """
76
+
77
+ def __init__(
78
+ self,
79
+ vocab_size=250002,
80
+ max_seq_len=514,
81
+ type_size=1,
82
+ pad_id=1,
83
+ dim=1024,
84
+ num_heads=16,
85
+ num_layers=24,
86
+ post_norm=True,
87
+ dropout=0.1,
88
+ eps=1e-5,
89
+ ):
90
+ super().__init__()
91
+ self.vocab_size = vocab_size
92
+ self.max_seq_len = max_seq_len
93
+ self.type_size = type_size
94
+ self.pad_id = pad_id
95
+ self.dim = dim
96
+ self.num_heads = num_heads
97
+ self.num_layers = num_layers
98
+ self.post_norm = post_norm
99
+ self.eps = eps
100
+
101
+ # embeddings
102
+ self.token_embedding = nn.Embedding(vocab_size, dim, padding_idx=pad_id)
103
+ self.type_embedding = nn.Embedding(type_size, dim)
104
+ self.pos_embedding = nn.Embedding(max_seq_len, dim, padding_idx=pad_id)
105
+ self.dropout = nn.Dropout(dropout)
106
+
107
+ # blocks
108
+ self.blocks = nn.ModuleList(
109
+ [AttentionBlock(dim, num_heads, post_norm, dropout, eps) for _ in range(num_layers)]
110
+ )
111
+
112
+ # norm layer
113
+ self.norm = nn.LayerNorm(dim, eps=eps)
114
+
115
+ def forward(self, ids):
116
+ """
117
+ ids: [B, L] of torch.LongTensor.
118
+ """
119
+ b, s = ids.shape
120
+ mask = ids.ne(self.pad_id).long()
121
+
122
+ # embeddings
123
+ x = (
124
+ self.token_embedding(ids)
125
+ + self.type_embedding(torch.zeros_like(ids))
126
+ + self.pos_embedding(self.pad_id + torch.cumsum(mask, dim=1) * mask)
127
+ )
128
+ if self.post_norm:
129
+ x = self.norm(x)
130
+ x = self.dropout(x)
131
+
132
+ # blocks
133
+ mask = torch.where(mask.view(b, 1, 1, s).gt(0), 0.0, torch.finfo(x.dtype).min)
134
+ for block in self.blocks:
135
+ x = block(x, mask)
136
+
137
+ # output
138
+ if not self.post_norm:
139
+ x = self.norm(x)
140
+ return x
141
+
142
+
143
+ def xlm_roberta_large(pretrained=False, return_tokenizer=False, device="cpu", **kwargs):
144
+ """
145
+ XLMRobertaLarge adapted from Huggingface.
146
+ """
147
+ # params
148
+ cfg = dict(
149
+ vocab_size=250002,
150
+ max_seq_len=514,
151
+ type_size=1,
152
+ pad_id=1,
153
+ dim=1024,
154
+ num_heads=16,
155
+ num_layers=24,
156
+ post_norm=True,
157
+ dropout=0.1,
158
+ eps=1e-5,
159
+ )
160
+ cfg.update(**kwargs)
161
+
162
+ # init a model on device
163
+ with torch.device(device):
164
+ model = XLMRoberta(**cfg)
165
+ return model
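An instantiation sketch from the editor (not part of the upload): the returned encoder is randomly initialized, and note that the pretrained and return_tokenizer flags are accepted but not acted on in this snippet.

model = xlm_roberta_large()              # bare encoder, roughly 560M parameters
ids = torch.randint(2, 250002, (2, 12))  # dummy token ids, avoiding pad_id=1
feats = model(ids)                       # [2, 12, 1024] contextual features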
skyreels_v2_infer/pipelines/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .diffusion_forcing_pipeline import DiffusionForcingPipeline
2
+ from .image2video_pipeline import Image2VideoPipeline
3
+ from .image2video_pipeline import resizecrop
4
+ from .prompt_enhancer import PromptEnhancer
5
+ from .text2video_pipeline import Text2VideoPipeline
skyreels_v2_infer/pipelines/diffusion_forcing_pipeline.py ADDED
@@ -0,0 +1,659 @@
1
+ import math
2
+ import os
3
+ from typing import List
4
+ from typing import Optional
5
+ from typing import Tuple
6
+ from typing import Union
7
+
8
+ import numpy as np
9
+ import torch
10
+ from diffusers.image_processor import PipelineImageInput
11
+ from diffusers.utils.torch_utils import randn_tensor
12
+ from diffusers.video_processor import VideoProcessor
13
+ from tqdm import tqdm
14
+ import decord
15
+ from decord import VideoReader
16
+
17
+ from ..modules import get_text_encoder
18
+ from ..modules import get_transformer
19
+ from ..modules import get_vae
20
+ from ..scheduler.fm_solvers_unipc import FlowUniPCMultistepScheduler
21
+
22
+
23
+
24
+
25
+ class DiffusionForcingPipeline:
26
+ """
27
+ A pipeline for diffusion-based video generation tasks.
28
+
29
+ This pipeline supports two main tasks:
30
+ - Image-to-Video (i2v): Generates a video sequence from a source image
31
+ - Text-to-Video (t2v): Generates a video sequence from a text description
32
+
33
+ The pipeline integrates multiple components including:
34
+ - A transformer model for diffusion
35
+ - A VAE for encoding/decoding
36
+ - A text encoder for processing text prompts
37
+ - An image encoder for processing image inputs (i2v mode only)
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ model_path: str,
43
+ dit_path: str,
44
+ device: str = "cuda",
45
+ weight_dtype=torch.bfloat16,
46
+ use_usp=False,
47
+ offload=False,
48
+ ):
49
+ """
50
+ Initialize the diffusion forcing pipeline class
51
+
52
+ Args:
53
+ model_path (str): Path to the model
54
+ dit_path (str): Path to the DiT model directory, containing the model configuration file (config.json) and weight files (*.safetensors)
55
+ device (str): Device to run on, defaults to 'cuda'
56
+ weight_dtype: Weight data type, defaults to torch.bfloat16
57
+ """
58
+ load_device = "cpu" if offload else device
59
+ self.transformer = get_transformer(dit_path, load_device, weight_dtype)
60
+ vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth")
61
+ self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32)
62
+ self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype)
63
+ self.video_processor = VideoProcessor(vae_scale_factor=16)
64
+ self.device = device
65
+ self.offload = offload
66
+
67
+ if use_usp:
68
+ from xfuser.core.distributed import get_sequence_parallel_world_size
69
+ from ..distributed.xdit_context_parallel import usp_attn_forward, usp_dit_forward
70
+ import types
71
+
72
+ for block in self.transformer.blocks:
73
+ block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn)
74
+ self.transformer.forward = types.MethodType(usp_dit_forward, self.transformer)
75
+ self.sp_size = get_sequence_parallel_world_size()
76
+
77
+ self.scheduler = FlowUniPCMultistepScheduler()
78
+
79
+ @property
80
+ def do_classifier_free_guidance(self) -> bool:
81
+ return self._guidance_scale > 1
82
+
83
+ def encode_image(
84
+ self, image: PipelineImageInput, height: int, width: int, num_frames: int
85
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
86
+
87
+ # prefix_video
88
+ prefix_video = np.array(image.resize((width, height))).transpose(2, 0, 1)
89
+ prefix_video = torch.tensor(prefix_video).unsqueeze(1) # .to(image_embeds.dtype).unsqueeze(1)
90
+ if prefix_video.dtype == torch.uint8:
91
+ prefix_video = (prefix_video.float() / (255.0 / 2.0)) - 1.0
92
+ prefix_video = prefix_video.to(self.device)
93
+ prefix_video = [self.vae.encode(prefix_video.unsqueeze(0))[0]] # [(c, f, h, w)]
94
+ causal_block_size = self.transformer.num_frame_per_block
95
+ if prefix_video[0].shape[1] % causal_block_size != 0:
96
+ truncate_len = prefix_video[0].shape[1] % causal_block_size
97
+ print("The length of the prefix video is truncated to align with the causal block size.")
98
+ prefix_video[0] = prefix_video[0][:, : prefix_video[0].shape[1] - truncate_len]
99
+ predix_video_latent_length = prefix_video[0].shape[1]
100
+ return prefix_video, predix_video_latent_length
101
+
102
+ def prepare_latents(
103
+ self,
104
+ shape: Tuple[int],
105
+ dtype: Optional[torch.dtype] = None,
106
+ device: Optional[torch.device] = None,
107
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
108
+ ) -> torch.Tensor:
109
+ return randn_tensor(shape, generator, device=device, dtype=dtype)
110
+
111
+ def generate_timestep_matrix(
112
+ self,
113
+ num_frames,
114
+ step_template,
115
+ base_num_frames,
116
+ ar_step=5,
117
+ num_pre_ready=0,
118
+ casual_block_size=1,
119
+ shrink_interval_with_mask=False,
120
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, list[tuple]]:
121
+ step_matrix, step_index = [], []
122
+ update_mask, valid_interval = [], []
123
+ num_iterations = len(step_template) + 1
124
+ num_frames_block = num_frames // casual_block_size
125
+ base_num_frames_block = base_num_frames // casual_block_size
126
+ if base_num_frames_block < num_frames_block:
127
+ infer_step_num = len(step_template)
128
+ gen_block = base_num_frames_block
129
+ min_ar_step = infer_step_num / gen_block
130
+ assert ar_step >= min_ar_step, f"ar_step should be at least {math.ceil(min_ar_step)} in your setting"
131
+ # print(num_frames, step_template, base_num_frames, ar_step, num_pre_ready, casual_block_size, num_frames_block, base_num_frames_block)
132
+ step_template = torch.cat(
133
+ [
134
+ torch.tensor([999], dtype=torch.int64, device=step_template.device),
135
+ step_template.long(),
136
+ torch.tensor([0], dtype=torch.int64, device=step_template.device),
137
+ ]
138
+ ) # pad so that the per-row counter, which starts from 1, indexes correctly
139
+ pre_row = torch.zeros(num_frames_block, dtype=torch.long)
140
+ if num_pre_ready > 0:
141
+ pre_row[: num_pre_ready // casual_block_size] = num_iterations
142
+
143
+ while not torch.all(pre_row >= (num_iterations - 1)):
144
+ new_row = torch.zeros(num_frames_block, dtype=torch.long)
145
+ for i in range(num_frames_block):
146
+ if i == 0 or pre_row[i - 1] >= (
147
+ num_iterations - 1
148
+ ): # the first block, or the previous block is already fully denoised
149
+ new_row[i] = pre_row[i] + 1
150
+ else:
151
+ new_row[i] = new_row[i - 1] - ar_step
152
+ new_row = new_row.clamp(0, num_iterations)
153
+
154
+ update_mask.append(
155
+ (new_row != pre_row) & (new_row != num_iterations)
156
+ ) # False: no need to update, True: need to update
157
+ step_index.append(new_row)
158
+ step_matrix.append(step_template[new_row])
159
+ pre_row = new_row
160
+
161
+ # for long videos we split the latent sequence into several chunks; base_num_frames is the model's maximum length (from training)
162
+ terminal_flag = base_num_frames_block
163
+ if shrink_interval_with_mask:
164
+ idx_sequence = torch.arange(num_frames_block, dtype=torch.int64)
165
+ update_mask = update_mask[0]
166
+ update_mask_idx = idx_sequence[update_mask]
167
+ last_update_idx = update_mask_idx[-1].item()
168
+ terminal_flag = last_update_idx + 1
169
+ # for i in range(0, len(update_mask)):
170
+ for curr_mask in update_mask:
171
+ if terminal_flag < num_frames_block and curr_mask[terminal_flag]:
172
+ terminal_flag += 1
173
+ valid_interval.append((max(terminal_flag - base_num_frames_block, 0), terminal_flag))
174
+
175
+ step_update_mask = torch.stack(update_mask, dim=0)
176
+ step_index = torch.stack(step_index, dim=0)
177
+ step_matrix = torch.stack(step_matrix, dim=0)
178
+
179
+ if casual_block_size > 1:
180
+ step_update_mask = step_update_mask.unsqueeze(-1).repeat(1, 1, casual_block_size).flatten(1).contiguous()
181
+ step_index = step_index.unsqueeze(-1).repeat(1, 1, casual_block_size).flatten(1).contiguous()
182
+ step_matrix = step_matrix.unsqueeze(-1).repeat(1, 1, casual_block_size).flatten(1).contiguous()
183
+ valid_interval = [(s * casual_block_size, e * casual_block_size) for s, e in valid_interval]
184
+
185
+ return step_matrix, step_index, step_update_mask, valid_interval
186
+
187
+ def get_video_as_tensor(self, video_path, width, height):
188
+ """
189
+ Loads a video from the given path and returns it as a tensor with proper channel ordering.
190
+ Args:
191
+ video_path (str): Path to the video file
192
+ Returns:
193
+ torch.Tensor: Video tensor in [T, C, H, W] format (frames first, channels second)
194
+ """
195
+
196
+ # Have Decord return frames as torch tensors
197
+ decord.bridge.set_bridge('torch')
198
+
199
+ # Load video
200
+ vr = VideoReader(video_path, width=width, height=height)
201
+ total_frames = len(vr)
202
+
203
+ # Read all frames
204
+ video_frames = vr.get_batch(list(range(total_frames)))
205
+
206
+ # Convert from [T, H, W, C] to [T, C, H, W] format
207
+ video_tensor = video_frames.permute(0, 3, 1, 2).float()
208
+
209
+ return video_tensor
210
+
211
+ @torch.no_grad()
212
+ def extend_video(
213
+ self,
214
+ prompt: Union[str, List[str]],
215
+ negative_prompt: Union[str, List[str]] = "",
216
+ prefix_video_path: str = None,
217
+ height: int = 480,
218
+ width: int = 832,
219
+ num_frames: int = 97,
220
+ num_inference_steps: int = 50,
221
+ shift: float = 1.0,
222
+ guidance_scale: float = 5.0,
223
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
224
+ overlap_history: int = None,
225
+ addnoise_condition: int = 0,
226
+ base_num_frames: int = 97,
227
+ ar_step: int = 5,
228
+ causal_block_size: int = None,
229
+ fps: int = 24,
230
+ ):
231
+ latent_height = height // 8
232
+ latent_width = width // 8
233
+ latent_length = (num_frames - 1) // 4 + 1
234
+
235
+ self._guidance_scale = guidance_scale
236
+
237
+ i2v_extra_kwrags = {}
238
+ prefix_video = None
239
+ predix_video_latent_length = 0
240
+
241
+ self.text_encoder.to(self.device)
242
+ prompt_embeds = self.text_encoder.encode(prompt).to(self.transformer.dtype)
243
+ if self.do_classifier_free_guidance:
244
+ negative_prompt_embeds = self.text_encoder.encode(negative_prompt).to(self.transformer.dtype)
245
+ if self.offload:
246
+ self.text_encoder.cpu()
247
+ torch.cuda.empty_cache()
248
+
249
+ self.scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device, shift=shift)
250
+ init_timesteps = self.scheduler.timesteps
251
+ if causal_block_size is None:
252
+ causal_block_size = self.transformer.num_frame_per_block
253
+ fps_embeds = [fps] * prompt_embeds.shape[0]
254
+ fps_embeds = [0 if i == 16 else 1 for i in fps_embeds]
255
+ transformer_dtype = self.transformer.dtype
256
+ # with torch.cuda.amp.autocast(dtype=self.transformer.dtype), torch.no_grad():
257
+
258
+ prefix_video = self.get_video_as_tensor(prefix_video_path, width, height)
259
+ prefix_frame = prefix_video.to(self.device)
260
+ start_video = (prefix_frame.float() / (255.0 / 2.0)) - 1.0
261
+ start_video = start_video.transpose(0, 1)
262
+
263
+ # long video generation
264
+ base_num_frames = (base_num_frames - 1) // 4 + 1 if base_num_frames is not None else latent_length
265
+ overlap_history_frames = (overlap_history - 1) // 4 + 1
266
+ n_iter = 1 + (latent_length - base_num_frames - 1) // (base_num_frames - overlap_history_frames) + 1
267
+ print(f"n_iter:{n_iter}")
268
+ output_video = start_video.cpu()
269
+ for i in range(n_iter):
270
+ prefix_video = output_video[:, -overlap_history:].to(prompt_embeds.device)
271
+ prefix_video = [self.vae.encode(prefix_video.unsqueeze(0))[0]] # [(c, f, h, w)]
272
+ if prefix_video[0].shape[1] % causal_block_size != 0:
273
+ truncate_len = prefix_video[0].shape[1] % causal_block_size
274
+ print("The length of the prefix video is truncated to align with the causal block size.")
275
+ prefix_video[0] = prefix_video[0][:, : prefix_video[0].shape[1] - truncate_len]
276
+ predix_video_latent_length = prefix_video[0].shape[1]
277
+ finished_frame_num = i * (base_num_frames - overlap_history_frames) + overlap_history_frames
278
+ left_frame_num = latent_length - finished_frame_num
279
+ base_num_frames_iter = min(left_frame_num + overlap_history_frames, base_num_frames)
280
+ if ar_step > 0 and self.transformer.enable_teacache:
281
+ num_steps = num_inference_steps + ((base_num_frames_iter - overlap_history_frames) // causal_block_size - 1) * ar_step
282
+ self.transformer.num_steps = num_steps
283
+
284
+ latent_shape = [16, base_num_frames_iter, latent_height, latent_width]
285
+ latents = self.prepare_latents(
286
+ latent_shape, dtype=transformer_dtype, device=prompt_embeds.device, generator=generator
287
+ )
288
+ latents = [latents]
289
+ if prefix_video is not None:
290
+ latents[0][:, :predix_video_latent_length] = prefix_video[0].to(transformer_dtype)
291
+ step_matrix, _, step_update_mask, valid_interval = self.generate_timestep_matrix(
292
+ base_num_frames_iter,
293
+ init_timesteps,
294
+ base_num_frames_iter,
295
+ ar_step,
296
+ predix_video_latent_length,
297
+ causal_block_size,
298
+ )
299
+ sample_schedulers = []
300
+ for _ in range(base_num_frames_iter):
301
+ sample_scheduler = FlowUniPCMultistepScheduler(
302
+ num_train_timesteps=1000, shift=1, use_dynamic_shifting=False
303
+ )
304
+ sample_scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device, shift=shift)
305
+ sample_schedulers.append(sample_scheduler)
306
+ sample_schedulers_counter = [0] * base_num_frames_iter
307
+ self.transformer.to(self.device)
308
+ for i, timestep_i in enumerate(tqdm(step_matrix)):
309
+ update_mask_i = step_update_mask[i]
310
+ valid_interval_i = valid_interval[i]
311
+ valid_interval_start, valid_interval_end = valid_interval_i
312
+ timestep = timestep_i[None, valid_interval_start:valid_interval_end].clone()
313
+ latent_model_input = [latents[0][:, valid_interval_start:valid_interval_end, :, :].clone()]
314
+ if addnoise_condition > 0 and valid_interval_start < predix_video_latent_length:
315
+ noise_factor = 0.001 * addnoise_condition
316
+ timestep_for_noised_condition = addnoise_condition
317
+ latent_model_input[0][:, valid_interval_start:predix_video_latent_length] = (
318
+ latent_model_input[0][:, valid_interval_start:predix_video_latent_length]
319
+ * (1.0 - noise_factor)
320
+ + torch.randn_like(
321
+ latent_model_input[0][:, valid_interval_start:predix_video_latent_length]
322
+ )
323
+ * noise_factor
324
+ )
325
+ timestep[:, valid_interval_start:predix_video_latent_length] = timestep_for_noised_condition
326
+ if not self.do_classifier_free_guidance:
327
+ noise_pred = self.transformer(
328
+ torch.stack([latent_model_input[0]]),
329
+ t=timestep,
330
+ context=prompt_embeds,
331
+ fps=fps_embeds,
332
+ **i2v_extra_kwrags,
333
+ )[0]
334
+ else:
335
+ noise_pred_cond = self.transformer(
336
+ torch.stack([latent_model_input[0]]),
337
+ t=timestep,
338
+ context=prompt_embeds,
339
+ fps=fps_embeds,
340
+ **i2v_extra_kwrags,
341
+ )[0]
342
+ noise_pred_uncond = self.transformer(
343
+ torch.stack([latent_model_input[0]]),
344
+ t=timestep,
345
+ context=negative_prompt_embeds,
346
+ fps=fps_embeds,
347
+ **i2v_extra_kwrags,
348
+ )[0]
349
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
350
+ for idx in range(valid_interval_start, valid_interval_end):
351
+ if update_mask_i[idx].item():
352
+ latents[0][:, idx] = sample_schedulers[idx].step(
353
+ noise_pred[:, idx - valid_interval_start],
354
+ timestep_i[idx],
355
+ latents[0][:, idx],
356
+ return_dict=False,
357
+ generator=generator,
358
+ )[0]
359
+ sample_schedulers_counter[idx] += 1
360
+ if self.offload:
361
+ self.transformer.cpu()
362
+ torch.cuda.empty_cache()
363
+ x0 = latents[0].unsqueeze(0)
364
+ videos = [self.vae.decode(x0)[0]]
365
+ if output_video is None:
366
+ output_video = videos[0].clamp(-1, 1).cpu() # c, f, h, w
367
+ else:
368
+ output_video = torch.cat(
369
+ [output_video, videos[0][:, overlap_history:].clamp(-1, 1).cpu()], 1
370
+ ) # c, f, h, w
371
+ output_video = [(output_video / 2 + 0.5).clamp(0, 1)]
372
+ output_video = [video for video in output_video]
373
+ output_video = [video.permute(1, 2, 3, 0) * 255 for video in output_video]
374
+ output_video = [video.cpu().numpy().astype(np.uint8) for video in output_video]
375
+
376
+ return output_video
377
+
378
+
379
+ @torch.no_grad()
380
+ def __call__(
381
+ self,
382
+ prompt: Union[str, List[str]],
383
+ negative_prompt: Union[str, List[str]] = "",
384
+ image: PipelineImageInput = None,
385
+ end_image: PipelineImageInput = None,
386
+ height: int = 480,
387
+ width: int = 832,
388
+ num_frames: int = 97,
389
+ num_inference_steps: int = 50,
390
+ shift: float = 1.0,
391
+ guidance_scale: float = 5.0,
392
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
393
+ overlap_history: int = None,
394
+ addnoise_condition: int = 0,
395
+ base_num_frames: int = 97,
396
+ ar_step: int = 5,
397
+ causal_block_size: int = None,
398
+ fps: int = 24,
399
+ ):
400
+ latent_height = height // 8
401
+ latent_width = width // 8
402
+ latent_length = (num_frames - 1) // 4 + 1
403
+
404
+ self._guidance_scale = guidance_scale
405
+
406
+ i2v_extra_kwrags = {}
407
+ prefix_video = None
408
+ predix_video_latent_length = 0
409
+ end_video = None
410
+ end_video_latent_length = 0
411
+
412
+ if image:
413
+ prefix_video, predix_video_latent_length = self.encode_image(image, height, width, num_frames)
414
+
415
+ if end_image:
416
+ end_video, end_video_latent_length = self.encode_image(end_image, height, width, num_frames)
417
+
418
+ self.text_encoder.to(self.device)
419
+ prompt_embeds = self.text_encoder.encode(prompt).to(self.transformer.dtype)
420
+ if self.do_classifier_free_guidance:
421
+ negative_prompt_embeds = self.text_encoder.encode(negative_prompt).to(self.transformer.dtype)
422
+ if self.offload:
423
+ self.text_encoder.cpu()
424
+ torch.cuda.empty_cache()
425
+
426
+ self.scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device, shift=shift)
427
+ init_timesteps = self.scheduler.timesteps
428
+ if causal_block_size is None:
429
+ causal_block_size = self.transformer.num_frame_per_block
430
+ fps_embeds = [fps] * prompt_embeds.shape[0]
431
+ fps_embeds = [0 if i == 16 else 1 for i in fps_embeds]
432
+ transformer_dtype = self.transformer.dtype
433
+ # with torch.cuda.amp.autocast(dtype=self.transformer.dtype), torch.no_grad():
434
+ if overlap_history is None or base_num_frames is None or num_frames <= base_num_frames:
435
+ # short video generation
436
+ latent_shape = [16, latent_length, latent_height, latent_width]
437
+ latents = self.prepare_latents(
438
+ latent_shape, dtype=transformer_dtype, device=prompt_embeds.device, generator=generator
439
+ )
440
+ latents = [latents]
441
+ if prefix_video is not None:
442
+ latents[0][:, :predix_video_latent_length] = prefix_video[0].to(transformer_dtype)
443
+
444
+ if end_video is not None:
445
+ latents[0] = torch.cat([latents[0], end_video[0].to(transformer_dtype)], dim=1)
446
+
447
+ base_num_frames = num_frames
448
+ base_num_frames = (base_num_frames - 1) // 4 + 1 if base_num_frames is not None else latent_length
449
+ if end_video is not None:
450
+ base_num_frames += end_video_latent_length
451
+ latent_length += end_video_latent_length
452
+
453
+
454
+ step_matrix, _, step_update_mask, valid_interval = self.generate_timestep_matrix(
455
+ latent_length, init_timesteps, base_num_frames, ar_step, predix_video_latent_length, causal_block_size
456
+ )
457
+ if end_video is not None:
458
+ step_matrix[:, -end_video_latent_length:] = 0
459
+ step_update_mask[:, -end_video_latent_length:] = False
460
+
461
+ sample_schedulers = []
462
+ for _ in range(latent_length):
463
+ sample_scheduler = FlowUniPCMultistepScheduler(
464
+ num_train_timesteps=1000, shift=1, use_dynamic_shifting=False
465
+ )
466
+ sample_scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device, shift=shift)
467
+ sample_schedulers.append(sample_scheduler)
468
+ sample_schedulers_counter = [0] * latent_length
469
+ self.transformer.to(self.device)
470
+ for i, timestep_i in enumerate(tqdm(step_matrix)):
471
+ update_mask_i = step_update_mask[i]
472
+ valid_interval_i = valid_interval[i]
473
+ valid_interval_start, valid_interval_end = valid_interval_i
474
+ timestep = timestep_i[None, valid_interval_start:valid_interval_end].clone()
475
+ latent_model_input = [latents[0][:, valid_interval_start:valid_interval_end, :, :].clone()]
476
+ if addnoise_condition > 0 and valid_interval_start < predix_video_latent_length:
477
+ noise_factor = 0.001 * addnoise_condition
478
+ timestep_for_noised_condition = addnoise_condition
479
+ latent_model_input[0][:, valid_interval_start:predix_video_latent_length] = (
480
+ latent_model_input[0][:, valid_interval_start:predix_video_latent_length] * (1.0 - noise_factor)
481
+ + torch.randn_like(latent_model_input[0][:, valid_interval_start:predix_video_latent_length])
482
+ * noise_factor
483
+ )
484
+ timestep[:, valid_interval_start:predix_video_latent_length] = timestep_for_noised_condition
485
+ if not self.do_classifier_free_guidance:
486
+ noise_pred = self.transformer(
487
+ torch.stack([latent_model_input[0]]),
488
+ t=timestep,
489
+ context=prompt_embeds,
490
+ fps=fps_embeds,
491
+ **i2v_extra_kwrags,
492
+ )[0]
493
+ else:
494
+ noise_pred_cond = self.transformer(
495
+ torch.stack([latent_model_input[0]]),
496
+ t=timestep,
497
+ context=prompt_embeds,
498
+ fps=fps_embeds,
499
+ **i2v_extra_kwrags,
500
+ )[0]
501
+ noise_pred_uncond = self.transformer(
502
+ torch.stack([latent_model_input[0]]),
503
+ t=timestep,
504
+ context=negative_prompt_embeds,
505
+ fps=fps_embeds,
506
+ **i2v_extra_kwrags,
507
+ )[0]
508
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
509
+ for idx in range(valid_interval_start, valid_interval_end):
510
+ if update_mask_i[idx].item():
511
+ latents[0][:, idx] = sample_schedulers[idx].step(
512
+ noise_pred[:, idx - valid_interval_start],
513
+ timestep_i[idx],
514
+ latents[0][:, idx],
515
+ return_dict=False,
516
+ generator=generator,
517
+ )[0]
518
+ sample_schedulers_counter[idx] += 1
519
+ if self.offload:
520
+ self.transformer.cpu()
521
+ torch.cuda.empty_cache()
522
+ x0 = latents[0].unsqueeze(0)
523
+ if end_video is not None:
524
+ x0 = latents[0][:, :-end_video_latent_length].unsqueeze(0)
525
+
526
+ videos = self.vae.decode(x0)
527
+ videos = (videos / 2 + 0.5).clamp(0, 1)
528
+ videos = [video for video in videos]
529
+ videos = [video.permute(1, 2, 3, 0) * 255 for video in videos]
530
+ videos = [video.cpu().numpy().astype(np.uint8) for video in videos]
531
+ return videos
532
+ else:
533
+ # long video generation
534
+ base_num_frames = (base_num_frames - 1) // 4 + 1 if base_num_frames is not None else latent_length
535
+ overlap_history_frames = (overlap_history - 1) // 4 + 1
536
+ n_iter = 1 + (latent_length - base_num_frames - 1) // (base_num_frames - overlap_history_frames) + 1
537
+ print(f"n_iter:{n_iter}")
538
+ output_video = None
539
+ for i in range(n_iter):
540
+ if output_video is not None: # i !=0
541
+ prefix_video = output_video[:, -overlap_history:].to(prompt_embeds.device)
542
+ prefix_video = [self.vae.encode(prefix_video.unsqueeze(0))[0]] # [(c, f, h, w)]
543
+ if prefix_video[0].shape[1] % causal_block_size != 0:
544
+ truncate_len = prefix_video[0].shape[1] % causal_block_size
545
+ print("The length of the prefix video is truncated to align with the causal block size.")
546
+ prefix_video[0] = prefix_video[0][:, : prefix_video[0].shape[1] - truncate_len]
547
+ predix_video_latent_length = prefix_video[0].shape[1]
548
+ finished_frame_num = i * (base_num_frames - overlap_history_frames) + overlap_history_frames
549
+ left_frame_num = latent_length - finished_frame_num
550
+ base_num_frames_iter = min(left_frame_num + overlap_history_frames, base_num_frames)
551
+ if ar_step > 0 and self.transformer.enable_teacache:
552
+ num_steps = num_inference_steps + ((base_num_frames_iter - overlap_history_frames) // causal_block_size - 1) * ar_step
553
+ self.transformer.num_steps = num_steps
554
+ else: # i == 0
555
+ base_num_frames_iter = base_num_frames
556
+ latent_shape = [16, base_num_frames_iter, latent_height, latent_width]
557
+ latents = self.prepare_latents(
558
+ latent_shape, dtype=transformer_dtype, device=prompt_embeds.device, generator=generator
559
+ )
560
+ latents = [latents]
561
+ if prefix_video is not None:
562
+ latents[0][:, :predix_video_latent_length] = prefix_video[0].to(transformer_dtype)
563
+
564
+ if end_video is not None and i == n_iter - 1:
565
+ base_num_frames_iter += end_video_latent_length
566
+ latents[0] = torch.cat([latents[0], end_video[0].to(transformer_dtype)], dim=1)
567
+
568
+ step_matrix, _, step_update_mask, valid_interval = self.generate_timestep_matrix(
569
+ base_num_frames_iter,
570
+ init_timesteps,
571
+ base_num_frames_iter,
572
+ ar_step,
573
+ predix_video_latent_length,
574
+ causal_block_size,
575
+ )
576
+ if end_video is not None and i == n_iter - 1:
577
+ step_matrix[:, -end_video_latent_length:] = 0
578
+ step_update_mask[:, -end_video_latent_length:] = False
579
+
580
+ sample_schedulers = []
581
+ for _ in range(base_num_frames_iter):
582
+ sample_scheduler = FlowUniPCMultistepScheduler(
583
+ num_train_timesteps=1000, shift=1, use_dynamic_shifting=False
584
+ )
585
+ sample_scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device, shift=shift)
586
+ sample_schedulers.append(sample_scheduler)
587
+ sample_schedulers_counter = [0] * base_num_frames_iter
588
+ self.transformer.to(self.device)
589
+ for i, timestep_i in enumerate(tqdm(step_matrix)):
590
+ update_mask_i = step_update_mask[i]
591
+ valid_interval_i = valid_interval[i]
592
+ valid_interval_start, valid_interval_end = valid_interval_i
593
+ timestep = timestep_i[None, valid_interval_start:valid_interval_end].clone()
594
+ latent_model_input = [latents[0][:, valid_interval_start:valid_interval_end, :, :].clone()]
595
+ if addnoise_condition > 0 and valid_interval_start < predix_video_latent_length:
596
+ noise_factor = 0.001 * addnoise_condition
597
+ timestep_for_noised_condition = addnoise_condition
598
+ latent_model_input[0][:, valid_interval_start:predix_video_latent_length] = (
599
+ latent_model_input[0][:, valid_interval_start:predix_video_latent_length]
600
+ * (1.0 - noise_factor)
601
+ + torch.randn_like(
602
+ latent_model_input[0][:, valid_interval_start:predix_video_latent_length]
603
+ )
604
+ * noise_factor
605
+ )
606
+ timestep[:, valid_interval_start:predix_video_latent_length] = timestep_for_noised_condition
607
+ if not self.do_classifier_free_guidance:
608
+ noise_pred = self.transformer(
609
+ torch.stack([latent_model_input[0]]),
610
+ t=timestep,
611
+ context=prompt_embeds,
612
+ fps=fps_embeds,
613
+ **i2v_extra_kwrags,
614
+ )[0]
615
+ else:
616
+ noise_pred_cond = self.transformer(
617
+ torch.stack([latent_model_input[0]]),
618
+ t=timestep,
619
+ context=prompt_embeds,
620
+ fps=fps_embeds,
621
+ **i2v_extra_kwrags,
622
+ )[0]
623
+ noise_pred_uncond = self.transformer(
624
+ torch.stack([latent_model_input[0]]),
625
+ t=timestep,
626
+ context=negative_prompt_embeds,
627
+ fps=fps_embeds,
628
+ **i2v_extra_kwrags,
629
+ )[0]
630
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
631
+ for idx in range(valid_interval_start, valid_interval_end):
632
+ if update_mask_i[idx].item():
633
+ latents[0][:, idx] = sample_schedulers[idx].step(
634
+ noise_pred[:, idx - valid_interval_start],
635
+ timestep_i[idx],
636
+ latents[0][:, idx],
637
+ return_dict=False,
638
+ generator=generator,
639
+ )[0]
640
+ sample_schedulers_counter[idx] += 1
641
+ if self.offload:
642
+ self.transformer.cpu()
643
+ torch.cuda.empty_cache()
644
+ x0 = latents[0].unsqueeze(0)
645
+ if end_video is not None and i == n_iter - 1:
646
+ x0 = latents[0][:, :-end_video_latent_length].unsqueeze(0)
647
+
648
+ videos = [self.vae.decode(x0)[0]]
649
+ if output_video is None:
650
+ output_video = videos[0].clamp(-1, 1).cpu() # c, f, h, w
651
+ else:
652
+ output_video = torch.cat(
653
+ [output_video, videos[0][:, overlap_history:].clamp(-1, 1).cpu()], 1
654
+ ) # c, f, h, w
655
+ output_video = [(output_video / 2 + 0.5).clamp(0, 1)]
656
+ output_video = [video for video in output_video]
657
+ output_video = [video.permute(1, 2, 3, 0) * 255 for video in output_video]
658
+ output_video = [video.cpu().numpy().astype(np.uint8) for video in output_video]
659
+ return output_video
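
A minimal end-to-end sketch of the pipeline above, assuming placeholder checkpoint paths and illustrative prompt and sampling values; the call returns a list of uint8 arrays shaped [frames, height, width, channels].

    import torch
    from skyreels_v2_infer.pipelines import DiffusionForcingPipeline

    pipe = DiffusionForcingPipeline(
        model_path="path/to/SkyReels-V2",  # directory holding Wan2.1_VAE.pth and the text encoder
        dit_path="path/to/dit",            # directory with config.json and *.safetensors weights
        device="cuda",
        weight_dtype=torch.bfloat16,
        offload=True,                      # keep the DiT and text encoder on CPU until needed
    )
    frames = pipe(
        prompt="a sailboat gliding across a calm bay at sunset, camera slowly panning right",
        height=480,
        width=832,
        num_frames=97,
        num_inference_steps=30,
        guidance_scale=5.0,
        ar_step=5,   # staggered (diffusion-forcing) schedule; 0 denoises all frames in lockstep
        fps=24,
        generator=torch.Generator("cuda").manual_seed(42),
    )[0]
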
skyreels_v2_infer/pipelines/image2video_pipeline.py ADDED
@@ -0,0 +1,156 @@
1
+ import os
2
+ from typing import List
3
+ from typing import Optional
4
+ from typing import Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from diffusers.image_processor import PipelineImageInput
9
+ from diffusers.video_processor import VideoProcessor
10
+ from PIL import Image
11
+ from tqdm import tqdm
12
+
13
+ from ..modules import get_image_encoder
14
+ from ..modules import get_text_encoder
15
+ from ..modules import get_transformer
16
+ from ..modules import get_vae
17
+ from ..scheduler.fm_solvers_unipc import FlowUniPCMultistepScheduler
18
+
19
+
20
+ def resizecrop(image: Image.Image, th, tw):
21
+ w, h = image.size
22
+ if w == tw and h == th:
23
+ return image
24
+ if h / w > th / tw:
25
+ new_w = int(w)
26
+ new_h = int(new_w * th / tw)
27
+ else:
28
+ new_h = int(h)
29
+ new_w = int(new_h * tw / th)
30
+ left = (w - new_w) / 2
31
+ top = (h - new_h) / 2
32
+ right = (w + new_w) / 2
33
+ bottom = (h + new_h) / 2
34
+ image = image.crop((left, top, right, bottom))
35
+ return image
36
+
37
+
38
+ class Image2VideoPipeline:
39
+ def __init__(
40
+ self, model_path, dit_path, device: str = "cuda", weight_dtype=torch.bfloat16, use_usp=False, offload=False
41
+ ):
42
+ load_device = "cpu" if offload else device
43
+ self.transformer = get_transformer(dit_path, load_device, weight_dtype)
44
+ vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth")
45
+ self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32)
46
+ self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype)
47
+ self.clip = get_image_encoder(model_path, load_device, weight_dtype)
48
+ self.sp_size = 1
49
+ self.device = device
50
+ self.offload = offload
51
+ self.video_processor = VideoProcessor(vae_scale_factor=16)
52
+ if use_usp:
53
+ from xfuser.core.distributed import get_sequence_parallel_world_size
54
+ from ..distributed.xdit_context_parallel import usp_attn_forward, usp_dit_forward
55
+ import types
56
+
57
+ for block in self.transformer.blocks:
58
+ block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn)
59
+ self.transformer.forward = types.MethodType(usp_dit_forward, self.transformer)
60
+ self.sp_size = get_sequence_parallel_world_size()
61
+
62
+ self.scheduler = FlowUniPCMultistepScheduler()
63
+ self.vae_stride = (4, 8, 8)
64
+ self.patch_size = (1, 2, 2)
65
+
66
+ @torch.no_grad()
67
+ def __call__(
68
+ self,
69
+ image: PipelineImageInput,
70
+ prompt: Union[str, List[str]] = None,
71
+ negative_prompt: Union[str, List[str]] = None,
72
+ height: int = 544,
73
+ width: int = 960,
74
+ num_frames: int = 97,
75
+ num_inference_steps: int = 50,
76
+ guidance_scale: float = 5.0,
77
+ shift: float = 5.0,
78
+ generator: Optional[torch.Generator] = None,
79
+ ):
80
+ F = num_frames
81
+
82
+ latent_height = height // 8 // 2 * 2
83
+ latent_width = width // 8 // 2 * 2
84
+ latent_length = (F - 1) // 4 + 1
85
+
86
+ h = latent_height * 8
87
+ w = latent_width * 8
88
+
89
+ img = self.video_processor.preprocess(image, height=h, width=w)
90
+
91
+ img = img.to(device=self.device, dtype=self.transformer.dtype)
92
+
93
+ padding_video = torch.zeros(img.shape[0], 3, F - 1, h, w, device=self.device)
94
+
95
+ img = img.unsqueeze(2)
96
+ img_cond = torch.concat([img, padding_video], dim=2)
97
+ img_cond = self.vae.encode(img_cond)
98
+ mask = torch.ones_like(img_cond)
99
+ mask[:, :, 1:] = 0
100
+ y = torch.cat([mask[:, :4], img_cond], dim=1)
101
+ self.clip.to(self.device)
102
+ clip_context = self.clip.encode_video(img)
103
+ if self.offload:
104
+ self.clip.cpu()
105
+ torch.cuda.empty_cache()
106
+
107
+ # preprocess
108
+ self.text_encoder.to(self.device)
109
+ context = self.text_encoder.encode(prompt).to(self.device)
110
+ context_null = self.text_encoder.encode(negative_prompt).to(self.device)
111
+ if self.offload:
112
+ self.text_encoder.cpu()
113
+ torch.cuda.empty_cache()
114
+
115
+ latent = torch.randn(
116
+ 16, latent_length, latent_height, latent_width, dtype=torch.float32, generator=generator, device=self.device
117
+ )
118
+
119
+ self.transformer.to(self.device)
120
+ with torch.cuda.amp.autocast(dtype=self.transformer.dtype), torch.no_grad():
121
+ self.scheduler.set_timesteps(num_inference_steps, device=self.device, shift=shift)
122
+ timesteps = self.scheduler.timesteps
123
+
124
+ arg_c = {
125
+ "context": context,
126
+ "clip_fea": clip_context,
127
+ "y": y,
128
+ }
129
+
130
+ arg_null = {
131
+ "context": context_null,
132
+ "clip_fea": clip_context,
133
+ "y": y,
134
+ }
135
+
136
+ self.transformer.to(self.device)
137
+ for _, t in enumerate(tqdm(timesteps)):
138
+ latent_model_input = torch.stack([latent]).to(self.device)
139
+ timestep = torch.stack([t]).to(self.device)
140
+ noise_pred_cond = self.transformer(latent_model_input, t=timestep, **arg_c)[0].to(self.device)
141
+ noise_pred_uncond = self.transformer(latent_model_input, t=timestep, **arg_null)[0].to(self.device)
142
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
143
+
144
+ temp_x0 = self.scheduler.step(
145
+ noise_pred.unsqueeze(0), t, latent.unsqueeze(0), return_dict=False, generator=generator
146
+ )[0]
147
+ latent = temp_x0.squeeze(0)
148
+ if self.offload:
149
+ self.transformer.cpu()
150
+ torch.cuda.empty_cache()
151
+ videos = self.vae.decode(latent)
152
+ videos = (videos / 2 + 0.5).clamp(0, 1)
153
+ videos = [video for video in videos]
154
+ videos = [video.permute(1, 2, 3, 0) * 255 for video in videos]
155
+ videos = [video.cpu().numpy().astype(np.uint8) for video in videos]
156
+ return videos
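
A comparable sketch for the image-to-video pipeline, again with placeholder paths and inputs; `resizecrop` brings the source image to the target aspect ratio before preprocessing.

    import torch
    from PIL import Image
    from skyreels_v2_infer.pipelines import Image2VideoPipeline, resizecrop

    pipe = Image2VideoPipeline("path/to/SkyReels-V2", "path/to/dit", offload=True)
    image = resizecrop(Image.open("input.jpg").convert("RGB"), 544, 960)  # (target height, target width)
    videos = pipe(
        image=image,
        prompt="waves roll gently onto the shore as the camera pushes in",
        negative_prompt="blurry, distorted, low quality",
        height=544,
        width=960,
        num_frames=97,
        generator=torch.Generator("cuda").manual_seed(0),
    )  # list of uint8 arrays shaped [frames, height, width, channels]
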
skyreels_v2_infer/pipelines/prompt_enhancer.py ADDED
@@ -0,0 +1,65 @@
1
+ import argparse
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+
4
+ sys_prompt = """
5
+ Transform the short prompt into a detailed video-generation caption using this structure:
6
+ ​​Opening shot type​​ (long/medium/close-up/extreme close-up/full shot)
7
+ ​​Primary subject(s)​​ with vivid attributes (colors, textures, actions, interactions)
8
+ ​​Dynamic elements​​ (movement, transitions, or changes over time, e.g., 'gradually lowers,' 'begins to climb,' 'camera moves toward...')
9
+ ​​Scene composition​​ (background, environment, spatial relationships)
10
+ ​​Lighting/atmosphere​​ (natural/artificial, time of day, mood)
11
+ ​​Camera motion​​ (zooms, pans, static/handheld shots) if applicable.
12
+
13
+ Pattern Summary from Examples:
14
+ [Shot Type] of [Subject+Action] + [Detailed Subject Description] + [Environmental Context] + [Lighting Conditions] + [Camera Movement]
15
+
16
+ ​One case:
17
+ Short prompt: a person is playing football
18
+ Long prompt: Medium shot of a young athlete in a red jersey sprinting across a muddy field, dribbling a soccer ball with precise footwork. The player glances toward the goalpost, adjusts their stance, and kicks the ball forcefully into the net. Raindrops fall lightly, creating reflections under stadium floodlights. The camera follows the ball’s trajectory in a smooth pan.
19
+
20
+ Note: If the subject is stationary, incorporate camera movement to ensure the generated video remains dynamic.
21
+
22
+ ​​Now expand this short prompt:​​ [{}]. Please only output the final long prompt in English.
23
+ """
24
+
25
+ class PromptEnhancer:
26
+ def __init__(self, model_name="Qwen/Qwen2.5-32B-Instruct"):
27
+ self.model = AutoModelForCausalLM.from_pretrained(
28
+ model_name,
29
+ torch_dtype="auto",
30
+ device_map="cuda:0",
31
+ )
32
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
33
+
34
+ def __call__(self, prompt):
35
+ prompt = prompt.strip()
36
+ prompt = sys_prompt.format(prompt)
37
+ messages = [
38
+ {"role": "system", "content": "You are a helpful assistant."},
39
+ {"role": "user", "content": prompt}
40
+ ]
41
+ text = self.tokenizer.apply_chat_template(
42
+ messages,
43
+ tokenize=False,
44
+ add_generation_prompt=True
45
+ )
46
+ model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
47
+ generated_ids = self.model.generate(
48
+ **model_inputs,
49
+ max_new_tokens=2048,
50
+ )
51
+ generated_ids = [
52
+ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
53
+ ]
54
+ rewritten_prompt = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
55
+ return rewritten_prompt
56
+
57
+ if __name__ == '__main__':
58
+ parser = argparse.ArgumentParser()
59
+ parser.add_argument("--prompt", type=str, default="In a still frame, a stop sign")
60
+ args = parser.parse_args()
61
+
62
+ prompt_enhancer = PromptEnhancer()
63
+ enhanced_prompt = prompt_enhancer(args.prompt)
64
+ print(f'Original prompt: {args.prompt}')
65
+ print(f'Enhanced prompt: {enhanced_prompt}')
skyreels_v2_infer/pipelines/text2video_pipeline.py ADDED
@@ -0,0 +1,110 @@
1
+ import os
2
+ from typing import List
3
+ from typing import Optional
4
+ from typing import Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from diffusers.video_processor import VideoProcessor
9
+ from tqdm import tqdm
10
+
11
+ from ..modules import get_text_encoder
12
+ from ..modules import get_transformer
13
+ from ..modules import get_vae
14
+ from ..scheduler.fm_solvers_unipc import FlowUniPCMultistepScheduler
15
+
16
+
17
+ class Text2VideoPipeline:
18
+ def __init__(
19
+ self, model_path, dit_path, device: str = "cuda", weight_dtype=torch.bfloat16, use_usp=False, offload=False
20
+ ):
21
+ load_device = "cpu" if offload else device
22
+ self.transformer = get_transformer(dit_path, load_device, weight_dtype)
23
+ vae_model_path = os.path.join(model_path, "Wan2.1_VAE.pth")
24
+ self.vae = get_vae(vae_model_path, device, weight_dtype=torch.float32)
25
+ self.text_encoder = get_text_encoder(model_path, load_device, weight_dtype)
26
+ self.video_processor = VideoProcessor(vae_scale_factor=16)
27
+ self.sp_size = 1
28
+ self.device = device
29
+ self.offload = offload
30
+ if use_usp:
31
+ from xfuser.core.distributed import get_sequence_parallel_world_size
32
+ from ..distributed.xdit_context_parallel import usp_attn_forward, usp_dit_forward
33
+ import types
34
+
35
+ for block in self.transformer.blocks:
36
+ block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn)
37
+ self.transformer.forward = types.MethodType(usp_dit_forward, self.transformer)
38
+ self.sp_size = get_sequence_parallel_world_size()
39
+
40
+ self.scheduler = FlowUniPCMultistepScheduler()
41
+ self.vae_stride = (4, 8, 8)
42
+ self.patch_size = (1, 2, 2)
43
+
44
+ @torch.no_grad()
45
+ def __call__(
46
+ self,
47
+ prompt: Union[str, List[str]] = None,
48
+ negative_prompt: Union[str, List[str]] = None,
49
+ width: int = 544,
50
+ height: int = 960,
51
+ num_frames: int = 97,
52
+ num_inference_steps: int = 50,
53
+ guidance_scale: float = 5.0,
54
+ shift: float = 5.0,
55
+ generator: Optional[torch.Generator] = None,
56
+ ):
57
+ # preprocess
58
+ F = num_frames
59
+ target_shape = (
60
+ self.vae.vae.z_dim,
61
+ (F - 1) // self.vae_stride[0] + 1,
62
+ height // self.vae_stride[1],
63
+ width // self.vae_stride[2],
64
+ )
65
+ self.text_encoder.to(self.device)
66
+ context = self.text_encoder.encode(prompt).to(self.device)
67
+ context_null = self.text_encoder.encode(negative_prompt).to(self.device)
68
+ if self.offload:
69
+ self.text_encoder.cpu()
70
+ torch.cuda.empty_cache()
71
+
72
+ latents = [
73
+ torch.randn(
74
+ target_shape[0],
75
+ target_shape[1],
76
+ target_shape[2],
77
+ target_shape[3],
78
+ dtype=torch.float32,
79
+ device=self.device,
80
+ generator=generator,
81
+ )
82
+ ]
83
+
84
+ # evaluation mode
85
+ self.transformer.to(self.device)
86
+ with torch.cuda.amp.autocast(dtype=self.transformer.dtype), torch.no_grad():
87
+ self.scheduler.set_timesteps(num_inference_steps, device=self.device, shift=shift)
88
+ timesteps = self.scheduler.timesteps
89
+
90
+ for _, t in enumerate(tqdm(timesteps)):
91
+ latent_model_input = torch.stack(latents)
92
+ timestep = torch.stack([t])
93
+ noise_pred_cond = self.transformer(latent_model_input, t=timestep, context=context)[0]
94
+ noise_pred_uncond = self.transformer(latent_model_input, t=timestep, context=context_null)[0]
95
+
96
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
97
+
98
+ temp_x0 = self.scheduler.step(
99
+ noise_pred.unsqueeze(0), t, latents[0].unsqueeze(0), return_dict=False, generator=generator
100
+ )[0]
101
+ latents = [temp_x0.squeeze(0)]
102
+ if self.offload:
103
+ self.transformer.cpu()
104
+ torch.cuda.empty_cache()
105
+ videos = self.vae.decode(latents[0])
106
+ videos = (videos / 2 + 0.5).clamp(0, 1)
107
+ videos = [video for video in videos]
108
+ videos = [video.permute(1, 2, 3, 0) * 255 for video in videos]
109
+ videos = [video.cpu().numpy().astype(np.uint8) for video in videos]
110
+ return videos
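
And a matching sketch for the text-to-video pipeline, under the same placeholder assumptions; keyword arguments are passed explicitly since `width` precedes `height` in this signature.

    import torch
    from skyreels_v2_infer.pipelines import Text2VideoPipeline

    pipe = Text2VideoPipeline("path/to/SkyReels-V2", "path/to/dit", offload=True)
    videos = pipe(
        prompt="a time-lapse of clouds drifting over a mountain ridge at dawn",
        negative_prompt="blurry, distorted, low quality",
        width=960,
        height=544,
        num_frames=97,
        num_inference_steps=50,
        guidance_scale=5.0,
        shift=5.0,
        generator=torch.Generator("cuda").manual_seed(0),
    )
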
skyreels_v2_infer/scheduler/__init__.py ADDED
File without changes
skyreels_v2_infer/scheduler/fm_solvers_unipc.py ADDED
@@ -0,0 +1,759 @@
1
+ # Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py
2
+ # Convert unipc for flow matching
3
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
4
+ import math
5
+ from typing import List
6
+ from typing import Optional
7
+ from typing import Tuple
8
+ from typing import Union
9
+
10
+ import numpy as np
11
+ import torch
12
+ from diffusers.configuration_utils import ConfigMixin
13
+ from diffusers.configuration_utils import register_to_config
14
+ from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers
15
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
16
+ from diffusers.schedulers.scheduling_utils import SchedulerOutput
17
+ from diffusers.utils import deprecate
18
+
19
+
20
+ class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
21
+ """
22
+ `UniPCMultistepScheduler` is a training-free framework designed for the fast sampling of diffusion models.
23
+
24
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
25
+ methods the library implements for all schedulers such as loading and saving.
26
+
27
+ Args:
28
+ num_train_timesteps (`int`, defaults to 1000):
29
+ The number of diffusion steps to train the model.
30
+ solver_order (`int`, default `2`):
31
+ The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1`
32
+ due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for
33
+ unconditional sampling.
34
+ prediction_type (`str`, defaults to "flow_prediction"):
35
+ Prediction type of the scheduler function; must be `flow_prediction` for this scheduler, which predicts
36
+ the flow of the diffusion process.
37
+ thresholding (`bool`, defaults to `False`):
38
+ Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
39
+ as Stable Diffusion.
40
+ dynamic_thresholding_ratio (`float`, defaults to 0.995):
41
+ The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
42
+ sample_max_value (`float`, defaults to 1.0):
43
+ The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`.
44
+ predict_x0 (`bool`, defaults to `True`):
45
+ Whether to use the updating algorithm on the predicted x0.
46
+ solver_type (`str`, default `bh2`):
47
+ Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2`
48
+ otherwise.
49
+ lower_order_final (`bool`, default `True`):
50
+ Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
51
+ stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
52
+ disable_corrector (`list`, default `[]`):
53
+ Decides which step to disable the corrector to mitigate the misalignment between `epsilon_theta(x_t, c)`
54
+ and `epsilon_theta(x_t^c, c)` which can influence convergence for a large guidance scale. Corrector is
55
+ usually disabled during the first few steps.
56
+ solver_p (`SchedulerMixin`, default `None`):
57
+ Any other scheduler that if specified, the algorithm becomes `solver_p + UniC`.
58
+ use_karras_sigmas (`bool`, *optional*, defaults to `False`):
59
+ Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
60
+ the sigmas are determined according to a sequence of noise levels {σi}.
61
+ use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
62
+ Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
63
+ timestep_spacing (`str`, defaults to `"linspace"`):
64
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
65
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
66
+ steps_offset (`int`, defaults to 0):
67
+ An offset added to the inference steps, as required by some model families.
68
+ final_sigmas_type (`str`, defaults to `"zero"`):
69
+ The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
70
+ sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
71
+ """
72
+
73
+ _compatibles = [e.name for e in KarrasDiffusionSchedulers]
74
+ order = 1
75
+
76
+ @register_to_config
77
+ def __init__(
78
+ self,
79
+ num_train_timesteps: int = 1000,
80
+ solver_order: int = 2,
81
+ prediction_type: str = "flow_prediction",
82
+ shift: Optional[float] = 1.0,
83
+ use_dynamic_shifting=False,
84
+ thresholding: bool = False,
85
+ dynamic_thresholding_ratio: float = 0.995,
86
+ sample_max_value: float = 1.0,
87
+ predict_x0: bool = True,
88
+ solver_type: str = "bh2",
89
+ lower_order_final: bool = True,
90
+ disable_corrector: List[int] = [],
91
+ solver_p: SchedulerMixin = None,
92
+ timestep_spacing: str = "linspace",
93
+ steps_offset: int = 0,
94
+ final_sigmas_type: Optional[str] = "zero", # "zero", "sigma_min"
95
+ ):
96
+
97
+ if solver_type not in ["bh1", "bh2"]:
98
+ if solver_type in ["midpoint", "heun", "logrho"]:
99
+ self.register_to_config(solver_type="bh2")
100
+ else:
101
+ raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
102
+
103
+ self.predict_x0 = predict_x0
104
+ # setable values
105
+ self.num_inference_steps = None
106
+ alphas = np.linspace(1, 1 / num_train_timesteps, num_train_timesteps)[::-1].copy()
107
+ sigmas = 1.0 - alphas
108
+ sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32)
109
+
110
+ if not use_dynamic_shifting:
111
+ # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution
112
+ sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) # pyright: ignore
113
+
114
+ self.sigmas = sigmas
115
+ self.timesteps = sigmas * num_train_timesteps
116
+
117
+ self.model_outputs = [None] * solver_order
118
+ self.timestep_list = [None] * solver_order
119
+ self.lower_order_nums = 0
120
+ self.disable_corrector = disable_corrector
121
+ self.solver_p = solver_p
122
+ self.last_sample = None
123
+ self._step_index = None
124
+ self._begin_index = None
125
+
126
+ self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
127
+ self.sigma_min = self.sigmas[-1].item()
128
+ self.sigma_max = self.sigmas[0].item()
129
+
130
+ @property
131
+ def step_index(self):
132
+ """
133
+ The index counter for current timestep. It will increase 1 after each scheduler step.
134
+ """
135
+ return self._step_index
136
+
137
+ @property
138
+ def begin_index(self):
139
+ """
140
+ The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
141
+ """
142
+ return self._begin_index
143
+
144
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
145
+ def set_begin_index(self, begin_index: int = 0):
146
+ """
147
+ Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
148
+
149
+ Args:
150
+ begin_index (`int`):
151
+ The begin index for the scheduler.
152
+ """
153
+ self._begin_index = begin_index
154
+
155
+ # Modified from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.set_timesteps
156
+ def set_timesteps(
157
+ self,
158
+ num_inference_steps: Union[int, None] = None,
159
+ device: Union[str, torch.device] = None,
160
+ sigmas: Optional[List[float]] = None,
161
+ mu: Optional[Union[float, None]] = None,
162
+ shift: Optional[Union[float, None]] = None,
163
+ ):
164
+ """
165
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
166
+ Args:
167
+ num_inference_steps (`int`):
168
+ The total number of sampling steps used when generating samples.
169
+ device (`str` or `torch.device`, *optional*):
170
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
171
+ """
172
+
173
+ if self.config.use_dynamic_shifting and mu is None:
174
+ raise ValueError("you have to pass a value for `mu` when `use_dynamic_shifting` is set to `True`")
175
+
176
+ if sigmas is None:
177
+ sigmas = np.linspace(self.sigma_max, self.sigma_min, num_inference_steps + 1).copy()[:-1] # pyright: ignore
178
+
179
+ if self.config.use_dynamic_shifting:
180
+ sigmas = self.time_shift(mu, 1.0, sigmas) # pyright: ignore
181
+ else:
182
+ if shift is None:
183
+ shift = self.config.shift
184
+ sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) # pyright: ignore
185
+
186
+ if self.config.final_sigmas_type == "sigma_min":
187
+ sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
188
+ elif self.config.final_sigmas_type == "zero":
189
+ sigma_last = 0
190
+ else:
191
+ raise ValueError(
192
+ f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
193
+ )
194
+
195
+ timesteps = sigmas * self.config.num_train_timesteps
196
+ sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32) # pyright: ignore
197
+
198
+ self.sigmas = torch.from_numpy(sigmas)
199
+ self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
200
+
201
+ self.num_inference_steps = len(timesteps)
202
+
203
+ self.model_outputs = [
204
+ None,
205
+ ] * self.config.solver_order
206
+ self.lower_order_nums = 0
207
+ self.last_sample = None
208
+ if self.solver_p:
209
+ self.solver_p.set_timesteps(self.num_inference_steps, device=device)
210
+
211
+ # add an index counter for schedulers that allow duplicated timesteps
212
+ self._step_index = None
213
+ self._begin_index = None
214
+ self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
215
+
216
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
217
+ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
218
+ """
219
+ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
220
+ prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
221
+ s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
222
+ pixels from saturation at each step. We find that dynamic thresholding results in significantly better
223
+ photorealism as well as better image-text alignment, especially when using very large guidance weights."
224
+
225
+ https://arxiv.org/abs/2205.11487
226
+ """
227
+ dtype = sample.dtype
228
+ batch_size, channels, *remaining_dims = sample.shape
229
+
230
+ if dtype not in (torch.float32, torch.float64):
231
+ sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half
232
+
233
+ # Flatten sample for doing quantile calculation along each image
234
+ sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
235
+
236
+ abs_sample = sample.abs() # "a certain percentile absolute pixel value"
237
+
238
+ s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
239
+ s = torch.clamp(
240
+ s, min=1, max=self.config.sample_max_value
241
+ ) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
242
+ s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0
243
+ sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
244
+
245
+ sample = sample.reshape(batch_size, channels, *remaining_dims)
246
+ sample = sample.to(dtype)
247
+
248
+ return sample
249
+
250
+ # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler._sigma_to_t
251
+ def _sigma_to_t(self, sigma):
252
+ return sigma * self.config.num_train_timesteps
253
+
254
+ def _sigma_to_alpha_sigma_t(self, sigma):
255
+ return 1 - sigma, sigma
256
+
257
+ # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.set_timesteps
258
+ def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
259
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
260
+
261
+ def convert_model_output(
262
+ self,
263
+ model_output: torch.Tensor,
264
+ *args,
265
+ sample: torch.Tensor = None,
266
+ **kwargs,
267
+ ) -> torch.Tensor:
268
+ r"""
269
+ Convert the model output to the corresponding type the UniPC algorithm needs.
270
+
271
+ Args:
272
+ model_output (`torch.Tensor`):
273
+ The direct output from the learned diffusion model.
274
+ timestep (`int`):
275
+ The current discrete timestep in the diffusion chain.
276
+ sample (`torch.Tensor`):
277
+ A current instance of a sample created by the diffusion process.
278
+
279
+ Returns:
280
+ `torch.Tensor`:
281
+ The converted model output.
282
+ """
283
+ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
284
+ if sample is None:
285
+ if len(args) > 1:
286
+ sample = args[1]
287
+ else:
288
+ raise ValueError("missing `sample` as a required keyword argument")
289
+ if timestep is not None:
290
+ deprecate(
291
+ "timesteps",
292
+ "1.0.0",
293
+ "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
294
+ )
295
+
296
+ sigma = self.sigmas[self.step_index]
297
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
298
+
299
+ if self.predict_x0:
300
+ if self.config.prediction_type == "flow_prediction":
301
+ sigma_t = self.sigmas[self.step_index]
302
+ x0_pred = sample - sigma_t * model_output
303
+ else:
304
+ raise ValueError(
305
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
306
+ " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
307
+ )
308
+
309
+ if self.config.thresholding:
310
+ x0_pred = self._threshold_sample(x0_pred)
311
+
312
+ return x0_pred
313
+ else:
314
+ if self.config.prediction_type == "flow_prediction":
315
+ sigma_t = self.sigmas[self.step_index]
316
+ epsilon = sample - (1 - sigma_t) * model_output
317
+ else:
318
+ raise ValueError(
319
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
320
+ " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
321
+ )
322
+
323
+ if self.config.thresholding:
324
+ sigma_t = self.sigmas[self.step_index]
325
+ x0_pred = sample - sigma_t * model_output
326
+ x0_pred = self._threshold_sample(x0_pred)
327
+ epsilon = model_output + x0_pred
328
+
329
+ return epsilon
330
+
331
+ def multistep_uni_p_bh_update(
332
+ self,
333
+ model_output: torch.Tensor,
334
+ *args,
335
+ sample: torch.Tensor = None,
336
+ order: int = None, # pyright: ignore
337
+ **kwargs,
338
+ ) -> torch.Tensor:
339
+ """
340
+ One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if it is specified.
341
+
342
+ Args:
343
+ model_output (`torch.Tensor`):
344
+ The direct output from the learned diffusion model at the current timestep.
345
+ prev_timestep (`int`):
346
+ The previous discrete timestep in the diffusion chain.
347
+ sample (`torch.Tensor`):
348
+ A current instance of a sample created by the diffusion process.
349
+ order (`int`):
350
+ The order of UniP at this timestep (corresponds to the *p* in UniPC-p).
351
+
352
+ Returns:
353
+ `torch.Tensor`:
354
+ The sample tensor at the previous timestep.
355
+ """
356
+ prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None)
357
+ if sample is None:
358
+ if len(args) > 1:
359
+ sample = args[1]
360
+ else:
361
+ raise ValueError(" missing `sample` as a required keyward argument")
362
+ if order is None:
363
+ if len(args) > 2:
364
+ order = args[2]
365
+ else:
366
+ raise ValueError(" missing `order` as a required keyward argument")
367
+ if prev_timestep is not None:
368
+ deprecate(
369
+ "prev_timestep",
370
+ "1.0.0",
371
+ "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
372
+ )
373
+ model_output_list = self.model_outputs
374
+
375
+ s0 = self.timestep_list[-1]
376
+ m0 = model_output_list[-1]
377
+ x = sample
378
+
379
+ if self.solver_p:
380
+ x_t = self.solver_p.step(model_output, s0, x).prev_sample
381
+ return x_t
382
+
383
+ sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] # pyright: ignore
384
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
385
+ alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
386
+
387
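+ # lambda = log(alpha) - log(sigma) is the half log-SNR; the step size h below is measured in this lambda space.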
+ lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
388
+ lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
389
+
390
+ h = lambda_t - lambda_s0
391
+ device = sample.device
392
+
393
+ rks = []
394
+ D1s = []
395
+ for i in range(1, order):
396
+ si = self.step_index - i # pyright: ignore
397
+ mi = model_output_list[-(i + 1)]
398
+ alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
399
+ lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
400
+ rk = (lambda_si - lambda_s0) / h
401
+ rks.append(rk)
402
+ D1s.append((mi - m0) / rk) # pyright: ignore
403
+
404
+ rks.append(1.0)
405
+ rks = torch.tensor(rks, device=device)
406
+
407
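+ # Assemble the small linear system R @ rho = b over the ratios r_k; its solution gives the weights that combine the stored differences D1s (the B(h) expansion of UniPC).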
+ R = []
408
+ b = []
409
+
410
+ hh = -h if self.predict_x0 else h
411
+ h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1
412
+ h_phi_k = h_phi_1 / hh - 1
413
+
414
+ factorial_i = 1
415
+
416
+ if self.config.solver_type == "bh1":
417
+ B_h = hh
418
+ elif self.config.solver_type == "bh2":
419
+ B_h = torch.expm1(hh)
420
+ else:
421
+ raise NotImplementedError()
422
+
423
+ for i in range(1, order + 1):
424
+ R.append(torch.pow(rks, i - 1))
425
+ b.append(h_phi_k * factorial_i / B_h)
426
+ factorial_i *= i + 1
427
+ h_phi_k = h_phi_k / hh - 1 / factorial_i
428
+
429
+ R = torch.stack(R)
430
+ b = torch.tensor(b, device=device)
431
+
432
+ if len(D1s) > 0:
433
+ D1s = torch.stack(D1s, dim=1) # (B, K, C, ...)
434
+ # for order 2, we use a simplified version
435
+ if order == 2:
436
+ rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device)
437
+ else:
438
+ rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1]).to(device).to(x.dtype)
439
+ else:
440
+ D1s = None
441
+
442
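+ # UniP predictor update: advance the sample using the latest model output m0 plus the higher-order correction formed from the weighted differences D1s.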
+ if self.predict_x0:
443
+ x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
444
+ if D1s is not None:
445
+ pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s) # pyright: ignore
446
+ else:
447
+ pred_res = 0
448
+ x_t = x_t_ - alpha_t * B_h * pred_res
449
+ else:
450
+ x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
451
+ if D1s is not None:
452
+ pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s) # pyright: ignore
453
+ else:
454
+ pred_res = 0
455
+ x_t = x_t_ - sigma_t * B_h * pred_res
456
+
457
+ x_t = x_t.to(x.dtype)
458
+ return x_t
459
+
460
+ def multistep_uni_c_bh_update(
461
+ self,
462
+ this_model_output: torch.Tensor,
463
+ *args,
464
+ last_sample: torch.Tensor = None,
465
+ this_sample: torch.Tensor = None,
466
+ order: int = None, # pyright: ignore
467
+ **kwargs,
468
+ ) -> torch.Tensor:
469
+ """
470
+ One step for the UniC (B(h) version).
471
+
472
+ Args:
473
+ this_model_output (`torch.Tensor`):
474
+ The model outputs at `x_t`.
475
+ this_timestep (`int`):
476
+ The current timestep `t`.
477
+ last_sample (`torch.Tensor`):
478
+ The generated sample before the last predictor `x_{t-1}`.
479
+ this_sample (`torch.Tensor`):
480
+ The generated sample after the last predictor `x_{t}`.
481
+ order (`int`):
482
+ The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`.
483
+
484
+ Returns:
485
+ `torch.Tensor`:
486
+ The corrected sample tensor at the current timestep.
487
+ """
488
+ this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None)
489
+ if last_sample is None:
490
+ if len(args) > 1:
491
+ last_sample = args[1]
492
+ else:
493
+ raise ValueError(" missing`last_sample` as a required keyward argument")
494
+ if this_sample is None:
495
+ if len(args) > 2:
496
+ this_sample = args[2]
497
+ else:
498
+ raise ValueError(" missing`this_sample` as a required keyward argument")
499
+ if order is None:
500
+ if len(args) > 3:
501
+ order = args[3]
502
+ else:
503
+ raise ValueError(" missing`order` as a required keyward argument")
504
+ if this_timestep is not None:
505
+ deprecate(
506
+ "this_timestep",
507
+ "1.0.0",
508
+ "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
509
+ )
510
+
511
+ model_output_list = self.model_outputs
512
+
513
+ m0 = model_output_list[-1]
514
+ x = last_sample
515
+ x_t = this_sample
516
+ model_t = this_model_output
517
+
518
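+ # The corrector refines the result over the interval [sigma_{i-1}, sigma_i] that the previous predictor step just crossed.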
+ sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[self.step_index - 1] # pyright: ignore
519
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
520
+ alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
521
+
522
+ lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
523
+ lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
524
+
525
+ h = lambda_t - lambda_s0
526
+ device = this_sample.device
527
+
528
+ rks = []
529
+ D1s = []
530
+ for i in range(1, order):
531
+ si = self.step_index - (i + 1) # pyright: ignore
532
+ mi = model_output_list[-(i + 1)]
533
+ alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
534
+ lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
535
+ rk = (lambda_si - lambda_s0) / h
536
+ rks.append(rk)
537
+ D1s.append((mi - m0) / rk) # pyright: ignore
538
+
539
+ rks.append(1.0)
540
+ rks = torch.tensor(rks, device=device)
541
+
542
+ R = []
543
+ b = []
544
+
545
+ hh = -h if self.predict_x0 else h
546
+ h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1
547
+ h_phi_k = h_phi_1 / hh - 1
548
+
549
+ factorial_i = 1
550
+
551
+ if self.config.solver_type == "bh1":
552
+ B_h = hh
553
+ elif self.config.solver_type == "bh2":
554
+ B_h = torch.expm1(hh)
555
+ else:
556
+ raise NotImplementedError()
557
+
558
+ for i in range(1, order + 1):
559
+ R.append(torch.pow(rks, i - 1))
560
+ b.append(h_phi_k * factorial_i / B_h)
561
+ factorial_i *= i + 1
562
+ h_phi_k = h_phi_k / hh - 1 / factorial_i
563
+
564
+ R = torch.stack(R)
565
+ b = torch.tensor(b, device=device)
566
+
567
+ if len(D1s) > 0:
568
+ D1s = torch.stack(D1s, dim=1)
569
+ else:
570
+ D1s = None
571
+
572
+ # for order 1, we use a simplified version
573
+ if order == 1:
574
+ rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
575
+ else:
576
+ rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype)
577
+
578
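+ # UniC corrector update: same structure as the predictor, but it also uses the fresh model output at x_t through D1_t = model_t - m0.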
+ if self.predict_x0:
579
+ x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
580
+ if D1s is not None:
581
+ corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
582
+ else:
583
+ corr_res = 0
584
+ D1_t = model_t - m0
585
+ x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
586
+ else:
587
+ x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
588
+ if D1s is not None:
589
+ corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
590
+ else:
591
+ corr_res = 0
592
+ D1_t = model_t - m0
593
+ x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
594
+ x_t = x_t.to(x.dtype)
595
+ return x_t
596
+
597
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
598
+ if schedule_timesteps is None:
599
+ schedule_timesteps = self.timesteps
600
+
601
+ indices = (schedule_timesteps == timestep).nonzero()
602
+
603
+ # The sigma index that is taken for the **very** first `step`
604
+ # is always the second index (or the last index if there is only 1)
605
+ # This way we can ensure we don't accidentally skip a sigma in
606
+ # case we start in the middle of the denoising schedule (e.g. for image-to-image)
607
+ pos = 1 if len(indices) > 1 else 0
608
+
609
+ return indices[pos].item()
610
+
611
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
612
+ def _init_step_index(self, timestep):
613
+ """
614
+ Initialize the step_index counter for the scheduler.
615
+ """
616
+
617
+ if self.begin_index is None:
618
+ if isinstance(timestep, torch.Tensor):
619
+ timestep = timestep.to(self.timesteps.device)
620
+ self._step_index = self.index_for_timestep(timestep)
621
+ else:
622
+ self._step_index = self._begin_index
623
+
624
+ def step(
625
+ self,
626
+ model_output: torch.Tensor,
627
+ timestep: Union[int, torch.Tensor],
628
+ sample: torch.Tensor,
629
+ return_dict: bool = True,
630
+ generator=None,
631
+ ) -> Union[SchedulerOutput, Tuple]:
632
+ """
633
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
634
+ the multistep UniPC.
635
+
636
+ Args:
637
+ model_output (`torch.Tensor`):
638
+ The direct output from the learned diffusion model.
639
+ timestep (`int`):
640
+ The current discrete timestep in the diffusion chain.
641
+ sample (`torch.Tensor`):
642
+ A current instance of a sample created by the diffusion process.
643
+ return_dict (`bool`):
644
+ Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
645
+
646
+ Returns:
647
+ [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
648
+ If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
649
+ tuple is returned where the first element is the sample tensor.
650
+
651
+ """
652
+ if self.num_inference_steps is None:
653
+ raise ValueError(
654
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
655
+ )
656
+
657
+ if self.step_index is None:
658
+ self._init_step_index(timestep)
659
+
660
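+ # From the second step onward (unless disabled), first correct the previous prediction with the UniC update, using the model output just computed at the current sample.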
+ use_corrector = (
661
+ self.step_index > 0
662
+ and self.step_index - 1 not in self.disable_corrector
663
+ and self.last_sample is not None # pyright: ignore
664
+ )
665
+
666
+ model_output_convert = self.convert_model_output(model_output, sample=sample)
667
+ if use_corrector:
668
+ sample = self.multistep_uni_c_bh_update(
669
+ this_model_output=model_output_convert,
670
+ last_sample=self.last_sample,
671
+ this_sample=sample,
672
+ order=self.this_order,
673
+ )
674
+
675
+ for i in range(self.config.solver_order - 1):
676
+ self.model_outputs[i] = self.model_outputs[i + 1]
677
+ self.timestep_list[i] = self.timestep_list[i + 1]
678
+
679
+ self.model_outputs[-1] = model_output_convert
680
+ self.timestep_list[-1] = timestep # pyright: ignore
681
+
682
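+ # Optionally lower the solver order over the final steps; this stabilizes sampling when only a few inference steps are used.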
+ if self.config.lower_order_final:
683
+ this_order = min(self.config.solver_order, len(self.timesteps) - self.step_index) # pyright: ignore
684
+ else:
685
+ this_order = self.config.solver_order
686
+
687
+ self.this_order = min(this_order, self.lower_order_nums + 1) # warmup for multistep
688
+ assert self.this_order > 0
689
+
690
+ self.last_sample = sample
691
+ prev_sample = self.multistep_uni_p_bh_update(
692
+ model_output=model_output, # pass the original non-converted model output, in case solver-p is used
693
+ sample=sample,
694
+ order=self.this_order,
695
+ )
696
+
697
+ if self.lower_order_nums < self.config.solver_order:
698
+ self.lower_order_nums += 1
699
+
700
+ # upon completion increase step index by one
701
+ self._step_index += 1 # pyright: ignore
702
+
703
+ if not return_dict:
704
+ return (prev_sample,)
705
+
706
+ return SchedulerOutput(prev_sample=prev_sample)
707
+
708
+ def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
709
+ """
710
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
711
+ current timestep.
712
+
713
+ Args:
714
+ sample (`torch.Tensor`):
715
+ The input sample.
716
+
717
+ Returns:
718
+ `torch.Tensor`:
719
+ A scaled input sample.
720
+ """
721
+ return sample
722
+
723
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
724
+ def add_noise(
725
+ self,
726
+ original_samples: torch.Tensor,
727
+ noise: torch.Tensor,
728
+ timesteps: torch.IntTensor,
729
+ ) -> torch.Tensor:
730
+ # Make sure sigmas and timesteps have the same device and dtype as original_samples
731
+ sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
732
+ if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
733
+ # mps does not support float64
734
+ schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
735
+ timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
736
+ else:
737
+ schedule_timesteps = self.timesteps.to(original_samples.device)
738
+ timesteps = timesteps.to(original_samples.device)
739
+
740
+ # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
741
+ if self.begin_index is None:
742
+ step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
743
+ elif self.step_index is not None:
744
+ # add_noise is called after first denoising step (for inpainting)
745
+ step_indices = [self.step_index] * timesteps.shape[0]
746
+ else:
747
+ # add_noise is called before the first denoising step to create the initial latent (img2img)
748
+ step_indices = [self.begin_index] * timesteps.shape[0]
749
+
750
+ sigma = sigmas[step_indices].flatten()
751
+ while len(sigma.shape) < len(original_samples.shape):
752
+ sigma = sigma.unsqueeze(-1)
753
+
754
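+ # Flow-matching forward process: x_t = (1 - sigma) * x_0 + sigma * noise.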
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
755
+ noisy_samples = alpha_t * original_samples + sigma_t * noise
756
+ return noisy_samples
757
+
758
+ def __len__(self):
759
+ return self.config.num_train_timesteps