yanboding committed (verified)
Commit 30a0a93
1 Parent(s): 7b12bef

Upload 32 files

__pycache__/draw_pose.cpython-313.pyc ADDED
Binary file (5.18 kB).
__pycache__/inference_engine.cpython-313.pyc ADDED
Binary file (7.56 kB).
__pycache__/motion_extractor.cpython-313.pyc ADDED
Binary file (2.97 kB).
__pycache__/utils.cpython-313.pyc ADDED
Binary file (4.8 kB).
app.py ADDED
@@ -0,0 +1,103 @@
import os
import traceback
import torch
import gradio as gr
import cv2
from PIL import Image
from inference_engine import run_inference
from motion_extractor import extract_pkl_from_video

device = "cuda" if torch.cuda.is_available() else "cpu"

def full_pipeline(video_file, ref_image=None, width=512, height=512, steps=50, scale=3.0, seed=6666):
    # 1. Extract the motion pkl
    video_path = video_file.name
    motion_pkl_path = extract_pkl_from_video(video_path)
    gr.Info("⏳ Extract motion finished and begin animation...", visible=True)

    # 2. Process the reference image (optional)
    if ref_image is not None:
        ref_path = "temp_ref.png"
        ref_image.save(ref_path)
    else:
        ref_path = ""

    # 3. Run inference
    output_path = run_inference(
        device,
        motion_pkl_path,
        ref_path,
        dst_width=width,
        dst_height=height,
        num_inference_steps=steps,
        guidance_scale=scale,
        seed=seed,
    )

    return output_path


def run_pipeline_with_feedback(video_file, ref_image, width, height, steps, scale, seed):
    try:
        if video_file is None:
            raise gr.Error("Please upload a dancing video (.mp4/.mov/.avi).")
        # Show a progress hint
        gr.Info("⏳ Processing... Please wait several minutes.", visible=True)
        result = full_pipeline(video_file, ref_image, width, height, steps, scale, seed)
        gr.Info("✅ Inference done, please enjoy it!", visible=True)
        return result
    except Exception as e:
        traceback.print_exc()
        gr.Warning("⚠️ Inference failed: " + str(e))
        return None

# Build the UI
with gr.Blocks(title="MTVCrafter Inference Demo") as demo:
    gr.Markdown(
        """
        # 🎨💃 MTVCrafter Inference Demo

        💡 **Tip:** Upload a dancing video in **MP4/MOV/AVI** format, and optionally a reference image (e.g., PNG or JPG).
        This demo will extract human motion from the input video and animate the reference image accordingly.
        If no reference image is provided, the **first frame** of the video will be used as the reference.

        🎞️ **Note:** The generated output video will contain exactly **49 frames**.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.File(label="📹 Input Video (Required)", file_types=[".mp4", ".mov", ".avi"])
            video_preview = gr.Video(label="👀 Preview of Uploaded Video", height=280)  # fixed height to keep the two columns aligned

            def show_video_preview(video_file):
                return video_file.name if video_file else None

            video_input.change(fn=show_video_preview, inputs=video_input, outputs=video_preview)

        with gr.Column(scale=1):
            ref_image = gr.Image(type="pil", label="🖼️ Reference Image (Optional)", height=538)

    with gr.Accordion("⚙️ Advanced Settings", open=False):
        with gr.Row():
            width = gr.Slider(384, 1024, value=512, step=16, label="Output Width")
            height = gr.Slider(384, 1024, value=512, step=16, label="Output Height")
        with gr.Row():
            steps = gr.Slider(20, 100, value=50, step=5, label="Inference Steps")
            scale = gr.Slider(0.0, 10.0, value=3.0, step=0.25, label="Guidance Scale")
            seed = gr.Number(value=6666, label="Random Seed")

    with gr.Row(scale=1):
        output_video = gr.Video(label="🎬 Generated Video", interactive=False)

    run_btn = gr.Button("🚀 Run MTVCrafter", variant="primary")

    run_btn.click(
        fn=run_pipeline_with_feedback,
        inputs=[video_input, ref_image, width, height, steps, scale, seed],
        outputs=output_video,
    )

if __name__ == "__main__":
    os.environ["HF_ENDPOINT"] = "https://hf-mirror.com/"
    os.environ["NO_PROXY"] = "localhost,127.0.0.1/8,::1"
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
data/mean.npy ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ababeaabf5ac096ce7c7714ada14aa1de8355c0016de25695be611d51285141
size 416
data/std.npy ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:650e46902a0878e6947be401e4e1995e54a8fd407f2be3ded0dda62bda99a9b3
size 416
draw_pose.py ADDED
@@ -0,0 +1,115 @@
import cv2
import math
import numpy as np
from PIL import Image


def intrinsic_matrix_from_field_of_view(imshape, fov_degrees: float = 55):  # NLF default fov_degrees is 55
    imshape = np.array(imshape)
    fov_radians = fov_degrees * np.array(np.pi / 180)
    larger_side = np.max(imshape)
    focal_length = larger_side / (np.tan(fov_radians / 2) * 2)
    # intrinsic_matrix 3*3
    return np.array([
        [focal_length, 0, imshape[1] / 2],
        [0, focal_length, imshape[0] / 2],
        [0, 0, 1],
    ])


def p3d_to_p2d(point_3d, height, width):  # point_3d: n*1024*3
    camera_matrix = intrinsic_matrix_from_field_of_view((height, width))
    camera_matrix = np.expand_dims(camera_matrix, axis=0)
    camera_matrix = np.expand_dims(camera_matrix, axis=0)  # 1*1*3*3
    point_3d = np.expand_dims(point_3d, axis=-1)  # n*1024*3*1
    point_2d = (camera_matrix @ point_3d).squeeze(-1)
    point_2d[:, :, :2] = point_2d[:, :, :2] / point_2d[:, :, 2:3]
    return point_2d[:, :, :]  # n*1024*2


def get_pose_images(smpl_data, offset):
    pose_images = []
    for data in smpl_data:
        if isinstance(data, np.ndarray):
            joints3d = data
        else:
            joints3d = data.numpy()
        canvas = np.zeros(shape=(offset[0], offset[1], 3), dtype=np.uint8)
        joints3d = p3d_to_p2d(joints3d, offset[0], offset[1])
        canvas = draw_3d_points(canvas, joints3d[0], stickwidth=int(offset[1] / 350))
        pose_images.append(Image.fromarray(canvas))
    return pose_images


def draw_3d_points(canvas, points, stickwidth=2, r=2, draw_line=True):
    colors = [
        [255, 0, 0],    # 0
        [0, 255, 0],    # 1
        [0, 0, 255],    # 2
        [255, 0, 255],  # 3
        [255, 255, 0],  # 4
        [85, 255, 0],   # 5
        [0, 75, 255],   # 6
        [0, 255, 85],   # 7
        [0, 255, 170],  # 8
        [170, 0, 255],  # 9
        [85, 0, 255],   # 10
        [0, 85, 255],   # 11
        [0, 255, 255],  # 12
        [85, 0, 255],   # 13
        [170, 0, 255],  # 14
        [255, 0, 255],  # 15
        [255, 0, 170],  # 16
        [255, 0, 85],   # 17
    ]
    connections = [
        [15, 12], [12, 16], [16, 18], [18, 20], [20, 22],
        [12, 17], [17, 19], [19, 21],
        [21, 23], [12, 9], [9, 6],
        [6, 3], [3, 0], [0, 1],
        [1, 4], [4, 7], [7, 10], [0, 2], [2, 5], [5, 8], [8, 11]
    ]
    connection_colors = [
        [255, 0, 0],    # 0
        [0, 255, 0],    # 1
        [0, 0, 255],    # 2
        [255, 255, 0],  # 3
        [255, 0, 255],  # 4
        [0, 255, 0],    # 5
        [0, 85, 255],   # 6
        [255, 175, 0],  # 7
        [0, 0, 255],    # 8
        [255, 85, 0],   # 9
        [0, 255, 85],   # 10
        [255, 0, 255],  # 11
        [255, 0, 0],    # 12
        [0, 175, 255],  # 13
        [255, 255, 0],  # 14
        [0, 0, 255],    # 15
        [0, 255, 0],    # 16
    ]

    # draw points
    for i in range(len(points)):
        x, y = points[i][0:2]
        x, y = int(x), int(y)
        if i == 13 or i == 14:
            continue
        cv2.circle(canvas, (x, y), r, colors[i % 17], thickness=-1)

    # draw limbs
    if draw_line:
        for i in range(len(connections)):
            point1_idx, point2_idx = connections[i][0:2]
            point1 = points[point1_idx]
            point2 = points[point2_idx]
            Y = [point2[0], point1[0]]
            X = [point2[1], point1[1]]
            mX = int(np.mean(X))
            mY = int(np.mean(Y))
            length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
            angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
            polygon = cv2.ellipse2Poly((mY, mX), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
            cv2.fillConvexPoly(canvas, polygon, connection_colors[i % 17])

    return canvas
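For reference, a minimal standalone sketch of how this module might be exercised; the random joints, canvas size, and output filename below are made up for illustration and are not part of the upload.

import numpy as np
from draw_pose import get_pose_images

# Hypothetical input: two frames of 24 joints placed roughly 3 m in front of the
# camera, each frame shaped (1, num_joints, 3) as get_pose_images expects.
rng = np.random.default_rng(0)
frames = [rng.standard_normal((1, 24, 3)) * 0.3 + np.array([0.0, 0.0, 3.0]) for _ in range(2)]

height, width = 512, 512                           # canvas size -> offset[0], offset[1]
images = get_pose_images(frames, (height, width, 0))
images[0].save("pose_frame0.png")                  # each element is a PIL.Image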
inference_engine.py ADDED
@@ -0,0 +1,117 @@
# inference_engine.py
import os
import torch
import decord
import imageio
from PIL import Image
from models import MTVCrafterPipeline, Encoder, VectorQuantizer, Decoder, SMPL_VQVAE
from torchvision.transforms import ToPILImage, transforms, InterpolationMode, functional as F
import numpy as np
import pickle
import copy
from draw_pose import get_pose_images
from utils import concat_images_grid, sample_video, get_sample_indexes, get_new_height_width

def run_inference(device, motion_data_path, ref_image_path='', dst_width=512, dst_height=512, num_inference_steps=50, guidance_scale=3.0, seed=6666):
    num_frames = 49
    to_pil = ToPILImage()
    normalize = transforms.Normalize([0.5], [0.5])
    pretrained_model_path = "/gemini/space/human_guozz2/dyb/models/CogVideoX"
    transformer_path = "/gemini/space/human_guozz2/dyb/models/MTVCrafter/MV-DiT/CogVideoX"
    tokenizer_path = "/gemini/space/human_guozz2/dyb/models/MTVCrafter/4DMoT/mp_rank_00_model_states.pt"

    with open(motion_data_path, 'rb') as f:
        data_list = pickle.load(f)
    if not isinstance(data_list, list):
        data_list = [data_list]

    pe_mean = np.load('data/mean.npy')
    pe_std = np.load('data/std.npy')

    pipe = MTVCrafterPipeline.from_pretrained(
        model_path=pretrained_model_path,
        transformer_model_path=transformer_path,
        torch_dtype=torch.bfloat16,
        scheduler_type='dpm',
    ).to(device)
    pipe.vae.enable_tiling()
    pipe.vae.enable_slicing()

    # load VQVAE
    state_dict = torch.load(tokenizer_path, map_location="cpu")
    motion_encoder = Encoder(in_channels=3, mid_channels=[128, 512], out_channels=3072, downsample_time=[2, 2], downsample_joint=[1, 1])
    motion_quant = VectorQuantizer(nb_code=8192, code_dim=3072, is_train=False)
    motion_decoder = Decoder(in_channels=3072, mid_channels=[512, 128], out_channels=3, upsample_rate=2.0, frame_upsample_rate=[2.0, 2.0], joint_upsample_rate=[1.0, 1.0])
    vqvae = SMPL_VQVAE(motion_encoder, motion_decoder, motion_quant).to(device)
    vqvae.load_state_dict(state_dict['module'], strict=True)

    # Only the first sample is processed here
    data = data_list[0]
    new_height, new_width = get_new_height_width(data, dst_height, dst_width)
    x1 = (new_width - dst_width) // 2
    y1 = (new_height - dst_height) // 2

    sample_indexes = get_sample_indexes(data['video_length'], num_frames, stride=1)
    input_images = sample_video(decord.VideoReader(data['video_path']), sample_indexes)
    input_images = torch.from_numpy(input_images).permute(0, 3, 1, 2).contiguous()
    input_images = F.resize(input_images, (new_height, new_width), InterpolationMode.BILINEAR)
    input_images = F.crop(input_images, y1, x1, dst_height, dst_width)

    if ref_image_path != '':
        ref_image = Image.open(ref_image_path).convert("RGB")
        ref_image = torch.from_numpy(np.array(ref_image)).permute(2, 0, 1).contiguous()
        ref_images = torch.stack([ref_image.clone() for _ in range(num_frames)])
        ref_images = F.resize(ref_images, (new_height, new_width), InterpolationMode.BILINEAR)
        ref_images = F.crop(ref_images, y1, x1, dst_height, dst_width)
    else:
        ref_images = copy.deepcopy(input_images)
        frame0 = input_images[0]
        ref_images[:, :, :, :] = frame0

    try:
        smpl_poses = np.array([pose[0][0].cpu().numpy() for pose in data['pose']['joints3d_nonparam']])
        poses = smpl_poses[sample_indexes]
    except:
        poses = data['pose'][sample_indexes]
    norm_poses = torch.tensor((poses - pe_mean) / pe_std)

    offset = [data['video_height'], data['video_width'], 0]
    pose_images_before = get_pose_images(copy.deepcopy(poses), offset)
    pose_images_before = [image.resize((new_width, new_height)).crop((x1, y1, x1 + dst_width, y1 + dst_height)) for image in pose_images_before]
    input_smpl_joints = norm_poses.unsqueeze(0).to(device)
    motion_tokens, vq_loss = vqvae(input_smpl_joints, return_vq=True)
    output_motion, _ = vqvae(input_smpl_joints)
    pose_images_after = get_pose_images(output_motion[0].cpu().detach() * pe_std + pe_mean, offset)
    pose_images_after = [image.resize((new_width, new_height)).crop((x1, y1, x1 + dst_width, y1 + dst_height)) for image in pose_images_after]

    # normalize images
    input_images = input_images / 255.0
    ref_images = ref_images / 255.0
    input_images = normalize(input_images)
    ref_images = normalize(ref_images)

    # infer
    output_images = pipe(
        height=dst_height,
        width=dst_width,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        seed=seed,
        ref_images=ref_images,
        motion_embeds=motion_tokens,
        joint_mean=pe_mean,
        joint_std=pe_std,
    ).frames[0]

    # save result
    vis_images = []
    for k in range(len(output_images)):
        vis_image = [to_pil(((input_images[k] + 1) * 127.5).clamp(0, 255).to(torch.uint8)), pose_images_before[k], pose_images_after[k], output_images[k]]
        vis_image = concat_images_grid(vis_image, cols=len(vis_image), pad=2)
        vis_images.append(vis_image)

    output_path = "output.mp4"
    imageio.mimsave(output_path, vis_images, fps=15)

    return output_path
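For orientation, the fields that run_inference reads from the motion pkl can be summarized in a small sketch. The key names below are exactly the ones accessed above; the shapes, values, and single-sample layout are assumptions for illustration rather than the real output of extract_pkl_from_video.

import numpy as np

num_video_frames, num_joints = 180, 24
sample = {
    "video_path": "dance.mp4",                # clip re-read with decord.VideoReader
    "video_length": num_video_frames,         # used to pick the 49 sampled frame indices
    "video_height": 1080,                     # canvas size for the rendered pose images
    "video_width": 1920,
    # Fallback branch: a plain per-frame joint array indexable as pose[sample_indexes].
    # The primary branch instead expects a dict with a 'joints3d_nonparam' entry.
    "pose": np.zeros((num_video_frames, num_joints, 3), dtype=np.float32),
}
motion_pkl = [sample]                         # run_inference only uses the first entry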
models/__init__.py ADDED
@@ -0,0 +1,2 @@
from .dit import *
from .motion4d import *
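Both submodules are star-imported, so the package namespace is what inference_engine.py relies on. A hedged sketch of that import surface follows; models.motion4d is not part of the files shown here, so attributing the VQVAE classes to it is an assumption.

# Names that downstream code (inference_engine.py) imports from the package.
# MTVCrafterPipeline is re-exported from models.dit; Encoder, VectorQuantizer,
# Decoder and SMPL_VQVAE are assumed to be re-exported from models.motion4d.
from models import MTVCrafterPipeline, Encoder, VectorQuantizer, Decoder, SMPL_VQVAE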
models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (235 Bytes).
models/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (205 Bytes).
models/dit/__init__.py ADDED
@@ -0,0 +1,2 @@
from .mvdit_transformer import Transformer3DModel
from .pipeline_mtvcrafter import MTVCrafterPipeline
models/dit/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (333 Bytes).
models/dit/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (287 Bytes).
models/dit/__pycache__/mvdit_transformer.cpython-311.pyc ADDED
Binary file (38.1 kB).
models/dit/__pycache__/mvdit_transformer.cpython-313.pyc ADDED
Binary file (35.1 kB).
models/dit/__pycache__/pipeline_mtvcrafter.cpython-311.pyc ADDED
Binary file (39.5 kB).
models/dit/__pycache__/pipeline_mtvcrafter.cpython-313.pyc ADDED
Binary file (37 kB).
models/dit/mvdit_transformer.py ADDED
@@ -0,0 +1,758 @@
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.attention import Attention, FeedForward
from diffusers.models.attention_processor import AttentionProcessor, FusedCogVideoXAttnProcessor2_0
from diffusers.models.embeddings import TimestepEmbedding, Timesteps, get_3d_sincos_pos_embed
from diffusers.models.modeling_outputs import Transformer2DModelOutput
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.normalization import AdaLayerNorm
from diffusers.utils import is_torch_version, logging
from diffusers.utils.torch_utils import maybe_allow_in_graph

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    use_real: bool = True,
    use_real_unbind_dim: int = -1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
    tensors contain rotary embeddings and are returned as real tensors.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to apply rotary embeddings to, of shape [B, H, S, D].
        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
    """
    if use_real:
        cos, sin = freqs_cis  # [S, D]
        cos = cos[None, None]
        sin = sin[None, None]
        cos, sin = cos.to(x.device), sin.to(x.device)

        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)

        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

        return out
    else:
        # used for lumina
        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
        freqs_cis = freqs_cis.unsqueeze(2)
        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)

        return x_out.type_as(x)


class CogVideoXLayerNormZero(nn.Module):
    def __init__(
        self,
        conditioning_dim: int,
        embedding_dim: int,
        elementwise_affine: bool = True,
        eps: float = 1e-5,
        bias: bool = True,
    ) -> None:
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(conditioning_dim, 6 * embedding_dim, bias=bias)
        self.norm = nn.LayerNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine)

    def forward(
        self, hidden_states: torch.Tensor, temb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        shift, scale, gate, _, _, _ = self.linear(self.silu(temb)).chunk(6, dim=1)
        hidden_states = self.norm(hidden_states) * (1 + scale)[:, None, :] + shift[:, None, :]
        return hidden_states, gate[:, None, :]


class CogVideoXAttnProcessor1_0:
    r"""Processor for implementing scaled dot-product attention for the
    CogVideoX model.

    It applies a rotary embedding on query and key vectors, but does not include spatial normalization.
    """

    def __init__(self):
        if not hasattr(F, 'scaled_dot_product_attention'):
            raise ImportError('CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.')

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
        motion_rotary_emb: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:

        batch_size, sequence_length, _ = hidden_states.shape

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        query = attn.to_q(hidden_states)
        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)  # [batch_size, heads, seq_len, dim]
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # Apply RoPE if needed
        if image_rotary_emb is not None:
            query = apply_rotary_emb(query, image_rotary_emb)
        if motion_rotary_emb is not None:
            key = apply_rotary_emb(key, motion_rotary_emb)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        return hidden_states


class CogVideoXAttnProcessor2_0:
    r"""Processor for implementing scaled dot-product attention for the
    CogVideoX model.

    It applies a rotary embedding on query and key vectors, but does not include spatial normalization.
    """

    def __init__(self):
        if not hasattr(F, 'scaled_dot_product_attention'):
            raise ImportError('CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.')

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
        motion_rotary_emb: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:

        batch_size, sequence_length, _ = hidden_states.shape

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)  # [batch_size, heads, seq_len, dim]
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # Apply RoPE if needed
        if image_rotary_emb is not None:
            image_seq_length = image_rotary_emb[0].shape[0]
            query[:, :, :image_seq_length] = apply_rotary_emb(query[:, :, :image_seq_length], image_rotary_emb)
            if motion_rotary_emb is not None:
                query[:, :, image_seq_length:] = apply_rotary_emb(query[:, :, image_seq_length:], motion_rotary_emb)
            if not attn.is_cross_attention:
                key[:, :, :image_seq_length] = apply_rotary_emb(key[:, :, :image_seq_length], image_rotary_emb)
                if motion_rotary_emb is not None:
                    key[:, :, image_seq_length:] = apply_rotary_emb(key[:, :, image_seq_length:], motion_rotary_emb)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        return hidden_states


class CogVideoXPatchEmbed(nn.Module):
    def __init__(
        self,
        patch_size: int = 2,
        in_channels: int = 16,
        embed_dim: int = 1920,
        text_embed_dim: int = 4096,
        bias: bool = True,
        sample_width: int = 90,
        sample_height: int = 60,
        sample_frames: int = 49,
        temporal_compression_ratio: int = 4,
        max_text_seq_length: int = 226,
        spatial_interpolation_scale: float = 1.875,
        temporal_interpolation_scale: float = 1.0,
        use_positional_embeddings: bool = True,
    ) -> None:
        super().__init__()

        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.sample_height = sample_height
        self.sample_width = sample_width
        self.sample_frames = sample_frames
        self.temporal_compression_ratio = temporal_compression_ratio
        self.max_text_seq_length = max_text_seq_length
        self.spatial_interpolation_scale = spatial_interpolation_scale
        self.temporal_interpolation_scale = temporal_interpolation_scale
        self.use_positional_embeddings = use_positional_embeddings

        self.proj = nn.Conv2d(
            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
        )
        self.text_proj = nn.Linear(text_embed_dim, embed_dim)

        if use_positional_embeddings:
            pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
            self.register_buffer('pos_embedding', pos_embedding, persistent=False)

    def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor:
        post_patch_height = sample_height // self.patch_size
        post_patch_width = sample_width // self.patch_size
        post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
        num_patches = post_patch_height * post_patch_width * post_time_compression_frames

        pos_embedding = get_3d_sincos_pos_embed(
            self.embed_dim,
            (post_patch_width, post_patch_height),
            post_time_compression_frames,
            self.spatial_interpolation_scale,
            self.temporal_interpolation_scale,
        )
        pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1)
        joint_pos_embedding = torch.zeros(
            1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False
        )
        joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding)

        return joint_pos_embedding

    def forward(self, image_embeds: torch.Tensor):
        r"""
        Args:
            image_embeds (`torch.Tensor`):
                Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
        """
        batch, num_frames, channels, height, width = image_embeds.shape
        image_embeds = image_embeds.reshape(-1, channels, height, width)
        image_embeds = self.proj(image_embeds)  # [2*7, 3072, h/8/2, w/8/2]
        image_embeds = image_embeds.view(batch, num_frames, *image_embeds.shape[1:])
        image_embeds = image_embeds.flatten(3).transpose(2, 3)  # [batch, num_frames, height x width, channels]
        image_embeds = image_embeds.flatten(1, 2).contiguous()  # [batch, num_frames x height x width, channels]

        if self.use_positional_embeddings:
            pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
            if (
                self.sample_height != height
                or self.sample_width != width
                or self.sample_frames != pre_time_compression_frames
            ):
                pos_embedding = self._get_positional_embeddings(height, width, pre_time_compression_frames)
                pos_embedding = pos_embedding.to(image_embeds.device, dtype=image_embeds.dtype)
            else:
                pos_embedding = self.pos_embedding

            # the buffer also reserves room for text tokens; only the image part is added here
            image_embeds = image_embeds + pos_embedding[:, self.max_text_seq_length :]

        return image_embeds


@maybe_allow_in_graph
class CogVideoXBlock(nn.Module):
    r"""
    Parameters:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        time_embed_dim (`int`):
            The number of channels in timestep embedding.
        dropout (`float`, defaults to `0.0`):
            The dropout probability to use.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to be used in feed-forward.
        attention_bias (`bool`, defaults to `False`):
            Whether or not to use bias in attention projection layers.
        qk_norm (`bool`, defaults to `True`):
            Whether or not to use normalization after query and key projections in Attention.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_eps (`float`, defaults to `1e-5`):
            Epsilon value for normalization layers.
        final_dropout (`bool` defaults to `False`):
            Whether to apply a final dropout after the last feed-forward layer.
        ff_inner_dim (`int`, *optional*, defaults to `None`):
            Custom hidden dimension of Feed-forward layer. If not provided, `4 * dim` is used.
        ff_bias (`bool`, defaults to `True`):
            Whether or not to use bias in Feed-forward layer.
        attention_out_bias (`bool`, defaults to `True`):
            Whether or not to use bias in Attention output projection layer.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        time_embed_dim: int,
        motion_dim: int,
        dropout: float = 0.0,
        activation_fn: str = 'gelu-approximate',
        attention_bias: bool = False,
        qk_norm: bool = True,
        norm_elementwise_affine: bool = True,
        norm_eps: float = 1e-5,
        final_dropout: bool = True,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
        cross_attention: bool = False,
    ):
        super().__init__()

        self.is_cross_attention = cross_attention

        if self.is_cross_attention:
            self.attn0 = Attention(
                query_dim=dim,
                cross_attention_dim=dim,
                dim_head=attention_head_dim,
                heads=num_attention_heads,
                qk_norm='layer_norm' if qk_norm else None,
                eps=1e-6,
                bias=attention_bias,
                out_bias=attention_out_bias,
                processor=CogVideoXAttnProcessor1_0(),
            )

        # 1. Self Attention
        self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)

        self.attn1 = Attention(
            query_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            qk_norm='layer_norm' if qk_norm else None,
            eps=1e-6,
            bias=attention_bias,
            out_bias=attention_out_bias,
            processor=CogVideoXAttnProcessor2_0(),
        )

        # 2. Feed Forward
        self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)

        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        temb: torch.Tensor,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        motion_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:

        # norm & modulate
        norm_hidden_states, gate_msa = self.norm1(hidden_states, temb)

        # self attention
        attn_hidden_states = self.attn1(
            hidden_states=norm_hidden_states,
            image_rotary_emb=image_rotary_emb,
        )
        hidden_states = hidden_states + gate_msa * attn_hidden_states

        if self.is_cross_attention:
            cross_attn_hidden_states = self.attn0(
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                image_rotary_emb=image_rotary_emb,
                motion_rotary_emb=motion_rotary_emb,
            )
            hidden_states = hidden_states + cross_attn_hidden_states

        # norm & modulate
        norm_hidden_states, gate_ff = self.norm2(hidden_states, temb)

        # feed-forward
        ff_output = self.ff(norm_hidden_states)

        hidden_states = hidden_states + gate_ff * ff_output

        return hidden_states


class Transformer3DModel(ModelMixin, ConfigMixin):
    """
    Parameters:
        num_attention_heads (`int`, defaults to `30`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `64`):
            The number of channels in each head.
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `16`):
            The number of channels in the output.
        flip_sin_to_cos (`bool`, defaults to `True`):
            Whether to flip the sin to cos in the time embedding.
        time_embed_dim (`int`, defaults to `512`):
            Output dimension of timestep embeddings.
        text_embed_dim (`int`, defaults to `4096`):
            Input dimension of text embeddings from the text encoder.
        num_layers (`int`, defaults to `30`):
            The number of layers of Transformer blocks to use.
        dropout (`float`, defaults to `0.0`):
            The dropout probability to use.
        attention_bias (`bool`, defaults to `True`):
            Whether or not to use bias in the attention projection layers.
        sample_width (`int`, defaults to `90`):
            The width of the input latents.
        sample_height (`int`, defaults to `60`):
            The height of the input latents.
        sample_frames (`int`, defaults to `49`):
            The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49
            instead of 13 because CogVideoX processed 13 latent frames at once in its default and recommended settings,
            but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with
            K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1).
        patch_size (`int`, defaults to `2`):
            The size of the patches to use in the patch embedding layer.
        temporal_compression_ratio (`int`, defaults to `4`):
            The compression ratio across the temporal dimension. See documentation for `sample_frames`.
        max_text_seq_length (`int`, defaults to `226`):
            The maximum sequence length of the input text embeddings.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to use in feed-forward.
        timestep_activation_fn (`str`, defaults to `"silu"`):
            Activation function to use when generating the timestep embeddings.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether or not to use elementwise affine in normalization layers.
        norm_eps (`float`, defaults to `1e-5`):
            The epsilon value to use in normalization layers.
        spatial_interpolation_scale (`float`, defaults to `1.875`):
            Scaling factor to apply in 3D positional embeddings across spatial dimensions.
        temporal_interpolation_scale (`float`, defaults to `1.0`):
            Scaling factor to apply in 3D positional embeddings across temporal dimensions.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        num_attention_heads: int = 30,
        attention_head_dim: int = 64,
        in_channels: int = 16,
        out_channels: Optional[int] = 16,
        flip_sin_to_cos: bool = True,
        freq_shift: int = 0,
        time_embed_dim: int = 512,
        text_embed_dim: int = 4096,
        motion_dim: int = 168,
        num_layers: int = 30,
        dropout: float = 0.0,
        attention_bias: bool = True,
        sample_width: int = 90,
        sample_height: int = 60,
        sample_frames: int = 49,
        patch_size: int = 2,
        temporal_compression_ratio: int = 4,
        max_text_seq_length: int = 226,
        activation_fn: str = 'gelu-approximate',
        timestep_activation_fn: str = 'silu',
        norm_elementwise_affine: bool = True,
        norm_eps: float = 1e-5,
        spatial_interpolation_scale: float = 1.875,
        temporal_interpolation_scale: float = 1.0,
        use_rotary_positional_embeddings: bool = False,
    ):
        super().__init__()
        inner_dim = num_attention_heads * attention_head_dim  # 48 * 64 = 3072

        self.unconditional_motion_token = torch.nn.Parameter(torch.randn(312, 3072))
        print(self.unconditional_motion_token[0])

        # 1. Patch embedding
        self.patch_embed = CogVideoXPatchEmbed(
            patch_size=patch_size,
            in_channels=in_channels,
            embed_dim=inner_dim,
            text_embed_dim=text_embed_dim,
            bias=True,
            sample_width=sample_width,
            sample_height=sample_height,
            sample_frames=sample_frames,
            temporal_compression_ratio=temporal_compression_ratio,
            max_text_seq_length=max_text_seq_length,
            spatial_interpolation_scale=spatial_interpolation_scale,
            temporal_interpolation_scale=temporal_interpolation_scale,
            use_positional_embeddings=not use_rotary_positional_embeddings,
        )
        self.embedding_dropout = nn.Dropout(dropout)

        # 2. Time embeddings
        self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
        self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)  # 3072 --> 512

        # 3. Transformer blocks
        self.transformer_blocks = nn.ModuleList(
            [
                CogVideoXBlock(
                    dim=inner_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    time_embed_dim=time_embed_dim,
                    motion_dim=motion_dim,
                    dropout=dropout,
                    activation_fn=activation_fn,
                    attention_bias=attention_bias,
                    norm_elementwise_affine=norm_elementwise_affine,
                    norm_eps=norm_eps,
                    cross_attention=True,
                )
                for _ in range(num_layers)
            ]
        )
        self.norm_final = nn.LayerNorm(inner_dim, norm_eps, norm_elementwise_affine)

        # 4. Output blocks
        self.norm_out = AdaLayerNorm(
            embedding_dim=time_embed_dim,
            output_dim=2 * inner_dim,
            norm_elementwise_affine=norm_elementwise_affine,
            norm_eps=norm_eps,
            chunk_dim=1,
        )
        self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)

        self.gradient_checkpointing = False

    def _set_gradient_checkpointing(self, module, value=False):
        self.gradient_checkpointing = value

    @property
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by their weight names.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, 'get_processor'):
                processors[f'{name}.processor'] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f'{name}.{sub_name}', child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.
        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f'A dict of processors was passed, but the number of processors {len(processor)} does not match the'
                f' number of attention layers: {count}. Please make sure to pass {count} processor classes.'
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, 'set_processor'):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f'{name}.processor'))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f'{name}.{sub_name}', child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with
    # FusedAttnProcessor2_0->FusedCogVideoXAttnProcessor2_0
    def fuse_qkv_projections(self):
        """Enables fused QKV projections. For self-attention modules, all
        projection matrices (i.e., query, key, value) are fused. For cross-
        attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        """
        self.original_attn_processors = None

        for _, attn_processor in self.attn_processors.items():
            if 'Added' in str(attn_processor.__class__.__name__):
                raise ValueError('`fuse_qkv_projections()` is not supported for models having added KV projections.')

        self.original_attn_processors = self.attn_processors

        for module in self.modules():
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

        self.set_attn_processor(FusedCogVideoXAttnProcessor2_0())

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        """
        if self.original_attn_processors is not None:
            self.set_attn_processor(self.original_attn_processors)

    def forward(
        self,
        hidden_states: torch.Tensor,
        timestep: Union[int, float, torch.LongTensor],
        timestep_cond: Optional[torch.Tensor] = None,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        motion_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        motion_emb: Optional[torch.Tensor] = None,
        camera_emb: Optional[torch.Tensor] = None,
        need_broadcast: bool = True,
        return_dict: bool = True,
    ):
        batch_size, num_frames, channels, height, width = hidden_states.shape

        # 1. Time embedding
        timesteps = timestep
        t_emb = self.time_proj(timesteps)

        # timesteps does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=hidden_states.dtype)  # (2, 3072)
        emb = self.time_embedding(t_emb, timestep_cond)  # (2, 3072) --> (2, 512)

        # 2. Patch embedding
        hidden_states = self.patch_embed(hidden_states)  # (2, 226+9450, dim=3072)
        hidden_states = self.embedding_dropout(hidden_states)
        image_seq_length = image_rotary_emb[0].shape[0]
        motion_seq_length = motion_emb.shape[1]  # 168
        # hidden_states = hidden_states[:, motion_seq_length:]
        encoder_hidden_states = motion_emb
        # encoder_hidden_states = self.motion_proj(motion_emb)

        # 3. Transformer blocks
        for i, block in enumerate(self.transformer_blocks):
            if self.training and self.gradient_checkpointing:  # train with gradient checkpointing to save memory

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}
                hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    encoder_hidden_states,
                    emb,
                    image_rotary_emb,
                    motion_rotary_emb,
                    **ckpt_kwargs,
                )
            else:
                hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=emb,
                    image_rotary_emb=image_rotary_emb,
                    motion_rotary_emb=motion_rotary_emb,
                )

        # 4. Final block
        hidden_states = self.norm_final(hidden_states)
        hidden_states = self.norm_out(hidden_states, temb=emb)
        hidden_states = self.proj_out(hidden_states)

        # 5. Unpatchify
        p = self.config.patch_size
        output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
        output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)

        if not return_dict:
            return (output,)
        return Transformer2DModelOutput(sample=output)
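As a small sanity check on the rotary-embedding convention above, the sketch below builds a cos/sin pair with the interleaved layout that `use_real=True` expects (mirroring `get_1d_rotary_pos_embed` in the pipeline file) and applies it with `apply_rotary_emb`. The sizes are arbitrary, and the import assumes the full repository, including models.motion4d, is importable.

import torch
from models.dit.mvdit_transformer import apply_rotary_emb

B, H, S, D = 1, 2, 8, 16                       # arbitrary batch, heads, tokens, head dim
x = torch.randn(B, H, S, D)

# cos/sin tables with the interleaved layout expected when use_real=True
pos = torch.arange(S, dtype=torch.float32)
freqs = 1.0 / (10000.0 ** (torch.arange(0, D, 2, dtype=torch.float32) / D))  # [D/2]
angles = torch.outer(pos, freqs)                                             # [S, D/2]
cos = angles.cos().repeat_interleave(2, dim=1)                               # [S, D]
sin = angles.sin().repeat_interleave(2, dim=1)                               # [S, D]

out = apply_rotary_emb(x, (cos, sin))          # same shape as x, rotated per position
print(out.shape)                               # torch.Size([1, 2, 8, 16])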
models/dit/pipeline_mtvcrafter.py ADDED
@@ -0,0 +1,728 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import math
3
+ import inspect
4
+ import numpy as np
5
+ from dataclasses import dataclass
6
+ from typing import Callable, Dict, List, Optional, Tuple, Union
7
+
8
+ import torch
9
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
10
+ from diffusers.models import AutoencoderKLCogVideoX
11
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
12
+ from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
13
+ from diffusers.utils import BaseOutput, logging
14
+ from diffusers.utils.torch_utils import randn_tensor
15
+ from diffusers.video_processor import VideoProcessor
16
+ from einops import rearrange
17
+ from PIL import Image
18
+ from torchvision import transforms
19
+
20
+ from .mvdit_transformer import Transformer3DModel
21
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
22
+
23
+
24
+ def get_1d_rotary_pos_embed(
25
+ dim: int,
26
+ pos: Union[np.ndarray, int],
27
+ theta: float = 10000.0,
28
+ use_real=False,
29
+ linear_factor=1.0,
30
+ ntk_factor=1.0,
31
+ repeat_interleave_real=True,
32
+ freqs_dtype=torch.float32, # torch.float32, torch.float64 (flux)
33
+ ):
34
+ """
35
+ Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
36
+
37
+ This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end
38
+ index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64
39
+ data type.
40
+
41
+ Args:
42
+ dim (`int`): Dimension of the frequency tensor.
43
+ pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar
44
+ theta (`float`, *optional*, defaults to 10000.0):
45
+ Scaling factor for frequency computation. Defaults to 10000.0.
46
+ use_real (`bool`, *optional*):
47
+ If True, return real part and imaginary part separately. Otherwise, return complex numbers.
48
+ linear_factor (`float`, *optional*, defaults to 1.0):
49
+ Scaling factor for the context extrapolation. Defaults to 1.0.
50
+ ntk_factor (`float`, *optional*, defaults to 1.0):
51
+ Scaling factor for the NTK-Aware RoPE. Defaults to 1.0.
52
+ repeat_interleave_real (`bool`, *optional*, defaults to `True`):
53
+ If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`.
54
+ Otherwise, they are concateanted with themselves.
55
+ freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`):
56
+ the dtype of the frequency tensor.
57
+ Returns:
58
+ `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
59
+ """
60
+ assert dim % 2 == 0
61
+
62
+ if isinstance(pos, int):
63
+ pos = torch.arange(pos)
64
+ if isinstance(pos, np.ndarray):
65
+ pos = torch.from_numpy(pos) # type: ignore # [S]
66
+
67
+ theta = theta * ntk_factor
68
+ freqs = (
69
+ 1.0
70
+ / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=pos.device)[: (dim // 2)] / dim))
71
+ / linear_factor
72
+ ) # [D/2]
73
+ freqs = torch.outer(pos, freqs) # type: ignore # [S, D/2]
74
+ if use_real and repeat_interleave_real:
75
+ freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float() # [S, D]
76
+ freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float() # [S, D]
77
+ return freqs_cos, freqs_sin
78
+ elif use_real:
79
+ freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1).float() # [S, D]
80
+ freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1).float() # [S, D]
81
+ return freqs_cos, freqs_sin
82
+ else:
83
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 # [S, D/2]
84
+ return freqs_cis
85
+
86
+
87
+ def get_3d_rotary_pos_embed(
88
+ embed_dim, crops_coords, grid_size, temporal_size, theta: int = 10000, use_real: bool = True
89
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
90
+ """
91
+ RoPE for video tokens with 3D structure.
92
+
93
+ Args:
94
+ embed_dim: (`int`):
95
+ The embedding dimension size, corresponding to hidden_size_head.
96
+ crops_coords (`Tuple[int]`):
97
+ The top-left and bottom-right coordinates of the crop.
98
+ grid_size (`Tuple[int]`):
99
+ The grid size of the spatial positional embedding (height, width).
100
+ temporal_size (`int`):
101
+ The size of the temporal dimension.
102
+ theta (`float`):
103
+ Scaling factor for frequency computation.
104
+
105
+ Returns:
106
+ `torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`.
107
+ """
108
+ if use_real is not True:
109
+ raise ValueError(" `use_real = False` is not currently supported for get_3d_rotary_pos_embed")
110
+ start, stop = crops_coords
111
+ grid_size_h, grid_size_w = grid_size
112
+ grid_h = np.linspace(start[0], stop[0], grid_size_h, endpoint=False, dtype=np.float32)
113
+ grid_w = np.linspace(start[1], stop[1], grid_size_w, endpoint=False, dtype=np.float32)
114
+ grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32)
115
+
116
+ # Compute dimensions for each axis
117
+ dim_t = embed_dim // 4
118
+ dim_h = embed_dim // 8 * 3
119
+ dim_w = embed_dim // 8 * 3
120
+
121
+ # Temporal frequencies
122
+ freqs_t = get_1d_rotary_pos_embed(dim_t, grid_t, use_real=True)
123
+ # Spatial frequencies for height and width
124
+ freqs_h = get_1d_rotary_pos_embed(dim_h, grid_h, use_real=True)
125
+ freqs_w = get_1d_rotary_pos_embed(dim_w, grid_w, use_real=True)
126
+
127
+ # BroadCast and concatenate temporal and spaial frequencie (height and width) into a 3d tensor
128
+ def combine_time_height_width(freqs_t, freqs_h, freqs_w):
129
+ freqs_t = freqs_t[:, None, None, :].expand(
130
+ -1, grid_size_h, grid_size_w, -1
131
+ ) # temporal_size, grid_size_h, grid_size_w, dim_t
132
+ freqs_h = freqs_h[None, :, None, :].expand(
133
+ temporal_size, -1, grid_size_w, -1
134
+ ) # temporal_size, grid_size_h, grid_size_2, dim_h
135
+ freqs_w = freqs_w[None, None, :, :].expand(
136
+ temporal_size, grid_size_h, -1, -1
137
+ ) # temporal_size, grid_size_h, grid_size_2, dim_w
138
+
139
+ freqs = torch.cat(
140
+ [freqs_t, freqs_h, freqs_w], dim=-1
141
+ ) # temporal_size, grid_size_h, grid_size_w, (dim_t + dim_h + dim_w)
142
+ freqs = freqs.view(
143
+ temporal_size * grid_size_h * grid_size_w, -1
144
+ ) # (temporal_size * grid_size_h * grid_size_w), (dim_t + dim_h + dim_w)
145
+ return freqs
146
+
147
+ t_cos, t_sin = freqs_t # both t_cos and t_sin has shape: temporal_size, dim_t
148
+ h_cos, h_sin = freqs_h # both h_cos and h_sin has shape: grid_size_h, dim_h
149
+ w_cos, w_sin = freqs_w # both w_cos and w_sin has shape: grid_size_w, dim_w
150
+ cos = combine_time_height_width(t_cos, h_cos, w_cos)
151
+ sin = combine_time_height_width(t_sin, h_sin, w_sin)
152
+ return cos, sin
153
+
154
+
155
+ def get_3d_motion_spatial_embed(
156
+ embed_dim: int, num_joints: int, joints_mean: np.ndarray, joints_std: np.ndarray, theta: float = 10000.0
157
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
158
+ """
159
+ """
160
+ assert embed_dim % 2 == 0 and embed_dim % 3 == 0
161
+
162
+ def create_rope_pe(dim, pos, freqs_dtype=torch.float32):
163
+ if isinstance(pos, np.ndarray):
164
+ pos = torch.from_numpy(pos)
165
+ freqs = (
166
+ 1.0
167
+ / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=pos.device)[: (dim // 2)] / dim))
168
+ ) # [D/2]
169
+ freqs = torch.outer(pos, freqs) # type: ignore # [S, D/2]
170
+ freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float() # [S, D]
171
+ freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float() # [S, D]
172
+ return freqs_cos, freqs_sin
173
+
174
+ # 为每个轴创建位置编码
175
+ # relative_pos_x = joints_mean[:, 0] - joints_mean[0, 0]
176
+ # relative_pos_y = joints_mean[:, 1] - joints_mean[0, 1]
177
+ # relative_pos_z = joints_mean[:, 2] - joints_mean[0, 2]
178
+
179
+ # normalized_pos_x = relative_pos_x / joints_std[:, 0].mean()
180
+ # normalized_pos_y = relative_pos_y / joints_std[:, 1].mean()
181
+ # normalized_pos_z = relative_pos_z / joints_std[:, 2].mean()
182
+
183
+ pos_x = joints_mean[:, 0]
184
+ pos_y = joints_mean[:, 1]
185
+ pos_z = joints_mean[:, 2]
186
+
187
+ normalized_pos_x = (pos_x - pos_x.mean())
188
+ normalized_pos_y = (pos_y - pos_y.mean())
189
+ normalized_pos_z = (pos_z - pos_z.mean())
190
+
191
+ freqs_cos_x, freqs_sin_x = create_rope_pe(embed_dim // 3, normalized_pos_x)
192
+ freqs_cos_y, freqs_sin_y = create_rope_pe(embed_dim // 3, normalized_pos_y)
193
+ freqs_cos_z, freqs_sin_z = create_rope_pe(embed_dim // 3, normalized_pos_z)
194
+
195
+ freqs_cos = torch.cat([freqs_cos_x, freqs_cos_y, freqs_cos_z], dim=-1)
196
+ freqs_sin = torch.cat([freqs_sin_x, freqs_sin_y, freqs_sin_z], dim=-1)
197
+
198
+ return freqs_cos, freqs_sin
199
+
200
+
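+ # Illustrative call (assuming 24 SMPL joints and attention_head_dim=64, so embed_dim = 64 // 4 * 3 = 48):
+ # cos, sin = get_3d_motion_spatial_embed(48, 24, joints_mean, joints_std)
+ # splits the 48 dims as 16 per x/y/z axis and returns cos/sin of shape (24, 48) each.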
201
+ # Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
202
+ def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
203
+ tw = tgt_width
204
+ th = tgt_height
205
+ h, w = src
206
+ r = h / w
207
+ if r > (th / tw):
208
+ resize_height = th
209
+ resize_width = int(round(th / h * w))
210
+ else:
211
+ resize_width = tw
212
+ resize_height = int(round(tw / w * h))
213
+
214
+ crop_top = int(round((th - resize_height) / 2.0))
215
+ crop_left = int(round((tw - resize_width) / 2.0))
216
+
217
+ return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
218
+
219
+
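+ # Worked example: for a source grid (h, w) = (30, 41) and a target grid of tgt_width=45, tgt_height=30,
+ # h / w > th / tw, so the grid keeps size (30, 41) and is centered horizontally,
+ # and the function returns ((0, 2), (30, 43)).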
220
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
221
+ def retrieve_timesteps(
222
+ scheduler,
223
+ num_inference_steps: Optional[int] = None,
224
+ device: Optional[Union[str, torch.device]] = None,
225
+ timesteps: Optional[List[int]] = None,
226
+ sigmas: Optional[List[float]] = None,
227
+ **kwargs,
228
+ ):
229
+ """
230
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
231
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
232
+
233
+ Args:
234
+ scheduler (`SchedulerMixin`):
235
+ The scheduler to get timesteps from.
236
+ num_inference_steps (`int`):
237
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
238
+ must be `None`.
239
+ device (`str` or `torch.device`, *optional*):
240
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
241
+ timesteps (`List[int]`, *optional*):
242
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
243
+ `num_inference_steps` and `sigmas` must be `None`.
244
+ sigmas (`List[float]`, *optional*):
245
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
246
+ `num_inference_steps` and `timesteps` must be `None`.
247
+
248
+ Returns:
249
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
250
+ second element is the number of inference steps.
251
+ """
252
+ if timesteps is not None and sigmas is not None:
253
+ raise ValueError('Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values')
254
+ if timesteps is not None:
255
+ accepts_timesteps = 'timesteps' in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
256
+ if not accepts_timesteps:
257
+ raise ValueError(
258
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
259
+ f' timestep schedules. Please check whether you are using the correct scheduler.'
260
+ )
261
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
262
+ timesteps = scheduler.timesteps
263
+ num_inference_steps = len(timesteps)
264
+ elif sigmas is not None:
265
+ accept_sigmas = 'sigmas' in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
266
+ if not accept_sigmas:
267
+ raise ValueError(
268
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
269
+ f' sigmas schedules. Please check whether you are using the correct scheduler.'
270
+ )
271
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
272
+ timesteps = scheduler.timesteps
273
+ num_inference_steps = len(timesteps)
274
+ else:
275
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
276
+ timesteps = scheduler.timesteps
277
+ return timesteps, num_inference_steps
278
+
279
+
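+ # Typical call in this pipeline (only `num_inference_steps` is given):
+ # timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=50, device=device)
+ # which reduces to scheduler.set_timesteps(50, device=device) followed by reading scheduler.timesteps.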
280
+ @dataclass
281
+ class MTVCrafterPipelineOutput(BaseOutput):
282
+ r"""Output class for the MTVCrafter pipeline.
283
+
284
+ Args:
285
+ frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
286
+ List of video outputs - It can be a nested list of length `batch_size`, with each sub-list containing
287
+ denoised PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of shape
288
+ `(batch_size, num_frames, channels, height, width)`.
289
+ """
290
+
291
+ frames: torch.Tensor
292
+
293
+
294
+ class MTVCrafterPipeline(DiffusionPipeline):
295
+ r"""Pipeline for MTVCrafter.
296
+
297
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
298
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
299
+
300
+ Args:
301
+ vae ([`AutoencoderKL`]):
302
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
303
+ transformer ([`Transformer3DModel`]):
304
+ An image-conditioned `Transformer3DModel` to denoise the encoded video latents.
305
+ scheduler ([`SchedulerMixin`]):
306
+ A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
307
+ """
308
+
309
+ _callback_tensor_inputs = [
310
+ 'latents',
311
+ 'prompt_embeds',
312
+ 'negative_prompt_embeds',
313
+ ]
314
+
315
+ def __init__(
316
+ self,
317
+ vae: AutoencoderKLCogVideoX,
318
+ transformer: Transformer3DModel,
319
+ scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
320
+ ):
321
+ super().__init__()
322
+
323
+ self.register_modules(
324
+ vae=vae,
325
+ transformer=transformer,
326
+ scheduler=scheduler,
327
+ )
328
+ self.vae_scale_factor_spatial = (
329
+ 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, 'vae') and self.vae is not None else 8
330
+ )
331
+ self.vae_scale_factor_temporal = (
332
+ self.vae.config.temporal_compression_ratio if hasattr(self, 'vae') and self.vae is not None else 4
333
+ )
334
+
335
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
336
+ self.normalize = transforms.Normalize([0.5], [0.5])
337
+
338
+ @classmethod
339
+ def from_pretrained(
340
+ cls,
341
+ model_path,
342
+ transformer_model_path=None,
343
+ scheduler_type='ddim',
344
+ torch_dtype=None,
345
+ **kwargs,
346
+ ):
347
+ if transformer_model_path is None:
348
+ transformer_model_path = os.path.join(model_path, 'transformer')
349
+ transformer = Transformer3DModel.from_pretrained(
350
+ transformer_model_path, torch_dtype=torch_dtype, **kwargs
351
+ )
352
+ if scheduler_type == 'ddim':
353
+ scheduler = CogVideoXDDIMScheduler.from_pretrained(model_path, subfolder='scheduler')
354
+ elif scheduler_type == 'dpm':
355
+ scheduler = CogVideoXDPMScheduler.from_pretrained(model_path, subfolder='scheduler')
356
+ else:
357
+ raise ValueError(f'Unsupported scheduler_type: {scheduler_type}. Expected `ddim` or `dpm`.')
358
+ pipe = super().from_pretrained(
359
+ model_path, transformer=transformer, scheduler=scheduler, torch_dtype=torch_dtype, **kwargs
360
+ )
361
+ return pipe
362
+
363
+
364
+ def prepare_latents(
365
+ self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
366
+ ):
367
+ shape = (
368
+ batch_size,
369
+ (num_frames - 1) // self.vae_scale_factor_temporal + 1,
370
+ num_channels_latents,
371
+ height // self.vae_scale_factor_spatial,
372
+ width // self.vae_scale_factor_spatial,
373
+ )
374
+ if isinstance(generator, list) and len(generator) != batch_size:
375
+ raise ValueError(
376
+ f'You have passed a list of generators of length {len(generator)}, but requested an effective batch'
377
+ f' size of {batch_size}. Make sure the batch size matches the length of the generators.'
378
+ )
379
+
380
+ if latents is None:
381
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
382
+ else:
383
+ latents = latents.to(device)
384
+
385
+ # scale the initial noise by the standard deviation required by the scheduler
386
+ latents = latents * self.scheduler.init_noise_sigma
387
+ return latents
388
+
389
+ def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
390
+ latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
391
+ latents = 1 / self.vae.config.scaling_factor * latents
392
+
393
+ frames = self.vae.decode(latents).sample
394
+ return frames
395
+
396
+ def prepare_extra_step_kwargs(self, generator, eta):
397
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
398
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
399
+ # eta corresponds to η in DDIM paper and should be between [0, 1]
400
+
401
+ accepts_eta = 'eta' in set(inspect.signature(self.scheduler.step).parameters.keys())
402
+ extra_step_kwargs = {}
403
+ if accepts_eta:
404
+ extra_step_kwargs['eta'] = eta
405
+
406
+ # check if the scheduler accepts generator
407
+ accepts_generator = 'generator' in set(inspect.signature(self.scheduler.step).parameters.keys())
408
+ if accepts_generator:
409
+ extra_step_kwargs['generator'] = generator
410
+ return extra_step_kwargs
411
+
412
+ # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs
413
+ def check_inputs(
414
+ self,
415
+ height,
416
+ width,
417
+ callback_on_step_end_tensor_inputs,
418
+ ):
419
+ if height % 8 != 0 or width % 8 != 0:
420
+ raise ValueError(f'`height` and `width` have to be divisible by 8 but are {height} and {width}.')
421
+
422
+ if callback_on_step_end_tensor_inputs is not None and not all(
423
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
424
+ ):
425
+ raise ValueError(
426
+ f'`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found '
427
+ f'{[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}'
428
+ )
429
+
430
+
431
+ def _prepare_rotary_positional_embeddings(
432
+ self,
433
+ height: int,
434
+ width: int,
435
+ num_frames: int,
436
+ device: torch.device,
437
+ dtype: torch.dtype,
438
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
439
+ grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
440
+ grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
441
+ grid_crops_coords = ((0, 0), (grid_height, grid_width))
442
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
443
+ embed_dim=self.transformer.config.attention_head_dim,
444
+ crops_coords=grid_crops_coords,
445
+ grid_size=(grid_height, grid_width),
446
+ temporal_size=num_frames,
447
+ )
448
+
449
+ freqs_cos = freqs_cos.to(device=device, dtype=dtype)
450
+ freqs_sin = freqs_sin.to(device=device, dtype=dtype)
451
+ return freqs_cos, freqs_sin
452
+
453
+
454
+ def _prepare_motion_embeddings(self, num_frames, num_joints, joints_mean, joints_std, device, dtype):
455
+ time_embed = get_1d_rotary_pos_embed(self.transformer.config.attention_head_dim // 4, num_frames, use_real=True)
456
+ time_embed_cos = time_embed[0][:, None, :].expand(-1, num_joints, -1).reshape(num_frames*num_joints, -1)
457
+ time_embed_sin = time_embed[1][:, None, :].expand(-1, num_joints, -1).reshape(num_frames*num_joints, -1)
458
+ spatial_motion_embed = get_3d_motion_spatial_embed(self.transformer.config.attention_head_dim // 4 * 3, num_joints, joints_mean, joints_std)
459
+ spatial_embed_cos = spatial_motion_embed[0][None, :, :].expand(num_frames, -1, -1).reshape(num_frames*num_joints, -1)
460
+ spatial_embed_sin = spatial_motion_embed[1][None, :, :].expand(num_frames, -1, -1).reshape(num_frames*num_joints, -1)
461
+ motion_embed_cos = torch.cat([time_embed_cos, spatial_embed_cos], dim=-1).to(device=device, dtype=dtype)
462
+ motion_embed_sin = torch.cat([time_embed_sin, spatial_embed_sin], dim=-1).to(device=device, dtype=dtype)
463
+ return motion_embed_cos, motion_embed_sin
464
+
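+ # Shape note (assuming attention_head_dim=64 and 24 joints): the temporal RoPE contributes
+ # 64 // 4 = 16 dims per frame and the spatial RoPE 48 dims per joint; after broadcasting and
+ # concatenation, motion_embed_cos / motion_embed_sin are each of shape (num_frames * 24, 64).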
465
+ @property
466
+ def guidance_scale(self):
467
+ return self._guidance_scale
468
+
469
+ @property
470
+ def num_timesteps(self):
471
+ return self._num_timesteps
472
+
473
+ @property
474
+ def interrupt(self):
475
+ return self._interrupt
476
+
477
+ @torch.no_grad()
478
+ def __call__(
479
+ self,
480
+ prompt: Optional[Union[str, List[str]]] = None,
481
+ negative_prompt: Optional[Union[str, List[str]]] = None,
482
+ height: int = 480,
483
+ width: int = 720,
484
+ num_frames: int = 49,
485
+ num_inference_steps: int = 50,
486
+ timesteps: Optional[List[int]] = None,
487
+ guidance_scale: float = 6,
488
+ use_dynamic_cfg: bool = False,
489
+ num_videos_per_prompt: int = 1,
490
+ eta: float = 0.0,
491
+ seed: Optional[int] = -1,
492
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
493
+ latents: Optional[torch.FloatTensor] = None,
494
+ prompt_embeds: Optional[torch.FloatTensor] = None,
495
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
496
+ output_type: str = 'pil',
497
+ return_dict: bool = True,
498
+ callback_on_step_end: Optional[
499
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
500
+ ] = None,
501
+ callback_on_step_end_tensor_inputs: List[str] = ['latents'],
502
+ max_sequence_length: int = 226,
503
+ ref_images: List[Image.Image] = None,
504
+ motion_embeds: Optional[torch.FloatTensor] = None,
505
+ joint_mean: Optional[np.ndarray] = None,
506
+ joint_std: Optional[np.ndarray] = None,
507
+ ) -> Union[MTVCrafterPipelineOutput, Tuple]:
508
+ """Function invoked when calling the pipeline for generation.
509
+
510
+ Args:
511
+ prompt (`str` or `List[str]`, *optional*):
512
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
513
+ instead.
514
+ negative_prompt (`str` or `List[str]`, *optional*):
515
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
516
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
517
+ less than `1`).
518
+ height (`int`, *optional*, defaults to `480`):
519
+ The height in pixels of the generated video frames.
520
+ width (`int`, *optional*, defaults to `720`):
521
+ The width in pixels of the generated video frames.
522
+ num_frames (`int`, defaults to `49`):
523
+ Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
524
+ contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
525
+ num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
526
+ needs to be satisfied is that of divisibility mentioned above.
527
+ num_inference_steps (`int`, *optional*, defaults to 50):
528
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
529
+ expense of slower inference.
530
+ timesteps (`List[int]`, *optional*):
531
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
532
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
533
+ passed will be used. Must be in descending order.
534
+ guidance_scale (`float`, *optional*, defaults to `6`):
535
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance]. Guidance is enabled by setting `guidance_scale >
536
+ 1`. A higher guidance scale encourages outputs that follow the conditioning more closely,
537
+ usually at the expense of lower image quality.
538
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
539
+ The number of videos to generate per prompt.
540
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
541
+ One or a list of [torch generator(s)]
542
+ to make generation deterministic.
543
+ latents (`torch.FloatTensor`, *optional*):
544
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
545
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
546
+ tensor will be generated by sampling using the supplied random `generator`.
547
+ prompt_embeds (`torch.FloatTensor`, *optional*):
548
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
549
+ provided, text embeddings will be generated from `prompt` input argument.
550
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
551
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
552
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
553
+ argument.
554
+ output_type (`str`, *optional*, defaults to `"pil"`):
555
+ The output format of the generated video. Choose between `PIL.Image.Image` or `np.array`.
556
+ return_dict (`bool`, *optional*, defaults to `True`):
557
+ Whether or not to return a [`MTVCrafterPipelineOutput`] instead
558
+ of a plain tuple.
559
+ callback_on_step_end (`Callable`, *optional*):
560
+ A function that is called at the end of each denoising step during inference. The function is called
561
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
562
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
563
+ `callback_on_step_end_tensor_inputs`.
564
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
565
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
566
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
567
+ `._callback_tensor_inputs` attribute of your pipeline class.
568
+ max_sequence_length (`int`, defaults to `226`):
569
+ Maximum sequence length in encoded prompt. Must be consistent with
570
+ `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
571
+ """
572
+
573
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
574
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
575
+
576
+ height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
577
+ width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
578
+ # 720 * 480
579
+ num_videos_per_prompt = 1
580
+
581
+ # 1. Check inputs. Raise error if not correct
582
+ self.check_inputs(
583
+ height,
584
+ width,
585
+ callback_on_step_end_tensor_inputs,
586
+ )
587
+ self._guidance_scale = guidance_scale
588
+ self._interrupt = False
589
+
590
+ # 2. Default call parameters
591
+ if prompt is not None and isinstance(prompt, str):
592
+ batch_size = 1
593
+ elif prompt is not None and isinstance(prompt, list):
594
+ batch_size = len(prompt)
595
+ elif prompt is None:
596
+ batch_size = 1
597
+ else:
598
+ batch_size = prompt_embeds.shape[0]
599
+
600
+ device = self._execution_device
601
+
602
+ if seed > 0:
603
+ generator = torch.Generator(device=device)
604
+ generator.manual_seed(seed)
605
+ do_classifier_free_guidance = guidance_scale > 1.0
606
+
607
+ # 3. Prepare timesteps
608
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
609
+ self._num_timesteps = len(timesteps)
610
+
611
+ # 4. Prepare latents.
612
+ latent_channels = self.vae.config.latent_channels
613
+ latents = self.prepare_latents(
614
+ batch_size * num_videos_per_prompt,
615
+ latent_channels,
616
+ num_frames,
617
+ height,
618
+ width,
619
+ self.vae.dtype,
620
+ device,
621
+ generator,
622
+ latents,
623
+ ) # [1, x, 16, h/8, w/8]
624
+
625
+ if ref_images is not None:
626
+ ref_images = rearrange(ref_images.unsqueeze(0), 'b f c h w -> b c f h w')
627
+ ref_latents = self.vae.encode(
628
+ ref_images.to(dtype=self.vae.dtype, device=self.vae.device)
629
+ ).latent_dist.sample()
630
+ ref_latents = rearrange(ref_latents, 'b c f h w -> b f c h w')
631
+ if do_classifier_free_guidance:
632
+ ref_latents = torch.cat([ref_latents, ref_latents], dim=0)
633
+
634
+ motion_embeds = motion_embeds.to(latents.dtype) if motion_embeds is not None else None
635
+ if motion_embeds is not None and do_classifier_free_guidance:
636
+ motion_embeds = torch.cat([self.transformer.unconditional_motion_token.unsqueeze(0), motion_embeds], dim=0)
637
+
638
+ # 5. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
639
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
640
+
641
+ # 6. Create rotary embeds if required
642
+ image_rotary_emb = (
643
+ self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device, dtype=latents.dtype)
644
+ if self.transformer.config.use_rotary_positional_embeddings
645
+ else None
646
+ )
647
+ motion_rotary_emb = self._prepare_motion_embeddings(latents.size(1), 24, joint_mean, joint_std, device, dtype=latents.dtype)
648
+
649
+ # 7. Denoising loop
650
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
651
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
652
+ # for DPM-solver++
653
+ old_pred_original_sample = None
654
+ for i, t in enumerate(timesteps):
655
+ if self.interrupt:
656
+ continue
657
+
658
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
659
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
660
+
661
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
662
+ timestep = t.expand(latent_model_input.shape[0])
663
+
664
+ if ref_images is not None:
665
+ latent_model_input = torch.cat([latent_model_input, ref_latents], dim=2)
666
+
667
+ # predict noise model_output
668
+ noise_pred = self.transformer(
669
+ hidden_states=latent_model_input,
670
+ timestep=timestep.long(),
671
+ image_rotary_emb=image_rotary_emb,
672
+ motion_rotary_emb=motion_rotary_emb,
673
+ motion_emb=motion_embeds,
674
+ return_dict=False,
675
+ )[0]
676
+ noise_pred = noise_pred.float() # [b, f, c, h, w]
677
+
678
+ # perform guidance
679
+ if use_dynamic_cfg:
680
+ self._guidance_scale = 1 + guidance_scale * (
681
+ (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
682
+ )
683
+ if do_classifier_free_guidance:
684
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
685
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
686
+
687
+ # compute the previous noisy sample x_t -> x_t-1
688
+ if not isinstance(self.scheduler, CogVideoXDPMScheduler):
689
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
690
+ else:
691
+ latents, old_pred_original_sample = self.scheduler.step(
692
+ noise_pred,
693
+ old_pred_original_sample,
694
+ t,
695
+ timesteps[i - 1] if i > 0 else None,
696
+ latents,
697
+ **extra_step_kwargs,
698
+ return_dict=False,
699
+ )
700
+ latents = latents.to(self.vae.dtype)
701
+
702
+ # call the callback, if provided
703
+ if callback_on_step_end is not None:
704
+ callback_kwargs = {}
705
+ for k in callback_on_step_end_tensor_inputs:
706
+ callback_kwargs[k] = locals()[k]
707
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
708
+
709
+ latents = callback_outputs.pop('latents', latents)
710
+ prompt_embeds = callback_outputs.pop('prompt_embeds', prompt_embeds)
711
+ negative_prompt_embeds = callback_outputs.pop('negative_prompt_embeds', negative_prompt_embeds)
712
+
713
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
714
+ progress_bar.update()
715
+
716
+ if not output_type == 'latent':
717
+ video = self.decode_latents(latents)
718
+ video = self.video_processor.postprocess_video(video=video, output_type=output_type)
719
+ else:
720
+ video = latents
721
+
722
+ # Offload all models
723
+ self.maybe_free_model_hooks()
724
+
725
+ if not return_dict:
726
+ return (video,)
727
+
728
+ return MTVCrafterPipelineOutput(frames=video)
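+ # Minimal usage sketch (the checkpoint path and the preprocessed inputs below are placeholders):
+ # pipe = MTVCrafterPipeline.from_pretrained('path/to/MTVCrafter', scheduler_type='dpm', torch_dtype=torch.bfloat16).to('cuda')
+ # output = pipe(height=512, width=512, num_frames=49, num_inference_steps=50, guidance_scale=3.0,
+ #               ref_images=ref_frames, motion_embeds=motion_tokens, joint_mean=joint_mean, joint_std=joint_std)
+ # where ref_frames is a normalized reference tensor of shape [f, c, h, w] and motion_tokens are VQVAE motion embeddings.
+ # video = output.frames  # decoded and post-processed video frames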
models/motion4d/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .vqvae import SMPL_VQVAE, VectorQuantizer, Encoder, Decoder
2
+ from .loss import ReConsLoss
models/motion4d/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (375 Bytes). View file
 
models/motion4d/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (314 Bytes). View file
 
models/motion4d/__pycache__/loss.cpython-311.pyc ADDED
Binary file (2.13 kB). View file
 
models/motion4d/__pycache__/loss.cpython-313.pyc ADDED
Binary file (2 kB). View file
 
models/motion4d/__pycache__/vqvae.cpython-311.pyc ADDED
Binary file (28.4 kB). View file
 
models/motion4d/__pycache__/vqvae.cpython-313.pyc ADDED
Binary file (26.5 kB). View file
 
models/motion4d/loss.py ADDED
@@ -0,0 +1,30 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ class ReConsLoss(nn.Module):
5
+ def __init__(self, recons_loss, nb_joints):
6
+ super(ReConsLoss, self).__init__()
7
+
8
+ if recons_loss == 'l1':
9
+ self.Loss = torch.nn.L1Loss()
10
+ elif recons_loss == 'l2' :
11
+ self.Loss = torch.nn.MSELoss()
12
+ elif recons_loss == 'l1_smooth' :
13
+ self.Loss = torch.nn.SmoothL1Loss()
14
+
15
+ # 4 global motion associated to root
16
+ # 12 local motion (3 local xyz, 3 vel xyz, 6 rot6d)
17
+ # 3 global vel xyz
18
+ # 4 foot contact
19
+ self.nb_joints = nb_joints
20
+ self.motion_dim = (nb_joints - 1) * 12 + 4 + 3 + 4
21
+
22
+ def forward(self, motion_pred, motion_gt) :
23
+ loss = self.Loss(motion_pred[..., : self.motion_dim], motion_gt[..., :self.motion_dim])
24
+ return loss
25
+
26
+ def forward_joint(self, motion_pred, motion_gt) :
27
+ loss = self.Loss(motion_pred[..., 4 : (self.nb_joints - 1) * 3 + 4], motion_gt[..., 4 : (self.nb_joints - 1) * 3 + 4])
28
+ return loss
29
+
30
+
models/motion4d/vqvae.py ADDED
@@ -0,0 +1,501 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+
6
+ from typing import Any, Dict, Optional, Tuple, Union
7
+ from diffusers.models.attention import Attention
8
+
9
+
10
+ class AttnProcessor:
11
+ r"""Processor for implementing scaled dot-product attention for the
12
+ CogVideoX model.
13
+
14
+ It applies a rotary embedding on query and key vectors, but does not include spatial normalization.
15
+ """
16
+
17
+ def __init__(self):
18
+ if not hasattr(F, 'scaled_dot_product_attention'):
19
+ raise ImportError('AttnProcessor requires PyTorch 2.0 or newer; please upgrade PyTorch to use it.')
20
+
21
+ def __call__(
22
+ self,
23
+ attn: Attention,
24
+ hidden_states: torch.Tensor,
25
+ encoder_hidden_states: Optional[torch.Tensor] = None,
26
+ attention_mask: Optional[torch.Tensor] = None,
27
+ image_rotary_emb: Optional[torch.Tensor] = None,
28
+ motion_rotary_emb: Optional[torch.Tensor] = None,
29
+ ) -> torch.Tensor:
31
+ batch_size, sequence_length, _ = hidden_states.shape
32
+
33
+ if attention_mask is not None:
34
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
35
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
36
+
37
+ query = attn.to_q(hidden_states)
38
+ key = attn.to_k(hidden_states)
39
+ value = attn.to_v(hidden_states)
40
+
41
+ inner_dim = key.shape[-1]
42
+ head_dim = inner_dim // attn.heads
43
+
44
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) # [batch_size, heads, seq_len, dim]
45
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
46
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
47
+
48
+ if attn.norm_q is not None:
49
+ query = attn.norm_q(query)
50
+ if attn.norm_k is not None:
51
+ key = attn.norm_k(key)
52
+
53
+ # The sequence-parallel branch below relies on helpers (get_sequence_parallel_group, dist,
54
+ # _all_in_all_with_text) and a `text_seq_length` variable that are not defined in this file,
55
+ # so it is kept for reference only and disabled here.
56
+ # sp_group = get_sequence_parallel_group()
57
+ # if sp_group is not None:
58
+ #     sp_size = dist.get_world_size(sp_group)
59
+ #     query = _all_in_all_with_text(query, text_seq_length, sp_group, sp_size, mode=1)
+ #     key = _all_in_all_with_text(key, text_seq_length, sp_group, sp_size, mode=1)
+ #     value = _all_in_all_with_text(value, text_seq_length, sp_group, sp_size, mode=1)
+ #     text_seq_length *= sp_size
60
+
61
+ # Apply RoPE if needed
62
+ if image_rotary_emb is not None:
63
+ from diffusers.models.embeddings import apply_rotary_emb
64
+ image_seq_length = image_rotary_emb[0].shape[0]
65
+ query[:, :, :image_seq_length] = apply_rotary_emb(query[:, :, :image_seq_length], image_rotary_emb)
66
+ if motion_rotary_emb is not None:
67
+ query[:, :, image_seq_length:] = apply_rotary_emb(query[:, :, image_seq_length:], motion_rotary_emb)
68
+ if not attn.is_cross_attention:
69
+ key[:, :, :image_seq_length] = apply_rotary_emb(key[:, :, :image_seq_length], image_rotary_emb)
70
+ if motion_rotary_emb is not None:
71
+ key[:, :, image_seq_length:] = apply_rotary_emb(key[:, :, image_seq_length:], motion_rotary_emb)
72
+
73
+ hidden_states = F.scaled_dot_product_attention(
74
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
75
+ )
76
+
77
+ # if sp_group is not None:  # disabled along with the sequence-parallel branch above
78
+ #     hidden_states = _all_in_all_with_text(hidden_states, text_seq_length, sp_group, sp_size, mode=2)
79
+ #     text_seq_length = text_seq_length // sp_size
80
+
81
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
82
+
83
+ # linear proj
84
+ hidden_states = attn.to_out[0](hidden_states)
85
+ # dropout
86
+ hidden_states = attn.to_out[1](hidden_states)
87
+
88
+ return hidden_states
89
+
90
+
91
+ class Encoder(nn.Module):
92
+ def __init__(
93
+ self,
94
+ in_channels=3,
95
+ mid_channels=[128, 512],
96
+ out_channels=3072,
97
+ downsample_time=[1, 1],
98
+ downsample_joint=[1, 1],
99
+ num_attention_heads=8,
100
+ attention_head_dim=64,
101
+ dim=3072,
102
+ ):
103
+ super(Encoder, self).__init__()
104
+
105
+ self.conv_in = nn.Conv2d(in_channels, mid_channels[0], kernel_size=3, stride=1, padding=1)
106
+ self.resnet1 = nn.ModuleList([ResBlock(mid_channels[0], mid_channels[0]) for _ in range(3)])
107
+ self.downsample1 = Downsample(mid_channels[0], mid_channels[0], downsample_time[0], downsample_joint[0])
108
+ self.resnet2 = ResBlock(mid_channels[0], mid_channels[1])
109
+ self.resnet3 = nn.ModuleList([ResBlock(mid_channels[1], mid_channels[1]) for _ in range(3)])
110
+ self.downsample2 = Downsample(mid_channels[1], mid_channels[1], downsample_time[1], downsample_joint[1])
111
+ # self.attn = Attention(
112
+ # query_dim=dim,
113
+ # dim_head=attention_head_dim,
114
+ # heads=num_attention_heads,
115
+ # qk_norm='layer_norm',
116
+ # eps=1e-6,
117
+ # bias=True,
118
+ # out_bias=True,
119
+ # processor=AttnProcessor(),
120
+ # )
121
+ self.conv_out = nn.Conv2d(mid_channels[-1], out_channels, kernel_size=3, stride=1, padding=1)
122
+
123
+ def forward(self, x):
124
+ x = self.conv_in(x)
125
+ for resnet in self.resnet1:
126
+ x = resnet(x)
127
+ x = self.downsample1(x)
128
+
129
+ x = self.resnet2(x)
130
+ for resnet in self.resnet3:
131
+ x = resnet(x)
132
+ x = self.downsample2(x)
133
+
134
+ # x = x + self.attn(x)
135
+ x = self.conv_out(x)
136
+
137
+ return x
138
+
139
+
140
+
141
+ class VectorQuantizer(nn.Module):
142
+ def __init__(self, nb_code, code_dim, is_train=True):
143
+ super().__init__()
144
+ self.nb_code = nb_code
145
+ self.code_dim = code_dim
146
+ self.mu = 0.99
147
+ self.reset_codebook()
148
+ self.reset_count = 0
149
+ self.usage = torch.zeros((self.nb_code, 1))
150
+ self.is_train = is_train
151
+
152
+ def reset_codebook(self):
153
+ self.init = False
154
+ self.code_sum = None
155
+ self.code_count = None
156
+ self.register_buffer('codebook', torch.zeros(self.nb_code, self.code_dim).cuda())
157
+
158
+ def _tile(self, x):
159
+ nb_code_x, code_dim = x.shape
160
+ if nb_code_x < self.nb_code:
161
+ n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
162
+ std = 0.01 / np.sqrt(code_dim)
163
+ out = x.repeat(n_repeats, 1)
164
+ out = out + torch.randn_like(out) * std
165
+ else:
166
+ out = x
167
+ return out
168
+
169
+ def init_codebook(self, x):
170
+ if torch.all(self.codebook == 0):
171
+ out = self._tile(x)
172
+ self.codebook = out[:self.nb_code]
173
+ self.code_sum = self.codebook.clone()
174
+ self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
175
+ if self.is_train:
176
+ self.init = True
177
+
178
+ @torch.no_grad()
179
+ def update_codebook(self, x, code_idx):
180
+ code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device)
181
+ code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
182
+
183
+ code_sum = torch.matmul(code_onehot, x) # [nb_code, code_dim]
184
+ code_count = code_onehot.sum(dim=-1) # nb_code
185
+
186
+ out = self._tile(x)
187
+ code_rand = out[torch.randperm(out.shape[0])[:self.nb_code]]
188
+
189
+ # Update centres
190
+ self.code_sum = self.mu * self.code_sum + (1. - self.mu) * code_sum
191
+ self.code_count = self.mu * self.code_count + (1. - self.mu) * code_count
192
+
193
+ usage = (self.code_count.view(self.nb_code, 1) >= 1.0).float()
194
+ self.usage = self.usage.to(usage.device)
195
+ if self.reset_count >= 20: # reset codebook every 20 steps for stability
196
+ self.reset_count = 0
197
+ usage = (usage + self.usage >= 1.0).float()
198
+ else:
199
+ self.reset_count += 1
200
+ self.usage = (usage + self.usage >= 1.0).float()
201
+ usage = torch.ones_like(self.usage, device=x.device)
202
+ code_update = self.code_sum.view(self.nb_code, self.code_dim) / self.code_count.view(self.nb_code, 1)
203
+
204
+ self.codebook = usage * code_update + (1 - usage) * code_rand
205
+ prob = code_count / torch.sum(code_count)
206
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
207
+
208
+ return perplexity
209
+
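+ # The update above is the standard EMA codebook rule:
+ #   code_sum_k   <- mu * code_sum_k   + (1 - mu) * sum_i 1[idx_i == k] * x_i
+ #   code_count_k <- mu * code_count_k + (1 - mu) * #{i : idx_i == k}
+ # with rarely used codes periodically re-seeded from random encoder outputs (code_rand).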
210
+ def preprocess(self, x):
211
+ # [bs, c, f, j] -> [bs * f * j, c]
212
+ x = x.permute(0, 2, 3, 1).contiguous()
213
+ x = x.view(-1, x.shape[-1])
214
+ return x
215
+
216
+ def quantize(self, x):
217
+ # [bs * f * j, dim=3072]
218
+ # Calculate latent code x_l
219
+ k_w = self.codebook.t()
220
+ distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0, keepdim=True)
221
+ _, code_idx = torch.min(distance, dim=-1)
222
+ return code_idx
223
+
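+ # `distance` above expands ||x - c||^2 = ||x||^2 - 2 x·c + ||c||^2 for every codebook entry c,
+ # so torch.min over the last dimension picks the index of the nearest codebook vector per token.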
224
+ def dequantize(self, code_idx):
225
+ x = F.embedding(code_idx, self.codebook) # indexing: [bs * f * j, 32]
226
+ return x
227
+
228
+ def forward(self, x, return_vq=False):
229
+ # import pdb; pdb.set_trace()
230
+ bs, c, f, j = x.shape # SMPL data frames: [bs, 3072, f, j]
231
+
232
+ # Preprocess
233
+ x = self.preprocess(x)
234
+ # return x.view(bs, f*j, c).contiguous(), None
235
+ assert x.shape[-1] == self.code_dim
236
+
237
+ # Init codebook if not inited
238
+ if not self.init and self.is_train:
239
+ self.init_codebook(x)
240
+
241
+ # quantize and dequantize through bottleneck
242
+ code_idx = self.quantize(x)
243
+ x_d = self.dequantize(code_idx)
244
+
245
+ # Update embeddings
246
+ if self.is_train:
247
+ perplexity = self.update_codebook(x, code_idx)
248
+
249
+ # Loss
250
+ commit_loss = F.mse_loss(x, x_d.detach())
251
+
252
+ # Passthrough
253
+ x_d = x + (x_d - x).detach()
254
+
255
+ if return_vq:
256
+ return x_d.view(bs, f*j, c).contiguous(), commit_loss
257
+ # return (x_d, x_d.view(bs, f, j, c).permute(0, 3, 1, 2).contiguous()), commit_loss, perplexity
258
+
259
+ # Postprocess
260
+ x_d = x_d.view(bs, f, j, c).permute(0, 3, 1, 2).contiguous()
261
+
262
+ if self.is_train:
263
+ return x_d, commit_loss, perplexity
264
+ else:
265
+ return x_d, commit_loss
266
+
267
+
268
+ class Decoder(nn.Module):
269
+ def __init__(
270
+ self,
271
+ in_channels=3072,
272
+ mid_channels=[512, 128],
273
+ out_channels=3,
274
+ upsample_rate=None,
275
+ frame_upsample_rate=[1.0, 1.0],
276
+ joint_upsample_rate=[1.0, 1.0],
277
+ dim=128,
278
+ attention_head_dim=64,
279
+ num_attention_heads=8,
280
+ ):
281
+ super(Decoder, self).__init__()
282
+
283
+ self.conv_in = nn.Conv2d(in_channels, mid_channels[0], kernel_size=3, stride=1, padding=1)
284
+ self.resnet1 = nn.ModuleList([ResBlock(mid_channels[0], mid_channels[0]) for _ in range(3)])
285
+ self.upsample1 = Upsample(mid_channels[0], mid_channels[0], frame_upsample_rate=frame_upsample_rate[0], joint_upsample_rate=joint_upsample_rate[0])
286
+ self.resnet2 = ResBlock(mid_channels[0], mid_channels[1])
287
+ self.resnet3 = nn.ModuleList([ResBlock(mid_channels[1], mid_channels[1]) for _ in range(3)])
288
+ self.upsample2 = Upsample(mid_channels[1], mid_channels[1], frame_upsample_rate=frame_upsample_rate[1], joint_upsample_rate=joint_upsample_rate[1])
289
+ # self.attn = Attention(
290
+ # query_dim=dim,
291
+ # dim_head=attention_head_dim,
292
+ # heads=num_attention_heads,
293
+ # qk_norm='layer_norm',
294
+ # eps=1e-6,
295
+ # bias=True,
296
+ # out_bias=True,
297
+ # processor=AttnProcessor(),
298
+ # )
299
+ self.conv_out = nn.Conv2d(mid_channels[-1], out_channels, kernel_size=3, stride=1, padding=1)
300
+
301
+ def forward(self, x):
302
+ x = self.conv_in(x)
303
+ for resnet in self.resnet1:
304
+ x = resnet(x)
305
+ x = self.upsample1(x)
306
+
307
+ x = self.resnet2(x)
308
+ for resnet in self.resnet3:
309
+ x = resnet(x)
310
+ x = self.upsample2(x)
311
+
312
+ # x = x + self.attn(x)
313
+ x = self.conv_out(x)
314
+
315
+ return x
316
+
317
+
318
+ class Upsample(nn.Module):
319
+ def __init__(
320
+ self,
321
+ in_channels,
322
+ out_channels,
323
+ upsample_rate=None,
324
+ frame_upsample_rate=None,
325
+ joint_upsample_rate=None,
326
+ ):
327
+ super(Upsample, self).__init__()
328
+
329
+ self.upsampler = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
330
+ self.upsample_rate = upsample_rate
331
+ self.frame_upsample_rate = frame_upsample_rate
332
+ self.joint_upsample_rate = joint_upsample_rate
334
+
335
+ def forward(self, inputs):
336
+ if inputs.shape[2] > 1 and inputs.shape[2] % 2 == 1:
337
+ # split first frame
338
+ x_first, x_rest = inputs[:, :, 0], inputs[:, :, 1:]
339
+
340
+ if self.upsample_rate is not None:
341
+ # import pdb; pdb.set_trace()
342
+ x_first = F.interpolate(x_first, scale_factor=self.upsample_rate)
343
+ x_rest = F.interpolate(x_rest, scale_factor=self.upsample_rate)
344
+ else:
345
+ # import pdb; pdb.set_trace()
346
+ # x_first = F.interpolate(x_first, scale_factor=(self.frame_upsample_rate, self.joint_upsample_rate), mode="bilinear", align_corners=True)
347
+ x_rest = F.interpolate(x_rest, scale_factor=(self.frame_upsample_rate, self.joint_upsample_rate), mode="bilinear", align_corners=True)
348
+ x_first = x_first[:, :, None, :]
349
+ inputs = torch.cat([x_first, x_rest], dim=2)
350
+ elif inputs.shape[2] > 1:
351
+ if self.upsample_rate is not None:
352
+ inputs = F.interpolate(inputs, scale_factor=self.upsample_rate)
353
+ else:
354
+ inputs = F.interpolate(inputs, scale_factor=(self.frame_upsample_rate, self.joint_upsample_rate), mode="bilinear", align_corners=True)
355
+ else:
356
+ inputs = inputs.squeeze(2)
357
+ if self.upsample_rate is not None:
358
+ inputs = F.interpolate(inputs, scale_factor=self.upsample_rate)
359
+ else:
360
+ inputs = F.interpolate(inputs, scale_factor=(self.frame_upsample_rate, self.joint_upsample_rate), mode="linear", align_corners=True)
361
+ inputs = inputs[:, :, None, :, :]
362
+
363
+ b, c, t, j = inputs.shape
364
+ inputs = inputs.permute(0, 2, 1, 3).reshape(b * t, c, j)
365
+ inputs = self.upsampler(inputs)
366
+ inputs = inputs.reshape(b, t, *inputs.shape[1:]).permute(0, 2, 1, 3)
367
+
368
+ return inputs
369
+
370
+
371
+ class Downsample(nn.Module):
372
+ def __init__(
373
+ self,
374
+ in_channels,
375
+ out_channels,
376
+ frame_downsample_rate,
377
+ joint_downsample_rate
378
+ ):
379
+ super(Downsample, self).__init__()
380
+
381
+ self.frame_downsample_rate = frame_downsample_rate
382
+ self.joint_downsample_rate = joint_downsample_rate
383
+ self.joint_downsample = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=self.joint_downsample_rate, padding=1)
384
+
385
+ def forward(self, x):
386
+ # (batch_size, channels, frames, joints) -> (batch_size * joints, channels, frames)
387
+ if self.frame_downsample_rate > 1:
388
+ batch_size, channels, frames, joints = x.shape
389
+ x = x.permute(0, 3, 1, 2).reshape(batch_size * joints, channels, frames)
390
+ if x.shape[-1] % 2 == 1:
391
+ x_first, x_rest = x[..., 0], x[..., 1:]
392
+ if x_rest.shape[-1] > 0:
393
+ # (batch_size * joints, channels, frames - 1) -> (batch_size * joints, channels, (frames - 1) // frame_downsample_rate)
394
+ x_rest = F.avg_pool1d(x_rest, kernel_size=self.frame_downsample_rate, stride=self.frame_downsample_rate)
395
+
396
+ x = torch.cat([x_first[..., None], x_rest], dim=-1)
397
+ # (batch_size * joints, channels, (frames // 2) + 1) -> (batch_size, channels, (frames // 2) + 1, joints)
398
+ x = x.reshape(batch_size, joints, channels, x.shape[-1]).permute(0, 2, 3, 1)
399
+ else:
400
+ # (batch_size * joints, channels, frames) -> (batch_size * joints, channels, frames // 2)
401
+ x = F.avg_pool1d(x, kernel_size=2, stride=2)
402
+ # (batch_size * joints, channels, frames // 2) -> (batch_size, channels, frames // 2, joints)
403
+ x = x.reshape(batch_size, joints, channels, x.shape[-1]).permute(0, 2, 3, 1)
404
+
405
+ # Pad the tensor
406
+ # pad = (0, 1)
407
+ # x = F.pad(x, pad, mode="constant", value=0)
408
+ batch_size, channels, frames, joints = x.shape
409
+ # (batch_size, channels, frames, joints) -> (batch_size * frames, channels, joints)
410
+ x = x.permute(0, 2, 1, 3).reshape(batch_size * frames, channels, joints)
411
+ x = self.joint_downsample(x)
412
+ # (batch_size * frames, channels, joints) -> (batch_size, channels, frames, joints)
413
+ x = x.reshape(batch_size, frames, x.shape[1], x.shape[2]).permute(0, 2, 1, 3)
414
+ return x
415
+
416
+
417
+
418
+ class ResBlock(nn.Module):
419
+ def __init__(self,
420
+ in_channels,
421
+ out_channels,
422
+ group_num=32,
423
+ max_channels=512):
424
+ super(ResBlock, self).__init__()
425
+ skip = max(1, max_channels // out_channels - 1)
426
+ self.block = nn.Sequential(
427
+ nn.GroupNorm(group_num, in_channels, eps=1e-06, affine=True),
428
+ nn.SiLU(),
429
+ nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=skip, dilation=skip),
430
+ nn.GroupNorm(group_num, out_channels, eps=1e-06, affine=True),
431
+ nn.SiLU(),
432
+ nn.Conv2d(out_channels, out_channels, kernel_size=1, stride=1, padding=0),
433
+ )
434
+ self.conv_short = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) if in_channels != out_channels else nn.Identity()
435
+
436
+ def forward(self, x):
437
+ hidden_states = self.block(x)
438
+ if hidden_states.shape != x.shape:
439
+ x = self.conv_short(x)
440
+ x = x + hidden_states
441
+ return x
442
+
443
+
444
+
445
+ class SMPL_VQVAE(nn.Module):
446
+ def __init__(self, encoder, decoder, vq):
447
+ super(SMPL_VQVAE, self).__init__()
448
+
449
+ self.encoder = encoder
450
+ self.decoder = decoder
451
+ self.vq = vq
452
+
453
+ def to(self, device):
454
+ self.encoder = self.encoder.to(device)
455
+ self.decoder = self.decoder.to(device)
456
+ self.vq = self.vq.to(device)
457
+ self.device = device
458
+ return self
459
+
460
+ def encdec_slice_frames(self, x, frame_batch_size, encdec, return_vq):
461
+ num_frames = x.shape[2]
462
+ remaining_frames = num_frames % frame_batch_size
463
+ x_output = []
464
+ loss_output = []
465
+ perplexity_output = []
466
+ for i in range(num_frames // frame_batch_size):
467
+ remaining_frames = num_frames % frame_batch_size
468
+ start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
469
+ end_frame = frame_batch_size * (i + 1) + remaining_frames
470
+ x_intermediate = x[:, :, start_frame:end_frame]
471
+ x_intermediate = encdec(x_intermediate)
472
+ # if encdec == self.encoder and self.vq is not None:
473
+ # x_intermediate, loss, perplexity = self.vq(x_intermediate)
474
+ # x_output.append(x_intermediate)
475
+ # loss_output.append(loss)
476
+ # perplexity_output.append(perplexity)
477
+ # else:
478
+ # x_output.append(x_intermediate)
479
+ x_output.append(x_intermediate)
480
+ if encdec == self.encoder and self.vq is not None and not self.vq.is_train:
481
+ x_output, loss = self.vq(torch.cat(x_output, dim=2), return_vq=return_vq)
482
+ return x_output, loss
483
+ elif encdec == self.encoder and self.vq is not None and self.vq.is_train:
484
+ x_output, loss, perplexity = self.vq(torch.cat(x_output, dim=2))
485
+ return x_output, loss, perplexity
486
+ else:
487
+ return torch.cat(x_output, dim=2), None, None
488
+
489
+ def forward(self, x, return_vq=False):
490
+ x = x.permute(0, 3, 1, 2)
491
+ if not self.vq.is_train:
492
+ x, loss = self.encdec_slice_frames(x, frame_batch_size=8, encdec=self.encoder, return_vq=return_vq)
493
+ else:
494
+ x, loss, perplexity = self.encdec_slice_frames(x, frame_batch_size=8, encdec=self.encoder, return_vq=return_vq)
495
+ if return_vq:
496
+ return x, loss
497
+ x, _, _ = self.encdec_slice_frames(x, frame_batch_size=2, encdec=self.decoder, return_vq=return_vq)
498
+ x = x.permute(0, 2, 3, 1)
499
+ if self.vq.is_train:
500
+ return x, loss, perplexity
501
+ return x, loss
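+ # Minimal usage sketch (the codebook size 8192 is an example value, not a checkpoint setting):
+ # vqvae = SMPL_VQVAE(Encoder(), Decoder(), VectorQuantizer(nb_code=8192, code_dim=3072, is_train=False)).to('cuda')
+ # motion = torch.randn(1, 16, 24, 3, device='cuda')  # [batch, frames, joints, xyz]
+ # recon, commit_loss = vqvae(motion)                  # recon keeps the [1, 16, 24, 3] shape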
motion_extractor.py ADDED
@@ -0,0 +1,62 @@
1
+ # motion_extractor.py
2
+ import os
3
+ import sys
4
+ import cv2
5
+ import torch
6
+ import pickle
7
+ import torchvision
8
+
9
+ # Load the TorchScript model once at the top
10
+ model_path = '/gemini/space/human_guozz2/dyb/MTVCrafter-main/nlf_l_multi_0.3.2.torchscript'
11
+ assert os.path.exists(model_path), f"Model file not found at {model_path}"
12
+ model = torch.jit.load(model_path).cuda().eval()
13
+
14
+ def extract_pkl_from_video(video_path):
15
+ output_file = "temp_motion.pkl"
16
+ cap = cv2.VideoCapture(video_path)
17
+ frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
18
+ video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
19
+ video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
20
+
21
+ pose_results = {
22
+ 'joints3d_nonparam': [],
23
+ }
24
+
25
+ with torch.inference_mode(), torch.device('cuda'):
26
+ frame_idx = 0
27
+ while cap.isOpened():
28
+ ret, frame = cap.read()
29
+ if not ret:
30
+ break
31
+
32
+ # Convert frame to tensor
33
+ frame_tensor = torch.from_numpy(frame).cuda()
34
+ frame_batch = frame_tensor.unsqueeze(0).permute(0,3,1,2)
35
+ # Model inference
36
+ pred = model.detect_smpl_batched(frame_batch)
37
+ # Collect pose data
38
+ for key in pose_results.keys():
39
+ if key in pred:
40
+ #pose_results[key].append(pred[key].cpu().numpy())
41
+ pose_results[key].append(pred[key])
42
+ else:
43
+ pose_results[key].append(None)
44
+
45
+ frame_idx += 1
46
+
47
+ cap.release()
48
+
49
+ # Prepare output data
50
+ output_data = {
51
+ 'video_path': video_path,
52
+ 'video_length': frame_count,
53
+ 'video_width': video_width,
54
+ 'video_height': video_height,
55
+ 'pose': pose_results
56
+ }
57
+
58
+ # Save to pkl file
59
+ with open(output_file, 'wb') as f:
60
+ pickle.dump(output_data, f)
61
+
62
+ return output_file
utils.py ADDED
@@ -0,0 +1,77 @@
1
+ import torch
2
+ import random
3
+ import numpy as np
4
+ from PIL import Image
5
+
6
+
7
+ def concat_images(images, direction='horizontal', pad=0, pad_value=0):
8
+ if len(images) == 1:
9
+ return images[0]
10
+ is_pil = isinstance(images[0], Image.Image)
11
+ if is_pil:
12
+ images = [np.array(image) for image in images]
13
+ if direction == 'horizontal':
14
+ height = max([image.shape[0] for image in images])
15
+ width = sum([image.shape[1] for image in images]) + pad * (len(images) - 1)
16
+ new_image = np.full((height, width, images[0].shape[2]), pad_value, dtype=images[0].dtype)
17
+ begin = 0
18
+ for image in images:
19
+ end = begin + image.shape[1]
20
+ new_image[: image.shape[0], begin:end] = image
21
+ begin = end + pad
22
+ elif direction == 'vertical':
23
+ height = sum([image.shape[0] for image in images]) + pad * (len(images) - 1)
24
+ width = max([image.shape[1] for image in images])
25
+ new_image = np.full((height, width, images[0].shape[2]), pad_value, dtype=images[0].dtype)
26
+ begin = 0
27
+ for image in images:
28
+ end = begin + image.shape[0]
29
+ new_image[begin:end, : image.shape[1]] = image
30
+ begin = end + pad
31
+ else:
32
+ assert False
33
+ if is_pil:
34
+ new_image = Image.fromarray(new_image)
35
+ return new_image
36
+
37
+ def concat_images_grid(images, cols, pad=0, pad_value=0):
38
+ new_images = []
39
+ while len(images) > 0:
40
+ new_image = concat_images(images[:cols], pad=pad, pad_value=pad_value)
41
+ new_images.append(new_image)
42
+ images = images[cols:]
43
+ new_image = concat_images(new_images, direction='vertical', pad=pad, pad_value=pad_value)
44
+ return new_image
45
+
46
+ def sample_video(video, indexes, method=2):
47
+ if method == 1:
48
+ frames = video.get_batch(indexes)
49
+ frames = frames.numpy() if isinstance(frames, torch.Tensor) else frames.asnumpy()
50
+ elif method == 2:
51
+ max_idx = indexes.max() + 1
52
+ all_indexes = np.arange(max_idx, dtype=int)
53
+ frames = video.get_batch(all_indexes)
54
+ frames = frames.numpy() if isinstance(frames, torch.Tensor) else frames.asnumpy()
55
+ frames = frames[indexes]
56
+ else:
57
+ assert False
58
+ return frames
59
+
60
+ def get_sample_indexes(video_length, num_frames, stride):
61
+ assert num_frames * stride <= video_length
62
+ sample_length = min(video_length, (num_frames - 1) * stride + 1)
63
+ start_idx = 0 + random.randint(0, video_length - sample_length)
64
+ sample_indexes = np.linspace(start_idx, start_idx + sample_length - 1, num_frames, dtype=int)
65
+ return sample_indexes
66
+
67
+ def get_new_height_width(data_dict, dst_height, dst_width):
68
+ height = data_dict['video_height']
69
+ width = data_dict['video_width']
70
+ if float(dst_height) / height < float(dst_width) / width:
71
+ new_height = int(round(float(dst_width) / width * height))
72
+ new_width = dst_width
73
+ else:
74
+ new_height = dst_height
75
+ new_width = int(round(float(dst_height) / height * width))
76
+ assert dst_width <= new_width and dst_height <= new_height
77
+ return new_height, new_width
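+ # Example: for a 1280x720 source video and dst_height=dst_width=512, the height is matched to 512
+ # and the width keeps the aspect ratio, so the function returns (512, 910); both target dimensions
+ # then fit inside the resized frame.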