LTX-Video-Playground

Running

App Files Community

Sapir committed on Oct 27

Commit

325137b

•

1 Parent(s): 41b1cab

Lint: added ruff.

Browse files

Files changed (23) hide show

.github/workflows/pylint.yml +27 -0
.gitignore +1 -1
.pre-commit-config.yaml +16 -0
scripts/to_safetensors.py +64 -30
setup.py +11 -7
xora/__init__.py +0 -1
xora/examples/image_to_video.py +94 -44
xora/examples/text_to_video.py +29 -13
xora/models/autoencoders/causal_conv3d.py +9 -3
xora/models/autoencoders/causal_video_autoencoder.py +133 -33
xora/models/autoencoders/conv_nd_factory.py +6 -2
xora/models/autoencoders/dual_conv3d.py +36 -6
xora/models/autoencoders/vae.py +74 -24
xora/models/autoencoders/vae_encode.py +62 -17
xora/models/autoencoders/video_autoencoder.py +170 -46
xora/models/transformers/attention.py +174 -53
xora/models/transformers/embeddings.py +6 -2
xora/models/transformers/symmetric_patchifier.py +19 -4
xora/models/transformers/transformer3d.py +86 -23
xora/pipelines/pipeline_video_pixart_alpha.py +205 -63
xora/schedulers/rf.py +43 -13
xora/utils/conditioning_method.py +2 -1
xora/utils/torch_utils.py +5 -1

.github/workflows/pylint.yml ADDED Viewed

	@@ -0,0 +1,27 @@

+name: Ruff
+on: [push]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+    - name: Checkout repository and submodules
+      uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install ruff==0.2.2 black==24.2.0
+    - name: Analyzing the code with ruff
+      run: |
+        ruff $(git ls-files '*.py')
+    - name: Verify that no Black changes are required
+      run: |
+        black --check $(git ls-files '*.py')

.gitignore CHANGED Viewed

@@ -159,4 +159,4 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/

 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,16 @@

+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.2.2
+    hooks:
+      # Run the linter.
+      - id: ruff
+        args: [--fix]  # Automatically fix issues if possible.
+        types: [python]  # Ensure it only runs on .py files.
+  - repo: https://github.com/psf/black
+    rev: 24.2.0  # Specify the version of Black you want
+    hooks:
+      - id: black
+        name: Black code formatter
+        language_version: python3  # Use the Python version you're targeting (e.g., 3.10)

scripts/to_safetensors.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import argparse
 from pathlib import Path
-from typing import Any, Dict
 import safetensors.torch
 import torch
 import json
@@ -8,12 +8,14 @@ import shutil
 def load_text_encoder(index_path: Path) -> Dict:
-    with open(index_path, 'r') as f:
         index: Dict = json.load(f)
     loaded_tensors = {}
     for part_file in set(index.get("weight_map", {}).values()):
-        tensors = safetensors.torch.load_file(index_path.parent / part_file, device='cpu')
         for tensor_name in tensors:
             loaded_tensors[tensor_name] = tensors[tensor_name]
@@ -30,23 +32,30 @@ def convert_vae(vae_path: Path, add_prefix=True) -> Dict:
     state_dict = torch.load(vae_path / "autoencoder.pth", weights_only=True)
     stats_path = vae_path / "per_channel_statistics.json"
     if stats_path.exists():
-        with open(stats_path, 'r') as f:
             data = json.load(f)
         transposed_data = list(zip(*data["data"]))
         data_dict = {
-            f"{'vae.' if add_prefix else ''}per_channel_statistics.{col}": torch.tensor(vals)
             for col, vals in zip(data["columns"], transposed_data)
         }
     else:
         data_dict = {}
-    result = {("vae." if add_prefix else "") + key: value for key, value in state_dict.items()}
     result.update(data_dict)
     return result
 def convert_encoder(encoder: Dict) -> Dict:
-    return {"text_encoders.t5xxl.transformer." + key: value for key, value in encoder.items()}
 def save_config(config_src: str, config_dst: str):
@@ -60,50 +69,75 @@ def load_vae_config(vae_path: Path) -> str:
     return str(config_path)
-def main(unet_path: str, vae_path: str, out_path: str, mode: str,
-         unet_config_path: str = None, scheduler_config_path: str = None) -> None:
-    unet = convert_unet(torch.load(unet_path, weights_only=True), add_prefix=(mode == 'single'))
     # Load VAE from directory and config
-    vae = convert_vae(Path(vae_path), add_prefix=(mode == 'single'))
     vae_config_path = load_vae_config(Path(vae_path))
-    if mode == 'single':
         result = {**unet, **vae}
         safetensors.torch.save_file(result, out_path)
-    elif mode == 'separate':
         # Create directories for unet, vae, and scheduler
-        unet_dir = Path(out_path) / 'unet'
-        vae_dir = Path(out_path) / 'vae'
-        scheduler_dir = Path(out_path) / 'scheduler'
         unet_dir.mkdir(parents=True, exist_ok=True)
         vae_dir.mkdir(parents=True, exist_ok=True)
         scheduler_dir.mkdir(parents=True, exist_ok=True)
         # Save unet and vae safetensors with the name diffusion_pytorch_model.safetensors
-        safetensors.torch.save_file(unet, unet_dir / 'diffusion_pytorch_model.safetensors')
-        safetensors.torch.save_file(vae, vae_dir / 'diffusion_pytorch_model.safetensors')
         # Save config files for unet, vae, and scheduler
         if unet_config_path:
-            save_config(unet_config_path, unet_dir / 'config.json')
         if vae_config_path:
-            save_config(vae_config_path, vae_dir / 'config.json')
         if scheduler_config_path:
-            save_config(scheduler_config_path, scheduler_dir / 'scheduler_config.json')
-if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--unet_path', '-u', type=str, default='unet/ema-002.pt')
-    parser.add_argument('--vae_path', '-v', type=str, default='vae/')
-    parser.add_argument('--out_path', '-o', type=str, default='xora.safetensors')
-    parser.add_argument('--mode', '-m', type=str, choices=['single', 'separate'], default='single',
-                        help="Choose 'single' for the original behavior, 'separate' to save unet and vae separately.")
-    parser.add_argument('--unet_config_path', type=str, help="Path to the UNet config file (for separate mode)")
-    parser.add_argument('--scheduler_config_path', type=str,
-                        help="Path to the Scheduler config file (for separate mode)")
     args = parser.parse_args()
     main(**args.__dict__)

 import argparse
 from pathlib import Path
+from typing import Dict
 import safetensors.torch
 import torch
 import json
 def load_text_encoder(index_path: Path) -> Dict:
+    with open(index_path, "r") as f:
         index: Dict = json.load(f)
     loaded_tensors = {}
     for part_file in set(index.get("weight_map", {}).values()):
+        tensors = safetensors.torch.load_file(
+            index_path.parent / part_file, device="cpu"
+        )
         for tensor_name in tensors:
             loaded_tensors[tensor_name] = tensors[tensor_name]
     state_dict = torch.load(vae_path / "autoencoder.pth", weights_only=True)
     stats_path = vae_path / "per_channel_statistics.json"
     if stats_path.exists():
+        with open(stats_path, "r") as f:
             data = json.load(f)
         transposed_data = list(zip(*data["data"]))
         data_dict = {
+            f"{'vae.' if add_prefix else ''}per_channel_statistics.{col}": torch.tensor(
+                vals
+            )
             for col, vals in zip(data["columns"], transposed_data)
         }
     else:
         data_dict = {}
+    result = {
+        ("vae." if add_prefix else "") + key: value for key, value in state_dict.items()
+    }
     result.update(data_dict)
     return result
 def convert_encoder(encoder: Dict) -> Dict:
+    return {
+        "text_encoders.t5xxl.transformer." + key: value
+        for key, value in encoder.items()
+    }
 def save_config(config_src: str, config_dst: str):
     return str(config_path)
+def main(
+    unet_path: str,
+    vae_path: str,
+    out_path: str,
+    mode: str,
+    unet_config_path: str = None,
+    scheduler_config_path: str = None,
+) -> None:
+    unet = convert_unet(
+        torch.load(unet_path, weights_only=True), add_prefix=(mode == "single")
+    )
     # Load VAE from directory and config
+    vae = convert_vae(Path(vae_path), add_prefix=(mode == "single"))
     vae_config_path = load_vae_config(Path(vae_path))
+    if mode == "single":
         result = {**unet, **vae}
         safetensors.torch.save_file(result, out_path)
+    elif mode == "separate":
         # Create directories for unet, vae, and scheduler
+        unet_dir = Path(out_path) / "unet"
+        vae_dir = Path(out_path) / "vae"
+        scheduler_dir = Path(out_path) / "scheduler"
         unet_dir.mkdir(parents=True, exist_ok=True)
         vae_dir.mkdir(parents=True, exist_ok=True)
         scheduler_dir.mkdir(parents=True, exist_ok=True)
         # Save unet and vae safetensors with the name diffusion_pytorch_model.safetensors
+        safetensors.torch.save_file(
+            unet, unet_dir / "diffusion_pytorch_model.safetensors"
+        )
+        safetensors.torch.save_file(
+            vae, vae_dir / "diffusion_pytorch_model.safetensors"
+        )
         # Save config files for unet, vae, and scheduler
         if unet_config_path:
+            save_config(unet_config_path, unet_dir / "config.json")
         if vae_config_path:
+            save_config(vae_config_path, vae_dir / "config.json")
         if scheduler_config_path:
+            save_config(scheduler_config_path, scheduler_dir / "scheduler_config.json")
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
+    parser.add_argument("--unet_path", "-u", type=str, default="unet/ema-002.pt")
+    parser.add_argument("--vae_path", "-v", type=str, default="vae/")
+    parser.add_argument("--out_path", "-o", type=str, default="xora.safetensors")
+    parser.add_argument(
+        "--mode",
+        "-m",
+        type=str,
+        choices=["single", "separate"],
+        default="single",
+        help="Choose 'single' for the original behavior, 'separate' to save unet and vae separately.",
+    )
+    parser.add_argument(
+        "--unet_config_path",
+        type=str,
+        help="Path to the UNet config file (for separate mode)",
+    )
+    parser.add_argument(
+        "--scheduler_config_path",
+        type=str,
+        help="Path to the Scheduler config file (for separate mode)",
+    )
     args = parser.parse_args()
     main(**args.__dict__)

setup.py CHANGED Viewed

@@ -1,7 +1,9 @@
 from setuptools import setup, find_packages
 def parse_requirements(filename):
     """Load requirements from a pip requirements file."""
-    with open(filename, 'r') as file:
         return file.read().splitlines()
@@ -13,11 +15,13 @@ setup(
     author_email="[email protected]",  # Your email
     url="https://github.com/LightricksResearch/xora-core",  # URL for the project (GitHub, etc.)
     packages=find_packages(),  # Automatically find all packages inside `xora`
-    install_requires=parse_requirements('requirements.txt'),  # Install dependencies from requirements.txt
     classifiers=[
-        'Programming Language :: Python :: 3',
-        'License :: OSI Approved :: MIT License',
-        'Operating System :: OS Independent',
     ],
-    python_requires='>=3.10',  # Specify Python version compatibility
-)

 from setuptools import setup, find_packages
 def parse_requirements(filename):
     """Load requirements from a pip requirements file."""
+    with open(filename, "r") as file:
         return file.read().splitlines()
     author_email="[email protected]",  # Your email
     url="https://github.com/LightricksResearch/xora-core",  # URL for the project (GitHub, etc.)
     packages=find_packages(),  # Automatically find all packages inside `xora`
+    install_requires=parse_requirements(
+        "requirements.txt"
+    ),  # Install dependencies from requirements.txt
     classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
     ],
+    python_requires=">=3.10",  # Specify Python version compatibility
+)

xora/__init__.py CHANGED Viewed

	@@ -1 +0,0 @@
1	- from .pipelines import *

xora/examples/image_to_video.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import time
 import torch
 from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from xora.models.transformers.transformer3d import Transformer3DModel
@@ -15,19 +14,20 @@ import os
 import numpy as np
 import cv2
 from PIL import Image
-from tqdm import tqdm
 import random
 def load_vae(vae_dir):
     vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
     vae_config_path = vae_dir / "config.json"
-    with open(vae_config_path, 'r') as f:
         vae_config = json.load(f)
     vae = CausalVideoAutoencoder.from_config(vae_config)
     vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
     vae.load_state_dict(vae_state_dict)
     return vae.cuda().to(torch.bfloat16)
 def load_unet(unet_dir):
     unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
     unet_config_path = unet_dir / "config.json"
@@ -37,11 +37,13 @@ def load_unet(unet_dir):
     transformer.load_state_dict(unet_state_dict, strict=True)
     return transformer.cuda()
 def load_scheduler(scheduler_dir):
     scheduler_config_path = scheduler_dir / "scheduler_config.json"
     scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
     return RectifiedFlowScheduler.from_config(scheduler_config)
 def center_crop_and_resize(frame, target_height, target_width):
     h, w, _ = frame.shape
     aspect_ratio_target = target_width / target_height
@@ -49,14 +51,15 @@ def center_crop_and_resize(frame, target_height, target_width):
     if aspect_ratio_frame > aspect_ratio_target:
         new_width = int(h * aspect_ratio_target)
         x_start = (w - new_width) // 2
-        frame_cropped = frame[:, x_start:x_start + new_width]
     else:
         new_height = int(w / aspect_ratio_target)
         y_start = (h - new_height) // 2
-        frame_cropped = frame[y_start:y_start + new_height, :]
     frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
     return frame_resized
 def load_video_to_tensor_with_resize(video_path, target_height=512, target_width=768):
     cap = cv2.VideoCapture(video_path)
     frames = []
@@ -72,6 +75,7 @@ def load_video_to_tensor_with_resize(video_path, target_height=512, target_width
     video_tensor = torch.tensor(video_np).permute(3, 0, 1, 2).float()
     return video_tensor
 def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
     image = Image.open(image_path).convert("RGB")
     image_np = np.array(image)
@@ -81,51 +85,90 @@ def load_image_to_tensor_with_resize(image_path, target_height=512, target_width
     # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
     return frame_tensor.unsqueeze(0).unsqueeze(2)
 def main():
-    parser = argparse.ArgumentParser(description='Load models from separate directories and run the pipeline.')
     # Directories
-    parser.add_argument('--ckpt_dir', type=str, required=True,
-                        help='Path to the directory containing unet, vae, and scheduler subdirectories')
-    parser.add_argument('--video_path', type=str,
-                        help='Path to the input video file (first frame used)')
-    parser.add_argument('--image_path', type=str,
-                        help='Path to the input image file')
-    parser.add_argument('--seed', type=int, default="171198")
     # Pipeline parameters
-    parser.add_argument('--num_inference_steps', type=int, default=40, help='Number of inference steps')
-    parser.add_argument('--num_images_per_prompt', type=int, default=1, help='Number of images per prompt')
-    parser.add_argument('--guidance_scale', type=float, default=3, help='Guidance scale for the pipeline')
-    parser.add_argument('--height', type=int, default=512, help='Height of the output video frames')
-    parser.add_argument('--width', type=int, default=768, help='Width of the output video frames')
-    parser.add_argument('--num_frames', type=int, default=121, help='Number of frames to generate in the output video')
-    parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate for the output video')
     # Prompts
-    parser.add_argument('--prompt', type=str,
-                        default='A man wearing a black leather jacket and blue jeans is riding a Harley Davidson motorcycle down a paved road. The man has short brown hair and is wearing a black helmet. The motorcycle is a dark red color with a large front fairing. The road is surrounded by green grass and trees. There is a gas station on the left side of the road with a red and white sign that says "Oil" and "Diner".',
-                        help='Text prompt to guide generation')
-    parser.add_argument('--negative_prompt', type=str,
-                        default='worst quality, inconsistent motion, blurry, jittery, distorted',
-                        help='Negative prompt for undesired features')
     args = parser.parse_args()
     # Paths for the separate mode directories
     ckpt_dir = Path(args.ckpt_dir)
-    unet_dir = ckpt_dir / 'unet'
-    vae_dir = ckpt_dir / 'vae'
-    scheduler_dir = ckpt_dir / 'scheduler'
     # Load models
     vae = load_vae(vae_dir)
     unet = load_unet(unet_dir)
     scheduler = load_scheduler(scheduler_dir)
     patchifier = SymmetricPatchifier(patch_size=1)
-    text_encoder = T5EncoderModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder").to(
-        "cuda")
-    tokenizer = T5Tokenizer.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer")
     # Use submodels for the pipeline
     submodel_dict = {
@@ -141,22 +184,25 @@ def main():
     # Load media (video or image)
     if args.video_path:
-        media_items = load_video_to_tensor_with_resize(args.video_path, args.height, args.width).unsqueeze(0)
     elif args.image_path:
-        media_items = load_image_to_tensor_with_resize(args.image_path, args.height, args.width)
     else:
         raise ValueError("Either --video_path or --image_path must be provided.")
     # Prepare input for the pipeline
     sample = {
         "prompt": args.prompt,
-        'prompt_attention_mask': None,
-        'negative_prompt': args.negative_prompt,
-        'negative_prompt_attention_mask': None,
-        'media_items': media_items,
     }
-    start_time = time.time()
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
@@ -177,16 +223,18 @@ def main():
         **sample,
         is_video=True,
         vae_per_channel_normalize=True,
-        conditioning_method=ConditioningMethod.FIRST_FRAME
     ).images
     # Save output video
-    def get_unique_filename(base, ext, dir='.', index_range=1000):
         for i in range(index_range):
             filename = os.path.join(dir, f"{base}_{i}{ext}")
             if not os.path.exists(filename):
                 return filename
-        raise FileExistsError(f"Could not find a unique filename after {index_range} attempts.")
     for i in range(images.shape[0]):
         video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
@@ -195,7 +243,9 @@ def main():
         height, width = video_np.shape[1:3]
         output_filename = get_unique_filename(f"video_output_{i}", ".mp4", ".")
-        out = cv2.VideoWriter(output_filename, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
         for frame in video_np[..., ::-1]:
             out.write(frame)

 import torch
 from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from xora.models.transformers.transformer3d import Transformer3DModel
 import numpy as np
 import cv2
 from PIL import Image
 import random
 def load_vae(vae_dir):
     vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
     vae_config_path = vae_dir / "config.json"
+    with open(vae_config_path, "r") as f:
         vae_config = json.load(f)
     vae = CausalVideoAutoencoder.from_config(vae_config)
     vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
     vae.load_state_dict(vae_state_dict)
     return vae.cuda().to(torch.bfloat16)
 def load_unet(unet_dir):
     unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
     unet_config_path = unet_dir / "config.json"
     transformer.load_state_dict(unet_state_dict, strict=True)
     return transformer.cuda()
 def load_scheduler(scheduler_dir):
     scheduler_config_path = scheduler_dir / "scheduler_config.json"
     scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
     return RectifiedFlowScheduler.from_config(scheduler_config)
 def center_crop_and_resize(frame, target_height, target_width):
     h, w, _ = frame.shape
     aspect_ratio_target = target_width / target_height
     if aspect_ratio_frame > aspect_ratio_target:
         new_width = int(h * aspect_ratio_target)
         x_start = (w - new_width) // 2
+        frame_cropped = frame[:, x_start : x_start + new_width]
     else:
         new_height = int(w / aspect_ratio_target)
         y_start = (h - new_height) // 2
+        frame_cropped = frame[y_start : y_start + new_height, :]
     frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
     return frame_resized
 def load_video_to_tensor_with_resize(video_path, target_height=512, target_width=768):
     cap = cv2.VideoCapture(video_path)
     frames = []
     video_tensor = torch.tensor(video_np).permute(3, 0, 1, 2).float()
     return video_tensor
 def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
     image = Image.open(image_path).convert("RGB")
     image_np = np.array(image)
     # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
     return frame_tensor.unsqueeze(0).unsqueeze(2)
 def main():
+    parser = argparse.ArgumentParser(
+        description="Load models from separate directories and run the pipeline."
+    )
     # Directories
+    parser.add_argument(
+        "--ckpt_dir",
+        type=str,
+        required=True,
+        help="Path to the directory containing unet, vae, and scheduler subdirectories",
+    )
+    parser.add_argument(
+        "--video_path", type=str, help="Path to the input video file (first frame used)"
+    )
+    parser.add_argument("--image_path", type=str, help="Path to the input image file")
+    parser.add_argument("--seed", type=int, default="171198")
     # Pipeline parameters
+    parser.add_argument(
+        "--num_inference_steps", type=int, default=40, help="Number of inference steps"
+    )
+    parser.add_argument(
+        "--num_images_per_prompt",
+        type=int,
+        default=1,
+        help="Number of images per prompt",
+    )
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=3,
+        help="Guidance scale for the pipeline",
+    )
+    parser.add_argument(
+        "--height", type=int, default=512, help="Height of the output video frames"
+    )
+    parser.add_argument(
+        "--width", type=int, default=768, help="Width of the output video frames"
+    )
+    parser.add_argument(
+        "--num_frames",
+        type=int,
+        default=121,
+        help="Number of frames to generate in the output video",
+    )
+    parser.add_argument(
+        "--frame_rate", type=int, default=25, help="Frame rate for the output video"
+    )
     # Prompts
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        default='A man wearing a black leather jacket and blue jeans is riding a Harley Davidson motorcycle down a paved road. The man has short brown hair and is wearing a black helmet. The motorcycle is a dark red color with a large front fairing. The road is surrounded by green grass and trees. There is a gas station on the left side of the road with a red and white sign that says "Oil" and "Diner".',
+        help="Text prompt to guide generation",
+    )
+    parser.add_argument(
+        "--negative_prompt",
+        type=str,
+        default="worst quality, inconsistent motion, blurry, jittery, distorted",
+        help="Negative prompt for undesired features",
+    )
     args = parser.parse_args()
     # Paths for the separate mode directories
     ckpt_dir = Path(args.ckpt_dir)
+    unet_dir = ckpt_dir / "unet"
+    vae_dir = ckpt_dir / "vae"
+    scheduler_dir = ckpt_dir / "scheduler"
     # Load models
     vae = load_vae(vae_dir)
     unet = load_unet(unet_dir)
     scheduler = load_scheduler(scheduler_dir)
     patchifier = SymmetricPatchifier(patch_size=1)
+    text_encoder = T5EncoderModel.from_pretrained(
+        "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
+    ).to("cuda")
+    tokenizer = T5Tokenizer.from_pretrained(
+        "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
+    )
     # Use submodels for the pipeline
     submodel_dict = {
     # Load media (video or image)
     if args.video_path:
+        media_items = load_video_to_tensor_with_resize(
+            args.video_path, args.height, args.width
+        ).unsqueeze(0)
     elif args.image_path:
+        media_items = load_image_to_tensor_with_resize(
+            args.image_path, args.height, args.width
+        )
     else:
         raise ValueError("Either --video_path or --image_path must be provided.")
     # Prepare input for the pipeline
     sample = {
         "prompt": args.prompt,
+        "prompt_attention_mask": None,
+        "negative_prompt": args.negative_prompt,
+        "negative_prompt_attention_mask": None,
+        "media_items": media_items,
     }
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
         **sample,
         is_video=True,
         vae_per_channel_normalize=True,
+        conditioning_method=ConditioningMethod.FIRST_FRAME,
     ).images
     # Save output video
+    def get_unique_filename(base, ext, dir=".", index_range=1000):
         for i in range(index_range):
             filename = os.path.join(dir, f"{base}_{i}{ext}")
             if not os.path.exists(filename):
                 return filename
+        raise FileExistsError(
+            f"Could not find a unique filename after {index_range} attempts."
+        )
     for i in range(images.shape[0]):
         video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
         height, width = video_np.shape[1:3]
         output_filename = get_unique_filename(f"video_output_{i}", ".mp4", ".")
+        out = cv2.VideoWriter(
+            output_filename, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
+        )
         for frame in video_np[..., ::-1]:
             out.write(frame)

xora/examples/text_to_video.py CHANGED Viewed

@@ -10,16 +10,18 @@ import safetensors.torch
 import json
 import argparse
 def load_vae(vae_dir):
     vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
     vae_config_path = vae_dir / "config.json"
-    with open(vae_config_path, 'r') as f:
         vae_config = json.load(f)
     vae = CausalVideoAutoencoder.from_config(vae_config)
     vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
     vae.load_state_dict(vae_state_dict)
     return vae.cuda().to(torch.bfloat16)
 def load_unet(unet_dir):
     unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
     unet_config_path = unet_dir / "config.json"
@@ -29,22 +31,31 @@ def load_unet(unet_dir):
     transformer.load_state_dict(unet_state_dict, strict=True)
     return transformer.cuda()
 def load_scheduler(scheduler_dir):
     scheduler_config_path = scheduler_dir / "scheduler_config.json"
     scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
     return RectifiedFlowScheduler.from_config(scheduler_config)
 def main():
     # Parse command line arguments
-    parser = argparse.ArgumentParser(description='Load models from separate directories')
-    parser.add_argument('--separate_dir', type=str, required=True, help='Path to the directory containing unet, vae, and scheduler subdirectories')
     args = parser.parse_args()
     # Paths for the separate mode directories
     separate_dir = Path(args.separate_dir)
-    unet_dir = separate_dir / 'unet'
-    vae_dir = separate_dir / 'vae'
-    scheduler_dir = separate_dir / 'scheduler'
     # Load models
     vae = load_vae(vae_dir)
@@ -54,8 +65,12 @@ def main():
     # Patchifier (remains the same)
     patchifier = SymmetricPatchifier(patch_size=1)
-    text_encoder = T5EncoderModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder").to("cuda")
-    tokenizer = T5Tokenizer.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer")
     # Use submodels for the pipeline
     submodel_dict = {
@@ -79,14 +94,14 @@ def main():
     frame_rate = 25
     sample = {
         "prompt": "A middle-aged man with glasses and a salt-and-pepper beard is driving a car and talking, gesturing with his right hand. "
-                  "The man is wearing a dark blue zip-up jacket and a light blue collared shirt. He is sitting in the driver's seat of a car with a black interior. The car is moving on a road with trees and bushes on either side. The man has a serious expression on his face and is looking straight ahead.",
-        'prompt_attention_mask': None,  # Adjust attention masks as needed
-        'negative_prompt': "Ugly deformed",
-        'negative_prompt_attention_mask': None
     }
     # Generate images (video frames)
-    images = pipeline(
         num_inference_steps=num_inference_steps,
         num_images_per_prompt=num_images_per_prompt,
         guidance_scale=guidance_scale,
@@ -104,5 +119,6 @@ def main():
     print("Generated images (video frames).")
 if __name__ == "__main__":
     main()

 import json
 import argparse
 def load_vae(vae_dir):
     vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
     vae_config_path = vae_dir / "config.json"
+    with open(vae_config_path, "r") as f:
         vae_config = json.load(f)
     vae = CausalVideoAutoencoder.from_config(vae_config)
     vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
     vae.load_state_dict(vae_state_dict)
     return vae.cuda().to(torch.bfloat16)
 def load_unet(unet_dir):
     unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
     unet_config_path = unet_dir / "config.json"
     transformer.load_state_dict(unet_state_dict, strict=True)
     return transformer.cuda()
 def load_scheduler(scheduler_dir):
     scheduler_config_path = scheduler_dir / "scheduler_config.json"
     scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
     return RectifiedFlowScheduler.from_config(scheduler_config)
 def main():
     # Parse command line arguments
+    parser = argparse.ArgumentParser(
+        description="Load models from separate directories"
+    )
+    parser.add_argument(
+        "--separate_dir",
+        type=str,
+        required=True,
+        help="Path to the directory containing unet, vae, and scheduler subdirectories",
+    )
     args = parser.parse_args()
     # Paths for the separate mode directories
     separate_dir = Path(args.separate_dir)
+    unet_dir = separate_dir / "unet"
+    vae_dir = separate_dir / "vae"
+    scheduler_dir = separate_dir / "scheduler"
     # Load models
     vae = load_vae(vae_dir)
     # Patchifier (remains the same)
     patchifier = SymmetricPatchifier(patch_size=1)
+    text_encoder = T5EncoderModel.from_pretrained(
+        "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
+    ).to("cuda")
+    tokenizer = T5Tokenizer.from_pretrained(
+        "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
+    )
     # Use submodels for the pipeline
     submodel_dict = {
     frame_rate = 25
     sample = {
         "prompt": "A middle-aged man with glasses and a salt-and-pepper beard is driving a car and talking, gesturing with his right hand. "
+        "The man is wearing a dark blue zip-up jacket and a light blue collared shirt. He is sitting in the driver's seat of a car with a black interior. The car is moving on a road with trees and bushes on either side. The man has a serious expression on his face and is looking straight ahead.",
+        "prompt_attention_mask": None,  # Adjust attention masks as needed
+        "negative_prompt": "Ugly deformed",
+        "negative_prompt_attention_mask": None,
     }
     # Generate images (video frames)
+    _ = pipeline(
         num_inference_steps=num_inference_steps,
         num_images_per_prompt=num_images_per_prompt,
         guidance_scale=guidance_scale,
     print("Generated images (video frames).")
 if __name__ == "__main__":
     main()

xora/models/autoencoders/causal_conv3d.py CHANGED Viewed

@@ -40,11 +40,17 @@ class CausalConv3d(nn.Module):
     def forward(self, x, causal: bool = True):
         if causal:
-            first_frame_pad = x[:, :, :1, :, :].repeat((1, 1, self.time_kernel_size - 1, 1, 1))
             x = torch.concatenate((first_frame_pad, x), dim=2)
         else:
-            first_frame_pad = x[:, :, :1, :, :].repeat((1, 1, (self.time_kernel_size - 1) // 2, 1, 1))
-            last_frame_pad = x[:, :, -1:, :, :].repeat((1, 1, (self.time_kernel_size - 1) // 2, 1, 1))
             x = torch.concatenate((first_frame_pad, x, last_frame_pad), dim=2)
         x = self.conv(x)
         return x

     def forward(self, x, causal: bool = True):
         if causal:
+            first_frame_pad = x[:, :, :1, :, :].repeat(
+                (1, 1, self.time_kernel_size - 1, 1, 1)
+            )
             x = torch.concatenate((first_frame_pad, x), dim=2)
         else:
+            first_frame_pad = x[:, :, :1, :, :].repeat(
+                (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
+            )
+            last_frame_pad = x[:, :, -1:, :, :].repeat(
+                (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
+            )
             x = torch.concatenate((first_frame_pad, x, last_frame_pad), dim=2)
         x = self.conv(x)
         return x

xora/models/autoencoders/causal_video_autoencoder.py CHANGED Viewed

@@ -16,9 +16,15 @@ from xora.models.autoencoders.vae import AutoencoderKLWrapper
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 class CausalVideoAutoencoder(AutoencoderKLWrapper):
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *args, **kwargs):
         config_local_path = pretrained_model_name_or_path / "config.json"
         config = cls.load_config(config_local_path, **kwargs)
         video_vae = cls.from_config(config)
@@ -28,29 +34,41 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
         ckpt_state_dict = torch.load(model_local_path, map_location=torch.device("cpu"))
         video_vae.load_state_dict(ckpt_state_dict)
-        statistics_local_path = pretrained_model_name_or_path / "per_channel_statistics.json"
         if statistics_local_path.exists():
             with open(statistics_local_path, "r") as file:
                 data = json.load(file)
             transposed_data = list(zip(*data["data"]))
-            data_dict = {col: torch.tensor(vals) for col, vals in zip(data["columns"], transposed_data)}
             video_vae.register_buffer("std_of_means", data_dict["std-of-means"])
             video_vae.register_buffer(
-                "mean_of_means", data_dict.get("mean-of-means", torch.zeros_like(data_dict["std-of-means"]))
             )
         return video_vae
     @staticmethod
     def from_config(config):
-        assert config["_class_name"] == "CausalVideoAutoencoder", "config must have _class_name=CausalVideoAutoencoder"
         if isinstance(config["dims"], list):
             config["dims"] = tuple(config["dims"])
         assert config["dims"] in [2, 3, (2, 1)], "dims must be 2, 3 or (2, 1)"
         double_z = config.get("double_z", True)
-        latent_log_var = config.get("latent_log_var", "per_channel" if double_z else "none")
         use_quant_conv = config.get("use_quant_conv", True)
         if use_quant_conv and latent_log_var == "uniform":
@@ -91,7 +109,8 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
             _class_name="CausalVideoAutoencoder",
             dims=self.dims,
             in_channels=self.encoder.conv_in.in_channels // self.encoder.patch_size**2,
-            out_channels=self.decoder.conv_out.out_channels // self.decoder.patch_size**2,
             latent_channels=self.decoder.conv_in.in_channels,
             blocks=self.encoder.blocks_desc,
             scaling_factor=1.0,
@@ -112,13 +131,26 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
     @property
     def spatial_downscale_factor(self):
         return (
-            2 ** len([block for block in self.encoder.blocks_desc if block[0] in ["compress_space", "compress_all"]])
             * self.encoder.patch_size
         )
     @property
     def temporal_downscale_factor(self):
-        return 2 ** len([block for block in self.encoder.blocks_desc if block[0] in ["compress_time", "compress_all"]])
     def to_json_string(self) -> str:
         import json
@@ -146,7 +178,9 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
                 key = key.replace(k, v)
             if "norm" in key and key not in model_keys:
-                logger.info(f"Removing key {key} from state_dict as it is not present in the model")
                 continue
             converted_state_dict[key] = value
@@ -293,7 +327,9 @@ class Encoder(nn.Module):
         # out
         if norm_layer == "group_norm":
-            self.conv_norm_out = nn.GroupNorm(num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6)
         elif norm_layer == "pixel_norm":
             self.conv_norm_out = PixelNorm()
         elif norm_layer == "layer_norm":
@@ -308,7 +344,9 @@ class Encoder(nn.Module):
             conv_out_channels += 1
         elif latent_log_var != "none":
             raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
-        self.conv_out = make_conv_nd(dims, output_channel, conv_out_channels, 3, padding=1, causal=True)
         self.gradient_checkpointing = False
@@ -337,11 +375,15 @@ class Encoder(nn.Module):
             if num_dims == 4:
                 # For shape (B, C, H, W)
-                repeated_last_channel = last_channel.repeat(1, sample.shape[1] - 2, 1, 1)
                 sample = torch.cat([sample, repeated_last_channel], dim=1)
             elif num_dims == 5:
                 # For shape (B, C, F, H, W)
-                repeated_last_channel = last_channel.repeat(1, sample.shape[1] - 2, 1, 1, 1)
                 sample = torch.cat([sample, repeated_last_channel], dim=1)
             else:
                 raise ValueError(f"Invalid input shape: {sample.shape}")
@@ -430,25 +472,35 @@ class Decoder(nn.Module):
                     norm_layer=norm_layer,
                 )
             elif block_name == "compress_time":
-                block = DepthToSpaceUpsample(dims=dims, in_channels=input_channel, stride=(2, 1, 1))
             elif block_name == "compress_space":
-                block = DepthToSpaceUpsample(dims=dims, in_channels=input_channel, stride=(1, 2, 2))
             elif block_name == "compress_all":
-                block = DepthToSpaceUpsample(dims=dims, in_channels=input_channel, stride=(2, 2, 2))
             else:
                 raise ValueError(f"unknown layer: {block_name}")
             self.up_blocks.append(block)
         if norm_layer == "group_norm":
-            self.conv_norm_out = nn.GroupNorm(num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6)
         elif norm_layer == "pixel_norm":
             self.conv_norm_out = PixelNorm()
         elif norm_layer == "layer_norm":
             self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
         self.conv_act = nn.SiLU()
-        self.conv_out = make_conv_nd(dims, output_channel, out_channels, 3, padding=1, causal=True)
         self.gradient_checkpointing = False
@@ -509,7 +561,9 @@ class UNetMidBlock3D(nn.Module):
         norm_layer: str = "group_norm",
     ):
         super().__init__()
-        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
         self.res_blocks = nn.ModuleList(
             [
@@ -526,7 +580,9 @@ class UNetMidBlock3D(nn.Module):
             ]
         )
-    def forward(self, hidden_states: torch.FloatTensor, causal: bool = True) -> torch.FloatTensor:
         for resnet in self.res_blocks:
             hidden_states = resnet(hidden_states, causal=causal)
@@ -604,7 +660,9 @@ class ResnetBlock3D(nn.Module):
         self.use_conv_shortcut = conv_shortcut
         if norm_layer == "group_norm":
-            self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
         elif norm_layer == "pixel_norm":
             self.norm1 = PixelNorm()
         elif norm_layer == "layer_norm":
@@ -612,10 +670,20 @@ class ResnetBlock3D(nn.Module):
         self.non_linearity = nn.SiLU()
-        self.conv1 = make_conv_nd(dims, in_channels, out_channels, kernel_size=3, stride=1, padding=1, causal=True)
         if norm_layer == "group_norm":
-            self.norm2 = nn.GroupNorm(num_groups=groups, num_channels=out_channels, eps=eps, affine=True)
         elif norm_layer == "pixel_norm":
             self.norm2 = PixelNorm()
         elif norm_layer == "layer_norm":
@@ -623,16 +691,28 @@ class ResnetBlock3D(nn.Module):
         self.dropout = torch.nn.Dropout(dropout)
-        self.conv2 = make_conv_nd(dims, out_channels, out_channels, kernel_size=3, stride=1, padding=1, causal=True)
         self.conv_shortcut = (
-            make_linear_nd(dims=dims, in_channels=in_channels, out_channels=out_channels)
             if in_channels != out_channels
             else nn.Identity()
         )
         self.norm3 = (
-            LayerNorm(in_channels, eps=eps, elementwise_affine=True) if in_channels != out_channels else nn.Identity()
         )
     def forward(
@@ -669,9 +749,17 @@ def patchify(x, patch_size_hw, patch_size_t=1):
     if patch_size_hw == 1 and patch_size_t == 1:
         return x
     if x.dim() == 4:
-        x = rearrange(x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw)
     elif x.dim() == 5:
-        x = rearrange(x, "b c (f p) (h q) (w r) -> b (c p r q) f h w", p=patch_size_t, q=patch_size_hw, r=patch_size_hw)
     else:
         raise ValueError(f"Invalid input shape: {x.shape}")
@@ -683,9 +771,17 @@ def unpatchify(x, patch_size_hw, patch_size_t=1):
         return x
     if x.dim() == 4:
-        x = rearrange(x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw)
     elif x.dim() == 5:
-        x = rearrange(x, "b (c p r q) f h w -> b c (f p) (h q) (w r)", p=patch_size_t, q=patch_size_hw, r=patch_size_hw)
     return x
@@ -755,14 +851,18 @@ def demo_video_autoencoder_forward_backward():
     print(f"input shape={input_videos.shape}")
     print(f"latent shape={latent.shape}")
-    reconstructed_videos = video_autoencoder.decode(latent, target_shape=input_videos.shape).sample
     print(f"reconstructed shape={reconstructed_videos.shape}")
     # Validate that single image gets treated the same way as first frame
     input_image = input_videos[:, :, :1, :, :]
     image_latent = video_autoencoder.encode(input_image).latent_dist.mode()
-    reconstructed_image = video_autoencoder.decode(image_latent, target_shape=image_latent.shape).sample
     first_frame_latent = latent[:, :, :1, :, :]

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 class CausalVideoAutoencoder(AutoencoderKLWrapper):
     @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *args,
+        **kwargs,
+    ):
         config_local_path = pretrained_model_name_or_path / "config.json"
         config = cls.load_config(config_local_path, **kwargs)
         video_vae = cls.from_config(config)
         ckpt_state_dict = torch.load(model_local_path, map_location=torch.device("cpu"))
         video_vae.load_state_dict(ckpt_state_dict)
+        statistics_local_path = (
+            pretrained_model_name_or_path / "per_channel_statistics.json"
+        )
         if statistics_local_path.exists():
             with open(statistics_local_path, "r") as file:
                 data = json.load(file)
             transposed_data = list(zip(*data["data"]))
+            data_dict = {
+                col: torch.tensor(vals)
+                for col, vals in zip(data["columns"], transposed_data)
+            }
             video_vae.register_buffer("std_of_means", data_dict["std-of-means"])
             video_vae.register_buffer(
+                "mean_of_means",
+                data_dict.get(
+                    "mean-of-means", torch.zeros_like(data_dict["std-of-means"])
+                ),
             )
         return video_vae
     @staticmethod
     def from_config(config):
+        assert (
+            config["_class_name"] == "CausalVideoAutoencoder"
+        ), "config must have _class_name=CausalVideoAutoencoder"
         if isinstance(config["dims"], list):
             config["dims"] = tuple(config["dims"])
         assert config["dims"] in [2, 3, (2, 1)], "dims must be 2, 3 or (2, 1)"
         double_z = config.get("double_z", True)
+        latent_log_var = config.get(
+            "latent_log_var", "per_channel" if double_z else "none"
+        )
         use_quant_conv = config.get("use_quant_conv", True)
         if use_quant_conv and latent_log_var == "uniform":
             _class_name="CausalVideoAutoencoder",
             dims=self.dims,
             in_channels=self.encoder.conv_in.in_channels // self.encoder.patch_size**2,
+            out_channels=self.decoder.conv_out.out_channels
+            // self.decoder.patch_size**2,
             latent_channels=self.decoder.conv_in.in_channels,
             blocks=self.encoder.blocks_desc,
             scaling_factor=1.0,
     @property
     def spatial_downscale_factor(self):
         return (
+            2
+            ** len(
+                [
+                    block
+                    for block in self.encoder.blocks_desc
+                    if block[0] in ["compress_space", "compress_all"]
+                ]
+            )
             * self.encoder.patch_size
         )
     @property
     def temporal_downscale_factor(self):
+        return 2 ** len(
+            [
+                block
+                for block in self.encoder.blocks_desc
+                if block[0] in ["compress_time", "compress_all"]
+            ]
+        )
     def to_json_string(self) -> str:
         import json
                 key = key.replace(k, v)
             if "norm" in key and key not in model_keys:
+                logger.info(
+                    f"Removing key {key} from state_dict as it is not present in the model"
+                )
                 continue
             converted_state_dict[key] = value
         # out
         if norm_layer == "group_norm":
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
+            )
         elif norm_layer == "pixel_norm":
             self.conv_norm_out = PixelNorm()
         elif norm_layer == "layer_norm":
             conv_out_channels += 1
         elif latent_log_var != "none":
             raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
+        self.conv_out = make_conv_nd(
+            dims, output_channel, conv_out_channels, 3, padding=1, causal=True
+        )
         self.gradient_checkpointing = False
             if num_dims == 4:
                 # For shape (B, C, H, W)
+                repeated_last_channel = last_channel.repeat(
+                    1, sample.shape[1] - 2, 1, 1
+                )
                 sample = torch.cat([sample, repeated_last_channel], dim=1)
             elif num_dims == 5:
                 # For shape (B, C, F, H, W)
+                repeated_last_channel = last_channel.repeat(
+                    1, sample.shape[1] - 2, 1, 1, 1
+                )
                 sample = torch.cat([sample, repeated_last_channel], dim=1)
             else:
                 raise ValueError(f"Invalid input shape: {sample.shape}")
                     norm_layer=norm_layer,
                 )
             elif block_name == "compress_time":
+                block = DepthToSpaceUpsample(
+                    dims=dims, in_channels=input_channel, stride=(2, 1, 1)
+                )
             elif block_name == "compress_space":
+                block = DepthToSpaceUpsample(
+                    dims=dims, in_channels=input_channel, stride=(1, 2, 2)
+                )
             elif block_name == "compress_all":
+                block = DepthToSpaceUpsample(
+                    dims=dims, in_channels=input_channel, stride=(2, 2, 2)
+                )
             else:
                 raise ValueError(f"unknown layer: {block_name}")
             self.up_blocks.append(block)
         if norm_layer == "group_norm":
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
+            )
         elif norm_layer == "pixel_norm":
             self.conv_norm_out = PixelNorm()
         elif norm_layer == "layer_norm":
             self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
         self.conv_act = nn.SiLU()
+        self.conv_out = make_conv_nd(
+            dims, output_channel, out_channels, 3, padding=1, causal=True
+        )
         self.gradient_checkpointing = False
         norm_layer: str = "group_norm",
     ):
         super().__init__()
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
         self.res_blocks = nn.ModuleList(
             [
             ]
         )
+    def forward(
+        self, hidden_states: torch.FloatTensor, causal: bool = True
+    ) -> torch.FloatTensor:
         for resnet in self.res_blocks:
             hidden_states = resnet(hidden_states, causal=causal)
         self.use_conv_shortcut = conv_shortcut
         if norm_layer == "group_norm":
+            self.norm1 = nn.GroupNorm(
+                num_groups=groups, num_channels=in_channels, eps=eps, affine=True
+            )
         elif norm_layer == "pixel_norm":
             self.norm1 = PixelNorm()
         elif norm_layer == "layer_norm":
         self.non_linearity = nn.SiLU()
+        self.conv1 = make_conv_nd(
+            dims,
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            causal=True,
+        )
         if norm_layer == "group_norm":
+            self.norm2 = nn.GroupNorm(
+                num_groups=groups, num_channels=out_channels, eps=eps, affine=True
+            )
         elif norm_layer == "pixel_norm":
             self.norm2 = PixelNorm()
         elif norm_layer == "layer_norm":
         self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = make_conv_nd(
+            dims,
+            out_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            causal=True,
+        )
         self.conv_shortcut = (
+            make_linear_nd(
+                dims=dims, in_channels=in_channels, out_channels=out_channels
+            )
             if in_channels != out_channels
             else nn.Identity()
         )
         self.norm3 = (
+            LayerNorm(in_channels, eps=eps, elementwise_affine=True)
+            if in_channels != out_channels
+            else nn.Identity()
         )
     def forward(
     if patch_size_hw == 1 and patch_size_t == 1:
         return x
     if x.dim() == 4:
+        x = rearrange(
+            x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw
+        )
     elif x.dim() == 5:
+        x = rearrange(
+            x,
+            "b c (f p) (h q) (w r) -> b (c p r q) f h w",
+            p=patch_size_t,
+            q=patch_size_hw,
+            r=patch_size_hw,
+        )
     else:
         raise ValueError(f"Invalid input shape: {x.shape}")
         return x
     if x.dim() == 4:
+        x = rearrange(
+            x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw
+        )
     elif x.dim() == 5:
+        x = rearrange(
+            x,
+            "b (c p r q) f h w -> b c (f p) (h q) (w r)",
+            p=patch_size_t,
+            q=patch_size_hw,
+            r=patch_size_hw,
+        )
     return x
     print(f"input shape={input_videos.shape}")
     print(f"latent shape={latent.shape}")
+    reconstructed_videos = video_autoencoder.decode(
+        latent, target_shape=input_videos.shape
+    ).sample
     print(f"reconstructed shape={reconstructed_videos.shape}")
     # Validate that single image gets treated the same way as first frame
     input_image = input_videos[:, :, :1, :, :]
     image_latent = video_autoencoder.encode(input_image).latent_dist.mode()
+    reconstructed_image = video_autoencoder.decode(
+        image_latent, target_shape=image_latent.shape
+    ).sample
     first_frame_latent = latent[:, :, :1, :, :]

xora/models/autoencoders/conv_nd_factory.py CHANGED Viewed

@@ -71,8 +71,12 @@ def make_linear_nd(
     bias=True,
 ):
     if dims == 2:
-        return torch.nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias)
     elif dims == 3 or dims == (2, 1):
-        return torch.nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias)
     else:
         raise ValueError(f"unsupported dimensions: {dims}")

     bias=True,
 ):
     if dims == 2:
+        return torch.nn.Conv2d(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
+        )
     elif dims == 3 or dims == (2, 1):
+        return torch.nn.Conv3d(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
+        )
     else:
         raise ValueError(f"unsupported dimensions: {dims}")

xora/models/autoencoders/dual_conv3d.py CHANGED Viewed

@@ -27,7 +27,9 @@ class DualConv3d(nn.Module):
         if isinstance(kernel_size, int):
             kernel_size = (kernel_size, kernel_size, kernel_size)
         if kernel_size == (1, 1, 1):
-            raise ValueError("kernel_size must be greater than 1. Use make_linear_nd instead.")
         if isinstance(stride, int):
             stride = (stride, stride, stride)
         if isinstance(padding, int):
@@ -40,11 +42,19 @@ class DualConv3d(nn.Module):
         self.bias = bias
         # Define the size of the channels after the first convolution
-        intermediate_channels = out_channels if in_channels < out_channels else in_channels
         # Define parameters for the first convolution
         self.weight1 = nn.Parameter(
-            torch.Tensor(intermediate_channels, in_channels // groups, 1, kernel_size[1], kernel_size[2])
         )
         self.stride1 = (1, stride[1], stride[2])
         self.padding1 = (0, padding[1], padding[2])
@@ -55,7 +65,11 @@ class DualConv3d(nn.Module):
             self.register_parameter("bias1", None)
         # Define parameters for the second convolution
-        self.weight2 = nn.Parameter(torch.Tensor(out_channels, intermediate_channels // groups, kernel_size[0], 1, 1))
         self.stride2 = (stride[0], 1, 1)
         self.padding2 = (padding[0], 0, 0)
         self.dilation2 = (dilation[0], 1, 1)
@@ -86,13 +100,29 @@ class DualConv3d(nn.Module):
     def forward_with_3d(self, x, skip_time_conv):
         # First convolution
-        x = F.conv3d(x, self.weight1, self.bias1, self.stride1, self.padding1, self.dilation1, self.groups)
         if skip_time_conv:
             return x
         # Second convolution
-        x = F.conv3d(x, self.weight2, self.bias2, self.stride2, self.padding2, self.dilation2, self.groups)
         return x

         if isinstance(kernel_size, int):
             kernel_size = (kernel_size, kernel_size, kernel_size)
         if kernel_size == (1, 1, 1):
+            raise ValueError(
+                "kernel_size must be greater than 1. Use make_linear_nd instead."
+            )
         if isinstance(stride, int):
             stride = (stride, stride, stride)
         if isinstance(padding, int):
         self.bias = bias
         # Define the size of the channels after the first convolution
+        intermediate_channels = (
+            out_channels if in_channels < out_channels else in_channels
+        )
         # Define parameters for the first convolution
         self.weight1 = nn.Parameter(
+            torch.Tensor(
+                intermediate_channels,
+                in_channels // groups,
+                1,
+                kernel_size[1],
+                kernel_size[2],
+            )
         )
         self.stride1 = (1, stride[1], stride[2])
         self.padding1 = (0, padding[1], padding[2])
             self.register_parameter("bias1", None)
         # Define parameters for the second convolution
+        self.weight2 = nn.Parameter(
+            torch.Tensor(
+                out_channels, intermediate_channels // groups, kernel_size[0], 1, 1
+            )
+        )
         self.stride2 = (stride[0], 1, 1)
         self.padding2 = (padding[0], 0, 0)
         self.dilation2 = (dilation[0], 1, 1)
     def forward_with_3d(self, x, skip_time_conv):
         # First convolution
+        x = F.conv3d(
+            x,
+            self.weight1,
+            self.bias1,
+            self.stride1,
+            self.padding1,
+            self.dilation1,
+            self.groups,
+        )
         if skip_time_conv:
             return x
         # Second convolution
+        x = F.conv3d(
+            x,
+            self.weight2,
+            self.bias2,
+            self.stride2,
+            self.padding2,
+            self.dilation2,
+            self.groups,
+        )
         return x

xora/models/autoencoders/vae.py CHANGED Viewed

@@ -4,7 +4,10 @@ import torch
 import math
 import torch.nn as nn
 from diffusers import ConfigMixin, ModelMixin
-from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
 from diffusers.models.modeling_outputs import AutoencoderKLOutput
 from xora.models.autoencoders.conv_nd_factory import make_conv_nd
@@ -43,8 +46,12 @@ class AutoencoderKLWrapper(ModelMixin, ConfigMixin):
         quant_dims = 2 if dims == 2 else 3
         self.decoder = decoder
         if use_quant_conv:
-            self.quant_conv = make_conv_nd(quant_dims, 2 * latent_channels, 2 * latent_channels, 1)
-            self.post_quant_conv = make_conv_nd(quant_dims, latent_channels, latent_channels, 1)
         else:
             self.quant_conv = nn.Identity()
             self.post_quant_conv = nn.Identity()
@@ -104,7 +111,13 @@ class AutoencoderKLWrapper(ModelMixin, ConfigMixin):
         for i in range(0, x.shape[3], overlap_size):
             row = []
             for j in range(0, x.shape[4], overlap_size):
-                tile = x[:, :, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
                 tile = self.encoder(tile)
                 tile = self.quant_conv(tile)
                 row.append(tile)
@@ -125,42 +138,58 @@ class AutoencoderKLWrapper(ModelMixin, ConfigMixin):
         moments = torch.cat(result_rows, dim=3)
         return moments
-    def blend_z(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
         blend_extent = min(a.shape[2], b.shape[2], blend_extent)
         for z in range(blend_extent):
-            b[:, :, z, :, :] = a[:, :, -blend_extent + z, :, :] * (1 - z / blend_extent) + b[:, :, z, :, :] * (
-                z / blend_extent
-            )
         return b
-    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
         blend_extent = min(a.shape[3], b.shape[3], blend_extent)
         for y in range(blend_extent):
-            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
-                y / blend_extent
-            )
         return b
-    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
         blend_extent = min(a.shape[4], b.shape[4], blend_extent)
         for x in range(blend_extent):
-            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
-                x / blend_extent
-            )
         return b
     def _hw_tiled_decode(self, z: torch.FloatTensor, target_shape):
         overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
         blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
         row_limit = self.tile_sample_min_size - blend_extent
-        tile_target_shape = (*target_shape[:3], self.tile_sample_min_size, self.tile_sample_min_size)
         # Split z into overlapping 64x64 tiles and decode them separately.
         # The tiles have an overlap to avoid seams between tiles.
         rows = []
         for i in range(0, z.shape[3], overlap_size):
             row = []
             for j in range(0, z.shape[4], overlap_size):
-                tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
                 tile = self.post_quant_conv(tile)
                 decoded = self.decoder(tile, target_shape=tile_target_shape)
                 row.append(decoded)
@@ -181,20 +210,34 @@ class AutoencoderKLWrapper(ModelMixin, ConfigMixin):
         dec = torch.cat(result_rows, dim=3)
         return dec
-    def encode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
         if self.use_z_tiling and z.shape[2] > self.z_sample_size > 1:
             num_splits = z.shape[2] // self.z_sample_size
             sizes = [self.z_sample_size] * num_splits
-            sizes = sizes + [z.shape[2] - sum(sizes)] if z.shape[2] - sum(sizes) > 0 else sizes
             tiles = z.split(sizes, dim=2)
             moments_tiles = [
-                self._hw_tiled_encode(z_tile, return_dict) if self.use_hw_tiling else self._encode(z_tile)
                 for z_tile in tiles
             ]
             moments = torch.cat(moments_tiles, dim=2)
         else:
-            moments = self._hw_tiled_encode(z, return_dict) if self.use_hw_tiling else self._encode(z)
         posterior = DiagonalGaussianDistribution(moments)
         if not return_dict:
@@ -207,7 +250,9 @@ class AutoencoderKLWrapper(ModelMixin, ConfigMixin):
         moments = self.quant_conv(h)
         return moments
-    def _decode(self, z: torch.FloatTensor, target_shape=None) -> Union[DecoderOutput, torch.FloatTensor]:
         z = self.post_quant_conv(z)
         dec = self.decoder(z, target_shape=target_shape)
         return dec
@@ -219,7 +264,12 @@ class AutoencoderKLWrapper(ModelMixin, ConfigMixin):
         if self.use_z_tiling and z.shape[2] > self.z_sample_size > 1:
             reduction_factor = int(
                 self.encoder.patch_size_t
-                * 2 ** (len(self.encoder.down_blocks) - 1 - math.sqrt(self.encoder.patch_size))
             )
             split_size = self.z_sample_size // reduction_factor
             num_splits = z.shape[2] // split_size

 import math
 import torch.nn as nn
 from diffusers import ConfigMixin, ModelMixin
+from diffusers.models.autoencoders.vae import (
+    DecoderOutput,
+    DiagonalGaussianDistribution,
+)
 from diffusers.models.modeling_outputs import AutoencoderKLOutput
 from xora.models.autoencoders.conv_nd_factory import make_conv_nd
         quant_dims = 2 if dims == 2 else 3
         self.decoder = decoder
         if use_quant_conv:
+            self.quant_conv = make_conv_nd(
+                quant_dims, 2 * latent_channels, 2 * latent_channels, 1
+            )
+            self.post_quant_conv = make_conv_nd(
+                quant_dims, latent_channels, latent_channels, 1
+            )
         else:
             self.quant_conv = nn.Identity()
             self.post_quant_conv = nn.Identity()
         for i in range(0, x.shape[3], overlap_size):
             row = []
             for j in range(0, x.shape[4], overlap_size):
+                tile = x[
+                    :,
+                    :,
+                    :,
+                    i : i + self.tile_sample_min_size,
+                    j : j + self.tile_sample_min_size,
+                ]
                 tile = self.encoder(tile)
                 tile = self.quant_conv(tile)
                 row.append(tile)
         moments = torch.cat(result_rows, dim=3)
         return moments
+    def blend_z(
+        self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
+    ) -> torch.Tensor:
         blend_extent = min(a.shape[2], b.shape[2], blend_extent)
         for z in range(blend_extent):
+            b[:, :, z, :, :] = a[:, :, -blend_extent + z, :, :] * (
+                1 - z / blend_extent
+            ) + b[:, :, z, :, :] * (z / blend_extent)
         return b
+    def blend_v(
+        self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
+    ) -> torch.Tensor:
         blend_extent = min(a.shape[3], b.shape[3], blend_extent)
         for y in range(blend_extent):
+            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (
+                1 - y / blend_extent
+            ) + b[:, :, :, y, :] * (y / blend_extent)
         return b
+    def blend_h(
+        self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
+    ) -> torch.Tensor:
         blend_extent = min(a.shape[4], b.shape[4], blend_extent)
         for x in range(blend_extent):
+            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (
+                1 - x / blend_extent
+            ) + b[:, :, :, :, x] * (x / blend_extent)
         return b
     def _hw_tiled_decode(self, z: torch.FloatTensor, target_shape):
         overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
         blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
         row_limit = self.tile_sample_min_size - blend_extent
+        tile_target_shape = (
+            *target_shape[:3],
+            self.tile_sample_min_size,
+            self.tile_sample_min_size,
+        )
         # Split z into overlapping 64x64 tiles and decode them separately.
         # The tiles have an overlap to avoid seams between tiles.
         rows = []
         for i in range(0, z.shape[3], overlap_size):
             row = []
             for j in range(0, z.shape[4], overlap_size):
+                tile = z[
+                    :,
+                    :,
+                    :,
+                    i : i + self.tile_latent_min_size,
+                    j : j + self.tile_latent_min_size,
+                ]
                 tile = self.post_quant_conv(tile)
                 decoded = self.decoder(tile, target_shape=tile_target_shape)
                 row.append(decoded)
         dec = torch.cat(result_rows, dim=3)
         return dec
+    def encode(
+        self, z: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
         if self.use_z_tiling and z.shape[2] > self.z_sample_size > 1:
             num_splits = z.shape[2] // self.z_sample_size
             sizes = [self.z_sample_size] * num_splits
+            sizes = (
+                sizes + [z.shape[2] - sum(sizes)]
+                if z.shape[2] - sum(sizes) > 0
+                else sizes
+            )
             tiles = z.split(sizes, dim=2)
             moments_tiles = [
+                (
+                    self._hw_tiled_encode(z_tile, return_dict)
+                    if self.use_hw_tiling
+                    else self._encode(z_tile)
+                )
                 for z_tile in tiles
             ]
             moments = torch.cat(moments_tiles, dim=2)
         else:
+            moments = (
+                self._hw_tiled_encode(z, return_dict)
+                if self.use_hw_tiling
+                else self._encode(z)
+            )
         posterior = DiagonalGaussianDistribution(moments)
         if not return_dict:
         moments = self.quant_conv(h)
         return moments
+    def _decode(
+        self, z: torch.FloatTensor, target_shape=None
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
         z = self.post_quant_conv(z)
         dec = self.decoder(z, target_shape=target_shape)
         return dec
         if self.use_z_tiling and z.shape[2] > self.z_sample_size > 1:
             reduction_factor = int(
                 self.encoder.patch_size_t
+                * 2
+                ** (
+                    len(self.encoder.down_blocks)
+                    - 1
+                    - math.sqrt(self.encoder.patch_size)
+                )
             )
             split_size = self.z_sample_size // reduction_factor
             num_splits = z.shape[2] // split_size

xora/models/autoencoders/vae_encode.py CHANGED Viewed

@@ -6,12 +6,19 @@ from torch import Tensor
 from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from xora.models.autoencoders.video_autoencoder import Downsample3D, VideoAutoencoder
 try:
     import torch_xla.core.xla_model as xm
-except:
-    pass
-def vae_encode(media_items: Tensor, vae: AutoencoderKL, split_size: int = 1, vae_per_channel_normalize=False) -> Tensor:
     """
     Encodes media items (images or videos) into latent representations using a specified VAE model.
     The function supports processing batches of images or video frames and can handle the processing
@@ -48,11 +55,15 @@ def vae_encode(media_items: Tensor, vae: AutoencoderKL, split_size: int = 1, vae
     if channels != 3:
         raise ValueError(f"Expects tensors with 3 channels, got {channels}.")
-    if is_video_shaped and not isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
         media_items = rearrange(media_items, "b c n h w -> (b n) c h w")
     if split_size > 1:
         if len(media_items) % split_size != 0:
-            raise ValueError("Error: The batch size must be divisible by 'train.vae_bs_split")
         encode_bs = len(media_items) // split_size
         # latents = [vae.encode(image_batch).latent_dist.sample() for image_batch in media_items.split(encode_bs)]
         latents = []
@@ -67,22 +78,32 @@ def vae_encode(media_items: Tensor, vae: AutoencoderKL, split_size: int = 1, vae
         latents = vae.encode(media_items).latent_dist.sample()
     latents = normalize_latents(latents, vae, vae_per_channel_normalize)
-    if is_video_shaped and not isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
         latents = rearrange(latents, "(b n) c h w -> b c n h w", b=batch_size)
     return latents
 def vae_decode(
-    latents: Tensor, vae: AutoencoderKL, is_video: bool = True, split_size: int = 1, vae_per_channel_normalize=False
 ) -> Tensor:
     is_video_shaped = latents.dim() == 5
     batch_size = latents.shape[0]
-    if is_video_shaped and not isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
         latents = rearrange(latents, "b c n h w -> (b n) c h w")
     if split_size > 1:
         if len(latents) % split_size != 0:
-            raise ValueError("Error: The batch size must be divisible by 'train.vae_bs_split")
         encode_bs = len(latents) // split_size
         image_batch = [
             _run_decoder(latent_batch, vae, is_video, vae_per_channel_normalize)
@@ -92,12 +113,16 @@ def vae_decode(
     else:
         images = _run_decoder(latents, vae, is_video, vae_per_channel_normalize)
-    if is_video_shaped and not isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
         images = rearrange(images, "(b n) c h w -> b c n h w", b=batch_size)
     return images
-def _run_decoder(latents: Tensor, vae: AutoencoderKL, is_video: bool, vae_per_channel_normalize=False) -> Tensor:
     if isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
         *_, fl, hl, wl = latents.shape
         temporal_scale, spatial_scale, _ = get_vae_size_scale_factor(vae)
@@ -105,7 +130,13 @@ def _run_decoder(latents: Tensor, vae: AutoencoderKL, is_video: bool, vae_per_ch
         image = vae.decode(
             un_normalize_latents(latents, vae, vae_per_channel_normalize),
             return_dict=False,
-            target_shape=(1, 3, fl * temporal_scale if is_video else 1, hl * spatial_scale, wl * spatial_scale),
         )[0]
     else:
         image = vae.decode(
@@ -120,14 +151,26 @@ def get_vae_size_scale_factor(vae: AutoencoderKL) -> float:
         spatial = vae.spatial_downscale_factor
         temporal = vae.temporal_downscale_factor
     else:
-        down_blocks = len([block for block in vae.encoder.down_blocks if isinstance(block.downsample, Downsample3D)])
         spatial = vae.config.patch_size * 2**down_blocks
-        temporal = vae.config.patch_size_t * 2 ** down_blocks if isinstance(vae, VideoAutoencoder) else 1
     return (temporal, spatial, spatial)
-def normalize_latents(latents: Tensor, vae: AutoencoderKL, vae_per_channel_normalize: bool = False) -> Tensor:
     return (
         (latents - vae.mean_of_means.to(latents.dtype).view(1, -1, 1, 1, 1))
         / vae.std_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
@@ -136,10 +179,12 @@ def normalize_latents(latents: Tensor, vae: AutoencoderKL, vae_per_channel_norma
     )
-def un_normalize_latents(latents: Tensor, vae: AutoencoderKL, vae_per_channel_normalize: bool = False) -> Tensor:
     return (
         latents * vae.std_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
         + vae.mean_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
         if vae_per_channel_normalize
         else latents / vae.config.scaling_factor
-    )

 from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from xora.models.autoencoders.video_autoencoder import Downsample3D, VideoAutoencoder
 try:
     import torch_xla.core.xla_model as xm
+except ImportError:
+    xm = None
+def vae_encode(
+    media_items: Tensor,
+    vae: AutoencoderKL,
+    split_size: int = 1,
+    vae_per_channel_normalize=False,
+) -> Tensor:
     """
     Encodes media items (images or videos) into latent representations using a specified VAE model.
     The function supports processing batches of images or video frames and can handle the processing
     if channels != 3:
         raise ValueError(f"Expects tensors with 3 channels, got {channels}.")
+    if is_video_shaped and not isinstance(
+        vae, (VideoAutoencoder, CausalVideoAutoencoder)
+    ):
         media_items = rearrange(media_items, "b c n h w -> (b n) c h w")
     if split_size > 1:
         if len(media_items) % split_size != 0:
+            raise ValueError(
+                "Error: The batch size must be divisible by 'train.vae_bs_split"
+            )
         encode_bs = len(media_items) // split_size
         # latents = [vae.encode(image_batch).latent_dist.sample() for image_batch in media_items.split(encode_bs)]
         latents = []
         latents = vae.encode(media_items).latent_dist.sample()
     latents = normalize_latents(latents, vae, vae_per_channel_normalize)
+    if is_video_shaped and not isinstance(
+        vae, (VideoAutoencoder, CausalVideoAutoencoder)
+    ):
         latents = rearrange(latents, "(b n) c h w -> b c n h w", b=batch_size)
     return latents
 def vae_decode(
+    latents: Tensor,
+    vae: AutoencoderKL,
+    is_video: bool = True,
+    split_size: int = 1,
+    vae_per_channel_normalize=False,
 ) -> Tensor:
     is_video_shaped = latents.dim() == 5
     batch_size = latents.shape[0]
+    if is_video_shaped and not isinstance(
+        vae, (VideoAutoencoder, CausalVideoAutoencoder)
+    ):
         latents = rearrange(latents, "b c n h w -> (b n) c h w")
     if split_size > 1:
         if len(latents) % split_size != 0:
+            raise ValueError(
+                "Error: The batch size must be divisible by 'train.vae_bs_split"
+            )
         encode_bs = len(latents) // split_size
         image_batch = [
             _run_decoder(latent_batch, vae, is_video, vae_per_channel_normalize)
     else:
         images = _run_decoder(latents, vae, is_video, vae_per_channel_normalize)
+    if is_video_shaped and not isinstance(
+        vae, (VideoAutoencoder, CausalVideoAutoencoder)
+    ):
         images = rearrange(images, "(b n) c h w -> b c n h w", b=batch_size)
     return images
+def _run_decoder(
+    latents: Tensor, vae: AutoencoderKL, is_video: bool, vae_per_channel_normalize=False
+) -> Tensor:
     if isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
         *_, fl, hl, wl = latents.shape
         temporal_scale, spatial_scale, _ = get_vae_size_scale_factor(vae)
         image = vae.decode(
             un_normalize_latents(latents, vae, vae_per_channel_normalize),
             return_dict=False,
+            target_shape=(
+                1,
+                3,
+                fl * temporal_scale if is_video else 1,
+                hl * spatial_scale,
+                wl * spatial_scale,
+            ),
         )[0]
     else:
         image = vae.decode(
         spatial = vae.spatial_downscale_factor
         temporal = vae.temporal_downscale_factor
     else:
+        down_blocks = len(
+            [
+                block
+                for block in vae.encoder.down_blocks
+                if isinstance(block.downsample, Downsample3D)
+            ]
+        )
         spatial = vae.config.patch_size * 2**down_blocks
+        temporal = (
+            vae.config.patch_size_t * 2**down_blocks
+            if isinstance(vae, VideoAutoencoder)
+            else 1
+        )
     return (temporal, spatial, spatial)
+def normalize_latents(
+    latents: Tensor, vae: AutoencoderKL, vae_per_channel_normalize: bool = False
+) -> Tensor:
     return (
         (latents - vae.mean_of_means.to(latents.dtype).view(1, -1, 1, 1, 1))
         / vae.std_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
     )
def un_normalize_latents(
    latents: Tensor, vae: AutoencoderKL, vae_per_channel_normalize: bool = False
) -> Tensor:
    """Undo latent normalization, either per channel or with the global scaling factor.

    Args:
        latents: tensor to un-normalize.
        vae: model providing ``std_of_means`` / ``mean_of_means`` (per-channel
            mode) or ``config.scaling_factor`` (global mode).
        vae_per_channel_normalize: when truthy, compute
            ``latents * std_of_means + mean_of_means`` with the statistics cast
            to ``latents.dtype`` and viewed as ``(1, -1, 1, 1, 1)``; otherwise
            divide by ``vae.config.scaling_factor``.

    Returns:
        The un-normalized latents.
    """
    if not vae_per_channel_normalize:
        return latents / vae.config.scaling_factor

    # Statistics are reshaped to (1, -1, 1, 1, 1) so they broadcast against latents.
    std = vae.std_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
    mean = vae.mean_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
    return latents * std + mean

xora/models/autoencoders/video_autoencoder.py CHANGED Viewed

@@ -21,7 +21,12 @@ logger = logging.get_logger(__name__)
 class VideoAutoencoder(AutoencoderKLWrapper):
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *args, **kwargs):
         config_local_path = pretrained_model_name_or_path / "config.json"
         config = cls.load_config(config_local_path, **kwargs)
         video_vae = cls.from_config(config)
@@ -31,29 +36,41 @@ class VideoAutoencoder(AutoencoderKLWrapper):
         ckpt_state_dict = torch.load(model_local_path)
         video_vae.load_state_dict(ckpt_state_dict)
-        statistics_local_path = pretrained_model_name_or_path / "per_channel_statistics.json"
         if statistics_local_path.exists():
             with open(statistics_local_path, "r") as file:
                 data = json.load(file)
             transposed_data = list(zip(*data["data"]))
-            data_dict = {col: torch.tensor(vals) for col, vals in zip(data["columns"], transposed_data)}
             video_vae.register_buffer("std_of_means", data_dict["std-of-means"])
             video_vae.register_buffer(
-                "mean_of_means", data_dict.get("mean-of-means", torch.zeros_like(data_dict["std-of-means"]))
             )
         return video_vae
     @staticmethod
     def from_config(config):
-        assert config["_class_name"] == "VideoAutoencoder", "config must have _class_name=VideoAutoencoder"
         if isinstance(config["dims"], list):
             config["dims"] = tuple(config["dims"])
         assert config["dims"] in [2, 3, (2, 1)], "dims must be 2, 3 or (2, 1)"
         double_z = config.get("double_z", True)
-        latent_log_var = config.get("latent_log_var", "per_channel" if double_z else "none")
         use_quant_conv = config.get("use_quant_conv", True)
         if use_quant_conv and latent_log_var == "uniform":
@@ -96,8 +113,10 @@ class VideoAutoencoder(AutoencoderKLWrapper):
         return SimpleNamespace(
             _class_name="VideoAutoencoder",
             dims=self.dims,
-            in_channels=self.encoder.conv_in.in_channels // (self.encoder.patch_size_t * self.encoder.patch_size**2),
-            out_channels=self.decoder.conv_out.out_channels // (self.decoder.patch_size_t * self.decoder.patch_size**2),
             latent_channels=self.decoder.conv_in.in_channels,
             block_out_channels=[
                 self.encoder.down_blocks[i].res_blocks[-1].conv1.out_channels
@@ -143,7 +162,9 @@ class VideoAutoencoder(AutoencoderKLWrapper):
                 key = key.replace(k, v)
             if "norm" in key and key not in model_keys:
-                logger.info(f"Removing key {key} from state_dict as it is not present in the model")
                 continue
             converted_state_dict[key] = value
@@ -253,7 +274,11 @@ class Encoder(nn.Module):
         # out
         if norm_layer == "group_norm":
-            self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
         elif norm_layer == "pixel_norm":
             self.conv_norm_out = PixelNorm()
         self.conv_act = nn.SiLU()
@@ -265,14 +290,23 @@ class Encoder(nn.Module):
             conv_out_channels += 1
         elif latent_log_var != "none":
             raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
-        self.conv_out = make_conv_nd(dims, block_out_channels[-1], conv_out_channels, 3, padding=1)
         self.gradient_checkpointing = False
     @property
     def downscale_factor(self):
         return (
-            2 ** len([block for block in self.down_blocks if isinstance(block.downsample, Downsample3D)])
             * self.patch_size
         )
@@ -299,7 +333,9 @@ class Encoder(nn.Module):
         )
         for down_block in self.down_blocks:
-            sample = checkpoint_fn(down_block)(sample, downsample_in_time=downsample_in_time)
         sample = checkpoint_fn(self.mid_block)(sample)
@@ -314,11 +350,15 @@ class Encoder(nn.Module):
             if num_dims == 4:
                 # For shape (B, C, H, W)
-                repeated_last_channel = last_channel.repeat(1, sample.shape[1] - 2, 1, 1)
                 sample = torch.cat([sample, repeated_last_channel], dim=1)
             elif num_dims == 5:
                 # For shape (B, C, F, H, W)
-                repeated_last_channel = last_channel.repeat(1, sample.shape[1] - 2, 1, 1, 1)
                 sample = torch.cat([sample, repeated_last_channel], dim=1)
             else:
                 raise ValueError(f"Invalid input shape: {sample.shape}")
@@ -405,7 +445,8 @@ class Decoder(nn.Module):
                 num_layers=self.layers_per_block + 1,
                 in_channels=prev_output_channel,
                 out_channels=output_channel,
-                add_upsample=not is_final_block and 2 ** (len(block_out_channels) - i - 1) > patch_size,
                 resnet_eps=1e-6,
                 resnet_groups=norm_num_groups,
                 norm_layer=norm_layer,
@@ -413,12 +454,16 @@ class Decoder(nn.Module):
             self.up_blocks.append(up_block)
         if norm_layer == "group_norm":
-            self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
         elif norm_layer == "pixel_norm":
             self.conv_norm_out = PixelNorm()
         self.conv_act = nn.SiLU()
-        self.conv_out = make_conv_nd(dims, block_out_channels[0], out_channels, 3, padding=1)
         self.gradient_checkpointing = False
@@ -494,15 +539,24 @@ class DownEncoderBlock3D(nn.Module):
         self.res_blocks = nn.ModuleList(res_blocks)
         if add_downsample:
-            self.downsample = Downsample3D(dims, out_channels, out_channels=out_channels, padding=downsample_padding)
         else:
             self.downsample = Identity()
-    def forward(self, hidden_states: torch.FloatTensor, downsample_in_time) -> torch.FloatTensor:
         for resnet in self.res_blocks:
             hidden_states = resnet(hidden_states)
-        hidden_states = self.downsample(hidden_states, downsample_in_time=downsample_in_time)
         return hidden_states
@@ -536,7 +590,9 @@ class UNetMidBlock3D(nn.Module):
         norm_layer: str = "group_norm",
     ):
         super().__init__()
-        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
         self.res_blocks = nn.ModuleList(
             [
@@ -595,13 +651,17 @@ class UpDecoderBlock3D(nn.Module):
         self.res_blocks = nn.ModuleList(res_blocks)
         if add_upsample:
-            self.upsample = Upsample3D(dims=dims, channels=out_channels, out_channels=out_channels)
         else:
             self.upsample = Identity()
         self.resolution_idx = resolution_idx
-    def forward(self, hidden_states: torch.FloatTensor, upsample_in_time=True) -> torch.FloatTensor:
         for resnet in self.res_blocks:
             hidden_states = resnet(hidden_states)
@@ -641,25 +701,35 @@ class ResnetBlock3D(nn.Module):
         self.use_conv_shortcut = conv_shortcut
         if norm_layer == "group_norm":
-            self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
         elif norm_layer == "pixel_norm":
             self.norm1 = PixelNorm()
         self.non_linearity = nn.SiLU()
-        self.conv1 = make_conv_nd(dims, in_channels, out_channels, kernel_size=3, stride=1, padding=1)
         if norm_layer == "group_norm":
-            self.norm2 = torch.nn.GroupNorm(num_groups=groups, num_channels=out_channels, eps=eps, affine=True)
         elif norm_layer == "pixel_norm":
             self.norm2 = PixelNorm()
         self.dropout = torch.nn.Dropout(dropout)
-        self.conv2 = make_conv_nd(dims, out_channels, out_channels, kernel_size=3, stride=1, padding=1)
         self.conv_shortcut = (
-            make_linear_nd(dims=dims, in_channels=in_channels, out_channels=out_channels)
             if in_channels != out_channels
             else nn.Identity()
         )
@@ -692,7 +762,14 @@ class ResnetBlock3D(nn.Module):
 class Downsample3D(nn.Module):
-    def __init__(self, dims, in_channels: int, out_channels: int, kernel_size: int = 3, padding: int = 1):
         super().__init__()
         stride: int = 2
         self.padding = padding
@@ -735,18 +812,24 @@ class Upsample3D(nn.Module):
         self.dims = dims
         self.channels = channels
         self.out_channels = out_channels or channels
-        self.conv = make_conv_nd(dims, channels, out_channels, kernel_size=3, padding=1, bias=True)
     def forward(self, x, upsample_in_time):
         if self.dims == 2:
-            x = functional.interpolate(x, (x.shape[2] * 2, x.shape[3] * 2), mode="nearest")
         else:
             time_scale_factor = 2 if upsample_in_time else 1
             # print("before:", x.shape)
             b, c, d, h, w = x.shape
             x = rearrange(x, "b c d h w -> (b d) c h w")
             # height and width interpolate
-            x = functional.interpolate(x, (x.shape[2] * 2, x.shape[3] * 2), mode="nearest")
             _, _, h, w = x.shape
             if not upsample_in_time and self.dims == (2, 1):
@@ -760,7 +843,9 @@ class Upsample3D(nn.Module):
             new_d = x.shape[-1] * time_scale_factor
             x = functional.interpolate(x, (1, new_d), mode="nearest")
             # (b h w) c 1 new_d
-            x = rearrange(x, "(b h w) c 1 new_d  -> b c new_d h w", b=b, h=h, w=w, new_d=new_d)
             # b c d h w
             # x = functional.interpolate(
@@ -775,13 +860,25 @@ def patchify(x, patch_size_hw, patch_size_t=1, add_channel_padding=False):
     if patch_size_hw == 1 and patch_size_t == 1:
         return x
     if x.dim() == 4:
-        x = rearrange(x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw)
     elif x.dim() == 5:
-        x = rearrange(x, "b c (f p) (h q) (w r) -> b (c p r q) f h w", p=patch_size_t, q=patch_size_hw, r=patch_size_hw)
     else:
         raise ValueError(f"Invalid input shape: {x.shape}")
-    if (x.dim() == 5) and (patch_size_hw > patch_size_t) and (patch_size_t > 1 or add_channel_padding):
         channels_to_pad = x.shape[1] * (patch_size_hw // patch_size_t) - x.shape[1]
         padding_zeros = torch.zeros(
             x.shape[0],
@@ -801,14 +898,26 @@ def unpatchify(x, patch_size_hw, patch_size_t=1, add_channel_padding=False):
     if patch_size_hw == 1 and patch_size_t == 1:
         return x
-    if (x.dim() == 5) and (patch_size_hw > patch_size_t) and (patch_size_t > 1 or add_channel_padding):
         channels_to_keep = int(x.shape[1] * (patch_size_t / patch_size_hw))
         x = x[:, :channels_to_keep, :, :, :]
     if x.dim() == 4:
-        x = rearrange(x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw)
     elif x.dim() == 5:
-        x = rearrange(x, "b (c p r q) f h w -> b c (f p) (h q) (w r)", p=patch_size_t, q=patch_size_hw, r=patch_size_hw)
     return x
@@ -818,11 +927,19 @@ def create_video_autoencoder_config(
 ):
     config = {
         "_class_name": "VideoAutoencoder",
-        "dims": (2, 1),  # 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
         "in_channels": 3,  # Number of input color channels (e.g., RGB)
         "out_channels": 3,  # Number of output color channels
         "latent_channels": latent_channels,  # Number of channels in the latent space representation
-        "block_out_channels": [128, 256, 512, 512],  # Number of output channels of each encoder / decoder inner block
         "patch_size": 1,
     }
@@ -834,11 +951,15 @@ def create_video_autoencoder_pathify4x4x4_config(
 ):
     config = {
         "_class_name": "VideoAutoencoder",
-        "dims": (2, 1),  # 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
         "in_channels": 3,  # Number of input color channels (e.g., RGB)
         "out_channels": 3,  # Number of output color channels
         "latent_channels": latent_channels,  # Number of channels in the latent space representation
-        "block_out_channels": [512] * 4,  # Number of output channels of each encoder / decoder inner block
         "patch_size": 4,
         "latent_log_var": "uniform",
     }
@@ -855,7 +976,8 @@ def create_video_autoencoder_pathify4x4_config(
         "in_channels": 3,  # Number of input color channels (e.g., RGB)
         "out_channels": 3,  # Number of output color channels
         "latent_channels": latent_channels,  # Number of channels in the latent space representation
-        "block_out_channels": [512] * 4,  # Number of output channels of each encoder / decoder inner block
         "patch_size": 4,
         "norm_layer": "pixel_norm",
     }
@@ -894,7 +1016,9 @@ def demo_video_autoencoder_forward_backward():
     latent = video_autoencoder.encode(input_videos).latent_dist.mode()
     print(f"input shape={input_videos.shape}")
     print(f"latent shape={latent.shape}")
-    reconstructed_videos = video_autoencoder.decode(latent, target_shape=input_videos.shape).sample
     print(f"reconstructed shape={reconstructed_videos.shape}")

 class VideoAutoencoder(AutoencoderKLWrapper):
     @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *args,
+        **kwargs,
+    ):
         config_local_path = pretrained_model_name_or_path / "config.json"
         config = cls.load_config(config_local_path, **kwargs)
         video_vae = cls.from_config(config)
         ckpt_state_dict = torch.load(model_local_path)
         video_vae.load_state_dict(ckpt_state_dict)
+        statistics_local_path = (
+            pretrained_model_name_or_path / "per_channel_statistics.json"
+        )
         if statistics_local_path.exists():
             with open(statistics_local_path, "r") as file:
                 data = json.load(file)
             transposed_data = list(zip(*data["data"]))
+            data_dict = {
+                col: torch.tensor(vals)
+                for col, vals in zip(data["columns"], transposed_data)
+            }
             video_vae.register_buffer("std_of_means", data_dict["std-of-means"])
             video_vae.register_buffer(
+                "mean_of_means",
+                data_dict.get(
+                    "mean-of-means", torch.zeros_like(data_dict["std-of-means"])
+                ),
             )
         return video_vae
     @staticmethod
     def from_config(config):
+        assert (
+            config["_class_name"] == "VideoAutoencoder"
+        ), "config must have _class_name=VideoAutoencoder"
         if isinstance(config["dims"], list):
             config["dims"] = tuple(config["dims"])
         assert config["dims"] in [2, 3, (2, 1)], "dims must be 2, 3 or (2, 1)"
         double_z = config.get("double_z", True)
+        latent_log_var = config.get(
+            "latent_log_var", "per_channel" if double_z else "none"
+        )
         use_quant_conv = config.get("use_quant_conv", True)
         if use_quant_conv and latent_log_var == "uniform":
         return SimpleNamespace(
             _class_name="VideoAutoencoder",
             dims=self.dims,
+            in_channels=self.encoder.conv_in.in_channels
+            // (self.encoder.patch_size_t * self.encoder.patch_size**2),
+            out_channels=self.decoder.conv_out.out_channels
+            // (self.decoder.patch_size_t * self.decoder.patch_size**2),
             latent_channels=self.decoder.conv_in.in_channels,
             block_out_channels=[
                 self.encoder.down_blocks[i].res_blocks[-1].conv1.out_channels
                 key = key.replace(k, v)
             if "norm" in key and key not in model_keys:
+                logger.info(
+                    f"Removing key {key} from state_dict as it is not present in the model"
+                )
                 continue
             converted_state_dict[key] = value
         # out
         if norm_layer == "group_norm":
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[-1],
+                num_groups=norm_num_groups,
+                eps=1e-6,
+            )
         elif norm_layer == "pixel_norm":
             self.conv_norm_out = PixelNorm()
         self.conv_act = nn.SiLU()
             conv_out_channels += 1
         elif latent_log_var != "none":
             raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
+        self.conv_out = make_conv_nd(
+            dims, block_out_channels[-1], conv_out_channels, 3, padding=1
+        )
         self.gradient_checkpointing = False
     @property
     def downscale_factor(self):
         return (
+            2
+            ** len(
+                [
+                    block
+                    for block in self.down_blocks
+                    if isinstance(block.downsample, Downsample3D)
+                ]
+            )
             * self.patch_size
         )
         )
         for down_block in self.down_blocks:
+            sample = checkpoint_fn(down_block)(
+                sample, downsample_in_time=downsample_in_time
+            )
         sample = checkpoint_fn(self.mid_block)(sample)
             if num_dims == 4:
                 # For shape (B, C, H, W)
+                repeated_last_channel = last_channel.repeat(
+                    1, sample.shape[1] - 2, 1, 1
+                )
                 sample = torch.cat([sample, repeated_last_channel], dim=1)
             elif num_dims == 5:
                 # For shape (B, C, F, H, W)
+                repeated_last_channel = last_channel.repeat(
+                    1, sample.shape[1] - 2, 1, 1, 1
+                )
                 sample = torch.cat([sample, repeated_last_channel], dim=1)
             else:
                 raise ValueError(f"Invalid input shape: {sample.shape}")
                 num_layers=self.layers_per_block + 1,
                 in_channels=prev_output_channel,
                 out_channels=output_channel,
+                add_upsample=not is_final_block
+                and 2 ** (len(block_out_channels) - i - 1) > patch_size,
                 resnet_eps=1e-6,
                 resnet_groups=norm_num_groups,
                 norm_layer=norm_layer,
             self.up_blocks.append(up_block)
         if norm_layer == "group_norm":
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6
+            )
         elif norm_layer == "pixel_norm":
             self.conv_norm_out = PixelNorm()
         self.conv_act = nn.SiLU()
+        self.conv_out = make_conv_nd(
+            dims, block_out_channels[0], out_channels, 3, padding=1
+        )
         self.gradient_checkpointing = False
         self.res_blocks = nn.ModuleList(res_blocks)
         if add_downsample:
+            self.downsample = Downsample3D(
+                dims,
+                out_channels,
+                out_channels=out_channels,
+                padding=downsample_padding,
+            )
         else:
             self.downsample = Identity()
+    def forward(
+        self, hidden_states: torch.FloatTensor, downsample_in_time
+    ) -> torch.FloatTensor:
         for resnet in self.res_blocks:
             hidden_states = resnet(hidden_states)
+        hidden_states = self.downsample(
+            hidden_states, downsample_in_time=downsample_in_time
+        )
         return hidden_states
         norm_layer: str = "group_norm",
     ):
         super().__init__()
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
         self.res_blocks = nn.ModuleList(
             [
         self.res_blocks = nn.ModuleList(res_blocks)
         if add_upsample:
+            self.upsample = Upsample3D(
+                dims=dims, channels=out_channels, out_channels=out_channels
+            )
         else:
             self.upsample = Identity()
         self.resolution_idx = resolution_idx
+    def forward(
+        self, hidden_states: torch.FloatTensor, upsample_in_time=True
+    ) -> torch.FloatTensor:
         for resnet in self.res_blocks:
             hidden_states = resnet(hidden_states)
         self.use_conv_shortcut = conv_shortcut
         if norm_layer == "group_norm":
+            self.norm1 = torch.nn.GroupNorm(
+                num_groups=groups, num_channels=in_channels, eps=eps, affine=True
+            )
         elif norm_layer == "pixel_norm":
             self.norm1 = PixelNorm()
         self.non_linearity = nn.SiLU()
+        self.conv1 = make_conv_nd(
+            dims, in_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
         if norm_layer == "group_norm":
+            self.norm2 = torch.nn.GroupNorm(
+                num_groups=groups, num_channels=out_channels, eps=eps, affine=True
+            )
         elif norm_layer == "pixel_norm":
             self.norm2 = PixelNorm()
         self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = make_conv_nd(
+            dims, out_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
         self.conv_shortcut = (
+            make_linear_nd(
+                dims=dims, in_channels=in_channels, out_channels=out_channels
+            )
             if in_channels != out_channels
             else nn.Identity()
         )
 class Downsample3D(nn.Module):
+    def __init__(
+        self,
+        dims,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        padding: int = 1,
+    ):
         super().__init__()
         stride: int = 2
         self.padding = padding
         self.dims = dims
         self.channels = channels
         self.out_channels = out_channels or channels
+        self.conv = make_conv_nd(
+            dims, channels, out_channels, kernel_size=3, padding=1, bias=True
+        )
     def forward(self, x, upsample_in_time):
         if self.dims == 2:
+            x = functional.interpolate(
+                x, (x.shape[2] * 2, x.shape[3] * 2), mode="nearest"
+            )
         else:
             time_scale_factor = 2 if upsample_in_time else 1
             # print("before:", x.shape)
             b, c, d, h, w = x.shape
             x = rearrange(x, "b c d h w -> (b d) c h w")
             # height and width interpolate
+            x = functional.interpolate(
+                x, (x.shape[2] * 2, x.shape[3] * 2), mode="nearest"
+            )
             _, _, h, w = x.shape
             if not upsample_in_time and self.dims == (2, 1):
             new_d = x.shape[-1] * time_scale_factor
             x = functional.interpolate(x, (1, new_d), mode="nearest")
             # (b h w) c 1 new_d
+            x = rearrange(
+                x, "(b h w) c 1 new_d  -> b c new_d h w", b=b, h=h, w=w, new_d=new_d
+            )
             # b c d h w
             # x = functional.interpolate(
     if patch_size_hw == 1 and patch_size_t == 1:
         return x
     if x.dim() == 4:
+        x = rearrange(
+            x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw
+        )
     elif x.dim() == 5:
+        x = rearrange(
+            x,
+            "b c (f p) (h q) (w r) -> b (c p r q) f h w",
+            p=patch_size_t,
+            q=patch_size_hw,
+            r=patch_size_hw,
+        )
     else:
         raise ValueError(f"Invalid input shape: {x.shape}")
+    if (
+        (x.dim() == 5)
+        and (patch_size_hw > patch_size_t)
+        and (patch_size_t > 1 or add_channel_padding)
+    ):
         channels_to_pad = x.shape[1] * (patch_size_hw // patch_size_t) - x.shape[1]
         padding_zeros = torch.zeros(
             x.shape[0],
     if patch_size_hw == 1 and patch_size_t == 1:
         return x
+    if (
+        (x.dim() == 5)
+        and (patch_size_hw > patch_size_t)
+        and (patch_size_t > 1 or add_channel_padding)
+    ):
         channels_to_keep = int(x.shape[1] * (patch_size_t / patch_size_hw))
         x = x[:, :channels_to_keep, :, :, :]
     if x.dim() == 4:
+        x = rearrange(
+            x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw
+        )
     elif x.dim() == 5:
+        x = rearrange(
+            x,
+            "b (c p r q) f h w -> b c (f p) (h q) (w r)",
+            p=patch_size_t,
+            q=patch_size_hw,
+            r=patch_size_hw,
+        )
     return x
 ):
     config = {
         "_class_name": "VideoAutoencoder",
+        "dims": (
+            2,
+            1,
+        ),  # 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
         "in_channels": 3,  # Number of input color channels (e.g., RGB)
         "out_channels": 3,  # Number of output color channels
         "latent_channels": latent_channels,  # Number of channels in the latent space representation
+        "block_out_channels": [
+            128,
+            256,
+            512,
+            512,
+        ],  # Number of output channels of each encoder / decoder inner block
         "patch_size": 1,
     }
 ):
     config = {
         "_class_name": "VideoAutoencoder",
+        "dims": (
+            2,
+            1,
+        ),  # 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
         "in_channels": 3,  # Number of input color channels (e.g., RGB)
         "out_channels": 3,  # Number of output color channels
         "latent_channels": latent_channels,  # Number of channels in the latent space representation
+        "block_out_channels": [512]
+        * 4,  # Number of output channels of each encoder / decoder inner block
         "patch_size": 4,
         "latent_log_var": "uniform",
     }
         "in_channels": 3,  # Number of input color channels (e.g., RGB)
         "out_channels": 3,  # Number of output color channels
         "latent_channels": latent_channels,  # Number of channels in the latent space representation
+        "block_out_channels": [512]
+        * 4,  # Number of output channels of each encoder / decoder inner block
         "patch_size": 4,
         "norm_layer": "pixel_norm",
     }
     latent = video_autoencoder.encode(input_videos).latent_dist.mode()
     print(f"input shape={input_videos.shape}")
     print(f"latent shape={latent.shape}")
+    reconstructed_videos = video_autoencoder.decode(
+        latent, target_shape=input_videos.shape
+    ).sample
     print(f"reconstructed shape={reconstructed_videos.shape}")

xora/models/transformers/attention.py CHANGED Viewed

@@ -106,11 +106,15 @@ class BasicTransformerBlock(nn.Module):
         assert standardization_norm in ["layer_norm", "rms_norm"]
         assert adaptive_norm in ["single_scale_shift", "single_scale", "none"]
-        make_norm_layer = nn.LayerNorm if standardization_norm == "layer_norm" else RMSNorm
         # Define 3 blocks. Each block has its own normalization layer.
         # 1. Self-Attn
-        self.norm1 = make_norm_layer(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
         self.attn1 = Attention(
             query_dim=dim,
@@ -130,7 +134,9 @@ class BasicTransformerBlock(nn.Module):
         if cross_attention_dim is not None or double_self_attention:
             self.attn2 = Attention(
                 query_dim=dim,
-                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                 heads=num_attention_heads,
                 dim_head=attention_head_dim,
                 dropout=dropout,
@@ -143,7 +149,9 @@ class BasicTransformerBlock(nn.Module):
             )  # is self-attn if encoder_hidden_states is none
             if adaptive_norm == "none":
-                self.attn2_norm = make_norm_layer(dim, norm_eps, norm_elementwise_affine)
         else:
             self.attn2 = None
             self.attn2_norm = None
@@ -163,7 +171,9 @@ class BasicTransformerBlock(nn.Module):
         # 5. Scale-shift for PixArt-Alpha.
         if adaptive_norm != "none":
             num_ada_params = 4 if adaptive_norm == "single_scale" else 6
-            self.scale_shift_table = nn.Parameter(torch.randn(num_ada_params, dim) / dim**0.5)
         # let chunk size default to None
         self._chunk_size = None
@@ -198,7 +208,9 @@ class BasicTransformerBlock(nn.Module):
     ) -> torch.FloatTensor:
         if cross_attention_kwargs is not None:
             if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
         # Notice that normalization is always applied before the real computation in the following blocks.
         # 0. Self-Attention
@@ -214,7 +226,9 @@ class BasicTransformerBlock(nn.Module):
                 batch_size, timestep.shape[1], num_ada_params, -1
             )
             if self.adaptive_norm == "single_scale_shift":
-                shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ada_values.unbind(dim=2)
                 norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
             else:
                 scale_msa, gate_msa, scale_mlp, gate_mlp = ada_values.unbind(dim=2)
@@ -224,15 +238,21 @@ class BasicTransformerBlock(nn.Module):
         else:
             raise ValueError(f"Unknown adaptive norm type: {self.adaptive_norm}")
-        norm_hidden_states = norm_hidden_states.squeeze(1)  # TODO: Check if this is needed
         # 1. Prepare GLIGEN inputs
-        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
         attn_output = self.attn1(
             norm_hidden_states,
             freqs_cis=freqs_cis,
-            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
             attention_mask=attention_mask,
             **cross_attention_kwargs,
         )
@@ -271,7 +291,9 @@ class BasicTransformerBlock(nn.Module):
         if self._chunk_size is not None:
             # "feed_forward_chunk_size" can be used to save memory
-            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
         else:
             ff_output = self.ff(norm_hidden_states)
         if gate_mlp is not None:
@@ -371,7 +393,9 @@ class Attention(nn.Module):
         self.query_dim = query_dim
         self.use_bias = bias
         self.is_cross_attention = cross_attention_dim is not None
-        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
         self.upcast_attention = upcast_attention
         self.upcast_softmax = upcast_softmax
         self.rescale_output_factor = rescale_output_factor
@@ -416,12 +440,16 @@ class Attention(nn.Module):
             )
         if norm_num_groups is not None:
-            self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
         else:
             self.group_norm = None
         if spatial_norm_dim is not None:
-            self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
         else:
             self.spatial_norm = None
@@ -441,7 +469,10 @@ class Attention(nn.Module):
                 norm_cross_num_channels = self.cross_attention_dim
             self.norm_cross = nn.GroupNorm(
-                num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
             )
         else:
             raise ValueError(
@@ -499,12 +530,16 @@ class Attention(nn.Module):
             and isinstance(self.processor, torch.nn.Module)
             and not isinstance(processor, torch.nn.Module)
         ):
-            logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
             self._modules.pop("processor")
         self.processor = processor
-    def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":  # noqa: F821
         r"""
         Get the attention processor in use.
@@ -542,12 +577,18 @@ class Attention(nn.Module):
         # 3. And we need to merge the current LoRA layers into the corresponding LoRA attention processor
         non_lora_processor_cls_name = self.processor.__class__.__name__
-        lora_processor_cls = getattr(import_module(__name__), "LoRA" + non_lora_processor_cls_name)
         hidden_size = self.inner_dim
         # now create a LoRA attention processor from the LoRA layers
-        if lora_processor_cls in [LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor]:
             kwargs = {
                 "cross_attention_dim": self.cross_attention_dim,
                 "rank": self.to_q.lora_layer.rank,
@@ -569,7 +610,9 @@ class Attention(nn.Module):
             lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
             lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
             lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
-            lora_processor.to_out_lora.load_state_dict(self.to_out[0].lora_layer.state_dict())
         elif lora_processor_cls == LoRAAttnAddedKVProcessor:
             lora_processor = lora_processor_cls(
                 hidden_size,
@@ -580,12 +623,18 @@ class Attention(nn.Module):
             lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
             lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
             lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
-            lora_processor.to_out_lora.load_state_dict(self.to_out[0].lora_layer.state_dict())
             # only save if used
             if self.add_k_proj.lora_layer is not None:
-                lora_processor.add_k_proj_lora.load_state_dict(self.add_k_proj.lora_layer.state_dict())
-                lora_processor.add_v_proj_lora.load_state_dict(self.add_v_proj.lora_layer.state_dict())
             else:
                 lora_processor.add_k_proj_lora = None
                 lora_processor.add_v_proj_lora = None
@@ -622,14 +671,20 @@ class Attention(nn.Module):
         # here we simply pass along all tensors to the selected processor class
         # For standard processors that are defined here, `**cross_attention_kwargs` is empty
-        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
-        unused_kwargs = [k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters]
         if len(unused_kwargs) > 0:
             logger.warning(
                 f"cross_attention_kwargs {unused_kwargs} are not expected by"
                 f" {self.processor.__class__.__name__} and will be ignored."
             )
-        cross_attention_kwargs = {k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters}
         return self.processor(
             self,
@@ -654,7 +709,9 @@ class Attention(nn.Module):
         head_size = self.heads
         batch_size, seq_len, dim = tensor.shape
         tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
-        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
         return tensor
     def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
@@ -677,16 +734,23 @@ class Attention(nn.Module):
             extra_dim = 1
         else:
             batch_size, extra_dim, seq_len, dim = tensor.shape
-        tensor = tensor.reshape(batch_size, seq_len * extra_dim, head_size, dim // head_size)
         tensor = tensor.permute(0, 2, 1, 3)
         if out_dim == 3:
-            tensor = tensor.reshape(batch_size * head_size, seq_len * extra_dim, dim // head_size)
         return tensor
     def get_attention_scores(
-        self, query: torch.Tensor, key: torch.Tensor, attention_mask: torch.Tensor = None
     ) -> torch.Tensor:
         r"""
         Compute the attention scores.
@@ -706,7 +770,11 @@ class Attention(nn.Module):
         if attention_mask is None:
             baddbmm_input = torch.empty(
-                query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
             )
             beta = 0
         else:
@@ -733,7 +801,11 @@ class Attention(nn.Module):
         return attention_probs
     def prepare_attention_mask(
-        self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
     ) -> torch.Tensor:
         r"""
         Prepare the attention mask for the attention computation.
@@ -760,8 +832,16 @@ class Attention(nn.Module):
             if attention_mask.device.type == "mps":
                 # HACK: MPS: Does not support padding by greater than dimension of input tensor.
                 # Instead, we can manually construct the padding tensor.
-                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
-                padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
                 attention_mask = torch.cat([attention_mask, padding], dim=2)
             else:
                 # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
@@ -779,7 +859,9 @@ class Attention(nn.Module):
         return attention_mask
-    def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
         r"""
         Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the
         `Attention` class.
@@ -790,7 +872,9 @@ class Attention(nn.Module):
         Returns:
             `torch.Tensor`: The normalized encoder hidden states.
         """
-        assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
         if isinstance(self.norm_cross, nn.LayerNorm):
             encoder_hidden_states = self.norm_cross(encoder_hidden_states)
@@ -857,27 +941,39 @@ class AttnProcessor2_0:
         if input_ndim == 4:
             batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
         batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
         )
         if (attention_mask is not None) and (not attn.use_tpu_flash_attention):
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
             # scaled_dot_product_attention expects attention_mask shape to be
             # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
         if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
         query = attn.to_q(hidden_states)
         query = attn.q_norm(query)
         if encoder_hidden_states is not None:
             if attn.norm_cross:
-                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
             key = attn.to_k(encoder_hidden_states)
             key = attn.k_norm(key)
         else:  # if no context provided do self-attention
@@ -901,10 +997,14 @@ class AttnProcessor2_0:
         if attn.use_tpu_flash_attention:  # use tpu attention offload 'flash attention'
             q_segment_indexes = None
-            if attention_mask is not None:  # if mask is required need to tune both segmenIds fields
                 # attention_mask = torch.squeeze(attention_mask).to(torch.float32)
                 attention_mask = attention_mask.to(torch.float32)
-                q_segment_indexes = torch.ones(batch_size, query.shape[2], device=query.device, dtype=torch.float32)
                 assert (
                     attention_mask.shape[1] == key.shape[2]
                 ), f"ERROR: KEY SHAPE must be same as attention mask [{key.shape[2]}, {attention_mask.shape[1]}]"
@@ -927,10 +1027,17 @@ class AttnProcessor2_0:
             )
         else:
             hidden_states = F.scaled_dot_product_attention(
-                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
             )
-        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
         # linear proj
@@ -939,7 +1046,9 @@ class AttnProcessor2_0:
         hidden_states = attn.to_out[1](hidden_states)
         if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
         if attn.residual_connection:
             hidden_states = hidden_states + residual
@@ -977,22 +1086,32 @@ class AttnProcessor:
         if input_ndim == 4:
             batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
         batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
         )
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
         if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
         query = attn.to_q(hidden_states)
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
         elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)
@@ -1014,7 +1133,9 @@ class AttnProcessor:
         hidden_states = attn.to_out[1](hidden_states)
         if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
         if attn.residual_connection:
             hidden_states = hidden_states + residual

         assert standardization_norm in ["layer_norm", "rms_norm"]
         assert adaptive_norm in ["single_scale_shift", "single_scale", "none"]
+        make_norm_layer = (
+            nn.LayerNorm if standardization_norm == "layer_norm" else RMSNorm
+        )
         # Define 3 blocks. Each block has its own normalization layer.
         # 1. Self-Attn
+        self.norm1 = make_norm_layer(
+            dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
+        )
         self.attn1 = Attention(
             query_dim=dim,
         if cross_attention_dim is not None or double_self_attention:
             self.attn2 = Attention(
                 query_dim=dim,
+                cross_attention_dim=(
+                    cross_attention_dim if not double_self_attention else None
+                ),
                 heads=num_attention_heads,
                 dim_head=attention_head_dim,
                 dropout=dropout,
             )  # is self-attn if encoder_hidden_states is none
             if adaptive_norm == "none":
+                self.attn2_norm = make_norm_layer(
+                    dim, norm_eps, norm_elementwise_affine
+                )
         else:
             self.attn2 = None
             self.attn2_norm = None
         # 5. Scale-shift for PixArt-Alpha.
         if adaptive_norm != "none":
             num_ada_params = 4 if adaptive_norm == "single_scale" else 6
+            self.scale_shift_table = nn.Parameter(
+                torch.randn(num_ada_params, dim) / dim**0.5
+            )
         # let chunk size default to None
         self._chunk_size = None
     ) -> torch.FloatTensor:
         if cross_attention_kwargs is not None:
             if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored."
+                )
         # Notice that normalization is always applied before the real computation in the following blocks.
         # 0. Self-Attention
                 batch_size, timestep.shape[1], num_ada_params, -1
             )
             if self.adaptive_norm == "single_scale_shift":
+                shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                    ada_values.unbind(dim=2)
+                )
                 norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
             else:
                 scale_msa, gate_msa, scale_mlp, gate_mlp = ada_values.unbind(dim=2)
         else:
             raise ValueError(f"Unknown adaptive norm type: {self.adaptive_norm}")
+        norm_hidden_states = norm_hidden_states.squeeze(
+            1
+        )  # TODO: Check if this is needed
         # 1. Prepare GLIGEN inputs
+        cross_attention_kwargs = (
+            cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        )
         attn_output = self.attn1(
             norm_hidden_states,
             freqs_cis=freqs_cis,
+            encoder_hidden_states=(
+                encoder_hidden_states if self.only_cross_attention else None
+            ),
             attention_mask=attention_mask,
             **cross_attention_kwargs,
         )
         if self._chunk_size is not None:
             # "feed_forward_chunk_size" can be used to save memory
+            ff_output = _chunked_feed_forward(
+                self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size
+            )
         else:
             ff_output = self.ff(norm_hidden_states)
         if gate_mlp is not None:
         self.query_dim = query_dim
         self.use_bias = bias
         self.is_cross_attention = cross_attention_dim is not None
+        self.cross_attention_dim = (
+            cross_attention_dim if cross_attention_dim is not None else query_dim
+        )
         self.upcast_attention = upcast_attention
         self.upcast_softmax = upcast_softmax
         self.rescale_output_factor = rescale_output_factor
             )
         if norm_num_groups is not None:
+            self.group_norm = nn.GroupNorm(
+                num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True
+            )
         else:
             self.group_norm = None
         if spatial_norm_dim is not None:
+            self.spatial_norm = SpatialNorm(
+                f_channels=query_dim, zq_channels=spatial_norm_dim
+            )
         else:
             self.spatial_norm = None
                 norm_cross_num_channels = self.cross_attention_dim
             self.norm_cross = nn.GroupNorm(
+                num_channels=norm_cross_num_channels,
+                num_groups=cross_attention_norm_num_groups,
+                eps=1e-5,
+                affine=True,
             )
         else:
             raise ValueError(
             and isinstance(self.processor, torch.nn.Module)
             and not isinstance(processor, torch.nn.Module)
         ):
+            logger.info(
+                f"You are removing possibly trained weights of {self.processor} with {processor}"
+            )
             self._modules.pop("processor")
         self.processor = processor
+    def get_processor(
+        self, return_deprecated_lora: bool = False
+    ) -> "AttentionProcessor":  # noqa: F821
         r"""
         Get the attention processor in use.
         # 3. And we need to merge the current LoRA layers into the corresponding LoRA attention processor
         non_lora_processor_cls_name = self.processor.__class__.__name__
+        lora_processor_cls = getattr(
+            import_module(__name__), "LoRA" + non_lora_processor_cls_name
+        )
         hidden_size = self.inner_dim
         # now create a LoRA attention processor from the LoRA layers
+        if lora_processor_cls in [
+            LoRAAttnProcessor,
+            LoRAAttnProcessor2_0,
+            LoRAXFormersAttnProcessor,
+        ]:
             kwargs = {
                 "cross_attention_dim": self.cross_attention_dim,
                 "rank": self.to_q.lora_layer.rank,
             lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
             lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
             lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
+            lora_processor.to_out_lora.load_state_dict(
+                self.to_out[0].lora_layer.state_dict()
+            )
         elif lora_processor_cls == LoRAAttnAddedKVProcessor:
             lora_processor = lora_processor_cls(
                 hidden_size,
             lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
             lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
             lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
+            lora_processor.to_out_lora.load_state_dict(
+                self.to_out[0].lora_layer.state_dict()
+            )
             # only save if used
             if self.add_k_proj.lora_layer is not None:
+                lora_processor.add_k_proj_lora.load_state_dict(
+                    self.add_k_proj.lora_layer.state_dict()
+                )
+                lora_processor.add_v_proj_lora.load_state_dict(
+                    self.add_v_proj.lora_layer.state_dict()
+                )
             else:
                 lora_processor.add_k_proj_lora = None
                 lora_processor.add_v_proj_lora = None
         # here we simply pass along all tensors to the selected processor class
         # For standard processors that are defined here, `**cross_attention_kwargs` is empty
+        attn_parameters = set(
+            inspect.signature(self.processor.__call__).parameters.keys()
+        )
+        unused_kwargs = [
+            k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters
+        ]
         if len(unused_kwargs) > 0:
             logger.warning(
                 f"cross_attention_kwargs {unused_kwargs} are not expected by"
                 f" {self.processor.__class__.__name__} and will be ignored."
             )
+        cross_attention_kwargs = {
+            k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters
+        }
         return self.processor(
             self,
         head_size = self.heads
         batch_size, seq_len, dim = tensor.shape
         tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+        tensor = tensor.permute(0, 2, 1, 3).reshape(
+            batch_size // head_size, seq_len, dim * head_size
+        )
         return tensor
     def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
             extra_dim = 1
         else:
             batch_size, extra_dim, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(
+            batch_size, seq_len * extra_dim, head_size, dim // head_size
+        )
         tensor = tensor.permute(0, 2, 1, 3)
         if out_dim == 3:
+            tensor = tensor.reshape(
+                batch_size * head_size, seq_len * extra_dim, dim // head_size
+            )
         return tensor
     def get_attention_scores(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        attention_mask: torch.Tensor = None,
     ) -> torch.Tensor:
         r"""
         Compute the attention scores.
         if attention_mask is None:
             baddbmm_input = torch.empty(
+                query.shape[0],
+                query.shape[1],
+                key.shape[1],
+                dtype=query.dtype,
+                device=query.device,
             )
             beta = 0
         else:
         return attention_probs
     def prepare_attention_mask(
+        self,
+        attention_mask: torch.Tensor,
+        target_length: int,
+        batch_size: int,
+        out_dim: int = 3,
     ) -> torch.Tensor:
         r"""
         Prepare the attention mask for the attention computation.
             if attention_mask.device.type == "mps":
                 # HACK: MPS: Does not support padding by greater than dimension of input tensor.
                 # Instead, we can manually construct the padding tensor.
+                padding_shape = (
+                    attention_mask.shape[0],
+                    attention_mask.shape[1],
+                    target_length,
+                )
+                padding = torch.zeros(
+                    padding_shape,
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
                 attention_mask = torch.cat([attention_mask, padding], dim=2)
             else:
                 # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
         return attention_mask
+    def norm_encoder_hidden_states(
+        self, encoder_hidden_states: torch.Tensor
+    ) -> torch.Tensor:
         r"""
         Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the
         `Attention` class.
         Returns:
             `torch.Tensor`: The normalized encoder hidden states.
         """
+        assert (
+            self.norm_cross is not None
+        ), "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
         if isinstance(self.norm_cross, nn.LayerNorm):
             encoder_hidden_states = self.norm_cross(encoder_hidden_states)
         if input_ndim == 4:
             batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(
+                batch_size, channel, height * width
+            ).transpose(1, 2)
         batch_size, sequence_length, _ = (
+            hidden_states.shape
+            if encoder_hidden_states is None
+            else encoder_hidden_states.shape
         )
         if (attention_mask is not None) and (not attn.use_tpu_flash_attention):
+            attention_mask = attn.prepare_attention_mask(
+                attention_mask, sequence_length, batch_size
+            )
             # scaled_dot_product_attention expects attention_mask shape to be
             # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(
+                batch_size, attn.heads, -1, attention_mask.shape[-1]
+            )
         if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
+                1, 2
+            )
         query = attn.to_q(hidden_states)
         query = attn.q_norm(query)
         if encoder_hidden_states is not None:
             if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(
+                    encoder_hidden_states
+                )
             key = attn.to_k(encoder_hidden_states)
             key = attn.k_norm(key)
         else:  # if no context provided do self-attention
         if attn.use_tpu_flash_attention:  # use tpu attention offload 'flash attention'
             q_segment_indexes = None
+            if (
+                attention_mask is not None
+            ):  # if mask is required need to tune both segmenIds fields
                 # attention_mask = torch.squeeze(attention_mask).to(torch.float32)
                 attention_mask = attention_mask.to(torch.float32)
+                q_segment_indexes = torch.ones(
+                    batch_size, query.shape[2], device=query.device, dtype=torch.float32
+                )
                 assert (
                     attention_mask.shape[1] == key.shape[2]
                 ), f"ERROR: KEY SHAPE must be same as attention mask [{key.shape[2]}, {attention_mask.shape[1]}]"
             )
         else:
             hidden_states = F.scaled_dot_product_attention(
+                query,
+                key,
+                value,
+                attn_mask=attention_mask,
+                dropout_p=0.0,
+                is_causal=False,
             )
+        hidden_states = hidden_states.transpose(1, 2).reshape(
+            batch_size, -1, attn.heads * head_dim
+        )
         hidden_states = hidden_states.to(query.dtype)
         # linear proj
         hidden_states = attn.to_out[1](hidden_states)
         if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
         if attn.residual_connection:
             hidden_states = hidden_states + residual
         if input_ndim == 4:
             batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(
+                batch_size, channel, height * width
+            ).transpose(1, 2)
         batch_size, sequence_length, _ = (
+            hidden_states.shape
+            if encoder_hidden_states is None
+            else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(
+            attention_mask, sequence_length, batch_size
         )
         if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
+                1, 2
+            )
         query = attn.to_q(hidden_states)
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
         elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(
+                encoder_hidden_states
+            )
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)
         hidden_states = attn.to_out[1](hidden_states)
         if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
         if attn.residual_connection:
             hidden_states = hidden_states + residual

xora/models/transformers/embeddings.py CHANGED Viewed

@@ -26,7 +26,9 @@ def get_timestep_embedding(
     assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
     half_dim = embedding_dim // 2
-    exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
     exponent = exponent / (half_dim - downscale_freq_shift)
     emb = torch.exp(exponent)
@@ -113,7 +115,9 @@ class SinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, embed_dim: int, max_seq_length: int = 32):
         super().__init__()
         position = torch.arange(max_seq_length).unsqueeze(1)
-        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
         pe = torch.zeros(1, max_seq_length, embed_dim)
         pe[0, :, 0::2] = torch.sin(position * div_term)
         pe[0, :, 1::2] = torch.cos(position * div_term)

     assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
     half_dim = embedding_dim // 2
+    exponent = -math.log(max_period) * torch.arange(
+        start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
+    )
     exponent = exponent / (half_dim - downscale_freq_shift)
     emb = torch.exp(exponent)
     def __init__(self, embed_dim: int, max_seq_length: int = 32):
         super().__init__()
         position = torch.arange(max_seq_length).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim)
+        )
         pe = torch.zeros(1, max_seq_length, embed_dim)
         pe[0, :, 0::2] = torch.sin(position * div_term)
         pe[0, :, 1::2] = torch.cos(position * div_term)

xora/models/transformers/symmetric_patchifier.py CHANGED Viewed

@@ -15,12 +15,19 @@ class Patchifier(ConfigMixin, ABC):
         self._patch_size = (1, patch_size, patch_size)
     @abstractmethod
-    def patchify(self, latents: Tensor, frame_rates: Tensor, scale_grid: bool) -> Tuple[Tensor, Tensor]:
         pass
     @abstractmethod
     def unpatchify(
-        self, latents: Tensor, output_height: int, output_width: int, output_num_frames: int, out_channels: int
     ) -> Tuple[Tensor, Tensor]:
         pass
@@ -28,7 +35,9 @@ class Patchifier(ConfigMixin, ABC):
     def patch_size(self):
         return self._patch_size
-    def get_grid(self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device):
         f = orig_num_frames // self._patch_size[0]
         h = orig_height // self._patch_size[1]
         w = orig_width // self._patch_size[2]
@@ -64,6 +73,7 @@ def pixart_alpha_patchify(
     )
     return latents
 class SymmetricPatchifier(Patchifier):
     def patchify(
         self,
@@ -72,7 +82,12 @@ class SymmetricPatchifier(Patchifier):
         return pixart_alpha_patchify(latents, self._patch_size)
     def unpatchify(
-        self, latents: Tensor, output_height: int, output_width: int, output_num_frames: int, out_channels: int
     ) -> Tuple[Tensor, Tensor]:
         output_height = output_height // self._patch_size[1]
         output_width = output_width // self._patch_size[2]

         self._patch_size = (1, patch_size, patch_size)
     @abstractmethod
+    def patchify(
+        self, latents: Tensor, frame_rates: Tensor, scale_grid: bool
+    ) -> Tuple[Tensor, Tensor]:
         pass
     @abstractmethod
     def unpatchify(
+        self,
+        latents: Tensor,
+        output_height: int,
+        output_width: int,
+        output_num_frames: int,
+        out_channels: int,
     ) -> Tuple[Tensor, Tensor]:
         pass
     def patch_size(self):
         return self._patch_size
+    def get_grid(
+        self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device
+    ):
         f = orig_num_frames // self._patch_size[0]
         h = orig_height // self._patch_size[1]
         w = orig_width // self._patch_size[2]
     )
     return latents
 class SymmetricPatchifier(Patchifier):
     def patchify(
         self,
         return pixart_alpha_patchify(latents, self._patch_size)
     def unpatchify(
+        self,
+        latents: Tensor,
+        output_height: int,
+        output_width: int,
+        output_num_frames: int,
+        out_channels: int,
     ) -> Tuple[Tensor, Tensor]:
         output_height = output_height // self._patch_size[1]
         output_width = output_width // self._patch_size[2]

xora/models/transformers/transformer3d.py CHANGED Viewed

@@ -17,6 +17,7 @@ from xora.models.transformers.embeddings import get_3d_sincos_pos_embed
 logger = logging.get_logger(__name__)
 @dataclass
 class Transformer3DModelOutput(BaseOutput):
     """
@@ -68,7 +69,9 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
         timestep_scale_multiplier: Optional[float] = None,
     ):
         super().__init__()
-        self.use_tpu_flash_attention = use_tpu_flash_attention  # FIXME: push config down to the attention modules
         self.use_linear_projection = use_linear_projection
         self.num_attention_heads = num_attention_heads
         self.attention_head_dim = attention_head_dim
@@ -86,7 +89,9 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
         self.timestep_scale_multiplier = timestep_scale_multiplier
         if self.positional_embedding_type == "absolute":
-            embed_dim_3d = math.ceil((inner_dim / 2) * 3) if project_to_2d_pos else inner_dim
             if self.project_to_2d_pos:
                 self.to_2d_proj = torch.nn.Linear(embed_dim_3d, inner_dim, bias=False)
                 self._init_to_2d_proj_weights(self.to_2d_proj)
@@ -131,18 +136,24 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
         # 4. Define output layers
         self.out_channels = in_channels if out_channels is None else out_channels
         self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
-        self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
         self.proj_out = nn.Linear(inner_dim, self.out_channels)
         # 5. PixArt-Alpha blocks.
-        self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=False)
         if adaptive_norm == "single_scale":
             # Use 4 channels instead of the 6 for the PixArt-Alpha scale + shift ada norm.
             self.adaln_single.linear = nn.Linear(inner_dim, 4 * inner_dim, bias=True)
         self.caption_projection = None
         if caption_channels is not None:
-            self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
         self.gradient_checkpointing = False
@@ -169,16 +180,32 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
         self.apply(_basic_init)
         # Initialize timestep embedding MLP:
-        nn.init.normal_(self.adaln_single.emb.timestep_embedder.linear_1.weight, std=embedding_std)
-        nn.init.normal_(self.adaln_single.emb.timestep_embedder.linear_2.weight, std=embedding_std)
         nn.init.normal_(self.adaln_single.linear.weight, std=embedding_std)
         if hasattr(self.adaln_single.emb, "resolution_embedder"):
-            nn.init.normal_(self.adaln_single.emb.resolution_embedder.linear_1.weight, std=embedding_std)
-            nn.init.normal_(self.adaln_single.emb.resolution_embedder.linear_2.weight, std=embedding_std)
         if hasattr(self.adaln_single.emb, "aspect_ratio_embedder"):
-            nn.init.normal_(self.adaln_single.emb.aspect_ratio_embedder.linear_1.weight, std=embedding_std)
-            nn.init.normal_(self.adaln_single.emb.aspect_ratio_embedder.linear_2.weight, std=embedding_std)
         # Initialize caption embedding MLP:
         nn.init.normal_(self.caption_projection.linear_1.weight, std=embedding_std)
@@ -220,7 +247,11 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
     def get_fractional_positions(self, indices_grid):
         fractional_positions = torch.stack(
-            [indices_grid[:, i] / self.positional_embedding_max_pos[i] for i in range(3)], dim=-1
         )
         return fractional_positions
@@ -236,7 +267,13 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
         device = fractional_positions.device
         if spacing == "exp":
             indices = theta ** (
-                torch.linspace(math.log(start, theta), math.log(end, theta), dim // 6, device=device, dtype=dtype)
             )
             indices = indices.to(dtype=dtype)
         elif spacing == "exp_2":
@@ -245,14 +282,24 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
         elif spacing == "linear":
             indices = torch.linspace(start, end, dim // 6, device=device, dtype=dtype)
         elif spacing == "sqrt":
-            indices = torch.linspace(start**2, end**2, dim // 6, device=device, dtype=dtype).sqrt()
         indices = indices * math.pi / 2
         if spacing == "exp_2":
-            freqs = (indices * fractional_positions.unsqueeze(-1)).transpose(-1, -2).flatten(2)
         else:
-            freqs = (indices * (fractional_positions.unsqueeze(-1) * 2 - 1)).transpose(-1, -2).flatten(2)
         cos_freq = freqs.cos().repeat_interleave(2, dim=-1)
         sin_freq = freqs.sin().repeat_interleave(2, dim=-1)
@@ -336,7 +383,9 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
             # convert encoder_attention_mask to a bias the same way we do for attention_mask
             if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
-                encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
                 encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
         # 1. Input
@@ -346,7 +395,9 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
             timestep = self.timestep_scale_multiplier * timestep
         if self.positional_embedding_type == "absolute":
-            pos_embed_3d = self.get_absolute_pos_embed(indices_grid).to(hidden_states.device)
             if self.project_to_2d_pos:
                 pos_embed = self.to_2d_proj(pos_embed_3d)
             hidden_states = (hidden_states + pos_embed).to(hidden_states.dtype)
@@ -363,13 +414,17 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
         )
         # Second dimension is 1 or number of tokens (if timestep_per_token)
         timestep = timestep.view(batch_size, -1, timestep.shape[-1])
-        embedded_timestep = embedded_timestep.view(batch_size, -1, embedded_timestep.shape[-1])
         # 2. Blocks
         if self.caption_projection is not None:
             batch_size = hidden_states.shape[0]
             encoder_hidden_states = self.caption_projection(encoder_hidden_states)
-            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
         for block in self.transformer_blocks:
             if self.training and self.gradient_checkpointing:
@@ -383,7 +438,9 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
                     return custom_forward
-                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                 hidden_states = torch.utils.checkpoint.checkpoint(
                     create_custom_forward(block),
                     hidden_states,
@@ -409,7 +466,9 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
                 )
         # 3. Output
-        scale_shift_values = self.scale_shift_table[None, None] + embedded_timestep[:, :, None]
         shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1]
         hidden_states = self.norm_out(hidden_states)
         # Modulation
@@ -422,7 +481,11 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
     def get_absolute_pos_embed(self, grid):
         grid_np = grid[0].cpu().numpy()
-        embed_dim_3d = math.ceil((self.inner_dim / 2) * 3) if self.project_to_2d_pos else self.inner_dim
         pos_embed = get_3d_sincos_pos_embed(  # (f h w)
             embed_dim_3d,
             grid_np,

 logger = logging.get_logger(__name__)
 @dataclass
 class Transformer3DModelOutput(BaseOutput):
     """
         timestep_scale_multiplier: Optional[float] = None,
     ):
         super().__init__()
+        self.use_tpu_flash_attention = (
+            use_tpu_flash_attention  # FIXME: push config down to the attention modules
+        )
         self.use_linear_projection = use_linear_projection
         self.num_attention_heads = num_attention_heads
         self.attention_head_dim = attention_head_dim
         self.timestep_scale_multiplier = timestep_scale_multiplier
         if self.positional_embedding_type == "absolute":
+            embed_dim_3d = (
+                math.ceil((inner_dim / 2) * 3) if project_to_2d_pos else inner_dim
+            )
             if self.project_to_2d_pos:
                 self.to_2d_proj = torch.nn.Linear(embed_dim_3d, inner_dim, bias=False)
                 self._init_to_2d_proj_weights(self.to_2d_proj)
         # 4. Define output layers
         self.out_channels = in_channels if out_channels is None else out_channels
         self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+        self.scale_shift_table = nn.Parameter(
+            torch.randn(2, inner_dim) / inner_dim**0.5
+        )
         self.proj_out = nn.Linear(inner_dim, self.out_channels)
         # 5. PixArt-Alpha blocks.
+        self.adaln_single = AdaLayerNormSingle(
+            inner_dim, use_additional_conditions=False
+        )
         if adaptive_norm == "single_scale":
             # Use 4 channels instead of the 6 for the PixArt-Alpha scale + shift ada norm.
             self.adaln_single.linear = nn.Linear(inner_dim, 4 * inner_dim, bias=True)
         self.caption_projection = None
         if caption_channels is not None:
+            self.caption_projection = PixArtAlphaTextProjection(
+                in_features=caption_channels, hidden_size=inner_dim
+            )
         self.gradient_checkpointing = False
         self.apply(_basic_init)
         # Initialize timestep embedding MLP:
+        nn.init.normal_(
+            self.adaln_single.emb.timestep_embedder.linear_1.weight, std=embedding_std
+        )
+        nn.init.normal_(
+            self.adaln_single.emb.timestep_embedder.linear_2.weight, std=embedding_std
+        )
         nn.init.normal_(self.adaln_single.linear.weight, std=embedding_std)
         if hasattr(self.adaln_single.emb, "resolution_embedder"):
+            nn.init.normal_(
+                self.adaln_single.emb.resolution_embedder.linear_1.weight,
+                std=embedding_std,
+            )
+            nn.init.normal_(
+                self.adaln_single.emb.resolution_embedder.linear_2.weight,
+                std=embedding_std,
+            )
         if hasattr(self.adaln_single.emb, "aspect_ratio_embedder"):
+            nn.init.normal_(
+                self.adaln_single.emb.aspect_ratio_embedder.linear_1.weight,
+                std=embedding_std,
+            )
+            nn.init.normal_(
+                self.adaln_single.emb.aspect_ratio_embedder.linear_2.weight,
+                std=embedding_std,
+            )
         # Initialize caption embedding MLP:
         nn.init.normal_(self.caption_projection.linear_1.weight, std=embedding_std)
     def get_fractional_positions(self, indices_grid):
         fractional_positions = torch.stack(
+            [
+                indices_grid[:, i] / self.positional_embedding_max_pos[i]
+                for i in range(3)
+            ],
+            dim=-1,
         )
         return fractional_positions
         device = fractional_positions.device
         if spacing == "exp":
             indices = theta ** (
+                torch.linspace(
+                    math.log(start, theta),
+                    math.log(end, theta),
+                    dim // 6,
+                    device=device,
+                    dtype=dtype,
+                )
             )
             indices = indices.to(dtype=dtype)
         elif spacing == "exp_2":
         elif spacing == "linear":
             indices = torch.linspace(start, end, dim // 6, device=device, dtype=dtype)
         elif spacing == "sqrt":
+            indices = torch.linspace(
+                start**2, end**2, dim // 6, device=device, dtype=dtype
+            ).sqrt()
         indices = indices * math.pi / 2
         if spacing == "exp_2":
+            freqs = (
+                (indices * fractional_positions.unsqueeze(-1))
+                .transpose(-1, -2)
+                .flatten(2)
+            )
         else:
+            freqs = (
+                (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
+                .transpose(-1, -2)
+                .flatten(2)
+            )
         cos_freq = freqs.cos().repeat_interleave(2, dim=-1)
         sin_freq = freqs.sin().repeat_interleave(2, dim=-1)
             # convert encoder_attention_mask to a bias the same way we do for attention_mask
             if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+                encoder_attention_mask = (
+                    1 - encoder_attention_mask.to(hidden_states.dtype)
+                ) * -10000.0
                 encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
         # 1. Input
             timestep = self.timestep_scale_multiplier * timestep
         if self.positional_embedding_type == "absolute":
+            pos_embed_3d = self.get_absolute_pos_embed(indices_grid).to(
+                hidden_states.device
+            )
             if self.project_to_2d_pos:
                 pos_embed = self.to_2d_proj(pos_embed_3d)
             hidden_states = (hidden_states + pos_embed).to(hidden_states.dtype)
         )
         # Second dimension is 1 or number of tokens (if timestep_per_token)
         timestep = timestep.view(batch_size, -1, timestep.shape[-1])
+        embedded_timestep = embedded_timestep.view(
+            batch_size, -1, embedded_timestep.shape[-1]
+        )
         # 2. Blocks
         if self.caption_projection is not None:
             batch_size = hidden_states.shape[0]
             encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.view(
+                batch_size, -1, hidden_states.shape[-1]
+            )
         for block in self.transformer_blocks:
             if self.training and self.gradient_checkpointing:
                     return custom_forward
+                ckpt_kwargs: Dict[str, Any] = (
+                    {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                )
                 hidden_states = torch.utils.checkpoint.checkpoint(
                     create_custom_forward(block),
                     hidden_states,
                 )
         # 3. Output
+        scale_shift_values = (
+            self.scale_shift_table[None, None] + embedded_timestep[:, :, None]
+        )
         shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1]
         hidden_states = self.norm_out(hidden_states)
         # Modulation
     def get_absolute_pos_embed(self, grid):
         grid_np = grid[0].cpu().numpy()
+        embed_dim_3d = (
+            math.ceil((self.inner_dim / 2) * 3)
+            if self.project_to_2d_pos
+            else self.inner_dim
+        )
         pos_embed = get_3d_sincos_pos_embed(  # (f h w)
             embed_dim_3d,
             grid_np,

xora/pipelines/pipeline_video_pixart_alpha.py CHANGED Viewed

@@ -5,12 +5,10 @@ import math
 import re
 import urllib.parse as ul
 from typing import Callable, Dict, List, Optional, Tuple, Union
-from abc import ABC, abstractmethod
 import torch
 import torch.nn.functional as F
-from torch import Tensor
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models import AutoencoderKL
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
@@ -29,7 +27,11 @@ from transformers import T5EncoderModel, T5Tokenizer
 from xora.models.transformers.transformer3d import Transformer3DModel
 from xora.models.transformers.symmetric_patchifier import Patchifier
-from xora.models.autoencoders.vae_encode import get_vae_size_scale_factor, vae_decode, vae_encode
 from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from xora.schedulers.rf import TimestepShifter
 from xora.utils.conditioning_method import ConditioningMethod
@@ -161,7 +163,9 @@ def retrieve_timesteps(
         second element is the number of inference steps.
     """
     if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
         if not accepts_timesteps:
             raise ValueError(
                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
@@ -238,7 +242,9 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
             patchifier=patchifier,
         )
-        self.video_scale_factor, self.vae_scale_factor, _ = get_vae_size_scale_factor(self.vae)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
     # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
@@ -320,12 +326,16 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
                 return_tensors="pt",
             )
             text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
                 logger.warning(
                     "The following part of your input was truncated because CLIP can only handle sequences up to"
                     f" {max_length} tokens: {removed_text}"
@@ -334,7 +344,9 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
             prompt_attention_mask = text_inputs.attention_mask
             prompt_attention_mask = prompt_attention_mask.to(device)
-            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
             prompt_embeds = prompt_embeds[0]
         if self.text_encoder is not None:
@@ -349,14 +361,20 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
         bs_embed, seq_len, _ = prompt_embeds.shape
         # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
         prompt_attention_mask = prompt_attention_mask.repeat(1, num_images_per_prompt)
-        prompt_attention_mask = prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1)
         # get unconditional embeddings for classifier free guidance
         if do_classifier_free_guidance and negative_prompt_embeds is None:
             uncond_tokens = [negative_prompt] * batch_size
-            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
             max_length = prompt_embeds.shape[1]
             uncond_input = self.tokenizer(
                 uncond_tokens,
@@ -371,7 +389,8 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
             negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
             negative_prompt_embeds = self.text_encoder(
-                uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
             )
             negative_prompt_embeds = negative_prompt_embeds[0]
@@ -379,18 +398,33 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = negative_prompt_embeds.shape[1]
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(1, num_images_per_prompt)
-            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1)
         else:
             negative_prompt_embeds = None
             negative_prompt_attention_mask = None
-        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
     def prepare_extra_step_kwargs(self, generator, eta):
@@ -399,13 +433,17 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
         # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
         # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
         if accepts_generator:
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
@@ -422,7 +460,9 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
         negative_prompt_attention_mask=None,
     ):
         if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
         if prompt is not None and prompt_embeds is not None:
             raise ValueError(
@@ -433,8 +473,12 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
             raise ValueError(
                 "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
             )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
         if prompt is not None and negative_prompt_embeds is not None:
             raise ValueError(
@@ -449,10 +493,17 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
             )
         if prompt_embeds is not None and prompt_attention_mask is None:
-            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
-        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
-            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
         if prompt_embeds is not None and negative_prompt_embeds is not None:
             if prompt_embeds.shape != negative_prompt_embeds.shape:
@@ -471,12 +522,16 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
     def _text_preprocessing(self, text, clean_caption=False):
         if clean_caption and not is_bs4_available():
-            logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
             logger.warn("Setting `clean_caption` to False...")
             clean_caption = False
         if clean_caption and not is_ftfy_available():
-            logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
             logger.warn("Setting `clean_caption` to False...")
             clean_caption = False
@@ -564,13 +619,17 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
         # "123456.."
         caption = re.sub(r"\b\d{6,}\b", "", caption)
         # filenames:
-        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
         #
         caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
         caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
-        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
         caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
         # this-is-my-cute-cat / this_is_my_cute_cat
@@ -588,10 +647,14 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
         caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
         caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
         caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
-        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
         caption = re.sub(r"\bpage\s+\d+\b", "", caption)
-        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
         caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
@@ -610,7 +673,15 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(
-        self, batch_size, num_latent_channels, num_patches, dtype, device, generator, latents=None, latents_mask=None
     ):
         shape = (
             batch_size,
@@ -625,10 +696,14 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
             )
         if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         elif latents_mask is not None:
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-            latents = latents * latents_mask[..., None] + noise * (1 - latents_mask[..., None])
         else:
             latents = latents.to(device)
@@ -637,7 +712,9 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
         return latents
     @staticmethod
-    def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]:
         """Returns binned height and width."""
         ar = float(height / width)
         closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
@@ -645,7 +722,9 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
         return int(default_hw[0]), int(default_hw[1])
     @staticmethod
-    def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int) -> torch.Tensor:
         n_frames, orig_height, orig_width = samples.shape[-3:]
         # Check if resizing is needed
@@ -656,7 +735,12 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
             # Resize
             samples = rearrange(samples, "b c n h w -> (b n) c h w")
-            samples = F.interpolate(samples, size=(resized_height, resized_width), mode="bilinear", align_corners=False)
             samples = rearrange(samples, "(b n) c h w -> b c n h w", n=n_frames)
             # Center Crop
@@ -821,14 +905,21 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
         )
         if do_classifier_free_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
-            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
         # 3b. Encode and prepare conditioning data
         self.video_scale_factor = self.video_scale_factor if is_video else 1
         conditioning_method = kwargs.get("conditioning_method", None)
         vae_per_channel_normalize = kwargs.get("vae_per_channel_normalize", False)
         init_latents, conditioning_mask = self.prepare_conditioning(
-            media_items, num_frames, height, width, conditioning_method, vae_per_channel_normalize
         )
         # 4. Prepare latents.
@@ -851,29 +942,46 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
         )
         if conditioning_mask is not None and is_video:
             assert num_images_per_prompt == 1
-            conditioning_mask = torch.cat([conditioning_mask] * 2) if do_classifier_free_guidance else conditioning_mask
         # 5. Prepare timesteps
         retrieve_timesteps_kwargs = {}
         if isinstance(self.scheduler, TimestepShifter):
             retrieve_timesteps_kwargs["samples"] = latents
         timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler, num_inference_steps, device, timesteps, **retrieve_timesteps_kwargs
         )
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
         # 7. Denoising loop
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
-                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                 latent_frame_rates = (
-                    torch.ones(latent_model_input.shape[0], 1, device=latent_model_input.device) * latent_frame_rate
                 )
                 current_timestep = t
@@ -885,13 +993,25 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
                         dtype = torch.float32 if is_mps else torch.float64
                     else:
                         dtype = torch.int32 if is_mps else torch.int64
-                    current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
                 elif len(current_timestep.shape) == 0:
-                    current_timestep = current_timestep[None].to(latent_model_input.device)
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                current_timestep = current_timestep.expand(latent_model_input.shape[0]).unsqueeze(-1)
                 scale_grid = (
-                    (1 / latent_frame_rates, self.vae_scale_factor, self.vae_scale_factor)
                     if self.transformer.use_rope
                     else None
                 )
@@ -920,11 +1040,16 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
                 # perform guidance
                 if do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                     current_timestep, _ = current_timestep.chunk(2)
                 # learned sigma
-                if self.transformer.config.out_channels // 2 == self.transformer.config.in_channels:
                     noise_pred = noise_pred.chunk(2, dim=1)[0]
                 # compute previous image: x_t -> x_t-1
@@ -937,7 +1062,9 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
                 )[0]
                 # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
                 if callback_on_step_end is not None:
@@ -948,11 +1075,15 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
             output_height=latent_height,
             output_width=latent_width,
             output_num_frames=latent_num_frames,
-            out_channels=self.transformer.in_channels // math.prod(self.patchifier.patch_size),
         )
         if output_type != "latent":
             image = vae_decode(
-                latents, self.vae, is_video, vae_per_channel_normalize=kwargs["vae_per_channel_normalize"]
             )
             image = self.image_processor.postprocess(image, output_type=output_type)
@@ -1005,20 +1136,31 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
             vae_per_channel_normalize=vae_per_channel_normalize,
         ).float()
-        init_len, target_len = init_latents.shape[2], num_frames // self.video_scale_factor
         if isinstance(self.vae, CausalVideoAutoencoder):
             target_len += 1
         init_latents = init_latents[:, :, :target_len]
         if target_len > init_len:
             repeat_factor = (target_len + init_len - 1) // init_len  # Ceiling division
-            init_latents = init_latents.repeat(1, 1, repeat_factor, 1, 1)[:, :, :target_len]
         # Prepare the conditioning mask (1.0 = condition on this token)
         b, n, f, h, w = init_latents.shape
         conditioning_mask = torch.zeros([b, 1, f, h, w], device=init_latents.device)
-        if method in [ConditioningMethod.FIRST_FRAME, ConditioningMethod.FIRST_AND_LAST_FRAME]:
             conditioning_mask[:, :, 0] = 1.0
-        if method in [ConditioningMethod.LAST_FRAME, ConditioningMethod.FIRST_AND_LAST_FRAME]:
             conditioning_mask[:, :, -1] = 1.0
         # Patchify the init latents and the mask

 import re
 import urllib.parse as ul
 from typing import Callable, Dict, List, Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models import AutoencoderKL
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from xora.models.transformers.transformer3d import Transformer3DModel
 from xora.models.transformers.symmetric_patchifier import Patchifier
+from xora.models.autoencoders.vae_encode import (
+    get_vae_size_scale_factor,
+    vae_decode,
+    vae_encode,
+)
 from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from xora.schedulers.rf import TimestepShifter
 from xora.utils.conditioning_method import ConditioningMethod
         second element is the number of inference steps.
     """
     if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(
+            inspect.signature(scheduler.set_timesteps).parameters.keys()
+        )
         if not accepts_timesteps:
             raise ValueError(
                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
             patchifier=patchifier,
         )
+        self.video_scale_factor, self.vae_scale_factor, _ = get_vae_size_scale_factor(
+            self.vae
+        )
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
     # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py
                 return_tensors="pt",
             )
             text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(
+                prompt, padding="longest", return_tensors="pt"
+            ).input_ids
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[
+                -1
+            ] and not torch.equal(text_input_ids, untruncated_ids):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, max_length - 1 : -1]
+                )
                 logger.warning(
                     "The following part of your input was truncated because CLIP can only handle sequences up to"
                     f" {max_length} tokens: {removed_text}"
             prompt_attention_mask = text_inputs.attention_mask
             prompt_attention_mask = prompt_attention_mask.to(device)
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device), attention_mask=prompt_attention_mask
+            )
             prompt_embeds = prompt_embeds[0]
         if self.text_encoder is not None:
         bs_embed, seq_len, _ = prompt_embeds.shape
         # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(
+            bs_embed * num_images_per_prompt, seq_len, -1
+        )
         prompt_attention_mask = prompt_attention_mask.repeat(1, num_images_per_prompt)
+        prompt_attention_mask = prompt_attention_mask.view(
+            bs_embed * num_images_per_prompt, -1
+        )
         # get unconditional embeddings for classifier free guidance
         if do_classifier_free_guidance and negative_prompt_embeds is None:
             uncond_tokens = [negative_prompt] * batch_size
+            uncond_tokens = self._text_preprocessing(
+                uncond_tokens, clean_caption=clean_caption
+            )
             max_length = prompt_embeds.shape[1]
             uncond_input = self.tokenizer(
                 uncond_tokens,
             negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
             negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=negative_prompt_attention_mask,
             )
             negative_prompt_embeds = negative_prompt_embeds[0]
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(
+                dtype=dtype, device=device
+            )
+            negative_prompt_embeds = negative_prompt_embeds.repeat(
+                1, num_images_per_prompt, 1
+            )
+            negative_prompt_embeds = negative_prompt_embeds.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(
+                1, num_images_per_prompt
+            )
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(
+                bs_embed * num_images_per_prompt, -1
+            )
         else:
             negative_prompt_embeds = None
             negative_prompt_attention_mask = None
+        return (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        )
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
     def prepare_extra_step_kwargs(self, generator, eta):
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
         # and should be between [0, 1]
+        accepts_eta = "eta" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
         # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
         if accepts_generator:
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
         negative_prompt_attention_mask=None,
     ):
         if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
         if prompt is not None and prompt_embeds is not None:
             raise ValueError(
             raise ValueError(
                 "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
             )
+        elif prompt is not None and (
+            not isinstance(prompt, str) and not isinstance(prompt, list)
+        ):
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
         if prompt is not None and negative_prompt_embeds is not None:
             raise ValueError(
             )
         if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError(
+                "Must provide `prompt_attention_mask` when specifying `prompt_embeds`."
+            )
+        if (
+            negative_prompt_embeds is not None
+            and negative_prompt_attention_mask is None
+        ):
+            raise ValueError(
+                "Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`."
+            )
         if prompt_embeds is not None and negative_prompt_embeds is not None:
             if prompt_embeds.shape != negative_prompt_embeds.shape:
     # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
     def _text_preprocessing(self, text, clean_caption=False):
         if clean_caption and not is_bs4_available():
+            logger.warn(
+                BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")
+            )
             logger.warn("Setting `clean_caption` to False...")
             clean_caption = False
         if clean_caption and not is_ftfy_available():
+            logger.warn(
+                BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")
+            )
             logger.warn("Setting `clean_caption` to False...")
             clean_caption = False
         # "123456.."
         caption = re.sub(r"\b\d{6,}\b", "", caption)
         # filenames:
+        caption = re.sub(
+            r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption
+        )
         #
         caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
         caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+        caption = re.sub(
+            self.bad_punct_regex, r" ", caption
+        )  # ***AUSVERKAUFT***, #AUSVERKAUFT
         caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
         # this-is-my-cute-cat / this_is_my_cute_cat
         caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
         caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
         caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(
+            r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption
+        )
         caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+        caption = re.sub(
+            r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption
+        )  # j2d1a2a...
         caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(
+        self,
+        batch_size,
+        num_latent_channels,
+        num_patches,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        latents_mask=None,
     ):
         shape = (
             batch_size,
             )
         if latents is None:
+            latents = randn_tensor(
+                shape, generator=generator, device=device, dtype=dtype
+            )
         elif latents_mask is not None:
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = latents * latents_mask[..., None] + noise * (
+                1 - latents_mask[..., None]
+            )
         else:
             latents = latents.to(device)
         return latents
     @staticmethod
+    def classify_height_width_bin(
+        height: int, width: int, ratios: dict
+    ) -> Tuple[int, int]:
         """Returns binned height and width."""
         ar = float(height / width)
         closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
         return int(default_hw[0]), int(default_hw[1])
     @staticmethod
+    def resize_and_crop_tensor(
+        samples: torch.Tensor, new_width: int, new_height: int
+    ) -> torch.Tensor:
         n_frames, orig_height, orig_width = samples.shape[-3:]
         # Check if resizing is needed
             # Resize
             samples = rearrange(samples, "b c n h w -> (b n) c h w")
+            samples = F.interpolate(
+                samples,
+                size=(resized_height, resized_width),
+                mode="bilinear",
+                align_corners=False,
+            )
             samples = rearrange(samples, "(b n) c h w -> b c n h w", n=n_frames)
             # Center Crop
         )
         if do_classifier_free_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat(
+                [negative_prompt_attention_mask, prompt_attention_mask], dim=0
+            )
         # 3b. Encode and prepare conditioning data
         self.video_scale_factor = self.video_scale_factor if is_video else 1
         conditioning_method = kwargs.get("conditioning_method", None)
         vae_per_channel_normalize = kwargs.get("vae_per_channel_normalize", False)
         init_latents, conditioning_mask = self.prepare_conditioning(
+            media_items,
+            num_frames,
+            height,
+            width,
+            conditioning_method,
+            vae_per_channel_normalize,
         )
         # 4. Prepare latents.
         )
         if conditioning_mask is not None and is_video:
             assert num_images_per_prompt == 1
+            conditioning_mask = (
+                torch.cat([conditioning_mask] * 2)
+                if do_classifier_free_guidance
+                else conditioning_mask
+            )
         # 5. Prepare timesteps
         retrieve_timesteps_kwargs = {}
         if isinstance(self.scheduler, TimestepShifter):
             retrieve_timesteps_kwargs["samples"] = latents
         timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            **retrieve_timesteps_kwargs,
         )
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
         # 7. Denoising loop
+        num_warmup_steps = max(
+            len(timesteps) - num_inference_steps * self.scheduler.order, 0
+        )
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
+                latent_model_input = (
+                    torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
                 latent_frame_rates = (
+                    torch.ones(
+                        latent_model_input.shape[0], 1, device=latent_model_input.device
+                    )
+                    * latent_frame_rate
                 )
                 current_timestep = t
                         dtype = torch.float32 if is_mps else torch.float64
                     else:
                         dtype = torch.int32 if is_mps else torch.int64
+                    current_timestep = torch.tensor(
+                        [current_timestep],
+                        dtype=dtype,
+                        device=latent_model_input.device,
+                    )
                 elif len(current_timestep.shape) == 0:
+                    current_timestep = current_timestep[None].to(
+                        latent_model_input.device
+                    )
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(
+                    latent_model_input.shape[0]
+                ).unsqueeze(-1)
                 scale_grid = (
+                    (
+                        1 / latent_frame_rates,
+                        self.vae_scale_factor,
+                        self.vae_scale_factor,
+                    )
                     if self.transformer.use_rope
                     else None
                 )
                 # perform guidance
                 if do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (
+                        noise_pred_text - noise_pred_uncond
+                    )
                     current_timestep, _ = current_timestep.chunk(2)
                 # learned sigma
+                if (
+                    self.transformer.config.out_channels // 2
+                    == self.transformer.config.in_channels
+                ):
                     noise_pred = noise_pred.chunk(2, dim=1)[0]
                 # compute previous image: x_t -> x_t-1
                 )[0]
                 # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
                     progress_bar.update()
                 if callback_on_step_end is not None:
             output_height=latent_height,
             output_width=latent_width,
             output_num_frames=latent_num_frames,
+            out_channels=self.transformer.in_channels
+            // math.prod(self.patchifier.patch_size),
         )
         if output_type != "latent":
             image = vae_decode(
+                latents,
+                self.vae,
+                is_video,
+                vae_per_channel_normalize=kwargs["vae_per_channel_normalize"],
             )
             image = self.image_processor.postprocess(image, output_type=output_type)
             vae_per_channel_normalize=vae_per_channel_normalize,
         ).float()
+        init_len, target_len = (
+            init_latents.shape[2],
+            num_frames // self.video_scale_factor,
+        )
         if isinstance(self.vae, CausalVideoAutoencoder):
             target_len += 1
         init_latents = init_latents[:, :, :target_len]
         if target_len > init_len:
             repeat_factor = (target_len + init_len - 1) // init_len  # Ceiling division
+            init_latents = init_latents.repeat(1, 1, repeat_factor, 1, 1)[
+                :, :, :target_len
+            ]
         # Prepare the conditioning mask (1.0 = condition on this token)
         b, n, f, h, w = init_latents.shape
         conditioning_mask = torch.zeros([b, 1, f, h, w], device=init_latents.device)
+        if method in [
+            ConditioningMethod.FIRST_FRAME,
+            ConditioningMethod.FIRST_AND_LAST_FRAME,
+        ]:
             conditioning_mask[:, :, 0] = 1.0
+        if method in [
+            ConditioningMethod.LAST_FRAME,
+            ConditioningMethod.FIRST_AND_LAST_FRAME,
+        ]:
             conditioning_mask[:, :, -1] = 1.0
         # Patchify the init latents and the mask

xora/schedulers/rf.py CHANGED Viewed

@@ -22,7 +22,9 @@ def simple_diffusion_resolution_dependent_timestep_shift(
     elif len(samples.shape) in [4, 5]:
         m = math.prod(samples.shape[2:])
     else:
-        raise ValueError("Samples must have shape (b, t, c), (b, c, h, w) or (b, c, f, h, w)")
     snr = (timesteps / (1 - timesteps)) ** 2
     shift_snr = torch.log(snr) + 2 * math.log(m / n)
     shifted_timesteps = torch.sigmoid(0.5 * shift_snr)
@@ -46,7 +48,9 @@ def get_normal_shift(
     return m * n_tokens + b
-def sd3_resolution_dependent_timestep_shift(samples: Tensor, timesteps: Tensor) -> Tensor:
     """
     Shifts the timestep schedule as a function of the generated resolution.
@@ -70,7 +74,9 @@ def sd3_resolution_dependent_timestep_shift(samples: Tensor, timesteps: Tensor)
     elif len(samples.shape) in [4, 5]:
         m = math.prod(samples.shape[2:])
     else:
-        raise ValueError("Samples must have shape (b, t, c), (b, c, h, w) or (b, c, f, h, w)")
     shift = get_normal_shift(m)
     return time_shift(shift, 1, timesteps)
@@ -104,12 +110,21 @@ class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
     order = 1
     @register_to_config
-    def __init__(self, num_train_timesteps=1000, shifting: Optional[str] = None, base_resolution: int = 32**2):
         super().__init__()
         self.init_noise_sigma = 1.0
         self.num_inference_steps = None
-        self.timesteps = self.sigmas = torch.linspace(1, 1 / num_train_timesteps, num_train_timesteps)
-        self.delta_timesteps = self.timesteps - torch.cat([self.timesteps[1:], torch.zeros_like(self.timesteps[-1:])])
         self.shifting = shifting
         self.base_resolution = base_resolution
@@ -117,10 +132,17 @@ class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
         if self.shifting == "SD3":
             return sd3_resolution_dependent_timestep_shift(samples, timesteps)
         elif self.shifting == "SimpleDiffusion":
-            return simple_diffusion_resolution_dependent_timestep_shift(samples, timesteps, self.base_resolution)
         return timesteps
-    def set_timesteps(self, num_inference_steps: int, samples: Tensor, device: Union[str, torch.device] = None):
         """
         Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
@@ -130,13 +152,19 @@ class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
             device (`Union[str, torch.device]`, *optional*): The device to which the timesteps tensor will be moved.
         """
         num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps)
-        timesteps = torch.linspace(1, 1 / num_inference_steps, num_inference_steps).to(device)
         self.timesteps = self.shift_timesteps(samples, timesteps)
-        self.delta_timesteps = self.timesteps - torch.cat([self.timesteps[1:], torch.zeros_like(self.timesteps[-1:])])
         self.num_inference_steps = num_inference_steps
         self.sigmas = self.timesteps
-    def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
         # pylint: disable=unused-argument
         """
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
@@ -206,7 +234,9 @@ class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
         else:
             # Timestep per token
             assert timestep.ndim == 2
-            current_index = (self.timesteps[:, None, None] - timestep[None]).abs().argmin(dim=0)
             dt = self.delta_timesteps[current_index]
             # Special treatment for zero timestep tokens - set dt to 0 so prev_sample = sample
             dt = torch.where(timestep == 0.0, torch.zeros_like(dt), dt)[..., None]
@@ -228,4 +258,4 @@ class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
         sigmas = append_dims(sigmas, original_samples.ndim)
         alphas = 1 - sigmas
         noisy_samples = alphas * original_samples + sigmas * noise
-        return noisy_samples

     elif len(samples.shape) in [4, 5]:
         m = math.prod(samples.shape[2:])
     else:
+        raise ValueError(
+            "Samples must have shape (b, t, c), (b, c, h, w) or (b, c, f, h, w)"
+        )
     snr = (timesteps / (1 - timesteps)) ** 2
     shift_snr = torch.log(snr) + 2 * math.log(m / n)
     shifted_timesteps = torch.sigmoid(0.5 * shift_snr)
     return m * n_tokens + b
+def sd3_resolution_dependent_timestep_shift(
+    samples: Tensor, timesteps: Tensor
+) -> Tensor:
     """
     Shifts the timestep schedule as a function of the generated resolution.
     elif len(samples.shape) in [4, 5]:
         m = math.prod(samples.shape[2:])
     else:
+        raise ValueError(
+            "Samples must have shape (b, t, c), (b, c, h, w) or (b, c, f, h, w)"
+        )
     shift = get_normal_shift(m)
     return time_shift(shift, 1, timesteps)
     order = 1
     @register_to_config
     def __init__(
         self,
         num_train_timesteps=1000,
         shifting: Optional[str] = None,
         base_resolution: int = 32**2,
     ):
         """Set up the default timestep schedule and store the shifting options.

         Args:
             num_train_timesteps: Number of points in the default schedule.
             shifting: Name of the timestep-shifting mode, stored unchanged.
             base_resolution: Stored unchanged on the instance.
         """
         super().__init__()
         self.init_noise_sigma = 1.0
         self.num_inference_steps = None
         # Evenly spaced values from 1 down to 1 / num_train_timesteps;
         # `sigmas` is bound to the very same tensor.
         self.timesteps = self.sigmas = torch.linspace(
             1, 1 / num_train_timesteps, num_train_timesteps
         )
         # Difference between each timestep and its successor; the last entry is
         # measured against 0.
         self.delta_timesteps = self.timesteps - torch.cat(
             [self.timesteps[1:], torch.zeros_like(self.timesteps[-1:])]
         )
         self.shifting = shifting
         self.base_resolution = base_resolution
         if self.shifting == "SD3":
             return sd3_resolution_dependent_timestep_shift(samples, timesteps)
         elif self.shifting == "SimpleDiffusion":
+            return simple_diffusion_resolution_dependent_timestep_shift(
+                samples, timesteps, self.base_resolution
+            )
         return timesteps
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        samples: Tensor,
+        device: Union[str, torch.device] = None,
+    ):
         """
         Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
             device (`Union[str, torch.device]`, *optional*): The device to which the timesteps tensor will be moved.
         """
         num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps)
+        timesteps = torch.linspace(1, 1 / num_inference_steps, num_inference_steps).to(
+            device
+        )
         self.timesteps = self.shift_timesteps(samples, timesteps)
+        self.delta_timesteps = self.timesteps - torch.cat(
+            [self.timesteps[1:], torch.zeros_like(self.timesteps[-1:])]
+        )
         self.num_inference_steps = num_inference_steps
         self.sigmas = self.timesteps
+    def scale_model_input(
+        self, sample: torch.FloatTensor, timestep: Optional[int] = None
+    ) -> torch.FloatTensor:
         # pylint: disable=unused-argument
         """
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
         else:
             # Timestep per token
             assert timestep.ndim == 2
+            current_index = (
+                (self.timesteps[:, None, None] - timestep[None]).abs().argmin(dim=0)
+            )
             dt = self.delta_timesteps[current_index]
             # Special treatment for zero timestep tokens - set dt to 0 so prev_sample = sample
             dt = torch.where(timestep == 0.0, torch.zeros_like(dt), dt)[..., None]
         sigmas = append_dims(sigmas, original_samples.ndim)
         alphas = 1 - sigmas
         noisy_samples = alphas * original_samples + sigmas * noise
+        return noisy_samples

xora/utils/conditioning_method.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from enum import Enum
 class ConditioningMethod(Enum):
     UNCONDITIONAL = "unconditional"
     FIRST_FRAME = "first_frame"
     LAST_FRAME = "last_frame"
-    FIRST_AND_LAST_FRAME = "first_and_last_frame"

 from enum import Enum
 class ConditioningMethod(Enum):
     """Enumerates the available conditioning modes.

     Each member's value is its lowercase string identifier, so a member can be
     looked up from that string, e.g. ``ConditioningMethod("first_frame")``.
     """

     UNCONDITIONAL = "unconditional"
     FIRST_FRAME = "first_frame"
     LAST_FRAME = "last_frame"
     FIRST_AND_LAST_FRAME = "first_and_last_frame"

xora/utils/torch_utils.py CHANGED Viewed

@@ -1,15 +1,19 @@
 import torch
 from torch import nn
 def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
     """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
     dims_to_append = target_dims - x.ndim
     if dims_to_append < 0:
-        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
     elif dims_to_append == 0:
         return x
     return x[(...,) + (None,) * dims_to_append]
 class Identity(nn.Module):
     """A placeholder identity operator that is argument-insensitive."""

 import torch
 from torch import nn
 def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
     """Appends dimensions to the end of a tensor until it has target_dims dimensions.

     Args:
         x: Tensor to extend.
         target_dims: Number of dimensions the result must have.

     Returns:
         ``x`` itself when it already has ``target_dims`` dimensions, otherwise
         ``x`` indexed with trailing ``None`` entries so that size-1 axes are
         appended at the end.

     Raises:
         ValueError: If ``x`` already has more than ``target_dims`` dimensions.
     """
     dims_to_append = target_dims - x.ndim
     if dims_to_append < 0:
         raise ValueError(
             f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
         )
     elif dims_to_append == 0:
         return x
     # Ellipsis keeps every existing axis; each None adds one trailing axis.
     return x[(...,) + (None,) * dims_to_append]
 class Identity(nn.Module):
     """A placeholder identity operator that is argument-insensitive."""