Spaces: Running on Zero
刘虹雨 committed · Commit 8ed2f16
Parent(s): 8f481d2
update

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full changeset.
- .DS_Store +0 -0
- .gitattributes +8 -32
- DiT_VAE/.DS_Store +0 -0
- DiT_VAE/__init__.py +0 -0
- DiT_VAE/diffusion/__init__.py +8 -0
- DiT_VAE/diffusion/configs/PixArt_xl2_4D_Triplane.py +64 -0
- DiT_VAE/diffusion/configs/PixArt_xl2_img256_4D_Triplane.py +41 -0
- DiT_VAE/diffusion/configs/__init__.py +0 -0
- DiT_VAE/diffusion/configs/vae_model.yaml +24 -0
- DiT_VAE/diffusion/data/__init__.py +2 -0
- DiT_VAE/diffusion/data/builder.py +67 -0
- DiT_VAE/diffusion/data/transforms.py +29 -0
- DiT_VAE/diffusion/dpm_solver.py +28 -0
- DiT_VAE/diffusion/iddpm.py +51 -0
- DiT_VAE/diffusion/lcm_scheduler.py +455 -0
- DiT_VAE/diffusion/model/__init__.py +2 -0
- DiT_VAE/diffusion/model/builder.py +14 -0
- DiT_VAE/diffusion/model/diffusion_utils.py +92 -0
- DiT_VAE/diffusion/model/dpm_solver.py +1337 -0
- DiT_VAE/diffusion/model/edm_sample.py +168 -0
- DiT_VAE/diffusion/model/gaussian_diffusion.py +1006 -0
- DiT_VAE/diffusion/model/hed.py +150 -0
- DiT_VAE/diffusion/model/image_embedding.py +15 -0
- DiT_VAE/diffusion/model/nets/PixArt_blocks.py +655 -0
- DiT_VAE/diffusion/model/nets/TriDitCLIPDINO.py +315 -0
- DiT_VAE/diffusion/model/nets/__init__.py +1 -0
- DiT_VAE/diffusion/model/respace.py +131 -0
- DiT_VAE/diffusion/model/sa_solver.py +1129 -0
- DiT_VAE/diffusion/model/timestep_sampler.py +150 -0
- DiT_VAE/diffusion/model/utils.py +510 -0
- DiT_VAE/diffusion/sa_sampler.py +66 -0
- DiT_VAE/diffusion/sa_solver_diffusers.py +840 -0
- DiT_VAE/diffusion/utils/__init__.py +1 -0
- DiT_VAE/diffusion/utils/checkpoint.py +80 -0
- DiT_VAE/diffusion/utils/data_sampler.py +138 -0
- DiT_VAE/diffusion/utils/dist_utils.py +303 -0
- DiT_VAE/diffusion/utils/logger.py +94 -0
- DiT_VAE/diffusion/utils/lr_scheduler.py +89 -0
- DiT_VAE/diffusion/utils/misc.py +366 -0
- DiT_VAE/diffusion/utils/optimizer.py +237 -0
- DiT_VAE/train_diffusion.py +5 -0
- DiT_VAE/train_vae.py +369 -0
- DiT_VAE/util.py +217 -0
- DiT_VAE/vae/__init__.py +0 -0
- DiT_VAE/vae/aemodules3d.py +368 -0
- DiT_VAE/vae/attention_vae.py +620 -0
- DiT_VAE/vae/data/__init__.py +0 -0
- DiT_VAE/vae/data/dataset_online_vae.py +108 -0
- DiT_VAE/vae/distributions.py +94 -0
- DiT_VAE/vae/losses/__init__.py +1 -0
.DS_Store
ADDED
Binary file (10.2 kB)
.gitattributes
CHANGED
@@ -1,36 +1,12 @@
-*. … (entry truncated in view)
-… (entry truncated in view)
+*.gif filter=lfs diff=lfs merge=lfs -text
+data_process/lib/FaceVerse/v3/faceverse_v3_1.npy filter=lfs diff=lfs merge=lfs -text
+data_process/lib/faceverse_process/BgMatting_models/rvm_resnet50_fp32.torchscript filter=lfs diff=lfs merge=lfs -text
+data_process/lib/faceverse_process/metamodel/v3/faceverse_v3_1.npy filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-… (entry truncated in view)
-*. … (entry truncated in view)
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.torchscript filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
DiT_VAE/.DS_Store
ADDED
Binary file (6.15 kB)
DiT_VAE/__init__.py
ADDED
File without changes
DiT_VAE/diffusion/__init__.py
ADDED
@@ -0,0 +1,8 @@
+# Modified from OpenAI's diffusion repos
+# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+
+from .iddpm import IDDPM
+from .dpm_solver import DPMS
+from .sa_sampler import SASolverSampler
DiT_VAE/diffusion/configs/PixArt_xl2_4D_Triplane.py
ADDED
@@ -0,0 +1,64 @@
+data_root = '/data/data'
+data = dict(type='TriplaneData', data_base_dir='triplane', data_json_file='/nas8/liuhongyu/HeadGallery_Data/data.json', model_names='configs/gan_model.yaml')
+image_size = 256  # the generated image resolution
+train_batch_size = 32
+eval_batch_size = 16
+use_fsdp = False  # whether to use FSDP mode
+valid_num = 0  # treat an aspect ratio as valid when its sample count >= valid_num
+triplane_size = (256 * 4, 256)
+# model settings
+image_encoder_path = "/home/liuhongyu/code/IP-Adapter/models/image_encoder"
+vae_triplane_config_path = "vae_model.yaml"
+std_dir = '/nas8/liuhongyu/HeadGallery_Data/final_std.pt'
+mean_dir = '/nas8/liuhongyu/HeadGallery_Data/final_mean.pt'
+conditioning_params_dir = '/nas8/liuhongyu/HeadGallery_Data/conditioning_params.pkl'
+gan_model_base_dir = '/nas8/liuhongyu/HeadGallery_Data/gan_models'
+model = 'PixArt_XL_2'
+aspect_ratio_type = None  # base aspect ratio (ASPECT_RATIO_512 or ASPECT_RATIO_256)
+multi_scale = False  # whether to train with a multiscale dataset
+lewei_scale = 1.0  # lewei_scale for positional embedding interpolation
+# training settings
+num_workers = 4
+train_sampling_steps = 1000
+eval_sampling_steps = 250
+model_max_length = 8
+lora_rank = 4
+
+num_epochs = 80
+gradient_accumulation_steps = 1
+grad_checkpointing = False
+gradient_clip = 1.0
+gc_step = 1
+auto_lr = dict(rule='sqrt')
+
+# we use a different weight decay from the official implementation since it gives better results
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=3e-2, eps=1e-10)
+lr_schedule = 'constant'
+lr_schedule_args = dict(num_warmup_steps=500)
+
+save_image_epochs = 1
+save_model_epochs = 1
+save_model_steps = 1000000
+
+sample_posterior = True
+mixed_precision = 'fp16'
+scale_factor = 0.3994218
+ema_rate = 0.9999
+tensorboard_mox_interval = 50
+log_interval = 50
+cfg_scale = 4
+mask_type = 'null'
+num_group_tokens = 0
+mask_loss_coef = 0.
+load_mask_index = False  # load a prepared mask_type index
+# model loading settings
+vae_pretrained = "/cache/pretrained_models/sd-vae-ft-ema"
+load_from = None
+resume_from = dict(checkpoint=None, load_ema=False, resume_optimizer=True, resume_lr_scheduler=True)
+snr_loss = False
+
+# work dir settings
+work_dir = '/cache/exps/'
+s3_work_dir = None
+
+seed = 43
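These config files are plain Python modules. A minimal loading sketch, assuming the repo's mmcv dependency (the data builder below imports Registry/build_from_cfg from mmcv, and mmcv's Config.fromfile parses exactly this style of file):

# Sketch only: mmcv availability is an assumption inferred from the imports
# in DiT_VAE/diffusion/data/builder.py below.
from mmcv import Config

cfg = Config.fromfile('DiT_VAE/diffusion/configs/PixArt_xl2_4D_Triplane.py')
print(cfg.image_size)        # 256
print(cfg.optimizer['lr'])   # 0.0001
print(cfg.data['type'])      # 'TriplaneData'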
DiT_VAE/diffusion/configs/PixArt_xl2_img256_4D_Triplane.py
ADDED
@@ -0,0 +1,41 @@
+_base_ = ['./PixArt_xl2_4D_Triplane.py']
+data_root = 'data'
+
+data = dict(type='TriplaneData', data_base_dir='/nas8/liuhongyu/HeadGallery_Data',
+            data_json_file='/nas8/liuhongyu/HeadGallery_Data/data_combine.json', model_names='configs/gan_model.yaml',
+            dino_path='/nas8/liuhongyu/model/dinov2-base')
+image_size = 256
+
+# model settings
+gan_model_config = "./configs/gan_model.yaml"
+image_encoder_path = "/home/liuhongyu/code/IP-Adapter/models/image_encoder"
+vae_triplane_config_path = "./vae_model.yaml"
+std_dir = '/nas8/liuhongyu/HeadGallery_Data/final_std.pt'
+mean_dir = '/nas8/liuhongyu/HeadGallery_Data/final_mean.pt'
+conditioning_params_dir = '/nas8/liuhongyu/HeadGallery_Data/conditioning_params.pkl'
+gan_model_base_dir = '/nas8/liuhongyu/HeadGallery_Data/gan_models'
+dino_pretrained = '/nas8/liuhongyu/HeadGallery_Data/dinov2-base'
+window_block_indexes = []
+window_size = 0
+use_rel_pos = False
+model = 'PixArt_XL_2'
+fp32_attention = True
+dino_norm = False
+img_feature_self_attention = False
+load_from = None
+vae_pretrained = "/nas8/liuhongyu/all_training_results/VAE/checkpoint-140000"
+# training settings
+eval_sampling_steps = 200
+save_model_steps = 10000
+num_workers = 2
+train_batch_size = 8  # 32 # max 96 for PixArt-L/4 when grad_checkpoint
+num_epochs = 200  # 3
+gradient_accumulation_steps = 1
+grad_checkpointing = True
+gradient_clip = 0.01
+optimizer = dict(type='AdamW', lr=2e-5, weight_decay=3e-2, eps=1e-10)
+lr_schedule_args = dict(num_warmup_steps=1000)
+
+log_interval = 20
+save_model_epochs = 5
+work_dir = 'output/debug'
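This child config inherits from the base file above via the `_base_` list; keys defined here shadow the base, everything else falls through. A sketch under the assumption that mmcv's standard config inheritance resolves the merge:

from mmcv import Config

cfg = Config.fromfile('DiT_VAE/diffusion/configs/PixArt_xl2_img256_4D_Triplane.py')
# Keys redefined here shadow the base config:
assert cfg.train_batch_size == 8        # overrides the base value of 32
assert cfg.grad_checkpointing is True   # overrides the base value of False
# Keys not redefined fall through from the base:
assert cfg.ema_rate == 0.9999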
DiT_VAE/diffusion/configs/__init__.py
ADDED
File without changes
DiT_VAE/diffusion/configs/vae_model.yaml
ADDED
@@ -0,0 +1,24 @@
+embed_dim: 8
+ddconfig:
+  double_z: True
+  z_channels: 8
+  encoder:
+    target: DiT_VAE.vae.aemodules3d.Encoder
+    params:
+      n_hiddens: 128
+      downsample: [4, 8, 8]
+      image_channel: 32
+      norm_type: group
+      padding_type: replicate
+      double_z: True
+      z_channels: 8
+
+  decoder:
+    target: DiT_VAE.vae.aemodules3d.Decoder
+    params:
+      n_hiddens: 128
+      upsample: [4, 8, 8]
+      z_channels: 8
+      image_channel: 32
+      norm_type: group
+
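The `target:`/`params:` blocks name a class by import path plus its constructor arguments. A minimal sketch of how such a block is typically resolved; `instantiate_from_config` is an illustrative helper written here for the example, not necessarily the repo's own:

import importlib
import yaml

def instantiate_from_config(cfg: dict):
    # Split "DiT_VAE.vae.aemodules3d.Encoder" into module path and class name,
    # import the module, and call the class with the params dict.
    module_path, cls_name = cfg["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), cls_name)
    return cls(**cfg.get("params", {}))

with open("DiT_VAE/diffusion/configs/vae_model.yaml") as f:
    vae_cfg = yaml.safe_load(f)
encoder = instantiate_from_config(vae_cfg["ddconfig"]["encoder"])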
DiT_VAE/diffusion/data/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .datasets import *
+from .transforms import get_transform
DiT_VAE/diffusion/data/builder.py
ADDED
@@ -0,0 +1,67 @@
+import os
+import time
+
+from mmcv import Registry, build_from_cfg
+from torch.utils.data import DataLoader
+
+from DiT_VAE.diffusion.data.transforms import get_transform
+from DiT_VAE.diffusion.utils.logger import get_root_logger
+
+DATASETS = Registry('datasets')
+
+DATA_ROOT = '/cache/data'
+
+
+def set_data_root(data_root):
+    global DATA_ROOT
+    DATA_ROOT = data_root
+
+
+def get_data_path(data_dir):
+    if os.path.isabs(data_dir):
+        return data_dir
+    global DATA_ROOT
+    return os.path.join(DATA_ROOT, data_dir)
+
+
+def build_dataset_triplane(cfg, resolution=256, **kwargs):
+    logger = get_root_logger()
+
+    dataset_type = cfg.get('type')
+    logger.info(f"Constructing dataset {dataset_type}...")
+    t = time.time()
+
+    dataset = build_from_cfg(cfg, DATASETS, default_args=dict(image_size=resolution, **kwargs))
+    logger.info(f"Dataset {dataset_type} constructed. time: {(time.time() - t):.2f} s, length: {len(dataset)}")
+    return dataset
+
+
+def build_dataset(cfg, resolution=224, **kwargs):
+    logger = get_root_logger()
+
+    dataset_type = cfg.get('type')
+    logger.info(f"Constructing dataset {dataset_type}...")
+    t = time.time()
+    transform = cfg.pop('transform', 'default_train')
+    transform = get_transform(transform, resolution)
+    dataset = build_from_cfg(cfg, DATASETS, default_args=dict(transform=transform, resolution=resolution, **kwargs))
+    logger.info(f"Dataset {dataset_type} constructed. time: {(time.time() - t):.2f} s, length (use/ori): {len(dataset)}/{dataset.ori_imgs_nums}")
+    return dataset
+
+
+def build_dataloader(dataset, batch_size=256, num_workers=2, shuffle=True, **kwargs):
+    return (
+        DataLoader(
+            dataset,
+            batch_sampler=kwargs['batch_sampler'],
+            num_workers=num_workers,
+            pin_memory=True,
+        )
+        if 'batch_sampler' in kwargs
+        else DataLoader(
+            dataset,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            num_workers=num_workers,
+            pin_memory=True,
+            **kwargs
+        )
+    )
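A minimal usage sketch of this builder; it assumes a dataset class (e.g. TriplaneData) has been registered in DATASETS via @DATASETS.register_module(), and the cfg values below simply reuse the ones from the config above:

from DiT_VAE.diffusion.data.builder import (
    set_data_root, build_dataset_triplane, build_dataloader)

set_data_root('/data/data')  # relative data dirs resolve against this root
cfg = dict(type='TriplaneData', data_base_dir='triplane',
           data_json_file='/nas8/liuhongyu/HeadGallery_Data/data.json',
           model_names='configs/gan_model.yaml')
dataset = build_dataset_triplane(cfg, resolution=256)
loader = build_dataloader(dataset, batch_size=32, num_workers=4)  # plain shuffled loader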
DiT_VAE/diffusion/data/transforms.py
ADDED
@@ -0,0 +1,29 @@
+import torchvision.transforms as T
+
+TRANSFORMS = {}
+
+
+def register_transform(transform):
+    name = transform.__name__
+    if name in TRANSFORMS:
+        raise RuntimeError(f'Transform {name} has already been registered.')
+    TRANSFORMS.update({name: transform})
+    return transform  # return the function so the decorated name stays usable
+
+
+def get_transform(type, resolution):
+    transform = TRANSFORMS[type](resolution)
+    transform = T.Compose(transform)
+    transform.image_size = resolution
+    return transform
+
+
+@register_transform
+def default_train(n_px):
+    return [
+        T.Lambda(lambda img: img.convert('RGB')),
+        T.Resize(n_px),  # Image.BICUBIC
+        T.CenterCrop(n_px),
+        # T.RandomHorizontalFlip(),
+        T.ToTensor(),
+        T.Normalize([0.5], [0.5]),
+    ]
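Usage sketch: build the registered 'default_train' pipeline at 256 px and apply it to a PIL image (the file path is illustrative):

from PIL import Image
from DiT_VAE.diffusion.data.transforms import get_transform

transform = get_transform('default_train', 256)
img = Image.open('example.png')
x = transform(img)  # tensor of shape (3, 256, 256), values in [-1, 1]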
DiT_VAE/diffusion/dpm_solver.py
ADDED
@@ -0,0 +1,28 @@
+import torch
+from .model import gaussian_diffusion as gd
+from .model.dpm_solver import model_wrapper, DPM_Solver, NoiseScheduleVP
+
+
+def DPMS(model, condition, uncondition, cfg_scale, model_type='noise', noise_schedule="linear", guidance_type='classifier-free', model_kwargs=None, diffusion_steps=1000):
+    if model_kwargs is None:
+        model_kwargs = {}
+    betas = torch.tensor(gd.get_named_beta_schedule(noise_schedule, diffusion_steps))
+
+    ## 1. Define the noise schedule.
+    noise_schedule = NoiseScheduleVP(schedule='discrete', betas=betas)
+
+    ## 2. Convert your discrete-time `model` to the continuous-time
+    ## noise prediction model. Here is an example for a diffusion model
+    ## `model` with the noise prediction type ("noise").
+    model_fn = model_wrapper(
+        model,
+        noise_schedule,
+        model_type=model_type,
+        model_kwargs=model_kwargs,
+        guidance_type=guidance_type,
+        condition=condition,
+        unconditional_condition=uncondition,
+        guidance_scale=cfg_scale,
+    )
+    ## 3. Define dpm-solver and sample by multistep DPM-Solver.
+    return DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
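A minimal sampling sketch with the returned solver. The `.sample` call and its arguments follow the upstream DPM-Solver API; `net`, `cond`, `uncond`, and the latent shape are illustrative placeholders:

import torch

solver = DPMS(net, condition=cond, uncondition=uncond, cfg_scale=4.0)
x_T = torch.randn(1, 8, 64, 64)  # start from Gaussian noise
x_0 = solver.sample(x_T, steps=20, order=2, skip_type="time_uniform",
                    method="multistep")  # 20-step DPM-Solver++ sampling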
DiT_VAE/diffusion/iddpm.py
ADDED
@@ -0,0 +1,51 @@
+# Modified from OpenAI's diffusion repos
+# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+from .model.respace import SpacedDiffusion, space_timesteps
+from .model import gaussian_diffusion as gd
+
+
+def IDDPM(
+    timestep_respacing,
+    noise_schedule="linear",
+    use_kl=False,
+    sigma_small=False,
+    predict_xstart=False,
+    learn_sigma=True,
+    pred_sigma=True,
+    rescale_learned_sigmas=False,
+    diffusion_steps=1000,
+    snr=False,
+    return_startx=False,
+):
+    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
+    if use_kl:
+        loss_type = gd.LossType.RESCALED_KL
+    elif rescale_learned_sigmas:
+        loss_type = gd.LossType.RESCALED_MSE
+    else:
+        loss_type = gd.LossType.MSE
+    if timestep_respacing is None or timestep_respacing == "":
+        timestep_respacing = [diffusion_steps]
+    return SpacedDiffusion(
+        use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
+        betas=betas,
+        model_mean_type=(
+            gd.ModelMeanType.START_X if predict_xstart else gd.ModelMeanType.EPSILON
+        ),
+        model_var_type=(
+            (
+                gd.ModelVarType.LEARNED_RANGE
+                if learn_sigma
+                else (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL)
+            )
+            if pred_sigma
+            else None
+        ),
+        loss_type=loss_type,
+        snr=snr,
+        return_startx=return_startx,
+        # rescale_timesteps=rescale_timesteps,
+    )
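Usage sketch: the returned SpacedDiffusion follows the OpenAI improved-diffusion API, so training losses come from training_losses(); `net` and the latent shape are illustrative placeholders:

import torch

diffusion = IDDPM(timestep_respacing="")  # full 1000-step schedule
x_0 = torch.randn(4, 8, 64, 64)           # a batch of clean latents
t = torch.randint(0, diffusion.num_timesteps, (4,))
losses = diffusion.training_losses(net, x_0, t)  # dict with a "loss" entry
loss = losses["loss"].mean()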
DiT_VAE/diffusion/lcm_scheduler.py
ADDED
@@ -0,0 +1,455 @@
+# Copyright 2023 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from diffusers import ConfigMixin, SchedulerMixin
+from diffusers.configuration_utils import register_to_config
+from diffusers.utils import BaseOutput
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
+class LCMSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample `(x_{t-1})` of the previous timestep. `prev_sample` should be used as the next model input
+            in the denoising loop.
+        denoised (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
+            `denoised` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.FloatTensor
+    denoised: Optional[torch.FloatTensor] = None
+
+
+# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+            prevent singularities.
+        alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
+            Choose from `cosine` or `exp`
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+    if alpha_transform_type == "cosine":
+
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR. Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+    Args:
+        betas (`torch.FloatTensor`):
+            the betas that the scheduler is being initialized with.
+    Returns:
+        `torch.FloatTensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt ** 2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
+class LCMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
+    non-Markovian guidance.
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        beta_start (`float`, defaults to 0.0001):
+            The starting `beta` value of inference.
+        beta_end (`float`, defaults to 0.02):
+            The final `beta` value.
+        beta_schedule (`str`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, *optional*):
+            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        clip_sample (`bool`, defaults to `True`):
+            Clip the predicted sample for numerical stability.
+        clip_sample_range (`float`, defaults to 1.0):
+            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_one (`bool`, defaults to `True`):
+            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
+            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+            otherwise it uses the alpha value at step 0.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps. You can use a combination of `offset=1` and
+            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
+            Diffusion.
+        prediction_type (`str`, defaults to `epsilon`, *optional*):
+            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
+            `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
+            Video](https://imagen.research.google/video/paper.pdf) paper).
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
+        timestep_spacing (`str`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        rescale_betas_zero_snr (`bool`, defaults to `False`):
+            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
+            dark samples instead of limiting it to samples with medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+    """
+
+    # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+        clip_sample: bool = True,
+        set_alpha_to_one: bool = True,
+        steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        clip_sample_range: float = 1.0,
+        sample_max_value: float = 1.0,
+        timestep_spacing: str = "leading",
+        rescale_betas_zero_snr: bool = False,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = (
+                torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps, dtype=torch.float32) ** 2
+            )
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `set_alpha_to_one` decides whether we set this parameter simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+
+    def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+        Args:
+            sample (`torch.FloatTensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+        Returns:
+            `torch.FloatTensor`:
+                A scaled input sample.
+        """
+        return sample
+
+    def _get_variance(self, timestep, prev_timestep):
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        return (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, height, width = sample.shape
+
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * height * width)
+
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+
+        sample = sample.reshape(batch_size, channels, height, width)
+        sample = sample.to(dtype)
+
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            lcm_origin_steps (`int`):
+                The number of timesteps in the original LCM training schedule.
+        """
+
+        if num_inference_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        self.num_inference_steps = num_inference_steps
+
+        # LCM Timesteps Setting: Linear Spacing
+        c = self.config.num_train_timesteps // lcm_origin_steps
+        lcm_origin_timesteps = np.asarray(list(range(1, lcm_origin_steps + 1))) * c - 1  # LCM Training Steps Schedule
+        skipping_step = len(lcm_origin_timesteps) // num_inference_steps
+        timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]  # LCM Inference Steps Schedule
+
+        self.timesteps = torch.from_numpy(timesteps.copy()).to(device)
+
+    def get_scalings_for_boundary_condition_discrete(self, t):
+        self.sigma_data = 0.5  # Default: 0.5
+
+        # By dividing by 0.1: this is almost a delta function at t=0.
+        c_skip = self.sigma_data ** 2 / ((t / 0.1) ** 2 + self.sigma_data ** 2)
+        c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data ** 2) ** 0.5
+        return c_skip, c_out
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timeindex: int,
+        timestep: int,
+        sample: torch.FloatTensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.FloatTensor] = None,
+        return_dict: bool = True,
+    ) -> Union[LCMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from the learned diffusion model.
+            timeindex (`int`):
+                The index of the current timestep in the `timesteps` schedule.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                The weight of noise for added noise in the diffusion step.
+            use_clipped_model_output (`bool`, defaults to `False`):
+                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
+                because the predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`.
+                If no clipping has happened, "corrected" `model_output` would coincide with the one provided as input
+                and `use_clipped_model_output` has no effect.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            variance_noise (`torch.FloatTensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
+        Returns:
+            [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # 1. get previous step value
+        prev_timeindex = timeindex + 1
+        if prev_timeindex < len(self.timesteps):
+            prev_timestep = self.timesteps[prev_timeindex]
+        else:
+            prev_timestep = timestep
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        # 3. Get scalings for boundary conditions
+        c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
+
+        # 4. Different parameterizations:
+        parameterization = self.config.prediction_type
+
+        if parameterization == "epsilon":  # noise-prediction
+            pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
+
+        elif parameterization == "sample":  # x-prediction
+            pred_x0 = model_output
+
+        elif parameterization == "v_prediction":  # v-prediction
+            pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
+
+        # 5. Denoise model output using boundary conditions
+        denoised = c_out * pred_x0 + c_skip * sample
+
+        # 6. Sample z ~ N(0, I) for multistep inference.
+        # Noise is not used for one-step sampling.
+        if len(self.timesteps) > 1:
+            noise = torch.randn(model_output.shape).to(model_output.device)
+            prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
+        else:
+            prev_sample = denoised
+
+        if not return_dict:
+            return (prev_sample, denoised)
+
+        return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.FloatTensor:
+        # Make sure alphas_cumprod and timesteps have the same device and dtype as original_samples
+        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
+    def get_velocity(
+        self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor
+    ) -> torch.FloatTensor:
+        # Make sure alphas_cumprod and timesteps have the same device and dtype as sample
+        alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
+        timesteps = timesteps.to(sample.device)
+
+        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(sample.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        return sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
+
+    def __len__(self):
+        return self.config.num_train_timesteps
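A usage sketch of the scheduler loop. Note that step() takes the loop index as `timeindex` in addition to the timestep value; `net`, the latent shape, and the step counts are illustrative placeholders:

import torch

scheduler = LCMScheduler(beta_schedule="scaled_linear")
scheduler.set_timesteps(num_inference_steps=4, lcm_origin_steps=50)

x = torch.randn(1, 4, 64, 64)  # initial latent noise
for i, t in enumerate(scheduler.timesteps):
    eps = net(x, t)                     # model's noise prediction
    out = scheduler.step(eps, i, t, x)  # one consistency step
    x = out.prev_sample
result = out.denoised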
DiT_VAE/diffusion/model/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .nets import *
+# import utils
DiT_VAE/diffusion/model/builder.py
ADDED
@@ -0,0 +1,14 @@
+from mmcv import Registry
+
+from DiT_VAE.diffusion.model.utils import set_grad_checkpoint
+
+MODELS = Registry('vae')
+
+
+def build_model(cfg, use_grad_checkpoint=False, use_fp32_attention=False, gc_step=1, **kwargs):
+    if isinstance(cfg, str):
+        cfg = dict(type=cfg)
+    model = MODELS.build(cfg, default_args=kwargs)
+    if use_grad_checkpoint:
+        set_grad_checkpoint(model, use_fp32_attention=use_fp32_attention, gc_step=gc_step)
+    return model
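Registration/usage sketch for the MODELS registry; the decorated class is a hypothetical stand-in for the repo's network definitions in model/nets:

from DiT_VAE.diffusion.model.builder import MODELS, build_model

@MODELS.register_module()
class MyNet:  # illustrative placeholder class
    def __init__(self, depth=28):
        self.depth = depth

net = build_model('MyNet', depth=28)            # a bare string becomes dict(type=...)
net = build_model(dict(type='MyNet', depth=28)) # equivalent explicit form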
DiT_VAE/diffusion/model/diffusion_utils.py
ADDED
@@ -0,0 +1,92 @@
+# Modified from OpenAI's diffusion repos
+# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+
+import numpy as np
+import torch as th
+
+
+def normal_kl(mean1, logvar1, mean2, logvar2):
+    """
+    Compute the KL divergence between two gaussians.
+    Shapes are automatically broadcasted, so batches can be compared to
+    scalars, among other use cases.
+    """
+    tensor = next(
+        (
+            obj
+            for obj in (mean1, logvar1, mean2, logvar2)
+            if isinstance(obj, th.Tensor)
+        ),
+        None,
+    )
+    assert tensor is not None, "at least one argument must be a Tensor"
+
+    # Force variances to be Tensors. Broadcasting helps convert scalars to
+    # Tensors, but it does not work for th.exp().
+    logvar1, logvar2 = [
+        x if isinstance(x, th.Tensor) else th.tensor(x, device=tensor.device)
+        for x in (logvar1, logvar2)
+    ]
+
+    return 0.5 * (
+        -1.0
+        + logvar2
+        - logvar1
+        + th.exp(logvar1 - logvar2)
+        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
+    )
+
+
+def approx_standard_normal_cdf(x):
+    """
+    A fast approximation of the cumulative distribution function of the
+    standard normal.
+    """
+    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
+
+
+def continuous_gaussian_log_likelihood(x, *, means, log_scales):
+    """
+    Compute the log-likelihood of a continuous Gaussian distribution.
+    :param x: the targets
+    :param means: the Gaussian mean Tensor.
+    :param log_scales: the Gaussian log stddev Tensor.
+    :return: a tensor like x of log probabilities (in nats).
+    """
+    centered_x = x - means
+    inv_stdv = th.exp(-log_scales)
+    normalized_x = centered_x * inv_stdv
+    return th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(
+        normalized_x
+    )
+
+
+def discretized_gaussian_log_likelihood(x, *, means, log_scales):
+    """
+    Compute the log-likelihood of a Gaussian distribution discretizing to a
+    given image.
+    :param x: the target images. It is assumed that this was uint8 values,
+              rescaled to the range [-1, 1].
+    :param means: the Gaussian mean Tensor.
+    :param log_scales: the Gaussian log stddev Tensor.
+    :return: a tensor like x of log probabilities (in nats).
+    """
+    assert x.shape == means.shape == log_scales.shape
+    centered_x = x - means
+    inv_stdv = th.exp(-log_scales)
+    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
+    cdf_plus = approx_standard_normal_cdf(plus_in)
+    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
+    cdf_min = approx_standard_normal_cdf(min_in)
+    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
+    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
+    cdf_delta = cdf_plus - cdf_min
+    log_probs = th.where(
+        x < -0.999,
+        log_cdf_plus,
+        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
+    )
+    assert log_probs.shape == x.shape
+    return log_probs
ADDED
@@ -0,0 +1,1337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from tqdm import tqdm
|
3 |
+
|
4 |
+
|
5 |
+
class NoiseScheduleVP:
|
6 |
+
def __init__(
|
7 |
+
self,
|
8 |
+
schedule='discrete',
|
9 |
+
betas=None,
|
10 |
+
alphas_cumprod=None,
|
11 |
+
continuous_beta_0=0.1,
|
12 |
+
continuous_beta_1=20.,
|
13 |
+
dtype=torch.float32,
|
14 |
+
):
|
15 |
+
"""Create a wrapper class for the forward SDE (VP type).
|
16 |
+
|
17 |
+
***
|
18 |
+
Update: We support discrete-time diffusion vae by implementing a picewise linear interpolation for log_alpha_t.
|
19 |
+
We recommend to use schedule='discrete' for the discrete-time diffusion vae, especially for high-resolution images.
|
20 |
+
***
|
21 |
+
|
22 |
+
The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
|
23 |
+
We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
|
24 |
+
Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
|
25 |
+
|
26 |
+
log_alpha_t = self.marginal_log_mean_coeff(t)
|
27 |
+
sigma_t = self.marginal_std(t)
|
28 |
+
lambda_t = self.marginal_lambda(t)
|
29 |
+
|
30 |
+
Moreover, as lambda(t) is an invertible function, we also support its inverse function:
|
31 |
+
|
32 |
+
t = self.inverse_lambda(lambda_t)
|
33 |
+
|
34 |
+
===============================================================
|
35 |
+
|
36 |
+
We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
|
37 |
+
|
38 |
+
1. For discrete-time DPMs:
|
39 |
+
|
40 |
+
For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
|
41 |
+
t_i = (i + 1) / N
|
42 |
+
e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
|
43 |
+
We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
|
44 |
+
|
45 |
+
Args:
|
46 |
+
betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
|
47 |
+
alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
|
48 |
+
|
49 |
+
Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
|
50 |
+
|
51 |
+
**Important**: Please pay special attention for the args for `alphas_cumprod`:
|
52 |
+
The `alphas_cumprod` is the \hat{alpha_n} array in the notation of DDPM. Specifically, DDPMs assume that
|
53 |
+
q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
|
54 |
+
Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
|
55 |
+
alpha_{t_n} = \sqrt{\hat{alpha_n}},
|
56 |
+
and
|
57 |
+
log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
|
58 |
+
|
59 |
+
|
60 |
+
2. For continuous-time DPMs:
|
61 |
+
|
62 |
+
We support the linear VPSDE for the continuous time setting. The hyperparameters for the noise
|
63 |
+
schedule are the default settings in Yang Song's ScoreSDE:
|
64 |
+
|
65 |
+
Args:
|
66 |
+
beta_min: A `float` number. The smallest beta for the linear schedule.
|
67 |
+
beta_max: A `float` number. The largest beta for the linear schedule.
|
68 |
+
T: A `float` number. The ending time of the forward process.
|
69 |
+
|
70 |
+
===============================================================
|
71 |
+
|
72 |
+
Args:
|
73 |
+
schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
|
74 |
+
'linear' for continuous-time DPMs.
|
75 |
+
Returns:
|
76 |
+
A wrapper object of the forward SDE (VP type).
|
77 |
+
|
78 |
+
===============================================================
|
79 |
+
|
80 |
+
Example:
|
81 |
+
|
82 |
+
# For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
|
83 |
+
>>> ns = NoiseScheduleVP('discrete', betas=betas)
|
84 |
+
|
85 |
+
# For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
|
86 |
+
>>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
|
87 |
+
|
88 |
+
# For continuous-time DPMs (VPSDE), linear schedule:
|
89 |
+
>>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
|
90 |
+
|
91 |
+
"""
|
92 |
+
|
93 |
+
if schedule not in ['discrete', 'linear']:
|
94 |
+
raise ValueError(
|
95 |
+
f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear'"
|
96 |
+
)
|
97 |
+
|
98 |
+
self.schedule = schedule
|
99 |
+
if schedule == 'discrete':
|
100 |
+
if betas is not None:
|
101 |
+
log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
|
102 |
+
else:
|
103 |
+
assert alphas_cumprod is not None
|
104 |
+
log_alphas = 0.5 * torch.log(alphas_cumprod)
|
105 |
+
self.T = 1.
|
106 |
+
self.log_alpha_array = self.numerical_clip_alpha(log_alphas).reshape((1, -1,)).to(dtype=dtype)
|
107 |
+
self.total_N = self.log_alpha_array.shape[1]
|
108 |
+
self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
|
109 |
+
else:
|
110 |
+
self.T = 1.
|
111 |
+
self.total_N = 1000
|
112 |
+
self.beta_0 = continuous_beta_0
|
113 |
+
self.beta_1 = continuous_beta_1
|
114 |
+
|
115 |
+
def numerical_clip_alpha(self, log_alphas, clipped_lambda=-5.1):
|
116 |
+
"""
|
117 |
+
For some beta schedules, such as the cosine schedule, the log-SNR has numerical issues.
|
118 |
+
We clip the log-SNR near t=T to be no less than -5.1 to ensure numerical stability.
|
119 |
+
Such a trick is very useful for diffusion models with the cosine schedule, such as i-DDPM, guided-diffusion, and GLIDE.
|
120 |
+
"""
|
121 |
+
log_sigmas = 0.5 * torch.log(1. - torch.exp(2. * log_alphas))
|
122 |
+
lambs = log_alphas - log_sigmas
|
123 |
+
idx = torch.searchsorted(torch.flip(lambs, [0]), clipped_lambda)
|
124 |
+
if idx > 0:
|
125 |
+
log_alphas = log_alphas[:-idx]
|
126 |
+
return log_alphas
|
127 |
+
|
128 |
+
def marginal_log_mean_coeff(self, t):
|
129 |
+
"""
|
130 |
+
Compute log(alpha_t) of a given continuous-time label t in [0, T].
|
131 |
+
"""
|
132 |
+
if self.schedule == 'discrete':
|
133 |
+
return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device),
|
134 |
+
self.log_alpha_array.to(t.device)).reshape((-1))
|
135 |
+
elif self.schedule == 'linear':
|
136 |
+
return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
|
137 |
+
|
138 |
+
def marginal_alpha(self, t):
|
139 |
+
"""
|
140 |
+
Compute alpha_t of a given continuous-time label t in [0, T].
|
141 |
+
"""
|
142 |
+
return torch.exp(self.marginal_log_mean_coeff(t))
|
143 |
+
|
144 |
+
def marginal_std(self, t):
|
145 |
+
"""
|
146 |
+
Compute sigma_t of a given continuous-time label t in [0, T].
|
147 |
+
"""
|
148 |
+
return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))
|
149 |
+
|
150 |
+
def marginal_lambda(self, t):
|
151 |
+
"""
|
152 |
+
Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
|
153 |
+
"""
|
154 |
+
log_mean_coeff = self.marginal_log_mean_coeff(t)
|
155 |
+
log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
|
156 |
+
return log_mean_coeff - log_std
|
157 |
+
|
158 |
+
def inverse_lambda(self, lamb):
|
159 |
+
"""
|
160 |
+
Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
|
161 |
+
"""
|
162 |
+
if self.schedule == 'linear':
|
163 |
+
tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
|
164 |
+
Delta = self.beta_0 ** 2 + tmp
|
165 |
+
return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
|
166 |
+
elif self.schedule == 'discrete':
|
167 |
+
log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
|
168 |
+
t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]),
|
169 |
+
torch.flip(self.t_array.to(lamb.device), [1]))
|
170 |
+
return t.reshape((-1,))
|
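# --- Editor's note: illustrative sketch, not part of the original file. ---
# A minimal sanity check of the schedule identities documented above, using a
# hypothetical DDPM-style linear beta schedule as input.
def _sketch_noise_schedule_roundtrip():
    betas = torch.linspace(1e-4, 2e-2, 1000)  # hypothetical example betas
    ns = NoiseScheduleVP('discrete', betas=betas)
    t = torch.tensor([0.5])
    alpha_t, sigma_t = ns.marginal_alpha(t), ns.marginal_std(t)
    # VP property: alpha_t^2 + sigma_t^2 = 1 for all t.
    assert torch.allclose(alpha_t ** 2 + sigma_t ** 2, torch.ones_like(t))
    # lambda_t is invertible: inverse_lambda(marginal_lambda(t)) recovers t
    # (up to the piecewise linear interpolation error).
    assert torch.allclose(ns.inverse_lambda(ns.marginal_lambda(t)), t, atol=1e-3)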
171 |
+
|
172 |
+
|
173 |
+
def model_wrapper(
|
174 |
+
model,
|
175 |
+
noise_schedule,
|
176 |
+
model_type="noise",
|
177 |
+
model_kwargs={},
|
178 |
+
guidance_type="uncond",
|
179 |
+
condition=None,
|
180 |
+
unconditional_condition=None,
|
181 |
+
guidance_scale=1.,
|
182 |
+
classifier_fn=None,
|
183 |
+
classifier_kwargs={},
|
184 |
+
):
|
185 |
+
"""Create a wrapper function for the noise prediction model.
|
186 |
+
|
187 |
+
DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
|
188 |
+
first wrap the model function into a noise prediction model that accepts the continuous time as input.
|
189 |
+
|
190 |
+
We support four types of the diffusion model by setting `model_type`:
|
191 |
+
|
192 |
+
1. "noise": noise prediction model. (Trained by predicting noise).
|
193 |
+
|
194 |
+
2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
|
195 |
+
|
196 |
+
3. "v": velocity prediction model. (Trained by predicting the velocity).
|
197 |
+
The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2].
|
198 |
+
|
199 |
+
[1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
|
200 |
+
arXiv preprint arXiv:2202.00512 (2022).
|
201 |
+
[2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
|
202 |
+
arXiv preprint arXiv:2210.02303 (2022).
|
203 |
+
|
204 |
+
4. "score": marginal score function. (Trained by denoising score matching).
|
205 |
+
Note that the score function and the noise prediction model follow a simple relationship:
|
206 |
+
```
|
207 |
+
noise(x_t, t) = -sigma_t * score(x_t, t)
|
208 |
+
```
|
209 |
+
|
210 |
+
We support three types of guided sampling by DPMs by setting `guidance_type`:
|
211 |
+
1. "uncond": unconditional sampling by DPMs.
|
212 |
+
The input `model` has the following format:
|
213 |
+
``
|
214 |
+
model(x, t_input, **model_kwargs) -> noise | x_start | v | score
|
215 |
+
``
|
216 |
+
|
217 |
+
2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
|
218 |
+
The input `model` has the following format:
|
219 |
+
``
|
220 |
+
model(x, t_input, **model_kwargs) -> noise | x_start | v | score
|
221 |
+
``
|
222 |
+
|
223 |
+
The input `classifier_fn` has the following format:
|
224 |
+
``
|
225 |
+
classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
|
226 |
+
``
|
227 |
+
|
228 |
+
[3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
|
229 |
+
in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
|
230 |
+
|
231 |
+
3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
|
232 |
+
The input `model` has the following format:
|
233 |
+
``
|
234 |
+
model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
|
235 |
+
``
|
236 |
+
And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
|
237 |
+
|
238 |
+
[4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
|
239 |
+
arXiv preprint arXiv:2207.12598 (2022).
|
240 |
+
|
241 |
+
|
242 |
+
The `t_input` is the time label of the model, which may be a discrete-time label (i.e. 0 to 999)
|
243 |
+
or a continuous-time label (i.e. epsilon to T).
|
244 |
+
|
245 |
+
We wrap the model function to accept only `x` and `t_continuous` as inputs, and to output the predicted noise:
|
246 |
+
``
|
247 |
+
def model_fn(x, t_continuous) -> noise:
|
248 |
+
t_input = get_model_input_time(t_continuous)
|
249 |
+
return noise_pred(model, x, t_input, **model_kwargs)
|
250 |
+
``
|
251 |
+
where `t_continuous` is the continuous time label (i.e. epsilon to T). We use `model_fn` for DPM-Solver.
|
252 |
+
|
253 |
+
===============================================================
|
254 |
+
|
255 |
+
Args:
|
256 |
+
model: A diffusion model with the corresponding format described above.
|
257 |
+
noise_schedule: A noise schedule object, such as NoiseScheduleVP.
|
258 |
+
model_type: A `str`. The parameterization type of the diffusion model.
|
259 |
+
"noise" or "x_start" or "v" or "score".
|
260 |
+
model_kwargs: A `dict`. A dict for the other inputs of the model function.
|
261 |
+
guidance_type: A `str`. The type of the guidance for sampling.
|
262 |
+
"uncond" or "classifier" or "classifier-free".
|
263 |
+
condition: A pytorch tensor. The condition for the guided sampling.
|
264 |
+
Only used for "classifier" or "classifier-free" guidance type.
|
265 |
+
unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
|
266 |
+
Only used for "classifier-free" guidance type.
|
267 |
+
guidance_scale: A `float`. The scale for the guided sampling.
|
268 |
+
classifier_fn: A classifier function. Only used for the classifier guidance.
|
269 |
+
classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
|
270 |
+
Returns:
|
271 |
+
A noise prediction model that accepts the noised data and the continuous time as the inputs.
|
272 |
+
"""
|
273 |
+
|
274 |
+
def get_model_input_time(t_continuous):
|
275 |
+
"""
|
276 |
+
Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
|
277 |
+
For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
|
278 |
+
For continuous-time DPMs, we just use `t_continuous`.
|
279 |
+
"""
|
280 |
+
if noise_schedule.schedule == 'discrete':
|
281 |
+
return (t_continuous - 1. / noise_schedule.total_N) * 1000.
|
282 |
+
else:
|
283 |
+
return t_continuous
|
284 |
+
|
285 |
+
def noise_pred_fn(x, t_continuous, cond=None, cond_2=None):
|
286 |
+
t_input = get_model_input_time(t_continuous)
|
287 |
+
if cond is None:
|
288 |
+
output = model(x, t_input, **model_kwargs)
|
289 |
+
else:
|
290 |
+
output = model(x, t_input, y=cond, img_feature=cond_2, **model_kwargs)
|
291 |
+
if model_type == "noise":
|
292 |
+
return output
|
293 |
+
elif model_type == "x_start":
|
294 |
+
alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
|
295 |
+
return (x - expand_dims(alpha_t, x.dim()) * output) / expand_dims(sigma_t, x.dim())
|
296 |
+
elif model_type == "v":
|
297 |
+
alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
|
298 |
+
return expand_dims(alpha_t, x.dim()) * output + expand_dims(sigma_t, x.dim()) * x
|
299 |
+
elif model_type == "score":
|
300 |
+
sigma_t = noise_schedule.marginal_std(t_continuous)
|
301 |
+
return -expand_dims(sigma_t, x.dim()) * output
|
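# --- Editor's note: descriptive summary of the branches above, added for clarity. ---
# Each supported parameterization is converted into a noise prediction:
#   x_start: eps = (x - alpha_t * x0_pred) / sigma_t
#   v:       eps = alpha_t * v_pred + sigma_t * x
#   score:   eps = -sigma_t * score_pred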
302 |
+
|
303 |
+
def cond_grad_fn(x, t_input):
|
304 |
+
"""
|
305 |
+
Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
|
306 |
+
"""
|
307 |
+
with torch.enable_grad():
|
308 |
+
x_in = x.detach().requires_grad_(True)
|
309 |
+
log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
|
310 |
+
return torch.autograd.grad(log_prob.sum(), x_in)[0]
|
311 |
+
|
312 |
+
def model_fn(x, t_continuous):
|
313 |
+
"""
|
314 |
+
The noise prediction model function that is used for DPM-Solver.
|
315 |
+
"""
|
316 |
+
if guidance_type == "uncond":
|
317 |
+
return noise_pred_fn(x, t_continuous)
|
318 |
+
elif guidance_type == "classifier":
|
319 |
+
assert classifier_fn is not None
|
320 |
+
t_input = get_model_input_time(t_continuous)
|
321 |
+
cond_grad = cond_grad_fn(x, t_input)
|
322 |
+
sigma_t = noise_schedule.marginal_std(t_continuous)
|
323 |
+
noise = noise_pred_fn(x, t_continuous)
|
324 |
+
return noise - guidance_scale * expand_dims(sigma_t, x.dim()) * cond_grad
|
325 |
+
elif guidance_type == "classifier-free":
|
326 |
+
if guidance_scale == 1. or unconditional_condition is None:
|
327 |
+
return noise_pred_fn(x, t_continuous, cond=condition)
|
328 |
+
x_in = torch.cat([x] * 2)
|
329 |
+
t_in = torch.cat([t_continuous] * 2)
|
330 |
+
# c_in = torch.cat([unconditional_condition, condition])
|
331 |
+
c_in_y = torch.cat([unconditional_condition[0], condition[0]])
|
332 |
+
c_in_dino = torch.cat([unconditional_condition[1], condition[1]])
|
333 |
+
noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in_y, cond_2=c_in_dino).chunk(2)
|
334 |
+
return noise_uncond + guidance_scale * (noise - noise_uncond)
|
335 |
+
|
336 |
+
assert model_type in ["noise", "x_start", "v", "score"]
|
337 |
+
assert guidance_type in ["uncond", "classifier", "classifier-free"]
|
338 |
+
return model_fn
|
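# --- Editor's note: illustrative sketch, not part of the original file. ---
# How this wrapper is typically called for classifier-free guidance. The names
# `my_model`, `cond_y`, `cond_dino`, `uncond_y`, and `uncond_dino` are
# hypothetical placeholders; this model_fn expects (y, dino_feature) condition
# pairs, matching the c_in_y / c_in_dino split above.
#
# model_fn = model_wrapper(
#     my_model, noise_schedule, model_type="noise",
#     guidance_type="classifier-free",
#     condition=(cond_y, cond_dino),
#     unconditional_condition=(uncond_y, uncond_dino),
#     guidance_scale=4.5,
# )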
339 |
+
|
340 |
+
|
341 |
+
class DPM_Solver:
|
342 |
+
def __init__(
|
343 |
+
self,
|
344 |
+
model_fn,
|
345 |
+
noise_schedule,
|
346 |
+
algorithm_type="dpmsolver++",
|
347 |
+
correcting_x0_fn=None,
|
348 |
+
correcting_xt_fn=None,
|
349 |
+
thresholding_max_val=1.,
|
350 |
+
dynamic_thresholding_ratio=0.995,
|
351 |
+
):
|
352 |
+
"""Construct a DPM-Solver.
|
353 |
+
|
354 |
+
We support both DPM-Solver (`algorithm_type="dpmsolver"`) and DPM-Solver++ (`algorithm_type="dpmsolver++"`).
|
355 |
+
|
356 |
+
We also support the "dynamic thresholding" method in Imagen[1]. For pixel-space diffusion models, you
|
357 |
+
can set both `algorithm_type="dpmsolver++"` and `correcting_x0_fn="dynamic_thresholding"` to use the
|
358 |
+
dynamic thresholding. The "dynamic thresholding" can greatly improve the sample quality for pixel-space
|
359 |
+
DPMs with large guidance scales. Note that the thresholding method is **unsuitable** for latent-space
|
360 |
+
DPMs (such as stable-diffusion).
|
361 |
+
|
362 |
+
To support advanced algorithms in image-to-image applications, we also support corrector functions for
|
363 |
+
both x0 and xt.
|
364 |
+
|
365 |
+
Args:
|
366 |
+
model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]):
|
367 |
+
``
|
368 |
+
def model_fn(x, t_continuous):
|
369 |
+
return noise
|
370 |
+
``
|
371 |
+
The shape of `x` is `(batch_size, **shape)`, and the shape of `t_continuous` is `(batch_size,)`.
|
372 |
+
noise_schedule: A noise schedule object, such as NoiseScheduleVP.
|
373 |
+
algorithm_type: A `str`. Either "dpmsolver" or "dpmsolver++".
|
374 |
+
correcting_x0_fn: A `str` or a function with the following format:
|
375 |
+
```
|
376 |
+
def correcting_x0_fn(x0, t):
|
377 |
+
x0_new = ...
|
378 |
+
return x0_new
|
379 |
+
```
|
380 |
+
This function is to correct the outputs of the data prediction model at each sampling step. e.g.,
|
381 |
+
```
|
382 |
+
x0_pred = data_pred_model(xt, t)
|
383 |
+
if correcting_x0_fn is not None:
|
384 |
+
x0_pred = correcting_x0_fn(x0_pred, t)
|
385 |
+
xt_1 = update(x0_pred, xt, t)
|
386 |
+
```
|
387 |
+
If `correcting_x0_fn="dynamic_thresholding"`, we use the dynamic thresholding proposed in Imagen[1].
|
388 |
+
correcting_xt_fn: A function with the following format:
|
389 |
+
```
|
390 |
+
def correcting_xt_fn(xt, t, step):
|
391 |
+
x_new = ...
|
392 |
+
return x_new
|
393 |
+
```
|
394 |
+
This function is to correct the intermediate samples xt at each sampling step. e.g.,
|
395 |
+
```
|
396 |
+
xt = ...
|
397 |
+
xt = correcting_xt_fn(xt, t, step)
|
398 |
+
```
|
399 |
+
thresholding_max_val: A `float`. The max value for thresholding.
|
400 |
+
Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
|
401 |
+
dynamic_thresholding_ratio: A `float`. The ratio for dynamic thresholding (see Imagen[1] for details).
|
402 |
+
Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
|
403 |
+
|
404 |
+
[1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour,
|
405 |
+
Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models
|
406 |
+
with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b.
|
407 |
+
"""
|
408 |
+
self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
|
409 |
+
self.noise_schedule = noise_schedule
|
410 |
+
assert algorithm_type in ["dpmsolver", "dpmsolver++"]
|
411 |
+
self.algorithm_type = algorithm_type
|
412 |
+
if correcting_x0_fn == "dynamic_thresholding":
|
413 |
+
self.correcting_x0_fn = self.dynamic_thresholding_fn
|
414 |
+
else:
|
415 |
+
self.correcting_x0_fn = correcting_x0_fn
|
416 |
+
self.correcting_xt_fn = correcting_xt_fn
|
417 |
+
self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
|
418 |
+
self.thresholding_max_val = thresholding_max_val
|
419 |
+
|
420 |
+
def dynamic_thresholding_fn(self, x0, t):
|
421 |
+
"""
|
422 |
+
The dynamic thresholding method.
|
423 |
+
"""
|
424 |
+
dims = x0.dim()
|
425 |
+
p = self.dynamic_thresholding_ratio
|
426 |
+
s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
|
427 |
+
s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims)
|
428 |
+
x0 = torch.clamp(x0, -s, s) / s
|
429 |
+
return x0
|
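# --- Editor's note: descriptive summary, added for clarity. ---
# Per sample, s = max(quantile_p(|x0|), thresholding_max_val); then
# x0 <- clamp(x0, -s, s) / s, so every output lands in [-1, 1]. This keeps
# large-guidance-scale predictions from saturating in pixel space.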
430 |
+
|
431 |
+
def noise_prediction_fn(self, x, t):
|
432 |
+
"""
|
433 |
+
Return the noise prediction of the model.
|
434 |
+
"""
|
435 |
+
return self.model(x, t)
|
436 |
+
|
437 |
+
def data_prediction_fn(self, x, t):
|
438 |
+
"""
|
439 |
+
Return the data prediction of the model (with the corrector applied).
|
440 |
+
"""
|
441 |
+
noise = self.noise_prediction_fn(x, t)
|
442 |
+
alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
|
443 |
+
x0 = (x - sigma_t * noise) / alpha_t
|
444 |
+
if self.correcting_x0_fn is not None:
|
445 |
+
x0 = self.correcting_x0_fn(x0, t)
|
446 |
+
return x0
|
447 |
+
|
448 |
+
def model_fn(self, x, t):
|
449 |
+
"""
|
450 |
+
Convert the model to the noise prediction model or the data prediction model.
|
451 |
+
"""
|
452 |
+
if self.algorithm_type == "dpmsolver++":
|
453 |
+
return self.data_prediction_fn(x, t)
|
454 |
+
else:
|
455 |
+
return self.noise_prediction_fn(x, t)
|
456 |
+
|
457 |
+
def get_time_steps(self, skip_type, t_T, t_0, N, device):
|
458 |
+
"""Compute the intermediate time steps for sampling.
|
459 |
+
|
460 |
+
Args:
|
461 |
+
skip_type: A `str`. The type for the spacing of the time steps. We support three types:
|
462 |
+
- 'logSNR': uniform logSNR for the time steps.
|
463 |
+
- 'time_uniform': uniform time for the time steps. (**Recommended for high-resolution data**.)
|
464 |
+
- 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolution data.)
|
465 |
+
t_T: A `float`. The starting time of the sampling (default is T).
|
466 |
+
t_0: A `float`. The ending time of the sampling (default is epsilon).
|
467 |
+
N: A `int`. The total number of the spacing of the time steps.
|
468 |
+
device: A torch device.
|
469 |
+
Returns:
|
470 |
+
A pytorch tensor of the time steps, with the shape (N + 1,).
|
471 |
+
"""
|
472 |
+
if skip_type == 'logSNR':
|
473 |
+
lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
|
474 |
+
lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
|
475 |
+
logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
|
476 |
+
return self.noise_schedule.inverse_lambda(logSNR_steps)
|
477 |
+
elif skip_type == 'time_uniform':
|
478 |
+
return torch.linspace(t_T, t_0, N + 1).to(device)
|
479 |
+
elif skip_type == 'time_quadratic':
|
480 |
+
t_order = 2
|
481 |
+
return (
|
482 |
+
torch.linspace(
|
483 |
+
t_T ** (1.0 / t_order), t_0 ** (1.0 / t_order), N + 1
|
484 |
+
)
|
485 |
+
.pow(t_order)
|
486 |
+
.to(device)
|
487 |
+
)
|
488 |
+
else:
|
489 |
+
raise ValueError(
|
490 |
+
f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'"
|
491 |
+
)
|
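# --- Editor's note: illustrative sketch, not part of the original file. ---
# e.g. get_time_steps('time_uniform', t_T=1., t_0=1e-3, N=4, device) returns
# the 5 evenly spaced times [1.0, 0.75025, 0.5005, 0.25075, 0.001].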
492 |
+
|
493 |
+
def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
|
494 |
+
"""
|
495 |
+
Get the order of each step for sampling by the singlestep DPM-Solver.
|
496 |
+
|
497 |
+
We combine DPM-Solver-1, 2, and 3 to use all the function evaluations; the combination is named "DPM-Solver-fast".
|
498 |
+
Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is:
|
499 |
+
- If order == 1:
|
500 |
+
We take `steps` of DPM-Solver-1 (i.e. DDIM).
|
501 |
+
- If order == 2:
|
502 |
+
- Denote K = (steps // 2). We take K or (K + 1) intermediate time steps for sampling.
|
503 |
+
- If steps % 2 == 0, we use K steps of DPM-Solver-2.
|
504 |
+
- If steps % 2 == 1, we use K steps of DPM-Solver-2 and 1 step of DPM-Solver-1.
|
505 |
+
- If order == 3:
|
506 |
+
- Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
|
507 |
+
- If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1.
|
508 |
+
- If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
|
509 |
+
- If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.
|
510 |
+
|
511 |
+
============================================
|
512 |
+
Args:
|
513 |
+
order: A `int`. The max order for the solver (2 or 3).
|
514 |
+
steps: A `int`. The total number of function evaluations (NFE).
|
515 |
+
skip_type: A `str`. The type for the spacing of the time steps. We support three types:
|
516 |
+
- 'logSNR': uniform logSNR for the time steps.
|
517 |
+
- 'time_uniform': uniform time for the time steps. (**Recommended for high-resolution data**.)
|
518 |
+
- 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolution data.)
|
519 |
+
t_T: A `float`. The starting time of the sampling (default is T).
|
520 |
+
t_0: A `float`. The ending time of the sampling (default is epsilon).
|
521 |
+
device: A torch device.
|
522 |
+
Returns:
|
523 |
+
orders: A list of the solver order of each step.
|
524 |
+
"""
|
525 |
+
if order == 3:
|
526 |
+
K = steps // 3 + 1
|
527 |
+
if steps % 3 == 0:
|
528 |
+
orders = [3, ] * (K - 2) + [2, 1]
|
529 |
+
elif steps % 3 == 1:
|
530 |
+
orders = [3, ] * (K - 1) + [1]
|
531 |
+
else:
|
532 |
+
orders = [3, ] * (K - 1) + [2]
|
533 |
+
elif order == 2:
|
534 |
+
if steps % 2 == 0:
|
535 |
+
K = steps // 2
|
536 |
+
orders = [2, ] * K
|
537 |
+
else:
|
538 |
+
K = steps // 2 + 1
|
539 |
+
orders = [2, ] * (K - 1) + [1]
|
540 |
+
elif order == 1:
|
541 |
+
K = 1
|
542 |
+
orders = [1, ] * steps
|
543 |
+
else:
|
544 |
+
raise ValueError("'order' must be '1' or '2' or '3'.")
|
545 |
+
if skip_type == 'logSNR':
|
546 |
+
# To reproduce the results in DPM-Solver paper
|
547 |
+
timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
|
548 |
+
else:
|
549 |
+
timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[
|
550 |
+
torch.cumsum(torch.tensor([0, ] + orders), 0).to(device)]
|
551 |
+
return timesteps_outer, orders
|
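# --- Editor's note: illustrative sketch, not part of the original file. ---
# Example of the splitting rule above: steps=10, order=3 gives
# K = 10 // 3 + 1 = 4 and, since 10 % 3 == 1, orders = [3, 3, 3, 1]
# (three steps of DPM-Solver-3 plus one step of DPM-Solver-1; NFE sums to 10).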
552 |
+
|
553 |
+
def denoise_to_zero_fn(self, x, s):
|
554 |
+
"""
|
555 |
+
Denoise at the final step, which is equivalent to solving the ODE from lambda_s to infinity by first-order discretization.
|
556 |
+
"""
|
557 |
+
return self.data_prediction_fn(x, s)
|
558 |
+
|
559 |
+
def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=False):
|
560 |
+
"""
|
561 |
+
DPM-Solver-1 (equivalent to DDIM) from time `s` to time `t`.
|
562 |
+
|
563 |
+
Args:
|
564 |
+
x: A pytorch tensor. The initial value at time `s`.
|
565 |
+
s: A pytorch tensor. The starting time, with the shape (1,).
|
566 |
+
t: A pytorch tensor. The ending time, with the shape (1,).
|
567 |
+
model_s: A pytorch tensor. The model function evaluated at time `s`.
|
568 |
+
If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
|
569 |
+
return_intermediate: A `bool`. If true, also return the model value at time `s`.
|
570 |
+
Returns:
|
571 |
+
x_t: A pytorch tensor. The approximated solution at time `t`.
|
572 |
+
"""
|
573 |
+
ns = self.noise_schedule
|
574 |
+
dims = x.dim()
|
575 |
+
lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
|
576 |
+
h = lambda_t - lambda_s
|
577 |
+
log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
|
578 |
+
sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t)
|
579 |
+
alpha_t = torch.exp(log_alpha_t)
|
580 |
+
|
581 |
+
if self.algorithm_type == "dpmsolver++":
|
582 |
+
phi_1 = torch.expm1(-h)
|
583 |
+
if model_s is None:
|
584 |
+
model_s = self.model_fn(x, s)
|
585 |
+
x_t = (
|
586 |
+
sigma_t / sigma_s * x
|
587 |
+
- alpha_t * phi_1 * model_s
|
588 |
+
)
|
589 |
+
else:
|
590 |
+
phi_1 = torch.expm1(h)
|
591 |
+
if model_s is None:
|
592 |
+
model_s = self.model_fn(x, s)
|
593 |
+
x_t = (
|
594 |
+
torch.exp(log_alpha_t - log_alpha_s) * x
|
595 |
+
- (sigma_t * phi_1) * model_s
|
596 |
+
)
|
597 |
+
return (x_t, {'model_s': model_s}) if return_intermediate else x_t
|
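# --- Editor's note: descriptive summary, added for clarity. ---
# In dpmsolver++ form, with h = lambda_t - lambda_s and phi_1 = expm1(-h):
#   x_t = (sigma_t / sigma_s) * x - alpha_t * phi_1 * x0_pred(x, s)
# This is exact whenever the data prediction is constant on [s, t], and it
# coincides with a DDIM step.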
598 |
+
|
599 |
+
def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False,
|
600 |
+
solver_type='dpmsolver'):
|
601 |
+
"""
|
602 |
+
Singlestep solver DPM-Solver-2 from time `s` to time `t`.
|
603 |
+
|
604 |
+
Args:
|
605 |
+
x: A pytorch tensor. The initial value at time `s`.
|
606 |
+
s: A pytorch tensor. The starting time, with the shape (1,).
|
607 |
+
t: A pytorch tensor. The ending time, with the shape (1,).
|
608 |
+
r1: A `float`. The hyperparameter of the second-order solver.
|
609 |
+
model_s: A pytorch tensor. The model function evaluated at time `s`.
|
610 |
+
If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
|
611 |
+
return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1` (the intermediate time).
|
612 |
+
solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
|
613 |
+
The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
|
614 |
+
Returns:
|
615 |
+
x_t: A pytorch tensor. The approximated solution at time `t`.
|
616 |
+
"""
|
617 |
+
if solver_type not in ['dpmsolver', 'taylor']:
|
618 |
+
raise ValueError(
|
619 |
+
f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}"
|
620 |
+
)
|
621 |
+
if r1 is None:
|
622 |
+
r1 = 0.5
|
623 |
+
ns = self.noise_schedule
|
624 |
+
lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
|
625 |
+
h = lambda_t - lambda_s
|
626 |
+
lambda_s1 = lambda_s + r1 * h
|
627 |
+
s1 = ns.inverse_lambda(lambda_s1)
|
628 |
+
log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(
|
629 |
+
s1), ns.marginal_log_mean_coeff(t)
|
630 |
+
sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
|
631 |
+
alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)
|
632 |
+
|
633 |
+
if self.algorithm_type == "dpmsolver++":
|
634 |
+
phi_11 = torch.expm1(-r1 * h)
|
635 |
+
phi_1 = torch.expm1(-h)
|
636 |
+
|
637 |
+
if model_s is None:
|
638 |
+
model_s = self.model_fn(x, s)
|
639 |
+
x_s1 = (
|
640 |
+
(sigma_s1 / sigma_s) * x
|
641 |
+
- (alpha_s1 * phi_11) * model_s
|
642 |
+
)
|
643 |
+
model_s1 = self.model_fn(x_s1, s1)
|
644 |
+
if solver_type == 'dpmsolver':
|
645 |
+
x_t = (
|
646 |
+
(sigma_t / sigma_s) * x
|
647 |
+
- (alpha_t * phi_1) * model_s
|
648 |
+
- (0.5 / r1) * (alpha_t * phi_1) * (model_s1 - model_s)
|
649 |
+
)
|
650 |
+
elif solver_type == 'taylor':
|
651 |
+
x_t = (
|
652 |
+
(sigma_t / sigma_s) * x
|
653 |
+
- (alpha_t * phi_1) * model_s
|
654 |
+
+ (1. / r1) * (alpha_t * (phi_1 / h + 1.)) * (model_s1 - model_s)
|
655 |
+
)
|
656 |
+
else:
|
657 |
+
phi_11 = torch.expm1(r1 * h)
|
658 |
+
phi_1 = torch.expm1(h)
|
659 |
+
|
660 |
+
if model_s is None:
|
661 |
+
model_s = self.model_fn(x, s)
|
662 |
+
x_s1 = (
|
663 |
+
torch.exp(log_alpha_s1 - log_alpha_s) * x
|
664 |
+
- (sigma_s1 * phi_11) * model_s
|
665 |
+
)
|
666 |
+
model_s1 = self.model_fn(x_s1, s1)
|
667 |
+
if solver_type == 'dpmsolver':
|
668 |
+
x_t = (
|
669 |
+
torch.exp(log_alpha_t - log_alpha_s) * x
|
670 |
+
- (sigma_t * phi_1) * model_s
|
671 |
+
- (0.5 / r1) * (sigma_t * phi_1) * (model_s1 - model_s)
|
672 |
+
)
|
673 |
+
elif solver_type == 'taylor':
|
674 |
+
x_t = (
|
675 |
+
torch.exp(log_alpha_t - log_alpha_s) * x
|
676 |
+
- (sigma_t * phi_1) * model_s
|
677 |
+
- (1. / r1) * (sigma_t * (phi_1 / h - 1.)) * (model_s1 - model_s)
|
678 |
+
)
|
679 |
+
if return_intermediate:
|
680 |
+
return x_t, {'model_s': model_s, 'model_s1': model_s1}
|
681 |
+
else:
|
682 |
+
return x_t
|
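# --- Editor's note: descriptive summary, added for clarity. ---
# DPM-Solver-2 spends one extra model evaluation at the intermediate time s1
# (lambda_s1 = lambda_s + r1 * h). The difference (model_s1 - model_s) acts as
# a finite-difference estimate of the model's derivative in lambda, upgrading
# the first-order update to second order.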
683 |
+
|
684 |
+
def singlestep_dpm_solver_third_update(self, x, s, t, r1=1. / 3., r2=2. / 3., model_s=None, model_s1=None,
|
685 |
+
return_intermediate=False, solver_type='dpmsolver'):
|
686 |
+
"""
|
687 |
+
Singlestep solver DPM-Solver-3 from time `s` to time `t`.
|
688 |
+
|
689 |
+
Args:
|
690 |
+
x: A pytorch tensor. The initial value at time `s`.
|
691 |
+
s: A pytorch tensor. The starting time, with the shape (1,).
|
692 |
+
t: A pytorch tensor. The ending time, with the shape (1,).
|
693 |
+
r1: A `float`. The hyperparameter of the third-order solver.
|
694 |
+
r2: A `float`. The hyperparameter of the third-order solver.
|
695 |
+
model_s: A pytorch tensor. The model function evaluated at time `s`.
|
696 |
+
If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
|
697 |
+
model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`).
|
698 |
+
If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it.
|
699 |
+
return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
|
700 |
+
solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
|
701 |
+
The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
|
702 |
+
Returns:
|
703 |
+
x_t: A pytorch tensor. The approximated solution at time `t`.
|
704 |
+
"""
|
705 |
+
if solver_type not in ['dpmsolver', 'taylor']:
|
706 |
+
raise ValueError(
|
707 |
+
f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}"
|
708 |
+
)
|
709 |
+
if r1 is None:
|
710 |
+
r1 = 1. / 3.
|
711 |
+
if r2 is None:
|
712 |
+
r2 = 2. / 3.
|
713 |
+
ns = self.noise_schedule
|
714 |
+
lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
|
715 |
+
h = lambda_t - lambda_s
|
716 |
+
lambda_s1 = lambda_s + r1 * h
|
717 |
+
lambda_s2 = lambda_s + r2 * h
|
718 |
+
s1 = ns.inverse_lambda(lambda_s1)
|
719 |
+
s2 = ns.inverse_lambda(lambda_s2)
|
720 |
+
log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(
|
721 |
+
s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
|
722 |
+
sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(
|
723 |
+
s2), ns.marginal_std(t)
|
724 |
+
alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)
|
725 |
+
|
726 |
+
if self.algorithm_type == "dpmsolver++":
|
727 |
+
phi_11 = torch.expm1(-r1 * h)
|
728 |
+
phi_12 = torch.expm1(-r2 * h)
|
729 |
+
phi_1 = torch.expm1(-h)
|
730 |
+
phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1.
|
731 |
+
phi_2 = phi_1 / h + 1.
|
732 |
+
phi_3 = phi_2 / h - 0.5
|
733 |
+
|
734 |
+
if model_s is None:
|
735 |
+
model_s = self.model_fn(x, s)
|
736 |
+
if model_s1 is None:
|
737 |
+
x_s1 = (
|
738 |
+
(sigma_s1 / sigma_s) * x
|
739 |
+
- (alpha_s1 * phi_11) * model_s
|
740 |
+
)
|
741 |
+
model_s1 = self.model_fn(x_s1, s1)
|
742 |
+
x_s2 = (
|
743 |
+
(sigma_s2 / sigma_s) * x
|
744 |
+
- (alpha_s2 * phi_12) * model_s
|
745 |
+
+ r2 / r1 * (alpha_s2 * phi_22) * (model_s1 - model_s)
|
746 |
+
)
|
747 |
+
model_s2 = self.model_fn(x_s2, s2)
|
748 |
+
if solver_type == 'dpmsolver':
|
749 |
+
x_t = (
|
750 |
+
(sigma_t / sigma_s) * x
|
751 |
+
- (alpha_t * phi_1) * model_s
|
752 |
+
+ (1. / r2) * (alpha_t * phi_2) * (model_s2 - model_s)
|
753 |
+
)
|
754 |
+
elif solver_type == 'taylor':
|
755 |
+
D1_0 = (1. / r1) * (model_s1 - model_s)
|
756 |
+
D1_1 = (1. / r2) * (model_s2 - model_s)
|
757 |
+
D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
|
758 |
+
D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
|
759 |
+
x_t = (
|
760 |
+
(sigma_t / sigma_s) * x
|
761 |
+
- (alpha_t * phi_1) * model_s
|
762 |
+
+ (alpha_t * phi_2) * D1
|
763 |
+
- (alpha_t * phi_3) * D2
|
764 |
+
)
|
765 |
+
else:
|
766 |
+
phi_11 = torch.expm1(r1 * h)
|
767 |
+
phi_12 = torch.expm1(r2 * h)
|
768 |
+
phi_1 = torch.expm1(h)
|
769 |
+
phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1.
|
770 |
+
phi_2 = phi_1 / h - 1.
|
771 |
+
phi_3 = phi_2 / h - 0.5
|
772 |
+
|
773 |
+
if model_s is None:
|
774 |
+
model_s = self.model_fn(x, s)
|
775 |
+
if model_s1 is None:
|
776 |
+
x_s1 = (
|
777 |
+
(torch.exp(log_alpha_s1 - log_alpha_s)) * x
|
778 |
+
- (sigma_s1 * phi_11) * model_s
|
779 |
+
)
|
780 |
+
model_s1 = self.model_fn(x_s1, s1)
|
781 |
+
x_s2 = (
|
782 |
+
(torch.exp(log_alpha_s2 - log_alpha_s)) * x
|
783 |
+
- (sigma_s2 * phi_12) * model_s
|
784 |
+
- r2 / r1 * (sigma_s2 * phi_22) * (model_s1 - model_s)
|
785 |
+
)
|
786 |
+
model_s2 = self.model_fn(x_s2, s2)
|
787 |
+
if solver_type == 'dpmsolver':
|
788 |
+
x_t = (
|
789 |
+
(torch.exp(log_alpha_t - log_alpha_s)) * x
|
790 |
+
- (sigma_t * phi_1) * model_s
|
791 |
+
- (1. / r2) * (sigma_t * phi_2) * (model_s2 - model_s)
|
792 |
+
)
|
793 |
+
elif solver_type == 'taylor':
|
794 |
+
D1_0 = (1. / r1) * (model_s1 - model_s)
|
795 |
+
D1_1 = (1. / r2) * (model_s2 - model_s)
|
796 |
+
D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
|
797 |
+
D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
|
798 |
+
x_t = (
|
799 |
+
(torch.exp(log_alpha_t - log_alpha_s)) * x
|
800 |
+
- (sigma_t * phi_1) * model_s
|
801 |
+
- (sigma_t * phi_2) * D1
|
802 |
+
- (sigma_t * phi_3) * D2
|
803 |
+
)
|
804 |
+
|
805 |
+
if return_intermediate:
|
806 |
+
return x_t, {'model_s': model_s, 'model_s1': model_s1, 'model_s2': model_s2}
|
807 |
+
else:
|
808 |
+
return x_t
|
809 |
+
|
810 |
+
def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
|
811 |
+
"""
|
812 |
+
Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`.
|
813 |
+
|
814 |
+
Args:
|
815 |
+
x: A pytorch tensor. The initial value at time `s`.
|
816 |
+
model_prev_list: A list of pytorch tensor. The previous computed model values.
|
817 |
+
t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
|
818 |
+
t: A pytorch tensor. The ending time, with the shape (1,).
|
819 |
+
solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
|
820 |
+
The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
|
821 |
+
Returns:
|
822 |
+
x_t: A pytorch tensor. The approximated solution at time `t`.
|
823 |
+
"""
|
824 |
+
if solver_type not in ['dpmsolver', 'taylor']:
|
825 |
+
raise ValueError(
|
826 |
+
f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}"
|
827 |
+
)
|
828 |
+
ns = self.noise_schedule
|
829 |
+
model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1]
|
830 |
+
t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1]
|
831 |
+
lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(
|
832 |
+
t_prev_0), ns.marginal_lambda(t)
|
833 |
+
log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
|
834 |
+
sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
|
835 |
+
alpha_t = torch.exp(log_alpha_t)
|
836 |
+
|
837 |
+
h_0 = lambda_prev_0 - lambda_prev_1
|
838 |
+
h = lambda_t - lambda_prev_0
|
839 |
+
r0 = h_0 / h
|
840 |
+
D1_0 = (1. / r0) * (model_prev_0 - model_prev_1)
|
841 |
+
if self.algorithm_type == "dpmsolver++":
|
842 |
+
phi_1 = torch.expm1(-h)
|
843 |
+
if solver_type == 'dpmsolver':
|
844 |
+
x_t = (
|
845 |
+
(sigma_t / sigma_prev_0) * x
|
846 |
+
- (alpha_t * phi_1) * model_prev_0
|
847 |
+
- 0.5 * (alpha_t * phi_1) * D1_0
|
848 |
+
)
|
849 |
+
elif solver_type == 'taylor':
|
850 |
+
x_t = (
|
851 |
+
(sigma_t / sigma_prev_0) * x
|
852 |
+
- (alpha_t * phi_1) * model_prev_0
|
853 |
+
+ (alpha_t * (phi_1 / h + 1.)) * D1_0
|
854 |
+
)
|
855 |
+
else:
|
856 |
+
phi_1 = torch.expm1(h)
|
857 |
+
if solver_type == 'dpmsolver':
|
858 |
+
x_t = (
|
859 |
+
(torch.exp(log_alpha_t - log_alpha_prev_0)) * x
|
860 |
+
- (sigma_t * phi_1) * model_prev_0
|
861 |
+
- 0.5 * (sigma_t * phi_1) * D1_0
|
862 |
+
)
|
863 |
+
elif solver_type == 'taylor':
|
864 |
+
x_t = (
|
865 |
+
(torch.exp(log_alpha_t - log_alpha_prev_0)) * x
|
866 |
+
- (sigma_t * phi_1) * model_prev_0
|
867 |
+
- (sigma_t * (phi_1 / h - 1.)) * D1_0
|
868 |
+
)
|
869 |
+
return x_t
|
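# --- Editor's note: descriptive summary, added for clarity. ---
# The multistep variant reuses the cached previous evaluation instead of an
# extra one: with r0 = h_0 / h, D1_0 = (model_prev_0 - model_prev_1) / r0
# approximates h times the derivative of the model output w.r.t. lambda at
# lambda_prev_0, so each step still costs a single NFE.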
870 |
+
|
871 |
+
def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type='dpmsolver'):
|
872 |
+
"""
|
873 |
+
Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`.
|
874 |
+
|
875 |
+
Args:
|
876 |
+
x: A pytorch tensor. The initial value at time `s`.
|
877 |
+
model_prev_list: A list of pytorch tensor. The previous computed model values.
|
878 |
+
t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
|
879 |
+
t: A pytorch tensor. The ending time, with the shape (1,).
|
880 |
+
solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
|
881 |
+
The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
|
882 |
+
Returns:
|
883 |
+
x_t: A pytorch tensor. The approximated solution at time `t`.
|
884 |
+
"""
|
885 |
+
ns = self.noise_schedule
|
886 |
+
model_prev_2, model_prev_1, model_prev_0 = model_prev_list
|
887 |
+
t_prev_2, t_prev_1, t_prev_0 = t_prev_list
|
888 |
+
lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda(
|
889 |
+
t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
|
890 |
+
log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
|
891 |
+
sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
|
892 |
+
alpha_t = torch.exp(log_alpha_t)
|
893 |
+
|
894 |
+
h_1 = lambda_prev_1 - lambda_prev_2
|
895 |
+
h_0 = lambda_prev_0 - lambda_prev_1
|
896 |
+
h = lambda_t - lambda_prev_0
|
897 |
+
r0, r1 = h_0 / h, h_1 / h
|
898 |
+
D1_0 = (1. / r0) * (model_prev_0 - model_prev_1)
|
899 |
+
D1_1 = (1. / r1) * (model_prev_1 - model_prev_2)
|
900 |
+
D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
|
901 |
+
D2 = (1. / (r0 + r1)) * (D1_0 - D1_1)
|
902 |
+
if self.algorithm_type == "dpmsolver++":
|
903 |
+
phi_1 = torch.expm1(-h)
|
904 |
+
phi_2 = phi_1 / h + 1.
|
905 |
+
phi_3 = phi_2 / h - 0.5
|
906 |
+
return (
|
907 |
+
(sigma_t / sigma_prev_0) * x
|
908 |
+
- (alpha_t * phi_1) * model_prev_0
|
909 |
+
+ (alpha_t * phi_2) * D1
|
910 |
+
- (alpha_t * phi_3) * D2
|
911 |
+
)
|
912 |
+
else:
|
913 |
+
phi_1 = torch.expm1(h)
|
914 |
+
phi_2 = phi_1 / h - 1.
|
915 |
+
phi_3 = phi_2 / h - 0.5
|
916 |
+
return (
|
917 |
+
(torch.exp(log_alpha_t - log_alpha_prev_0)) * x
|
918 |
+
- (sigma_t * phi_1) * model_prev_0
|
919 |
+
- (sigma_t * phi_2) * D1
|
920 |
+
- (sigma_t * phi_3) * D2
|
921 |
+
)
|
922 |
+
|
923 |
+
def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpmsolver', r1=None,
|
924 |
+
r2=None):
|
925 |
+
"""
|
926 |
+
Singlestep DPM-Solver with the order `order` from time `s` to time `t`.
|
927 |
+
|
928 |
+
Args:
|
929 |
+
x: A pytorch tensor. The initial value at time `s`.
|
930 |
+
s: A pytorch tensor. The starting time, with the shape (1,).
|
931 |
+
t: A pytorch tensor. The ending time, with the shape (1,).
|
932 |
+
order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
|
933 |
+
return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
|
934 |
+
solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
|
935 |
+
The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
|
936 |
+
r1: A `float`. The hyperparameter of the second-order or third-order solver.
|
937 |
+
r2: A `float`. The hyperparameter of the third-order solver.
|
938 |
+
Returns:
|
939 |
+
x_t: A pytorch tensor. The approximated solution at time `t`.
|
940 |
+
"""
|
941 |
+
if order == 1:
|
942 |
+
return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
|
943 |
+
elif order == 2:
|
944 |
+
return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate,
|
945 |
+
solver_type=solver_type, r1=r1)
|
946 |
+
elif order == 3:
|
947 |
+
return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate,
|
948 |
+
solver_type=solver_type, r1=r1, r2=r2)
|
949 |
+
else:
|
950 |
+
raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}")
|
951 |
+
|
952 |
+
def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type='dpmsolver'):
|
953 |
+
"""
|
954 |
+
Multistep DPM-Solver with the order `order` from time `t_prev_list[-1]` to time `t`.
|
955 |
+
|
956 |
+
Args:
|
957 |
+
x: A pytorch tensor. The initial value at time `s`.
|
958 |
+
model_prev_list: A list of pytorch tensor. The previous computed model values.
|
959 |
+
t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
|
960 |
+
t: A pytorch tensor. The ending time, with the shape (1,).
|
961 |
+
order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
|
962 |
+
solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
|
963 |
+
The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
|
964 |
+
Returns:
|
965 |
+
x_t: A pytorch tensor. The approximated solution at time `t`.
|
966 |
+
"""
|
967 |
+
if order == 1:
|
968 |
+
return self.dpm_solver_first_update(x, t_prev_list[-1], t, model_s=model_prev_list[-1])
|
969 |
+
elif order == 2:
|
970 |
+
return self.multistep_dpm_solver_second_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
|
971 |
+
elif order == 3:
|
972 |
+
return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
|
973 |
+
else:
|
974 |
+
raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}")
|
975 |
+
|
976 |
+
def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5,
|
977 |
+
solver_type='dpmsolver'):
|
978 |
+
"""
|
979 |
+
The adaptive step size solver based on singlestep DPM-Solver.
|
980 |
+
|
981 |
+
Args:
|
982 |
+
x: A pytorch tensor. The initial value at time `t_T`.
|
983 |
+
order: A `int`. The (higher) order of the solver. We only support order == 2 or 3.
|
984 |
+
t_T: A `float`. The starting time of the sampling (default is T).
|
985 |
+
t_0: A `float`. The ending time of the sampling (default is epsilon).
|
986 |
+
h_init: A `float`. The initial step size (for logSNR).
|
987 |
+
atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, following [1].
|
988 |
+
rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05.
|
989 |
+
theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, following [1].
|
990 |
+
t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
|
991 |
+
current time and `t_0` is less than `t_err`. The default setting is 1e-5.
|
992 |
+
solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
|
993 |
+
The type slightly impacts the performance. We recommend using the 'dpmsolver' type.
|
994 |
+
Returns:
|
995 |
+
x_0: A pytorch tensor. The approximated solution at time `t_0`.
|
996 |
+
|
997 |
+
[1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based generative models," arXiv preprint arXiv:2105.14080, 2021.
|
998 |
+
"""
|
999 |
+
ns = self.noise_schedule
|
1000 |
+
s = t_T * torch.ones((1,)).to(x)
|
1001 |
+
lambda_s = ns.marginal_lambda(s)
|
1002 |
+
lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
|
1003 |
+
h = h_init * torch.ones_like(s).to(x)
|
1004 |
+
x_prev = x
|
1005 |
+
nfe = 0
|
1006 |
+
if order == 2:
|
1007 |
+
r1 = 0.5
|
1008 |
+
lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
|
1009 |
+
higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1,
|
1010 |
+
solver_type=solver_type,
|
1011 |
+
**kwargs)
|
1012 |
+
elif order == 3:
|
1013 |
+
r1, r2 = 1. / 3., 2. / 3.
|
1014 |
+
lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1,
|
1015 |
+
return_intermediate=True,
|
1016 |
+
solver_type=solver_type)
|
1017 |
+
higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2,
|
1018 |
+
solver_type=solver_type,
|
1019 |
+
**kwargs)
|
1020 |
+
else:
|
1021 |
+
raise ValueError(
|
1022 |
+
f"For adaptive step size solver, order must be 2 or 3, got {order}"
|
1023 |
+
)
|
1024 |
+
while torch.abs((s - t_0)).mean() > t_err:
|
1025 |
+
t = ns.inverse_lambda(lambda_s + h)
|
1026 |
+
x_lower, lower_noise_kwargs = lower_update(x, s, t)
|
1027 |
+
x_higher = higher_update(x, s, t, **lower_noise_kwargs)
|
1028 |
+
delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
|
1029 |
+
norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
|
1030 |
+
E = norm_fn((x_higher - x_lower) / delta).max()
|
1031 |
+
if torch.all(E <= 1.):
|
1032 |
+
x = x_higher
|
1033 |
+
s = t
|
1034 |
+
x_prev = x_lower
|
1035 |
+
lambda_s = ns.marginal_lambda(s)
|
1036 |
+
h = torch.min(theta * h * torch.float_power(E, -1. / order).float(), lambda_0 - lambda_s)
|
1037 |
+
nfe += order
|
1038 |
+
print('adaptive solver nfe', nfe)
|
1039 |
+
return x
|
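# --- Editor's note: descriptive summary, added for clarity. ---
# Classic embedded-pair step-size control: the lower- and higher-order updates
# share model evaluations; a step is accepted when the scaled error E <= 1, and
# the next logSNR step is h <- min(theta * h * E**(-1/order), lambda_0 - lambda_s),
# following Jolicoeur-Martineau et al. [1].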
1040 |
+
|
1041 |
+
def add_noise(self, x, t, noise=None):
|
1042 |
+
"""
|
1043 |
+
Compute the noised input xt = alpha_t * x + sigma_t * noise.
|
1044 |
+
|
1045 |
+
Args:
|
1046 |
+
x: A `torch.Tensor` with shape `(batch_size, *shape)`.
|
1047 |
+
t: A `torch.Tensor` with shape `(t_size,)`.
|
1048 |
+
Returns:
|
1049 |
+
xt with shape `(t_size, batch_size, *shape)`.
|
1050 |
+
"""
|
1051 |
+
alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
|
1052 |
+
if noise is None:
|
1053 |
+
noise = torch.randn((t.shape[0], *x.shape), device=x.device)
|
1054 |
+
x = x.reshape((-1, *x.shape))
|
1055 |
+
xt = expand_dims(alpha_t, x.dim()) * x + expand_dims(sigma_t, x.dim()) * noise
|
1056 |
+
return xt.squeeze(0) if t.shape[0] == 1 else xt
|
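# --- Editor's note: descriptive summary, added for clarity. ---
# This implements the forward (noising) marginal documented above:
# x_t = alpha_t * x + sigma_t * noise, i.e. a draw from
# q_{t|0}(x_t | x_0) = N(alpha_t * x_0, sigma_t^2 * I).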
1057 |
+
|
1058 |
+
def inverse(self, x, steps=20, t_start=None, t_end=None, order=2, skip_type='time_uniform',
|
1059 |
+
method='multistep', lower_order_final=True, denoise_to_zero=False, solver_type='dpmsolver',
|
1060 |
+
atol=0.0078, rtol=0.05, return_intermediate=False,
|
1061 |
+
):
|
1062 |
+
"""
|
1063 |
+
Inverse the sample `x` from time `t_start` to `t_end` by DPM-Solver.
|
1064 |
+
For discrete-time DPMs, we use `t_start=1/N`, where `N` is the total time steps during training.
|
1065 |
+
"""
|
1066 |
+
t_0 = 1. / self.noise_schedule.total_N if t_start is None else t_start
|
1067 |
+
t_T = self.noise_schedule.T if t_end is None else t_end
|
1068 |
+
assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
|
1069 |
+
return self.sample(x, steps=steps, t_start=t_0, t_end=t_T, order=order, skip_type=skip_type,
|
1070 |
+
method=method, lower_order_final=lower_order_final, denoise_to_zero=denoise_to_zero,
|
1071 |
+
solver_type=solver_type,
|
1072 |
+
atol=atol, rtol=rtol, return_intermediate=return_intermediate)
|
1073 |
+
|
1074 |
+
def sample(self, x, steps=20, t_start=None, t_end=None, order=2, skip_type='time_uniform',
|
1075 |
+
method='multistep', lower_order_final=True, denoise_to_zero=False, solver_type='dpmsolver',
|
1076 |
+
atol=0.0078, rtol=0.05, return_intermediate=False,
|
1077 |
+
):
|
1078 |
+
"""
|
1079 |
+
Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.
|
1080 |
+
|
1081 |
+
=====================================================
|
1082 |
+
|
1083 |
+
We support the following algorithms for both noise prediction model and data prediction model:
|
1084 |
+
- 'singlestep':
|
1085 |
+
Singlestep DPM-Solver (i.e. "DPM-Solver-fast" in the paper), which combines different orders of singlestep DPM-Solver.
|
1086 |
+
We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps).
|
1087 |
+
The total number of function evaluations (NFE) == `steps`.
|
1088 |
+
Given a fixed NFE == `steps`, the sampling procedure is:
|
1089 |
+
- If `order` == 1:
|
1090 |
+
- Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM).
|
1091 |
+
- If `order` == 2:
|
1092 |
+
- Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling.
|
1093 |
+
- If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2.
|
1094 |
+
- If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
|
1095 |
+
- If `order` == 3:
|
1096 |
+
- Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
|
1097 |
+
- If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
|
1098 |
+
- If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1.
|
1099 |
+
- If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2.
|
1100 |
+
- 'multistep':
|
1101 |
+
Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`.
|
1102 |
+
We initialize the first `order` values by lower order multistep solvers.
|
1103 |
+
Given a fixed NFE == `steps`, the sampling procedure is:
|
1104 |
+
Denote K = steps.
|
1105 |
+
- If `order` == 1:
|
1106 |
+
- We use K steps of DPM-Solver-1 (i.e. DDIM).
|
1107 |
+
- If `order` == 2:
|
1108 |
+
- We first use 1 step of DPM-Solver-1, then (K - 1) steps of multistep DPM-Solver-2.
|
1109 |
+
- If `order` == 3:
|
1110 |
+
- We first use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) steps of multistep DPM-Solver-3.
|
1111 |
+
- 'singlestep_fixed':
|
1112 |
+
Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3).
|
1113 |
+
We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE.
|
1114 |
+
- 'adaptive':
|
1115 |
+
Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper).
|
1116 |
+
We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`.
|
1117 |
+
You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computation costs
|
1118 |
+
(NFE) and the sample quality.
|
1119 |
+
- If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2.
|
1120 |
+
- If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3.
|
1121 |
+
|
1122 |
+
=====================================================
|
1123 |
+
|
1124 |
+
Some advice on choosing the algorithm:
|
1125 |
+
- For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs:
|
1126 |
+
Use singlestep DPM-Solver or DPM-Solver++ ("DPM-Solver-fast" in the paper) with `order = 3`.
|
1127 |
+
e.g., DPM-Solver:
|
1128 |
+
>>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver")
|
1129 |
+
>>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
|
1130 |
+
skip_type='time_uniform', method='singlestep')
|
1131 |
+
e.g., DPM-Solver++:
|
1132 |
+
>>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
|
1133 |
+
>>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
|
1134 |
+
skip_type='time_uniform', method='singlestep')
|
1135 |
+
- For **guided sampling with large guidance scale** by DPMs:
|
1136 |
+
Use multistep DPM-Solver with `algorithm_type="dpmsolver++"` and `order = 2`.
|
1137 |
+
e.g.
|
1138 |
+
>>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
|
1139 |
+
>>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2,
|
1140 |
+
skip_type='time_uniform', method='multistep')
|
1141 |
+
|
1142 |
+
We support three types of `skip_type`:
|
1143 |
+
- 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolution images**
|
1144 |
+
- 'time_uniform': uniform time for the time steps. **Recommended for high-resolution images**.
|
1145 |
+
- 'time_quadratic': quadratic time for the time steps.
|
1146 |
+
|
1147 |
+
=====================================================
|
1148 |
+
Args:
|
1149 |
+
x: A pytorch tensor. The initial value at time `t_start`
|
1150 |
+
e.g. if `t_start` == T, then `x` is a sample from the standard normal distribution.
|
1151 |
+
steps: A `int`. The total number of function evaluations (NFE).
|
1152 |
+
t_start: A `float`. The starting time of the sampling.
|
1153 |
+
If `t_start` is None, we use self.noise_schedule.T (default is 1.0).
|
1154 |
+
t_end: A `float`. The ending time of the sampling.
|
1155 |
+
If `t_end` is None, we use 1. / self.noise_schedule.total_N.
|
1156 |
+
e.g. if total_N == 1000, we have `t_end` == 1e-3.
|
1157 |
+
For discrete-time DPMs:
|
1158 |
+
- We recommend `t_end` == 1. / self.noise_schedule.total_N.
|
1159 |
+
For continuous-time DPMs:
|
1160 |
+
- We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15.
|
1161 |
+
order: A `int`. The order of DPM-Solver.
|
1162 |
+
skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'.
|
1163 |
+
method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'.
|
1164 |
+
denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step.
|
1165 |
+
Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1).
|
1166 |
+
|
1167 |
+
This trick was first proposed by DDPM (https://arxiv.org/abs/2006.11239) and
|
1168 |
+
score_sde (https://arxiv.org/abs/2011.13456). This trick can improve the FID
|
1169 |
+
for diffusion model sampling by diffusion SDEs for low-resolution images
|
1170 |
+
(such as CIFAR-10). However, we observed that this trick does not matter for
|
1171 |
+
high-resolution images. As it needs an additional NFE, we do not recommend
|
1172 |
+
it for high-resolution images.
|
1173 |
+
lower_order_final: A `bool`. Whether to use lower order solvers at the final steps.
|
1174 |
+
Only valid for `method=multistep` and `steps < 15`. We empirically find that
|
1175 |
+
this trick is a key to stabilizing the sampling by DPM-Solver with very few steps
|
1176 |
+
(especially for steps <= 10). So we recommend to set it to be `True`.
|
1177 |
+
solver_type: A `str`. The taylor expansion type for the solver. `dpmsolver` or `taylor`. We recommend `dpmsolver`.
|
1178 |
+
atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
|
1179 |
+
rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
|
1180 |
+
return_intermediate: A `bool`. Whether to save the xt at each step.
|
1181 |
+
When set to `True`, method returns a tuple (x0, intermediates); when set to False, method returns only x0.
|
1182 |
+
Returns:
|
1183 |
+
x_end: A pytorch tensor. The approximated solution at time `t_end`.
|
1184 |
+
|
1185 |
+
"""
|
1186 |
+
        t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
        if return_intermediate:
            assert method in ['multistep', 'singlestep',
                              'singlestep_fixed'], "Cannot use adaptive solver when saving intermediate values"
        if self.correcting_xt_fn is not None:
            assert method in ['multistep', 'singlestep',
                              'singlestep_fixed'], "Cannot use adaptive solver when correcting_xt_fn is not None"
        device = x.device
        intermediates = []
        with torch.no_grad():
            if method == 'adaptive':
                x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol,
                                             solver_type=solver_type)
            elif method == 'multistep':
                assert steps >= order
                timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
                assert timesteps.shape[0] - 1 == steps
                # Init the initial values.
                step = 0
                t = timesteps[step]
                t_prev_list = [t]
                model_prev_list = [self.model_fn(x, t)]
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)
                # Init the first `order` values by lower order multistep DPM-Solver.
                for step in range(1, order):
                    t = timesteps[step]
                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, t, step,
                                                         solver_type=solver_type)
                    if self.correcting_xt_fn is not None:
                        x = self.correcting_xt_fn(x, t, step)
                    if return_intermediate:
                        intermediates.append(x)
                    t_prev_list.append(t)
                    model_prev_list.append(self.model_fn(x, t))
                # Compute the remaining values by `order`-th order multistep DPM-Solver.
                for step in tqdm(range(order, steps + 1)):
                    t = timesteps[step]
                    # We only use lower order for steps < 10
                    if lower_order_final and steps < 10:
                        step_order = min(order, steps + 1 - step)
                    else:
                        step_order = order
                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, t, step_order,
                                                         solver_type=solver_type)
                    if self.correcting_xt_fn is not None:
                        x = self.correcting_xt_fn(x, t, step)
                    if return_intermediate:
                        intermediates.append(x)
                    for i in range(order - 1):
                        t_prev_list[i] = t_prev_list[i + 1]
                        model_prev_list[i] = model_prev_list[i + 1]
                    t_prev_list[-1] = t
                    # We do not need to evaluate the final model value.
                    if step < steps:
                        model_prev_list[-1] = self.model_fn(x, t)
            elif method in ['singlestep', 'singlestep_fixed']:
                if method == 'singlestep':
                    timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps,
                                                                                                  order=order,
                                                                                                  skip_type=skip_type,
                                                                                                  t_T=t_T, t_0=t_0,
                                                                                                  device=device)
                elif method == 'singlestep_fixed':
                    K = steps // order
                    orders = [order, ] * K
                    timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
                for step, order in enumerate(orders):
                    s, t = timesteps_outer[step], timesteps_outer[step + 1]
                    timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=s.item(), t_0=t.item(), N=order,
                                                          device=device)
                    lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
                    h = lambda_inner[-1] - lambda_inner[0]
                    r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
                    r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
                    x = self.singlestep_dpm_solver_update(x, s, t, order, solver_type=solver_type, r1=r1, r2=r2)
                    if self.correcting_xt_fn is not None:
                        x = self.correcting_xt_fn(x, t, step)
                    if return_intermediate:
                        intermediates.append(x)
            else:
                raise ValueError(f"Got wrong method {method}")
            if denoise_to_zero:
                t = torch.ones((1,)).to(device) * t_0
                x = self.denoise_to_zero_fn(x, t)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step + 1)
                if return_intermediate:
                    intermediates.append(x)
        return (x, intermediates) if return_intermediate else x


#############################################################
# other utility functions
#############################################################

def interpolate_fn(x, xp, yp):
    """
    A piecewise linear function y = f(x), using xp and yp as keypoints.
    We implement f(x) in a differentiable way (i.e. applicable for autograd).
    The function f(x) is well-defined for all x-axis. (For x beyond the bounds of xp, we use the outmost points of xp to define the linear function.)

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    N, K = x.shape[0], xp.shape[1]
    all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
    sorted_all_x, x_indices = torch.sort(all_x, dim=2)
    x_idx = torch.argmin(x_indices, dim=2)
    cand_start_idx = x_idx - 1
    start_idx = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(1, device=x.device),
        torch.where(
            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
        ),
    )
    end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
    start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
    end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
    start_idx2 = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(0, device=x.device),
        torch.where(
            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
        ),
    )
    y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
    start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
    end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
    return start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)


def expand_dims(v, dims):
    """
    Expand the tensor `v` to the dim `dims`.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: an `int`.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
    """
    return v[(...,) + (None,) * (dims - 1)]
DiT_VAE/diffusion/model/edm_sample.py
ADDED
@@ -0,0 +1,168 @@
import numpy as np
import torch  # added: both samplers below rely on torch throughout
from tqdm import tqdm


# ----------------------------------------------------------------------------
# Proposed EDM sampler (Algorithm 2).

def edm_sampler(
        net, latents, class_labels=None, cfg_scale=None, randn_like=torch.randn_like,
        num_steps=18, sigma_min=0.002, sigma_max=80, rho=7,
        S_churn=0, S_min=0, S_max=float('inf'), S_noise=1, **kwargs
):
    # Adjust noise levels based on what's supported by the network.
    sigma_min = max(sigma_min, net.sigma_min)
    sigma_max = min(sigma_max, net.sigma_max)

    # Time step discretization.
    step_indices = torch.arange(num_steps, dtype=torch.float64, device=latents.device)
    t_steps = (sigma_max ** (1 / rho) + step_indices / (num_steps - 1) * (
            sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho
    t_steps = torch.cat([net.round_sigma(t_steps), torch.zeros_like(t_steps[:1])])  # t_N = 0

    # Main sampling loop.
    x_next = latents.to(torch.float64) * t_steps[0]
    for i, (t_cur, t_next) in tqdm(list(enumerate(zip(t_steps[:-1], t_steps[1:])))):  # 0, ..., N-1
        x_cur = x_next

        # Increase noise temporarily.
        gamma = min(S_churn / num_steps, np.sqrt(2) - 1) if S_min <= t_cur <= S_max else 0
        t_hat = net.round_sigma(t_cur + gamma * t_cur)
        x_hat = x_cur + (t_hat ** 2 - t_cur ** 2).sqrt() * S_noise * randn_like(x_cur)

        # Euler step.
        denoised = net(x_hat.float(), t_hat, class_labels, cfg_scale, **kwargs)['x'].to(torch.float64)
        d_cur = (x_hat - denoised) / t_hat
        x_next = x_hat + (t_next - t_hat) * d_cur

        # Apply 2nd order correction.
        if i < num_steps - 1:
            denoised = net(x_next.float(), t_next, class_labels, cfg_scale, **kwargs)['x'].to(torch.float64)
            d_prime = (x_next - denoised) / t_next
            x_next = x_hat + (t_next - t_hat) * (0.5 * d_cur + 0.5 * d_prime)

    return x_next


# ----------------------------------------------------------------------------
# Generalized ablation sampler, representing the superset of all sampling
# methods discussed in the paper.

def ablation_sampler(
        net, latents, class_labels=None, cfg_scale=None, feat=None, randn_like=torch.randn_like,
        num_steps=18, sigma_min=None, sigma_max=None, rho=7,
        solver='heun', discretization='edm', schedule='linear', scaling='none',
        epsilon_s=1e-3, C_1=0.001, C_2=0.008, M=1000, alpha=1,
        S_churn=0, S_min=0, S_max=float('inf'), S_noise=1,
):
    assert solver in ['euler', 'heun']
    assert discretization in ['vp', 've', 'iddpm', 'edm']
    assert schedule in ['vp', 've', 'linear']
    assert scaling in ['vp', 'none']

    # Helper functions for VP & VE noise level schedules.
    vp_sigma = lambda beta_d, beta_min: lambda t: (np.e ** (0.5 * beta_d * (t ** 2) + beta_min * t) - 1) ** 0.5
    vp_sigma_deriv = lambda beta_d, beta_min: lambda t: 0.5 * (beta_min + beta_d * t) * (sigma(t) + 1 / sigma(t))
    vp_sigma_inv = lambda beta_d, beta_min: lambda sigma: ((beta_min ** 2 + 2 * beta_d * (
            sigma ** 2 + 1).log()).sqrt() - beta_min) / beta_d
    ve_sigma = lambda t: t.sqrt()
    ve_sigma_deriv = lambda t: 0.5 / t.sqrt()
    ve_sigma_inv = lambda sigma: sigma ** 2

    # Select default noise level range based on the specified time step discretization.
    if sigma_min is None:
        vp_def = vp_sigma(beta_d=19.1, beta_min=0.1)(t=epsilon_s)
        sigma_min = {'vp': vp_def, 've': 0.02, 'iddpm': 0.002, 'edm': 0.002}[discretization]
    if sigma_max is None:
        vp_def = vp_sigma(beta_d=19.1, beta_min=0.1)(t=1)
        sigma_max = {'vp': vp_def, 've': 100, 'iddpm': 81, 'edm': 80}[discretization]

    # Adjust noise levels based on what's supported by the network.
    sigma_min = max(sigma_min, net.sigma_min)
    sigma_max = min(sigma_max, net.sigma_max)

    # Compute corresponding betas for VP.
    vp_beta_d = 2 * (np.log(sigma_min ** 2 + 1) / epsilon_s - np.log(sigma_max ** 2 + 1)) / (epsilon_s - 1)
    vp_beta_min = np.log(sigma_max ** 2 + 1) - 0.5 * vp_beta_d

    # Define time steps in terms of noise level.
    step_indices = torch.arange(num_steps, dtype=torch.float64, device=latents.device)
    if discretization == 'vp':
        orig_t_steps = 1 + step_indices / (num_steps - 1) * (epsilon_s - 1)
        sigma_steps = vp_sigma(vp_beta_d, vp_beta_min)(orig_t_steps)
    elif discretization == 've':
        orig_t_steps = (sigma_max ** 2) * ((sigma_min ** 2 / sigma_max ** 2) ** (step_indices / (num_steps - 1)))
        sigma_steps = ve_sigma(orig_t_steps)
    elif discretization == 'iddpm':
        u = torch.zeros(M + 1, dtype=torch.float64, device=latents.device)
        alpha_bar = lambda j: (0.5 * np.pi * j / M / (C_2 + 1)).sin() ** 2
        for j in torch.arange(M, 0, -1, device=latents.device):  # M, ..., 1
            u[j - 1] = ((u[j] ** 2 + 1) / (alpha_bar(j - 1) / alpha_bar(j)).clip(min=C_1) - 1).sqrt()
        u_filtered = u[torch.logical_and(u >= sigma_min, u <= sigma_max)]
        sigma_steps = u_filtered[((len(u_filtered) - 1) / (num_steps - 1) * step_indices).round().to(torch.int64)]
    else:
        assert discretization == 'edm'
        sigma_steps = (sigma_max ** (1 / rho) + step_indices / (num_steps - 1) * (
                sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho

    # Define noise level schedule.
    if schedule == 'vp':
        sigma = vp_sigma(vp_beta_d, vp_beta_min)
        sigma_deriv = vp_sigma_deriv(vp_beta_d, vp_beta_min)
        sigma_inv = vp_sigma_inv(vp_beta_d, vp_beta_min)
    elif schedule == 've':
        sigma = ve_sigma
        sigma_deriv = ve_sigma_deriv
        sigma_inv = ve_sigma_inv
    else:
        assert schedule == 'linear'
        sigma = lambda t: t
        sigma_deriv = lambda t: 1
        sigma_inv = lambda sigma: sigma

    # Define scaling schedule.
    if scaling == 'vp':
        s = lambda t: 1 / (1 + sigma(t) ** 2).sqrt()
        s_deriv = lambda t: -sigma(t) * sigma_deriv(t) * (s(t) ** 3)
    else:
        assert scaling == 'none'
        s = lambda t: 1
        s_deriv = lambda t: 0

    # Compute final time steps based on the corresponding noise levels.
    t_steps = sigma_inv(net.round_sigma(sigma_steps))
    t_steps = torch.cat([t_steps, torch.zeros_like(t_steps[:1])])  # t_N = 0

    # Main sampling loop.
    t_next = t_steps[0]
    x_next = latents.to(torch.float64) * (sigma(t_next) * s(t_next))
    for i, (t_cur, t_next) in enumerate(zip(t_steps[:-1], t_steps[1:])):  # 0, ..., N-1
        x_cur = x_next

        # Increase noise temporarily.
        gamma = min(S_churn / num_steps, np.sqrt(2) - 1) if S_min <= sigma(t_cur) <= S_max else 0
        t_hat = sigma_inv(net.round_sigma(sigma(t_cur) + gamma * sigma(t_cur)))
        x_hat = s(t_hat) / s(t_cur) * x_cur + (sigma(t_hat) ** 2 - sigma(t_cur) ** 2).clip(min=0).sqrt() * s(
            t_hat) * S_noise * randn_like(x_cur)

        # Euler step.
        h = t_next - t_hat
        denoised = net(x_hat.float() / s(t_hat), sigma(t_hat), class_labels, cfg_scale, feat=feat)['x'].to(
            torch.float64)
        d_cur = (sigma_deriv(t_hat) / sigma(t_hat) + s_deriv(t_hat) / s(t_hat)) * x_hat - sigma_deriv(t_hat) * s(
            t_hat) / sigma(t_hat) * denoised
        x_prime = x_hat + alpha * h * d_cur
        t_prime = t_hat + alpha * h

        # Apply 2nd order correction.
        if solver == 'euler' or i == num_steps - 1:
            x_next = x_hat + h * d_cur
        else:
            assert solver == 'heun'
            denoised = net(x_prime.float() / s(t_prime), sigma(t_prime), class_labels, cfg_scale, feat=feat)['x'].to(
                torch.float64)
            d_prime = (sigma_deriv(t_prime) / sigma(t_prime) + s_deriv(t_prime) / s(t_prime)) * x_prime - sigma_deriv(
                t_prime) * s(t_prime) / sigma(t_prime) * denoised
            x_next = x_hat + h * ((1 - 1 / (2 * alpha)) * d_cur + 1 / (2 * alpha) * d_prime)

    return x_next
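
Both samplers above derive their noise levels from the same rho-spaced schedule (the 'edm' discretization). A standalone sketch with illustrative values (editorial, not part of the commit):

    import torch

    num_steps, sigma_min, sigma_max, rho = 5, 0.002, 80.0, 7
    i = torch.arange(num_steps, dtype=torch.float64)
    t_steps = (sigma_max ** (1 / rho) + i / (num_steps - 1)
               * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho
    print(t_steps)  # roughly [80.0, 17.5, 2.5, 0.17, 0.002]: coarse at high noise, fine near sigma_min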
DiT_VAE/diffusion/model/gaussian_diffusion.py
ADDED
@@ -0,0 +1,1006 @@
# Modified from OpenAI's diffusion repos
#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py


import enum
import math

import numpy as np
import torch as th
import torch.nn.functional as F

from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl


def mean_flat(tensor):
    """
    Take the mean over all non-batch dimensions.
    """
    return tensor.mean(dim=list(range(1, len(tensor.shape))))

class ModelMeanType(enum.Enum):
    """
    Which type of output the model predicts.
    """

    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
    START_X = enum.auto()  # the model predicts x_0
    EPSILON = enum.auto()  # the model predicts epsilon


class ModelVarType(enum.Enum):
    """
    What is used as the model's output variance.
    The LEARNED_RANGE option has been added to allow the model to predict
    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
    """

    LEARNED = enum.auto()
    FIXED_SMALL = enum.auto()
    FIXED_LARGE = enum.auto()
    LEARNED_RANGE = enum.auto()


class LossType(enum.Enum):
    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
    RESCALED_MSE = (
        enum.auto()
    )  # use raw MSE loss (with RESCALED_KL when learning variances)
    KL = enum.auto()  # use the variational lower-bound
    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB

    def is_vb(self):
        return self in [LossType.KL, LossType.RESCALED_KL]

def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
    betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
    warmup_time = int(num_diffusion_timesteps * warmup_frac)
    betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
    return betas


def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
    """
    This is the deprecated API for creating beta schedules.
    See get_named_beta_schedule() for the new library of schedules.
    """
    if beta_schedule == "quad":
        betas = (
                np.linspace(
                    beta_start ** 0.5,
                    beta_end ** 0.5,
                    num_diffusion_timesteps,
                    dtype=np.float64,
                )
                ** 2
        )
    elif beta_schedule == "linear":
        betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
    elif beta_schedule == "warmup10":
        betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
    elif beta_schedule == "warmup50":
        betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
    elif beta_schedule == "const":
        betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
    elif beta_schedule == "jsd":  # 1/T, 1/(T-1), 1/(T-2), ..., 1
        betas = 1.0 / np.linspace(
            num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
        )
    else:
        raise NotImplementedError(beta_schedule)
    assert betas.shape == (num_diffusion_timesteps,)
    return betas


def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Get a pre-defined beta schedule for the given name.
    The beta schedule library consists of beta schedules which remain similar
    in the limit of num_diffusion_timesteps.
    Beta schedules may be added, but should not be removed or changed once
    they are committed to maintain backwards compatibility.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, extended to work for any number of
        # diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        return get_beta_schedule(
            "linear",
            beta_start=scale * 0.0001,
            beta_end=scale * 0.02,
            num_diffusion_timesteps=num_diffusion_timesteps,
        )
    elif schedule_name == "squaredcos_cap_v2":
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
    else:
        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")


def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function,
    which defines the cumulative product of (1-beta) over time from t = [0,1].
    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """
    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return np.array(betas)

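# Editorial usage sketch (not part of the commit); both calls return numpy
# arrays of shape (1000,):
# >>> betas_lin = get_named_beta_schedule("linear", 1000)             # 1e-4 ... 0.02
# >>> betas_cos = get_named_beta_schedule("squaredcos_cap_v2", 1000)  # cosine, capped at 0.999
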
class GaussianDiffusion:
    """
    Utilities for training and sampling diffusion models.
    Originally ported from this codebase:
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
    :param betas: a 1-D numpy array of betas for each diffusion timestep,
                  starting at T and going to 1.
    """

    def __init__(
            self,
            *,
            betas,
            model_mean_type,
            model_var_type,
            loss_type,
            snr=False,
            return_startx=False,
    ):

        self.model_mean_type = model_mean_type
        self.model_var_type = model_var_type
        self.loss_type = loss_type
        self.snr = snr
        self.return_startx = return_startx

        # Use float64 for accuracy.
        betas = np.array(betas, dtype=np.float64)
        self.betas = betas
        assert len(betas.shape) == 1, "betas must be 1-D"
        assert (betas > 0).all() and (betas <= 1).all()

        self.num_timesteps = int(betas.shape[0])

        alphas = 1.0 - betas
        self.alphas_cumprod = np.cumprod(alphas, axis=0)
        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = (
                betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
        self.posterior_log_variance_clipped = np.log(
            np.append(self.posterior_variance[1], self.posterior_variance[1:])
        ) if len(self.posterior_variance) > 1 else np.array([])

        self.posterior_mean_coef1 = (
                betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        self.posterior_mean_coef2 = (
                (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
        )

    def q_mean_variance(self, x_start, t):
        """
        Get the distribution q(x_t | x_0).
        :param x_start: the [N x C x ...] tensor of noiseless inputs.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
        """
        mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
        log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
        return mean, variance, log_variance

    def q_sample(self, x_start, t, noise=None):
        """
        Diffuse the data for a given number of diffusion steps.
        In other words, sample from q(x_t | x_0).
        :param x_start: the initial data batch.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :param noise: if specified, the split-out normal noise.
        :return: A noisy version of x_start.
        """
        if noise is None:
            noise = th.randn_like(x_start)
        assert noise.shape == x_start.shape
        return (
                _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
                + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
        )

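    # Editorial usage sketch (not part of the commit): q_sample implements the
    # closed form x_t = sqrt(acp_t) * x_0 + sqrt(1 - acp_t) * noise, with
    # acp_t = alphas_cumprod[t]. For example:
    # >>> diffusion = GaussianDiffusion(betas=get_named_beta_schedule("linear", 1000),
    # ...                               model_mean_type=ModelMeanType.EPSILON,
    # ...                               model_var_type=ModelVarType.LEARNED_RANGE,
    # ...                               loss_type=LossType.MSE)
    # >>> x_t = diffusion.q_sample(x_start, t=th.tensor([500]))  # x_start: [1, C, H, W]
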
    def q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Compute the mean and variance of the diffusion posterior:
            q(x_{t-1} | x_t, x_0)
        """
        assert x_start.shape == x_t.shape
        posterior_mean = (
                _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
                + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = _extract_into_tensor(
            self.posterior_log_variance_clipped, t, x_t.shape
        )
        assert (
                posterior_mean.shape[0]
                == posterior_variance.shape[0]
                == posterior_log_variance_clipped.shape[0]
                == x_start.shape[0]
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
        """
        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
        the initial x, x_0.
        :param model: the model, which takes a signal and a batch of timesteps
                      as input.
        :param x: the [N x C x ...] tensor at time t.
        :param t: a 1-D Tensor of timesteps.
        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample. Applies before
            clip_denoised.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict with the following keys:
                 - 'mean': the model mean output.
                 - 'variance': the model variance output.
                 - 'log_variance': the log of 'variance'.
                 - 'pred_xstart': the prediction for x_0.
        """
        if model_kwargs is None:
            model_kwargs = {}

        B, C = x.shape[:2]
        assert t.shape == (B,)
        model_output = model(x, t, **model_kwargs)
        if isinstance(model_output, tuple):
            model_output, extra = model_output
        else:
            extra = None

        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
            assert model_output.shape == (B, C * 2, *x.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
            max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
            # The model_var_values is [-1, 1] for [min_var, max_var].
            frac = (model_var_values + 1) / 2
            model_log_variance = frac * max_log + (1 - frac) * min_log
            model_variance = th.exp(model_log_variance)
        elif self.model_var_type in [ModelVarType.FIXED_LARGE, ModelVarType.FIXED_SMALL]:
            model_variance, model_log_variance = {
                # for fixedlarge, we set the initial (log-)variance like so
                # to get a better decoder log likelihood.
                ModelVarType.FIXED_LARGE: (
                    np.append(self.posterior_variance[1], self.betas[1:]),
                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
                ),
                ModelVarType.FIXED_SMALL: (
                    self.posterior_variance,
                    self.posterior_log_variance_clipped,
                ),
            }[self.model_var_type]
            model_variance = _extract_into_tensor(model_variance, t, x.shape)
            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
        else:
            model_variance = th.zeros_like(model_output)
            model_log_variance = th.zeros_like(model_output)

        def process_xstart(x):
            if denoised_fn is not None:
                x = denoised_fn(x)
            return x.clamp(-1, 1) if clip_denoised else x

        if self.model_mean_type == ModelMeanType.START_X:
            pred_xstart = process_xstart(model_output)
        else:
            pred_xstart = process_xstart(
                self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
            )
        model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)

        assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
        return {
            "mean": model_mean,
            "variance": model_variance,
            "log_variance": model_log_variance,
            "pred_xstart": pred_xstart,
            "extra": extra,
        }

    def _predict_xstart_from_eps(self, x_t, t, eps):
        assert x_t.shape == eps.shape
        return (
                _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
                - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
        )

    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
        return (
                       _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
               ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)

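    # Editorial note (not part of the commit): the two helpers above are exact
    # inverses for a fixed (x_t, t), since
    #     x_0 = sqrt(1 / acp_t) * x_t - sqrt(1 / acp_t - 1) * eps
    # rearranges to
    #     eps = (sqrt(1 / acp_t) * x_t - x_0) / sqrt(1 / acp_t - 1),
    # with acp_t = alphas_cumprod[t]; round-tripping recovers eps up to float error.
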
    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute the mean for the previous step, given a function cond_fn that
        computes the gradient of a conditional log probability with respect to
        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
        condition on y.
        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
        """
        gradient = cond_fn(x, t, **model_kwargs)
        return p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()

    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute what the p_mean_variance output would have been, should the
        model's score function be conditioned by cond_fn.
        See condition_mean() for details on cond_fn.
        Unlike condition_mean(), this instead uses the conditioning strategy
        from Song et al (2020).
        """
        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)

        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)

        out = p_mean_var.copy()
        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
        out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
        return out

    def p_sample(
            self,
            model,
            x,
            t,
            clip_denoised=True,
            denoised_fn=None,
            cond_fn=None,
            model_kwargs=None,
    ):
        """
        Sample x_{t-1} from the model at the given timestep.
        :param model: the model to sample from.
        :param x: the current tensor at x_{t-1}.
        :param t: the value of t, starting at 0 for the first diffusion step.
        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - 'sample': a random sample from the model.
                 - 'pred_xstart': a prediction of x_0.
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        noise = th.randn_like(x)
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        if cond_fn is not None:
            out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
        sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}

    def p_sample_loop(
            self,
            model,
            shape,
            noise=None,
            clip_denoised=True,
            denoised_fn=None,
            cond_fn=None,
            model_kwargs=None,
            device=None,
            progress=False,
    ):
        """
        Generate samples from the model.
        :param model: the model module.
        :param shape: the shape of the samples, (N, C, H, W).
        :param noise: if specified, the noise from the encoder to sample.
                      Should be of the same shape as `shape`.
        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param device: if specified, the device to create the samples on.
                       If not specified, use a model parameter's device.
        :param progress: if True, show a tqdm progress bar.
        :return: a non-differentiable batch of samples.
        """
        final = None
        for sample in self.p_sample_loop_progressive(
                model,
                shape,
                noise=noise,
                clip_denoised=clip_denoised,
                denoised_fn=denoised_fn,
                cond_fn=cond_fn,
                model_kwargs=model_kwargs,
                device=device,
                progress=progress,
        ):
            final = sample
        return final["sample"]

    def p_sample_loop_progressive(
            self,
            model,
            shape,
            noise=None,
            clip_denoised=True,
            denoised_fn=None,
            cond_fn=None,
            model_kwargs=None,
            device=None,
            progress=False,
    ):
        """
        Generate samples from the model and yield intermediate samples from
        each timestep of diffusion.
        Arguments are the same as p_sample_loop().
        Returns a generator over dicts, where each dict is the return value of
        p_sample().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        img = noise if noise is not None else th.randn(*shape, device=device)
        indices = list(range(self.num_timesteps))[::-1]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                out = self.p_sample(
                    model,
                    img,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                )
                yield out
                img = out["sample"]

    def ddim_sample(
            self,
            model,
            x,
            t,
            clip_denoised=True,
            denoised_fn=None,
            cond_fn=None,
            model_kwargs=None,
            eta=0.0,
    ):
        """
        Sample x_{t-1} from the model using DDIM.
        Same usage as p_sample().
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        if cond_fn is not None:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)

        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])

        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
        sigma = (
                eta
                * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
                * th.sqrt(1 - alpha_bar / alpha_bar_prev)
        )
        # Equation 12.
        noise = th.randn_like(x)
        mean_pred = (
                out["pred_xstart"] * th.sqrt(alpha_bar_prev)
                + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
        )
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        sample = mean_pred + nonzero_mask * sigma * noise
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}

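    # Editorial note (not part of the commit): with eta = 0 the update above is
    # the deterministic DDIM step of Song et al. (their Eq. 12),
    #     x_{t-1} = sqrt(acp_{t-1}) * x0_pred + sqrt(1 - acp_{t-1}) * eps,
    # while eta = 1 recovers DDPM-like ancestral sampling variance.
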
    def ddim_reverse_sample(
            self,
            model,
            x,
            t,
            clip_denoised=True,
            denoised_fn=None,
            cond_fn=None,
            model_kwargs=None,
            eta=0.0,
    ):
        """
        Sample x_{t+1} from the model using DDIM reverse ODE.
        """
        assert eta == 0.0, "Reverse ODE only for deterministic path"
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        if cond_fn is not None:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = (
                      _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
                      - out["pred_xstart"]
              ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)

        # Equation 12. reversed
        mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps

        return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}

    def ddim_sample_loop(
            self,
            model,
            shape,
            noise=None,
            clip_denoised=True,
            denoised_fn=None,
            cond_fn=None,
            model_kwargs=None,
            device=None,
            progress=False,
            eta=0.0,
    ):
        """
        Generate samples from the model using DDIM.
        Same usage as p_sample_loop().
        """
        final = None
        for sample in self.ddim_sample_loop_progressive(
                model,
                shape,
                noise=noise,
                clip_denoised=clip_denoised,
                denoised_fn=denoised_fn,
                cond_fn=cond_fn,
                model_kwargs=model_kwargs,
                device=device,
                progress=progress,
                eta=eta,
        ):
            final = sample
        return final["sample"]

    def ddim_sample_loop_progressive(
            self,
            model,
            shape,
            noise=None,
            clip_denoised=True,
            denoised_fn=None,
            cond_fn=None,
            model_kwargs=None,
            device=None,
            progress=False,
            eta=0.0,
    ):
        """
        Use DDIM to sample from the model and yield intermediate samples from
        each timestep of DDIM.
        Same usage as p_sample_loop_progressive().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        img = noise if noise is not None else th.randn(*shape, device=device)
        indices = list(range(self.num_timesteps))[::-1]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                out = self.ddim_sample(
                    model,
                    img,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                    eta=eta,
                )
                yield out
                img = out["sample"]

    def _vb_terms_bpd(
            self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
    ):
        """
        Get a term for the variational lower-bound.
        The resulting units are bits (rather than nats, as one might expect).
        This allows for comparison to other papers.
        :return: a dict with the following keys:
                 - 'output': a shape [N] tensor of NLLs or KLs.
                 - 'pred_xstart': the x_0 predictions.
        """
        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
            x_start=x_start, x_t=x_t, t=t
        )
        out = self.p_mean_variance(
            model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
        )
        kl = normal_kl(
            true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
        )
        kl = mean_flat(kl) / np.log(2.0)

        decoder_nll = -discretized_gaussian_log_likelihood(
            x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
        )
        assert decoder_nll.shape == x_start.shape
        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)

        # At the first timestep return the decoder NLL,
        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
        output = th.where((t == 0), decoder_nll, kl)
        return {"output": output, "pred_xstart": out["pred_xstart"]}

    def training_losses(self, model, x_start, timestep, model_kwargs=None, noise=None, skip_noise=False):
        """
        Compute training losses for a single timestep.
        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param timestep: a batch of timestep indices.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param noise: if specified, the specific Gaussian noise to try to remove.
        :return: a dict with the key "loss" containing a tensor of shape [N].
                 Some mean or variance settings may also have other keys.
        """
        t = timestep
        if model_kwargs is None:
            model_kwargs = {}
        if skip_noise:
            x_t = x_start
        else:
            if noise is None:
                noise = th.randn_like(x_start)
            x_t = self.q_sample(x_start, t, noise=noise)

        terms = {}

        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
            terms["loss"] = self._vb_terms_bpd(
                model=model,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]
            if self.loss_type == LossType.RESCALED_KL:
                terms["loss"] *= self.num_timesteps
        elif self.loss_type in [LossType.MSE, LossType.RESCALED_MSE]:
            model_output = model(x_t, t, **model_kwargs)
            if isinstance(model_output, dict) and model_output.get('x', None) is not None:
                output = model_output['x']
            else:
                output = model_output

            if self.return_startx and self.model_mean_type == ModelMeanType.EPSILON:
                return self._extracted_from_training_losses_diffusers(x_t, output, t)
            # self.model_var_type = ModelVarType.LEARNED_RANGE:4
            if self.model_var_type in [
                ModelVarType.LEARNED,
                ModelVarType.LEARNED_RANGE,
            ]:
                B, C = x_t.shape[:2]
                assert output.shape == (B, C * 2, *x_t.shape[2:])
                output, model_var_values = th.split(output, C, dim=1)
                # Learn the variance using the variational bound, but don't let it affect our mean prediction.
                frozen_out = th.cat([output.detach(), model_var_values], dim=1)
                # vb variational bound
                terms["vb"] = self._vb_terms_bpd(
                    model=lambda *args, r=frozen_out, **kwargs: r,
                    x_start=x_start,
                    x_t=x_t,
                    t=t,
                    clip_denoised=False,
                )["output"]
                if self.loss_type == LossType.RESCALED_MSE:
                    # Divide by 1000 for equivalence with initial implementation.
                    # Without a factor of 1/1000, the VB term hurts the MSE term.
                    terms["vb"] *= self.num_timesteps / 1000.0

            target = {
                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
                    x_start=x_start, x_t=x_t, t=t
                )[0],
                ModelMeanType.START_X: x_start,
                ModelMeanType.EPSILON: noise,
            }[self.model_mean_type]
            assert output.shape == target.shape == x_start.shape
            if self.snr:
                if self.model_mean_type == ModelMeanType.START_X:
                    pred_noise = self._predict_eps_from_xstart(x_t=x_t, t=t, pred_xstart=output)
                    pred_startx = output
                elif self.model_mean_type == ModelMeanType.EPSILON:
                    pred_noise = output
                    pred_startx = self._predict_xstart_from_eps(x_t=x_t, t=t, eps=output)
                # terms["mse_eps"] = mean_flat((noise - pred_noise) ** 2)
                # terms["mse_x0"] = mean_flat((x_start - pred_startx) ** 2)

                t = t[:, None, None, None].expand(pred_startx.shape)  # [128, 4, 32, 32]
                # best
                target = th.where(t > 249, noise, x_start)
                output = th.where(t > 249, pred_noise, pred_startx)
            loss = (target - output) ** 2
            if model_kwargs.get('mask_ratio', False) and model_kwargs['mask_ratio'] > 0:
                assert 'mask' in model_output
                loss = F.avg_pool2d(loss.mean(dim=1), model.model.module.patch_size).flatten(1)
                mask = model_output['mask']
                unmask = 1 - mask
                terms['mse'] = mean_flat(loss * unmask) * unmask.shape[1] / unmask.sum(1)
                if model_kwargs['mask_loss_coef'] > 0:
                    terms['mae'] = model_kwargs['mask_loss_coef'] * mean_flat(loss * mask) * mask.shape[1] / mask.sum(1)
            else:
                terms["mse"] = mean_flat(loss)
            terms["loss"] = terms["mse"] + terms["vb"] if "vb" in terms else terms["mse"]
            if "mae" in terms:
                terms["loss"] = terms["loss"] + terms["mae"]
        else:
            raise NotImplementedError(self.loss_type)

        return terms

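    # Editorial note (not part of the commit): with snr=True the loss target is
    # switched per-timestep above -- epsilon-prediction for t > 249 and
    # x_0-prediction for t <= 249 -- reweighting the objective toward the easier
    # target at each noise level; the hard-coded 249 cutoff appears to assume a
    # 1000-step schedule.
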
    def training_losses_diffusers(self, model, x_start, timestep, model_kwargs=None, noise=None, skip_noise=False):
        """
        Compute training losses for a single timestep.
        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param timestep: a batch of timestep indices.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param noise: if specified, the specific Gaussian noise to try to remove.
        :return: a dict with the key "loss" containing a tensor of shape [N].
                 Some mean or variance settings may also have other keys.
        """
        t = timestep
        if model_kwargs is None:
            model_kwargs = {}
        if skip_noise:
            x_t = x_start
        else:
            if noise is None:
                noise = th.randn_like(x_start)
            x_t = self.q_sample(x_start, t, noise=noise)

        terms = {}

        if self.loss_type in [LossType.KL, LossType.RESCALED_KL]:
            terms["loss"] = self._vb_terms_bpd(
                model=model,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]
            if self.loss_type == LossType.RESCALED_KL:
                terms["loss"] *= self.num_timesteps
        elif self.loss_type in [LossType.MSE, LossType.RESCALED_MSE]:
            output = model(x_t, timestep=t, **model_kwargs, return_dict=False)[0]

            if self.return_startx and self.model_mean_type == ModelMeanType.EPSILON:
                return self._extracted_from_training_losses_diffusers(x_t, output, t)

            if self.model_var_type in [
                ModelVarType.LEARNED,
                ModelVarType.LEARNED_RANGE,
            ]:
                B, C = x_t.shape[:2]
                assert output.shape == (B, C * 2, *x_t.shape[2:])
                output, model_var_values = th.split(output, C, dim=1)
                # Learn the variance using the variational bound, but don't let it affect our mean prediction.
                frozen_out = th.cat([output.detach(), model_var_values], dim=1)
                terms["vb"] = self._vb_terms_bpd(
                    model=lambda *args, r=frozen_out, **kwargs: r,
                    x_start=x_start,
                    x_t=x_t,
                    t=t,
                    clip_denoised=False,
                )["output"]
                if self.loss_type == LossType.RESCALED_MSE:
                    # Divide by 1000 for equivalence with initial implementation.
                    # Without a factor of 1/1000, the VB term hurts the MSE term.
                    terms["vb"] *= self.num_timesteps / 1000.0

            target = {
                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
                    x_start=x_start, x_t=x_t, t=t
                )[0],
                ModelMeanType.START_X: x_start,
                ModelMeanType.EPSILON: noise,
            }[self.model_mean_type]
            assert output.shape == target.shape == x_start.shape
            if self.snr:
                if self.model_mean_type == ModelMeanType.START_X:
                    pred_noise = self._predict_eps_from_xstart(x_t=x_t, t=t, pred_xstart=output)
                    pred_startx = output
                elif self.model_mean_type == ModelMeanType.EPSILON:
                    pred_noise = output
                    pred_startx = self._predict_xstart_from_eps(x_t=x_t, t=t, eps=output)
                # terms["mse_eps"] = mean_flat((noise - pred_noise) ** 2)
                # terms["mse_x0"] = mean_flat((x_start - pred_startx) ** 2)

                t = t[:, None, None, None].expand(pred_startx.shape)  # [128, 4, 32, 32]
                # best
                target = th.where(t > 249, noise, x_start)
                output = th.where(t > 249, pred_noise, pred_startx)
            loss = (target - output) ** 2
            terms["mse"] = mean_flat(loss)
            terms["loss"] = terms["mse"] + terms["vb"] if "vb" in terms else terms["mse"]
            if "mae" in terms:
                terms["loss"] = terms["loss"] + terms["mae"]
        else:
            raise NotImplementedError(self.loss_type)

        return terms

    def _extracted_from_training_losses_diffusers(self, x_t, output, t):
        B, C = x_t.shape[:2]
        assert output.shape == (B, C * 2, *x_t.shape[2:])
        output = th.split(output, C, dim=1)[0]
        return output, self._predict_xstart_from_eps(x_t=x_t, t=t, eps=output), x_t

def _prior_bpd(self, x_start):
|
923 |
+
"""
|
924 |
+
Get the prior KL term for the variational lower-bound, measured in
|
925 |
+
bits-per-dim.
|
926 |
+
This term can't be optimized, as it only depends on the encoder.
|
927 |
+
:param x_start: the [N x C x ...] tensor of inputs.
|
928 |
+
:return: a batch of [N] KL values (in bits), one per batch element.
|
929 |
+
"""
|
930 |
+
batch_size = x_start.shape[0]
|
931 |
+
t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
|
932 |
+
qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
|
933 |
+
kl_prior = normal_kl(
|
934 |
+
mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
|
935 |
+
)
|
936 |
+
return mean_flat(kl_prior) / np.log(2.0)
|
937 |
+
|
938 |
+
def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
|
939 |
+
"""
|
940 |
+
Compute the entire variational lower-bound, measured in bits-per-dim,
|
941 |
+
as well as other related quantities.
|
942 |
+
:param model: the model to evaluate loss on.
|
943 |
+
:param x_start: the [N x C x ...] tensor of inputs.
|
944 |
+
:param clip_denoised: if True, clip denoised samples.
|
945 |
+
:param model_kwargs: if not None, a dict of extra keyword arguments to
|
946 |
+
pass to the model. This can be used for conditioning.
|
947 |
+
:return: a dict containing the following keys:
|
948 |
+
- total_bpd: the total variational lower-bound, per batch element.
|
949 |
+
- prior_bpd: the prior term in the lower-bound.
|
950 |
+
- vb: an [N x T] tensor of terms in the lower-bound.
|
951 |
+
- xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
|
952 |
+
- mse: an [N x T] tensor of epsilon MSEs for each timestep.
|
953 |
+
"""
|
954 |
+
device = x_start.device
|
955 |
+
batch_size = x_start.shape[0]
|
956 |
+
|
957 |
+
vb = []
|
958 |
+
xstart_mse = []
|
959 |
+
mse = []
|
960 |
+
for t in list(range(self.num_timesteps))[::-1]:
|
961 |
+
t_batch = th.tensor([t] * batch_size, device=device)
|
962 |
+
noise = th.randn_like(x_start)
|
963 |
+
x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
|
964 |
+
# Calculate VLB term at the current timestep
|
965 |
+
with th.no_grad():
|
966 |
+
out = self._vb_terms_bpd(
|
967 |
+
model,
|
968 |
+
x_start=x_start,
|
969 |
+
x_t=x_t,
|
970 |
+
t=t_batch,
|
971 |
+
clip_denoised=clip_denoised,
|
972 |
+
model_kwargs=model_kwargs,
|
973 |
+
)
|
974 |
+
vb.append(out["output"])
|
975 |
+
xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
|
976 |
+
eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
|
977 |
+
mse.append(mean_flat((eps - noise) ** 2))
|
978 |
+
|
979 |
+
vb = th.stack(vb, dim=1)
|
980 |
+
xstart_mse = th.stack(xstart_mse, dim=1)
|
981 |
+
mse = th.stack(mse, dim=1)
|
982 |
+
|
983 |
+
prior_bpd = self._prior_bpd(x_start)
|
984 |
+
total_bpd = vb.sum(dim=1) + prior_bpd
|
985 |
+
return {
|
986 |
+
"total_bpd": total_bpd,
|
987 |
+
"prior_bpd": prior_bpd,
|
988 |
+
"vb": vb,
|
989 |
+
"xstart_mse": xstart_mse,
|
990 |
+
"mse": mse,
|
991 |
+
}
|
992 |
+
|
993 |
+
|
994 |
+
def _extract_into_tensor(arr, timesteps, broadcast_shape):
|
995 |
+
"""
|
996 |
+
Extract values from a 1-D numpy array for a batch of indices.
|
997 |
+
:param arr: the 1-D numpy array.
|
998 |
+
:param timesteps: a tensor of indices into the array to extract.
|
999 |
+
:param broadcast_shape: a larger shape of K dimensions with the batch
|
1000 |
+
dimension equal to the length of timesteps.
|
1001 |
+
:return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
|
1002 |
+
"""
|
1003 |
+
res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
|
1004 |
+
while len(res.shape) < len(broadcast_shape):
|
1005 |
+
res = res[..., None]
|
1006 |
+
return res + th.zeros(broadcast_shape, device=timesteps.device)
|
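`_extract_into_tensor` is the broadcasting workhorse for every per-timestep schedule array in this file. A short hedged usage sketch; the schedule values here are made up for illustration, only the call pattern comes from the code above:

    import numpy as np
    import torch as th

    arr = np.linspace(1.0, 0.01, 1000)       # illustrative schedule, e.g. sqrt_alphas_cumprod
    timesteps = th.tensor([0, 499, 999])     # one index per batch element
    coef = _extract_into_tensor(arr, timesteps, (3, 4, 32, 32))
    # coef has shape [3, 4, 32, 32]; every element of coef[i] equals arr[timesteps[i]].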
DiT_VAE/diffusion/model/hed.py
ADDED
@@ -0,0 +1,150 @@
# This is an improved version and model of HED edge detection with Apache License, Version 2.0.
# Please use this implementation in your products.
# This implementation may produce slightly different results from Saining Xie's official implementation,
# but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
# Unlike the official and other implementations, this is an RGB-input model (rather than BGR),
# which works better with gradio's RGB protocol.
import sys
from pathlib import Path

current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent.parent))
from torch import nn
import torch
import numpy as np
from torchvision import transforms as T
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import json
from PIL import Image
import torchvision.transforms.functional as TF
from accelerate import Accelerator
from diffusers.models import AutoencoderKL
import os

image_resize = 1024


class DoubleConvBlock(nn.Module):
    def __init__(self, input_channel, output_channel, layer_number):
        super().__init__()
        self.convs = torch.nn.Sequential()
        self.convs.append(torch.nn.Conv2d(in_channels=input_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
        for i in range(1, layer_number):
            self.convs.append(torch.nn.Conv2d(in_channels=output_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
        self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0)

    def forward(self, x, down_sampling=False):
        h = x
        if down_sampling:
            h = torch.nn.functional.max_pool2d(h, kernel_size=(2, 2), stride=(2, 2))
        for conv in self.convs:
            h = conv(h)
            h = torch.nn.functional.relu(h)
        return h, self.projection(h)


class ControlNetHED_Apache2(nn.Module):
    def __init__(self):
        super().__init__()
        self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
        self.block1 = DoubleConvBlock(input_channel=3, output_channel=64, layer_number=2)
        self.block2 = DoubleConvBlock(input_channel=64, output_channel=128, layer_number=2)
        self.block3 = DoubleConvBlock(input_channel=128, output_channel=256, layer_number=3)
        self.block4 = DoubleConvBlock(input_channel=256, output_channel=512, layer_number=3)
        self.block5 = DoubleConvBlock(input_channel=512, output_channel=512, layer_number=3)

    def forward(self, x):
        h = x - self.norm
        h, projection1 = self.block1(h)
        h, projection2 = self.block2(h, down_sampling=True)
        h, projection3 = self.block3(h, down_sampling=True)
        h, projection4 = self.block4(h, down_sampling=True)
        h, projection5 = self.block5(h, down_sampling=True)
        return projection1, projection2, projection3, projection4, projection5


class InternData(Dataset):
    def __init__(self):
        ####
        with open('data/InternData/partition/data_info.json', 'r') as f:
            self.j = json.load(f)
        self.transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB')),
            T.Resize(image_resize),  # Image.BICUBIC
            T.CenterCrop(image_resize),
            T.ToTensor(),
        ])

    def __len__(self):
        return len(self.j)

    def getdata(self, idx):

        path = self.j[idx]['path']
        image = Image.open("data/InternImgs/" + path)
        image = self.transform(image)
        return image, path

    def __getitem__(self, idx):
        for i in range(20):
            try:
                return self.getdata(idx)
            except Exception as e:
                print(f"Error details: {str(e)}")
                idx = np.random.randint(len(self))
        raise RuntimeError('Too many bad data.')


class HEDdetector(nn.Module):
    def __init__(self, feature=True, vae=None):
        super().__init__()
        self.model = ControlNetHED_Apache2()
        self.model.load_state_dict(torch.load('output/pretrained_models/ControlNetHED.pth', map_location='cpu'))
        self.model.eval()
        self.model.requires_grad_(False)
        if feature:
            if vae is None:
                self.vae = AutoencoderKL.from_pretrained("output/pretrained_models/sd-vae-ft-ema")
            else:
                self.vae = vae
            self.vae.eval()
            self.vae.requires_grad_(False)
        else:
            self.vae = None

    def forward(self, input_image):
        B, C, H, W = input_image.shape
        with torch.inference_mode():
            edges = self.model(input_image * 255.)
            edges = torch.cat([TF.resize(e, [H, W]) for e in edges], dim=1)
            # Sigmoid over the mean of the five side outputs gives a soft edge map.
            edge = 1 / (1 + torch.exp(-torch.mean(edges, dim=1, keepdim=True)))
            edge.clip_(0, 1)
            if self.vae:
                edge = TF.normalize(edge, [.5], [.5])
                edge = edge.repeat(1, 3, 1, 1)
                posterior = self.vae.encode(edge).latent_dist
                edge = torch.cat([posterior.mean, posterior.std], dim=1).cpu().numpy()
        return edge


def main():
    dataset = InternData()
    dataloader = DataLoader(dataset, batch_size=10, shuffle=False, num_workers=8, pin_memory=True)
    hed = HEDdetector()

    accelerator = Accelerator()
    hed, dataloader = accelerator.prepare(hed, dataloader)

    for img, path in tqdm(dataloader):
        out = hed(img.cuda())
        for p, o in zip(path, out):
            save = f'data/InternalData/hed_feature_{image_resize}/' + p.replace('.png', '.npz')
            if os.path.exists(save):
                continue
            os.makedirs(os.path.dirname(save), exist_ok=True)
            np.savez_compressed(save, o)


if __name__ == "__main__":
    main()
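A hedged usage sketch for `HEDdetector` outside the batch-extraction script above; the checkpoint path follows the constructor default, 'example.png' is a hypothetical input file, and the input is a [0, 1]-normalized RGB batch:

    import torch
    from PIL import Image
    from torchvision import transforms as T

    detector = HEDdetector(feature=False).cuda()   # feature=False returns the edge map instead of VAE latents
    img = T.ToTensor()(Image.open('example.png').convert('RGB')).unsqueeze(0).cuda()
    edge = detector(img)                           # (1, 1, H, W) soft edge map in [0, 1]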
DiT_VAE/diffusion/model/image_embedding.py
ADDED
@@ -0,0 +1,15 @@
import torch
from transformers import AutoImageProcessor, Dinov2Model
from PIL import Image
import requests

# url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# image = Image.open(requests.get(url, stream=True).raw)
#
# processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
# model = AutoModel.from_pretrained('facebook/dinov2-base')
#
# inputs = processor(images=image, return_tensors="pt")
# outputs = model(**inputs)
# last_hidden_states = outputs[0]
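The commented-out lines sketch DINOv2 feature extraction, though they reference `AutoModel` while the file only imports `Dinov2Model`. A runnable hedged version using the imported class; the output shape matches the `dino_channels=768` / `token_num=257` defaults used by the embedders later in this commit:

    import torch
    from transformers import AutoImageProcessor, Dinov2Model
    from PIL import Image
    import requests

    url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
    image = Image.open(requests.get(url, stream=True).raw)

    processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
    model = Dinov2Model.from_pretrained('facebook/dinov2-base')

    with torch.no_grad():
        inputs = processor(images=image, return_tensors="pt")
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state  # (1, 257, 768): CLS token + 256 patch tokens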
DiT_VAE/diffusion/model/nets/PixArt_blocks.py
ADDED
@@ -0,0 +1,655 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import math
import torch
import torch.nn as nn
from timm.models.vision_transformer import Mlp, Attention as Attention_
from einops import rearrange
import xformers.ops


def modulate(x, shift, scale):
    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)


def t2i_modulate(x, shift, scale):
    return x * (1 + scale) + shift


class MultiHeadCrossAttention(nn.Module):
    def __init__(self, d_model, num_heads, attn_drop=0., proj_drop=0., **block_kwargs):
        super(MultiHeadCrossAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.kv_linear = nn.Linear(d_model, d_model * 2)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(d_model, d_model)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, cond, mask=None):
        # query: img tokens; key/value: condition; mask: if padding tokens
        B, N, C = x.shape

        # Flatten the batch into one long sequence; the block-diagonal attention
        # bias below keeps samples from attending across each other.
        q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
        kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)

        k, v = kv.unbind(2)
        attn_bias = None
        if mask is not None:
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)
        x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)
        x = x.view(B, -1, C)
        x = self.proj(x)
        x = self.proj_drop(x)

        # q = self.q_linear(x).reshape(B, -1, self.num_heads, self.head_dim)
        # kv = self.kv_linear(cond).reshape(B, -1, 2, self.num_heads, self.head_dim)
        # k, v = kv.unbind(2)
        # attn_bias = None
        # if mask is not None:
        #     attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device)
        #     attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float('-inf'))
        # x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)
        # x = x.contiguous().reshape(B, -1, C)
        # x = self.proj(x)
        # x = self.proj_drop(x)

        return x
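`BlockDiagonalMask.from_seqlens` is what makes the flattened-batch trick above safe: attention is only computed inside matching (query-block, key-block) pairs. A small hedged demo of the masking behaviour (shapes only, random tensors; like the module above it needs a CUDA device with xformers kernels available):

    import torch
    import xformers.ops

    B, N, heads, d = 2, 4, 2, 8
    cond_lens = [3, 5]   # per-sample condition lengths
    q = torch.randn(1, B * N, heads, d, device='cuda', dtype=torch.float16)
    kv = torch.randn(1, sum(cond_lens), heads, d, device='cuda', dtype=torch.float16)
    bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, cond_lens)
    out = xformers.ops.memory_efficient_attention(q, kv, kv, attn_bias=bias)
    # out[0, :N] only attended to kv[0, :3]; out[0, N:] only to kv[0, 3:].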
class WindowAttention(Attention_):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(
            self,
            dim,
            num_heads=8,
            qkv_bias=True,
            use_rel_pos=False,
            rel_pos_zero_init=True,
            input_size=None,
            **block_kwargs,
    ):
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (int or None): Input resolution for calculating the relative positional
                parameter size.
        """
        super().__init__(dim, num_heads=num_heads, qkv_bias=qkv_bias, **block_kwargs)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, self.head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, self.head_dim))

            if not rel_pos_zero_init:
                nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
                nn.init.trunc_normal_(self.rel_pos_w, std=0.02)

    def forward(self, x, mask=None):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
        q, k, v = qkv.unbind(2)
        if use_fp32_attention := getattr(self, 'fp32_attention', False):
            q, k, v = q.float(), k.float(), v.float()

        attn_bias = None
        if mask is not None:
            attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device)
            attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float('-inf'))
        x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        x = x.view(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


#################################################################################
#   AMP attention with fp32 softmax to fix loss NaN problem during training    #
#################################################################################
class Attention(Attention_):
    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
        use_fp32_attention = getattr(self, 'fp32_attention', False)
        if use_fp32_attention:
            q, k = q.float(), k.float()
        with torch.cuda.amp.autocast(enabled=not use_fp32_attention):
            attn = (q @ k.transpose(-2, -1)) * self.scale
            attn = attn.softmax(dim=-1)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class AttentionTest(Attention_):
    def forward(self, x, mask=None):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
        q, k, v = qkv.unbind(2)

        attn_bias = None
        if mask is not None:
            attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device)
            attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float('-inf'))
        x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)

        x = x.view(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class FinalLayer(nn.Module):
    """
    The final layer of PixArt.
    """

    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        x = modulate(self.norm_final(x), shift, scale)
        x = self.linear(x)
        return x


class T2IFinalLayer(nn.Module):
    """
    The final layer of PixArt, using a learned scale/shift table (adaLN-single).
    """

    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size ** 0.5)
        self.out_channels = out_channels

    def forward(self, x, t):
        shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
        x = t2i_modulate(self.norm_final(x), shift, scale)
        x = self.linear(x)
        return x


class MaskFinalLayer(nn.Module):
    """
    The final layer of PixArt.
    """

    def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(c_emb_size, 2 * final_hidden_size, bias=True)
        )

    def forward(self, x, t):
        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
        x = modulate(self.norm_final(x), shift, scale)
        x = self.linear(x)
        return x


class DecoderLayer(nn.Module):
    """
    The decoder projection layer of PixArt.
    """

    def __init__(self, hidden_size, decoder_hidden_size):
        super().__init__()
        self.norm_decoder = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, decoder_hidden_size, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, t):
        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
        x = modulate(self.norm_decoder(x), shift, scale)
        x = self.linear(x)
        return x


#################################################################################
#               Embedding Layers for Timesteps and Class Labels                #
#################################################################################
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half)
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t):
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(self.dtype)
        return self.mlp(t_freq)

    @property
    def dtype(self):
        # Return the dtype of the model parameters.
        return next(self.parameters()).dtype
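The sinusoidal embedding above is the standard GLIDE/DiT formula: frequency i of `half = dim // 2` has period max_period^(i/half), and the cos/sin halves are concatenated. A quick hedged check of its shape and the t = 0 case:

    import torch

    t = torch.tensor([0., 250., 999.])
    emb = TimestepEmbedder.timestep_embedding(t, dim=256)
    print(emb.shape)      # torch.Size([3, 256])
    print(emb[0, :128])   # cos half: all ones at t = 0
    print(emb[0, 128:])   # sin half: all zeros at t = 0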
class SizeEmbedder(TimestepEmbedder):
    """
    Embeds scalar size conditions (e.g. resolution) into vector representations.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size)
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.outdim = hidden_size

    def forward(self, s, bs):
        if s.ndim == 1:
            s = s[:, None]
        assert s.ndim == 2
        if s.shape[0] != bs:
            s = s.repeat(bs // s.shape[0], 1)
            assert s.shape[0] == bs
        b, dims = s.shape[0], s.shape[1]
        s = rearrange(s, "b d -> (b d)")
        s_freq = self.timestep_embedding(s, self.frequency_embedding_size).to(self.dtype)
        s_emb = self.mlp(s_freq)
        s_emb = rearrange(s_emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim)
        return s_emb

    @property
    def dtype(self):
        # Return the dtype of the model parameters.
        return next(self.parameters()).dtype


class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
    """

    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            drop_ids = torch.rand(labels.shape[0]).cuda() < self.dropout_prob
        else:
            drop_ids = force_drop_ids == 1
        labels = torch.where(drop_ids, self.num_classes, labels)
        return labels

    def forward(self, labels, train, force_drop_ids=None):
        use_dropout = self.dropout_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)


def FeedForward(dim, mult=4):
    inner_dim = int(dim * mult)
    return nn.Sequential(
        nn.LayerNorm(dim),
        nn.Linear(dim, inner_dim, bias=False),
        nn.GELU(),
        nn.Linear(inner_dim, dim, bias=False),
    )


def reshape_tensor(x, heads):
    bs, length, width = x.shape
    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
    x = x.view(bs, length, heads, -1)
    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
    x = x.transpose(1, 2)
    # make (bs, n_heads, length, dim_per_head) contiguous
    x = x.reshape(bs, heads, length, -1)
    return x


class PerceiverAttention(nn.Module):
    def __init__(self, *, dim, dim_head=64, heads=8):
        super().__init__()
        self.scale = dim_head ** -0.5
        self.dim_head = dim_head
        self.heads = heads
        inner_dim = dim_head * heads

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)

    def forward(self, x, latents):
        """
        Args:
            x (torch.Tensor): image features
                shape (b, n1, D)
            latents (torch.Tensor): latent features
                shape (b, n2, D)
        """
        x = self.norm1(x)
        latents = self.norm2(latents)

        b, l, _ = latents.shape

        q = self.to_q(latents)
        kv_input = torch.cat((x, latents), dim=-2)
        k, v = self.to_kv(kv_input).chunk(2, dim=-1)

        q = reshape_tensor(q, self.heads)
        k = reshape_tensor(k, self.heads)
        v = reshape_tensor(v, self.heads)

        # attention
        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
        out = weight @ v

        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)

        return self.to_out(out)


class ImageCaptionEmbedder(nn.Module):
    """
    Embeds image features into caption-like tokens via a Perceiver resampler.
    Also handles dropout for classifier-free guidance.
    """

    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), depth=4,
                 dim_head=64, heads=12, ff_mult=4, token_num=4):
        super().__init__()
        self.latents = nn.Parameter(torch.randn(1, token_num, hidden_size) / hidden_size ** 0.5)

        self.proj_in = nn.Linear(in_channels, hidden_size)

        self.proj_out = Mlp(in_features=hidden_size, hidden_features=hidden_size, out_features=hidden_size,
                            act_layer=act_layer, drop=0)
        self.norm_out = nn.LayerNorm(hidden_size)

        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        PerceiverAttention(dim=hidden_size, dim_head=dim_head, heads=heads),
                        FeedForward(dim=hidden_size, mult=ff_mult),
                    ]
                )
            )
        self.uncond_prob = uncond_prob

    def forward(self, x, train, force_drop_ids=None):
        latents = self.latents.repeat(x.size(0), 1, 1)
        x = self.proj_in(x)

        for attn, ff in self.layers:
            latents = attn(x, latents) + latents
            latents = ff(latents) + latents

        latents = self.proj_out(latents)
        latents = self.norm_out(latents)
        image_caption = latents.unsqueeze(1)  # (N, 1, L, D)
        return image_caption


class DinoFeatureEmbedderQFormer(nn.Module):
    """
    Embeds DINO image features into tokens via a Perceiver resampler (Q-Former style).
    """

    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=257, depth=4,
                 dim_head=64, heads=12, ff_mult=4):
        super().__init__()
        self.latents = nn.Parameter(torch.randn(1, token_num, hidden_size) / hidden_size ** 0.5)

        self.proj_in = nn.Linear(in_channels, hidden_size)

        self.proj_out = Mlp(in_features=hidden_size, hidden_features=hidden_size, out_features=hidden_size,
                            act_layer=act_layer, drop=0)
        self.norm_out = nn.LayerNorm(hidden_size)

        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        PerceiverAttention(dim=hidden_size, dim_head=dim_head, heads=heads),
                        FeedForward(dim=hidden_size, mult=ff_mult),
                    ]
                )
            )

    def forward(self, x, train, force_drop_ids=None):
        latents = self.latents.repeat(x.size(0), 1, 1)
        x = self.proj_in(x)

        for attn, ff in self.layers:
            latents = attn(x, latents) + latents
            latents = ff(latents) + latents

        latents = self.proj_out(latents)
        latents = self.norm_out(latents)
        image_caption = latents.unsqueeze(1)  # (N, 1, L, D)
        return image_caption


class DinoFeatureEmbedderV2(nn.Module):
    """
    Embeds DINO image features with a plain MLP projection.
    """

    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=257, use_drop=True, dino_norm=False):
        super().__init__()
        self.y_proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size,
                          act_layer=act_layer, drop=0)
        self.dino_norm = dino_norm
        if self.dino_norm:
            self.norm_out = nn.LayerNorm(hidden_size)

    def forward(self, dino_feature):
        dino_feature = dino_feature.unsqueeze(1)
        dino_feature = self.y_proj(dino_feature)
        if self.dino_norm:
            dino_feature = self.norm_out(dino_feature)

        return dino_feature


class DinoFeatureEmbedder(nn.Module):
    """
    Embeds DINO image features. Also handles feature dropout for classifier-free guidance.
    """

    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=257):
        super().__init__()
        self.y_proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size,
                          act_layer=act_layer, drop=0)
        self.register_buffer("y_embedding", nn.Parameter(torch.randn(token_num, in_channels) / in_channels ** 0.5))
        self.uncond_prob = uncond_prob

    def token_drop(self, dino_feature, force_drop_ids=None):
        """
        Drops features to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            drop_ids = torch.rand(dino_feature.shape[0]).cuda() < self.uncond_prob
        else:
            force_drop_ids = torch.tensor(force_drop_ids).cuda()
            drop_ids = force_drop_ids == 1
        dino_feature = torch.where(drop_ids[:, None, None, None], self.y_embedding, dino_feature)
        return dino_feature

    def forward(self, dino_feature, train, force_drop_ids=None):
        # print("dino_2", dino_feature)
        dino_feature = dino_feature.unsqueeze(1)

        if train:
            assert dino_feature.shape[2:] == self.y_embedding.shape
        use_dropout = self.uncond_prob > 0

        if (train and use_dropout) or (force_drop_ids is not None and force_drop_ids != {} and len(force_drop_ids) != 0):
            dino_feature = self.token_drop(dino_feature, force_drop_ids)
        dino_feature = self.y_proj(dino_feature)
        # print("dino_3", dino_feature)

        return dino_feature


class FusionEmbedder(nn.Module):
    """
    Projects fused conditioning features into the model width.
    """

    def __init__(self, in_channels, hidden_size, act_layer=nn.GELU(approximate='tanh')):
        super().__init__()
        self.y_proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size,
                          act_layer=act_layer, drop=0)

    def forward(self, fusion_feature):
        return self.y_proj(fusion_feature)


class CaptionEmbedder(nn.Module):
    """
    Embeds caption features into vector representations. Also handles dropout for classifier-free guidance.
    """

    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120):
        super().__init__()
        self.y_proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size,
                          act_layer=act_layer, drop=0)
        self.register_buffer("y_embedding", nn.Parameter(torch.randn(token_num, in_channels) / in_channels ** 0.5))
        self.uncond_prob = uncond_prob

    def token_drop(self, caption, force_drop_ids=None):
        """
        Drops captions to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            drop_ids = torch.rand(caption.shape[0]).cuda() < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
        return caption

    def forward(self, caption, train, force_drop_ids=None):
        if train:
            assert caption.shape[2:] == self.y_embedding.shape
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            caption = self.token_drop(caption, force_drop_ids)
        caption = self.y_proj(caption)
        return caption


class CaptionEmbedderDoubleBr(nn.Module):
    """
    Embeds caption features into a global and a per-token representation.
    Also handles dropout for classifier-free guidance.
    """

    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120):
        super().__init__()
        self.proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size,
                        act_layer=act_layer, drop=0)
        self.embedding = nn.Parameter(torch.randn(1, in_channels) / 10 ** 0.5)
        self.y_embedding = nn.Parameter(torch.randn(token_num, in_channels) / 10 ** 0.5)
        self.uncond_prob = uncond_prob

    def token_drop(self, global_caption, caption, force_drop_ids=None):
        """
        Drops captions to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            drop_ids = torch.rand(global_caption.shape[0]).cuda() < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        global_caption = torch.where(drop_ids[:, None], self.embedding, global_caption)
        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
        return global_caption, caption

    def forward(self, caption, train, force_drop_ids=None):
        assert caption.shape[2:] == self.y_embedding.shape
        global_caption = caption.mean(dim=2).squeeze()
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            global_caption, caption = self.token_drop(global_caption, caption, force_drop_ids)
        y_embed = self.proj(global_caption)
        return y_embed, caption
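`t2i_modulate` plus a learned `scale_shift_table` is the adaLN-single recipe: a shared table of modulation parameters is offset by the per-sample timestep embedding rather than predicting every parameter per block. A hedged shape sketch mirroring `T2IFinalLayer` above (sizes are illustrative):

    import torch

    hidden = 1152
    table = torch.randn(2, hidden) / hidden ** 0.5   # learned shift/scale table, as in T2IFinalLayer
    t = torch.randn(4, hidden)                       # per-sample timestep embedding (N, D)
    shift, scale = (table[None] + t[:, None]).chunk(2, dim=1)   # each (N, 1, D)
    x = torch.randn(4, 256, hidden)                  # (N, T, D) tokens
    out = x * (1 + scale) + shift                    # == t2i_modulate(x, shift, scale)
    print(out.shape)                                 # torch.Size([4, 256, 1152])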
DiT_VAE/diffusion/model/nets/TriDitCLIPDINO.py
ADDED
@@ -0,0 +1,315 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import torch
import torch.nn as nn
import os
import numpy as np
from timm.models.layers import DropPath
from timm.models.vision_transformer import PatchEmbed, Mlp
from DiT_VAE.diffusion.model.builder import MODELS
from DiT_VAE.diffusion.model.utils import auto_grad_checkpoint, to_2tuple
from DiT_VAE.diffusion.model.nets.PixArt_blocks import t2i_modulate, WindowAttention, MultiHeadCrossAttention, \
    T2IFinalLayer, TimestepEmbedder, ImageCaptionEmbedder, DinoFeatureEmbedderQFormer
from DiT_VAE.diffusion.utils.logger import get_root_logger


class PixArtBlock(nn.Module):
    """
    A PixArt block with adaptive layer norm (adaLN-single) conditioning.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None,
                 use_rel_pos=False, **block_kwargs):
        super().__init__()
        self.hidden_size = hidden_size
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = WindowAttention(hidden_size, num_heads=num_heads, qkv_bias=True,
                                    input_size=input_size if window_size == 0 else (window_size, window_size),
                                    use_rel_pos=use_rel_pos, **block_kwargs)
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        # to be compatible with lower-version pytorch
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu,
                       drop=0)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.window_size = window_size
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)

    def forward(self, x, y, t, mask=None, img_feature=None, **kwargs):
        B, N, C = x.shape
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
                self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
        if img_feature is None:
            x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))
        else:
            # Concatenate image tokens for self-attention, then keep only the x tokens.
            x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
            img_feature = img_feature.squeeze(1)
            N_new = N + img_feature.shape[1]
            x_m = self.attn(torch.cat([x_m, img_feature], dim=1)).reshape(B, N_new, C)
            x_m = x_m[:, :N, :]
            x = x + self.drop_path(gate_msa * x_m)
        x = x + self.cross_attn(x, y, mask)
        x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))

        return x


#################################################################################
#                               Core PixArt Model                              #
#################################################################################
@MODELS.register_module()
class TriDitCLIPDINO(nn.Module):
    """
    Diffusion model with a Transformer backbone.
    """

    def __init__(self, input_size, patch_size=2, in_channels=8, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0,
                 class_dropout_prob=0.1, pred_sigma=True, drop_path: float = 0., window_size=0,
                 window_block_indexes=None, use_rel_pos=False, caption_channels=1280, lewei_scale=1.0, config=None, dino_channels=768, img_feature_self_attention=False, dino_norm=False,
                 model_max_length=257, **kwargs):
        if window_block_indexes is None:
            window_block_indexes = []
        super().__init__()
        self.img_feature_self_attention = img_feature_self_attention
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        self.out_channels = in_channels * 2 if pred_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.lewei_scale = lewei_scale
        assert isinstance(input_size, tuple)
        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        num_patches = self.x_embedder.num_patches
        self.base_size_h = input_size[0] // self.patch_size
        self.base_size_w = input_size[1] // self.patch_size
        self.h = self.base_size_h
        self.w = self.base_size_w
        # Will use fixed sin-cos embedding:
        self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size))

        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.t_block = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )
        self.dino_embedder = DinoFeatureEmbedderQFormer(in_channels=dino_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=256)
        self.y_embedder = ImageCaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size,
                                               uncond_prob=class_dropout_prob, act_layer=approx_gelu,
                                               token_num=16)
        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            PixArtBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
                        input_size=(input_size[0] // patch_size, input_size[1] // patch_size),
                        window_size=window_size if i in window_block_indexes else 0,
                        use_rel_pos=use_rel_pos if i in window_block_indexes else False)
            for i in range(depth)
        ])
        self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels)

        self.initialize_weights()

        if config:
            logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
            logger.warning(
                f"lewei scale: {self.lewei_scale}, base size h: {self.base_size_h} base size w: {self.base_size_w}")
        else:
            print(
                f'Warning: lewei scale: {self.lewei_scale}, base size h: {self.base_size_h} base size w: {self.base_size_w}')

    def forward(self, x, timestep, y, img_feature, drop_img_mask=None, mask=None, data_info=None, **kwargs):
        """
        Forward pass of PixArt.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        timestep: (N,) tensor of diffusion timesteps
        y: (N, 1, L, C) tensor of caption/CLIP conditioning tokens
        img_feature: tensor of DINO image features
        """
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        img_feature = img_feature.to(self.dtype)
        pos_embed = self.pos_embed.to(self.dtype)
        self.h, self.w = x.shape[-2] // self.patch_size, x.shape[-1] // self.patch_size
        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(timestep.to(x.dtype))  # (N, D)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        img_embedding = self.dino_embedder(img_feature, self.training)
        # y_fusion = y
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])
        for block in self.blocks:
            x = auto_grad_checkpoint(block, x, y, t0, y_lens, img_embedding)  # (N, T, D), supports grad checkpointing
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)
        return x

    def forward_with_dpmsolver(self, x, timestep, y, img_feature, mask=None, **kwargs):
        """
        The DPM solver does not need variance prediction.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        model_out = self.forward(x, timestep, y, img_feature)
        return model_out.chunk(2, dim=1)[0]

    def forward_with_cfg(self, x, timestep, y, img_feature, cfg_scale, mask=None, **kwargs):
        """
        Forward pass of PixArt, but also batches the unconditional forward pass for classifier-free guidance.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        model_out = self.forward(combined, timestep, y, img_feature, **kwargs)
        model_out = model_out['x'] if isinstance(model_out, dict) else model_out
        eps, rest = torch.split(model_out, self.in_channels, dim=1)
        # eps, rest = model_out[:, :3], model_out[:, 3:]

        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)
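`forward_with_cfg` packs the conditional and unconditional branches into one batch and then applies the standard classifier-free guidance combination, eps = eps_uncond + s * (eps_cond - eps_uncond). A minimal hedged sketch of just the mixing step, lifted from the method above:

    import torch

    def cfg_mix(eps, cfg_scale):
        # eps: (2N, C, H, W) with the conditional half first, unconditional half second
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        return torch.cat([half_eps, half_eps], dim=0)  # duplicated to keep the 2N batch layout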
188 |
+
    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = int(x.shape[1] ** 0.5 * 2)
        w = int(x.shape[1] ** 0.5 / 2)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        return x.reshape(shape=(x.shape[0], c, h * p, w * p))

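The h/w recovery above hard-codes a non-square token grid: with h = int(2*sqrt(T)) and w = int(sqrt(T)/2), the assert h * w == T holds in the intended case only when h == 4 * w. A quick standalone check (a sketch, independent of the repo):

import math

for w in (4, 8, 16):
    T = 4 * w * w                 # h = 4w  =>  T = h * w = 4 * w**2
    h_rec = int(T ** 0.5 * 2)
    w_rec = int(T ** 0.5 / 2)
    assert (h_rec, w_rec) == (4 * w, w) and h_rec * w_rec == T
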
    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding:
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5),
                                            lewei_scale=self.lewei_scale, base_size_h=self.base_size_h,
                                            base_size_w=self.base_size_w)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.proj_out.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.proj_out.fc2.weight, std=0.02)
        nn.init.normal_(self.y_embedder.proj_in.weight, std=0.02)

        # Initialize DINO embedding MLP:
        # nn.init.normal_(self.dino_embedder.y_proj.fc1.weight, std=0.02)
        # nn.init.normal_(self.dino_embedder.y_proj.fc2.weight, std=0.02)
        nn.init.normal_(self.dino_embedder.proj_out.fc1.weight, std=0.02)
        nn.init.normal_(self.dino_embedder.proj_out.fc2.weight, std=0.02)
        nn.init.normal_(self.dino_embedder.proj_in.weight, std=0.02)
        # if not self.img_feature_self_attention:
        #     # Initialize fusion embedding MLP:
        #     nn.init.normal_(self.fusion_embedder.y_proj.fc1.weight, std=0.02)
        #     nn.init.normal_(self.fusion_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out adaLN modulation layers in PixArt blocks:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    @property
    def dtype(self):
        return next(self.parameters()).dtype


def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, lewei_scale=1.0, base_size_h=16,
                            base_size_w=16):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    if isinstance(grid_size, int):
        grid_size = to_2tuple(grid_size)
    grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size_h) / lewei_scale
    grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size_w) / lewei_scale
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)
    grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    return np.concatenate([emb_h, emb_w], axis=1)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    return np.concatenate([emb_sin, emb_cos], axis=1)


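A quick sanity check of the 1D sin-cos builder; the helper below mirrors get_1d_sincos_pos_embed_from_grid so the sketch runs without the repo:

import numpy as np

def sincos_1d(embed_dim, pos):
    # mirrors get_1d_sincos_pos_embed_from_grid above
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega = 1. / 10000 ** (omega / (embed_dim / 2.))
    out = np.einsum('m,d->md', pos.reshape(-1), omega)
    return np.concatenate([np.sin(out), np.cos(out)], axis=1)

emb = sincos_1d(32, np.arange(16, dtype=np.float64))
assert emb.shape == (16, 32)
# frequency 0 of omega is 1.0, so the first sin column is sin(pos) itself:
assert np.allclose(emb[:, 0], np.sin(np.arange(16)))
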
#################################################################################
#                                 PixArt Configs                                #
#################################################################################
@MODELS.register_module()
def TriDitCLIPDINO_XL_2(**kwargs):
    return TriDitCLIPDINO(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
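A minimal sketch of instantiating the registered XL/2 config. The constructor kwargs (input size, channel counts, caption dims, and whether initialize_weights is already called in __init__) live earlier in this file and are assumptions here:

from DiT_VAE.diffusion.model.nets import TriDitCLIPDINO_XL_2

model = TriDitCLIPDINO_XL_2()   # depth=28, hidden_size=1152, patch_size=2, num_heads=16
model.initialize_weights()      # sin-cos pos_embed, zeroed cross-attn proj and final layer (see above)
print(model.dtype)              # dtype of the first parameter, via the property above
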
DiT_VAE/diffusion/model/nets/__init__.py
ADDED
@@ -0,0 +1 @@
from .TriDitCLIPDINO import TriDitCLIPDINO_XL_2, TriDitCLIPDINO
DiT_VAE/diffusion/model/respace.py
ADDED
@@ -0,0 +1,131 @@
# Modified from OpenAI's diffusion repos
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

import numpy as np
import torch as th

from .gaussian_diffusion import GaussianDiffusion


def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.
    For example, if there are 300 timesteps and the section counts are [10, 15, 20],
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.
    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.
    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim"):])
            for i in range(1, num_timesteps):
                if len(range(0, num_timesteps, i)) == desired_count:
                    return set(range(0, num_timesteps, i))
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"  # report the requested count, not the total
            )
        section_counts = [int(x) for x in section_counts.split(",")]
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        frac_stride = 1 if section_count <= 1 else (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)


class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process which can skip steps in a base diffusion process.
    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process.
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        last_alpha_cumprod = 1.0
        new_betas = []
        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
            if i in self.use_timesteps:
                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
                last_alpha_cumprod = alpha_cumprod
                self.timestep_map.append(i)
        kwargs["betas"] = np.array(new_betas)
        super().__init__(**kwargs)

    def p_mean_variance(self, model, *args, **kwargs):  # pylint: disable=signature-differs
        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)

    def training_losses(self, model, *args, **kwargs):  # pylint: disable=signature-differs
        return super().training_losses(self._wrap_model(model), *args, **kwargs)

    def training_losses_diffusers(self, model, *args, **kwargs):  # pylint: disable=signature-differs
        return super().training_losses_diffusers(self._wrap_model(model), *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)

    def _wrap_model(self, model):
        if isinstance(model, _WrappedModel):
            return model
        return _WrappedModel(
            model, self.timestep_map, self.original_num_steps
        )

    def _scale_timesteps(self, t):
        # Scaling is done by the wrapped model.
        return t


class _WrappedModel:
    def __init__(self, model, timestep_map, original_num_steps):
        self.model = model
        self.timestep_map = timestep_map
        # self.rescale_timesteps = rescale_timesteps
        self.original_num_steps = original_num_steps

    def __call__(self, x, timestep, **kwargs):
        map_tensor = th.tensor(self.timestep_map, device=timestep.device, dtype=timestep.dtype)
        new_ts = map_tensor[timestep]
        # if self.rescale_timesteps:
        #     new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
        return self.model(x, timestep=new_ts, **kwargs)
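A quick self-contained check of space_timesteps above (building a full SpacedDiffusion additionally needs the betas and loss/variance kwargs from gaussian_diffusion, omitted here):

from DiT_VAE.diffusion.model.respace import space_timesteps

# "ddimN" picks a fixed integer stride that yields exactly N steps:
steps = space_timesteps(1000, "ddim25")
assert len(steps) == 25 and 0 in steps and max(steps) == 960

# A list splits the process into equal sections strided to 10/15/20 steps:
steps = space_timesteps(300, [10, 15, 20])
assert len(steps) == 45 and max(steps) == 299
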
DiT_VAE/diffusion/model/sa_solver.py
ADDED
@@ -0,0 +1,1129 @@
import torch
import torch.nn.functional as F
import math
from tqdm import tqdm


class NoiseScheduleVP:
    def __init__(
            self,
            schedule='discrete',
            betas=None,
            alphas_cumprod=None,
            continuous_beta_0=0.1,
            continuous_beta_1=20.,
            dtype=torch.float32,
    ):
        """Thanks to DPM-Solver for their code base"""
        """Create a wrapper class for the forward SDE (VP type).
        ***
        Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
                We recommend using schedule='discrete' for discrete-time diffusion models, especially for high-resolution images.
        ***
        The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
        We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
        Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
            log_alpha_t = self.marginal_log_mean_coeff(t)
            sigma_t = self.marginal_std(t)
            lambda_t = self.marginal_lambda(t)
        Moreover, as lambda(t) is an invertible function, we also support its inverse function:
            t = self.inverse_lambda(lambda_t)
        ===============================================================
        We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
        1. For discrete-time DPMs:
            For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
                t_i = (i + 1) / N
            e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
            We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
            Args:
                betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
                alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
            Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
            **Important**: Please pay special attention for the args for `alphas_cumprod`:
                The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
                    q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
                Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
                    alpha_{t_n} = \sqrt{\hat{alpha_n}},
                and
                    log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
        2. For continuous-time DPMs:
            We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
            schedule are the default settings in DDPM and improved-DDPM:
            Args:
                beta_min: A `float` number. The smallest beta for the linear schedule.
                beta_max: A `float` number. The largest beta for the linear schedule.
                cosine_s: A `float` number. The hyperparameter in the cosine schedule.
                cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
                T: A `float` number. The ending time of the forward process.
        ===============================================================
        Args:
            schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
                    'linear' or 'cosine' for continuous-time DPMs.
        Returns:
            A wrapper object of the forward SDE (VP type).

        ===============================================================
        Example:
        # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', betas=betas)
        # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
        # For continuous-time DPMs (VPSDE), linear schedule:
        >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
        """

        if schedule not in ['discrete', 'linear', 'cosine']:
            raise ValueError(
                f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear' or 'cosine'"
            )

        self.schedule = schedule
        if schedule == 'discrete':
            if betas is not None:
                log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
            else:
                assert alphas_cumprod is not None
                log_alphas = 0.5 * torch.log(alphas_cumprod)
            self.total_N = len(log_alphas)
            self.T = 1.
            self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
            self.log_alpha_array = log_alphas.reshape((1, -1,)).to(dtype=dtype)
        else:
            self.total_N = 1000
            self.beta_0 = continuous_beta_0
            self.beta_1 = continuous_beta_1
            self.cosine_s = 0.008
            self.cosine_beta_max = 999.
            self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (
                    1. + self.cosine_s) / math.pi - self.cosine_s
            self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
            self.schedule = schedule
            self.T = 0.9946 if schedule == 'cosine' else 1.

    def marginal_log_mean_coeff(self, t):
        """
        Compute log(alpha_t) of a given continuous-time label t in [0, T].
        """
        if self.schedule == 'discrete':
            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device),
                                  self.log_alpha_array.to(t.device)).reshape((-1))
        elif self.schedule == 'linear':
            return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
        elif self.schedule == 'cosine':
            log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
            return log_alpha_fn(t) - self.cosine_log_alpha_0

    def marginal_alpha(self, t):
        """
        Compute alpha_t of a given continuous-time label t in [0, T].
        """
        return torch.exp(self.marginal_log_mean_coeff(t))

    def marginal_std(self, t):
        """
        Compute sigma_t of a given continuous-time label t in [0, T].
        """
        return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))

    def marginal_lambda(self, t):
        """
        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
        """
        log_mean_coeff = self.marginal_log_mean_coeff(t)
        log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
        return log_mean_coeff - log_std

    def inverse_lambda(self, lamb):
        """
        Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
        """
        if self.schedule == 'linear':
            tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
            Delta = self.beta_0 ** 2 + tmp
            return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
        elif self.schedule == 'discrete':
            log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]),
                               torch.flip(self.t_array.to(lamb.device), [1]))
            return t.reshape((-1,))
        else:
            log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
            t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (
                    1. + self.cosine_s) / math.pi - self.cosine_s
            return t_fn(log_alpha)

    def edm_sigma(self, t):
        return self.marginal_std(t) / self.marginal_alpha(t)

    def edm_inverse_sigma(self, edmsigma):
        alpha = 1 / (edmsigma ** 2 + 1).sqrt()
        sigma = alpha * edmsigma
        lambda_t = torch.log(alpha / sigma)
        return self.inverse_lambda(lambda_t)

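A minimal self-contained check of NoiseScheduleVP, assuming the interpolate_fn helper this class calls is defined later in the file (as in the DPM-Solver codebase it credits). The beta schedule is illustrative, not the repo's training schedule:

import torch
from DiT_VAE.diffusion.model.sa_solver import NoiseScheduleVP

betas = torch.linspace(1e-4, 2e-2, 1000)
ns = NoiseScheduleVP('discrete', betas=betas)

t = torch.tensor([0.5])
alpha_t, sigma_t = ns.marginal_alpha(t), ns.marginal_std(t)
# VP property: alpha_t^2 + sigma_t^2 == 1
assert torch.allclose(alpha_t ** 2 + sigma_t ** 2, torch.ones(1), atol=1e-5)
# lambda_t and its inverse round-trip through the piecewise-linear interpolation:
assert torch.allclose(ns.inverse_lambda(ns.marginal_lambda(t)), t, atol=1e-4)
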
def model_wrapper(
        model,
        noise_schedule,
        model_type="noise",
        model_kwargs={},
        guidance_type="uncond",
        condition=None,
        unconditional_condition=None,
        guidance_scale=1.,
        classifier_fn=None,
        classifier_kwargs={},
):
    """Thanks to DPM-Solver for their code base"""
    """Create a wrapper function for the noise prediction model.
    SA-Solver needs to solve the continuous-time diffusion SDEs. For DPMs trained on discrete-time labels, we need to
    first wrap the model function to a noise prediction model that accepts the continuous time as the input.
    We support four types of the diffusion model by setting `model_type`:
        1. "noise": noise prediction model. (Trained by predicting noise).
        2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
        3. "v": velocity prediction model. (Trained by predicting the velocity).
            The "v" prediction is derived in detail in Appendix D of [1], and is used in Imagen-Video [2].
            [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
                arXiv preprint arXiv:2202.00512 (2022).
            [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
                arXiv preprint arXiv:2210.02303 (2022).

        4. "score": marginal score function. (Trained by denoising score matching).
            Note that the score function and the noise prediction model follow a simple relationship:
            ```
                noise(x_t, t) = -sigma_t * score(x_t, t)
            ```
    We support three types of guided sampling by DPMs by setting `guidance_type`:
        1. "uncond": unconditional sampling by DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``
        2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``
            The input `classifier_fn` has the following format:
            ``
                classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
            ``
            [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
                in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
        3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
            ``
            And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
            [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
                arXiv preprint arXiv:2207.12598 (2022).

    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
    or continuous-time labels (i.e. epsilon to T).
    We wrap the model function to accept only `x` and `t_continuous` as inputs, and output the predicted noise:
    ``
        def model_fn(x, t_continuous) -> noise:
            t_input = get_model_input_time(t_continuous)
            return noise_pred(model, x, t_input, **model_kwargs)
    ``
    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for SA-Solver.
    ===============================================================
    Args:
        model: A diffusion model with the corresponding format described above.
        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
        model_type: A `str`. The parameterization type of the diffusion model.
                    "noise" or "x_start" or "v" or "score".
        model_kwargs: A `dict`. A dict for the other inputs of the model function.
        guidance_type: A `str`. The type of the guidance for sampling.
                    "uncond" or "classifier" or "classifier-free".
        condition: A pytorch tensor. The condition for the guided sampling.
                    Only used for "classifier" or "classifier-free" guidance type.
        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
                    Only used for "classifier-free" guidance type.
        guidance_scale: A `float`. The scale for the guided sampling.
        classifier_fn: A classifier function. Only used for the classifier guidance.
        classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
    Returns:
        A noise prediction model that accepts the noised data and the continuous time as the inputs.
    """

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == 'discrete':
            return (t_continuous - 1. / noise_schedule.total_N) * 1000.
        else:
            return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None, cond_2=None):
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            output = model(x, t_input, **model_kwargs)
        else:
            output = model(x, t_input, cond, cond_2, **model_kwargs)
        if model_type == "noise":
            return output
        elif model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return (x - alpha_t[0] * output) / sigma_t[0]
        elif model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return alpha_t[0] * output + sigma_t[0] * x
        elif model_type == "score":
            sigma_t = noise_schedule.marginal_std(t_continuous)
            return -sigma_t[0] * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """
        The noise prediction model function that is used for DPM-Solver.
        """
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        elif guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * sigma_t * cond_grad
        elif guidance_type == "classifier-free":
            if guidance_scale == 1. or unconditional_condition is None:
                return noise_pred_fn(x, t_continuous, cond=condition)
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t_continuous] * 2)
            # c_in = torch.cat([unconditional_condition, condition])
            c_in_y = torch.cat([unconditional_condition[0], condition[0]])
            c_in_dino = torch.cat([unconditional_condition[1], condition[1]])
            noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in_y, cond_2=c_in_dino).chunk(2)
            return noise_uncond + guidance_scale * (noise - noise_uncond)

    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]
    return model_fn

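A sketch of wiring these pieces together for sampling, using the SASolver class defined just below. All names here (net, y_cond, dino_cond, y_null, dino_null, betas, latent) are hypothetical stand-ins for objects built elsewhere in the repo, and the repo's actual sampler entry point may differ:

import torch
from DiT_VAE.diffusion.model.sa_solver import NoiseScheduleVP, model_wrapper, SASolver

ns = NoiseScheduleVP('discrete', betas=betas)
model_fn = model_wrapper(
    net, ns,
    model_type="noise",
    guidance_type="classifier-free",
    condition=(y_cond, dino_cond),                  # unpacked above as (cond, cond_2)
    unconditional_condition=(y_null, dino_null),
    guidance_scale=4.5,
)
solver = SASolver(model_fn, ns, algorithm_type="data_prediction")
sample = solver.sample_few_steps(torch.randn_like(latent), tau=lambda t: 1.0, steps=20)
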
class SASolver:
    def __init__(
            self,
            model_fn,
            noise_schedule,
            algorithm_type="data_prediction",
            correcting_x0_fn=None,
            correcting_xt_fn=None,
            thresholding_max_val=1.,
            dynamic_thresholding_ratio=0.995
    ):
        """
        Construct a SA-Solver.
        The default value for algorithm_type is "data_prediction" and we recommend not changing it to
        "noise_prediction". For details, please see Appendix A.2.4 in the SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
        """

        self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
        self.noise_schedule = noise_schedule
        assert algorithm_type in ["data_prediction", "noise_prediction"]

        if correcting_x0_fn == "dynamic_thresholding":
            self.correcting_x0_fn = self.dynamic_thresholding_fn
        else:
            self.correcting_x0_fn = correcting_x0_fn

        self.correcting_xt_fn = correcting_xt_fn
        self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
        self.thresholding_max_val = thresholding_max_val

        self.predict_x0 = algorithm_type == "data_prediction"

        self.sigma_min = float(self.noise_schedule.edm_sigma(torch.tensor([1e-3])))
        self.sigma_max = float(self.noise_schedule.edm_sigma(torch.tensor([1])))

    def dynamic_thresholding_fn(self, x0, t=None):
        """
        The dynamic thresholding method.
        """
        dims = x0.dim()
        p = self.dynamic_thresholding_ratio
        s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
        s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims)
        x0 = torch.clamp(x0, -s, s) / s
        return x0

    def noise_prediction_fn(self, x, t):
        """
        Return the noise prediction model.
        """
        return self.model(x, t)

    def data_prediction_fn(self, x, t):
        """
        Return the data prediction model (with corrector).
        """
        noise = self.noise_prediction_fn(x, t)
        alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
        x0 = (x - sigma_t * noise) / alpha_t
        if self.correcting_x0_fn is not None:
            x0 = self.correcting_x0_fn(x0)
        return x0

    def model_fn(self, x, t):
        """
        Convert the model to the noise prediction model or the data prediction model.
        """

        if self.predict_x0:
            return self.data_prediction_fn(x, t)
        else:
            return self.noise_prediction_fn(x, t)

    def get_time_steps(self, skip_type, t_T, t_0, N, order, device):
        """Compute the intermediate time steps for sampling.
        """
        if skip_type == 'logSNR':
            lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
            lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
            logSNR_steps = lambda_T + torch.linspace(torch.tensor(0.).cpu().item(),
                                                     (lambda_0 - lambda_T).cpu().item() ** (1. / order), N + 1).pow(
                order).to(device)
            return self.noise_schedule.inverse_lambda(logSNR_steps)
        elif skip_type == 'time':
            t = torch.linspace(t_T ** (1. / order), t_0 ** (1. / order), N + 1).pow(order).to(device)
            return t
        elif skip_type == 'karras':
            sigma_min = max(0.002, self.sigma_min)
            sigma_max = min(80, self.sigma_max)
            sigma_steps = torch.linspace(sigma_max ** (1. / 7), sigma_min ** (1. / 7), N + 1).pow(7).to(device)
            return self.noise_schedule.edm_inverse_sigma(sigma_steps)
        else:
            raise ValueError(
                f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time' or 'karras'"
            )

    def denoise_to_zero_fn(self, x, s):
        """
        Denoise at the final step, which is equivalent to solving the ODE from lambda_s to infinity by first-order discretization.
        """
        return self.data_prediction_fn(x, s)

    def get_coefficients_exponential_negative(self, order, interval_start, interval_end):
        """
        Calculate the integral of exp(-x) * x^order dx from interval_start to interval_end.
        Used for the coefficients of the gradient terms after the Lagrange interpolation;
        see Eq.(15) and Eq.(18) in the SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
        For the noise_prediction formula.
        """
        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"

        if order == 0:
            return torch.exp(-interval_end) * (torch.exp(interval_end - interval_start) - 1)
        elif order == 1:
            return torch.exp(-interval_end) * (
                    (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1))
        elif order == 2:
            return torch.exp(-interval_end) * (
                    (interval_start ** 2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start) - (
                    interval_end ** 2 + 2 * interval_end + 2))
        elif order == 3:
            return torch.exp(-interval_end) * (
                    (interval_start ** 3 + 3 * interval_start ** 2 + 6 * interval_start + 6) * torch.exp(
                interval_end - interval_start) - (interval_end ** 3 + 3 * interval_end ** 2 + 6 * interval_end + 6))

    def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
        """
        Calculate the integral of exp(x(1+tau^2)) * x^order dx from interval_start to interval_end.
        Used for the coefficients of the gradient terms after the Lagrange interpolation;
        see Eq.(15) and Eq.(18) in the SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
        For the data_prediction formula.
        """
        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"

        # after change of variable (cov)
        interval_end_cov = (1 + tau ** 2) * interval_end
        interval_start_cov = (1 + tau ** 2) * interval_start

        if order == 0:
            return torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / (
                (1 + tau ** 2))
        elif order == 1:
            return torch.exp(interval_end_cov) * ((interval_end_cov - 1) - (interval_start_cov - 1) * torch.exp(
                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 2)
        elif order == 2:
            return torch.exp(interval_end_cov) * ((interval_end_cov ** 2 - 2 * interval_end_cov + 2) - (
                    interval_start_cov ** 2 - 2 * interval_start_cov + 2) * torch.exp(
                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 3)
        elif order == 3:
            return torch.exp(interval_end_cov) * (
                    (interval_end_cov ** 3 - 3 * interval_end_cov ** 2 + 6 * interval_end_cov - 6) - (
                    interval_start_cov ** 3 - 3 * interval_start_cov ** 2 + 6 * interval_start_cov - 6) * torch.exp(
                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 4)

    def lagrange_polynomial_coefficient(self, order, lambda_list):
        """
        Calculate the coefficients of the Lagrange basis polynomials, for Lagrange interpolation.
        """
        assert order in [0, 1, 2, 3]
        assert order == len(lambda_list) - 1
        if order == 0:
            return [[1]]
        elif order == 1:
            return [[1 / (lambda_list[0] - lambda_list[1]), -lambda_list[1] / (lambda_list[0] - lambda_list[1])],
                    [1 / (lambda_list[1] - lambda_list[0]), -lambda_list[0] / (lambda_list[1] - lambda_list[0])]]
        elif order == 2:
            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2])
            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2])
            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1])
            return [[1 / denominator1,
                     (-lambda_list[1] - lambda_list[2]) / denominator1,
                     lambda_list[1] * lambda_list[2] / denominator1],

                    [1 / denominator2,
                     (-lambda_list[0] - lambda_list[2]) / denominator2,
                     lambda_list[0] * lambda_list[2] / denominator2],

                    [1 / denominator3,
                     (-lambda_list[0] - lambda_list[1]) / denominator3,
                     lambda_list[0] * lambda_list[1] / denominator3]
                    ]
        elif order == 3:
            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2]) * (
                    lambda_list[0] - lambda_list[3])
            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2]) * (
                    lambda_list[1] - lambda_list[3])
            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1]) * (
                    lambda_list[2] - lambda_list[3])
            denominator4 = (lambda_list[3] - lambda_list[0]) * (lambda_list[3] - lambda_list[1]) * (
                    lambda_list[3] - lambda_list[2])
            return [[1 / denominator1,
                     (-lambda_list[1] - lambda_list[2] - lambda_list[3]) / denominator1,
                     (lambda_list[1] * lambda_list[2] + lambda_list[1] * lambda_list[3] + lambda_list[2] * lambda_list[
                         3]) / denominator1,
                     (-lambda_list[1] * lambda_list[2] * lambda_list[3]) / denominator1],

                    [1 / denominator2,
                     (-lambda_list[0] - lambda_list[2] - lambda_list[3]) / denominator2,
                     (lambda_list[0] * lambda_list[2] + lambda_list[0] * lambda_list[3] + lambda_list[2] * lambda_list[
                         3]) / denominator2,
                     (-lambda_list[0] * lambda_list[2] * lambda_list[3]) / denominator2],

                    [1 / denominator3,
                     (-lambda_list[0] - lambda_list[1] - lambda_list[3]) / denominator3,
                     (lambda_list[0] * lambda_list[1] + lambda_list[0] * lambda_list[3] + lambda_list[1] * lambda_list[
                         3]) / denominator3,
                     (-lambda_list[0] * lambda_list[1] * lambda_list[3]) / denominator3],

                    [1 / denominator4,
                     (-lambda_list[0] - lambda_list[1] - lambda_list[2]) / denominator4,
                     (lambda_list[0] * lambda_list[1] + lambda_list[0] * lambda_list[2] + lambda_list[1] * lambda_list[
                         2]) / denominator4,
                     (-lambda_list[0] * lambda_list[1] * lambda_list[2]) / denominator4]
                    ]

    def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
        """
        Calculate the coefficients of the gradients.
        """
        assert order in [1, 2, 3, 4]
        assert order == len(lambda_list), 'the length of lambda list must be equal to the order'
        coefficients = []
        lagrange_coefficient = self.lagrange_polynomial_coefficient(order - 1, lambda_list)
        for i in range(order):
            coefficient = sum(
                lagrange_coefficient[i][j]
                * self.get_coefficients_exponential_positive(
                    order - 1 - j, interval_start, interval_end, tau
                )
                if self.predict_x0
                else lagrange_coefficient[i][j]
                * self.get_coefficients_exponential_negative(
                    order - 1 - j, interval_start, interval_end
                )
                for j in range(order)
            )
            coefficients.append(coefficient)
        assert len(coefficients) == order, 'the length of coefficients does not match the order'
        return coefficients

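A quick standalone check of the order-1 Lagrange coefficients returned above: for nodes (l0, l1), the basis polynomial through l0 is (x - l1)/(l0 - l1), i.e. the coefficient pair [1/(l0 - l1), -l1/(l0 - l1)] in descending powers of x, and each basis evaluates to 1 at its own node and 0 at the other (a sketch, independent of the class):

l0, l1 = 2.0, 0.5
coeffs = [[1 / (l0 - l1), -l1 / (l0 - l1)],
          [1 / (l1 - l0), -l0 / (l1 - l0)]]
basis = lambda c, x: c[0] * x + c[1]
assert abs(basis(coeffs[0], l0) - 1.0) < 1e-12 and abs(basis(coeffs[0], l1)) < 1e-12
assert abs(basis(coeffs[1], l1) - 1.0) < 1e-12 and abs(basis(coeffs[1], l0)) < 1e-12
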
def adams_bashforth_update(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
|
561 |
+
"""
|
562 |
+
SA-Predictor, without the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
|
563 |
+
"""
|
564 |
+
assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"
|
565 |
+
|
566 |
+
# get noise schedule
|
567 |
+
ns = self.noise_schedule
|
568 |
+
alpha_t = ns.marginal_alpha(t)
|
569 |
+
sigma_t = ns.marginal_std(t)
|
570 |
+
lambda_t = ns.marginal_lambda(t)
|
571 |
+
alpha_prev = ns.marginal_alpha(t_prev_list[-1])
|
572 |
+
sigma_prev = ns.marginal_std(t_prev_list[-1])
|
573 |
+
gradient_part = torch.zeros_like(x)
|
574 |
+
h = lambda_t - ns.marginal_lambda(t_prev_list[-1])
|
575 |
+
lambda_list = [ns.marginal_lambda(t_prev_list[-(i + 1)]) for i in range(order)]
|
576 |
+
gradient_coefficients = self.get_coefficients_fn(order, ns.marginal_lambda(t_prev_list[-1]), lambda_t,
|
577 |
+
lambda_list, tau)
|
578 |
+
|
579 |
+
for i in range(order):
|
580 |
+
if self.predict_x0:
|
581 |
+
gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
|
582 |
+
i] * model_prev_list[-(i + 1)]
|
583 |
+
else:
|
584 |
+
gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]
|
585 |
+
|
586 |
+
if self.predict_x0:
|
587 |
+
noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
|
588 |
+
else:
|
589 |
+
noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
|
590 |
+
|
591 |
+
if self.predict_x0:
|
592 |
+
x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part
|
593 |
+
else:
|
594 |
+
x_t = (alpha_t / alpha_prev) * x + gradient_part + noise_part
|
595 |
+
|
596 |
+
return x_t
|
597 |
+
|
598 |
+
def adams_moulton_update(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
|
599 |
+
"""
|
600 |
+
SA-Corrector, without the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
|
601 |
+
"""
|
602 |
+
|
603 |
+
assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"
|
604 |
+
|
605 |
+
# get noise schedule
|
606 |
+
ns = self.noise_schedule
|
607 |
+
alpha_t = ns.marginal_alpha(t)
|
608 |
+
sigma_t = ns.marginal_std(t)
|
609 |
+
lambda_t = ns.marginal_lambda(t)
|
610 |
+
alpha_prev = ns.marginal_alpha(t_prev_list[-1])
|
611 |
+
sigma_prev = ns.marginal_std(t_prev_list[-1])
|
612 |
+
gradient_part = torch.zeros_like(x)
|
613 |
+
h = lambda_t - ns.marginal_lambda(t_prev_list[-1])
|
614 |
+
t_list = t_prev_list + [t]
|
615 |
+
lambda_list = [ns.marginal_lambda(t_list[-(i + 1)]) for i in range(order)]
|
616 |
+
gradient_coefficients = self.get_coefficients_fn(order, ns.marginal_lambda(t_prev_list[-1]), lambda_t,
|
617 |
+
lambda_list, tau)
|
618 |
+
|
619 |
+
for i in range(order):
|
620 |
+
if self.predict_x0:
|
621 |
+
gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
|
622 |
+
i] * model_prev_list[-(i + 1)]
|
623 |
+
else:
|
624 |
+
gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]
|
625 |
+
|
626 |
+
if self.predict_x0:
|
627 |
+
noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
|
628 |
+
else:
|
629 |
+
noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
|
630 |
+
|
631 |
+
if self.predict_x0:
|
632 |
+
x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part
|
633 |
+
else:
|
634 |
+
x_t = (alpha_t / alpha_prev) * x + gradient_part + noise_part
|
635 |
+
|
636 |
+
return x_t
|
637 |
+
|
638 |
+
def adams_bashforth_update_few_steps(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
|
639 |
+
"""
|
640 |
+
SA-Predictor, with the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
|
641 |
+
"""
|
642 |
+
|
643 |
+
assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"
|
644 |
+
|
645 |
+
# get noise schedule
|
646 |
+
ns = self.noise_schedule
|
647 |
+
alpha_t = ns.marginal_alpha(t)
|
648 |
+
sigma_t = ns.marginal_std(t)
|
649 |
+
lambda_t = ns.marginal_lambda(t)
|
650 |
+
alpha_prev = ns.marginal_alpha(t_prev_list[-1])
|
651 |
+
sigma_prev = ns.marginal_std(t_prev_list[-1])
|
652 |
+
gradient_part = torch.zeros_like(x)
|
653 |
+
h = lambda_t - ns.marginal_lambda(t_prev_list[-1])
|
654 |
+
lambda_list = [ns.marginal_lambda(t_prev_list[-(i + 1)]) for i in range(order)]
|
655 |
+
gradient_coefficients = self.get_coefficients_fn(order, ns.marginal_lambda(t_prev_list[-1]), lambda_t,
|
656 |
+
lambda_list, tau)
|
657 |
+
|
658 |
+
if self.predict_x0:
|
659 |
+
if order == 2: ## if order = 2 we do a modification that does not influence the convergence order similar to unipc. Note: This is used only for few steps sampling.
|
660 |
+
# The added term is O(h^3). Empirically we find it will slightly improve the image quality.
|
661 |
+
# ODE case
|
662 |
+
# gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
|
663 |
+
# gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2]))
|
664 |
+
gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
|
665 |
+
h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
|
666 |
+
(1 + tau ** 2) ** 2)) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(
|
667 |
+
t_prev_list[-2]))
|
668 |
+
gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
|
669 |
+
h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
|
670 |
+
(1 + tau ** 2) ** 2)) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(
|
671 |
+
t_prev_list[-2]))
|
672 |
+
|
673 |
+
for i in range(order):
|
674 |
+
if self.predict_x0:
|
675 |
+
gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
|
676 |
+
i] * model_prev_list[-(i + 1)]
|
677 |
+
else:
|
678 |
+
gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]
|
679 |
+
|
680 |
+
if self.predict_x0:
|
681 |
+
noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
|
682 |
+
else:
|
683 |
+
noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
|
684 |
+
|
685 |
+
if self.predict_x0:
|
686 |
+
x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part
|
687 |
+
else:
|
688 |
+
x_t = (alpha_t / alpha_prev) * x + gradient_part + noise_part
|
689 |
+
|
690 |
+
return x_t
|
691 |
+
|
692 |
+
def adams_moulton_update_few_steps(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
|
693 |
+
"""
|
694 |
+
SA-Corrector, without the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
|
695 |
+
"""
|
696 |
+
|
697 |
+
assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"
|
698 |
+
|
699 |
+
# get noise schedule
|
700 |
+
ns = self.noise_schedule
|
701 |
+
alpha_t = ns.marginal_alpha(t)
|
702 |
+
sigma_t = ns.marginal_std(t)
|
703 |
+
lambda_t = ns.marginal_lambda(t)
|
704 |
+
alpha_prev = ns.marginal_alpha(t_prev_list[-1])
|
705 |
+
sigma_prev = ns.marginal_std(t_prev_list[-1])
|
706 |
+
gradient_part = torch.zeros_like(x)
|
707 |
+
h = lambda_t - ns.marginal_lambda(t_prev_list[-1])
|
708 |
+
t_list = t_prev_list + [t]
|
709 |
+
lambda_list = [ns.marginal_lambda(t_list[-(i + 1)]) for i in range(order)]
|
710 |
+
gradient_coefficients = self.get_coefficients_fn(order, ns.marginal_lambda(t_prev_list[-1]), lambda_t,
|
711 |
+
lambda_list, tau)
|
712 |
+
|
713 |
+
if self.predict_x0:
|
714 |
+
if order == 2: ## if order = 2 we do a modification that does not influence the convergence order similar to UniPC. Note: This is used only for few steps sampling.
|
715 |
+
# The added term is O(h^3). Empirically we find it will slightly improve the image quality.
|
716 |
+
# ODE case
|
717 |
+
# gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h)
|
718 |
+
# gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h)
|
719 |
+
gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
|
720 |
+
h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
|
721 |
+
(1 + tau ** 2) ** 2 * h))
|
722 |
+
gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
|
723 |
+
h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
|
724 |
+
(1 + tau ** 2) ** 2 * h))
|
725 |
+
|
726 |
+
for i in range(order):
|
727 |
+
if self.predict_x0:
|
728 |
+
gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
|
729 |
+
i] * model_prev_list[-(i + 1)]
|
730 |
+
else:
|
731 |
+
gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]
|
732 |
+
|
733 |
+
if self.predict_x0:
|
734 |
+
noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
|
735 |
+
else:
|
736 |
+
noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
|
737 |
+
|
738 |
+
if self.predict_x0:
|
739 |
+
x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part
|
740 |
+
else:
|
741 |
+
x_t = (alpha_t / alpha_prev) * x + gradient_part + noise_part
|
742 |
+
|
743 |
+
return x_t
|
744 |
+
|
745 |
+
    def sample_few_steps(self, x, tau, steps=5, t_start=None, t_end=None, skip_type='time', skip_order=1,
                         predictor_order=3, corrector_order=4, pc_mode='PEC', return_intermediate=False):
        """
        For the PC mode, please refer to the wiki page
        https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
        'PEC' needs one model evaluation per step, while 'PECE' needs two model evaluations per step.
        We recommend pc_mode='PEC' when the number of NFEs is limited; 'PECE' mode is only for testing with sufficient NFEs.
        """

        skip_first_step = False
        skip_final_step = True
        lower_order_final = True
        denoise_to_zero = False

        assert pc_mode in ['PEC', 'PECE'], 'Predictor-corrector mode only supports PEC and PECE'
        t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of the betas array"

        device = x.device
        intermediates = []
        with torch.no_grad():
            assert steps >= max(predictor_order, corrector_order - 1)
            timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, order=skip_order,
                                            device=device)
            assert timesteps.shape[0] - 1 == steps
            # Init the initial values.
            step = 0
            t = timesteps[step]
            noise = torch.randn_like(x)
            t_prev_list = [t]
            # do not evaluate if skip_first_step
            if skip_first_step:
                if self.predict_x0:
                    alpha_t = self.noise_schedule.marginal_alpha(t)
                    sigma_t = self.noise_schedule.marginal_std(t)
                    model_prev_list = [(1 - sigma_t) / alpha_t * x]
                else:
                    model_prev_list = [x]
            else:
                model_prev_list = [self.model_fn(x, t)]

            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)

            # determine the first several values
            for step in tqdm(range(1, max(predictor_order, corrector_order - 1))):

                t = timesteps[step]
                predictor_order_used = min(predictor_order, step)
                corrector_order_used = min(corrector_order, step + 1)
                noise = torch.randn_like(x)
                # predictor step
                x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=tau(t),
                                                            model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                            noise=noise, t=t)
                # evaluation step
                model_x = self.model_fn(x_p, t)

                # update model_list
                model_prev_list.append(model_x)
                # corrector step
                if corrector_order > 0:
                    x = self.adams_moulton_update_few_steps(order=corrector_order_used, x=x, tau=tau(t),
                                                            model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                            noise=noise, t=t)
                else:
                    x = x_p

                # evaluation step if correction and mode = pece
                if corrector_order > 0 and pc_mode == 'PECE':
                    model_x = self.model_fn(x, t)
                    del model_prev_list[-1]
                    model_prev_list.append(model_x)

                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)

                t_prev_list.append(t)

            for step in tqdm(range(max(predictor_order, corrector_order - 1), steps + 1)):
                if lower_order_final:
                    predictor_order_used = min(predictor_order, steps - step + 1)
                    corrector_order_used = min(corrector_order, steps - step + 2)
                else:
                    predictor_order_used = predictor_order
                    corrector_order_used = corrector_order
                t = timesteps[step]
                noise = torch.randn_like(x)

                # predictor step
                if skip_final_step and step == steps and not denoise_to_zero:
                    x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=0,
                                                                model_prev_list=model_prev_list,
                                                                t_prev_list=t_prev_list, noise=noise, t=t)
                else:
                    x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=tau(t),
                                                                model_prev_list=model_prev_list,
                                                                t_prev_list=t_prev_list, noise=noise, t=t)

                # evaluation step
                # do not evaluate if skip_final_step and step = steps
                if not skip_final_step or step < steps:
                    model_x = self.model_fn(x_p, t)

                # update model_list
                # do not update if skip_final_step and step = steps
                if not skip_final_step or step < steps:
                    model_prev_list.append(model_x)

                # corrector step
                # do not correct if skip_final_step and step = steps
                if corrector_order > 0 and (not skip_final_step or step < steps):
                    x = self.adams_moulton_update_few_steps(order=corrector_order_used, x=x, tau=tau(t),
                                                            model_prev_list=model_prev_list,
                                                            t_prev_list=t_prev_list, noise=noise, t=t)
                else:
                    x = x_p

                # evaluation step if mode = pece and step != steps
                if corrector_order > 0 and (pc_mode == 'PECE' and step < steps):
                    model_x = self.model_fn(x, t)
                    del model_prev_list[-1]
                    model_prev_list.append(model_x)

                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)

                t_prev_list.append(t)
                del model_prev_list[0]

            if denoise_to_zero:
                t = torch.ones((1,)).to(device) * t_0
                x = self.denoise_to_zero_fn(x, t)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step + 1)
                if return_intermediate:
                    intermediates.append(x)
        return (x, intermediates) if return_intermediate else x

    def sample_more_steps(self, x, tau, steps=20, t_start=None, t_end=None, skip_type='time', skip_order=1,
                          predictor_order=3, corrector_order=4, pc_mode='PEC', return_intermediate=False):
        """
        For the PC mode, please refer to the wiki page
        https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
        'PEC' needs one model evaluation per step, while 'PECE' needs two model evaluations per step.
        We recommend pc_mode='PEC' when the number of NFEs is limited; 'PECE' mode is only for testing with sufficient NFEs.
        """

        skip_first_step = False
        skip_final_step = False
        lower_order_final = True
        denoise_to_zero = True

        assert pc_mode in ['PEC', 'PECE'], 'Predictor-corrector mode only supports PEC and PECE'
        t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of the betas array"

        device = x.device
        intermediates = []
        with torch.no_grad():
            assert steps >= max(predictor_order, corrector_order - 1)
            timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, order=skip_order,
                                            device=device)
            assert timesteps.shape[0] - 1 == steps
            # Init the initial values.
            step = 0
            t = timesteps[step]
            noise = torch.randn_like(x)
            t_prev_list = [t]
            # do not evaluate if skip_first_step
            if skip_first_step:
                if self.predict_x0:
                    alpha_t = self.noise_schedule.marginal_alpha(t)
                    sigma_t = self.noise_schedule.marginal_std(t)
                    model_prev_list = [(1 - sigma_t) / alpha_t * x]
                else:
                    model_prev_list = [x]
            else:
                model_prev_list = [self.model_fn(x, t)]

            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)

            # determine the first several values
            for step in tqdm(range(1, max(predictor_order, corrector_order - 1))):

                t = timesteps[step]
                predictor_order_used = min(predictor_order, step)
                corrector_order_used = min(corrector_order, step + 1)
                noise = torch.randn_like(x)
                # predictor step
                x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=tau(t),
                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                  noise=noise, t=t)
                # evaluation step
                model_x = self.model_fn(x_p, t)

                # update model_list
                model_prev_list.append(model_x)
                # corrector step
                if corrector_order > 0:
                    x = self.adams_moulton_update(order=corrector_order_used, x=x, tau=tau(t),
                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                  noise=noise, t=t)
                else:
                    x = x_p

                # evaluation step if mode = pece
                if corrector_order > 0 and pc_mode == 'PECE':
                    model_x = self.model_fn(x, t)
                    del model_prev_list[-1]
                    model_prev_list.append(model_x)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)

                t_prev_list.append(t)

            for step in tqdm(range(max(predictor_order, corrector_order - 1), steps + 1)):
                if lower_order_final:
                    predictor_order_used = min(predictor_order, steps - step + 1)
                    corrector_order_used = min(corrector_order, steps - step + 2)
                else:
                    predictor_order_used = predictor_order
                    corrector_order_used = corrector_order
                t = timesteps[step]
                noise = torch.randn_like(x)

                # predictor step
                if skip_final_step and step == steps and not denoise_to_zero:
                    x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=0,
                                                      model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                      noise=noise, t=t)
                else:
                    x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=tau(t),
                                                      model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                      noise=noise, t=t)

                # evaluation step
                # do not evaluate if skip_final_step and step = steps
                if not skip_final_step or step < steps:
                    model_x = self.model_fn(x_p, t)

                # update model_list
                # do not update if skip_final_step and step = steps
                if not skip_final_step or step < steps:
                    model_prev_list.append(model_x)

                # corrector step
                # do not correct if skip_final_step and step = steps
                if corrector_order > 0:
                    if not skip_final_step or step < steps:
                        x = self.adams_moulton_update(order=corrector_order_used, x=x, tau=tau(t),
                                                      model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                      noise=noise, t=t)
                    else:
                        x = x_p
                else:
                    x = x_p

                # evaluation step if mode = pece and step != steps
                if corrector_order > 0 and (pc_mode == 'PECE' and step < steps):
                    model_x = self.model_fn(x, t)
                    del model_prev_list[-1]
                    model_prev_list.append(model_x)

                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)

                t_prev_list.append(t)
                del model_prev_list[0]

            if denoise_to_zero:
                t = torch.ones((1,)).to(device) * t_0
                x = self.denoise_to_zero_fn(x, t)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step + 1)
                if return_intermediate:
                    intermediates.append(x)
        if return_intermediate:
            return x, intermediates
        else:
            return x

    def sample(self, mode, x, tau, steps, t_start=None, t_end=None, skip_type='time', skip_order=1, predictor_order=3,
               corrector_order=4, pc_mode='PEC', return_intermediate=False):
        """
        For the PC mode, please refer to the wiki page
        https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
        'PEC' needs one model evaluation per step, while 'PECE' needs two model evaluations per step.
        We recommend pc_mode='PEC' when the number of NFEs is limited; 'PECE' mode is only for testing with sufficient NFEs.

        'few_steps' mode is recommended. The differences between 'few_steps' and 'more_steps' are as follows:
        1) 'few_steps' does not correct at the final step and does not denoise to zero, while 'more_steps' does both.
           Thus the NFEs for 'few_steps' = steps, while the NFEs for 'more_steps' = steps + 2.
           For most experiments and tasks, we find that these two operations do not help sample quality much.
        2) 'few_steps' uses a rescaling trick as in Appendix D of the SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf.
           We find it slightly improves sample quality, especially with few steps.
        """
        assert mode in ['few_steps', 'more_steps'], "mode must be either 'few_steps' or 'more_steps'"
        if mode == 'few_steps':
            return self.sample_few_steps(x=x, tau=tau, steps=steps, t_start=t_start, t_end=t_end, skip_type=skip_type,
                                         skip_order=skip_order, predictor_order=predictor_order,
                                         corrector_order=corrector_order, pc_mode=pc_mode,
                                         return_intermediate=return_intermediate)
        else:
            return self.sample_more_steps(x=x, tau=tau, steps=steps, t_start=t_start, t_end=t_end, skip_type=skip_type,
                                          skip_order=skip_order, predictor_order=predictor_order,
                                          corrector_order=corrector_order, pc_mode=pc_mode,
                                          return_intermediate=return_intermediate)

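# --- Usage sketch (added for illustration; not part of the original file). ---
# Assumes a NoiseScheduleVP instance `ns` and a wrapped denoiser `model_fn`
# built with `model_wrapper` earlier in this file; the shapes and the tau
# schedule below are illustrative only.
#
#     sasolver = SASolver(model_fn, ns, algorithm_type="data_prediction")
#     tau_t = lambda t: 1.0 if 0.2 <= t <= 0.8 else 0   # inject SDE noise only mid-trajectory
#     x_T = torch.randn(4, 4, 32, 32)                   # start from pure noise
#     x_0 = sasolver.sample(mode='few_steps', x=x_T, tau=tau_t, steps=20,
#                           predictor_order=2, corrector_order=2, pc_mode='PEC')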
#############################################################
# other utility functions
#############################################################

def interpolate_fn(x, xp, yp):
    """
    A piecewise linear function y = f(x), using xp and yp as keypoints.
    We implement f(x) in a differentiable way (i.e. applicable for autograd).
    The function f(x) is well-defined on the whole x-axis. (For x beyond the bounds of xp, we use the outermost
    points of xp to define the linear function.)
    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    N, K = x.shape[0], xp.shape[1]
    all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
    sorted_all_x, x_indices = torch.sort(all_x, dim=2)
    x_idx = torch.argmin(x_indices, dim=2)
    cand_start_idx = x_idx - 1
    start_idx = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(1, device=x.device),
        torch.where(
            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
        ),
    )
    end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
    start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
    end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
    start_idx2 = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(0, device=x.device),
        torch.where(
            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
        ),
    )
    y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
    start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
    end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
    cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
    return cand

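# --- Worked example (added for illustration; not part of the original file). ---
# One channel, three keypoints; queries inside and beyond the keypoint range:
#
#     x = torch.tensor([[0.5], [2.5]])        # [N=2, C=1] query points
#     xp = torch.tensor([[0.0, 1.0, 2.0]])    # [C=1, K=3] keypoint x's
#     yp = torch.tensor([[0.0, 10.0, 20.0]])  # [C=1, K=3] keypoint y's
#     interpolate_fn(x, xp, yp)               # -> tensor([[ 5.], [25.]])
#
# 0.5 lies between keypoints (0, 0) and (1, 10), giving 5; 2.5 is beyond xp,
# so the outermost segment (1, 10)-(2, 20) is extrapolated, giving 25.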

def expand_dims(v, dims):
    """
    Expand the tensor `v` to the dimension `dims`.
    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: an `int`.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] whose total number of dimensions is `dims`.
    """
    return v[(...,) + (None,) * (dims - 1)]
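# --- Example (added for illustration; not part of the original file). ---
# Broadcasting a per-sample scalar over image dimensions:
#
#     v = torch.randn(8)          # shape [8]
#     expand_dims(v, 4).shape     # -> torch.Size([8, 1, 1, 1])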
DiT_VAE/diffusion/model/timestep_sampler.py
ADDED
@@ -0,0 +1,150 @@
# Modified from OpenAI's diffusion repos
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

from abc import ABC, abstractmethod

import numpy as np
import torch as th
import torch.distributed as dist


def create_named_schedule_sampler(name, diffusion):
    """
    Create a ScheduleSampler from a library of pre-defined samplers.
    :param name: the name of the sampler.
    :param diffusion: the diffusion object to sample for.
    """
    if name == "uniform":
        return UniformSampler(diffusion)
    elif name == "loss-second-moment":
        return LossSecondMomentResampler(diffusion)
    else:
        raise NotImplementedError(f"unknown schedule sampler: {name}")

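# --- Usage sketch (added for illustration; not part of the original file). ---
# `diffusion` is any object exposing `num_timesteps` (e.g. the IDDPM object
# built elsewhere in this repo); the loss-weighting line is illustrative.
#
#     sampler = create_named_schedule_sampler("loss-second-moment", diffusion)
#     t, weights = sampler.sample(batch_size=16, device="cuda")
#     # loss = (weights * per_timestep_losses).mean()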
class ScheduleSampler(ABC):
    """
    A distribution over timesteps in the diffusion process, intended to reduce
    variance of the objective.
    By default, samplers perform unbiased importance sampling, in which the
    objective's mean is unchanged.
    However, subclasses may override sample() to change how the resampled
    terms are reweighted, allowing for actual changes in the objective.
    """

    @abstractmethod
    def weights(self):
        """
        Get a numpy array of weights, one per diffusion step.
        The weights needn't be normalized, but must be positive.
        """

    def sample(self, batch_size, device):
        """
        Importance-sample timesteps for a batch.
        :param batch_size: the number of timesteps.
        :param device: the torch device to save to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a tensor of timestep indices.
                 - weights: a tensor of weights to scale the resulting losses.
        """
        w = self.weights()
        p = w / np.sum(w)
        indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
        indices = th.from_numpy(indices_np).long().to(device)
        weights_np = 1 / (len(p) * p[indices_np])
        weights = th.from_numpy(weights_np).float().to(device)
        return indices, weights


class UniformSampler(ScheduleSampler):
    def __init__(self, diffusion):
        self.diffusion = diffusion
        self._weights = np.ones([diffusion.num_timesteps])

    def weights(self):
        return self._weights

class LossAwareSampler(ScheduleSampler):
    def update_with_local_losses(self, local_ts, local_losses):
        """
        Update the reweighting using losses from a model.
        Call this method from each rank with a batch of timesteps and the
        corresponding losses for each of those timesteps.
        This method will perform synchronization to make sure all of the ranks
        maintain the exact same reweighting.
        :param local_ts: an integer Tensor of timesteps.
        :param local_losses: a 1D Tensor of losses.
        """
        batch_sizes = [
            th.tensor([0], dtype=th.int32, device=local_ts.device)
            for _ in range(dist.get_world_size())
        ]
        dist.all_gather(
            batch_sizes,
            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
        )

        # Pad all_gather batches to be the maximum batch size.
        batch_sizes = [x.item() for x in batch_sizes]
        max_bs = max(batch_sizes)

        timestep_batches = [th.zeros(max_bs, device=local_ts.device) for _ in batch_sizes]
        loss_batches = [th.zeros(max_bs, device=local_losses.device) for _ in batch_sizes]
        dist.all_gather(timestep_batches, local_ts)
        dist.all_gather(loss_batches, local_losses)
        timesteps = [
            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
        ]
        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
        self.update_with_all_losses(timesteps, losses)

    @abstractmethod
    def update_with_all_losses(self, ts, losses):
        """
        Update the reweighting using losses from a model.
        Sub-classes should override this method to update the reweighting
        using losses from the model.
        This method directly updates the reweighting without synchronizing
        between workers. It is called by update_with_local_losses from all
        ranks with identical arguments. Thus, it should have deterministic
        behavior to maintain state across workers.
        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """

class LossSecondMomentResampler(LossAwareSampler):
    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros(
            [diffusion.num_timesteps, history_per_term], dtype=np.float64
        )
        # Fixed: `np.int` was removed in NumPy 1.24; use a concrete integer dtype.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)

    def weights(self):
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
        weights /= np.sum(weights)
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        return (self._loss_counts == self.history_per_term).all()
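# --- Sanity-check sketch (added for illustration; not part of the original file). ---
# The weights returned by `ScheduleSampler.sample` are 1 / (T * p[t]), so the
# reweighted objective E[w * loss_t] matches the uniform-average objective for
# any sampling distribution p:
#
#     p = np.array([0.1, 0.2, 0.3, 0.4])
#     w = 1 / (len(p) * p)
#     assert np.isclose((p * w).sum(), 1.0)   # unbiasedness of the reweighting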
DiT_VAE/diffusion/model/utils.py
ADDED
@@ -0,0 +1,510 @@
import os
import sys
import torch.nn as nn
from torch.utils.checkpoint import checkpoint, checkpoint_sequential
import torch.nn.functional as F
import torch
import torch.distributed as dist
import re
import math
from collections.abc import Iterable
from itertools import repeat
from torchvision import transforms as T
import random
from PIL import Image


def _ntuple(n):
    def parse(x):
        if isinstance(x, Iterable) and not isinstance(x, str):
            return x
        return tuple(repeat(x, n))
    return parse


to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)

def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
    assert isinstance(model, nn.Module)

    def set_attr(module):
        module.grad_checkpointing = True
        module.fp32_attention = use_fp32_attention
        module.grad_checkpointing_step = gc_step
    model.apply(set_attr)


def auto_grad_checkpoint(module, *args, **kwargs):
    if getattr(module, 'grad_checkpointing', False):
        if not isinstance(module, Iterable):
            return checkpoint(module, *args, **kwargs)
        gc_step = module[0].grad_checkpointing_step
        return checkpoint_sequential(module, gc_step, *args, **kwargs)
    return module(*args, **kwargs)

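# --- Usage sketch (added for illustration; not part of the original file). ---
# `MyTransformer` is a hypothetical nn.Module whose forward routes its blocks
# through `auto_grad_checkpoint` to trade compute for activation memory:
#
#     model = MyTransformer()
#     set_grad_checkpoint(model, gc_step=2)   # tags every submodule
#     # inside MyTransformer.forward:
#     #     x = auto_grad_checkpoint(self.blocks, x)   # Sequential -> checkpoint_sequential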
# Note: this intentionally overrides the `checkpoint_sequential` imported above
# so that extra positional args can be forwarded to each function in the chain.
def checkpoint_sequential(functions, step, input, *args, **kwargs):

    # Hack for keyword-only parameter in a python 2.7-compliant way
    preserve = kwargs.pop('preserve_rng_state', True)
    if kwargs:
        raise ValueError("Unexpected keyword arguments: " + ",".join(kwargs))

    def run_function(start, end, functions):
        def forward(input):
            for j in range(start, end + 1):
                input = functions[j](input, *args)
            return input
        return forward

    if isinstance(functions, torch.nn.Sequential):
        functions = list(functions.children())

    # the last chunk has to be non-volatile
    end = -1
    segment = len(functions) // step
    for start in range(0, step * (segment - 1), step):
        end = start + step - 1
        input = checkpoint(run_function(start, end, functions), input, preserve_rng_state=preserve)
    return run_function(end + 1, len(functions) - 1, functions)(input)

def window_partition(x, window_size):
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    if pad_h > 0 or pad_w > 0:
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)


def window_unpartition(windows, window_size, pad_hw, hw):
    """
    Window unpartition into original sequences and remove padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw
    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    if Hp > H or Wp > W:
        x = x[:, :H, :W, :].contiguous()
    return x

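# --- Round-trip example (added for illustration; not part of the original file). ---
# Partition pads H and W up to a multiple of the window size; unpartition crops
# the padding back off, so the round trip is exact:
#
#     x = torch.randn(2, 14, 14, 96)                            # [B, H, W, C]
#     windows, (Hp, Wp) = window_partition(x, window_size=7)    # -> [2*4, 7, 7, 96], (14, 14)
#     y = window_unpartition(windows, 7, (Hp, Wp), (14, 14))
#     assert torch.equal(x, y)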
def get_rel_pos(q_size, k_size, rel_pos):
    """
    Get relative positional embeddings according to the relative positions of
    query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    # Interpolate rel pos if needed.
    if rel_pos.shape[0] != max_rel_dist:
        # Interpolate rel pos.
        rel_pos_resized = F.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]

def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
    Rw = get_rel_pos(q_w, k_w, rel_pos_w)

    B, _, dim = q.shape
    r_q = q.reshape(B, q_h, q_w, dim)
    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)

    attn = (
        attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
    ).view(B, q_h * q_w, k_h * k_w)

    return attn


def mean_flat(tensor):
    return tensor.mean(dim=list(range(1, tensor.ndim)))


#################################################################################
#                          Token Masking and Unmasking                          #
#################################################################################
def get_mask(batch, length, mask_ratio, device, mask_type=None, data_info=None, extra_len=0):
    """
    Get the binary mask for the input sequence.
    Args:
        - batch: batch size
        - length: sequence length
        - mask_ratio: ratio of tokens to mask
        - data_info: dictionary with info for reconstruction
    return:
        mask_dict with following keys:
        - mask: binary mask, 0 is keep, 1 is remove
        - ids_keep: indices of tokens to keep
        - ids_restore: indices to restore the original order
    """
    assert mask_type in ['random', 'fft', 'laplacian', 'group']
    mask = torch.ones([batch, length], device=device)
    len_keep = int(length * (1 - mask_ratio)) - extra_len

    if mask_type in ['random', 'group']:
        noise = torch.rand(batch, length, device=device)  # noise in [0, 1]
        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1)
        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        ids_removed = ids_shuffle[:, len_keep:]

    elif mask_type in ['fft', 'laplacian']:
        if 'strength' in data_info:
            strength = data_info['strength']

        else:
            N = data_info['N'][0]
            img = data_info['ori_img']
            # Get the size of the original image.
            _, C, H, W = img.shape
            if mask_type == 'fft':
                # Reshape the image into patches: (3, H/N, N, W/N, N).
                reshaped_image = img.reshape((batch, -1, H // N, N, W // N, N))
                fft_image = torch.fft.fftn(reshaped_image, dim=(3, 5))
                # Take the absolute value and sum it to get the frequency strength.
                strength = torch.sum(torch.abs(fft_image), dim=(1, 3, 5)).reshape((batch, -1,))
            elif mask_type == 'laplacian':  # fixed: previously compared the builtin `type` instead of `mask_type`
                laplacian_kernel = torch.tensor([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=torch.float32).reshape(1, 1, 3, 3)
                laplacian_kernel = laplacian_kernel.repeat(C, 1, 1, 1)
                # Reshape the image into patches: (3, H/N, N, W/N, N).
                reshaped_image = img.reshape(-1, C, H // N, N, W // N, N).permute(0, 2, 4, 1, 3, 5).reshape(-1, C, N, N)
                laplacian_response = F.conv2d(reshaped_image, laplacian_kernel, padding=1, groups=C)
                strength = laplacian_response.sum(dim=[1, 2, 3]).reshape((batch, -1,))

        # Normalize the frequency strength, then sample with torch.multinomial.
        probabilities = strength / (strength.max(dim=1)[0][:, None] + 1e-5)
        ids_shuffle = torch.multinomial(probabilities.clip(1e-5, 1), length, replacement=False)
        ids_keep = ids_shuffle[:, :len_keep]
        ids_restore = torch.argsort(ids_shuffle, dim=1)
        ids_removed = ids_shuffle[:, len_keep:]

    mask[:, :len_keep] = 0
    mask = torch.gather(mask, dim=1, index=ids_restore)

    return {'mask': mask,
            'ids_keep': ids_keep,
            'ids_restore': ids_restore,
            'ids_removed': ids_removed}

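# --- Usage sketch (added for illustration; not part of the original file). ---
# Random token masking for a batch of sequences; with mask_ratio=0.5 and
# length=256, half the tokens are kept:
#
#     md = get_mask(batch=4, length=256, mask_ratio=0.5, device='cpu', mask_type='random')
#     md['ids_keep'].shape       # -> torch.Size([4, 128]): indices of kept tokens
#     md['mask'].sum(dim=1)      # -> 128 per sample (1 marks a removed token)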
def mask_out_token(x, ids_keep, ids_removed=None):
    """
    Mask out the tokens specified by ids_keep.
    Args:
        - x: input sequence, [N, L, D]
        - ids_keep: indices of tokens to keep
    return:
        - x_masked: masked sequence
    """
    N, L, D = x.shape  # batch, length, dim
    x_remain = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
    if ids_removed is not None:
        x_masked = torch.gather(x, dim=1, index=ids_removed.unsqueeze(-1).repeat(1, 1, D))
        return x_remain, x_masked
    else:
        return x_remain


def mask_tokens(x, mask_ratio):
    """
    Perform per-sample random masking by per-sample shuffling.
    Per-sample shuffling is done by argsort random noise.
    x: [N, L, D], sequence
    """
    N, L, D = x.shape  # batch, length, dim
    len_keep = int(L * (1 - mask_ratio))

    noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]

    # sort noise for each sample
    ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
    ids_restore = torch.argsort(ids_shuffle, dim=1)

    # keep the first subset
    ids_keep = ids_shuffle[:, :len_keep]
    x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

    # generate the binary mask: 0 is keep, 1 is remove
    mask = torch.ones([N, L], device=x.device)
    mask[:, :len_keep] = 0
    mask = torch.gather(mask, dim=1, index=ids_restore)

    return x_masked, mask, ids_restore

def unmask_tokens(x, ids_restore, mask_token):
    # x: [N, T, D] if extras == 0 (i.e., no cls token) else x: [N, T+1, D]
    mask_tokens = mask_token.repeat(x.shape[0], ids_restore.shape[1] - x.shape[1], 1)
    x = torch.cat([x, mask_tokens], dim=1)
    x = torch.gather(x, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]))  # unshuffle
    return x


# Parse 'None' to None and others to float value
def parse_float_none(s):
    assert isinstance(s, str)
    return None if s == 'None' else float(s)


#----------------------------------------------------------------------------
# Parse a comma separated list of numbers or ranges and return a list of ints.
# Example: '1,2,5-10' returns [1, 2, 5, 6, 7, 8, 9, 10]

def parse_int_list(s):
    if isinstance(s, list):
        return s
    ranges = []
    range_re = re.compile(r'^(\d+)-(\d+)$')
    for p in s.split(','):
        if m := range_re.match(p):
            ranges.extend(range(int(m.group(1)), int(m.group(2)) + 1))
        else:
            ranges.append(int(p))
    return ranges

def init_processes(fn, args):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = args.master_address
    os.environ['MASTER_PORT'] = str(random.randint(2000, 6000))
    print(f'MASTER_ADDR = {os.environ["MASTER_ADDR"]}')
    print(f'MASTER_PORT = {os.environ["MASTER_PORT"]}')
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl', init_method='env://', rank=args.global_rank, world_size=args.global_size)
    fn(args)
    if args.global_size > 1:
        cleanup()


def mprint(*args, **kwargs):
    """
    Print only from rank 0.
    """
    if dist.get_rank() == 0:
        print(*args, **kwargs)


def cleanup():
    """
    End DDP training.
    """
    dist.barrier()
    mprint("Done!")
    dist.barrier()
    dist.destroy_process_group()


#----------------------------------------------------------------------------
# logging info.
class Logger(object):
    """
    Redirect stderr to stdout, optionally print stdout to a file,
    and optionally force flushing on both stdout and the file.
    """

    def __init__(self, file_name=None, file_mode="w", should_flush=True):
        self.file = None

        if file_name is not None:
            self.file = open(file_name, file_mode)

        self.should_flush = should_flush
        self.stdout = sys.stdout
        self.stderr = sys.stderr

        sys.stdout = self
        sys.stderr = self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def write(self, text):
        """Write text to stdout (and a file) and optionally flush."""
        if len(text) == 0:  # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash
            return

        if self.file is not None:
            self.file.write(text)

        self.stdout.write(text)

        if self.should_flush:
            self.flush()

    def flush(self):
        """Flush written text to both stdout and a file, if open."""
        if self.file is not None:
            self.file.flush()

        self.stdout.flush()

    def close(self):
        """Flush, close possible files, and remove stdout/stderr mirroring."""
        self.flush()

        # if using multiple loggers, prevent closing in wrong order
        if sys.stdout is self:
            sys.stdout = self.stdout
        if sys.stderr is self:
            sys.stderr = self.stderr

        if self.file is not None:
            self.file.close()

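# --- Usage sketch (added for illustration; not part of the original file). ---
# Mirror stdout/stderr to a log file for the duration of a run:
#
#     with Logger(file_name='train.log'):
#         print('this line goes to the console and to train.log')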
class StackedRandomGenerator:
    def __init__(self, device, seeds):
        super().__init__()
        self.generators = [torch.Generator(device).manual_seed(int(seed) % (1 << 32)) for seed in seeds]

    def randn(self, size, **kwargs):
        assert size[0] == len(self.generators)
        return torch.stack([torch.randn(size[1:], generator=gen, **kwargs) for gen in self.generators])

    def randn_like(self, input):
        return self.randn(input.shape, dtype=input.dtype, layout=input.layout, device=input.device)

    def randint(self, *args, size, **kwargs):
        assert size[0] == len(self.generators)
        return torch.stack([torch.randint(*args, size=size[1:], generator=gen, **kwargs) for gen in self.generators])

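# --- Usage sketch (added for illustration; not part of the original file). ---
# One generator per sample makes each image in the batch reproducible from its
# own seed, independent of batch composition:
#
#     seeds = [0, 1, 2, 3]
#     rnd = StackedRandomGenerator('cpu', seeds)
#     latents = rnd.randn([len(seeds), 4, 32, 32], device='cpu')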
def prepare_prompt_ar(prompt, ratios, device='cpu', show=True):
    # get aspect_ratio or ar
    aspect_ratios = re.findall(r"--aspect_ratio\s+(\d+:\d+)", prompt)
    ars = re.findall(r"--ar\s+(\d+:\d+)", prompt)
    custom_hw = re.findall(r"--hw\s+(\d+:\d+)", prompt)
    if show:
        print("aspect_ratios:", aspect_ratios, "ars:", ars, "hws:", custom_hw)
    prompt_clean = prompt.split("--aspect_ratio")[0].split("--ar")[0].split("--hw")[0]
    if len(aspect_ratios) + len(ars) + len(custom_hw) == 0 and show:
        print("Wrong prompt format. Falling back to the default aspect ratio of 1. "
              "Use '--ar h:w' or '--hw h:w' in your prompt to control generation.")
    if len(aspect_ratios) != 0:
        ar = float(aspect_ratios[0].split(':')[0]) / float(aspect_ratios[0].split(':')[1])
    elif len(ars) != 0:
        ar = float(ars[0].split(':')[0]) / float(ars[0].split(':')[1])
    else:
        ar = 1.
    closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
    if len(custom_hw) != 0:
        custom_hw = [float(custom_hw[0].split(':')[0]), float(custom_hw[0].split(':')[1])]
    else:
        custom_hw = ratios[closest_ratio]
    default_hw = ratios[closest_ratio]
    prompt_show = f'prompt: {prompt_clean.strip()}\nSize: --ar {closest_ratio}, --bin hw {ratios[closest_ratio]}, --custom hw {custom_hw}'
    return prompt_clean, prompt_show, torch.tensor(default_hw, device=device)[None], torch.tensor([float(closest_ratio)], device=device)[None], torch.tensor(custom_hw, device=device)[None]

def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int):
    orig_hw = torch.tensor([samples.shape[2], samples.shape[3]], dtype=torch.int)
    custom_hw = torch.tensor([int(new_height), int(new_width)], dtype=torch.int)

    # Note: `.all()` means resizing only happens when *both* dimensions differ
    # from the target; if only one differs, the input is returned unchanged.
    if (orig_hw != custom_hw).all():
        ratio = max(custom_hw[0] / orig_hw[0], custom_hw[1] / orig_hw[1])
        resized_width = int(orig_hw[1] * ratio)
        resized_height = int(orig_hw[0] * ratio)

        transform = T.Compose([
            T.Resize((resized_height, resized_width)),
            T.CenterCrop(custom_hw.tolist())
        ])
        return transform(samples)
    else:
        return samples


def resize_and_crop_img(img: Image, new_width, new_height):
    orig_width, orig_height = img.size

    ratio = max(new_width / orig_width, new_height / orig_height)
    resized_width = int(orig_width * ratio)
    resized_height = int(orig_height * ratio)

    img = img.resize((resized_width, resized_height), Image.LANCZOS)

    left = (resized_width - new_width) / 2
    top = (resized_height - new_height) / 2
    right = (resized_width + new_width) / 2
    bottom = (resized_height + new_height) / 2

    img = img.crop((left, top, right, bottom))

    return img


def mask_feature(emb, mask):
    if emb.shape[0] == 1:
        keep_index = mask.sum().item()
        return emb[:, :, :keep_index, :], keep_index
    else:
        masked_feature = emb * mask[:, None, :, None]
        return masked_feature, emb.shape[2]
DiT_VAE/diffusion/sa_sampler.py
ADDED
@@ -0,0 +1,66 @@
"""SAMPLING ONLY."""

import torch
import numpy as np

from DiT_VAE.diffusion.model.sa_solver import NoiseScheduleVP, model_wrapper, SASolver
from .model import gaussian_diffusion as gd


class SASolverSampler(object):
    def __init__(self, model,
                 noise_schedule="linear",
                 diffusion_steps=1000,
                 device='cpu',
                 ):
        super().__init__()
        self.model = model
        self.device = device
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(device)
        betas = torch.tensor(gd.get_named_beta_schedule(noise_schedule, diffusion_steps))
        alphas = 1.0 - betas
        self.register_buffer('alphas_cumprod', to_torch(np.cumprod(alphas, axis=0)))

    def register_buffer(self, name, attr):
        if type(attr) == torch.Tensor and attr.device != torch.device("cuda"):
            attr = attr.to(torch.device("cuda"))
        setattr(self, name, attr)

    @torch.no_grad()
    def sample(self, S, batch_size, shape, conditioning=None, callback=None, normals_sequence=None,
               img_callback=None, quantize_x0=False, eta=0., mask=None, x0=None, temperature=1.,
               noise_dropout=0., score_corrector=None, corrector_kwargs=None, verbose=True, x_T=None,
               log_every_t=100, unconditional_guidance_scale=1., unconditional_conditioning=None,
               model_kwargs=None, **kwargs):
        if model_kwargs is None:
            model_kwargs = {}
        if conditioning is not None:
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            elif conditioning.shape[0] != batch_size:
                print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)

        device = self.device
        img = torch.randn(size, device=device) if x_T is None else x_T
        ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod)

        model_fn = model_wrapper(
            self.model,
            ns,
            model_type="noise",
            guidance_type="classifier-free",
            condition=conditioning,
            unconditional_condition=unconditional_conditioning,
            guidance_scale=unconditional_guidance_scale,
            model_kwargs=model_kwargs,
        )

        sasolver = SASolver(model_fn, ns, algorithm_type="data_prediction")

        tau_t = lambda t: eta if 0.2 <= t <= 0.8 else 0

        x = sasolver.sample(mode='few_steps', x=img, tau=tau_t, steps=S, skip_type='time', skip_order=1,
                            predictor_order=2, corrector_order=2, pc_mode='PEC', return_intermediate=False)

        return x.to(device), None
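# --- Usage sketch (added for illustration; not part of the original file). ---
# `model` is a noise-prediction network and `cond`/`uncond` are its conditioning
# tensors; the latent shape (C, H, W) is illustrative. eta > 0 injects SDE noise
# only for 0.2 <= t <= 0.8 via the tau schedule above.
#
#     sampler = SASolverSampler(model, noise_schedule="linear", device='cuda')
#     samples, _ = sampler.sample(S=25, batch_size=4, shape=(4, 32, 32),
#                                 conditioning=cond, unconditional_conditioning=uncond,
#                                 unconditional_guidance_scale=4.5, eta=1.0)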
DiT_VAE/diffusion/sa_solver_diffusers.py
ADDED
@@ -0,0 +1,840 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DISCLAIMER: check https://arxiv.org/abs/2309.05019
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py

import math
from typing import List, Optional, Tuple, Union, Callable

import numpy as np
import torch

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.utils.torch_utils import randn_tensor
from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput


# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
    num_diffusion_timesteps,
    max_beta=0.999,
    alpha_transform_type="cosine",
):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
    (1-beta) over time from t = [0,1].

    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
    to that part of the diffusion process.


    Args:
        num_diffusion_timesteps (`int`): the number of betas to produce.
        max_beta (`float`): the maximum beta to use; use values lower than 1 to
                     prevent singularities.
        alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
                     Choose from `cosine` or `exp`
    Returns:
        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
    """
    if alpha_transform_type == "cosine":

        def alpha_bar_fn(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    elif alpha_transform_type == "exp":

        def alpha_bar_fn(t):
            return math.exp(t * -12.0)

    else:
        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
    return torch.tensor(betas, dtype=torch.float32)

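# --- Quick check (added for illustration; not part of the original file). ---
# The cosine schedule yields small betas early and larger ones late, capped at
# max_beta:
#
#     betas = betas_for_alpha_bar(1000)
#     betas.shape                            # -> torch.Size([1000])
#     bool(betas[0] < betas[-1] <= 0.999)    # -> True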
class SASolverScheduler(SchedulerMixin, ConfigMixin):
    """
    `SASolverScheduler` is a fast dedicated high-order solver for diffusion SDEs.

    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
    methods the library implements for all schedulers such as loading and saving.

    Args:
        num_train_timesteps (`int`, defaults to 1000):
            The number of diffusion steps to train the model.
        beta_start (`float`, defaults to 0.0001):
            The starting `beta` value of inference.
        beta_end (`float`, defaults to 0.02):
            The final `beta` value.
        beta_schedule (`str`, defaults to `"linear"`):
            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
        trained_betas (`np.ndarray`, *optional*):
            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
        predictor_order (`int`, defaults to 2):
            The predictor order, which can be `1`, `2`, `3`, or `4`. It is recommended to use `predictor_order=2` for
            guided sampling, and `predictor_order=3` for unconditional sampling.
        corrector_order (`int`, defaults to 2):
            The corrector order, which can be `1`, `2`, `3`, or `4`. It is recommended to use `corrector_order=2` for
            guided sampling, and `corrector_order=3` for unconditional sampling.
        predictor_corrector_mode (`str`, defaults to `PEC`):
            The predictor-corrector mode, which can be `PEC` or `PECE`. It is recommended to use `PEC` for fast
            sampling, and `PECE` for high-quality sampling (PECE needs around twice as many model evaluations as PEC).
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
            Video](https://imagen.research.google/video/paper.pdf) paper).
        thresholding (`bool`, defaults to `False`):
            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
            as Stable Diffusion.
        dynamic_thresholding_ratio (`float`, defaults to 0.995):
            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
        sample_max_value (`float`, defaults to 1.0):
            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
            `algorithm_type="dpmsolver++"`.
        algorithm_type (`str`, defaults to `data_prediction`):
            Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use
            `data_prediction` with `solver_order=2` for guided sampling like in Stable Diffusion.
        lower_order_final (`bool`, defaults to `True`):
            Whether to use lower-order solvers in the final steps.
        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
            the sigmas are determined according to a sequence of noise levels {σi}.
        lambda_min_clipped (`float`, defaults to `-inf`):
            Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
            cosine (`squaredcos_cap_v2`) noise schedule.
        variance_type (`str`, *optional*):
            Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
            contains the predicted Gaussian variance.
        timestep_spacing (`str`, defaults to `"linspace"`):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
            An offset added to the inference steps. You can use a combination of `offset=1` and
            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
            Diffusion.
    """

    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
    order = 1

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        beta_start: float = 0.0001,
        beta_end: float = 0.02,
        beta_schedule: str = "linear",
        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
        predictor_order: int = 2,
        corrector_order: int = 2,
        predictor_corrector_mode: str = 'PEC',
        prediction_type: str = "epsilon",
        tau_func: Callable = lambda t: 1 if t >= 200 and t <= 800 else 0,
        thresholding: bool = False,
        dynamic_thresholding_ratio: float = 0.995,
        sample_max_value: float = 1.0,
        algorithm_type: str = "data_prediction",
        lower_order_final: bool = True,
        use_karras_sigmas: Optional[bool] = False,
        lambda_min_clipped: float = -float("inf"),
        variance_type: Optional[str] = None,
        timestep_spacing: str = "linspace",
        steps_offset: int = 0,
    ):
        if trained_betas is not None:
            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
        elif beta_schedule == "linear":
            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
        elif beta_schedule == "scaled_linear":
            # this schedule is very specific to the latent diffusion model.
            self.betas = (
                torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps, dtype=torch.float32) ** 2
            )
        elif beta_schedule == "squaredcos_cap_v2":
            # Glide cosine schedule
            self.betas = betas_for_alpha_bar(num_train_timesteps)
        else:
            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")

        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        # Currently we only support VP-type noise schedule
        self.alpha_t = torch.sqrt(self.alphas_cumprod)
        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)

        # standard deviation of the initial noise distribution
        self.init_noise_sigma = 1.0

        if algorithm_type not in ["data_prediction", "noise_prediction"]:
            raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}")

        # setable values
        self.num_inference_steps = None
        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
        self.timesteps = torch.from_numpy(timesteps)
        self.timestep_list = [None] * max(predictor_order, corrector_order - 1)
        self.model_outputs = [None] * max(predictor_order, corrector_order - 1)

        self.tau_func = tau_func
        self.predict_x0 = algorithm_type == "data_prediction"
        self.lower_order_nums = 0
        self.last_sample = None

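    # --- Illustrative note, not part of the original file: `tau_func` controls
    # where stochasticity is injected. tau(t) > 0 makes the update at timestep t
    # an SDE step, tau(t) = 0 a deterministic ODE step; the default enables
    # noise only for t in [200, 800]. A minimal construction sketch:
    #
    #     scheduler = SASolverScheduler(
    #         beta_schedule="linear",
    #         algorithm_type="data_prediction",
    #         tau_func=lambda t: 1.0 if 200 <= t <= 800 else 0.0,
    #     )
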
    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).

        Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model.
            device (`str` or `torch.device`, *optional*):
                The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
        """
        # Clipping the minimum of all lambda(t) for numerical stability.
        # This is critical for the cosine (squaredcos_cap_v2) noise schedule.
        clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
        last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item()

        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
        if self.config.timestep_spacing == "linspace":
            timesteps = (
                np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].copy().astype(np.int64)
            )
        elif self.config.timestep_spacing == "leading":
            step_ratio = last_timestep // (num_inference_steps + 1)
            # creates integer timesteps by multiplying by ratio
            # casting to int to avoid issues when num_inference_step is power of 3
            timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64)
            timesteps += self.config.steps_offset
        elif self.config.timestep_spacing == "trailing":
            step_ratio = self.config.num_train_timesteps / num_inference_steps
            # creates integer timesteps by multiplying by ratio
            # casting to int to avoid issues when num_inference_step is power of 3
            timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
            timesteps -= 1
        else:
            raise ValueError(
                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
            )

        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
        if self.config.use_karras_sigmas:
            log_sigmas = np.log(sigmas)
            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
            timesteps = np.flip(timesteps).copy().astype(np.int64)

        self.sigmas = torch.from_numpy(sigmas)

        # when num_inference_steps == num_train_timesteps, we can end up with
        # duplicates in timesteps.
        _, unique_indices = np.unique(timesteps, return_index=True)
        timesteps = timesteps[np.sort(unique_indices)]

        self.timesteps = torch.from_numpy(timesteps).to(device)

        self.num_inference_steps = len(timesteps)

        self.model_outputs = [
            None,
        ] * max(self.config.predictor_order, self.config.corrector_order - 1)
        self.lower_order_nums = 0
        self.last_sample = None

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
    def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        """
        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
        photorealism as well as better image-text alignment, especially when using very large guidance weights."

        https://arxiv.org/abs/2205.11487
        """
        dtype = sample.dtype
        batch_size, channels, height, width = sample.shape

        if dtype not in (torch.float32, torch.float64):
            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half

        # Flatten sample for doing quantile calculation along each image
        sample = sample.reshape(batch_size, channels * height * width)

        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"

        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
        s = torch.clamp(
            s, min=1, max=self.config.sample_max_value
        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]

        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"

        sample = sample.reshape(batch_size, channels, height, width)
        sample = sample.to(dtype)

        return sample

    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
    def _sigma_to_t(self, sigma, log_sigmas):
        # get log sigma
        log_sigma = np.log(sigma)

        # get distribution
        dists = log_sigma - log_sigmas[:, np.newaxis]

        # get sigmas range
        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
        high_idx = low_idx + 1

        low = log_sigmas[low_idx]
        high = log_sigmas[high_idx]

        # interpolate sigmas
        w = (low - log_sigma) / (low - high)
        w = np.clip(w, 0, 1)

        # transform interpolation to time range
        t = (1 - w) * low_idx + w * high_idx
        t = t.reshape(sigma.shape)
        return t

    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
    def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor:
        """Constructs the noise schedule of Karras et al. (2022)."""

        sigma_min: float = in_sigmas[-1].item()
        sigma_max: float = in_sigmas[0].item()

        rho = 7.0  # 7.0 is the value used in the paper
        ramp = np.linspace(0, 1, num_inference_steps)
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

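    # --- Illustrative helper, not part of the original file: a minimal sketch of
    # how `set_timesteps` is called before sampling. The method name is ours.
    def _demo_set_timesteps(self, num_inference_steps: int = 20):
        self.set_timesteps(num_inference_steps)
        # timesteps run from high noise to low noise and are strictly decreasing
        # (duplicates are removed in `set_timesteps`), e.g. [999, 949, ..., 50]
        # with "linspace" spacing and 1000 training steps
        assert torch.all(self.timesteps[1:] < self.timesteps[:-1])
        return self.timesteps
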
    def convert_model_output(
        self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
    ) -> torch.FloatTensor:
        """
        Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
        designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
        integral of the data prediction model.

        <Tip>

        The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
        prediction and data prediction models.

        </Tip>

        Args:
            model_output (`torch.FloatTensor`):
                The direct output from the learned diffusion model.
            timestep (`int`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.

        Returns:
            `torch.FloatTensor`:
                The converted model output.
        """

        # SA-Solver_data_prediction needs to solve an integral of the data prediction model.
        if self.config.algorithm_type in ["data_prediction"]:
            if self.config.prediction_type == "epsilon":
                # SA-Solver only needs the "mean" output.
                if self.config.variance_type in ["learned", "learned_range"]:
                    model_output = model_output[:, :3]
                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                x0_pred = (sample - sigma_t * model_output) / alpha_t
            elif self.config.prediction_type == "sample":
                x0_pred = model_output
            elif self.config.prediction_type == "v_prediction":
                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                x0_pred = alpha_t * sample - sigma_t * model_output
            else:
                raise ValueError(
                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
                    " `v_prediction` for the SASolverScheduler."
                )

            if self.config.thresholding:
                x0_pred = self._threshold_sample(x0_pred)

            return x0_pred

        # SA-Solver_noise_prediction needs to solve an integral of the noise prediction model.
        elif self.config.algorithm_type in ["noise_prediction"]:
            if self.config.prediction_type == "epsilon":
                # SA-Solver only needs the "mean" output.
                if self.config.variance_type in ["learned", "learned_range"]:
                    epsilon = model_output[:, :3]
                else:
                    epsilon = model_output
            elif self.config.prediction_type == "sample":
                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                epsilon = (sample - alpha_t * model_output) / sigma_t
            elif self.config.prediction_type == "v_prediction":
                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                epsilon = alpha_t * model_output + sigma_t * sample
            else:
                raise ValueError(
                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
                    " `v_prediction` for the SASolverScheduler."
                )

            if self.config.thresholding:
                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                x0_pred = (sample - sigma_t * epsilon) / alpha_t
                x0_pred = self._threshold_sample(x0_pred)
                epsilon = (sample - alpha_t * x0_pred) / sigma_t

            return epsilon

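    # --- Illustrative helper, not part of the original file: the two
    # parameterizations handled above are tied by x_t = alpha_t * x0 + sigma_t * eps,
    # so converting an epsilon prediction to x0 and back must round-trip.
    def _demo_prediction_round_trip(self, sample: torch.FloatTensor, eps: torch.FloatTensor, timestep: int):
        alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
        x0 = (sample - sigma_t * eps) / alpha_t       # epsilon -> data prediction
        eps_back = (sample - alpha_t * x0) / sigma_t  # data -> epsilon prediction
        return torch.allclose(eps, eps_back, atol=1e-5)
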
    def get_coefficients_exponential_negative(self, order, interval_start, interval_end):
        """
        Calculate the integral of exp(-x) * x^order dx from interval_start to interval_end
        """
        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"

        if order == 0:
            return torch.exp(-interval_end) * (torch.exp(interval_end - interval_start) - 1)
        elif order == 1:
            return torch.exp(-interval_end) * (
                (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1))
        elif order == 2:
            return torch.exp(-interval_end) * (
                (interval_start ** 2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start) - (
                    interval_end ** 2 + 2 * interval_end + 2))
        elif order == 3:
            return torch.exp(-interval_end) * (
                (interval_start ** 3 + 3 * interval_start ** 2 + 6 * interval_start + 6) * torch.exp(
                    interval_end - interval_start) - (interval_end ** 3 + 3 * interval_end ** 2 + 6 * interval_end + 6))

    def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
        """
        Calculate the integral of exp(x(1+tau^2)) * x^order dx from interval_start to interval_end
        """
        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"

        # after change of variable (cov)
        interval_end_cov = (1 + tau ** 2) * interval_end
        interval_start_cov = (1 + tau ** 2) * interval_start

        if order == 0:
            return torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / (
                (1 + tau ** 2))
        elif order == 1:
            return torch.exp(interval_end_cov) * ((interval_end_cov - 1) - (interval_start_cov - 1) * torch.exp(
                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 2)
        elif order == 2:
            return torch.exp(interval_end_cov) * ((interval_end_cov ** 2 - 2 * interval_end_cov + 2) - (
                interval_start_cov ** 2 - 2 * interval_start_cov + 2) * torch.exp(
                -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 3)
        elif order == 3:
            return torch.exp(interval_end_cov) * (
                (interval_end_cov ** 3 - 3 * interval_end_cov ** 2 + 6 * interval_end_cov - 6) - (
                    interval_start_cov ** 3 - 3 * interval_start_cov ** 2 + 6 * interval_start_cov - 6) * torch.exp(
                    -(interval_end_cov - interval_start_cov))) / ((1 + tau ** 2) ** 4)

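    # --- Illustrative helper, not part of the original file: a numerical sanity
    # check of the order-0 closed form above, \int_a^b e^{-x} dx = e^{-b}(e^{b-a} - 1),
    # against simple trapezoidal quadrature.
    def _demo_check_exponential_negative(self, a: float = 0.5, b: float = 1.5):
        closed_form = self.get_coefficients_exponential_negative(0, torch.tensor(a), torch.tensor(b))
        xs = torch.linspace(a, b, 10001)
        numeric = torch.trapz(torch.exp(-xs), xs)  # trapezoidal rule on a fine grid
        return torch.allclose(closed_form, numeric, atol=1e-4)
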
    def lagrange_polynomial_coefficient(self, order, lambda_list):
        """
        Calculate the coefficients of the Lagrange basis polynomials
        """

        assert order in [0, 1, 2, 3]
        assert order == len(lambda_list) - 1
        if order == 0:
            return [[1]]
        elif order == 1:
            return [[1 / (lambda_list[0] - lambda_list[1]), -lambda_list[1] / (lambda_list[0] - lambda_list[1])],
                    [1 / (lambda_list[1] - lambda_list[0]), -lambda_list[0] / (lambda_list[1] - lambda_list[0])]]
        elif order == 2:
            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2])
            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2])
            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1])
            return [[1 / denominator1,
                     (-lambda_list[1] - lambda_list[2]) / denominator1,
                     lambda_list[1] * lambda_list[2] / denominator1],

                    [1 / denominator2,
                     (-lambda_list[0] - lambda_list[2]) / denominator2,
                     lambda_list[0] * lambda_list[2] / denominator2],

                    [1 / denominator3,
                     (-lambda_list[0] - lambda_list[1]) / denominator3,
                     lambda_list[0] * lambda_list[1] / denominator3]
                    ]
        elif order == 3:
            denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2]) * (
                lambda_list[0] - lambda_list[3])
            denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2]) * (
                lambda_list[1] - lambda_list[3])
            denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1]) * (
                lambda_list[2] - lambda_list[3])
            denominator4 = (lambda_list[3] - lambda_list[0]) * (lambda_list[3] - lambda_list[1]) * (
                lambda_list[3] - lambda_list[2])
            return [[1 / denominator1,
                     (-lambda_list[1] - lambda_list[2] - lambda_list[3]) / denominator1,
                     (lambda_list[1] * lambda_list[2] + lambda_list[1] * lambda_list[3] + lambda_list[2] * lambda_list[
                         3]) / denominator1,
                     (-lambda_list[1] * lambda_list[2] * lambda_list[3]) / denominator1],

                    [1 / denominator2,
                     (-lambda_list[0] - lambda_list[2] - lambda_list[3]) / denominator2,
                     (lambda_list[0] * lambda_list[2] + lambda_list[0] * lambda_list[3] + lambda_list[2] * lambda_list[
                         3]) / denominator2,
                     (-lambda_list[0] * lambda_list[2] * lambda_list[3]) / denominator2],

                    [1 / denominator3,
                     (-lambda_list[0] - lambda_list[1] - lambda_list[3]) / denominator3,
                     (lambda_list[0] * lambda_list[1] + lambda_list[0] * lambda_list[3] + lambda_list[1] * lambda_list[
                         3]) / denominator3,
                     (-lambda_list[0] * lambda_list[1] * lambda_list[3]) / denominator3],

                    [1 / denominator4,
                     (-lambda_list[0] - lambda_list[1] - lambda_list[2]) / denominator4,
                     (lambda_list[0] * lambda_list[1] + lambda_list[0] * lambda_list[2] + lambda_list[1] * lambda_list[
                         2]) / denominator4,
                     (-lambda_list[0] * lambda_list[1] * lambda_list[2]) / denominator4]
                    ]

    def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
        assert order in [1, 2, 3, 4]
        assert order == len(lambda_list), 'the length of lambda list must be equal to the order'
        coefficients = []
        lagrange_coefficient = self.lagrange_polynomial_coefficient(order - 1, lambda_list)
        for i in range(order):
            coefficient = sum(
                lagrange_coefficient[i][j]
                * self.get_coefficients_exponential_positive(
                    order - 1 - j, interval_start, interval_end, tau
                )
                if self.predict_x0
                else lagrange_coefficient[i][j]
                * self.get_coefficients_exponential_negative(
                    order - 1 - j, interval_start, interval_end
                )
                for j in range(order)
            )
            coefficients.append(coefficient)
        assert len(coefficients) == order, 'the length of coefficients does not match the order'
        return coefficients

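    # --- Illustrative helper, not part of the original file: the rows returned by
    # `lagrange_polynomial_coefficient` are the Lagrange basis polynomials
    # (coefficients ordered from highest degree to constant), so row i must
    # evaluate to 1 at lambda_list[i] and 0 at every other node.
    def _demo_check_lagrange_basis(self):
        lambda_list = [torch.tensor(v) for v in (0.0, 1.0, 3.0)]
        coeffs = self.lagrange_polynomial_coefficient(2, lambda_list)
        for i in range(3):
            for j in range(3):
                value = sum(c * lambda_list[j] ** (2 - k) for k, c in enumerate(coeffs[i]))
                expected = 1.0 if i == j else 0.0
                assert abs(float(value) - expected) < 1e-6
        return True
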
    def stochastic_adams_bashforth_update(
        self,
        model_output: torch.FloatTensor,
        prev_timestep: int,
        sample: torch.FloatTensor,
        noise: torch.FloatTensor,
        order: int,
        tau: torch.FloatTensor,
    ) -> torch.FloatTensor:
        """
        One step for the SA-Predictor.

        Args:
            model_output (`torch.FloatTensor`):
                The direct output from the learned diffusion model at the current timestep.
            prev_timestep (`int`):
                The previous discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            noise (`torch.FloatTensor`):
                The Gaussian noise injected by the stochastic predictor step.
            order (`int`):
                The order of SA-Predictor at this timestep.
            tau (`torch.FloatTensor`):
                The stochasticity parameter at this timestep (0 gives a deterministic step).

        Returns:
            `torch.FloatTensor`:
                The sample tensor at the previous timestep.
        """

        assert noise is not None
        timestep_list = self.timestep_list
        model_output_list = self.model_outputs
        s0, t = self.timestep_list[-1], prev_timestep
        lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0]
        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
        gradient_part = torch.zeros_like(sample)
        h = lambda_t - lambda_s0
        lambda_list = [self.lambda_t[timestep_list[-(i + 1)]] for i in range(order)]
        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)

        x = sample

        if self.predict_x0 and order == 2:
            gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                    (1 + tau ** 2) ** 2)) / (self.lambda_t[timestep_list[-1]] - self.lambda_t[
                timestep_list[-2]])
            gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                    (1 + tau ** 2) ** 2)) / (self.lambda_t[timestep_list[-1]] - self.lambda_t[
                timestep_list[-2]])

        for i in range(order):
            if self.predict_x0:
                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
                    i] * model_output_list[-(i + 1)]
            else:
                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_output_list[-(i + 1)]

        if self.predict_x0:
            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
        else:
            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise

        if self.predict_x0:
            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
        else:
            x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part

        x_t = x_t.to(x.dtype)
        return x_t

    def stochastic_adams_moulton_update(
        self,
        this_model_output: torch.FloatTensor,
        this_timestep: int,
        last_sample: torch.FloatTensor,
        last_noise: torch.FloatTensor,
        this_sample: torch.FloatTensor,
        order: int,
        tau: torch.FloatTensor,
    ) -> torch.FloatTensor:
        """
        One step for the SA-Corrector.

        Args:
            this_model_output (`torch.FloatTensor`):
                The model outputs at `x_t`.
            this_timestep (`int`):
                The current timestep `t`.
            last_sample (`torch.FloatTensor`):
                The generated sample before the last predictor `x_{t-1}`.
            last_noise (`torch.FloatTensor`):
                The noise used by the last predictor step.
            this_sample (`torch.FloatTensor`):
                The generated sample after the last predictor `x_{t}`.
            order (`int`):
                The order of SA-Corrector at this step.
            tau (`torch.FloatTensor`):
                The stochasticity parameter at this timestep (0 gives a deterministic step).

        Returns:
            `torch.FloatTensor`:
                The corrected sample tensor at the current timestep.
        """

        assert last_noise is not None
        timestep_list = self.timestep_list
        model_output_list = self.model_outputs
        s0, t = self.timestep_list[-1], this_timestep
        lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0]
        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
        gradient_part = torch.zeros_like(this_sample)
        h = lambda_t - lambda_s0
        t_list = timestep_list + [this_timestep]
        lambda_list = [self.lambda_t[t_list[-(i + 1)]] for i in range(order)]
        model_prev_list = model_output_list + [this_model_output]

        gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau)

        x = last_sample

        if self.predict_x0 and order == 2:
            gradient_coefficients[0] += 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                    (1 + tau ** 2) ** 2 * h))
            gradient_coefficients[1] -= 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                    (1 + tau ** 2) ** 2 * h))

        for i in range(order):
            if self.predict_x0:
                gradient_part += (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t) * gradient_coefficients[
                    i] * model_prev_list[-(i + 1)]
            else:
                gradient_part += -(1 + tau ** 2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)]

        if self.predict_x0:
            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * last_noise
        else:
            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * last_noise

        if self.predict_x0:
            x_t = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part
        else:
            x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part

        x_t = x_t.to(x.dtype)
        return x_t

    def step(
        self,
        model_output: torch.FloatTensor,
        timestep: int,
        sample: torch.FloatTensor,
        generator=None,
        return_dict: bool = True,
    ) -> Union[SchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
        the SA-Solver.

        Args:
            model_output (`torch.FloatTensor`):
                The direct output from the learned diffusion model.
            timestep (`int`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            generator (`torch.Generator`, *optional*):
                A random number generator.
            return_dict (`bool`):
                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.

        Returns:
            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
                tuple is returned where the first element is the sample tensor.

        """
        if self.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        if isinstance(timestep, torch.Tensor):
            timestep = timestep.to(self.timesteps.device)
        step_index = (self.timesteps == timestep).nonzero()
        if len(step_index) == 0:
            step_index = len(self.timesteps) - 1
        else:
            step_index = step_index.item()

        use_corrector = step_index > 0 and self.last_sample is not None

        model_output_convert = self.convert_model_output(model_output, timestep, sample)

        if use_corrector:
            current_tau = self.tau_func(self.timestep_list[-1])
            sample = self.stochastic_adams_moulton_update(
                this_model_output=model_output_convert,
                this_timestep=timestep,
                last_sample=self.last_sample,
                last_noise=self.last_noise,
                this_sample=sample,
                order=self.this_corrector_order,
                tau=current_tau,
            )

        prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]

        for i in range(max(self.config.predictor_order, self.config.corrector_order - 1) - 1):
            self.model_outputs[i] = self.model_outputs[i + 1]
            self.timestep_list[i] = self.timestep_list[i + 1]

        self.model_outputs[-1] = model_output_convert
        self.timestep_list[-1] = timestep

        noise = randn_tensor(
            model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
        )

        if self.config.lower_order_final:
            this_predictor_order = min(self.config.predictor_order, len(self.timesteps) - step_index)
            this_corrector_order = min(self.config.corrector_order, len(self.timesteps) - step_index + 1)
        else:
            this_predictor_order = self.config.predictor_order
            this_corrector_order = self.config.corrector_order

        self.this_predictor_order = min(this_predictor_order, self.lower_order_nums + 1)  # warmup for multistep
        self.this_corrector_order = min(this_corrector_order, self.lower_order_nums + 2)  # warmup for multistep
        assert self.this_predictor_order > 0
        assert self.this_corrector_order > 0

        self.last_sample = sample
        self.last_noise = noise

        current_tau = self.tau_func(self.timestep_list[-1])
        prev_sample = self.stochastic_adams_bashforth_update(
            model_output=model_output_convert,
            prev_timestep=prev_timestep,
            sample=sample,
            noise=noise,
            order=self.this_predictor_order,
            tau=current_tau,
        )

        if self.lower_order_nums < max(self.config.predictor_order, self.config.corrector_order - 1):
            self.lower_order_nums += 1

        if not return_dict:
            return (prev_sample,)

        return SchedulerOutput(prev_sample=prev_sample)

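    # --- Illustrative usage sketch, not part of the original file. `unet` and the
    # latent shape are assumptions; the loop mirrors the standard diffusers
    # predict-then-step pattern:
    #
    #     scheduler = SASolverScheduler()
    #     scheduler.set_timesteps(num_inference_steps=25, device="cuda")
    #     latents = torch.randn(1, 4, 64, 64, device="cuda") * scheduler.init_noise_sigma
    #     for t in scheduler.timesteps:
    #         model_output = unet(latents, t)  # epsilon prediction
    #         latents = scheduler.step(model_output, t, latents).prev_sample
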
    def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
        """
        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
        current timestep.

        Args:
            sample (`torch.FloatTensor`):
                The input sample.

        Returns:
            `torch.FloatTensor`:
                A scaled input sample.
        """
        return sample

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
    def add_noise(
        self,
        original_samples: torch.FloatTensor,
        noise: torch.FloatTensor,
        timesteps: torch.IntTensor,
    ) -> torch.FloatTensor:
        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
        timesteps = timesteps.to(original_samples.device)

        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)

        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

        return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise

    def __len__(self):
        return self.config.num_train_timesteps
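
# --- Illustrative sketch, not part of the original file: `add_noise` implements
# the forward process x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise.
# The helper name `_demo_add_noise` is ours.
def _demo_add_noise():
    scheduler = SASolverScheduler()
    x0 = torch.randn(2, 4, 8, 8)
    noise = torch.randn_like(x0)
    timesteps = torch.tensor([10, 900])  # small t stays close to x0, large t close to noise
    noisy = scheduler.add_noise(x0, noise, timesteps)
    assert noisy.shape == x0.shape
    return noisy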
DiT_VAE/diffusion/utils/__init__.py
ADDED
@@ -0,0 +1 @@
# from .logger import get_root_logger
DiT_VAE/diffusion/utils/checkpoint.py
ADDED
@@ -0,0 +1,80 @@
import os
import re
import torch

from DiT_VAE.diffusion.utils.logger import get_root_logger


def save_checkpoint(work_dir,
                    epoch,
                    model,
                    model_ema=None,
                    optimizer=None,
                    lr_scheduler=None,
                    keep_last=False,
                    step=None,
                    ):
    os.makedirs(work_dir, exist_ok=True)
    state_dict = dict(state_dict=model.state_dict())
    if model_ema is not None:
        state_dict['state_dict_ema'] = model_ema.state_dict()
    if optimizer is not None:
        state_dict['optimizer'] = optimizer.state_dict()
    if lr_scheduler is not None:
        state_dict['scheduler'] = lr_scheduler.state_dict()
    if epoch is not None:
        state_dict['epoch'] = epoch
        file_path = os.path.join(work_dir, f"epoch_{epoch}.pth")
        if step is not None:
            file_path = file_path.split('.pth')[0] + f"_step_{step}.pth"
    logger = get_root_logger()
    torch.save(state_dict, file_path)
    logger.info(f'Saved checkpoint of epoch {epoch} to {file_path}.')
    if keep_last:
        # keep only the latest checkpoint: remove the files saved for earlier epochs
        for i in range(epoch):
            previous_ckpt = os.path.join(work_dir, f"epoch_{i}.pth")
            if os.path.exists(previous_ckpt):
                os.remove(previous_ckpt)


def load_checkpoint(checkpoint,
                    model,
                    model_ema=None,
                    optimizer=None,
                    lr_scheduler=None,
                    load_ema=False,
                    resume_optimizer=True,
                    resume_lr_scheduler=True
                    ):
    assert isinstance(checkpoint, str)
    ckpt_file = checkpoint
    checkpoint = torch.load(ckpt_file, map_location="cpu")

    state_dict_keys = ['pos_embed', 'base_model.pos_embed', 'model.pos_embed']
    for key in state_dict_keys:
        if key in checkpoint['state_dict']:
            del checkpoint['state_dict'][key]
            if 'state_dict_ema' in checkpoint and key in checkpoint['state_dict_ema']:
                del checkpoint['state_dict_ema'][key]
            break

    if load_ema:
        state_dict = checkpoint['state_dict_ema']
    else:
        state_dict = checkpoint.get('state_dict', checkpoint)  # to be compatible with the official checkpoint
    # model.load_state_dict(state_dict)
    missing, unexpect = model.load_state_dict(state_dict, strict=False)
    if model_ema is not None:
        model_ema.load_state_dict(checkpoint['state_dict_ema'], strict=False)
    if optimizer is not None and resume_optimizer:
        optimizer.load_state_dict(checkpoint['optimizer'])
    if lr_scheduler is not None and resume_lr_scheduler:
        lr_scheduler.load_state_dict(checkpoint['scheduler'])
    logger = get_root_logger()
    if optimizer is not None:
        # fall back to parsing the epoch number out of the file name if it is not stored in the checkpoint
        epoch = checkpoint.get('epoch', re.match(r'.*epoch_(\d*).*.pth', ckpt_file).group(1))
        logger.info(f'Resume checkpoint of epoch {epoch} from {ckpt_file}. Load ema: {load_ema}, '
                    f'resume optimizer: {resume_optimizer}, resume lr scheduler: {resume_lr_scheduler}.')
        return epoch, missing, unexpect
    logger.info(f'Load checkpoint from {ckpt_file}. Load ema: {load_ema}.')
    return missing, unexpect
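
# --- Illustrative usage sketch, not part of the original file. `model`,
# `optimizer`, `lr_scheduler`, and the paths below are assumptions:
#
#     save_checkpoint("work_dirs/run1", epoch=3, model=model,
#                     optimizer=optimizer, lr_scheduler=lr_scheduler, step=1200)
#     # -> work_dirs/run1/epoch_3_step_1200.pth
#     epoch, missing, unexpected = load_checkpoint(
#         "work_dirs/run1/epoch_3_step_1200.pth", model,
#         optimizer=optimizer, lr_scheduler=lr_scheduler)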
DiT_VAE/diffusion/utils/data_sampler.py
ADDED
@@ -0,0 +1,138 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import Sequence
from torch.utils.data import BatchSampler, Sampler, Dataset
from random import shuffle, choice
from copy import deepcopy
from DiT_VAE.diffusion.utils.logger import get_root_logger


class AspectRatioBatchSampler(BatchSampler):
    """A sampler wrapper for grouping images with similar aspect ratio into a same batch.

    Args:
        sampler (Sampler): Base sampler.
        dataset (Dataset): Dataset providing data information.
        batch_size (int): Size of mini-batch.
        drop_last (bool): If ``True``, the sampler will drop the last batch if
            its size would be less than ``batch_size``.
        aspect_ratios (dict): The predefined aspect ratios.
    """

    def __init__(self,
                 sampler: Sampler,
                 dataset: Dataset,
                 batch_size: int,
                 aspect_ratios: dict,
                 drop_last: bool = False,
                 config=None,
                 valid_num=0,  # take as valid aspect-ratio when sample number >= valid_num
                 **kwargs) -> None:
        if not isinstance(sampler, Sampler):
            raise TypeError('sampler should be an instance of ``Sampler``, '
                            f'but got {sampler}')
        if not isinstance(batch_size, int) or batch_size <= 0:
            raise ValueError('batch_size should be a positive integer value, '
                             f'but got batch_size={batch_size}')
        self.sampler = sampler
        self.dataset = dataset
        self.batch_size = batch_size
        self.aspect_ratios = aspect_ratios
        self.drop_last = drop_last
        self.ratio_nums_gt = kwargs.get('ratio_nums', None)
        self.config = config
        assert self.ratio_nums_gt
        # buckets for each aspect ratio
        self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios}
        self.current_available_bucket_keys = [str(k) for k, v in self.ratio_nums_gt.items() if v >= valid_num]
        logger = get_root_logger() if config is None else get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
        logger.warning(f"Using valid_num={valid_num} in config file. Available {len(self.current_available_bucket_keys)} aspect_ratios: {self.current_available_bucket_keys}")

    def __iter__(self) -> Sequence[int]:
        for idx in self.sampler:
            data_info = self.dataset.get_data_info(idx)
            height, width = data_info['height'], data_info['width']
            ratio = height / width
            # find the closest aspect ratio
            closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
            if closest_ratio not in self.current_available_bucket_keys:
                continue
            bucket = self._aspect_ratio_buckets[closest_ratio]
            bucket.append(idx)
            # yield a batch of indices in the same aspect ratio group
            if len(bucket) == self.batch_size:
                yield bucket[:]
                del bucket[:]

        # yield the rest data and reset the buckets
        for bucket in self._aspect_ratio_buckets.values():
            while len(bucket) > 0:
                if len(bucket) <= self.batch_size:
                    if not self.drop_last:
                        yield bucket[:]
                    bucket = []
                else:
                    yield bucket[:self.batch_size]
                    bucket = bucket[self.batch_size:]


class BalancedAspectRatioBatchSampler(AspectRatioBatchSampler):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Assign samples to each bucket
        self.ratio_nums_gt = kwargs.get('ratio_nums', None)
        assert self.ratio_nums_gt
        self._aspect_ratio_buckets = {float(ratio): [] for ratio in self.aspect_ratios.keys()}
        self.original_buckets = {}
        self.current_available_bucket_keys = [k for k, v in self.ratio_nums_gt.items() if v >= 3000]
        self.all_available_keys = deepcopy(self.current_available_bucket_keys)
        self.exhausted_bucket_keys = []
        self.total_batches = len(self.sampler) // self.batch_size
        self._aspect_ratio_count = {}
        for k in self.all_available_keys:
            self._aspect_ratio_count[float(k)] = 0
            self.original_buckets[float(k)] = []
        logger = get_root_logger(os.path.join(self.config.work_dir, 'train_log.log'))
        logger.warning(f"Available {len(self.current_available_bucket_keys)} aspect_ratios: {self.current_available_bucket_keys}")

    def __iter__(self) -> Sequence[int]:
        i = 0
        for idx in self.sampler:
            data_info = self.dataset.get_data_info(idx)
            height, width = data_info['height'], data_info['width']
            ratio = height / width
            closest_ratio = float(min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio)))
            if closest_ratio not in self.all_available_keys:
                continue
            if self._aspect_ratio_count[closest_ratio] < self.ratio_nums_gt[closest_ratio]:
                self._aspect_ratio_count[closest_ratio] += 1
                self._aspect_ratio_buckets[closest_ratio].append(idx)
                self.original_buckets[closest_ratio].append(idx)  # Save the original samples for each bucket
            if not self.current_available_bucket_keys:
                self.current_available_bucket_keys, self.exhausted_bucket_keys = self.exhausted_bucket_keys, []

            if closest_ratio not in self.current_available_bucket_keys:
                continue
            key = closest_ratio
            bucket = self._aspect_ratio_buckets[key]
            if len(bucket) == self.batch_size:
                yield bucket[:self.batch_size]
                del bucket[:self.batch_size]
                i += 1
                self.exhausted_bucket_keys.append(key)
                self.current_available_bucket_keys.remove(key)

        for _ in range(self.total_batches - i):
            key = choice(self.all_available_keys)
            bucket = self._aspect_ratio_buckets[key]
            if len(bucket) >= self.batch_size:
                yield bucket[:self.batch_size]
                del bucket[:self.batch_size]

                # If a bucket is exhausted, refill it from the saved original samples
                if not bucket:
                    self._aspect_ratio_buckets[key] = deepcopy(self.original_buckets[key][:])
                    shuffle(self._aspect_ratio_buckets[key])
            else:
                self._aspect_ratio_buckets[key] = deepcopy(self.original_buckets[key][:])
                shuffle(self._aspect_ratio_buckets[key])
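
# --- Illustrative usage sketch, not part of the original file. The dataset is
# assumed to expose `get_data_info(idx) -> {'height': ..., 'width': ...}` as used
# in `__iter__` above; the `aspect_ratios` mapping below is a made-up example.
#
#     from torch.utils.data import DataLoader, RandomSampler
#
#     batch_sampler = AspectRatioBatchSampler(
#         sampler=RandomSampler(dataset),
#         dataset=dataset,
#         batch_size=16,
#         aspect_ratios={'0.5': (384, 768), '1.0': (512, 512), '2.0': (768, 384)},
#         ratio_nums={'0.5': 1200, '1.0': 90000, '2.0': 800},
#         valid_num=1000,  # buckets with fewer samples than this are skipped
#     )
#     loader = DataLoader(dataset, batch_sampler=batch_sampler, num_workers=4)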
DiT_VAE/diffusion/utils/dist_utils.py
ADDED
@@ -0,0 +1,303 @@
"""
This file contains primitives for multi-gpu communication.
This is useful when doing distributed training.
"""
import os
import pickle
import shutil

import gc
import mmcv
import torch
import torch.distributed as dist
from mmcv.runner import get_dist_info


def is_distributed():
    return get_world_size() > 1


def get_world_size():
    if not dist.is_available():
        return 1
    return dist.get_world_size() if dist.is_initialized() else 1


def get_rank():
    if not dist.is_available():
        return 0
    return dist.get_rank() if dist.is_initialized() else 0


def get_local_rank():
    if not dist.is_available():
        return 0
    return int(os.getenv('LOCAL_RANK', 0)) if dist.is_initialized() else 0


def is_master():
    return get_rank() == 0


def is_local_master():
    return get_local_rank() == 0


def get_local_proc_group(group_size=8):
    world_size = get_world_size()
    if world_size <= group_size or group_size == 1:
        return None
    assert world_size % group_size == 0, f'world size ({world_size}) should be evenly divided by group size ({group_size}).'
    process_groups = getattr(get_local_proc_group, 'process_groups', {})
    if group_size not in process_groups:
        num_groups = dist.get_world_size() // group_size
        groups = [list(range(i * group_size, (i + 1) * group_size)) for i in range(num_groups)]
        process_groups.update({group_size: [torch.distributed.new_group(group) for group in groups]})
        get_local_proc_group.process_groups = process_groups

    group_idx = get_rank() // group_size
    return get_local_proc_group.process_groups.get(group_size)[group_idx]


def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()


def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    to_device = torch.device("cuda")
    # to_device = torch.device("cpu")

    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(to_device)

    # obtain Tensor size of each rank
    local_size = torch.LongTensor([tensor.numel()]).to(to_device)
    size_list = [torch.LongTensor([0]).to(to_device) for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    tensor_list = [
        torch.ByteTensor(size=(max_size,)).to(to_device) for _ in size_list
    ]
    if local_size != max_size:
        padding = torch.ByteTensor(size=(max_size - local_size,)).to(to_device)
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list


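# --- Illustrative sketch, not part of the original file: `all_gather` above works
# on arbitrary picklable objects by pickling to byte tensors and padding them to a
# common length before the collective call. Under e.g. `torchrun --nproc_per_node=2`:
#
#     stats = {"rank": get_rank(), "loss": 0.123}
#     gathered = all_gather(stats)  # one dict per rank, returned on every rank

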
def reduce_dict(input_dict, average=True):
|
121 |
+
"""
|
122 |
+
Args:
|
123 |
+
+        input_dict (dict): all the values will be reduced
+        average (bool): whether to do average or sum
+    Reduce the values in the dictionary from all processes so that process with rank
+    0 has the averaged results. Returns a dict with the same fields as
+    input_dict, after reduction.
+    """
+    world_size = get_world_size()
+    if world_size < 2:
+        return input_dict
+    with torch.no_grad():
+        reduced_dict = _extracted_from_reduce_dict_14(input_dict, average, world_size)
+    return reduced_dict
+
+
+# TODO Rename this here and in `reduce_dict`
+def _extracted_from_reduce_dict_14(input_dict, average, world_size):
+    names = []
+    values = []
+    # sort the keys so that they are consistent across processes
+    for k in sorted(input_dict.keys()):
+        names.append(k)
+        values.append(input_dict[k])
+    values = torch.stack(values, dim=0)
+    dist.reduce(values, dst=0)
+    if dist.get_rank() == 0 and average:
+        # only main process gets accumulated, so only divide by
+        # world_size in this case
+        values /= world_size
+    return dict(zip(names, values))
+
+
+def broadcast(data, **kwargs):
+    if get_world_size() == 1:
+        return data
+    data = [data]
+    dist.broadcast_object_list(data, **kwargs)
+    return data[0]
+
+
+def all_gather_cpu(result_part, tmpdir=None, collect_by_master=True):
+    rank, world_size = get_dist_info()
+    if tmpdir is None:
+        tmpdir = './tmp'
+    if rank == 0:
+        mmcv.mkdir_or_exist(tmpdir)
+    synchronize()
+    # dump the part result to the dir
+    mmcv.dump(result_part, os.path.join(tmpdir, f'part_{rank}.pkl'))
+    synchronize()
+    if collect_by_master and rank != 0:
+        return None
+    # load results of all parts from the tmp dir
+    results = []
+    for i in range(world_size):
+        part_file = os.path.join(tmpdir, f'part_{i}.pkl')
+        results.append(mmcv.load(part_file))
+    if not collect_by_master:
+        synchronize()
+    # remove tmp dir
+    if rank == 0:
+        shutil.rmtree(tmpdir)
+    return results
+
+
+def all_gather_tensor(tensor, group_size=None, group=None):
+    if group_size is None:
+        group_size = get_world_size()
+    if group_size == 1:
+        output = [tensor]
+    else:
+        output = [torch.zeros_like(tensor) for _ in range(group_size)]
+        dist.all_gather(output, tensor, group=group)
+    return output
+
+
+def gather_difflen_tensor(feat, num_samples_list, concat=True, group=None, group_size=None):
+    world_size = get_world_size()
+    if world_size == 1:
+        return feat if concat else [feat]
+    num_samples, *feat_dim = feat.size()
+    # pad every rank's tensor to the max number of samples
+    feat_padding = feat.new_zeros((max(num_samples_list), *feat_dim))
+    feat_padding[:num_samples] = feat
+    # gather
+    feat_gather = all_gather_tensor(feat_padding, group=group, group_size=group_size)
+    for r, num in enumerate(num_samples_list):
+        feat_gather[r] = feat_gather[r][:num]
+    if concat:
+        feat_gather = torch.cat(feat_gather)
+    return feat_gather
+
+
+class GatherLayer(torch.autograd.Function):
+    """Gather tensors from all processes, supporting backward propagation."""
+
+    @staticmethod
+    def forward(ctx, input):
+        ctx.save_for_backward(input)
+        num_samples = torch.tensor(input.size(0), dtype=torch.long, device=input.device)
+        ctx.num_samples_list = all_gather_tensor(num_samples)
+        output = gather_difflen_tensor(input, ctx.num_samples_list, concat=False)
+        return tuple(output)
+
+    @staticmethod
+    def backward(ctx, *grads):  # grads of tuple(output)
+        input, = ctx.saved_tensors
+        num_samples_list = ctx.num_samples_list
+        rank = get_rank()
+        start, end = sum(num_samples_list[:rank]), sum(num_samples_list[:rank + 1])
+        grads = torch.cat(grads)
+        if is_distributed():
+            dist.all_reduce(grads)
+        grad_out = torch.zeros_like(input)
+        grad_out[:] = grads[start:end]
+        return grad_out, None, None
+
+
+class GatherLayerWithGroup(torch.autograd.Function):
+    """Gather tensors from all processes in a group, supporting backward propagation."""
+
+    @staticmethod
+    def forward(ctx, input, group, group_size):
+        ctx.save_for_backward(input)
+        ctx.group_size = group_size
+        output = all_gather_tensor(input, group=group, group_size=group_size)
+        return tuple(output)
+
+    @staticmethod
+    def backward(ctx, *grads):  # grads of tuple(output)
+        input, = ctx.saved_tensors
+        grads = torch.stack(grads)
+        if is_distributed():
+            dist.all_reduce(grads)
+        grad_out = torch.zeros_like(input)
+        grad_out[:] = grads[get_rank() % ctx.group_size]
+        return grad_out, None, None
+
+
+def gather_layer_with_group(data, group=None, group_size=None):
+    if group_size is None:
+        group_size = get_world_size()
+    # dispatch to the group-aware autograd Function; GatherLayer.apply only
+    # accepts the input tensor
+    return GatherLayerWithGroup.apply(data, group, group_size)
+
+
+from typing import Union
+import math
+# from torch.distributed.fsdp.fully_sharded_data_parallel import TrainingState_, _calc_grad_norm
+
+
+@torch.no_grad()
+def clip_grad_norm_(
+        self, max_norm: Union[float, int], norm_type: Union[float, int] = 2.0
+) -> torch.Tensor:
+    self._lazy_init()
+    self._wait_for_previous_optim_step()
+    assert self._is_root, "clip_grad_norm should only be called on the root (parent) instance"
+    self._assert_state(TrainingState_.IDLE)
+
+    max_norm = float(max_norm)
+    norm_type = float(norm_type)
+    # compute the max norm for this shard's gradients and sync across workers
+    local_norm = _calc_grad_norm(self.params_with_grad, norm_type).cuda()  # type: ignore[arg-type]
+    if norm_type == math.inf:
+        total_norm = local_norm
+        dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group)
+    else:
+        total_norm = local_norm ** norm_type
+        dist.all_reduce(total_norm, group=self.process_group)
+        total_norm = total_norm ** (1.0 / norm_type)
+
+    clip_coef = torch.tensor(max_norm, dtype=total_norm.dtype, device=total_norm.device) / (total_norm + 1e-6)
+    if clip_coef < 1:
+        # multiply by clip_coef, i.e. (max_norm / total_norm)
+        for p in self.params_with_grad:
+            assert p.grad is not None
+            p.grad.detach().mul_(clip_coef.to(p.grad.device))
+    return total_norm
+
+
+def flush():
+    gc.collect()
+    torch.cuda.empty_cache()
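A minimal usage sketch for the differentiable gather helpers above (assuming the import path `DiT_VAE.diffusion.utils.dist_utils` and a process group launched with `torchrun`; the squared-sum loss is purely illustrative):

```python
# Sketch: gather variable-length per-rank features while keeping gradients
# flowing, e.g. before a global contrastive loss.
# Launch with: torchrun --nproc_per_node=2 demo.py
import torch
import torch.distributed as dist

from DiT_VAE.diffusion.utils.dist_utils import GatherLayer  # assumed path


def gather_with_grad(local_feat):
    # GatherLayer returns one tensor per rank; backward all-reduces the
    # incoming grads and slices out this rank's rows.
    return torch.cat(GatherLayer.apply(local_feat), dim=0)


if __name__ == "__main__":
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank)
    feat = torch.randn(rank + 1, 16, device="cuda", requires_grad=True)  # ragged sizes
    all_feat = gather_with_grad(feat)
    all_feat.pow(2).sum().backward()
    print(rank, tuple(all_feat.shape), tuple(feat.grad.shape))
```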
DiT_VAE/diffusion/utils/logger.py
ADDED
@@ -0,0 +1,94 @@
+import logging
+import os
+import torch.distributed as dist
+from datetime import datetime
+from .dist_utils import is_local_master
+from mmcv.utils.logging import logger_initialized
+
+
+def get_root_logger(log_file=None, log_level=logging.INFO, name='PixArt'):
+    """Get root logger.
+
+    Args:
+        log_file (str, optional): File path of log. Defaults to None.
+        log_level (int, optional): The level of logger.
+            Defaults to logging.INFO.
+        name (str): logger name
+    Returns:
+        :obj:`logging.Logger`: The obtained logger
+    """
+    if log_file is None:
+        log_file = '/dev/null'
+    return get_logger(name=name, log_file=log_file, log_level=log_level)
+
+
+def get_logger(name, log_file=None, log_level=logging.INFO):
+    """Initialize and get a logger by name.
+
+    If the logger has not been initialized, this method will initialize the
+    logger by adding one or two handlers, otherwise the initialized logger will
+    be directly returned. During initialization, a StreamHandler will always be
+    added. If `log_file` is specified and the process rank is 0, a FileHandler
+    will also be added.
+
+    Args:
+        name (str): Logger name.
+        log_file (str | None): The log filename. If specified, a FileHandler
+            will be added to the logger.
+        log_level (int): The logger level. Note that only the process of
+            rank 0 is affected, and other processes will set the level to
+            "Error" thus be silent most of the time.
+
+    Returns:
+        logging.Logger: The expected logger.
+    """
+    logger = logging.getLogger(name)
+    logger.propagate = False  # disable root logger to avoid duplicate logging
+
+    if name in logger_initialized:
+        return logger
+    # handle hierarchical names
+    # e.g., logger "a" is initialized, then logger "a.b" will skip the
+    # initialization since it is a child of "a".
+    for logger_name in logger_initialized:
+        if name.startswith(logger_name):
+            return logger
+
+    stream_handler = logging.StreamHandler()
+    handlers = [stream_handler]
+
+    rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
+    # only rank 0 will add a FileHandler
+    if rank == 0 and log_file is not None:
+        file_handler = logging.FileHandler(log_file, 'w')
+        handlers.append(file_handler)
+
+    formatter = logging.Formatter(
+        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    for handler in handlers:
+        handler.setFormatter(formatter)
+        handler.setLevel(log_level)
+        logger.addHandler(handler)
+
+    # only rank 0 of each node will print logs
+    log_level = log_level if is_local_master() else logging.ERROR
+    logger.setLevel(log_level)
+
+    logger_initialized[name] = True
+
+    return logger
+
+
+def rename_file_with_creation_time(file_path):
+    # get the file's creation time
+    creation_time = os.path.getctime(file_path)
+    creation_time_str = datetime.fromtimestamp(creation_time).strftime('%Y-%m-%d_%H-%M-%S')
+
+    # build the new file name
+    dir_name, file_name = os.path.split(file_path)
+    name, ext = os.path.splitext(file_name)
+    new_file_name = f"{name}_{creation_time_str}{ext}"
+    new_file_path = os.path.join(dir_name, new_file_name)
+
+    # rename the file
+    os.rename(file_path, new_file_path)
+    print(f"File renamed to: {new_file_path}")
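A quick usage sketch for the logger above (the log path is illustrative):

```python
# Sketch: rank 0 writes to the file, every local master prints to stdout.
from DiT_VAE.diffusion.utils.logger import get_root_logger  # assumed path

logger = get_root_logger(log_file="work_dirs/train.log")  # illustrative path
logger.info("visible on local masters; also written to the file on rank 0")
logger = get_root_logger()  # same name -> returns the cached, initialized logger
```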
DiT_VAE/diffusion/utils/lr_scheduler.py
ADDED
@@ -0,0 +1,89 @@
+from diffusers import get_cosine_schedule_with_warmup, get_constant_schedule_with_warmup
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR
+import math
+
+from DiT_VAE.diffusion.utils.logger import get_root_logger
+
+
+def build_lr_scheduler(config, optimizer, train_dataloader, lr_scale_ratio):
+    if not config.get('lr_schedule_args', None):
+        config.lr_schedule_args = {}
+    if config.get('lr_warmup_steps', None):
+        config['num_warmup_steps'] = config.get('lr_warmup_steps')  # for compatibility with old version
+
+    logger = get_root_logger()
+    logger.info(
+        f'Lr schedule: {config.lr_schedule}, ' + ",".join(
+            [f"{key}:{value}" for key, value in config.lr_schedule_args.items()]) + '.')
+    if config.lr_schedule == 'cosine':
+        lr_scheduler = get_cosine_schedule_with_warmup(
+            optimizer=optimizer,
+            **config.lr_schedule_args,
+            num_training_steps=(len(train_dataloader) * config.num_epochs),
+        )
+    elif config.lr_schedule == 'constant':
+        lr_scheduler = get_constant_schedule_with_warmup(
+            optimizer=optimizer,
+            **config.lr_schedule_args,
+        )
+    elif config.lr_schedule == 'cosine_decay_to_constant':
+        assert lr_scale_ratio >= 1
+        lr_scheduler = get_cosine_decay_to_constant_with_warmup(
+            optimizer=optimizer,
+            **config.lr_schedule_args,
+            final_lr=1 / lr_scale_ratio,
+            num_training_steps=(len(train_dataloader) * config.num_epochs),
+        )
+    else:
+        raise RuntimeError(f'Unrecognized lr schedule {config.lr_schedule}.')
+    return lr_scheduler
+
+
+def get_cosine_decay_to_constant_with_warmup(optimizer: Optimizer,
+                                             num_warmup_steps: int,
+                                             num_training_steps: int,
+                                             final_lr: float = 0.0,
+                                             num_decay: float = 0.667,
+                                             num_cycles: float = 0.5,
+                                             last_epoch: int = -1
+                                             ):
+    """
+    Create a schedule with a cosine annealing lr followed by a constant lr.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The number of total training steps.
+        final_lr (`float`):
+            The final constant lr after cosine decay, as a fraction of the base lr.
+        num_decay (`float`):
+            The fraction of the total training steps spent in the cosine-decay
+            phase; after `num_training_steps * num_decay` steps the lr stays at
+            `final_lr`.
+        num_cycles (`float`):
+            The number of cosine half-cycles within the decay phase.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+
+        num_decay_steps = int(num_training_steps * num_decay)
+        if current_step > num_decay_steps:
+            return final_lr
+
+        progress = float(current_step - num_warmup_steps) / float(max(1, num_decay_steps - num_warmup_steps))
+        return (
+            max(
+                0.0,
+                0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress)),
+            )
+            * (1 - final_lr)
+        ) + final_lr
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
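A small sanity-check sketch of the `cosine_decay_to_constant` curve (toy optimizer; the numbers follow directly from `lr_lambda`):

```python
import torch

from DiT_VAE.diffusion.utils.lr_scheduler import get_cosine_decay_to_constant_with_warmup  # assumed path

opt = torch.optim.AdamW(torch.nn.Linear(4, 4).parameters(), lr=1e-3)
sched = get_cosine_decay_to_constant_with_warmup(
    opt, num_warmup_steps=10, num_training_steps=100, final_lr=0.1)

lrs = []
for _ in range(100):
    opt.step()
    sched.step()
    lrs.append(sched.get_last_lr()[0])
# linear warmup to 1e-3, cosine decay over the first ~2/3 of training
# (num_decay=0.667), then constant at final_lr * base lr = 1e-4
print(lrs[0], lrs[9], lrs[-1])
```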
DiT_VAE/diffusion/utils/misc.py
ADDED
@@ -0,0 +1,366 @@
+import collections
+import datetime
+import os
+import random
+import time
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv import Config
+from mmcv.runner import get_dist_info
+
+from .logger import get_root_logger
+
+os.environ["MOX_SILENT_MODE"] = "1"  # mute moxing log
+
+
+def read_config(file):
+    # solve config loading conflict when multi-processes
+    while True:
+        config = Config.fromfile(file)
+        if len(config) == 0:
+            time.sleep(0.1)
+            continue
+        break
+    return config
+
+
+def init_random_seed(seed=None, device='cuda'):
+    """Initialize random seed.
+
+    If the seed is not set, the seed will be automatically randomized,
+    and then broadcast to all processes to prevent some potential bugs.
+
+    Args:
+        seed (int, Optional): The seed. Default to None.
+        device (str): The device where the seed will be put on.
+            Default to 'cuda'.
+
+    Returns:
+        int: Seed to be used.
+    """
+    if seed is not None:
+        return seed
+
+    # Make sure all ranks share the same random seed to prevent
+    # some potential bugs. Please refer to
+    # https://github.com/open-mmlab/mmdetection/issues/6339
+    rank, world_size = get_dist_info()
+    seed = np.random.randint(2 ** 31)
+    if world_size == 1:
+        return seed
+
+    if rank == 0:
+        random_num = torch.tensor(seed, dtype=torch.int32, device=device)
+    else:
+        random_num = torch.tensor(0, dtype=torch.int32, device=device)
+    dist.broadcast(random_num, src=0)
+    return random_num.item()
+
+
+def set_random_seed(seed, deterministic=False):
+    """Set random seed.
+
+    Args:
+        seed (int): Seed to be used.
+        deterministic (bool): Whether to set the deterministic option for
+            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
+            to True and `torch.backends.cudnn.benchmark` to False.
+            Default: False.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    if deterministic:
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+
+class SimpleTimer:
+    def __init__(self, num_tasks, log_interval=1, desc="Process"):
+        self.num_tasks = num_tasks
+        self.desc = desc
+        self.count = 0
+        self.log_interval = log_interval
+        self.start_time = time.time()
+        self.logger = get_root_logger()
+
+    def log(self):
+        self.count += 1
+        if (self.count % self.log_interval) == 0 or self.count == self.num_tasks:
+            time_elapsed = time.time() - self.start_time
+            avg_time = time_elapsed / self.count
+            eta_sec = avg_time * (self.num_tasks - self.count)
+            eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
+            elapsed_str = str(datetime.timedelta(seconds=int(time_elapsed)))
+            log_info = f"{self.desc} [{self.count}/{self.num_tasks}], elapsed_time:{elapsed_str}," \
+                       f" avg_time: {avg_time}, eta: {eta_str}."
+            self.logger.info(log_info)
+
+
+class DebugUnderflowOverflow:
+    """
+    This debug class helps detect and understand where the model starts getting very large or very small, and more
+    importantly `nan` or `inf` weight and activation elements.
+    There are 2 working modes:
+    1. Underflow/overflow detection (default)
+    2. Specific batch absolute min/max tracing without detection
+    Mode 1: Underflow/overflow detection
+    To activate the underflow/overflow detection, initialize the object with the model:
+    ```python
+    debug_overflow = DebugUnderflowOverflow(model)
+    ```
+    then run the training as normal, and if `nan` or `inf` gets detected in at least one of the weight, input or
+    output elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this
+    event, each frame reporting
+    1. the fully qualified module name plus the class name whose `forward` was run
+    2. the absolute min and max value of all elements for each module weights, and the inputs and output
+    For example, here is the header and the last few frames in a detection report for `google/mt5-small` run in fp16
+    mixed precision:
+    ```
+    Detected inf/nan during batch_number=0
+    Last 21 forward frames:
+    abs min  abs max  metadata
+    [...]
+    encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
+    2.17e-07 4.50e+00 weight
+    1.79e-06 4.65e+00 input[0]
+    2.68e-06 3.70e+01 output
+    encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
+    8.08e-07 2.66e+01 weight
+    1.79e-06 4.65e+00 input[0]
+    1.27e-04 2.37e+02 output
+    encoder.block.2.layer.1.DenseReluDense.wo Linear
+    1.01e-06 6.44e+00 weight
+    0.00e+00 9.74e+03 input[0]
+    3.18e-04 6.27e+04 output
+    encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
+    1.79e-06 4.65e+00 input[0]
+    3.18e-04 6.27e+04 output
+    encoder.block.2.layer.1.dropout Dropout
+    3.18e-04 6.27e+04 input[0]
+    0.00e+00 inf output
+    ```
+    You can see here that `T5DenseGatedGeluDense.forward` resulted in output activations whose absolute max value
+    was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout`, which
+    renormalizes the weights after it zeroed some of the elements, which pushes the absolute max value past
+    64K, and we get an overflow.
+    As you can see, it's the previous frames that we need to look into when the numbers start getting very large for
+    fp16 numbers.
+    The tracking is done in a forward hook, which gets invoked immediately after `forward` has completed.
+    By default the last 21 frames are printed. You can change the default to adjust for your needs. For example:
+    ```python
+    debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
+    ```
+    To validate that you have set up this debugging feature correctly, and you intend to use it in a training that may
+    take hours to complete, first run it with normal tracing enabled for one or a few batches, as explained in the
+    next section.
+    Mode 2. Specific batch absolute min/max tracing without detection
+    The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.
+    Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a
+    given batch, and only do that for batches 1 and 3. Then you instantiate this class as:
+    ```python
+    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
+    ```
+    And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
+    This is helpful if you know that the program starts misbehaving after a certain batch number, so you can
+    fast-forward right to that area.
+    Early stopping:
+    You can also specify the batch number after which to stop the training, with:
+    ```python
+    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
+    ```
+    This feature is mainly useful in the tracing mode, but you can use it for any mode.
+    **Performance**:
+    As this module measures the absolute `min`/`max` of each weight of the model on every forward, it'll slow the
+    training down. Therefore remember to turn it off once the debugging needs have been met.
+    Args:
+        model (`nn.Module`):
+            The model to debug.
+        max_frames_to_save (`int`, *optional*, defaults to 21):
+            How many frames back to record.
+        trace_batch_nums (`List[int]`, *optional*, defaults to `[]`):
+            Which batch numbers to trace (turns detection off).
+        abort_after_batch_num (`int`, *optional*):
+            Whether to abort after a certain batch number has finished.
+    """
+
+    def __init__(self, model, max_frames_to_save=21, trace_batch_nums=None, abort_after_batch_num=None):
+        if trace_batch_nums is None:
+            trace_batch_nums = []
+        self.model = model
+        self.trace_batch_nums = trace_batch_nums
+        self.abort_after_batch_num = abort_after_batch_num
+
+        # keep a LIFO buffer of frames to dump as soon as inf/nan is encountered to give context to the problem emergence
+        self.frames = collections.deque([], max_frames_to_save)
+        self.frame = []
+        self.batch_number = 0
+        self.total_calls = 0
+        self.detected_overflow = False
+        self.prefix = "                 "
+
+        self.analyse_model()
+
+        self.register_forward_hook()
+
+    def save_frame(self, frame=None):
+        if frame is not None:
+            self.expand_frame(frame)
+        self.frames.append("\n".join(self.frame))
+        self.frame = []  # start a new frame
+
+    def expand_frame(self, line):
+        self.frame.append(line)
+
+    def trace_frames(self):
+        print("\n".join(self.frames))
+        self.frames = []
+
+    def reset_saved_frames(self):
+        self.frames = []
+
+    def dump_saved_frames(self):
+        # print a header, then the buffered frames that led up to the event
+        print(f"\nDetected inf/nan during batch_number={self.batch_number}")
+        print(f"Last {len(self.frames)} forward frames:")
+        print(f"{'abs min':8} {'abs max':8} metadata")
+        print("\n".join(self.frames))
+        print("\n\n")
+        self.frames = []
+
+    def analyse_model(self):
+        # extract the fully qualified module names, to be able to report at run time. e.g.:
+        # encoder.block.2.layer.0.SelfAttention.o
+        #
+        # for shared weights only the first shared module name will be registered
+        self.module_names = {m: name for name, m in self.model.named_modules()}
+        # self.longest_module_name = max(len(v) for v in self.module_names.values())
+
+    def analyse_variable(self, var, ctx):
+        if torch.is_tensor(var):
+            self.expand_frame(self.get_abs_min_max(var, ctx))
+            if self.detect_overflow(var, ctx):
+                self.detected_overflow = True
+        elif var is None:
+            self.expand_frame(f"{'None':>17} {ctx}")
+        else:
+            self.expand_frame(f"{'not a tensor':>17} {ctx}")
+
+    def batch_start_frame(self):
+        self.expand_frame(f"\n\n{self.prefix} *** Starting batch number={self.batch_number} ***")
+        self.expand_frame(f"{'abs min':8} {'abs max':8} metadata")
+
+    def batch_end_frame(self):
+        self.expand_frame(f"{self.prefix} *** Finished batch number={self.batch_number - 1} ***\n\n")
+
+    def create_frame(self, module, input, output):
+        self.expand_frame(f"{self.prefix} {self.module_names[module]} {module.__class__.__name__}")
+
+        # params
+        for name, p in module.named_parameters(recurse=False):
+            self.analyse_variable(p, name)
+
+        # inputs
+        if isinstance(input, tuple):
+            for i, x in enumerate(input):
+                self.analyse_variable(x, f"input[{i}]")
+        else:
+            self.analyse_variable(input, "input")
+
+        # outputs
+        if isinstance(output, tuple):
+            for i, x in enumerate(output):
+                # possibly a tuple of tuples
+                if isinstance(x, tuple):
+                    for j, y in enumerate(x):
+                        self.analyse_variable(y, f"output[{i}][{j}]")
+                else:
+                    self.analyse_variable(x, f"output[{i}]")
+        else:
+            self.analyse_variable(output, "output")
+
+        self.save_frame()
+
+    def register_forward_hook(self):
+        self.model.apply(self._register_forward_hook)
+
+    def _register_forward_hook(self, module):
+        module.register_forward_hook(self.forward_hook)
+
+    def forward_hook(self, module, input, output):
+        # - input is a tuple of packed inputs (could be non-Tensors)
+        # - output could be a Tensor or a tuple of Tensors and non-Tensors
+
+        last_frame_of_batch = False
+
+        trace_mode = self.batch_number in self.trace_batch_nums
+        if trace_mode:
+            self.reset_saved_frames()
+
+        if self.total_calls == 0:
+            self.batch_start_frame()
+        self.total_calls += 1
+
+        # count batch numbers - the very first forward hook of the batch will be called when the
+        # batch completes - i.e. it gets called very last - we know this batch has finished
+        if module == self.model:
+            self.batch_number += 1
+            last_frame_of_batch = True
+
+        self.create_frame(module, input, output)
+
+        # if last_frame_of_batch:
+        #     self.batch_end_frame()
+
+        if trace_mode:
+            self.trace_frames()
+
+        if last_frame_of_batch:
+            self.batch_start_frame()
+
+        if self.detected_overflow and not trace_mode:
+            self.dump_saved_frames()
+
+            # now we can abort, as it's pointless to continue running
+            raise ValueError(
+                "DebugUnderflowOverflow: inf/nan detected, aborting as there is no point running further. "
+                "Please scroll up above this traceback to see the activation values prior to this event."
+            )
+
+        # abort after certain batch if requested to do so
+        if self.abort_after_batch_num is not None and self.batch_number > self.abort_after_batch_num:
+            raise ValueError(
+                f"DebugUnderflowOverflow: aborting after {self.batch_number} batches due to `abort_after_batch_num={self.abort_after_batch_num}` arg"
+            )
+
+    @staticmethod
+    def get_abs_min_max(var, ctx):
+        abs_var = var.abs()
+        return f"{abs_var.min():8.2e} {abs_var.max():8.2e} {ctx}"
+
+    @staticmethod
+    def detect_overflow(var, ctx):
+        """
+        Report whether the tensor contains any `nan` or `inf` entries.
+        This is useful for detecting overflows/underflows and is best called right after the function that did some
+        math that modified the tensor in question.
+        This function contains a few other helper features that you can enable and tweak directly if you want to
+        track various other things.
+        Args:
+            var: the tensor variable to check
+            ctx: the message to print as a context
+        Return:
+            `True` if `inf` or `nan` was detected, `False` otherwise
+        """
+        detected = False
+        if torch.isnan(var).any().item():
+            detected = True
+            print(f"{ctx} has nans")
+        if torch.isinf(var).any().item():
+            detected = True
+            print(f"{ctx} has infs")
+        if var.dtype == torch.float32 and torch.ge(var.abs(), 65535).any().item():
+            detected = True
+            print(f"{ctx} has overflow values {var.abs().max().item()}.")
+        return detected
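A minimal sketch of the overflow debugger on a toy model (the module and import path are illustrative):

```python
# Sketch: the hook buffers per-module min/max frames and raises as soon as a
# forward produces inf/nan anywhere in weights, inputs, or outputs.
import torch
import torch.nn as nn

from DiT_VAE.diffusion.utils.misc import DebugUnderflowOverflow  # assumed path

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 8))
debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=5)

x = torch.randn(2, 8)
model(x)  # healthy batch: frames are buffered, nothing is raised
model[2].weight.data.fill_(float("inf"))
model(x)  # weight stats now contain inf -> frames dumped, ValueError raised
```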
DiT_VAE/diffusion/utils/optimizer.py
ADDED
@@ -0,0 +1,237 @@
+import math
+
+from mmcv import Config
+from mmcv.runner import build_optimizer as mm_build_optimizer, OPTIMIZER_BUILDERS, DefaultOptimizerConstructor, \
+    OPTIMIZERS
+from mmcv.utils import _BatchNorm, _InstanceNorm
+from torch.nn import GroupNorm, LayerNorm
+
+from .logger import get_root_logger
+
+from typing import Tuple, Optional, Callable
+
+import torch
+from torch.optim.optimizer import Optimizer
+
+
+def auto_scale_lr(effective_bs, optimizer_cfg, rule='linear', base_batch_size=256):
+    assert rule in ['linear', 'sqrt']
+    logger = get_root_logger()
+    # scale by world size
+    if rule == 'sqrt':
+        scale_ratio = math.sqrt(effective_bs / base_batch_size)
+    elif rule == 'linear':
+        scale_ratio = effective_bs / base_batch_size
+    optimizer_cfg['lr'] *= scale_ratio
+    logger.info(f'Automatically adapt lr to {optimizer_cfg["lr"]:.7f} (using {rule} scaling rule).')
+    return scale_ratio
+
+
+@OPTIMIZER_BUILDERS.register_module()
+class MyOptimizerConstructor(DefaultOptimizerConstructor):
+
+    def add_params(self, params, module, prefix='', is_dcn_module=None):
+        """Add all parameters of module to the params list.
+
+        The parameters of the given module will be added to the list of param
+        groups, with specific rules defined by paramwise_cfg.
+
+        Args:
+            params (list[dict]): A list of param groups, it will be modified
+                in place.
+            module (nn.Module): The module to be added.
+            prefix (str): The prefix of the module.
+        """
+        # get param-wise options
+        custom_keys = self.paramwise_cfg.get('custom_keys', {})
+        # first sort with alphabet order and then sort with reversed len of str
+        # sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True)
+
+        bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.)
+        bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.)
+        norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.)
+        bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False)
+
+        # special rules for norm layers and depth-wise conv layers
+        is_norm = isinstance(module,
+                             (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm))
+
+        for name, param in module.named_parameters(recurse=False):
+            base_lr = self.base_lr
+            if name == 'bias' and not is_norm and not is_dcn_module:
+                base_lr *= bias_lr_mult
+
+            # apply weight decay policies
+            base_wd = self.base_wd
+            # norm decay
+            if is_norm:
+                if self.base_wd is not None:
+                    base_wd *= norm_decay_mult
+            elif name == 'bias' and not is_dcn_module:
+                if self.base_wd is not None:
+                    # TODO: current bias_decay_mult will have affect on DCN
+                    base_wd *= bias_decay_mult
+
+            param_group = {'params': [param]}
+            if not param.requires_grad:
+                param_group['requires_grad'] = False
+                params.append(param_group)
+                continue
+            if bypass_duplicate and self._is_in(param_group, params):
+                logger = get_root_logger()
+                logger.warning(f'{prefix} is duplicate. It is skipped since '
+                               f'bypass_duplicate={bypass_duplicate}')
+                continue
+            # if the parameter matches one of the custom keys, ignore other rules
+            is_custom = False
+            for key in custom_keys:
+                scope, key_name = key if isinstance(key, tuple) else (None, key)
+                if scope is not None and scope not in f'{prefix}':
+                    continue
+                if key_name in f'{prefix}.{name}':
+                    is_custom = True
+                    if 'lr_mult' in custom_keys[key]:
+                        # if 'base_classes' in f'{prefix}.{name}' or 'attn_base' in f'{prefix}.{name}':
+                        #     param_group['lr'] = self.base_lr
+                        # else:
+                        param_group['lr'] = self.base_lr * custom_keys[key]['lr_mult']
+                    elif 'lr' not in param_group:
+                        param_group['lr'] = base_lr
+                    if self.base_wd is not None:
+                        if 'decay_mult' in custom_keys[key]:
+                            param_group['weight_decay'] = self.base_wd * custom_keys[key]['decay_mult']
+                        elif 'weight_decay' not in param_group:
+                            param_group['weight_decay'] = base_wd
+
+            if not is_custom:
+                # bias_lr_mult affects all bias parameters
+                # except for norm.bias dcn.conv_offset.bias
+                if base_lr != self.base_lr:
+                    param_group['lr'] = base_lr
+                if base_wd != self.base_wd:
+                    param_group['weight_decay'] = base_wd
+            params.append(param_group)
+
+        for child_name, child_mod in module.named_children():
+            child_prefix = f'{prefix}.{child_name}' if prefix else child_name
+            self.add_params(
+                params,
+                child_mod,
+                prefix=child_prefix,
+                is_dcn_module=is_dcn_module)
+
+
+def build_optimizer(model, optimizer_cfg):
+    # default parameter-wise config
+    logger = get_root_logger()
+
+    if hasattr(model, 'module'):
+        model = model.module
+    # set optimizer constructor
+    optimizer_cfg.setdefault('constructor', 'MyOptimizerConstructor')
+    # parameter-wise setting: cancel weight decay for some specific modules
+    custom_keys = dict()
+    for name, module in model.named_modules():
+        if hasattr(module, 'zero_weight_decay'):
+            custom_keys |= {
+                (name, key): dict(decay_mult=0)
+                for key in module.zero_weight_decay
+            }
+
+    paramwise_cfg = Config(dict(cfg=dict(custom_keys=custom_keys)))
+    if given_cfg := optimizer_cfg.get('paramwise_cfg'):
+        paramwise_cfg.merge_from_dict(dict(cfg=given_cfg))
+    optimizer_cfg['paramwise_cfg'] = paramwise_cfg.cfg
+    # build optimizer
+    optimizer = mm_build_optimizer(model, optimizer_cfg)
+
+    weight_decay_groups = dict()
+    lr_groups = dict()
+    for group in optimizer.param_groups:
+        if not group.get('requires_grad', True):
+            continue
+        lr_groups.setdefault(group['lr'], []).append(group)
+        weight_decay_groups.setdefault(group['weight_decay'], []).append(group)
+
+    learnable_count, fix_count = 0, 0
+    for p in model.parameters():
+        if p.requires_grad:
+            learnable_count += 1
+        else:
+            fix_count += 1
+    fix_info = f"{learnable_count} are learnable, {fix_count} are fix"
+    lr_info = "Lr group: " + ", ".join([f'{len(group)} params with lr {lr:.5f}' for lr, group in lr_groups.items()])
+    wd_info = "Weight decay group: " + ", ".join(
+        [f'{len(group)} params with weight decay {wd}' for wd, group in weight_decay_groups.items()])
+    opt_info = f"Optimizer: total {len(optimizer.param_groups)} param groups, {fix_info}. {lr_info}; {wd_info}."
+    logger.info(opt_info)
+
+    return optimizer
+
+
+@OPTIMIZERS.register_module()
+class Lion(Optimizer):
+    def __init__(
+            self,
+            params,
+            lr: float = 1e-4,
+            betas: Tuple[float, float] = (0.9, 0.99),
+            weight_decay: float = 0.0,
+    ):
+        assert lr > 0.
+        assert all(0. <= beta <= 1. for beta in betas)
+
+        defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay)
+
+        super().__init__(params, defaults)
+
+    @staticmethod
+    def update_fn(p, grad, exp_avg, lr, wd, beta1, beta2):
+        # stepweight decay
+        p.data.mul_(1 - lr * wd)
+
+        # weight update
+        update = exp_avg.clone().lerp_(grad, 1 - beta1).sign_()
+        p.add_(update, alpha=-lr)
+
+        # decay the momentum running average coefficient
+        exp_avg.lerp_(grad, 1 - beta2)
+
+    @staticmethod
+    def exists(val):
+        return val is not None
+
+    @torch.no_grad()
+    def step(
+            self,
+            closure: Optional[Callable] = None
+    ):
+
+        loss = None
+        if self.exists(closure):
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            for p in filter(lambda p: self.exists(p.grad), group['params']):
+
+                grad, lr, wd, beta1, beta2, state = p.grad, group['lr'], group['weight_decay'], *group['betas'], \
+                    self.state[p]
+
+                # init state - exponential moving average of gradient values
+                if len(state) == 0:
+                    state['exp_avg'] = torch.zeros_like(p)
+
+                exp_avg = state['exp_avg']
+
+                self.update_fn(
+                    p,
+                    grad,
+                    exp_avg,
+                    lr,
+                    wd,
+                    beta1,
+                    beta2
+                )
+
+        return loss
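A usage sketch for the registered `Lion` optimizer (toy regression target; the import path is an assumption based on this repo layout):

```python
import torch
import torch.nn as nn

from DiT_VAE.diffusion.utils.optimizer import Lion  # assumed path

model = nn.Linear(16, 4)
opt = Lion(model.parameters(), lr=1e-4, betas=(0.9, 0.99), weight_decay=1e-2)

x, y = torch.randn(8, 16), torch.randn(8, 4)
for _ in range(10):
    loss = nn.functional.mse_loss(model(x), y)
    opt.zero_grad()
    loss.backward()
    opt.step()  # sign(lerp(exp_avg, grad, 1 - beta1)) step + decoupled weight decay
print(loss.item())
```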
DiT_VAE/train_diffusion.py
ADDED
@@ -0,0 +1,5 @@
+# TODO: Implement model training and evaluation.
+# This script will load data, train a deep learning model, and evaluate its performance.
+# Future improvements may include hyperparameter tuning and multi-GPU training.
+
+
DiT_VAE/train_vae.py
ADDED
@@ -0,0 +1,369 @@
1 |
+
import argparse
|
2 |
+
import math
|
3 |
+
import os
|
4 |
+
|
5 |
+
import sys
|
6 |
+
|
7 |
+
current_path = os.path.abspath(__file__)
|
8 |
+
father_path = os.path.abspath(os.path.dirname(current_path) + os.path.sep + ".")
|
9 |
+
sys.path.append((os.path.join(father_path, 'Next3d')))
|
10 |
+
|
11 |
+
from typing import Dict, Optional, Tuple
|
12 |
+
from omegaconf import OmegaConf
|
13 |
+
import torch
|
14 |
+
import logging
|
15 |
+
import torch.nn.functional as F
|
16 |
+
import torch.utils.checkpoint
|
17 |
+
from torch.utils.data import Dataset
|
18 |
+
import inspect
|
19 |
+
from accelerate import Accelerator
|
20 |
+
from accelerate.logging import get_logger
|
21 |
+
from accelerate.utils import set_seed
|
22 |
+
import dnnlib
|
23 |
+
from diffusers.optimization import get_scheduler
|
24 |
+
from tqdm.auto import tqdm
|
25 |
+
from vae.triplane_vae import AutoencoderKL, AutoencoderKLRollOut
|
26 |
+
from vae.data.dataset_online_vae import TriplaneDataset
|
27 |
+
from einops import rearrange
|
28 |
+
from vae.utils.common_utils import instantiate_from_config
|
29 |
+
from Next3d.training_avatar_texture.triplane_generation import TriPlaneGenerator
|
30 |
+
import Next3d.legacy as legacy
|
31 |
+
|
32 |
+
from torch_utils import misc
|
33 |
+
import datetime
|
34 |
+
|
35 |
+
logger = get_logger(__name__, log_level="INFO")
|
36 |
+
|
37 |
+
|
38 |
+
def collate_fn(data):
|
39 |
+
model_names = [example["data_model_name"] for example in data]
|
40 |
+
zs = torch.cat([example["data_z"] for example in data], dim=0)
|
41 |
+
verts = torch.cat([example["data_vert"] for example in data], dim=0)
|
42 |
+
|
43 |
+
return {
|
44 |
+
'model_names': model_names,
|
45 |
+
'zs': zs,
|
46 |
+
'verts': verts
|
47 |
+
}
|
48 |
+
|
49 |
+
|
50 |
+
def rollout_fn(triplane):
|
51 |
+
triplane = rearrange(triplane, "b c f h w -> b f c h w")
|
52 |
+
b, f, c, h, w = triplane.shape
|
53 |
+
triplane = triplane.permute(0, 2, 3, 1, 4).reshape(-1, c, h, f * w)
|
54 |
+
return triplane
|
55 |
+
|
56 |
+
|
57 |
+
def unrollout_fn(triplane):
|
58 |
+
res = triplane.shape[-2]
|
59 |
+
ch = triplane.shape[1]
|
60 |
+
triplane = triplane.reshape(-1, ch // 3, res, 3, res).permute(0, 3, 1, 2, 4).reshape(-1, 3, ch, res, res)
|
61 |
+
triplane = rearrange(triplane, "b f c h w -> b c f h w")
|
62 |
+
return triplane
|
63 |
+
|
64 |
+
|
65 |
+
def triplane_generate(G_model, z, conditioning_params, std, mean, truncation_psi=0.7, truncation_cutoff=14):
|
66 |
+
w = G_model.mapping(z, conditioning_params, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff)
|
67 |
+
triplane = G_model.synthesis(w, noise_mode='const')
|
68 |
+
triplane = (triplane - mean) / std
|
69 |
+
return triplane
|
70 |
+
|
71 |
+
|
72 |
+
def gan_model(gan_models, device, gan_model_base_dir):
|
73 |
+
gan_model_dict = gan_models
|
74 |
+
gan_model_load = {}
|
75 |
+
for model_name in gan_model_dict.keys():
|
76 |
+
model_pkl = os.path.join(gan_model_base_dir, model_name + '.pkl')
|
77 |
+
with dnnlib.util.open_url(model_pkl) as f:
|
78 |
+
G = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore
|
79 |
+
G_new = TriPlaneGenerator(*G.init_args, **G.init_kwargs).eval().requires_grad_(False).to(device)
|
80 |
+
misc.copy_params_and_buffers(G, G_new, require_all=True)
|
81 |
+
G_new.neural_rendering_resolution = G.neural_rendering_resolution
|
82 |
+
G_new.rendering_kwargs = G.rendering_kwargs
|
83 |
+
gan_model_load[model_name] = G_new
|
84 |
+
return gan_model_load
|
85 |
+
|
86 |
+
|
87 |
+
def main(vae_config: str,
|
88 |
+
gan_model_config: str,
|
89 |
+
output_dir: str,
|
90 |
+
std_dir: str,
|
91 |
+
mean_dir: str,
|
92 |
+
conditioning_params_dir: str,
|
93 |
+
gan_model_base_dir: str,
|
94 |
+
train_data: Dict,
|
95 |
+
train_batch_size: int = 2,
|
96 |
+
max_train_steps: int = 500,
|
97 |
+
learning_rate: float = 3e-5,
|
98 |
+
scale_lr: bool = False,
|
99 |
+
lr_scheduler: str = "constant",
|
100 |
+
lr_warmup_steps: int = 0,
|
101 |
+
adam_beta1: float = 0.5,
|
102 |
+
adam_beta2: float = 0.9,
|
103 |
+
adam_weight_decay: float = 1e-2,
|
104 |
+
adam_epsilon: float = 1e-08,
|
105 |
+
max_grad_norm: float = 1.0,
|
106 |
+
gradient_accumulation_steps: int = 1,
|
107 |
+
gradient_checkpointing: bool = True,
|
108 |
+
checkpointing_steps: int = 500,
|
109 |
+
pretrained_model_path_zero123: str = None,
|
110 |
+
resume_from_checkpoint: Optional[str] = None,
|
111 |
+
mixed_precision: Optional[str] = "fp16",
|
112 |
+
use_8bit_adam: bool = False,
|
113 |
+
rollout: bool = False,
|
114 |
+
enable_xformers_memory_efficient_attention: bool = True,
|
115 |
+
seed: Optional[int] = None, ):
|
116 |
+
*_, config = inspect.getargvalues(inspect.currentframe())
|
117 |
+
base_dir = output_dir
|
118 |
+
|
119 |
+
accelerator = Accelerator(
|
120 |
+
gradient_accumulation_steps=gradient_accumulation_steps,
|
121 |
+
mixed_precision=mixed_precision,
|
122 |
+
)
|
123 |
+
logging.basicConfig(
|
124 |
+
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
125 |
+
datefmt="%m/%d/%Y %H:%M:%S",
|
126 |
+
level=logging.INFO,
|
127 |
+
)
|
128 |
+
logger.info(accelerator.state, main_process_only=False)
|
129 |
+
# If passed along, set the training seed now.
|
130 |
+
if seed is not None:
|
131 |
+
set_seed(seed)
|
132 |
+
if accelerator.is_main_process:
|
133 |
+
now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
|
134 |
+
output_dir = os.path.join(output_dir, now)
|
135 |
+
os.makedirs(output_dir, exist_ok=True)
|
136 |
+
os.makedirs(f"{output_dir}/samples", exist_ok=True)
|
137 |
+
os.makedirs(f"{output_dir}/inv_latents", exist_ok=True)
|
138 |
+
OmegaConf.save(config, os.path.join(output_dir, 'config.yaml'))
|
139 |
+
|
140 |
+
config_vae = OmegaConf.load(vae_config)
|
141 |
+
|
142 |
+
if rollout:
|
143 |
+
vae = AutoencoderKLRollOut(ddconfig=config_vae['ddconfig'], lossconfig=config_vae['lossconfig'], embed_dim=8)
|
144 |
+
|
145 |
+
else:
|
146 |
+
vae = AutoencoderKL(ddconfig=config_vae['ddconfig'], lossconfig=config_vae['lossconfig'], embed_dim=8)
|
147 |
+
print(f"VAE total params = {len(list(vae.named_parameters()))} ")
|
148 |
+
if 'perceptual_weight' in config_vae['lossconfig']['params'].keys():
|
149 |
+
config_vae['lossconfig']['params']['device'] = str(accelerator.device)
|
150 |
+
loss_fn = instantiate_from_config(config_vae['lossconfig'])
|
151 |
+
conditioning_params = torch.load(conditioning_params_dir).to(str(accelerator.device))
|
152 |
+
data_std = torch.load(std_dir).to(str(accelerator.device)).reshape(1, -1, 1, 1, 1)
|
153 |
+
|
154 |
+
data_mean = torch.load(mean_dir).to(str(accelerator.device)).reshape(1, -1, 1, 1, 1)
|
155 |
+
|
156 |
+
# define the gan model
|
157 |
+
print("########## gan model load ##########")
|
158 |
+
config_gan_model = OmegaConf.load(gan_model_config)
|
159 |
+
gan_model_all = gan_model(config_gan_model['gan_models'], str(accelerator.device), gan_model_base_dir)
|
160 |
+
print("########## gan model loaded ##########")
|
161 |
+
if scale_lr:
|
162 |
+
learning_rate = (
|
163 |
+
learning_rate * gradient_accumulation_steps * train_batch_size * accelerator.num_processes
|
164 |
+
)
|
165 |
+
|
166 |
+
# Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
|
167 |
+
if use_8bit_adam:
|
168 |
+
try:
|
169 |
+
import bitsandbytes as bnb
|
170 |
+
except ImportError:
|
171 |
+
raise ImportError(
|
172 |
+
"Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
|
173 |
+
)
|
174 |
+
|
175 |
+
optimizer_cls = bnb.optim.AdamW8bit
|
176 |
+
else:
|
177 |
+
optimizer_cls = torch.optim.AdamW
|
178 |
+
|
179 |
+
optimizer = optimizer_cls(
|
180 |
+
vae.parameters(),
|
181 |
+
lr=learning_rate,
|
182 |
+
betas=(adam_beta1, adam_beta2),
|
183 |
+
weight_decay=adam_weight_decay,
|
184 |
+
eps=adam_epsilon,
|
185 |
+
)
|
186 |
+
|
187 |
+
train_dataset = TriplaneDataset(**train_data)
|
188 |
+
|
189 |
+
# Preprocessing the dataset
|
190 |
+
|
191 |
+
# DataLoaders creation:
|
192 |
+
train_dataloader = torch.utils.data.DataLoader(
|
193 |
+
train_dataset, batch_size=train_batch_size, collate_fn=collate_fn, shuffle=True, num_workers=2
|
194 |
+
)
|
195 |
+
|
196 |
+
lr_scheduler = get_scheduler(
|
197 |
+
lr_scheduler,
|
198 |
+
optimizer=optimizer,
|
199 |
+
num_warmup_steps=lr_warmup_steps * gradient_accumulation_steps,
|
200 |
+
num_training_steps=max_train_steps * gradient_accumulation_steps,
|
201 |
+
)
|
202 |
+
|
203 |
+
vae, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
|
204 |
+
vae, optimizer, train_dataloader, lr_scheduler
|
205 |
+
)
|
206 |
+
|
207 |
+
weight_dtype = torch.float32
|
208 |
+
|
209 |
+
# Move text_encode and vae to gpu and cast to weight_dtype
|
210 |
+
|
211 |
+
if accelerator.mixed_precision == "fp16":
|
212 |
+
weight_dtype = torch.float16
|
213 |
+
elif accelerator.mixed_precision == "bf16":
|
214 |
+
weight_dtype = torch.bfloat16
|
215 |
+
|
216 |
+
vae.to(accelerator.device, dtype=weight_dtype)
|
217 |
+
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
|
218 |
+
# Afterwards we recalculate our number of training epochs
|
219 |
+
num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
|
220 |
+
|
221 |
+
# We need to initialize the trackers we use, and also store our configuration.
|
222 |
+
# The trackers initializes automatically on the main process.
|
223 |
+
if accelerator.is_main_process:
|
224 |
+
accelerator.init_trackers("trainvae", config=vars(args))
|
225 |
+
|
226 |
+
# Train!
|
227 |
+
total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps
|
228 |
+
|
229 |
+
logger.info("***** Running training *****")
|
230 |
+
logger.info(f" Num examples = {len(train_dataset)}")
|
231 |
+
logger.info(f" Num Epochs = {num_train_epochs}")
|
232 |
+
logger.info(f" Instantaneous batch size per device = {train_batch_size}")
|
233 |
+
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
|
234 |
+
logger.info(f" Gradient Accumulation steps = {gradient_accumulation_steps}")
|
235 |
+
logger.info(f" Total optimization steps = {max_train_steps}")
|
236 |
+
global_step = 0
|
237 |
+
first_epoch = 0
|
238 |
+
|
239 |
+
# Potentially load in the weights and states from a previous save
|
240 |
+
if resume_from_checkpoint:
|
241 |
+
if resume_from_checkpoint != "latest":
|
242 |
+
path = os.path.basename(resume_from_checkpoint)
|
243 |
+
else:
|
244 |
+
# Get the most recent checkpoint
|
245 |
+
dirs = os.listdir(output_dir)
|
246 |
+
dirs = [d for d in dirs if d.startswith("checkpoint")]
|
247 |
+
dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
|
248 |
+
path = dirs[-1]
|
249 |
+
accelerator.print(f"Resuming from checkpoint {path}")
|
250 |
+
if resume_from_checkpoint != "latest":
|
251 |
+
accelerator.load_state(resume_from_checkpoint)
|
252 |
+
else:
|
253 |
+
accelerator.load_state(os.path.join(output_dir, path))
|
254 |
+
|
255 |
+
global_step = int(path.split("-")[1])
|
256 |
+
|
257 |
+
first_epoch = global_step // num_update_steps_per_epoch
|
258 |
+
resume_step = global_step % num_update_steps_per_epoch
|
259 |
+
else:
|
260 |
+
all_final_training_dirs = []
|
261 |
+
dirs = os.listdir(base_dir)
|
262 |
+
if len(dirs) != 0:
|
263 |
+
dirs = [d for d in dirs if d.startswith("2024")] # specific years
|
264 |
+
if len(dirs) != 0:
|
265 |
+
base_resume_paths = [os.path.join(base_dir, d) for d in dirs]
|
266 |
+
for base_resume_path in base_resume_paths:
|
267 |
+
checkpoint_file_names = os.listdir(base_resume_path)
|
268 |
+
checkpoint_file_names = [d for d in checkpoint_file_names if d.startswith("checkpoint")]
|
269 |
+
if len(checkpoint_file_names) != 0:
|
270 |
+
for checkpoint_file_name in checkpoint_file_names:
|
271 |
+
final_training_dir = os.path.join(base_resume_path, checkpoint_file_name)
|
272 |
+
all_final_training_dirs.append(final_training_dir)
|
273 |
+
if len(all_final_training_dirs) != 0:
|
274 |
+
                    sorted_all_final_training_dirs = sorted(all_final_training_dirs, key=lambda x: int(x.split("-")[1]))
                    latest_dir = sorted_all_final_training_dirs[-1]
                    path = os.path.basename(latest_dir)
                    accelerator.print(f"Resuming from checkpoint {path}")
                    accelerator.load_state(latest_dir)
                    global_step = int(path.split("-")[1])

                    first_epoch = global_step // num_update_steps_per_epoch
                    resume_step = global_step % num_update_steps_per_epoch
                else:
                    accelerator.print("Training from start")
            else:
                accelerator.print("Training from start")
        else:
            accelerator.print("Training from start")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)
    progress_bar.set_description("Steps")

    for epoch in range(first_epoch, num_train_epochs):
        vae.train()
        train_loss = 0.0
        for step, batch in enumerate(train_dataloader):
            # Optionally skip steps already covered when resuming mid-epoch:
            # if resume_from_checkpoint and epoch == first_epoch and step < resume_step:
            #     if step % gradient_accumulation_steps == 0:
            #         progress_bar.update(1)
            #     continue
            with accelerator.accumulate(vae):
                # Sample triplanes online from the frozen GANs for this batch of latents.
                z_values = batch["zs"].to(weight_dtype)
                model_names = batch["model_names"]

                triplane_values = []
                with torch.no_grad():
                    for z_id in range(z_values.shape[0]):
                        z_value = z_values[z_id].unsqueeze(0)
                        model_name = model_names[z_id]
                        triplane_value = triplane_generate(gan_model_all[model_name], z_value,
                                                           conditioning_params, data_std, data_mean)
                        triplane_values.append(triplane_value)
                triplane_values = torch.cat(triplane_values, dim=0)
                vert_values = batch["verts"].to(weight_dtype)
                triplane_values = rearrange(triplane_values, "b f c h w -> b c f h w")
                if rollout:
                    triplane_values_roll = rollout_fn(triplane_values.clone())
                    reconstructions, posterior = vae(triplane_values_roll)
                    reconstructions_unroll = unrollout_fn(reconstructions)
                    loss, log_dict_ae = loss_fn(triplane_values, reconstructions_unroll, posterior, vert_values,
                                                split="train")
                else:
                    reconstructions, posterior = vae(triplane_values)
                    loss, log_dict_ae = loss_fn(triplane_values, reconstructions, posterior, vert_values,
                                                split="train")
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(vae.parameters(), max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1
                accelerator.log({"train_loss": train_loss}, step=global_step)
                train_loss = 0.0

                if global_step % checkpointing_steps == 0:
                    if accelerator.is_main_process:
                        save_path = os.path.join(output_dir, f"checkpoint-{global_step}")
                        accelerator.save_state(save_path)
                        logger.info(f"Saved state to {save_path}")

            logs = log_dict_ae
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)

            if global_step >= max_train_steps:
                break

    accelerator.wait_for_everyone()
    accelerator.end_training()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="./configs/triplane_vae.yaml")
    args = parser.parse_args()
    main(**OmegaConf.load(args.config))
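For reference, a hypothetical sketch of the rollout_fn/unrollout_fn pair consumed above. The real functions come from the VAE's rollout utilities elsewhere in the repo; this only illustrates the idea of flattening the f triplane planes into a single 2D grid before encoding and restoring them afterwards (the plane count of 3 is an assumption here):

from einops import rearrange

def rollout_fn(x):    # (b, c, f, h, w) -> (b, c, f*h, w): stack planes along height
    return rearrange(x, "b c f h w -> b c (f h) w")

def unrollout_fn(x):  # inverse: split the stacked height axis back into f planes
    return rearrange(x, "b c (f h) w -> b c f h w", f=3)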
DiT_VAE/util.py
ADDED
@@ -0,0 +1,217 @@
import os
import imageio
import numpy as np
from typing import Union

import torch
import torchvision

from tqdm import tqdm
from einops import rearrange


def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=4, fps=8):
    videos = rearrange(videos, "b c t h w -> t b c h w")
    outputs = []
    for x in videos:
        x = torchvision.utils.make_grid(x, nrow=n_rows)
        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
        if rescale:
            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
        x = (x * 255).numpy().astype(np.uint8)
        outputs.append(x)

    os.makedirs(os.path.dirname(path), exist_ok=True)
    imageio.mimsave(path, outputs, fps=fps)


# DDIM Inversion
@torch.no_grad()
def init_prompt(prompt, pipeline):
    uncond_input = pipeline.tokenizer(
        [""], padding="max_length", max_length=pipeline.tokenizer.model_max_length,
        return_tensors="pt"
    )
    uncond_embeddings = pipeline.text_encoder(uncond_input.input_ids.to(pipeline.device))[0]
    text_input = pipeline.tokenizer(
        [prompt],
        padding="max_length",
        max_length=pipeline.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0]
    context = torch.cat([uncond_embeddings, text_embeddings])

    return context


def next_step(model_output: Union[torch.FloatTensor, np.ndarray], timestep: int,
              sample: Union[torch.FloatTensor, np.ndarray], ddim_scheduler):
    timestep, next_timestep = min(
        timestep - ddim_scheduler.config.num_train_timesteps // ddim_scheduler.num_inference_steps, 999), timestep
    alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep] if timestep >= 0 else ddim_scheduler.final_alpha_cumprod
    alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep]
    beta_prod_t = 1 - alpha_prod_t
    next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
    next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output
    next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction
    return next_sample


def get_noise_pred_single(latents, t, context, unet):
    noise_pred = unet(latents, t, encoder_hidden_states=context)["sample"]
    return noise_pred


@torch.no_grad()
def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt):
    context = init_prompt(prompt, pipeline)
    uncond_embeddings, cond_embeddings = context.chunk(2)
    all_latent = [latent]
    latent = latent.clone().detach()
    for i in tqdm(range(num_inv_steps)):
        t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1]
        noise_pred = get_noise_pred_single(latent, t, cond_embeddings, pipeline.unet)
        latent = next_step(noise_pred, t, latent, ddim_scheduler)
        all_latent.append(latent)
    return all_latent


@torch.no_grad()
def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt=""):
    ddim_latents = ddim_loop(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt)
    return ddim_latents


def rendering():
    pass


def force_zero_snr(betas):
    # Rescale a beta schedule so the terminal cumulative alpha is ~0 (zero terminal SNR).
    alphas = 1 - betas
    alphas_bar = torch.cumprod(alphas, dim=0)
    alphas_bar_sqrt = alphas_bar ** (1 / 2)
    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() - 1e-6
    alphas_bar_sqrt -= alphas_bar_sqrt_T
    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
    alphas_bar = alphas_bar_sqrt ** 2
    alphas = alphas_bar[1:] / alphas_bar[:-1]
    alphas = torch.cat([alphas_bar[0:1], alphas], 0)
    betas = 1 - alphas
    return betas


def make_beta_schedule(schedule="scaled_linear", n_timestep=1000, linear_start=0.00085, linear_end=0.012,
                       cosine_s=8e-3, shift_scale=None):
    if schedule == "scaled_linear":
        betas = (
            torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
        )
    elif schedule == 'linear':
        betas = (
            torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
        )
    elif schedule == "cosine":
        timesteps = (
            torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
        )
        alphas = timesteps / (1 + cosine_s) * np.pi / 2
        alphas = torch.cos(alphas).pow(2)
        alphas = alphas / alphas[0]
        betas = 1 - alphas[1:] / alphas[:-1]
        betas = torch.clamp(betas, min=0, max=0.999)  # was np.clip, which returns an ndarray and breaks betas.numpy() below

    elif schedule == "sqrt_linear":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
    elif schedule == "sqrt":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
    elif schedule == 'linear_force_zero_snr':
        betas = (
            torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
        )
        betas = force_zero_snr(betas)
    elif schedule == 'linear_100':
        betas = (
            torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
        )
        betas = betas[:100]
    else:
        raise ValueError(f"schedule '{schedule}' unknown.")

    if shift_scale is not None:
        print("shift_scale")
        betas = shift_schedule(betas, shift_scale)

    return betas.numpy()


def shift_schedule(base_betas, shift_scale):
    # Shift the SNR of a base schedule: snr = ab / (1 - ab)  <=>  ab = snr / (1 + snr)
    alphas = 1 - base_betas
    alphas_bar = torch.cumprod(alphas, dim=0)
    snr = alphas_bar / (1 - alphas_bar)
    shifted_snr = snr * ((1 / shift_scale) ** 2)
    shifted_alphas_bar = shifted_snr / (1 + shifted_snr)
    shifted_alphas = shifted_alphas_bar[1:] / shifted_alphas_bar[:-1]
    shifted_alphas = torch.cat([shifted_alphas_bar[0:1], shifted_alphas], 0)
    shifted_betas = 1 - shifted_alphas
    return shifted_betas


def shift_dim(x, src_dim=-1, dest_dim=-1, make_contiguous=True):
    # Move dimension src_dim of x to position dest_dim.
    n_dims = len(x.shape)
    if src_dim < 0:
        src_dim = n_dims + src_dim
    if dest_dim < 0:
        dest_dim = n_dims + dest_dim

    assert 0 <= src_dim < n_dims and 0 <= dest_dim < n_dims

    dims = list(range(n_dims))
    del dims[src_dim]

    permutation = []
    ctr = 0
    for i in range(n_dims):
        if i == dest_dim:
            permutation.append(src_dim)
        else:
            permutation.append(dims[ctr])
            ctr += 1
    x = x.permute(permutation)
    if make_contiguous:
        x = x.contiguous()
    return x


# reshapes tensor start from dim i (inclusive)
# to dim j (exclusive) to the desired shape
# e.g. if x.shape = (b, thw, c) then
# view_range(x, 1, 2, (t, h, w)) returns
# x of shape (b, t, h, w, c)
def view_range(x, i, j, shape):
    shape = tuple(shape)

    n_dims = len(x.shape)
    if i < 0:
        i = n_dims + i

    if j is None:
        j = n_dims
    elif j < 0:
        j = n_dims + j

    assert 0 <= i < j <= n_dims

    x_shape = x.shape
    target_shape = x_shape[:i] + shape + x_shape[j:]
    return x.view(target_shape)


def tensor_slice(x, begin, size):
    # Slice x starting at `begin` with extent `size` per dim (-1 means "to the end").
    assert all([b >= 0 for b in begin])
    size = [l - b if s == -1 else s
            for s, b, l in zip(size, begin, x.shape)]
    assert all([s >= 0 for s in size])

    slices = [slice(b, b + s) for b, s in zip(begin, size)]
    return x[tuple(slices)]
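A quick sanity check of the schedule utilities above (a sketch, not part of the file): forcing zero terminal SNR should drive the final cumulative alpha to ~0, and shift_schedule with shift_scale=2 should divide the SNR at every timestep by 4:

import torch
from DiT_VAE.util import make_beta_schedule, shift_schedule

betas = torch.from_numpy(make_beta_schedule("linear_force_zero_snr", n_timestep=1000))
alphas_bar = torch.cumprod(1 - betas, dim=0)
print(alphas_bar[-1])            # ~0: the terminal step is pure noise

shifted = shift_schedule(betas, shift_scale=2.0)
ab2 = torch.cumprod(1 - shifted, dim=0)
print((ab2 / (1 - ab2))[:5])     # SNR reduced 4x relative to the base schedule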
DiT_VAE/vae/__init__.py
ADDED
File without changes
DiT_VAE/vae/aemodules3d.py
ADDED
@@ -0,0 +1,368 @@
# TATS
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

import math
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from .attention_vae import MultiHeadAttention


def shift_dim(x, src_dim=-1, dest_dim=-1, make_contiguous=True):
    n_dims = len(x.shape)
    if src_dim < 0:
        src_dim = n_dims + src_dim
    if dest_dim < 0:
        dest_dim = n_dims + dest_dim

    assert 0 <= src_dim < n_dims and 0 <= dest_dim < n_dims

    dims = list(range(n_dims))
    del dims[src_dim]

    permutation = []
    ctr = 0
    for i in range(n_dims):
        if i == dest_dim:
            permutation.append(src_dim)
        else:
            permutation.append(dims[ctr])
            ctr += 1
    x = x.permute(permutation)
    if make_contiguous:
        x = x.contiguous()
    return x


def silu(x):
    return x * torch.sigmoid(x)


class SiLU(nn.Module):
    def __init__(self):
        super(SiLU, self).__init__()

    def forward(self, x):
        return silu(x)


def hinge_d_loss(logits_real, logits_fake):
    loss_real = torch.mean(F.relu(1. - logits_real))
    loss_fake = torch.mean(F.relu(1. + logits_fake))
    d_loss = 0.5 * (loss_real + loss_fake)
    return d_loss


def vanilla_d_loss(logits_real, logits_fake):
    d_loss = 0.5 * (
        torch.mean(torch.nn.functional.softplus(-logits_real)) +
        torch.mean(torch.nn.functional.softplus(logits_fake)))
    return d_loss


def Normalize(in_channels, norm_type='group'):
    assert norm_type in ['group', 'batch']
    if norm_type == 'group':
        return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
    elif norm_type == 'batch':
        return torch.nn.SyncBatchNorm(in_channels)


class ResBlock(nn.Module):
    def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=0.0, norm_type='group',
                 padding_type='replicate'):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = Normalize(in_channels, norm_type)
        self.conv1 = SamePadConv3d(in_channels, out_channels, kernel_size=3, padding_type=padding_type)
        self.dropout = torch.nn.Dropout(dropout)
        # norm2 follows conv1, whose output has out_channels (the original used
        # in_channels, which only worked because all call sites pass in == out).
        self.norm2 = Normalize(out_channels, norm_type)
        self.conv2 = SamePadConv3d(out_channels, out_channels, kernel_size=3, padding_type=padding_type)
        if self.in_channels != self.out_channels:
            self.conv_shortcut = SamePadConv3d(in_channels, out_channels, kernel_size=3, padding_type=padding_type)

    def forward(self, x):
        h = x
        h = self.norm1(h)
        h = silu(h)
        h = self.conv1(h)
        h = self.norm2(h)
        h = silu(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            x = self.conv_shortcut(x)

        return x + h


# Does not support dilation
class SamePadConv3d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True, padding_type='replicate'):
        super().__init__()
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size,) * 3
        if isinstance(stride, int):
            stride = (stride,) * 3

        # assumes that the input shape is divisible by stride
        total_pad = tuple([k - s for k, s in zip(kernel_size, stride)])
        pad_input = []
        for p in total_pad[::-1]:  # reverse since F.pad starts from last dim
            pad_input.append((p // 2 + p % 2, p // 2))
        pad_input = sum(pad_input, tuple())

        self.pad_input = pad_input
        self.padding_type = padding_type

        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size,
                              stride=stride, padding=0, bias=bias)
        self.weight = self.conv.weight

    def forward(self, x):
        return self.conv(F.pad(x, self.pad_input, mode=self.padding_type))


class SamePadConvTranspose3d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True, padding_type='replicate'):
        super().__init__()
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size,) * 3
        if isinstance(stride, int):
            stride = (stride,) * 3

        total_pad = tuple([k - s for k, s in zip(kernel_size, stride)])
        pad_input = []
        for p in total_pad[::-1]:  # reverse since F.pad starts from last dim
            pad_input.append((p // 2 + p % 2, p // 2))
        pad_input = sum(pad_input, tuple())
        self.pad_input = pad_input
        self.padding_type = padding_type
        self.convt = nn.ConvTranspose3d(in_channels, out_channels, kernel_size,
                                        stride=stride, bias=bias,
                                        padding=tuple([k - 1 for k in kernel_size]))

    def forward(self, x):
        return self.convt(F.pad(x, self.pad_input, mode=self.padding_type))


class AxialBlock(nn.Module):
    def __init__(self, n_hiddens, n_head):
        super().__init__()
        kwargs = dict(shape=(0,) * 3, dim_q=n_hiddens,
                      dim_kv=n_hiddens, n_head=n_head,
                      n_layer=1, causal=False, attn_type='axial')
        self.attn_w = MultiHeadAttention(attn_kwargs=dict(axial_dim=-2), **kwargs)
        self.attn_h = MultiHeadAttention(attn_kwargs=dict(axial_dim=-3), **kwargs)
        self.attn_t = MultiHeadAttention(attn_kwargs=dict(axial_dim=-4), **kwargs)

    def forward(self, x):
        x = shift_dim(x, 1, -1)
        x = self.attn_w(x, x, x) + self.attn_h(x, x, x) + self.attn_t(x, x, x)
        x = shift_dim(x, -1, 1)
        return x


class AttentionResidualBlock(nn.Module):
    def __init__(self, n_hiddens):
        super().__init__()
        self.block = nn.Sequential(
            Normalize(n_hiddens),
            SiLU(),
            SamePadConv3d(n_hiddens, n_hiddens // 2, 3, bias=False),
            Normalize(n_hiddens // 2),
            SiLU(),
            SamePadConv3d(n_hiddens // 2, n_hiddens, 1, bias=False),
            Normalize(n_hiddens),
            SiLU(),
            AxialBlock(n_hiddens, 2)
        )

    def forward(self, x):
        return x + self.block(x)


class Encoder(nn.Module):
    def __init__(self, n_hiddens, downsample, z_channels, double_z, image_channel=3, norm_type='group',
                 padding_type='replicate', res_num=1):
        super().__init__()
        n_times_downsample = np.array([int(math.log2(d)) for d in downsample])
        self.conv_blocks = nn.ModuleList()
        max_ds = n_times_downsample.max()
        self.conv_first = SamePadConv3d(image_channel, n_hiddens, kernel_size=3, padding_type=padding_type)

        for i in range(max_ds):
            block = nn.Module()
            in_channels = n_hiddens * 2 ** i
            out_channels = n_hiddens * 2 ** (i + 1)
            stride = list(tuple([2 if d > 0 else 1 for d in n_times_downsample]))
            stride[0] = 1  # never downsample the plane axis
            stride = tuple(stride)
            block.down = SamePadConv3d(in_channels, out_channels, 4, stride=stride, padding_type=padding_type)
            block.res = ResBlock(out_channels, out_channels, norm_type=norm_type)
            self.conv_blocks.append(block)
            n_times_downsample -= 1

        self.final_block = nn.Sequential(
            Normalize(out_channels, norm_type),
            SiLU(),
            SamePadConv3d(out_channels, 2 * z_channels if double_z else z_channels,
                          kernel_size=3,
                          stride=1,
                          padding_type=padding_type)
        )
        self.out_channels = out_channels

    def forward(self, x):
        h = self.conv_first(x)
        for block in self.conv_blocks:
            h = block.down(h)
            h = block.res(h)
        h = self.final_block(h)
        return h


class Decoder(nn.Module):
    def __init__(self, n_hiddens, upsample, z_channels, image_channel, norm_type='group'):
        super().__init__()

        n_times_upsample = np.array([int(math.log2(d)) for d in upsample])
        max_us = n_times_upsample.max()
        in_channels = z_channels
        self.conv_blocks = nn.ModuleList()
        for i in range(max_us):
            block = nn.Module()
            in_channels = in_channels if i == 0 else n_hiddens * 2 ** (max_us - i + 1)
            out_channels = n_hiddens * 2 ** (max_us - i)
            us = list(tuple([2 if d > 0 else 1 for d in n_times_upsample]))
            us[0] = 1
            us = tuple(us)
            block.up = SamePadConvTranspose3d(in_channels, out_channels, 4, stride=us)
            block.res1 = ResBlock(out_channels, out_channels, norm_type=norm_type)
            block.res2 = ResBlock(out_channels, out_channels, norm_type=norm_type)
            self.conv_blocks.append(block)
            n_times_upsample -= 1

        self.conv_out = SamePadConv3d(out_channels, image_channel, kernel_size=3)

    def forward(self, x):
        h = x
        for i, block in enumerate(self.conv_blocks):
            h = block.up(h)
            h = block.res1(h)
            h = block.res2(h)
        h = self.conv_out(h)
        return h


class EncoderRe(nn.Module):
    def __init__(self, n_hiddens, downsample, z_channels, double_z, image_channel=3, norm_type='group',
                 padding_type='replicate', n_res_layers=2):
        super().__init__()
        self.conv_blocks = nn.ModuleList()
        self.conv_first = SamePadConv3d(image_channel, n_hiddens, kernel_size=3, padding_type=padding_type)

        for i, step in enumerate(downsample):
            block = nn.Module()
            in_channels = n_hiddens
            out_channels = n_hiddens
            stride = tuple([1, downsample[i], downsample[i]])
            block.down = SamePadConv3d(in_channels, out_channels, 4, stride=stride, padding_type=padding_type)
            block.res1 = ResBlock(out_channels, out_channels, norm_type=norm_type)
            block.res2 = ResBlock(out_channels, out_channels, norm_type=norm_type)
            self.conv_blocks.append(block)

        self.res_stack = nn.Sequential(
            *[AttentionResidualBlock(out_channels)
              for _ in range(n_res_layers)]
        )
        self.final_block = nn.Sequential(
            Normalize(out_channels, norm_type),
            SiLU(),
            SamePadConv3d(out_channels, 2 * z_channels if double_z else z_channels,
                          kernel_size=3,
                          stride=1,
                          padding_type=padding_type)
        )
        self.out_channels = out_channels

    def forward(self, x):
        h = self.conv_first(x)
        for block in self.conv_blocks:
            h = block.down(h)
            h = block.res1(h)
            h = block.res2(h)
        h = self.res_stack(h)
        h = self.final_block(h)
        return h


class DecoderRe(nn.Module):
    def __init__(self, n_hiddens, upsample, z_channels, image_channel, norm_type='group', padding_type='replicate',
                 n_res_layers=2):
        super().__init__()
        self.conv_first = SamePadConv3d(z_channels, n_hiddens, kernel_size=3, padding_type=padding_type)
        self.res_stack = nn.Sequential(
            *[AttentionResidualBlock(n_hiddens)
              for _ in range(n_res_layers)]
        )
        self.conv_blocks = nn.ModuleList()
        for i, step in enumerate(upsample):
            block = nn.Module()
            in_channels = n_hiddens
            out_channels = n_hiddens
            stride = tuple([1, upsample[i], upsample[i]])
            block.up = SamePadConvTranspose3d(in_channels, out_channels, 4, stride=stride)
            block.res1 = ResBlock(out_channels, out_channels, norm_type=norm_type)
            block.res2 = ResBlock(out_channels, out_channels, norm_type=norm_type)
            self.conv_blocks.append(block)

        self.conv_out = SamePadConv3d(out_channels, image_channel, kernel_size=3)

    def forward(self, x):
        h = x
        h = self.conv_first(h)
        h = self.res_stack(h)
        for i, block in enumerate(self.conv_blocks):
            h = block.up(h)
            h = block.res1(h)
            h = block.res2(h)
        h = self.conv_out(h)
        return h


# unit test
if __name__ == '__main__':
    encoder = EncoderRe(n_hiddens=320, downsample=[1, 2, 2, 2], z_channels=8, double_z=True, image_channel=96,
                        norm_type='group', padding_type='replicate')
    encoder = encoder.cuda()
    en_input = torch.rand(1, 96, 3, 256, 256).cuda()
    out = encoder(en_input)
    print(out.shape)
    mean, logvar = torch.chunk(out, 2, dim=1)
    decoder = DecoderRe(n_hiddens=320, upsample=[2, 2, 2, 1], z_channels=8, image_channel=96,
                        norm_type='group')

    decoder = decoder.cuda()
    out = decoder(mean)
    print(out.shape)
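A minimal CPU sketch (the sizes here are illustrative, not the training config) of the shape contract of EncoderRe/DecoderRe above: spatial dims shrink by the product of downsample, the plane axis is untouched, and double_z stacks mean and logvar along the channel axis:

import torch
from DiT_VAE.vae.aemodules3d import EncoderRe, DecoderRe

enc = EncoderRe(n_hiddens=64, downsample=[2, 2], z_channels=4, double_z=True, image_channel=8)
dec = DecoderRe(n_hiddens=64, upsample=[2, 2], z_channels=4, image_channel=8)
x = torch.rand(1, 8, 3, 32, 32)               # (B, C, planes, H, W)
mean, logvar = torch.chunk(enc(x), 2, dim=1)  # each (1, 4, 3, 8, 8)
print(dec(mean).shape)                        # torch.Size([1, 8, 3, 32, 32])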
DiT_VAE/vae/attention_vae.py
ADDED
@@ -0,0 +1,620 @@
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint


def tensor_slice(x, begin, size):
    assert all([b >= 0 for b in begin])
    size = [l - b if s == -1 else s
            for s, b, l in zip(size, begin, x.shape)]
    assert all([s >= 0 for s in size])

    slices = [slice(b, b + s) for b, s in zip(begin, size)]
    return x[tuple(slices)]


# reshapes tensor start from dim i (inclusive)
# to dim j (exclusive) to the desired shape
# e.g. if x.shape = (b, thw, c) then
# view_range(x, 1, 2, (t, h, w)) returns
# x of shape (b, t, h, w, c)
def view_range(x, i, j, shape):
    shape = tuple(shape)

    n_dims = len(x.shape)
    if i < 0:
        i = n_dims + i

    if j is None:
        j = n_dims
    elif j < 0:
        j = n_dims + j

    assert 0 <= i < j <= n_dims

    x_shape = x.shape
    target_shape = x_shape[:i] + shape + x_shape[j:]
    return x.view(target_shape)


def shift_dim(x, src_dim=-1, dest_dim=-1, make_contiguous=True):
    n_dims = len(x.shape)
    if src_dim < 0:
        src_dim = n_dims + src_dim
    if dest_dim < 0:
        dest_dim = n_dims + dest_dim

    assert 0 <= src_dim < n_dims and 0 <= dest_dim < n_dims

    dims = list(range(n_dims))
    del dims[src_dim]

    permutation = []
    ctr = 0
    for i in range(n_dims):
        if i == dest_dim:
            permutation.append(src_dim)
        else:
            permutation.append(dims[ctr])
            ctr += 1
    x = x.permute(permutation)
    if make_contiguous:
        x = x.contiguous()
    return x


class AttentionStack(nn.Module):
    def __init__(
            self, shape, embd_dim, n_head, n_layer, dropout,
            attn_type, attn_dropout, class_cond_dim, frame_cond_shape,
    ):
        super().__init__()
        self.shape = shape
        self.embd_dim = embd_dim
        self.use_frame_cond = frame_cond_shape is not None

        self.right_shift = RightShift(embd_dim)
        self.pos_embd = AddBroadcastPosEmbed(
            shape=shape, embd_dim=embd_dim
        )

        self.attn_nets = nn.ModuleList(
            [
                AttentionBlock(
                    shape=shape,
                    embd_dim=embd_dim,
                    n_head=n_head,
                    n_layer=n_layer,
                    dropout=dropout,
                    attn_type=attn_type,
                    attn_dropout=attn_dropout,
                    class_cond_dim=class_cond_dim,
                    frame_cond_shape=frame_cond_shape
                )
                for i in range(n_layer)
            ]
        )

    def forward(self, x, cond, decode_step, decode_idx):
        """
        Args
        ------
            x: (b, d1, d2, ..., dn, embd_dim)
            cond: a dictionary of conditioning tensors

            (below is used only when sampling for fast decoding)
            decode_step: the enumerated raster-scan order of the current idx being sampled
            decode_idx: a tuple representing the current idx being sampled
        """
        x = self.right_shift(x, decode_step)
        x = self.pos_embd(x, decode_step, decode_idx)
        for net in self.attn_nets:
            x = net(x, cond, decode_step, decode_idx)

        return x


class AttentionBlock(nn.Module):
    def __init__(self, shape, embd_dim, n_head, n_layer, dropout,
                 attn_type, attn_dropout, class_cond_dim, frame_cond_shape):
        super().__init__()
        self.use_frame_cond = frame_cond_shape is not None

        self.pre_attn_norm = LayerNorm(embd_dim, class_cond_dim)
        self.post_attn_dp = nn.Dropout(dropout)
        self.attn = MultiHeadAttention(shape, embd_dim, embd_dim, n_head,
                                       n_layer, causal=True, attn_type=attn_type,
                                       attn_kwargs=dict(attn_dropout=attn_dropout))

        if frame_cond_shape is not None:
            enc_len = np.prod(frame_cond_shape[:-1])
            self.pre_enc_norm = LayerNorm(embd_dim, class_cond_dim)
            self.post_enc_dp = nn.Dropout(dropout)
            self.enc_attn = MultiHeadAttention(shape, embd_dim, frame_cond_shape[-1],
                                               n_head, n_layer, attn_type='full',
                                               attn_kwargs=dict(attn_dropout=0.), causal=False)

        self.pre_fc_norm = LayerNorm(embd_dim, class_cond_dim)
        self.post_fc_dp = nn.Dropout(dropout)
        self.fc_block = nn.Sequential(
            nn.Linear(in_features=embd_dim, out_features=embd_dim * 4),
            GeLU2(),
            nn.Linear(in_features=embd_dim * 4, out_features=embd_dim),
        )

    def forward(self, x, cond, decode_step, decode_idx):
        h = self.pre_attn_norm(x, cond)
        if self.training:
            h = checkpoint(self.attn, h, h, h, decode_step, decode_idx)
        else:
            h = self.attn(h, h, h, decode_step, decode_idx)
        h = self.post_attn_dp(h)
        x = x + h

        if self.use_frame_cond:
            h = self.pre_enc_norm(x, cond)
            if self.training:
                h = checkpoint(self.enc_attn, h, cond['frame_cond'], cond['frame_cond'],
                               decode_step, decode_idx)
            else:
                h = self.enc_attn(h, cond['frame_cond'], cond['frame_cond'],
                                  decode_step, decode_idx)
            h = self.post_enc_dp(h)
            x = x + h

        h = self.pre_fc_norm(x, cond)
        if self.training:
            h = checkpoint(self.fc_block, h)
        else:
            h = self.fc_block(h)
        h = self.post_fc_dp(h)
        x = x + h

        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, shape, dim_q, dim_kv, n_head, n_layer,
                 causal, attn_type, attn_kwargs):
        super().__init__()
        self.causal = causal
        self.shape = shape

        self.d_k = dim_q // n_head
        self.d_v = dim_kv // n_head
        self.n_head = n_head

        self.w_qs = nn.Linear(dim_q, n_head * self.d_k, bias=False)  # q
        self.w_qs.weight.data.normal_(std=1.0 / np.sqrt(dim_q))

        self.w_ks = nn.Linear(dim_kv, n_head * self.d_k, bias=False)  # k
        self.w_ks.weight.data.normal_(std=1.0 / np.sqrt(dim_kv))

        self.w_vs = nn.Linear(dim_kv, n_head * self.d_v, bias=False)  # v
        self.w_vs.weight.data.normal_(std=1.0 / np.sqrt(dim_kv))

        self.fc = nn.Linear(n_head * self.d_v, dim_q, bias=True)  # c
        self.fc.weight.data.normal_(std=1.0 / np.sqrt(dim_q * n_layer))

        if attn_type == 'full':
            self.attn = FullAttention(shape, causal, **attn_kwargs)
        elif attn_type == 'axial':
            assert not causal, 'causal axial attention is not supported'
            self.attn = AxialAttention(len(shape), **attn_kwargs)
        elif attn_type == 'sparse':
            self.attn = SparseAttention(shape, n_head, causal, **attn_kwargs)

        self.cache = None

    def forward(self, q, k, v, decode_step=None, decode_idx=None):
        """ Compute multi-head attention
        Args
            q, k, v: a [b, d1, ..., dn, c] tensor or
                     a [b, 1, ..., 1, c] tensor if decode_step is not None

        Returns
            The output after performing attention
        """

        # compute k, q, v
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        q = view_range(self.w_qs(q), -1, None, (n_head, d_k))
        k = view_range(self.w_ks(k), -1, None, (n_head, d_k))
        v = view_range(self.w_vs(v), -1, None, (n_head, d_v))

        # b x n_head x seq_len x d
        # (b, *d_shape, n_head, d) -> (b, n_head, *d_shape, d)
        q = shift_dim(q, -2, 1)
        k = shift_dim(k, -2, 1)
        v = shift_dim(v, -2, 1)

        # fast decoding
        if decode_step is not None:
            if decode_step == 0:
                if self.causal:
                    k_shape = (q.shape[0], n_head, *self.shape, self.d_k)
                    v_shape = (q.shape[0], n_head, *self.shape, self.d_v)
                    self.cache = dict(k=torch.zeros(k_shape, dtype=k.dtype, device=q.device),
                                      v=torch.zeros(v_shape, dtype=v.dtype, device=q.device))
                else:
                    # cache only once in the non-causal case
                    self.cache = dict(k=k.clone(), v=v.clone())
            if self.causal:
                idx = (slice(None, None), slice(None, None), *[slice(i, i + 1) for i in decode_idx])
                self.cache['k'][idx] = k
                self.cache['v'][idx] = v
            k, v = self.cache['k'], self.cache['v']

        a = self.attn(q, k, v, decode_step, decode_idx)

        # (b, *d_shape, n_head, d) -> (b, *d_shape, n_head * d)
        a = shift_dim(a, 1, -2).flatten(start_dim=-2)
        a = self.fc(a)  # (b x seq_len x embd_dim)

        return a


############## Attention #######################
class FullAttention(nn.Module):
    def __init__(self, shape, causal, attn_dropout):
        super().__init__()
        self.causal = causal
        self.attn_dropout = attn_dropout

        seq_len = np.prod(shape)
        if self.causal:
            self.register_buffer('mask', torch.tril(torch.ones(seq_len, seq_len)))

    def forward(self, q, k, v, decode_step, decode_idx):
        mask = self.mask if self.causal else None
        if decode_step is not None and mask is not None:
            mask = mask[[decode_step]]

        old_shape = q.shape[2:-1]
        q = q.flatten(start_dim=2, end_dim=-2)
        k = k.flatten(start_dim=2, end_dim=-2)
        v = v.flatten(start_dim=2, end_dim=-2)

        out = scaled_dot_product_attention(q, k, v, mask=mask,
                                           attn_dropout=self.attn_dropout,
                                           training=self.training)

        return view_range(out, 2, 3, old_shape)


class AxialAttention(nn.Module):
    def __init__(self, n_dim, axial_dim):
        super().__init__()
        if axial_dim < 0:
            axial_dim = 2 + n_dim + 1 + axial_dim
        else:
            axial_dim += 2  # account for batch, head, dim
        self.axial_dim = axial_dim

    def forward(self, q, k, v, decode_step, decode_idx):
        q = shift_dim(q, self.axial_dim, -2).flatten(end_dim=-3)
        k = shift_dim(k, self.axial_dim, -2).flatten(end_dim=-3)
        v = shift_dim(v, self.axial_dim, -2)
        old_shape = list(v.shape)
        v = v.flatten(end_dim=-3)

        out = scaled_dot_product_attention(q, k, v, training=self.training)
        out = out.view(*old_shape)
        out = shift_dim(out, -2, self.axial_dim)
        return out


class SparseAttention(nn.Module):
    ops = dict()
    attn_mask = dict()
    block_layout = dict()

    def __init__(self, shape, n_head, causal, num_local_blocks=4, block=32,
                 attn_dropout=0.):  # does not use attn_dropout
        super().__init__()
        self.causal = causal
        self.shape = shape

        self.sparsity_config = StridedSparsityConfig(shape=shape, n_head=n_head,
                                                     causal=causal, block=block,
                                                     num_local_blocks=num_local_blocks)

        if self.shape not in SparseAttention.block_layout:
            SparseAttention.block_layout[self.shape] = self.sparsity_config.make_layout()
        if causal and self.shape not in SparseAttention.attn_mask:
            SparseAttention.attn_mask[self.shape] = self.sparsity_config.make_sparse_attn_mask()

    def get_ops(self):
        try:
            from deepspeed.ops.sparse_attention import MatMul, Softmax
        except ImportError:
            raise Exception('Error importing deepspeed. Please install using `DS_BUILD_SPARSE_ATTN=1 pip install deepspeed`')
        if self.shape not in SparseAttention.ops:
            sparsity_layout = self.sparsity_config.make_layout()
            sparse_dot_sdd_nt = MatMul(sparsity_layout,
                                       self.sparsity_config.block,
                                       'sdd',
                                       trans_a=False,
                                       trans_b=True)

            sparse_dot_dsd_nn = MatMul(sparsity_layout,
                                       self.sparsity_config.block,
                                       'dsd',
                                       trans_a=False,
                                       trans_b=False)

            sparse_softmax = Softmax(sparsity_layout, self.sparsity_config.block)

            SparseAttention.ops[self.shape] = (sparse_dot_sdd_nt,
                                               sparse_dot_dsd_nn,
                                               sparse_softmax)
        return SparseAttention.ops[self.shape]

    def forward(self, q, k, v, decode_step, decode_idx):
        if self.training and self.shape not in SparseAttention.ops:
            self.get_ops()

        SparseAttention.block_layout[self.shape] = SparseAttention.block_layout[self.shape].to(q)
        if self.causal:
            SparseAttention.attn_mask[self.shape] = SparseAttention.attn_mask[self.shape].to(q).type_as(q)
        attn_mask = SparseAttention.attn_mask[self.shape] if self.causal else None

        old_shape = q.shape[2:-1]
        q = q.flatten(start_dim=2, end_dim=-2)
        k = k.flatten(start_dim=2, end_dim=-2)
        v = v.flatten(start_dim=2, end_dim=-2)

        if decode_step is not None:
            mask = self.sparsity_config.get_non_block_layout_row(SparseAttention.block_layout[self.shape], decode_step)
            out = scaled_dot_product_attention(q, k, v, mask=mask, training=self.training)
        else:
            if q.shape != k.shape or k.shape != v.shape:
                raise Exception('SparseAttention only support self-attention')
            sparse_dot_sdd_nt, sparse_dot_dsd_nn, sparse_softmax = self.get_ops()
            scaling = float(q.shape[-1]) ** -0.5

            attn_output_weights = sparse_dot_sdd_nt(q, k)
            if attn_mask is not None:
                attn_output_weights = attn_output_weights.masked_fill(attn_mask == 0,
                                                                      float('-inf'))
            attn_output_weights = sparse_softmax(
                attn_output_weights,
                scale=scaling
            )

            out = sparse_dot_dsd_nn(attn_output_weights, v)

        return view_range(out, 2, 3, old_shape)


class StridedSparsityConfig(object):
    """
    Strided Sparse configuration specified in https://arxiv.org/abs/1904.10509 that
    generalizes to arbitrary dimensions
    """

    def __init__(self, shape, n_head, causal, block, num_local_blocks):
        self.n_head = n_head
        self.shape = shape
        self.causal = causal
        self.block = block
        self.num_local_blocks = num_local_blocks

        assert self.num_local_blocks >= 1, 'Must have at least 1 local block'
        assert self.seq_len % self.block == 0, 'seq len must be divisible by block size'

        self._block_shape = self._compute_block_shape()
        self._block_shape_cum = self._block_shape_cum_sizes()

    @property
    def seq_len(self):
        return np.prod(self.shape)

    @property
    def num_blocks(self):
        return self.seq_len // self.block

    def set_local_layout(self, layout):
        num_blocks = self.num_blocks
        for row in range(0, num_blocks):
            end = min(row + self.num_local_blocks, num_blocks)
            for col in range(
                    max(0, row - self.num_local_blocks),
                    (row + 1 if self.causal else end)):
                layout[:, row, col] = 1
        return layout

    def set_global_layout(self, layout):
        num_blocks = self.num_blocks
        n_dim = len(self._block_shape)
        for row in range(num_blocks):
            assert self._to_flattened_idx(self._to_unflattened_idx(row)) == row
            cur_idx = self._to_unflattened_idx(row)
            # no strided attention over last dim
            for d in range(n_dim - 1):
                end = self._block_shape[d]
                for i in range(0, (cur_idx[d] + 1 if self.causal else end)):
                    new_idx = list(cur_idx)
                    new_idx[d] = i
                    new_idx = tuple(new_idx)

                    col = self._to_flattened_idx(new_idx)
                    layout[:, row, col] = 1

        return layout

    def make_layout(self):
        layout = torch.zeros((self.n_head, self.num_blocks, self.num_blocks), dtype=torch.int64)
        layout = self.set_local_layout(layout)
        layout = self.set_global_layout(layout)
        return layout

    def make_sparse_attn_mask(self):
        block_layout = self.make_layout()
        assert block_layout.shape[1] == block_layout.shape[2] == self.num_blocks

        num_dense_blocks = block_layout.sum().item()
        attn_mask = torch.ones(num_dense_blocks, self.block, self.block)
        counter = 0
        for h in range(self.n_head):
            for i in range(self.num_blocks):
                for j in range(self.num_blocks):
                    elem = block_layout[h, i, j].item()
                    if elem == 1:
                        assert i >= j
                        if i == j:  # need to mask within block on diagonals
                            attn_mask[counter] = torch.tril(attn_mask[counter])
                        counter += 1
        assert counter == num_dense_blocks

        return attn_mask.unsqueeze(0)

    def get_non_block_layout_row(self, block_layout, row):
        block_row = row // self.block
        block_row = block_layout[:, [block_row]]  # n_head x 1 x n_blocks
        block_row = block_row.repeat_interleave(self.block, dim=-1)
        block_row[:, :, row + 1:] = 0.
        return block_row

    ############# Helper functions ##########################
    def _compute_block_shape(self):
        n_dim = len(self.shape)
        cum_prod = 1
        for i in range(n_dim - 1, -1, -1):
            cum_prod *= self.shape[i]
            if cum_prod > self.block:
                break
        assert cum_prod % self.block == 0
        new_shape = (*self.shape[:i], cum_prod // self.block)

        assert np.prod(new_shape) == np.prod(self.shape) // self.block

        return new_shape

    def _block_shape_cum_sizes(self):
        bs = np.flip(np.array(self._block_shape))
        return tuple(np.flip(np.cumprod(bs)[:-1])) + (1,)

    def _to_flattened_idx(self, idx):
        assert len(idx) == len(self._block_shape), f"{len(idx)} != {len(self._block_shape)}"
        flat_idx = 0
        for i in range(len(self._block_shape)):
            flat_idx += idx[i] * self._block_shape_cum[i]
        return flat_idx

    def _to_unflattened_idx(self, flat_idx):
        assert flat_idx < np.prod(self._block_shape)
        idx = []
        for i in range(len(self._block_shape)):
            idx.append(flat_idx // self._block_shape_cum[i])
            flat_idx %= self._block_shape_cum[i]
        return tuple(idx)


################ Spatiotemporal broadcasted positional embeddings ###############
class AddBroadcastPosEmbed(nn.Module):
    def __init__(self, shape, embd_dim, dim=-1):
        super().__init__()
        assert dim in [-1, 1]  # only first or last dim supported
        self.shape = shape
        self.n_dim = n_dim = len(shape)
        self.embd_dim = embd_dim
        self.dim = dim

        assert embd_dim % n_dim == 0, f"{embd_dim} % {n_dim} != 0"
        self.emb = nn.ParameterDict({
            f'd_{i}': nn.Parameter(torch.randn(shape[i], embd_dim // n_dim) * 0.01
                                   if dim == -1 else
                                   torch.randn(embd_dim // n_dim, shape[i]) * 0.01)
            for i in range(n_dim)
        })

    def forward(self, x, decode_step=None, decode_idx=None):
        embs = []
        for i in range(self.n_dim):
            e = self.emb[f'd_{i}']
            if self.dim == -1:
                # (1, 1, ..., 1, self.shape[i], 1, ..., -1)
                e = e.view(1, *((1,) * i), self.shape[i], *((1,) * (self.n_dim - i - 1)), -1)
                e = e.expand(1, *self.shape, -1)
            else:
                e = e.view(1, -1, *((1,) * i), self.shape[i], *((1,) * (self.n_dim - i - 1)))
                e = e.expand(1, -1, *self.shape)
            embs.append(e)

        embs = torch.cat(embs, dim=self.dim)
        if decode_step is not None:
            embs = tensor_slice(embs, [0, *decode_idx, 0],
                                [x.shape[0], *(1,) * self.n_dim, x.shape[-1]])

        return x + embs


################# Helper Functions ###################################
def scaled_dot_product_attention(q, k, v, mask=None, attn_dropout=0., training=True):
    # Performs scaled dot-product attention over the second to last dimension dn

    # (b, n_head, d1, ..., dn, d)
    attn = torch.matmul(q, k.transpose(-1, -2))
    attn = attn / np.sqrt(q.shape[-1])
    if mask is not None:
        attn = attn.masked_fill(mask == 0, float('-inf'))
    attn_float = F.softmax(attn, dim=-1)
    attn = attn_float.type_as(attn)  # b x n_head x d1 x ... x dn x d
    attn = F.dropout(attn, p=attn_dropout, training=training)

    a = torch.matmul(attn, v)  # b x n_head x d1 x ... x dn x d

    return a


class RightShift(nn.Module):
    def __init__(self, embd_dim):
        super().__init__()
        self.embd_dim = embd_dim
        self.sos = nn.Parameter(torch.FloatTensor(embd_dim).normal_(std=0.02), requires_grad=True)

    def forward(self, x, decode_step):
        if decode_step is not None and decode_step > 0:
            return x

        x_shape = list(x.shape)
        x = x.flatten(start_dim=1, end_dim=-2)  # (b, seq_len, embd_dim)
        sos = torch.ones(x_shape[0], 1, self.embd_dim, dtype=torch.float32).to(self.sos) * self.sos
        sos = sos.type_as(x)
        x = torch.cat([sos, x[:, :-1, :]], dim=1)
        x = x.view(*x_shape)

        return x


class GeLU2(nn.Module):
    def forward(self, x):
        return (1.702 * x).sigmoid() * x


class LayerNorm(nn.Module):
    def __init__(self, embd_dim, class_cond_dim):
        super().__init__()
        self.conditional = class_cond_dim is not None

        if self.conditional:
            self.w = nn.Linear(class_cond_dim, embd_dim, bias=False)
            nn.init.constant_(self.w.weight.data, 1. / np.sqrt(class_cond_dim))
            self.wb = nn.Linear(class_cond_dim, embd_dim, bias=False)
        else:
            self.g = nn.Parameter(torch.ones(embd_dim, dtype=torch.float32), requires_grad=True)
            self.b = nn.Parameter(torch.zeros(embd_dim, dtype=torch.float32), requires_grad=True)

    def forward(self, x, cond):
        if self.conditional:  # (b, cond_dim)
            g = 1 + self.w(cond['class_cond']).view(x.shape[0], *(1,) * (len(x.shape) - 2), x.shape[-1])  # (b, ..., embd_dim)
            b = self.wb(cond['class_cond']).view(x.shape[0], *(1,) * (len(x.shape) - 2), x.shape[-1])
        else:
            g = self.g  # (embd_dim,)
            b = self.b

        x_float = x.float()

        mu = x_float.mean(dim=-1, keepdims=True)
        s = (x_float - mu).square().mean(dim=-1, keepdims=True)
        x_float = (x_float - mu) * (1e-5 + s.rsqrt())  # (b, ..., embd_dim)
        x_float = x_float * g + b

        x = x_float.type_as(x)
        return x
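A small sketch (the shape is arbitrary) of the strided sparsity pattern built by StridedSparsityConfig above: each row attends to a window of local blocks near the diagonal plus strided "global" columns, restricted to the lower triangle when causal:

import torch
from DiT_VAE.vae.attention_vae import StridedSparsityConfig

cfg = StridedSparsityConfig(shape=(4, 8, 8), n_head=2, causal=True,
                            block=32, num_local_blocks=2)
layout = cfg.make_layout()   # (n_head, num_blocks, num_blocks) 0/1 block mask
print(layout.shape)          # torch.Size([2, 8, 8]) -- 256 tokens / block size 32
print(layout[0])             # local window + strided columns, lower-triangular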
DiT_VAE/vae/data/__init__.py
ADDED
File without changes
DiT_VAE/vae/data/dataset_online_vae.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import numpy
|
4 |
+
import json
|
5 |
+
import zipfile
|
6 |
+
import torch
|
7 |
+
from PIL import Image
|
8 |
+
# from transformers import CLIPImageProcessor
from torch.utils.data import Dataset
import io
from omegaconf import OmegaConf
import numpy as np
# from torchvision import transforms
# from einops import rearrange
# import random
# import os
# from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler, DDIMScheduler
# import time
# import io
# import array
# import numpy as np
#
# from training.triplane import TriPlaneGenerator


def to_rgb_image(maybe_rgba: Image.Image):
    # Pass RGB through; composite RGBA onto a mid-gray (127) background.
    if maybe_rgba.mode == 'RGB':
        return maybe_rgba
    elif maybe_rgba.mode == 'RGBA':
        rgba = maybe_rgba
        # `np` is the numpy alias imported above
        img = np.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=np.uint8)
        img = Image.fromarray(img, 'RGB')
        img.paste(rgba, mask=rgba.getchannel('A'))
        return img
    else:
        raise ValueError("Unsupported image type.", maybe_rgba.mode)


# image (contain style), z, pose, text
class TriplaneDataset(Dataset):
    # image, triplane, ref_feature
    def __init__(self, json_file, data_base_dir, model_names):
        super().__init__()
        with open(json_file) as jf:  # close the handle instead of leaking it
            self.dict_data_image = json.load(jf)  # {'image_name': pose}
        self.data_base_dir = data_base_dir
        self.data_list = list(self.dict_data_image.keys())
        # open each per-model zip archive once and cache the handle
        self.zip_file_dict = {}
        config_gan_model = OmegaConf.load(model_names)
        all_models = config_gan_model['gan_models'].keys()
        for model_name in all_models:
            zipfile_path = os.path.join(self.data_base_dir, model_name + '.zip')
            self.zip_file_dict[model_name] = zipfile.ZipFile(zipfile_path)

    def getdata(self, idx):
        # need z and expression and model name
        # image: "seed0035.png"
        # data_each_dict = {
        #     'vert_dir': vert_dir,
        #     'z_dir': z_dir,
        #     'pose_dir': pose_dir,
        #     'img_dir': img_dir,
        #     'model_name': model_name
        # }
        data_name = self.data_list[idx]
        data_model_name = self.dict_data_image[data_name]['model_name']
        zipfile_loaded = self.zip_file_dict[data_model_name]
        # zipfile_path = os.path.join(self.data_base_dir, data_model_name)
        # zipfile_loaded = zipfile.ZipFile(zipfile_path)
        with zipfile_loaded.open(self.dict_data_image[data_name]['z_dir'], 'r') as f:
            buffer = io.BytesIO(f.read())
            data_z = torch.load(buffer)
            buffer.close()
        with zipfile_loaded.open(self.dict_data_image[data_name]['vert_dir'], 'r') as ff:
            buffer_v = io.BytesIO(ff.read())
            data_vert = torch.load(buffer_v)
            buffer_v.close()
        # raw_image = to_rgb_image(Image.open(f))
        #
        # data_model_name = self.dict_data_image[data_name]['model_name']
        # data_z_dir = os.path.join(self.data_base_dir, data_model_name, self.dict_data_image[data_name]['z_dir'])
        # data_vert_dir = os.path.join(self.data_base_dir, data_model_name, self.dict_data_image[data_name]['vert_dir'])
        # data_z = torch.load(data_z_dir)
        # data_vert = torch.load(data_vert_dir)

        return {
            "data_z": data_z,
            "data_vert": data_vert,
            "data_model_name": data_model_name
        }

    def __getitem__(self, idx):
        # retry with a fresh random index so one corrupt sample does not abort training
        for _ in range(20):
            try:
                return self.getdata(idx)
            except Exception as e:
                print(f"Error details: {str(e)}")
                idx = np.random.randint(len(self))
        raise RuntimeError('Too many bad data.')

    def __len__(self):
        return len(self.data_list)

# for zip files
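A minimal usage sketch for the dataset above (the index JSON, zip directory, and OmegaConf file names are hypothetical placeholders, not paths from this commit). The cached zipfile.ZipFile handles cannot be pickled into forked DataLoader workers, so the sketch keeps num_workers=0:

from torch.utils.data import DataLoader

# Hypothetical paths for illustration only.
dataset = TriplaneDataset(
    json_file='data/index.json',            # sample name -> {'z_dir', 'vert_dir', 'model_name', ...}
    data_base_dir='data/zips',              # holds one <model_name>.zip per GAN model
    model_names='configs/gan_models.yaml',  # OmegaConf file with a top-level 'gan_models' mapping
)
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)
batch = next(iter(loader))
print(batch['data_z'].shape, batch['data_model_name'])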
DiT_VAE/vae/distributions.py ADDED
@@ -0,0 +1,94 @@
import torch
import numpy as np


class AbstractDistribution:
    def sample(self):
        raise NotImplementedError()

    def mode(self):
        raise NotImplementedError()


class DiracDistribution(AbstractDistribution):
    def __init__(self, value):
        self.value = value

    def sample(self):
        return self.value

    def mode(self):
        return self.value


class DiagonalGaussianDistribution(object):
    def __init__(self, parameters, deterministic=False):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)

    def sample(self, noise=None):
        if noise is None:
            noise = torch.randn(self.mean.shape)
        x = self.mean + self.std * noise.to(device=self.parameters.device, dtype=self.parameters.dtype)
        return x

    def kl(self, other=None):
        if self.deterministic:
            return torch.Tensor([0.])
        else:
            if other is None:
                return 0.5 * torch.sum(torch.pow(self.mean, 2)
                                       + self.var - 1.0 - self.logvar,
                                       dim=[1, 2, 3])
            else:
                return 0.5 * torch.sum(
                    torch.pow(self.mean - other.mean, 2) / other.var
                    + self.var / other.var - 1.0 - self.logvar + other.logvar,
                    dim=[1, 2, 3])

    def nll(self, sample, dims=[1, 2, 3]):
        if self.deterministic:
            return torch.Tensor([0.])
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims)

    def mode(self):
        return self.mean
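With other=None, kl() evaluates the closed-form divergence from the posterior to a standard-normal prior, summed over the channel and spatial dimensions:

$$ D_{\mathrm{KL}}\big(\mathcal{N}(\mu,\sigma^{2})\,\|\,\mathcal{N}(0,1)\big) = \tfrac{1}{2}\sum\big(\mu^{2}+\sigma^{2}-1-\log\sigma^{2}\big) $$

A minimal sketch of the intended call pattern (the [B, 2C, H, W] moment tensor is fabricated for illustration; it is not an output of this repo's encoder):

import torch

moments = torch.randn(2, 8, 16, 16)                # stand-in encoder output: 4 mean + 4 logvar channels
posterior = DiagonalGaussianDistribution(moments)
z = posterior.sample()                             # reparameterized draw: mean + std * eps
kl = posterior.kl()                                # closed-form KL to the unit Gaussian, one value per batch item
print(z.shape, kl.shape)                           # torch.Size([2, 4, 16, 16]) torch.Size([2])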
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
    Compute the KL divergence between two gaussians.
    Shapes are automatically broadcasted, so batches can be compared to
    scalars, among other use cases.
    """
    tensor = None
    for obj in (mean1, logvar1, mean2, logvar2):
        if isinstance(obj, torch.Tensor):
            tensor = obj
            break
    assert tensor is not None, "at least one argument must be a Tensor"

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for torch.exp().
    logvar1, logvar2 = [
        x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor)
        for x in (logvar1, logvar2)
    ]

    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + torch.exp(logvar1 - logvar2)
        + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
    )
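Because normal_kl broadcasts, a batched posterior can be compared directly against scalar prior parameters; a quick sanity check (illustrative values only):

import torch

kl = normal_kl(torch.zeros(4, 3), torch.zeros(4, 3), 0.0, 0.0)  # two identical unit Gaussians
print(kl.shape, kl.abs().max().item())                          # torch.Size([4, 3]) 0.0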
DiT_VAE/vae/losses/__init__.py ADDED
@@ -0,0 +1 @@
from .contperceptual import LPIPSithTVLoss
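LPIPSithTVLoss itself is defined in DiT_VAE/vae/losses/contperceptual.py elsewhere in this commit (the spelling is as committed); the name suggests an LPIPS perceptual term combined with a total-variation (TV) regularizer. Purely as an illustrative sketch of what a TV component typically looks like (an assumption from the name, not this repo's implementation):

import torch

def tv_loss(x: torch.Tensor) -> torch.Tensor:
    # Mean absolute difference between neighboring positions along the last
    # two axes; penalizes high-frequency noise in decoded planes.
    dh = (x[..., 1:, :] - x[..., :-1, :]).abs().mean()
    dw = (x[..., :, 1:] - x[..., :, :-1]).abs().mean()
    return dh + dw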