wusize committed on
Commit 22a2012 · verified · 1 Parent(s): 0791f85

Add files using upload-large-folder tool

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ method.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,218 @@
# Harmon: Harmonizing Visual Representations for Unified Multimodal Understanding and Generation

![](method.png)

> **[Harmonizing Visual Representations for Unified Multimodal Understanding and Generation](https://arxiv.org/abs/2406.05821)**
>
> Size Wu, Wenwei Zhang, Lumin Xu, Sheng Jin, Zhonghua Wu, Qingyi Tao, Wentao Liu, Wei Li, Chen Change Loy
>
> [![arXiv](https://img.shields.io/badge/arXiv-2406.05821-b31b1b.svg)](https://arxiv.org/abs/2406.05821)
> [![Project Page](https://img.shields.io/badge/Project-Page-green)](https://wusize.github.io/projects/Harmon)
> [![Bibtex](https://img.shields.io/badge/Cite-BibTeX-blue)](https://github.com/wusize/Harmon#citation)

## Introduction

**Harmon** is a novel unified framework for multimodal understanding and generation. Unlike existing state-of-the-art
architectures that disentangle visual understanding and generation with different encoder models, the proposed framework
harmonizes the visual representations of understanding and generation via a shared MAR encoder. Harmon achieves advanced
generation performance on mainstream text-to-image generation benchmarks and exhibits competitive results on multimodal
understanding tasks. In this repo, we provide inference code to run Harmon for image understanding (image-to-text) and
text-to-image generation, with two model variants, Harmon-0.5B and Harmon-1.5B.

## Usage

### 🖌️ Image-to-text Generation

```python
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from einops import rearrange
from PIL import Image
import requests


PROMPT_TEMPLATE = dict(
    SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
    INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
    SUFFIX='<|im_end|>',
    SUFFIX_AS_EOS=True,
    SEP='\n',
    STOP_WORDS=['<|im_end|>', '<|endoftext|>'])


def expand2square(pil_img, background_color):
    # Pad the image to a square canvas before resizing.
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result


@torch.no_grad()
def question_answer(question,
                    image,
                    model,
                    tokenizer,
                    max_new_tokens=512,
                    image_size=512):
    assert image_size == 512
    image = expand2square(image, (127, 127, 127))
    image = image.resize(size=(image_size, image_size))
    image = torch.from_numpy(np.array(image)).to(dtype=model.dtype, device=model.device)
    image = rearrange(image, 'h w c -> c h w')[None]
    image = 2 * (image / 255) - 1  # normalize pixels to [-1, 1]

    prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
    assert '<image>' in prompt
    # (512 // 16) ** 2 = 1024 visual tokens, plus the MAR buffer tokens
    image_length = (image_size // 16) ** 2 + model.mar.buffer_size
    prompt = prompt.replace('<image>', '<image>' * image_length)
    input_ids = tokenizer.encode(
        prompt, add_special_tokens=True, return_tensors='pt').cuda()
    # image_token_idx is the module-level id of the <image> token defined below
    _, z_enc = model.extract_visual_feature(model.encode(image))
    inputs_embeds = z_enc.new_zeros(*input_ids.shape, model.llm.config.hidden_size)
    inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
    inputs_embeds[input_ids != image_token_idx] = model.llm.get_input_embeddings()(
        input_ids[input_ids != image_token_idx]
    )
    output = model.llm.generate(inputs_embeds=inputs_embeds,
                                use_cache=True,
                                do_sample=False,
                                max_new_tokens=max_new_tokens,
                                eos_token_id=tokenizer.eos_token_id,
                                pad_token_id=tokenizer.pad_token_id
                                if tokenizer.pad_token_id is not None else
                                tokenizer.eos_token_id
                                )
    return tokenizer.decode(output[0])


harmon_tokenizer = AutoTokenizer.from_pretrained("wusize/Harmon-1_5B",
                                                 trust_remote_code=True)
harmon_model = AutoModel.from_pretrained("wusize/Harmon-1_5B",
                                         trust_remote_code=True).eval().cuda().bfloat16()

special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
assert num_added_toks == 1

image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}")

image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw).convert('RGB')

output_text = question_answer(question='Describe the image in detail.',
                              image=raw_image,
                              model=harmon_model,
                              tokenizer=harmon_tokenizer,
                              )

print(output_text)
```

### 🖼️ Text-to-image Generation

```python
import os
import torch
from transformers import AutoTokenizer, AutoModel
from einops import rearrange
from PIL import Image


PROMPT_TEMPLATE = dict(
    SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
    INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
    SUFFIX='<|im_end|>',
    SUFFIX_AS_EOS=True,
    SEP='\n',
    STOP_WORDS=['<|im_end|>', '<|endoftext|>'])

GENERATION_TEMPLATE = "Generate an image: {text}"


@torch.no_grad()
def generate_images(prompts,
                    negative_prompt,
                    tokenizer,
                    model,
                    output,
                    grid_size=2,  # will produce 2 x 2 images per prompt
                    num_steps=64, cfg_scale=3.0, temperature=1.0, image_size=512):
    assert image_size == 512
    m = n = image_size // 16

    prompts = [
        PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)
        for prompt in prompts
    ] * (grid_size ** 2)

    if cfg_scale != 1.0:
        # append negative prompts for classifier-free guidance
        prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)

    inputs = tokenizer(
        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(model.device)

    images = model.sample(**inputs, num_iter=num_steps, cfg=cfg_scale, cfg_schedule="constant",
                          temperature=temperature, progress=True, image_shape=(m, n))
    images = rearrange(images, '(m n b) c h w -> b (m h) (n w) c', m=grid_size, n=grid_size)

    images = torch.clamp(
        127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()

    os.makedirs(output, exist_ok=True)
    for idx, image in enumerate(images):
        Image.fromarray(image).save(f"{output}/{idx:08d}.jpg")


harmon_tokenizer = AutoTokenizer.from_pretrained("wusize/Harmon-1_5B",
                                                 trust_remote_code=True)
harmon_model = AutoModel.from_pretrained("wusize/Harmon-1_5B",
                                         trust_remote_code=True).cuda().bfloat16().eval()


texts = ['a dog on the left and a cat on the right.',
         'a photo of a pink stop sign.']
pos_prompts = [GENERATION_TEMPLATE.format(text=text) for text in texts]
neg_prompt = 'Generate an image.'  # for classifier-free guidance


generate_images(prompts=pos_prompts,
                negative_prompt=neg_prompt,
                tokenizer=harmon_tokenizer,
                model=harmon_model,
                output='output',)
```

## 📚 Citation

If you find Harmon useful for your research or applications, please cite our paper using the following BibTeX:

```bibtex
@misc{wu2025harmon,
      title={Harmonizing Visual Representations for Unified Multimodal Understanding and Generation},
      author={Size Wu and Wenwei Zhang and Lumin Xu and Sheng Jin and Zhonghua Wu and
              Qingyi Tao and Wentao Liu and Wei Li and Chen Change Loy},
      year={2025},
      eprint={2405.xxxxx},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```

## 📜 License
This project is licensed under [NTU S-Lab License 1.0](LICENSE).
config.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "architectures": [
3
+ "HarmonModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_harmon.HarmonConfig",
7
+ "AutoModel": "modeling_harmon.HarmonModel"
8
+ },
9
+ "llm": {
10
+ "_attn_implementation": "flash_attention_2",
11
+ "attention_dropout": 0.0,
12
+ "attn_implementation": null,
13
+ "bos_token_id": 151643,
14
+ "eos_token_id": 151645,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 1536,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 8960,
19
+ "max_position_embeddings": 32768,
20
+ "max_window_layers": 21,
21
+ "model_type": "qwen2",
22
+ "num_attention_heads": 12,
23
+ "num_hidden_layers": 28,
24
+ "num_key_value_heads": 2,
25
+ "rms_norm_eps": 1e-06,
26
+ "rope_theta": 1000000.0,
27
+ "sliding_window": 32768,
28
+ "tie_word_embeddings": false,
29
+ "use_cache": true,
30
+ "use_sliding_window": false,
31
+ "vocab_size": 151936
32
+ },
33
+ "mar": {
34
+ "attn_dropout": 0.1,
35
+ "buffer_size": 64,
36
+ "class_num": 1000,
37
+ "diffloss_d": 12,
38
+ "diffloss_w": 1536,
39
+ "diffusion_batch_mul": 4,
40
+ "grad_checkpointing": false,
41
+ "img_size": 256,
42
+ "label_drop_prob": 0.1,
43
+ "mask_ratio_min": 0.7,
44
+ "num_sampling_steps": "100",
45
+ "patch_size": 1,
46
+ "proj_dropout": 0.1,
47
+ "type": "mar_huge",
48
+ "vae_embed_dim": 16,
49
+ "vae_stride": 16
50
+ },
51
+ "torch_dtype": "bfloat16",
52
+ "transformers_version": "4.45.2"
53
+ }
configuration_harmon.py ADDED
@@ -0,0 +1,9 @@
1
+ from transformers import PretrainedConfig
2
+
3
+
4
+ class HarmonConfig(PretrainedConfig):
5
+ model_type = "harmon"
6
+ def __init__(self, llm=None, mar=None, **kwargs):
7
+ super().__init__(**kwargs)
8
+ self.llm = llm
9
+ self.mar = mar
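
A minimal sketch (not part of the committed files) of how the nested `llm` and `mar` dictionaries in `config.json` surface through `HarmonConfig`, assuming the `wusize/Harmon-1_5B` repo id used in the README and that remote code is trusted:

```python
from transformers import AutoConfig

# trust_remote_code resolves configuration_harmon.HarmonConfig via the auto_map above
config = AutoConfig.from_pretrained("wusize/Harmon-1_5B", trust_remote_code=True)
print(config.llm["hidden_size"])   # 1536 (Qwen2 backbone width, from config.json)
print(config.mar["buffer_size"])   # 64 (the buffer tokens added to image_length in the README)
```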
diffloss.py ADDED
@@ -0,0 +1,249 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.checkpoint import checkpoint
4
+ import math
5
+
6
+ from .misc import create_diffusion
7
+
8
+
9
+ class DiffLoss(nn.Module):
10
+ """Diffusion Loss"""
11
+ def __init__(self, target_channels, z_channels, depth, width, num_sampling_steps, grad_checkpointing=False):
12
+ super(DiffLoss, self).__init__()
13
+ self.in_channels = target_channels
14
+ self.net = SimpleMLPAdaLN(
15
+ in_channels=target_channels,
16
+ model_channels=width,
17
+ out_channels=target_channels * 2, # for vlb loss
18
+ z_channels=z_channels,
19
+ num_res_blocks=depth,
20
+ grad_checkpointing=grad_checkpointing
21
+ )
22
+
23
+ self.train_diffusion = create_diffusion(timestep_respacing="", noise_schedule="cosine")
24
+ self.gen_diffusion = create_diffusion(timestep_respacing=num_sampling_steps, noise_schedule="cosine")
25
+
26
+ def forward(self, target, z, mask=None):
27
+ t = torch.randint(0, self.train_diffusion.num_timesteps, (target.shape[0],), device=target.device)
28
+ model_kwargs = dict(c=z)
29
+ loss_dict = self.train_diffusion.training_losses(self.net, target, t, model_kwargs)
30
+ loss = loss_dict["loss"]
31
+ if mask is not None:
32
+ loss = (loss * mask).sum() / mask.sum()
33
+ return loss.mean()
34
+
35
+ def sample(self, z, temperature=1.0, cfg=1.0):
36
+ # diffusion loss sampling
37
+ if not cfg == 1.0:
38
+ noise = torch.randn(z.shape[0] // 2, self.in_channels).cuda()
39
+ noise = torch.cat([noise, noise], dim=0)
40
+ model_kwargs = dict(c=z, cfg_scale=cfg)
41
+ sample_fn = self.net.forward_with_cfg
42
+ else:
43
+ noise = torch.randn(z.shape[0], self.in_channels).cuda()
44
+ model_kwargs = dict(c=z)
45
+ sample_fn = self.net.forward
46
+
47
+ sampled_token_latent = self.gen_diffusion.p_sample_loop(
48
+ sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=model_kwargs, progress=False,
49
+ temperature=temperature
50
+ )
51
+
52
+ return sampled_token_latent
53
+
54
+
55
+ def modulate(x, shift, scale):
56
+ return x * (1 + scale) + shift
57
+
58
+
59
+ class TimestepEmbedder(nn.Module):
60
+ """
61
+ Embeds scalar timesteps into vector representations.
62
+ """
63
+ def __init__(self, hidden_size, frequency_embedding_size=256):
64
+ super().__init__()
65
+ self.mlp = nn.Sequential(
66
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
67
+ nn.SiLU(),
68
+ nn.Linear(hidden_size, hidden_size, bias=True),
69
+ )
70
+ self.frequency_embedding_size = frequency_embedding_size
71
+
72
+ @staticmethod
73
+ def timestep_embedding(t, dim, max_period=10000):
74
+ """
75
+ Create sinusoidal timestep embeddings.
76
+ :param t: a 1-D Tensor of N indices, one per batch element.
77
+ These may be fractional.
78
+ :param dim: the dimension of the output.
79
+ :param max_period: controls the minimum frequency of the embeddings.
80
+ :return: an (N, D) Tensor of positional embeddings.
81
+ """
82
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
83
+ half = dim // 2
84
+ freqs = torch.exp(
85
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
86
+ ).to(device=t.device)
87
+ args = t[:, None].float() * freqs[None]
88
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
89
+ if dim % 2:
90
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
91
+ return embedding
92
+
93
+ def forward(self, t):
94
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
95
+ t_emb = self.mlp(t_freq.to(self.mlp[0].weight.data.dtype))
96
+ return t_emb
97
+
98
+
99
+ class ResBlock(nn.Module):
100
+ """
101
+ A residual block that can optionally change the number of channels.
102
+ :param channels: the number of input channels.
103
+ """
104
+
105
+ def __init__(
106
+ self,
107
+ channels
108
+ ):
109
+ super().__init__()
110
+ self.channels = channels
111
+
112
+ self.in_ln = nn.LayerNorm(channels, eps=1e-6)
113
+ self.mlp = nn.Sequential(
114
+ nn.Linear(channels, channels, bias=True),
115
+ nn.SiLU(),
116
+ nn.Linear(channels, channels, bias=True),
117
+ )
118
+
119
+ self.adaLN_modulation = nn.Sequential(
120
+ nn.SiLU(),
121
+ nn.Linear(channels, 3 * channels, bias=True)
122
+ )
123
+
124
+ def forward(self, x, y):
125
+ shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(y).chunk(3, dim=-1)
126
+ h = modulate(self.in_ln(x), shift_mlp, scale_mlp)
127
+ h = self.mlp(h)
128
+ return x + gate_mlp * h
129
+
130
+
131
+ class FinalLayer(nn.Module):
132
+ """
133
+ The final layer adopted from DiT.
134
+ """
135
+ def __init__(self, model_channels, out_channels):
136
+ super().__init__()
137
+ self.norm_final = nn.LayerNorm(model_channels, elementwise_affine=False, eps=1e-6)
138
+ self.linear = nn.Linear(model_channels, out_channels, bias=True)
139
+ self.adaLN_modulation = nn.Sequential(
140
+ nn.SiLU(),
141
+ nn.Linear(model_channels, 2 * model_channels, bias=True)
142
+ )
143
+
144
+ def forward(self, x, c):
145
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
146
+ x = modulate(self.norm_final(x), shift, scale)
147
+ x = self.linear(x)
148
+ return x
149
+
150
+
151
+ class SimpleMLPAdaLN(nn.Module):
152
+ """
153
+ The MLP for Diffusion Loss.
154
+ :param in_channels: channels in the input Tensor.
155
+ :param model_channels: base channel count for the model.
156
+ :param out_channels: channels in the output Tensor.
157
+ :param z_channels: channels in the condition.
158
+ :param num_res_blocks: number of residual blocks per downsample.
159
+ """
160
+
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ model_channels,
165
+ out_channels,
166
+ z_channels,
167
+ num_res_blocks,
168
+ grad_checkpointing=False
169
+ ):
170
+ super().__init__()
171
+
172
+ self.in_channels = in_channels
173
+ self.model_channels = model_channels
174
+ self.out_channels = out_channels
175
+ self.num_res_blocks = num_res_blocks
176
+ self.grad_checkpointing = grad_checkpointing
177
+
178
+ self.time_embed = TimestepEmbedder(model_channels)
179
+ self.cond_embed = nn.Linear(z_channels, model_channels)
180
+
181
+ self.input_proj = nn.Linear(in_channels, model_channels)
182
+
183
+ res_blocks = []
184
+ for i in range(num_res_blocks):
185
+ res_blocks.append(ResBlock(
186
+ model_channels,
187
+ ))
188
+
189
+ self.res_blocks = nn.ModuleList(res_blocks)
190
+ self.final_layer = FinalLayer(model_channels, out_channels)
191
+
192
+ self.initialize_weights()
193
+
194
+ def initialize_weights(self):
195
+ def _basic_init(module):
196
+ if isinstance(module, nn.Linear):
197
+ torch.nn.init.xavier_uniform_(module.weight)
198
+ if module.bias is not None:
199
+ nn.init.constant_(module.bias, 0)
200
+ self.apply(_basic_init)
201
+
202
+ # Initialize timestep embedding MLP
203
+ nn.init.normal_(self.time_embed.mlp[0].weight, std=0.02)
204
+ nn.init.normal_(self.time_embed.mlp[2].weight, std=0.02)
205
+
206
+ # Zero-out adaLN modulation layers
207
+ for block in self.res_blocks:
208
+ nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
209
+ nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
210
+
211
+ # Zero-out output layers
212
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
213
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
214
+ nn.init.constant_(self.final_layer.linear.weight, 0)
215
+ nn.init.constant_(self.final_layer.linear.bias, 0)
216
+
217
+ def forward(self, x, t, c):
218
+ """
219
+ Apply the model to an input batch.
220
+ :param x: an [N x C] Tensor of inputs.
221
+ :param t: a 1-D batch of timesteps.
222
+ :param c: conditioning from AR transformer.
223
+ :return: an [N x C] Tensor of outputs.
224
+ """
225
+ # import pdb; pdb.set_trace()
226
+ x = self.input_proj(x.to(self.input_proj.weight.data.dtype))
227
+ t = self.time_embed(t)
228
+ c = self.cond_embed(c.to(self.cond_embed.weight.data.dtype))
229
+
230
+ y = t + c
231
+
232
+ if self.grad_checkpointing and not torch.jit.is_scripting():
233
+ for block in self.res_blocks:
234
+ x = checkpoint(block, x, y)
235
+ else:
236
+ for block in self.res_blocks:
237
+ x = block(x, y)
238
+
239
+ return self.final_layer(x, y)
240
+
241
+ def forward_with_cfg(self, x, t, c, cfg_scale):
242
+ half = x[: len(x) // 2]
243
+ combined = torch.cat([half, half], dim=0)
244
+ model_out = self.forward(combined, t, c)
245
+ eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
246
+ cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
247
+ half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
248
+ eps = torch.cat([half_eps, half_eps], dim=0)
249
+ return torch.cat([eps, rest], dim=1)
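
For reference, the guidance combination in `forward_with_cfg` above is the standard classifier-free guidance update, with `cfg_scale` as the guidance weight $s$:

```latex
\hat{\epsilon} = \epsilon_\theta(x_t, \varnothing) + s\,\big(\epsilon_\theta(x_t, c) - \epsilon_\theta(x_t, \varnothing)\big)
```

The conditional and unconditional halves of the batch share the same noise (`torch.cat([noise, noise])` in `sample`), so the two predictions differ only in the conditioning `c`.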
diffusion_utils.py ADDED
@@ -0,0 +1,73 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ import torch as th
7
+ import numpy as np
8
+
9
+
10
+ def normal_kl(mean1, logvar1, mean2, logvar2):
11
+ """
12
+ Compute the KL divergence between two gaussians.
13
+ Shapes are automatically broadcasted, so batches can be compared to
14
+ scalars, among other use cases.
15
+ """
16
+ tensor = None
17
+ for obj in (mean1, logvar1, mean2, logvar2):
18
+ if isinstance(obj, th.Tensor):
19
+ tensor = obj
20
+ break
21
+ assert tensor is not None, "at least one argument must be a Tensor"
22
+
23
+ # Force variances to be Tensors. Broadcasting helps convert scalars to
24
+ # Tensors, but it does not work for th.exp().
25
+ logvar1, logvar2 = [
26
+ x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
27
+ for x in (logvar1, logvar2)
28
+ ]
29
+
30
+ return 0.5 * (
31
+ -1.0
32
+ + logvar2
33
+ - logvar1
34
+ + th.exp(logvar1 - logvar2)
35
+ + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
36
+ )
37
+
38
+
39
+ def approx_standard_normal_cdf(x):
40
+ """
41
+ A fast approximation of the cumulative distribution function of the
42
+ standard normal.
43
+ """
44
+ return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
45
+
46
+
47
+ def discretized_gaussian_log_likelihood(x, *, means, log_scales):
48
+ """
49
+ Compute the log-likelihood of a Gaussian distribution discretizing to a
50
+ given image.
51
+ :param x: the target images. It is assumed that this was uint8 values,
52
+ rescaled to the range [-1, 1].
53
+ :param means: the Gaussian mean Tensor.
54
+ :param log_scales: the Gaussian log stddev Tensor.
55
+ :return: a tensor like x of log probabilities (in nats).
56
+ """
57
+ assert x.shape == means.shape == log_scales.shape
58
+ centered_x = x - means
59
+ inv_stdv = th.exp(-log_scales)
60
+ plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
61
+ cdf_plus = approx_standard_normal_cdf(plus_in)
62
+ min_in = inv_stdv * (centered_x - 1.0 / 255.0)
63
+ cdf_min = approx_standard_normal_cdf(min_in)
64
+ log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
65
+ log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
66
+ cdf_delta = cdf_plus - cdf_min
67
+ log_probs = th.where(
68
+ x < -0.999,
69
+ log_cdf_plus,
70
+ th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
71
+ )
72
+ assert log_probs.shape == x.shape
73
+ return log_probs
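
As a cross-check for `normal_kl` above: it evaluates, element-wise, the closed-form KL divergence between two univariate Gaussians,

```latex
\mathrm{KL}\!\left(\mathcal{N}(\mu_1,\sigma_1^2)\,\|\,\mathcal{N}(\mu_2,\sigma_2^2)\right)
  = \tfrac{1}{2}\!\left(-1 + \log\sigma_2^2 - \log\sigma_1^2
  + \frac{\sigma_1^2}{\sigma_2^2} + \frac{(\mu_1-\mu_2)^2}{\sigma_2^2}\right)
```

which matches the returned expression with `logvar` = $\log\sigma^2$ (in nats; callers divide by $\log 2$ to convert to bits).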
gaussian_diffusion.py ADDED
@@ -0,0 +1,877 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+
7
+ import math
8
+
9
+ import numpy as np
10
+ import torch as th
11
+ import enum
12
+
13
+ from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
14
+
15
+
16
+ def mean_flat(tensor):
17
+ """
18
+ Take the mean over all non-batch dimensions.
19
+ """
20
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
21
+
22
+
23
+ class ModelMeanType(enum.Enum):
24
+ """
25
+ Which type of output the model predicts.
26
+ """
27
+
28
+ PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
29
+ START_X = enum.auto() # the model predicts x_0
30
+ EPSILON = enum.auto() # the model predicts epsilon
31
+
32
+
33
+ class ModelVarType(enum.Enum):
34
+ """
35
+ What is used as the model's output variance.
36
+ The LEARNED_RANGE option has been added to allow the model to predict
37
+ values between FIXED_SMALL and FIXED_LARGE, making its job easier.
38
+ """
39
+
40
+ LEARNED = enum.auto()
41
+ FIXED_SMALL = enum.auto()
42
+ FIXED_LARGE = enum.auto()
43
+ LEARNED_RANGE = enum.auto()
44
+
45
+
46
+ class LossType(enum.Enum):
47
+ MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
48
+ RESCALED_MSE = (
49
+ enum.auto()
50
+ ) # use raw MSE loss (with RESCALED_KL when learning variances)
51
+ KL = enum.auto() # use the variational lower-bound
52
+ RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
53
+
54
+ def is_vb(self):
55
+ return self == LossType.KL or self == LossType.RESCALED_KL
56
+
57
+
58
+ def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
59
+ betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
60
+ warmup_time = int(num_diffusion_timesteps * warmup_frac)
61
+ betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
62
+ return betas
63
+
64
+
65
+ def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
66
+ """
67
+ This is the deprecated API for creating beta schedules.
68
+ See get_named_beta_schedule() for the new library of schedules.
69
+ """
70
+ if beta_schedule == "quad":
71
+ betas = (
72
+ np.linspace(
73
+ beta_start ** 0.5,
74
+ beta_end ** 0.5,
75
+ num_diffusion_timesteps,
76
+ dtype=np.float64,
77
+ )
78
+ ** 2
79
+ )
80
+ elif beta_schedule == "linear":
81
+ betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
82
+ elif beta_schedule == "warmup10":
83
+ betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
84
+ elif beta_schedule == "warmup50":
85
+ betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
86
+ elif beta_schedule == "const":
87
+ betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
88
+ elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
89
+ betas = 1.0 / np.linspace(
90
+ num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
91
+ )
92
+ else:
93
+ raise NotImplementedError(beta_schedule)
94
+ assert betas.shape == (num_diffusion_timesteps,)
95
+ return betas
96
+
97
+
98
+ def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
99
+ """
100
+ Get a pre-defined beta schedule for the given name.
101
+ The beta schedule library consists of beta schedules which remain similar
102
+ in the limit of num_diffusion_timesteps.
103
+ Beta schedules may be added, but should not be removed or changed once
104
+ they are committed to maintain backwards compatibility.
105
+ """
106
+ if schedule_name == "linear":
107
+ # Linear schedule from Ho et al, extended to work for any number of
108
+ # diffusion steps.
109
+ scale = 1000 / num_diffusion_timesteps
110
+ return get_beta_schedule(
111
+ "linear",
112
+ beta_start=scale * 0.0001,
113
+ beta_end=scale * 0.02,
114
+ num_diffusion_timesteps=num_diffusion_timesteps,
115
+ )
116
+ elif schedule_name == "cosine":
117
+ return betas_for_alpha_bar(
118
+ num_diffusion_timesteps,
119
+ lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
120
+ )
121
+ else:
122
+ raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
123
+
124
+
125
+ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
126
+ """
127
+ Create a beta schedule that discretizes the given alpha_t_bar function,
128
+ which defines the cumulative product of (1-beta) over time from t = [0,1].
129
+ :param num_diffusion_timesteps: the number of betas to produce.
130
+ :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
131
+ produces the cumulative product of (1-beta) up to that
132
+ part of the diffusion process.
133
+ :param max_beta: the maximum beta to use; use values lower than 1 to
134
+ prevent singularities.
135
+ """
136
+ betas = []
137
+ for i in range(num_diffusion_timesteps):
138
+ t1 = i / num_diffusion_timesteps
139
+ t2 = (i + 1) / num_diffusion_timesteps
140
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
141
+ return np.array(betas)
142
+
143
+
144
+ class GaussianDiffusion:
145
+ """
146
+ Utilities for training and sampling diffusion models.
147
+ Original ported from this codebase:
148
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
149
+ :param betas: a 1-D numpy array of betas for each diffusion timestep,
150
+ starting at T and going to 1.
151
+ """
152
+
153
+ def __init__(
154
+ self,
155
+ *,
156
+ betas,
157
+ model_mean_type,
158
+ model_var_type,
159
+ loss_type
160
+ ):
161
+
162
+ self.model_mean_type = model_mean_type
163
+ self.model_var_type = model_var_type
164
+ self.loss_type = loss_type
165
+
166
+ # Use float64 for accuracy.
167
+ betas = np.array(betas, dtype=np.float64)
168
+ self.betas = betas
169
+ assert len(betas.shape) == 1, "betas must be 1-D"
170
+ assert (betas > 0).all() and (betas <= 1).all()
171
+
172
+ self.num_timesteps = int(betas.shape[0])
173
+
174
+ alphas = 1.0 - betas
175
+ self.alphas_cumprod = np.cumprod(alphas, axis=0)
176
+ self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
177
+ self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
178
+ assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
179
+
180
+ # calculations for diffusion q(x_t | x_{t-1}) and others
181
+ self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
182
+ self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
183
+ self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
184
+ self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
185
+ self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
186
+
187
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
188
+ self.posterior_variance = (
189
+ betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
190
+ )
191
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
192
+ self.posterior_log_variance_clipped = np.log(
193
+ np.append(self.posterior_variance[1], self.posterior_variance[1:])
194
+ ) if len(self.posterior_variance) > 1 else np.array([])
195
+
196
+ self.posterior_mean_coef1 = (
197
+ betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
198
+ )
199
+ self.posterior_mean_coef2 = (
200
+ (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
201
+ )
202
+
203
+ def q_mean_variance(self, x_start, t):
204
+ """
205
+ Get the distribution q(x_t | x_0).
206
+ :param x_start: the [N x C x ...] tensor of noiseless inputs.
207
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
208
+ :return: A tuple (mean, variance, log_variance), all of x_start's shape.
209
+ """
210
+ mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
211
+ variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
212
+ log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
213
+ return mean, variance, log_variance
214
+
215
+ def q_sample(self, x_start, t, noise=None):
216
+ """
217
+ Diffuse the data for a given number of diffusion steps.
218
+ In other words, sample from q(x_t | x_0).
219
+ :param x_start: the initial data batch.
220
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
221
+ :param noise: if specified, the split-out normal noise.
222
+ :return: A noisy version of x_start.
223
+ """
224
+ if noise is None:
225
+ noise = th.randn_like(x_start)
226
+ assert noise.shape == x_start.shape
227
+ return (
228
+ _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
229
+ + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
230
+ )
231
+
232
+ def q_posterior_mean_variance(self, x_start, x_t, t):
233
+ """
234
+ Compute the mean and variance of the diffusion posterior:
235
+ q(x_{t-1} | x_t, x_0)
236
+ """
237
+ assert x_start.shape == x_t.shape
238
+ posterior_mean = (
239
+ _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
240
+ + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
241
+ )
242
+ posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
243
+ posterior_log_variance_clipped = _extract_into_tensor(
244
+ self.posterior_log_variance_clipped, t, x_t.shape
245
+ )
246
+ assert (
247
+ posterior_mean.shape[0]
248
+ == posterior_variance.shape[0]
249
+ == posterior_log_variance_clipped.shape[0]
250
+ == x_start.shape[0]
251
+ )
252
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
253
+
254
+ def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
255
+ """
256
+ Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
257
+ the initial x, x_0.
258
+ :param model: the model, which takes a signal and a batch of timesteps
259
+ as input.
260
+ :param x: the [N x C x ...] tensor at time t.
261
+ :param t: a 1-D Tensor of timesteps.
262
+ :param clip_denoised: if True, clip the denoised signal into [-1, 1].
263
+ :param denoised_fn: if not None, a function which applies to the
264
+ x_start prediction before it is used to sample. Applies before
265
+ clip_denoised.
266
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
267
+ pass to the model. This can be used for conditioning.
268
+ :return: a dict with the following keys:
269
+ - 'mean': the model mean output.
270
+ - 'variance': the model variance output.
271
+ - 'log_variance': the log of 'variance'.
272
+ - 'pred_xstart': the prediction for x_0.
273
+ """
274
+ if model_kwargs is None:
275
+ model_kwargs = {}
276
+
277
+ B, C = x.shape[:2]
278
+ assert t.shape == (B,)
279
+ model_output = model(x, t, **model_kwargs)
280
+ if isinstance(model_output, tuple):
281
+ model_output, extra = model_output
282
+ else:
283
+ extra = None
284
+
285
+ if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
286
+ assert model_output.shape == (B, C * 2, *x.shape[2:])
287
+ model_output, model_var_values = th.split(model_output, C, dim=1)
288
+ min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
289
+ max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
290
+ # The model_var_values is [-1, 1] for [min_var, max_var].
291
+ frac = (model_var_values + 1) / 2
292
+ model_log_variance = frac * max_log + (1 - frac) * min_log
293
+ model_variance = th.exp(model_log_variance)
294
+ else:
295
+ model_variance, model_log_variance = {
296
+ # for fixedlarge, we set the initial (log-)variance like so
297
+ # to get a better decoder log likelihood.
298
+ ModelVarType.FIXED_LARGE: (
299
+ np.append(self.posterior_variance[1], self.betas[1:]),
300
+ np.log(np.append(self.posterior_variance[1], self.betas[1:])),
301
+ ),
302
+ ModelVarType.FIXED_SMALL: (
303
+ self.posterior_variance,
304
+ self.posterior_log_variance_clipped,
305
+ ),
306
+ }[self.model_var_type]
307
+ model_variance = _extract_into_tensor(model_variance, t, x.shape)
308
+ model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
309
+
310
+ def process_xstart(x):
311
+ if denoised_fn is not None:
312
+ x = denoised_fn(x)
313
+ if clip_denoised:
314
+ return x.clamp(-1, 1)
315
+ return x
316
+
317
+ if self.model_mean_type == ModelMeanType.START_X:
318
+ pred_xstart = process_xstart(model_output)
319
+ else:
320
+ pred_xstart = process_xstart(
321
+ self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
322
+ )
323
+ model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
324
+
325
+ assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
326
+ return {
327
+ "mean": model_mean,
328
+ "variance": model_variance,
329
+ "log_variance": model_log_variance,
330
+ "pred_xstart": pred_xstart,
331
+ "extra": extra,
332
+ }
333
+
334
+ def _predict_xstart_from_eps(self, x_t, t, eps):
335
+ assert x_t.shape == eps.shape
336
+ return (
337
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
338
+ - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
339
+ )
340
+
341
+ def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
342
+ return (
343
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
344
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
345
+
346
+ def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
347
+ """
348
+ Compute the mean for the previous step, given a function cond_fn that
349
+ computes the gradient of a conditional log probability with respect to
350
+ x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
351
+ condition on y.
352
+ This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
353
+ """
354
+ gradient = cond_fn(x, t, **model_kwargs)
355
+ new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
356
+ return new_mean
357
+
358
+ def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
359
+ """
360
+ Compute what the p_mean_variance output would have been, should the
361
+ model's score function be conditioned by cond_fn.
362
+ See condition_mean() for details on cond_fn.
363
+ Unlike condition_mean(), this instead uses the conditioning strategy
364
+ from Song et al (2020).
365
+ """
366
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
367
+
368
+ eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
369
+ eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
370
+
371
+ out = p_mean_var.copy()
372
+ out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
373
+ out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
374
+ return out
375
+
376
+ def p_sample(
377
+ self,
378
+ model,
379
+ x,
380
+ t,
381
+ clip_denoised=True,
382
+ denoised_fn=None,
383
+ cond_fn=None,
384
+ model_kwargs=None,
385
+ temperature=1.0
386
+ ):
387
+ """
388
+ Sample x_{t-1} from the model at the given timestep.
389
+ :param model: the model to sample from.
390
+ :param x: the current tensor at x_{t-1}.
391
+ :param t: the value of t, starting at 0 for the first diffusion step.
392
+ :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
393
+ :param denoised_fn: if not None, a function which applies to the
394
+ x_start prediction before it is used to sample.
395
+ :param cond_fn: if not None, this is a gradient function that acts
396
+ similarly to the model.
397
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
398
+ pass to the model. This can be used for conditioning.
399
+ :param temperature: temperature scaling during Diff Loss sampling.
400
+ :return: a dict containing the following keys:
401
+ - 'sample': a random sample from the model.
402
+ - 'pred_xstart': a prediction of x_0.
403
+ """
404
+ out = self.p_mean_variance(
405
+ model,
406
+ x,
407
+ t,
408
+ clip_denoised=clip_denoised,
409
+ denoised_fn=denoised_fn,
410
+ model_kwargs=model_kwargs,
411
+ )
412
+ noise = th.randn_like(x)
413
+ nonzero_mask = (
414
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
415
+ ) # no noise when t == 0
416
+ if cond_fn is not None:
417
+ out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
418
+ # scale the noise by temperature
419
+ sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise * temperature
420
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
421
+
422
+ def p_sample_loop(
423
+ self,
424
+ model,
425
+ shape,
426
+ noise=None,
427
+ clip_denoised=True,
428
+ denoised_fn=None,
429
+ cond_fn=None,
430
+ model_kwargs=None,
431
+ device=None,
432
+ progress=False,
433
+ temperature=1.0,
434
+ ):
435
+ """
436
+ Generate samples from the model.
437
+ :param model: the model module.
438
+ :param shape: the shape of the samples, (N, C, H, W).
439
+ :param noise: if specified, the noise from the encoder to sample.
440
+ Should be of the same shape as `shape`.
441
+ :param clip_denoised: if True, clip x_start predictions to [-1, 1].
442
+ :param denoised_fn: if not None, a function which applies to the
443
+ x_start prediction before it is used to sample.
444
+ :param cond_fn: if not None, this is a gradient function that acts
445
+ similarly to the model.
446
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
447
+ pass to the model. This can be used for conditioning.
448
+ :param device: if specified, the device to create the samples on.
449
+ If not specified, use a model parameter's device.
450
+ :param progress: if True, show a tqdm progress bar.
451
+ :param temperature: temperature scaling during Diff Loss sampling.
452
+ :return: a non-differentiable batch of samples.
453
+ """
454
+ final = None
455
+ for sample in self.p_sample_loop_progressive(
456
+ model,
457
+ shape,
458
+ noise=noise,
459
+ clip_denoised=clip_denoised,
460
+ denoised_fn=denoised_fn,
461
+ cond_fn=cond_fn,
462
+ model_kwargs=model_kwargs,
463
+ device=device,
464
+ progress=progress,
465
+ temperature=temperature,
466
+ ):
467
+ final = sample
468
+ return final["sample"]
469
+
470
+ def p_sample_loop_progressive(
471
+ self,
472
+ model,
473
+ shape,
474
+ noise=None,
475
+ clip_denoised=True,
476
+ denoised_fn=None,
477
+ cond_fn=None,
478
+ model_kwargs=None,
479
+ device=None,
480
+ progress=False,
481
+ temperature=1.0,
482
+ ):
483
+ """
484
+ Generate samples from the model and yield intermediate samples from
485
+ each timestep of diffusion.
486
+ Arguments are the same as p_sample_loop().
487
+ Returns a generator over dicts, where each dict is the return value of
488
+ p_sample().
489
+ """
490
+ assert isinstance(shape, (tuple, list))
491
+ if noise is not None:
492
+ img = noise
493
+ else:
494
+ img = th.randn(*shape).cuda()
495
+ indices = list(range(self.num_timesteps))[::-1]
496
+
497
+ if progress:
498
+ # Lazy import so that we don't depend on tqdm.
499
+ from tqdm.auto import tqdm
500
+
501
+ indices = tqdm(indices)
502
+
503
+ for i in indices:
504
+ t = th.tensor([i] * shape[0]).cuda()
505
+ with th.no_grad():
506
+ out = self.p_sample(
507
+ model,
508
+ img,
509
+ t,
510
+ clip_denoised=clip_denoised,
511
+ denoised_fn=denoised_fn,
512
+ cond_fn=cond_fn,
513
+ model_kwargs=model_kwargs,
514
+ temperature=temperature,
515
+ )
516
+ yield out
517
+ img = out["sample"]
518
+
519
+ def ddim_sample(
520
+ self,
521
+ model,
522
+ x,
523
+ t,
524
+ clip_denoised=True,
525
+ denoised_fn=None,
526
+ cond_fn=None,
527
+ model_kwargs=None,
528
+ eta=0.0,
529
+ ):
530
+ """
531
+ Sample x_{t-1} from the model using DDIM.
532
+ Same usage as p_sample().
533
+ """
534
+ out = self.p_mean_variance(
535
+ model,
536
+ x,
537
+ t,
538
+ clip_denoised=clip_denoised,
539
+ denoised_fn=denoised_fn,
540
+ model_kwargs=model_kwargs,
541
+ )
542
+ if cond_fn is not None:
543
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
544
+
545
+ # Usually our model outputs epsilon, but we re-derive it
546
+ # in case we used x_start or x_prev prediction.
547
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
548
+
549
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
550
+ alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
551
+ sigma = (
552
+ eta
553
+ * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
554
+ * th.sqrt(1 - alpha_bar / alpha_bar_prev)
555
+ )
556
+ # Equation 12.
557
+ noise = th.randn_like(x)
558
+ mean_pred = (
559
+ out["pred_xstart"] * th.sqrt(alpha_bar_prev)
560
+ + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
561
+ )
562
+ nonzero_mask = (
563
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
564
+ ) # no noise when t == 0
565
+ sample = mean_pred + nonzero_mask * sigma * noise
566
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
567
+
568
+ def ddim_reverse_sample(
569
+ self,
570
+ model,
571
+ x,
572
+ t,
573
+ clip_denoised=True,
574
+ denoised_fn=None,
575
+ cond_fn=None,
576
+ model_kwargs=None,
577
+ eta=0.0,
578
+ ):
579
+ """
580
+ Sample x_{t+1} from the model using DDIM reverse ODE.
581
+ """
582
+ assert eta == 0.0, "Reverse ODE only for deterministic path"
583
+ out = self.p_mean_variance(
584
+ model,
585
+ x,
586
+ t,
587
+ clip_denoised=clip_denoised,
588
+ denoised_fn=denoised_fn,
589
+ model_kwargs=model_kwargs,
590
+ )
591
+ if cond_fn is not None:
592
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
593
+ # Usually our model outputs epsilon, but we re-derive it
594
+ # in case we used x_start or x_prev prediction.
595
+ eps = (
596
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
597
+ - out["pred_xstart"]
598
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
599
+ alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
600
+
601
+ # Equation 12. reversed
602
+ mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
603
+
604
+ return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
605
+
606
+ def ddim_sample_loop(
607
+ self,
608
+ model,
609
+ shape,
610
+ noise=None,
611
+ clip_denoised=True,
612
+ denoised_fn=None,
613
+ cond_fn=None,
614
+ model_kwargs=None,
615
+ device=None,
616
+ progress=False,
617
+ eta=0.0,
618
+ ):
619
+ """
620
+ Generate samples from the model using DDIM.
621
+ Same usage as p_sample_loop().
622
+ """
623
+ final = None
624
+ for sample in self.ddim_sample_loop_progressive(
625
+ model,
626
+ shape,
627
+ noise=noise,
628
+ clip_denoised=clip_denoised,
629
+ denoised_fn=denoised_fn,
630
+ cond_fn=cond_fn,
631
+ model_kwargs=model_kwargs,
632
+ device=device,
633
+ progress=progress,
634
+ eta=eta,
635
+ ):
636
+ final = sample
637
+ return final["sample"]
638
+
639
+ def ddim_sample_loop_progressive(
640
+ self,
641
+ model,
642
+ shape,
643
+ noise=None,
644
+ clip_denoised=True,
645
+ denoised_fn=None,
646
+ cond_fn=None,
647
+ model_kwargs=None,
648
+ device=None,
649
+ progress=False,
650
+ eta=0.0,
651
+ ):
652
+ """
653
+ Use DDIM to sample from the model and yield intermediate samples from
654
+ each timestep of DDIM.
655
+ Same usage as p_sample_loop_progressive().
656
+ """
657
+ assert isinstance(shape, (tuple, list))
658
+ if noise is not None:
659
+ img = noise
660
+ else:
661
+ img = th.randn(*shape).cuda()
662
+ indices = list(range(self.num_timesteps))[::-1]
663
+
664
+ if progress:
665
+ # Lazy import so that we don't depend on tqdm.
666
+ from tqdm.auto import tqdm
667
+
668
+ indices = tqdm(indices)
669
+
670
+ for i in indices:
671
+ t = th.tensor([i] * shape[0]).cuda()
672
+ with th.no_grad():
673
+ out = self.ddim_sample(
674
+ model,
675
+ img,
676
+ t,
677
+ clip_denoised=clip_denoised,
678
+ denoised_fn=denoised_fn,
679
+ cond_fn=cond_fn,
680
+ model_kwargs=model_kwargs,
681
+ eta=eta,
682
+ )
683
+ yield out
684
+ img = out["sample"]
685
+
686
+ def _vb_terms_bpd(
687
+ self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
688
+ ):
689
+ """
690
+ Get a term for the variational lower-bound.
691
+ The resulting units are bits (rather than nats, as one might expect).
692
+ This allows for comparison to other papers.
693
+ :return: a dict with the following keys:
694
+ - 'output': a shape [N] tensor of NLLs or KLs.
695
+ - 'pred_xstart': the x_0 predictions.
696
+ """
697
+ true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
698
+ x_start=x_start, x_t=x_t, t=t
699
+ )
700
+ out = self.p_mean_variance(
701
+ model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
702
+ )
703
+ kl = normal_kl(
704
+ true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
705
+ )
706
+ kl = mean_flat(kl) / np.log(2.0)
707
+
708
+ decoder_nll = -discretized_gaussian_log_likelihood(
709
+ x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
710
+ )
711
+ assert decoder_nll.shape == x_start.shape
712
+ decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
713
+
714
+ # At the first timestep return the decoder NLL,
715
+ # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
716
+ output = th.where((t == 0), decoder_nll, kl)
717
+ return {"output": output, "pred_xstart": out["pred_xstart"]}
718
+
719
+ def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
720
+ """
721
+ Compute training losses for a single timestep.
722
+ :param model: the model to evaluate loss on.
723
+ :param x_start: the [N x C x ...] tensor of inputs.
724
+ :param t: a batch of timestep indices.
725
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
726
+ pass to the model. This can be used for conditioning.
727
+ :param noise: if specified, the specific Gaussian noise to try to remove.
728
+ :return: a dict with the key "loss" containing a tensor of shape [N].
729
+ Some mean or variance settings may also have other keys.
730
+ """
731
+ if model_kwargs is None:
732
+ model_kwargs = {}
733
+ if noise is None:
734
+ noise = th.randn_like(x_start)
735
+ x_t = self.q_sample(x_start, t, noise=noise)
736
+
737
+ terms = {}
738
+
739
+ if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
740
+ terms["loss"] = self._vb_terms_bpd(
741
+ model=model,
742
+ x_start=x_start,
743
+ x_t=x_t,
744
+ t=t,
745
+ clip_denoised=False,
746
+ model_kwargs=model_kwargs,
747
+ )["output"]
748
+ if self.loss_type == LossType.RESCALED_KL:
749
+ terms["loss"] *= self.num_timesteps
750
+ elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
751
+ model_output = model(x_t, t, **model_kwargs)
752
+
753
+ if self.model_var_type in [
754
+ ModelVarType.LEARNED,
755
+ ModelVarType.LEARNED_RANGE,
756
+ ]:
757
+ B, C = x_t.shape[:2]
758
+ assert model_output.shape == (B, C * 2, *x_t.shape[2:])
759
+ model_output, model_var_values = th.split(model_output, C, dim=1)
760
+ # Learn the variance using the variational bound, but don't let
761
+ # it affect our mean prediction.
762
+ frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
763
+ terms["vb"] = self._vb_terms_bpd(
764
+ model=lambda *args, r=frozen_out: r,
765
+ x_start=x_start,
766
+ x_t=x_t,
767
+ t=t,
768
+ clip_denoised=False,
769
+ )["output"]
770
+ if self.loss_type == LossType.RESCALED_MSE:
771
+ # Divide by 1000 for equivalence with initial implementation.
772
+ # Without a factor of 1/1000, the VB term hurts the MSE term.
773
+ terms["vb"] *= self.num_timesteps / 1000.0
774
+
775
+ target = {
776
+ ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
777
+ x_start=x_start, x_t=x_t, t=t
778
+ )[0],
779
+ ModelMeanType.START_X: x_start,
780
+ ModelMeanType.EPSILON: noise,
781
+ }[self.model_mean_type]
782
+ assert model_output.shape == target.shape == x_start.shape
783
+ terms["mse"] = mean_flat((target - model_output) ** 2)
784
+ if "vb" in terms:
785
+ terms["loss"] = terms["mse"] + terms["vb"]
786
+ else:
787
+ terms["loss"] = terms["mse"]
788
+ else:
789
+ raise NotImplementedError(self.loss_type)
790
+
791
+ return terms
792
+
793
+ def _prior_bpd(self, x_start):
794
+ """
795
+ Get the prior KL term for the variational lower-bound, measured in
796
+ bits-per-dim.
797
+ This term can't be optimized, as it only depends on the encoder.
798
+ :param x_start: the [N x C x ...] tensor of inputs.
799
+ :return: a batch of [N] KL values (in bits), one per batch element.
800
+ """
801
+ batch_size = x_start.shape[0]
802
+ t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
803
+ qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
804
+ kl_prior = normal_kl(
805
+ mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
806
+ )
807
+ return mean_flat(kl_prior) / np.log(2.0)
808
+
809
+ def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
810
+ """
811
+ Compute the entire variational lower-bound, measured in bits-per-dim,
812
+ as well as other related quantities.
813
+ :param model: the model to evaluate loss on.
814
+ :param x_start: the [N x C x ...] tensor of inputs.
815
+ :param clip_denoised: if True, clip denoised samples.
816
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
817
+ pass to the model. This can be used for conditioning.
818
+ :return: a dict containing the following keys:
819
+ - total_bpd: the total variational lower-bound, per batch element.
820
+ - prior_bpd: the prior term in the lower-bound.
821
+ - vb: an [N x T] tensor of terms in the lower-bound.
822
+ - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
823
+ - mse: an [N x T] tensor of epsilon MSEs for each timestep.
824
+ """
825
+ device = x_start.device
826
+ batch_size = x_start.shape[0]
827
+
828
+ vb = []
829
+ xstart_mse = []
830
+ mse = []
831
+ for t in list(range(self.num_timesteps))[::-1]:
832
+ t_batch = th.tensor([t] * batch_size, device=device)
833
+ noise = th.randn_like(x_start)
834
+ x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
835
+ # Calculate VLB term at the current timestep
836
+ with th.no_grad():
837
+ out = self._vb_terms_bpd(
838
+ model,
839
+ x_start=x_start,
840
+ x_t=x_t,
841
+ t=t_batch,
842
+ clip_denoised=clip_denoised,
843
+ model_kwargs=model_kwargs,
844
+ )
845
+ vb.append(out["output"])
846
+ xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
847
+ eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
848
+ mse.append(mean_flat((eps - noise) ** 2))
849
+
850
+ vb = th.stack(vb, dim=1)
851
+ xstart_mse = th.stack(xstart_mse, dim=1)
852
+ mse = th.stack(mse, dim=1)
853
+
854
+ prior_bpd = self._prior_bpd(x_start)
855
+ total_bpd = vb.sum(dim=1) + prior_bpd
856
+ return {
857
+ "total_bpd": total_bpd,
858
+ "prior_bpd": prior_bpd,
859
+ "vb": vb,
860
+ "xstart_mse": xstart_mse,
861
+ "mse": mse,
862
+ }
863
+
864
+
865
+ def _extract_into_tensor(arr, timesteps, broadcast_shape):
866
+ """
867
+ Extract values from a 1-D numpy array for a batch of indices.
868
+ :param arr: the 1-D numpy array.
869
+ :param timesteps: a tensor of indices into the array to extract.
870
+ :param broadcast_shape: a larger shape of K dimensions with the batch
871
+ dimension equal to the length of timesteps.
872
+ :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
873
+ """
874
+ res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
875
+ while len(res.shape) < len(broadcast_shape):
876
+ res = res[..., None]
877
+ return res + th.zeros(broadcast_shape, device=timesteps.device)
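
For reference, the coefficients precomputed in `__init__` (`posterior_mean_coef1/2`, `posterior_variance`) implement the usual DDPM posterior $q(x_{t-1}\mid x_t, x_0)$:

```latex
\tilde{\mu}_t(x_t, x_0) = \frac{\sqrt{\bar{\alpha}_{t-1}}\,\beta_t}{1-\bar{\alpha}_t}\, x_0
  + \frac{\sqrt{\alpha_t}\,(1-\bar{\alpha}_{t-1})}{1-\bar{\alpha}_t}\, x_t,
\qquad
\tilde{\beta}_t = \frac{1-\bar{\alpha}_{t-1}}{1-\bar{\alpha}_t}\,\beta_t
```

`q_posterior_mean_variance` returns exactly these quantities (with the log-variance clipped at $t=0$, as noted in the code).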
mar.py ADDED
@@ -0,0 +1,470 @@
1
+ from functools import partial
2
+
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ import scipy.stats as stats
6
+ import math
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from einops import rearrange
11
+ from torch.utils.checkpoint import checkpoint
12
+ from timm.models.vision_transformer import Block
13
+
14
+ from .diffloss import DiffLoss
15
+
16
+
17
+ def mask_by_order(mask_len, order, bsz, seq_len):
18
+ masking = torch.zeros(bsz, seq_len).to(order.device)
19
+ masking = torch.scatter(masking, dim=-1, index=order[:, :mask_len.long()],
20
+ src=torch.ones(bsz, seq_len).to(order.device)).bool()
21
+ return masking
22
+
23
+
24
+ class MAR(nn.Module):
25
+ """ Masked Autoencoder with VisionTransformer backbone
26
+ """
27
+ def __init__(self, img_size=256, vae_stride=16, patch_size=1,
28
+ encoder_embed_dim=1024, encoder_depth=16, encoder_num_heads=16,
29
+ decoder_embed_dim=1024, decoder_depth=16, decoder_num_heads=16,
30
+ mlp_ratio=4., norm_layer=nn.LayerNorm,
31
+ vae_embed_dim=16,
32
+ mask_ratio_min=0.7,
33
+ label_drop_prob=0.1,
34
+ class_num=1000,
35
+ attn_dropout=0.1,
36
+ proj_dropout=0.1,
37
+ buffer_size=64,
38
+ diffloss_d=3,
39
+ diffloss_w=1024,
40
+ num_sampling_steps='100',
41
+ diffusion_batch_mul=4,
42
+ grad_checkpointing=False,
43
+ ):
44
+ super().__init__()
45
+
46
+ # --------------------------------------------------------------------------
47
+ # VAE and patchify specifics
48
+ self.vae_embed_dim = vae_embed_dim
49
+
50
+ self.img_size = img_size
51
+ self.vae_stride = vae_stride
52
+ self.patch_size = patch_size
53
+ self.seq_h = self.seq_w = img_size // vae_stride // patch_size
54
+ self.seq_len = self.seq_h * self.seq_w
55
+ self.token_embed_dim = vae_embed_dim * patch_size**2
56
+ self.grad_checkpointing = grad_checkpointing
57
+
58
+ # --------------------------------------------------------------------------
59
+ # Class Embedding
60
+ self.num_classes = class_num
61
+ self.class_emb = nn.Embedding(class_num, encoder_embed_dim)
62
+ self.label_drop_prob = label_drop_prob
63
+ # Fake class embedding for CFG's unconditional generation
64
+ self.fake_latent = nn.Parameter(torch.zeros(1, encoder_embed_dim))
65
+
66
+ # --------------------------------------------------------------------------
67
+ # MAR variant masking ratio, a left-half truncated Gaussian centered at 100% masking ratio with std 0.25
68
+ self.mask_ratio_generator = stats.truncnorm((mask_ratio_min - 1.0) / 0.25, 0, loc=1.0, scale=0.25)
69
+
70
+ # --------------------------------------------------------------------------
71
+ # MAR encoder specifics
72
+ self.encoder_embed_dim = encoder_embed_dim
73
+ self.z_proj = nn.Linear(self.token_embed_dim, encoder_embed_dim, bias=True)
74
+ self.z_proj_ln = nn.LayerNorm(encoder_embed_dim, eps=1e-6)
75
+ self.buffer_size = buffer_size
76
+ self.encoder_pos_embed_learned = nn.Parameter(torch.zeros(1, self.seq_len + self.buffer_size, encoder_embed_dim))
77
+
78
+ self.encoder_blocks = nn.ModuleList([
79
+ Block(encoder_embed_dim, encoder_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer,
80
+ proj_drop=proj_dropout, attn_drop=attn_dropout) for _ in range(encoder_depth)])
81
+ self.encoder_norm = norm_layer(encoder_embed_dim)
82
+
83
+ # --------------------------------------------------------------------------
84
+ # MAR decoder specifics
85
+ self.decoder_embed_dim = decoder_embed_dim
86
+ self.decoder_embed = nn.Linear(encoder_embed_dim, decoder_embed_dim, bias=True)
87
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))
88
+ self.decoder_pos_embed_learned = nn.Parameter(torch.zeros(1, self.seq_len + self.buffer_size, decoder_embed_dim))
89
+
90
+ self.decoder_blocks = nn.ModuleList([
91
+ Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer,
92
+ proj_drop=proj_dropout, attn_drop=attn_dropout) for _ in range(decoder_depth)])
93
+
94
+ self.decoder_norm = norm_layer(decoder_embed_dim)
95
+ self.diffusion_pos_embed_learned = nn.Parameter(torch.zeros(1, self.seq_len, decoder_embed_dim))
96
+
97
+ self.initialize_weights()
98
+
99
+ # --------------------------------------------------------------------------
100
+ # Diffusion Loss
101
+ self.diffloss = DiffLoss(
102
+ target_channels=self.token_embed_dim,
103
+ z_channels=decoder_embed_dim,
104
+ width=diffloss_w,
105
+ depth=diffloss_d,
106
+ num_sampling_steps=num_sampling_steps,
107
+ grad_checkpointing=self.grad_checkpointing
108
+ )
109
+ self.diffusion_batch_mul = diffusion_batch_mul
110
+
111
+ def get_encoder_pos_embed(self, h, w):
112
+ if h == self.seq_h and w == self.seq_w:
113
+ return self.encoder_pos_embed_learned
114
+ buffer_pe, image_pe = self.encoder_pos_embed_learned.split(
115
+ [self.buffer_size, self.seq_len], dim=1)
116
+ image_pe = rearrange(image_pe, 'b (h w) c -> b c h w',
117
+ h=self.seq_h, w=self.seq_w)
118
+ image_pe = F.interpolate(image_pe, size=(h, w), mode='bilinear')
119
+ image_pe = rearrange(image_pe, 'b c h w -> b (h w) c')
120
+
121
+ return torch.cat([buffer_pe, image_pe], dim=1)
122
+
123
+ def get_decoder_pos_embed(self, h, w):
124
+ if h == self.seq_h and w == self.seq_w:
125
+ return self.decoder_pos_embed_learned
126
+ buffer_pe, image_pe = self.decoder_pos_embed_learned.split(
127
+ [self.buffer_size, self.seq_len], dim=1)
128
+ image_pe = rearrange(image_pe, 'b (h w) c -> b c h w',
129
+ h=self.seq_h, w=self.seq_w)
130
+ image_pe = F.interpolate(image_pe, size=(h, w), mode='bilinear')
131
+ image_pe = rearrange(image_pe, 'b c h w -> b (h w) c')
132
+
133
+ return torch.cat([buffer_pe, image_pe], dim=1)
134
+
135
+ def get_diffusion_pos_embed(self, h, w):
136
+ if h == self.seq_h and w == self.seq_w:
137
+ return self.diffusion_pos_embed_learned
138
+ image_pe = self.diffusion_pos_embed_learned
139
+ image_pe = rearrange(image_pe, 'b (h w) c -> b c h w',
140
+ h=self.seq_h, w=self.seq_w)
141
+ image_pe = F.interpolate(image_pe, size=(h, w), mode='bilinear')
142
+ image_pe = rearrange(image_pe, 'b c h w -> b (h w) c')
143
+
144
+ return image_pe
145
+
146
+ def initialize_weights(self):
147
+ # parameters
148
+ torch.nn.init.normal_(self.class_emb.weight, std=.02)
149
+ torch.nn.init.normal_(self.fake_latent, std=.02)
150
+ torch.nn.init.normal_(self.mask_token, std=.02)
151
+ torch.nn.init.normal_(self.encoder_pos_embed_learned, std=.02)
152
+ torch.nn.init.normal_(self.decoder_pos_embed_learned, std=.02)
153
+ torch.nn.init.normal_(self.diffusion_pos_embed_learned, std=.02)
154
+
155
+ # initialize nn.Linear and nn.LayerNorm
156
+ self.apply(self._init_weights)
157
+
158
+ def _init_weights(self, m):
159
+ if isinstance(m, nn.Linear):
160
+ # we use xavier_uniform following official JAX ViT:
161
+ torch.nn.init.xavier_uniform_(m.weight)
162
+ if isinstance(m, nn.Linear) and m.bias is not None:
163
+ nn.init.constant_(m.bias, 0)
164
+ elif isinstance(m, nn.LayerNorm):
165
+ if m.bias is not None:
166
+ nn.init.constant_(m.bias, 0)
167
+ if m.weight is not None:
168
+ nn.init.constant_(m.weight, 1.0)
169
+
170
+ @property
171
+ def device(self):
172
+ return self.fake_latent.data.device
173
+
174
+ @property
175
+ def dtype(self):
176
+ return self.fake_latent.data.dtype
177
+
178
+ def patchify(self, x):
179
+ bsz, c, h, w = x.shape
180
+ p = self.patch_size
181
+ h_, w_ = h // p, w // p
182
+
183
+ x = x.reshape(bsz, c, h_, p, w_, p)
184
+ x = torch.einsum('nchpwq->nhwcpq', x)
185
+ x = x.reshape(bsz, h_ * w_, c * p ** 2)
186
+ return x # [n, l, d]
187
+
188
+ def unpatchify(self, x):
189
+ bsz = x.shape[0]
190
+ p = self.patch_size
191
+ c = self.vae_embed_dim
192
+ h_, w_ = self.seq_h, self.seq_w
193
+
194
+ x = x.reshape(bsz, h_, w_, c, p, p)
195
+ x = torch.einsum('nhwcpq->nchpwq', x)
196
+ x = x.reshape(bsz, c, h_ * p, w_ * p)
197
+ return x # [n, c, h, w]
198
+
199
+ def sample_orders(self, bsz, seq_len=None):
200
+ if seq_len is None:
201
+ seq_len = self.seq_len
202
+ # generate a batch of random generation orders
203
+ orders = []
204
+ for _ in range(bsz):
205
+ order = np.array(list(range(seq_len)))
206
+ np.random.shuffle(order)
207
+ orders.append(order)
208
+ orders = torch.Tensor(np.array(orders)).to(self.device).long()
209
+ return orders
210
+
211
+ def random_masking(self, x, orders):
212
+ # generate token mask
213
+ bsz, seq_len, embed_dim = x.shape
214
+ assert seq_len == orders.shape[1]
215
+ mask_rate = self.mask_ratio_generator.rvs(1)[0]
216
+ num_masked_tokens = int(np.ceil(seq_len * mask_rate))
217
+ mask = torch.zeros(bsz, seq_len, device=x.device)
218
+ mask = torch.scatter(mask, dim=-1, index=orders[:, :num_masked_tokens],
219
+ src=torch.ones(bsz, seq_len, device=x.device))
220
+ return mask
221
+
222
+ def forward_mae_encoder(self, x, mask, class_embedding, image_shape=None):
223
+ x = x.to(self.dtype)
224
+ x = self.z_proj(x)
225
+ bsz, seq_len, embed_dim = x.shape
226
+
227
+ # concat buffer
228
+ x = torch.cat([x.new_zeros(bsz, self.buffer_size, embed_dim), x], dim=1)
229
+ mask_with_buffer = torch.cat([mask.new_zeros(x.size(0), self.buffer_size), mask], dim=1)
230
+
231
+ # random drop class embedding during training
232
+ # if self.training:
233
+ # drop_latent_mask = torch.rand(bsz) < self.label_drop_prob
234
+ # drop_latent_mask = drop_latent_mask.unsqueeze(-1).to(self.device).to(x.dtype)
235
+ # class_embedding = drop_latent_mask * self.fake_latent + (1 - drop_latent_mask) * class_embedding
236
+
237
+ x[:, :self.buffer_size] = class_embedding.view(bsz, -1, embed_dim)
238
+
239
+ # encoder position embedding
240
+ # x = x + self.encoder_pos_embed_learned
241
+ if image_shape is None:
242
+ x = x + self.encoder_pos_embed_learned
243
+ else:
244
+ h, w = image_shape
245
+ assert h * w == seq_len
246
+ x = x + self.get_encoder_pos_embed(h=h, w=w)
247
+ # import pdb; pdb.set_trace()
248
+ x = self.z_proj_ln(x)
249
+
250
+ # dropping
251
+ x = x[(1-mask_with_buffer).nonzero(as_tuple=True)].reshape(bsz, -1, embed_dim)
252
+
253
+ # apply Transformer blocks
254
+ if self.grad_checkpointing and not torch.jit.is_scripting():
255
+ for block in self.encoder_blocks:
256
+ x = checkpoint(block, x,
257
+ use_reentrant=False
258
+ )
259
+ else:
260
+ for block in self.encoder_blocks:
261
+ x = block(x)
262
+ x = self.encoder_norm(x)
263
+
264
+ return x
265
+
266
+ def forward_mae_decoder(self, x, mask, image_shape=None, x_con=None):
267
+ bsz, seq_len = mask.shape
268
+
269
+ x = self.decoder_embed(x)
270
+ mask_with_buffer = torch.cat([torch.zeros(x.size(0), self.buffer_size, device=x.device), mask], dim=1)
271
+
272
+ # pad mask tokens
273
+ mask_tokens = self.mask_token.repeat(mask_with_buffer.shape[0], mask_with_buffer.shape[1], 1).to(x.dtype)
274
+
275
+ if x_con is not None:
276
+ x_after_pad = self.decoder_embed(x_con)
277
+ else:
278
+ x_after_pad = mask_tokens.clone()
279
+ x_after_pad[(1 - mask_with_buffer).nonzero(as_tuple=True)] = x.reshape(x.shape[0] * x.shape[1], x.shape[2])
280
+
281
+ # decoder position embedding
282
+ # x = x_after_pad + self.decoder_pos_embed_learned
283
+ if image_shape is None:
284
+ x = x_after_pad + self.decoder_pos_embed_learned
285
+ else:
286
+ h, w = image_shape
287
+ assert h * w == seq_len
288
+ x = x_after_pad + self.get_decoder_pos_embed(h=h, w=w)
289
+
290
+ # apply Transformer blocks
291
+ if self.grad_checkpointing and not torch.jit.is_scripting():
292
+ for block in self.decoder_blocks:
293
+ x = checkpoint(block, x,
294
+ # use_reentrant=False
295
+ )
296
+ else:
297
+ for block in self.decoder_blocks:
298
+ x = block(x)
299
+ x = self.decoder_norm(x)
300
+
301
+ x = x[:, self.buffer_size:]
302
+ # x = x + self.diffusion_pos_embed_learned
303
+ if image_shape is None:
304
+ x = x + self.diffusion_pos_embed_learned
305
+ else:
306
+ h, w = image_shape
307
+ assert h * w == seq_len
308
+ x = x + self.get_diffusion_pos_embed(h=h, w=w)
309
+ return x
310
+
311
+ def mae_decoder_prepare(self, x, mask):
312
+ x = self.decoder_embed(x)
313
+ mask_with_buffer = torch.cat([torch.zeros(x.size(0), self.buffer_size, device=x.device), mask], dim=1)
314
+
315
+ # pad mask tokens
316
+ mask_tokens = self.mask_token.repeat(mask_with_buffer.shape[0], mask_with_buffer.shape[1], 1).to(x.dtype)
317
+ x_after_pad = mask_tokens.clone()
318
+ x_after_pad[(1 - mask_with_buffer).nonzero(as_tuple=True)] = x.reshape(x.shape[0] * x.shape[1], x.shape[2])
319
+
320
+ # decoder position embedding
321
+ x = x_after_pad + self.decoder_pos_embed_learned
322
+
323
+ return x
324
+
325
+ def mae_decoder_forward(self, x):
326
+ # apply Transformer blocks
327
+ if self.grad_checkpointing and not torch.jit.is_scripting():
328
+ for block in self.decoder_blocks:
329
+ x = checkpoint(block, x,
330
+ # use_reentrant=False
331
+ )
332
+ else:
333
+ for block in self.decoder_blocks:
334
+ x = block(x)
335
+ x = self.decoder_norm(x)
336
+
337
+ x = x[:, self.buffer_size:]
338
+ x = x + self.diffusion_pos_embed_learned
339
+ return x
340
+
341
+ def forward_loss(self, z, target, mask):
342
+ bsz, seq_len, _ = target.shape
343
+ target = target.reshape(bsz * seq_len, -1).repeat(self.diffusion_batch_mul, 1)
344
+ z = z.reshape(bsz*seq_len, -1).repeat(self.diffusion_batch_mul, 1)
345
+ mask = mask.reshape(bsz*seq_len).repeat(self.diffusion_batch_mul)
346
+ loss = self.diffloss(z=z, target=target, mask=mask)
347
+ return loss
348
+
349
+ def forward(self, imgs, labels):
350
+
351
+ # class embed
352
+ class_embedding = self.class_emb(labels)
353
+
354
+ # patchify and mask (drop) tokens
355
+ x = self.patchify(imgs)
356
+ gt_latents = x.clone().detach()
357
+ orders = self.sample_orders(bsz=x.size(0))
358
+ mask = self.random_masking(x, orders)
359
+
360
+ # mae encoder
361
+ x = self.forward_mae_encoder(x, mask, class_embedding)
362
+
363
+ # mae decoder
364
+ z = self.forward_mae_decoder(x, mask)
365
+
366
+ # diffloss
367
+ loss = self.forward_loss(z=z, target=gt_latents, mask=mask)
368
+
369
+ return loss
370
+
371
+ def sample_tokens(self, bsz, num_iter=64, cfg=1.0, cfg_schedule="linear", labels=None, temperature=1.0, progress=False):
373
+ # init and sample generation orders
374
+ mask = torch.ones(bsz, self.seq_len).to(self.device)
375
+ tokens = torch.zeros(bsz, self.seq_len, self.token_embed_dim).to(self.device)
376
+ orders = self.sample_orders(bsz)
377
+
378
+ indices = list(range(num_iter))
379
+ if progress:
380
+ indices = tqdm(indices)
381
+ # generate latents
382
+ for step in indices:
383
+ cur_tokens = tokens.clone()
384
+
385
+ # class embedding and CFG
386
+ if labels is not None:
387
+ class_embedding = self.class_emb(labels)
388
+ else:
389
+ class_embedding = self.fake_latent.repeat(bsz, 1)
390
+ if not cfg == 1.0:
391
+ tokens = torch.cat([tokens, tokens], dim=0)
392
+ class_embedding = torch.cat([class_embedding, self.fake_latent.repeat(bsz, 1)], dim=0)
393
+ mask = torch.cat([mask, mask], dim=0)
394
+
395
+ # mae encoder
396
+ x = self.forward_mae_encoder(tokens, mask.to(self.dtype), class_embedding)
397
+
398
+ # mae decoder
399
+ z = self.forward_mae_decoder(x, mask.to(self.dtype))
401
+
402
+ # mask ratio for the next round, following MaskGIT and MAGE.
403
+ mask_ratio = np.cos(math.pi / 2. * (step + 1) / num_iter)
404
+ mask_len = torch.Tensor([np.floor(self.seq_len * mask_ratio)]).to(self.device)
406
+ # masks out at least one for the next iteration
407
+ mask_len = torch.maximum(torch.Tensor([1]).to(self.device),
408
+ torch.minimum(torch.sum(mask, dim=-1, keepdims=True) - 1, mask_len))
410
+ # get masking for next iteration and locations to be predicted in this iteration
411
+ mask_next = mask_by_order(mask_len[0], orders, bsz, self.seq_len)
413
+ if step >= num_iter - 1:
414
+ mask_to_pred = mask[:bsz].bool()
415
+ else:
416
+ mask_to_pred = torch.logical_xor(mask[:bsz].bool(), mask_next.bool())
417
+ mask = mask_next
418
+ if not cfg == 1.0:
419
+ mask_to_pred = torch.cat([mask_to_pred, mask_to_pred], dim=0)
421
+ # sample token latents for this step
422
+ z = z[mask_to_pred.nonzero(as_tuple=True)]
423
+ # cfg schedule follow Muse
424
+ if cfg_schedule == "linear":
425
+ cfg_iter = 1 + (cfg - 1) * (self.seq_len - mask_len[0]) / self.seq_len
426
+ elif cfg_schedule == "constant":
427
+ cfg_iter = cfg
428
+ else:
429
+ raise NotImplementedError
430
+ sampled_token_latent = self.diffloss.sample(z, temperature, cfg_iter)
431
+ if not cfg == 1.0:
432
+ sampled_token_latent, _ = sampled_token_latent.chunk(2, dim=0) # Remove null class samples
433
+ mask_to_pred, _ = mask_to_pred.chunk(2, dim=0)
435
+ cur_tokens[mask_to_pred.nonzero(as_tuple=True)] = sampled_token_latent
436
+ tokens = cur_tokens.clone()
437
+
438
+ # unpatchify
439
+ tokens = self.unpatchify(tokens)
440
+ return tokens
441
+
442
+ def gradient_checkpointing_enable(self):
443
+ self.grad_checkpointing = True
444
+
445
+ def gradient_checkpointing_disable(self):
446
+ self.grad_checkpointing = False
447
+
448
+
449
+ def mar_base(**kwargs):
450
+ model = MAR(
451
+ encoder_embed_dim=768, encoder_depth=12, encoder_num_heads=12,
452
+ decoder_embed_dim=768, decoder_depth=12, decoder_num_heads=12,
453
+ mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
454
+ return model
455
+
456
+
457
+ def mar_large(**kwargs):
458
+ model = MAR(
459
+ encoder_embed_dim=1024, encoder_depth=16, encoder_num_heads=16,
460
+ decoder_embed_dim=1024, decoder_depth=16, decoder_num_heads=16,
461
+ mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
462
+ return model
463
+
464
+
465
+ def mar_huge(**kwargs):
466
+ model = MAR(
467
+ encoder_embed_dim=1280, encoder_depth=20, encoder_num_heads=16,
468
+ decoder_embed_dim=1280, decoder_depth=20, decoder_num_heads=16,
469
+ mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
470
+ return model
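The `sample_tokens` loop above unmasks the latent grid progressively, following the MaskGIT/MAGE cosine schedule: after iteration `step`, a fraction cos(pi/2 * (step+1)/num_iter) of the tokens stays masked, with at least one token kept masked until the final step. A standalone sketch of how many of the default 16x16 = 256 latent tokens are predicted per iteration (grid size and 64 iterations mirror the defaults above; no model weights are involved):

```python
import math
import numpy as np

seq_len, num_iter = 256, 64        # default 16x16 latent grid and iteration count
masked = seq_len
for step in range(num_iter):
    mask_ratio = math.cos(math.pi / 2.0 * (step + 1) / num_iter)
    # keep at least one token masked until the last iteration, as in sample_tokens
    next_masked = max(1, min(masked - 1, int(np.floor(seq_len * mask_ratio))))
    if step == num_iter - 1:
        predicted, masked = masked, 0          # final step predicts everything left
    else:
        predicted, masked = masked - next_masked, next_masked
    if step % 8 == 0 or step == num_iter - 1:
        print(f"step {step:2d}: predict {predicted:3d} tokens, {masked:3d} still masked")
```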
method.png ADDED

Git LFS Details

  • SHA256: 381621b09d21382593586db80c77b2935e34094f1a155c02d47934d928020b28
  • Pointer size: 132 Bytes
  • Size of remote file: 1.1 MB
misc.py ADDED
@@ -0,0 +1,383 @@
1
+ import builtins
2
+ import datetime
3
+ import os
4
+ import time
5
+ from collections import defaultdict, deque
6
+ from pathlib import Path
7
+
8
+ import torch
9
+ import torch.distributed as dist
10
+ TORCH_MAJOR = int(torch.__version__.split('.')[0])
11
+ TORCH_MINOR = int(torch.__version__.split('.')[1])
12
+
13
+ if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
14
+ from torch._six import inf
15
+ else:
16
+ from torch import inf
17
+ import copy
18
+
19
+ from . import gaussian_diffusion as gd
20
+ from .respace import SpacedDiffusion, space_timesteps
21
+
22
+
23
+ def create_diffusion(
24
+ timestep_respacing,
25
+ noise_schedule="linear",
26
+ use_kl=False,
27
+ sigma_small=False,
28
+ predict_xstart=False,
29
+ learn_sigma=True,
30
+ rescale_learned_sigmas=False,
31
+ diffusion_steps=1000
32
+ ):
33
+ betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
34
+ if use_kl:
35
+ loss_type = gd.LossType.RESCALED_KL
36
+ elif rescale_learned_sigmas:
37
+ loss_type = gd.LossType.RESCALED_MSE
38
+ else:
39
+ loss_type = gd.LossType.MSE
40
+ if timestep_respacing is None or timestep_respacing == "":
41
+ timestep_respacing = [diffusion_steps]
42
+ return SpacedDiffusion(
43
+ use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
44
+ betas=betas,
45
+ model_mean_type=(
46
+ gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
47
+ ),
48
+ model_var_type=(
49
+ (
50
+ gd.ModelVarType.FIXED_LARGE
51
+ if not sigma_small
52
+ else gd.ModelVarType.FIXED_SMALL
53
+ )
54
+ if not learn_sigma
55
+ else gd.ModelVarType.LEARNED_RANGE
56
+ ),
57
+ loss_type=loss_type
58
+ # rescale_timesteps=rescale_timesteps,
59
+ )
60
+
61
+
62
+
63
+ class SmoothedValue(object):
64
+ """Track a series of values and provide access to smoothed values over a
65
+ window or the global series average.
66
+ """
67
+
68
+ def __init__(self, window_size=20, fmt=None):
69
+ if fmt is None:
70
+ fmt = "{median:.4f} ({global_avg:.4f})"
71
+ self.deque = deque(maxlen=window_size)
72
+ self.total = 0.0
73
+ self.count = 0
74
+ self.fmt = fmt
75
+
76
+ def update(self, value, n=1):
77
+ self.deque.append(value)
78
+ self.count += n
79
+ self.total += value * n
80
+
81
+ def synchronize_between_processes(self):
82
+ """
83
+ Warning: does not synchronize the deque!
84
+ """
85
+ if not is_dist_avail_and_initialized():
86
+ return
87
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
88
+ dist.barrier()
89
+ dist.all_reduce(t)
90
+ t = t.tolist()
91
+ self.count = int(t[0])
92
+ self.total = t[1]
93
+
94
+ @property
95
+ def median(self):
96
+ d = torch.tensor(list(self.deque))
97
+ return d.median().item()
98
+
99
+ @property
100
+ def avg(self):
101
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
102
+ return d.mean().item()
103
+
104
+ @property
105
+ def global_avg(self):
106
+ return self.total / self.count
107
+
108
+ @property
109
+ def max(self):
110
+ return max(self.deque)
111
+
112
+ @property
113
+ def value(self):
114
+ return self.deque[-1]
115
+
116
+ def __str__(self):
117
+ return self.fmt.format(
118
+ median=self.median,
119
+ avg=self.avg,
120
+ global_avg=self.global_avg,
121
+ max=self.max,
122
+ value=self.value)
123
+
124
+
125
+ class MetricLogger(object):
126
+ def __init__(self, delimiter="\t"):
127
+ self.meters = defaultdict(SmoothedValue)
128
+ self.delimiter = delimiter
129
+
130
+ def update(self, **kwargs):
131
+ for k, v in kwargs.items():
132
+ if v is None:
133
+ continue
134
+ if isinstance(v, torch.Tensor):
135
+ v = v.item()
136
+ assert isinstance(v, (float, int))
137
+ self.meters[k].update(v)
138
+
139
+ def __getattr__(self, attr):
140
+ if attr in self.meters:
141
+ return self.meters[attr]
142
+ if attr in self.__dict__:
143
+ return self.__dict__[attr]
144
+ raise AttributeError("'{}' object has no attribute '{}'".format(
145
+ type(self).__name__, attr))
146
+
147
+ def __str__(self):
148
+ loss_str = []
149
+ for name, meter in self.meters.items():
150
+ loss_str.append(
151
+ "{}: {}".format(name, str(meter))
152
+ )
153
+ return self.delimiter.join(loss_str)
154
+
155
+ def synchronize_between_processes(self):
156
+ for meter in self.meters.values():
157
+ meter.synchronize_between_processes()
158
+
159
+ def add_meter(self, name, meter):
160
+ self.meters[name] = meter
161
+
162
+ def log_every(self, iterable, print_freq, header=None):
163
+ i = 0
164
+ if not header:
165
+ header = ''
166
+ start_time = time.time()
167
+ end = time.time()
168
+ iter_time = SmoothedValue(fmt='{avg:.4f}')
169
+ data_time = SmoothedValue(fmt='{avg:.4f}')
170
+ space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
171
+ log_msg = [
172
+ header,
173
+ '[{0' + space_fmt + '}/{1}]',
174
+ 'eta: {eta}',
175
+ '{meters}',
176
+ 'time: {time}',
177
+ 'data: {data}'
178
+ ]
179
+ if torch.cuda.is_available():
180
+ log_msg.append('max mem: {memory:.0f}')
181
+ log_msg = self.delimiter.join(log_msg)
182
+ MB = 1024.0 * 1024.0
183
+ for obj in iterable:
184
+ data_time.update(time.time() - end)
185
+ yield obj
186
+ iter_time.update(time.time() - end)
187
+ if i % print_freq == 0 or i == len(iterable) - 1:
188
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
189
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
190
+ if torch.cuda.is_available():
191
+ print(log_msg.format(
192
+ i, len(iterable), eta=eta_string,
193
+ meters=str(self),
194
+ time=str(iter_time), data=str(data_time),
195
+ memory=torch.cuda.max_memory_allocated() / MB))
196
+ else:
197
+ print(log_msg.format(
198
+ i, len(iterable), eta=eta_string,
199
+ meters=str(self),
200
+ time=str(iter_time), data=str(data_time)))
201
+ i += 1
202
+ end = time.time()
203
+ total_time = time.time() - start_time
204
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
205
+ print('{} Total time: {} ({:.4f} s / it)'.format(
206
+ header, total_time_str, total_time / len(iterable)))
207
+
208
+
209
+ def setup_for_distributed(is_master):
210
+ """
211
+ This function disables printing when not in master process
212
+ """
213
+ builtin_print = builtins.print
214
+
215
+ def print(*args, **kwargs):
216
+ force = kwargs.pop('force', False)
217
+ force = force or (get_world_size() > 8)
218
+ if is_master or force:
219
+ now = datetime.datetime.now().time()
220
+ builtin_print('[{}] '.format(now), end='') # print with time stamp
221
+ builtin_print(*args, **kwargs)
222
+
223
+ builtins.print = print
224
+
225
+
226
+ def is_dist_avail_and_initialized():
227
+ if not dist.is_available():
228
+ return False
229
+ if not dist.is_initialized():
230
+ return False
231
+ return True
232
+
233
+
234
+ def get_world_size():
235
+ if not is_dist_avail_and_initialized():
236
+ return 1
237
+ return dist.get_world_size()
238
+
239
+
240
+ def get_rank():
241
+ if not is_dist_avail_and_initialized():
242
+ return 0
243
+ return dist.get_rank()
244
+
245
+
246
+ def is_main_process():
247
+ return get_rank() == 0
248
+
249
+
250
+ def save_on_master(*args, **kwargs):
251
+ if is_main_process():
252
+ torch.save(*args, **kwargs)
253
+
254
+
255
+ def init_distributed_mode(args):
256
+ if args.dist_on_itp:
257
+ args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
258
+ args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
259
+ args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
260
+ args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
261
+ os.environ['LOCAL_RANK'] = str(args.gpu)
262
+ os.environ['RANK'] = str(args.rank)
263
+ os.environ['WORLD_SIZE'] = str(args.world_size)
264
+ # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
265
+ elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
266
+ args.rank = int(os.environ["RANK"])
267
+ args.world_size = int(os.environ['WORLD_SIZE'])
268
+ args.gpu = int(os.environ['LOCAL_RANK'])
269
+ elif 'SLURM_PROCID' in os.environ:
270
+ args.rank = int(os.environ['SLURM_PROCID'])
271
+ args.gpu = args.rank % torch.cuda.device_count()
272
+ else:
273
+ print('Not using distributed mode')
274
+ setup_for_distributed(is_master=True) # hack
275
+ args.distributed = False
276
+ return
277
+
278
+ args.distributed = True
279
+
280
+ torch.cuda.set_device(args.gpu)
281
+ args.dist_backend = 'nccl'
282
+ print('| distributed init (rank {}): {}, gpu {}'.format(
283
+ args.rank, args.dist_url, args.gpu), flush=True)
284
+ torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
285
+ world_size=args.world_size, rank=args.rank)
286
+ torch.distributed.barrier()
287
+ setup_for_distributed(args.rank == 0)
288
+
289
+
290
+ class NativeScalerWithGradNormCount:
291
+ state_dict_key = "amp_scaler"
292
+
293
+ def __init__(self):
294
+ self._scaler = torch.cuda.amp.GradScaler()
295
+
296
+ def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
297
+ self._scaler.scale(loss).backward(create_graph=create_graph)
298
+ if update_grad:
299
+ if clip_grad is not None:
300
+ assert parameters is not None
301
+ self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place
302
+ norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
303
+ else:
304
+ self._scaler.unscale_(optimizer)
305
+ norm = get_grad_norm_(parameters)
306
+ self._scaler.step(optimizer)
307
+ self._scaler.update()
308
+ else:
309
+ norm = None
310
+ return norm
311
+
312
+ def state_dict(self):
313
+ return self._scaler.state_dict()
314
+
315
+ def load_state_dict(self, state_dict):
316
+ self._scaler.load_state_dict(state_dict)
317
+
318
+
319
+ def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
320
+ if isinstance(parameters, torch.Tensor):
321
+ parameters = [parameters]
322
+ parameters = [p for p in parameters if p.grad is not None]
323
+ norm_type = float(norm_type)
324
+ if len(parameters) == 0:
325
+ return torch.tensor(0.)
326
+ device = parameters[0].grad.device
327
+ if norm_type == inf:
328
+ total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
329
+ else:
330
+ total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
331
+ return total_norm
332
+
333
+
334
+ def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
335
+ decay = []
336
+ no_decay = []
337
+ for name, param in model.named_parameters():
338
+ if not param.requires_grad:
339
+ continue # frozen weights
340
+ if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list or 'diffloss' in name:
341
+ no_decay.append(param) # no weight decay on bias, norm and diffloss
342
+ else:
343
+ decay.append(param)
344
+ return [
345
+ {'params': no_decay, 'weight_decay': 0.},
346
+ {'params': decay, 'weight_decay': weight_decay}]
347
+
348
+
349
+ def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler, ema_params=None, epoch_name=None):
350
+ if epoch_name is None:
351
+ epoch_name = str(epoch)
352
+ output_dir = Path(args.output_dir)
353
+ checkpoint_path = output_dir / ('checkpoint-%s.pth' % epoch_name)
354
+
355
+ # ema
356
+ if ema_params is not None:
357
+ ema_state_dict = copy.deepcopy(model_without_ddp.state_dict())
358
+ for i, (name, _value) in enumerate(model_without_ddp.named_parameters()):
359
+ assert name in ema_state_dict
360
+ ema_state_dict[name] = ema_params[i]
361
+ else:
362
+ ema_state_dict = None
363
+
364
+ to_save = {
365
+ 'model': model_without_ddp.state_dict(),
366
+ 'model_ema': ema_state_dict,
367
+ 'optimizer': optimizer.state_dict(),
368
+ 'epoch': epoch,
369
+ 'scaler': loss_scaler.state_dict(),
370
+ 'args': args,
371
+ }
372
+ save_on_master(to_save, checkpoint_path)
373
+
374
+
375
+ def all_reduce_mean(x):
376
+ world_size = get_world_size()
377
+ if world_size > 1:
378
+ x_reduce = torch.tensor(x).cuda()
379
+ dist.all_reduce(x_reduce)
380
+ x_reduce /= world_size
381
+ return x_reduce.item()
382
+ else:
383
+ return x
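`add_weight_decay` above builds the two optimizer parameter groups used for training: no weight decay on 1-D parameters, biases, and anything under `diffloss`, normal decay on weight matrices. A minimal self-contained sketch with a toy module (the function body is copied from above; the toy layer sizes and hyperparameters are only for illustration):

```python
import torch
import torch.nn as nn

def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
    # same grouping rule as above: 1-D params, biases and 'diffloss' params get no decay
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list or 'diffloss' in name:
            no_decay.append(param)
        else:
            decay.append(param)
    return [{'params': no_decay, 'weight_decay': 0.},
            {'params': decay, 'weight_decay': weight_decay}]

toy = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8), nn.Linear(8, 4))
groups = add_weight_decay(toy, weight_decay=0.02)
optimizer = torch.optim.AdamW(groups, lr=1e-4)
print([len(g['params']) for g in groups])   # [4, 2]: biases/norms vs. weight matrices
```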
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6932594180617297db1bf532283356734f6d98b392e58e04029907fcbd75ae86
3
+ size 4994116086
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64a2820961dd24cbbf166a95f67a620f17da56b282df6ba53f084ba71e2b0df3
3
+ size 595233832
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_harmon.py ADDED
@@ -0,0 +1,288 @@
1
+ import torch
2
+ import math
3
+ import numpy as np
4
+ import torch.nn as nn
5
+ import copy
6
+ from einops import rearrange
7
+ from torch.nn.modules.module import T
8
+ from transformers.cache_utils import DynamicCache
9
+
10
+ from tqdm import tqdm
11
+ from transformers import Qwen2ForCausalLM, Qwen2Config, PreTrainedModel
12
+
13
+ from .diffusion_utils import *
14
+ from .gaussian_diffusion import *
15
+ from .respace import *
16
+ from .misc import *
17
+ from .diffloss import *
18
+
19
+
20
+ from .configuration_harmon import HarmonConfig
21
+ from .vae import AutoencoderKL
22
+ from .mar import mar_base, mar_large, mar_huge
23
+
24
+
25
+
26
+ def build_mlp(hidden_size, projector_dim, z_dim):
27
+ return nn.Sequential(
28
+ nn.Linear(hidden_size, projector_dim),
29
+ nn.SiLU(),
30
+ nn.Linear(projector_dim, z_dim),)
31
+
32
+
33
+ def mask_by_order(mask_len, order, bsz, seq_len):
34
+ masking = torch.zeros(bsz, seq_len, device=order.device)
35
+ masking = torch.scatter(masking, dim=-1, index=order[:, :mask_len.long()],
36
+ src=torch.ones(bsz, seq_len, device=order.device)).bool()
37
+ return masking
38
+
39
+
40
+ class HarmonModel(PreTrainedModel):
41
+ config_class = HarmonConfig
42
+
43
+ def __init__(self, config: HarmonConfig):
44
+ super().__init__(config)
45
+ # VAE
46
+ self.vae = AutoencoderKL(
47
+ embed_dim=16,
48
+ ch_mult=(1, 1, 2, 2, 4)
49
+ )
50
+ self.vae_scale = 0.2325
51
+
52
+ # LLM
53
+ self.llm = Qwen2ForCausalLM(config=Qwen2Config.from_dict(config.llm))
54
+
55
+ # MAR
56
+ mar_config = copy.deepcopy(config.mar)
57
+ mar_type = mar_config.pop('type')
58
+ if mar_type == 'mar_base':
59
+ self.mar = mar_base(**mar_config)
60
+ elif mar_type == 'mar_large':
61
+ self.mar = mar_large(**mar_config)
62
+ elif mar_type == 'mar_huge':
63
+ self.mar = mar_huge(**mar_config)
64
+ else:
65
+ raise ValueError
66
+
67
+ # projection layers
68
+ self.proj_in = build_mlp(hidden_size=self.mar.encoder_embed_dim,
69
+ projector_dim=self.llm.config.hidden_size,
70
+ z_dim=self.llm.config.hidden_size)
71
+ self.proj_out = build_mlp(hidden_size=self.llm.config.hidden_size,
72
+ projector_dim=self.llm.config.hidden_size,
73
+ z_dim=self.mar.encoder_embed_dim)
74
+
75
+ @property
76
+ def llm_model(self):
77
+ return self.llm.model
78
+
79
+ @property
80
+ def device(self):
81
+ return self.llm.device
82
+
83
+ @property
84
+ def dtype(self):
85
+ return self.llm.dtype
86
+
87
+ @property
88
+ def gen_seq_len(self):
89
+ return self.mar.seq_len
90
+
91
+ @property
92
+ def token_embed_dim(self):
93
+ return self.vae.embed_dim * (self.mar.patch_size ** 2)
94
+
95
+ @torch.no_grad()
96
+ def encode(self, x):
97
+ posterior = self.vae.encode(x)
98
+ z = posterior.sample().mul_(self.vae_scale)
99
+ z = rearrange(z, 'b c (m p) (n q) -> b m n (c p q)',
100
+ p=self.mar.patch_size, q=self.mar.patch_size)
101
+
102
+ return z
103
+
104
+ @torch.no_grad()
105
+ def decode(self, z):
106
+ z /= self.vae_scale
107
+ z = rearrange(z, 'b m n (c p q) -> b c (m p) (n q)',
108
+ p=self.mar.patch_size, q=self.mar.patch_size)
109
+
110
+ x = self.vae.decode(z)
111
+ return x
112
+
113
+ def prepare_forward_input(self,
114
+ x,
115
+ inputs_embeds=None,
116
+ input_ids=None,
117
+ attention_mask=None,
118
+ past_key_values=None):
119
+ b, l, _ = x.shape
120
+ attention_mask = attention_mask.to(device=self.device, dtype=torch.bool)
121
+ attention_mask = torch.cat([
122
+ attention_mask, attention_mask.new_ones(b, l)
123
+ ], dim=1)
124
+ position_ids = torch.cumsum(attention_mask, dim=1) - 1
125
+ position_ids[position_ids < 0] = 0
126
+
127
+ # import pdb; pdb.set_trace()
128
+
129
+ # prepare context
130
+ if past_key_values is not None:
131
+ inputs_embeds = x
132
+ position_ids = position_ids[:, -l:]
133
+ else:
134
+ if inputs_embeds is None:
135
+ input_ids = input_ids.to(self.device)
136
+ inputs_embeds = self.llm.get_input_embeddings()(input_ids)
137
+ inputs_embeds = torch.cat([inputs_embeds, x], dim=1)
138
+
139
+ return dict(inputs_embeds=inputs_embeds,
140
+ attention_mask=attention_mask,
141
+ position_ids=position_ids,
142
+ past_key_values=past_key_values)
143
+
144
+ def extract_visual_feature(self, x, mask=None, detach=False):
145
+ b, m, n, _ = x.shape
146
+ x = x.view(b, m*n, -1)
147
+ # x: b mn c
148
+ if mask is None:
149
+ mask = torch.zeros_like(x[..., 0])
150
+ null_embeds = self.mar.fake_latent.expand(x.shape[0], -1)
151
+ x_enc = self.mar.forward_mae_encoder(x, mask, null_embeds, image_shape=(m, n))
152
+
153
+ z_enc = self.proj_in(x_enc)
154
+ # Move buffers to the end of the image sequence
155
+ z_enc = torch.cat([
156
+ z_enc[:, self.mar.buffer_size:],
157
+ z_enc[:, :self.mar.buffer_size]], dim=1)
158
+
159
+ if detach:
160
+ x_enc = x_enc.detach()
161
+ z_enc = z_enc.detach()
162
+
163
+ return x_enc, z_enc
164
+
165
+ def forward_mae_encoder(self, x, mask, detach=False, **context):
166
+ b, m, n, _ = x.shape
167
+ x_enc, z_enc = self.extract_visual_feature(x, mask=mask, detach=detach)
168
+ inputs = self.prepare_forward_input(x=z_enc, **context)
169
+ output = self.llm_model(**inputs, return_dict=True)
170
+
171
+ z_llm = output.last_hidden_state[:, -z_enc.shape[1]:]
172
+
173
+ # move buffers back to the start of the image sequence
174
+ z_llm = torch.cat([
175
+ z_llm[:, -self.mar.buffer_size:],
176
+ z_llm[:, :-self.mar.buffer_size]], dim=1)
177
+
178
+ # residual learning
179
+ x_enc = x_enc + self.proj_out(z_llm)
180
+
181
+ return x_enc
182
+
183
+ @staticmethod
184
+ def curtail_cache(past_key_values, cur_len):
185
+ for past_key_values_ in past_key_values:
186
+ keys, values = past_key_values_
187
+ keys.data = keys.data[:, :, :cur_len]
188
+ values.data = values.data[:, :, :cur_len]
189
+
190
+ @torch.no_grad()
191
+ def sample(self,
192
+ input_ids=None, inputs_embeds=None,
193
+ attention_mask=None, num_iter=64, cfg=1.0, cfg_schedule="constant", temperature=1.0,
194
+ progress=False, mask=None, past_key_values=None, image_shape=None, x_con=None, **kwargs):
195
+ if inputs_embeds is None and input_ids is not None:
196
+ inputs_embeds = self.llm.get_input_embeddings()(input_ids)
197
+
198
+ bsz = attention_mask.shape[0]
199
+ if cfg != 1.0:
200
+ assert bsz % 2 == 0
201
+
202
+ if image_shape is None:
203
+ m = n = int(self.gen_seq_len ** 0.5)
204
+ else:
205
+ m, n = image_shape
206
+
207
+ if mask is None:
208
+ mask = torch.ones(bsz, m*n, device=self.device, dtype=self.dtype)
209
+ else:
210
+ mask = mask.view(bsz, m*n)
211
+ tokens = torch.zeros(bsz, m*n, self.token_embed_dim,
212
+ device=self.device, dtype=self.dtype)
213
+ orders = self.mar.sample_orders(bsz, seq_len=m*n)
214
+ if cfg != 1.0:
215
+ orders[bsz//2:] = orders[:bsz//2]
216
+
217
+ indices = list(range(num_iter))
218
+ if progress:
219
+ indices = tqdm(indices)
220
+
221
+ # past key values can be prepared outside (usually in multi-turn editing)
222
+ if past_key_values is None:
223
+ output = self.llm_model(inputs_embeds=inputs_embeds,
224
+ attention_mask=None,
225
+ position_ids=None,
226
+ past_key_values=DynamicCache.from_legacy_cache(),
227
+ return_dict=True,
228
+ use_cache=True)
229
+ past_key_values = output.past_key_values
230
+
231
+ # generate latents
232
+ for step in indices:
233
+ cur_tokens = tokens.clone()
234
+ x_enc = self.forward_mae_encoder(tokens.view(bsz, m, n, -1),
235
+ mask.to(self.dtype),
236
+ past_key_values=past_key_values,
237
+ # inputs_embeds=inputs_embeds,
238
+ attention_mask=attention_mask)
239
+ # import pdb; pdb.set_trace()
240
+ self.curtail_cache(past_key_values, inputs_embeds.shape[1])
241
+ # import pdb; pdb.set_trace()
242
+
243
+ z = self.mar.forward_mae_decoder(x_enc, mask.to(self.dtype), image_shape=(m, n), x_con=x_con)
244
+
245
+ # mask ratio for the next round, following MaskGIT and MAGE.
246
+ mask_ratio = np.cos(math.pi / 2. * (step + 1) / num_iter)
247
+ mask_len = torch.Tensor([np.floor(m*n * mask_ratio)]).to(self.device)
248
+
249
+ # masks out at least one for the next iteration
250
+ mask_len = torch.maximum(torch.Tensor([1]).to(self.device),
251
+ torch.minimum(torch.sum(mask, dim=-1, keepdims=True) - 1, mask_len))
252
+
253
+ # get masking for next iteration and locations to be predicted in this iteration
254
+ mask_next = mask_by_order(mask_len[0], orders, bsz, m*n).to(self.device)
255
+ if cfg != 1.0:
256
+ mask_next[bsz//2:] = mask_next[:bsz//2]
257
+ if step >= num_iter - 1:
258
+ mask_to_pred = mask[:bsz].bool()
259
+ else:
260
+ mask_to_pred = torch.logical_xor(mask[:bsz].bool(), mask_next.bool())
261
+ mask = mask_next
262
+ # if not cfg == 1.0:
263
+ # mask_to_pred = torch.cat([mask_to_pred, mask_to_pred], dim=0)
264
+
265
+ # sample token latents for this step
266
+ z = z[mask_to_pred.nonzero(as_tuple=True)]
267
+ # cfg schedule follows Muse
268
+ if cfg_schedule == "linear":
269
+ cfg_iter = 1 + (cfg - 1) * (m*n - mask_len[0]) / (m*n)
270
+ elif cfg_schedule == "constant":
271
+ cfg_iter = cfg
272
+ else:
273
+ raise NotImplementedError
274
+ sampled_token_latent = self.mar.diffloss.sample(z, temperature, cfg_iter).to(self.dtype)
275
+ # if not cfg == 1.0:
276
+ # sampled_token_latent, _ = sampled_token_latent.chunk(2, dim=0) # Remove null class samples
277
+ # mask_to_pred, _ = mask_to_pred.chunk(2, dim=0)
278
+
279
+ cur_tokens[mask_to_pred.nonzero(as_tuple=True)] = sampled_token_latent
280
+ if cfg != 1.0:
281
+ cur_tokens[bsz//2:] = cur_tokens[:bsz//2]
282
+ tokens = cur_tokens.clone()
283
+
284
+ pred = self.decode(tokens.view(bsz, m, n, -1))
285
+
286
+ if cfg != 1.0:
287
+ pred = pred[:bsz//2]
288
+ return pred
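`encode` and `decode` above move between the VAE's spatial latent map and the flat token grid consumed by the MAR trunk via two inverse `rearrange` patterns. A standalone round-trip sketch (patch size 1 and a 256x256 input, matching the defaults above; the random latents are only for illustration):

```python
import torch
from einops import rearrange

p = q = 1                                   # MAR patch_size used above
z = torch.randn(2, 16, 16, 16)              # [b, c, H/16, W/16] VAE latents for a 256x256 image

tokens = rearrange(z, 'b c (m p) (n q) -> b m n (c p q)', p=p, q=q)
print(tokens.shape)                         # torch.Size([2, 16, 16, 16]): a 16x16 grid of 16-dim tokens

z_back = rearrange(tokens, 'b m n (c p q) -> b c (m p) (n q)', p=p, q=q)
assert torch.equal(z, z_back)               # lossless round trip
```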
respace.py ADDED
@@ -0,0 +1,129 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ import numpy as np
7
+ import torch as th
8
+
9
+ from .gaussian_diffusion import GaussianDiffusion
10
+
11
+
12
+ def space_timesteps(num_timesteps, section_counts):
13
+ """
14
+ Create a list of timesteps to use from an original diffusion process,
15
+ given the number of timesteps we want to take from equally-sized portions
16
+ of the original process.
17
+ For example, if there are 300 timesteps and the section counts are [10,15,20]
18
+ then the first 100 timesteps are strided to be 10 timesteps, the second 100
19
+ are strided to be 15 timesteps, and the final 100 are strided to be 20.
20
+ If the stride is a string starting with "ddim", then the fixed striding
21
+ from the DDIM paper is used, and only one section is allowed.
22
+ :param num_timesteps: the number of diffusion steps in the original
23
+ process to divide up.
24
+ :param section_counts: either a list of numbers, or a string containing
25
+ comma-separated numbers, indicating the step count
26
+ per section. As a special case, use "ddimN" where N
27
+ is a number of steps to use the striding from the
28
+ DDIM paper.
29
+ :return: a set of diffusion steps from the original process to use.
30
+ """
31
+ if isinstance(section_counts, str):
32
+ if section_counts.startswith("ddim"):
33
+ desired_count = int(section_counts[len("ddim") :])
34
+ for i in range(1, num_timesteps):
35
+ if len(range(0, num_timesteps, i)) == desired_count:
36
+ return set(range(0, num_timesteps, i))
37
+ raise ValueError(
38
+ f"cannot create exactly {desired_count} steps with an integer stride"
39
+ )
40
+ section_counts = [int(x) for x in section_counts.split(",")]
41
+ size_per = num_timesteps // len(section_counts)
42
+ extra = num_timesteps % len(section_counts)
43
+ start_idx = 0
44
+ all_steps = []
45
+ for i, section_count in enumerate(section_counts):
46
+ size = size_per + (1 if i < extra else 0)
47
+ if size < section_count:
48
+ raise ValueError(
49
+ f"cannot divide section of {size} steps into {section_count}"
50
+ )
51
+ if section_count <= 1:
52
+ frac_stride = 1
53
+ else:
54
+ frac_stride = (size - 1) / (section_count - 1)
55
+ cur_idx = 0.0
56
+ taken_steps = []
57
+ for _ in range(section_count):
58
+ taken_steps.append(start_idx + round(cur_idx))
59
+ cur_idx += frac_stride
60
+ all_steps += taken_steps
61
+ start_idx += size
62
+ return set(all_steps)
63
+
64
+
65
+ class SpacedDiffusion(GaussianDiffusion):
66
+ """
67
+ A diffusion process which can skip steps in a base diffusion process.
68
+ :param use_timesteps: a collection (sequence or set) of timesteps from the
69
+ original diffusion process to retain.
70
+ :param kwargs: the kwargs to create the base diffusion process.
71
+ """
72
+
73
+ def __init__(self, use_timesteps, **kwargs):
74
+ self.use_timesteps = set(use_timesteps)
75
+ self.timestep_map = []
76
+ self.original_num_steps = len(kwargs["betas"])
77
+
78
+ base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa
79
+ last_alpha_cumprod = 1.0
80
+ new_betas = []
81
+ for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
82
+ if i in self.use_timesteps:
83
+ new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
84
+ last_alpha_cumprod = alpha_cumprod
85
+ self.timestep_map.append(i)
86
+ kwargs["betas"] = np.array(new_betas)
87
+ super().__init__(**kwargs)
88
+
89
+ def p_mean_variance(
90
+ self, model, *args, **kwargs
91
+ ): # pylint: disable=signature-differs
92
+ return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
93
+
94
+ def training_losses(
95
+ self, model, *args, **kwargs
96
+ ): # pylint: disable=signature-differs
97
+ return super().training_losses(self._wrap_model(model), *args, **kwargs)
98
+
99
+ def condition_mean(self, cond_fn, *args, **kwargs):
100
+ return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
101
+
102
+ def condition_score(self, cond_fn, *args, **kwargs):
103
+ return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
104
+
105
+ def _wrap_model(self, model):
106
+ if isinstance(model, _WrappedModel):
107
+ return model
108
+ return _WrappedModel(
109
+ model, self.timestep_map, self.original_num_steps
110
+ )
111
+
112
+ def _scale_timesteps(self, t):
113
+ # Scaling is done by the wrapped model.
114
+ return t
115
+
116
+
117
+ class _WrappedModel:
118
+ def __init__(self, model, timestep_map, original_num_steps):
119
+ self.model = model
120
+ self.timestep_map = timestep_map
121
+ # self.rescale_timesteps = rescale_timesteps
122
+ self.original_num_steps = original_num_steps
123
+
124
+ def __call__(self, x, ts, **kwargs):
125
+ map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
126
+ new_ts = map_tensor[ts]
127
+ # if self.rescale_timesteps:
128
+ # new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
129
+ return self.model(x, new_ts, **kwargs)
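`space_timesteps` above is what `create_diffusion` uses to pick which of the original diffusion steps are kept when a shorter sampling schedule is requested. A minimal usage sketch, assuming the repository is importable as a package so the module's relative imports resolve (the `harmon.respace` path is hypothetical, not part of this repo's documented API):

```python
from harmon.respace import space_timesteps  # hypothetical import path (assumption)

# keep 100 of the original 1000 DDPM steps, evenly strided
steps = space_timesteps(1000, "100")
print(len(steps), min(steps), max(steps))        # 100 0 999

# DDIM-style striding: a stride of 40 gives exactly 25 steps
ddim_steps = sorted(space_timesteps(1000, "ddim25"))
print(ddim_steps[:5])                            # [0, 40, 80, 120, 160]
```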
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
vae.py ADDED
@@ -0,0 +1,522 @@
+ import torch
+ import torch.nn as nn
+ import numpy as np
+
+
+ def nonlinearity(x):
+     # swish
+     return x * torch.sigmoid(x)
+
+
+ def Normalize(in_channels, num_groups=32):
+     return torch.nn.GroupNorm(
+         num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
+     )
+
+
+ class Upsample(nn.Module):
+     def __init__(self, in_channels, with_conv):
+         super().__init__()
+         self.with_conv = with_conv
+         if self.with_conv:
+             self.conv = torch.nn.Conv2d(
+                 in_channels, in_channels, kernel_size=3, stride=1, padding=1
+             )
+
+     def forward(self, x):
+         x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+         if self.with_conv:
+             x = self.conv(x)
+         return x
+
+
+ class Downsample(nn.Module):
+     def __init__(self, in_channels, with_conv):
+         super().__init__()
+         self.with_conv = with_conv
+         if self.with_conv:
+             # no asymmetric padding in torch conv, must do it ourselves
+             self.conv = torch.nn.Conv2d(
+                 in_channels, in_channels, kernel_size=3, stride=2, padding=0
+             )
+
+     def forward(self, x):
+         if self.with_conv:
+             pad = (0, 1, 0, 1)
+             x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+             x = self.conv(x)
+         else:
+             x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+         return x
+
+
+ class ResnetBlock(nn.Module):
+     def __init__(
+         self,
+         *,
+         in_channels,
+         out_channels=None,
+         conv_shortcut=False,
+         dropout,
+         temb_channels=512,
+     ):
+         super().__init__()
+         self.in_channels = in_channels
+         out_channels = in_channels if out_channels is None else out_channels
+         self.out_channels = out_channels
+         self.use_conv_shortcut = conv_shortcut
+
+         self.norm1 = Normalize(in_channels)
+         self.conv1 = torch.nn.Conv2d(
+             in_channels, out_channels, kernel_size=3, stride=1, padding=1
+         )
+         if temb_channels > 0:
+             self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
+         self.norm2 = Normalize(out_channels)
+         self.dropout = torch.nn.Dropout(dropout)
+         self.conv2 = torch.nn.Conv2d(
+             out_channels, out_channels, kernel_size=3, stride=1, padding=1
+         )
+         if self.in_channels != self.out_channels:
+             if self.use_conv_shortcut:
+                 self.conv_shortcut = torch.nn.Conv2d(
+                     in_channels, out_channels, kernel_size=3, stride=1, padding=1
+                 )
+             else:
+                 self.nin_shortcut = torch.nn.Conv2d(
+                     in_channels, out_channels, kernel_size=1, stride=1, padding=0
+                 )
+
+     def forward(self, x, temb):
+         h = x
+         h = self.norm1(h)
+         h = nonlinearity(h)
+         h = self.conv1(h)
+
+         if temb is not None:
+             h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
+
+         h = self.norm2(h)
+         h = nonlinearity(h)
+         h = self.dropout(h)
+         h = self.conv2(h)
+
+         if self.in_channels != self.out_channels:
+             if self.use_conv_shortcut:
+                 x = self.conv_shortcut(x)
+             else:
+                 x = self.nin_shortcut(x)
+
+         return x + h
+
+
+ class AttnBlock(nn.Module):
+     def __init__(self, in_channels):
+         super().__init__()
+         self.in_channels = in_channels
+
+         self.norm = Normalize(in_channels)
+         self.q = torch.nn.Conv2d(
+             in_channels, in_channels, kernel_size=1, stride=1, padding=0
+         )
+         self.k = torch.nn.Conv2d(
+             in_channels, in_channels, kernel_size=1, stride=1, padding=0
+         )
+         self.v = torch.nn.Conv2d(
+             in_channels, in_channels, kernel_size=1, stride=1, padding=0
+         )
+         self.proj_out = torch.nn.Conv2d(
+             in_channels, in_channels, kernel_size=1, stride=1, padding=0
+         )
+
+     def forward(self, x):
+         h_ = x
+         h_ = self.norm(h_)
+         q = self.q(h_)
+         k = self.k(h_)
+         v = self.v(h_)
+
+         # compute attention
+         b, c, h, w = q.shape
+         q = q.reshape(b, c, h * w)
+         q = q.permute(0, 2, 1)  # b,hw,c
+         k = k.reshape(b, c, h * w)  # b,c,hw
+         w_ = torch.bmm(q, k)  # b,hw,hw   w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+         w_ = w_ * (int(c) ** (-0.5))
+         w_ = torch.nn.functional.softmax(w_, dim=2)
+
+         # attend to values
+         v = v.reshape(b, c, h * w)
+         w_ = w_.permute(0, 2, 1)  # b,hw,hw (first hw of k, second of q)
+         h_ = torch.bmm(v, w_)  # b,c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+         h_ = h_.reshape(b, c, h, w)
+
+         h_ = self.proj_out(h_)
+
+         return x + h_
+
+
+ class Encoder(nn.Module):
+     def __init__(
+         self,
+         *,
+         ch=128,
+         out_ch=3,
+         ch_mult=(1, 1, 2, 2, 4),
+         num_res_blocks=2,
+         attn_resolutions=(16,),
+         dropout=0.0,
+         resamp_with_conv=True,
+         in_channels=3,
+         resolution=256,
+         z_channels=16,
+         double_z=True,
+         **ignore_kwargs,
+     ):
+         super().__init__()
+         self.ch = ch
+         self.temb_ch = 0
+         self.num_resolutions = len(ch_mult)
+         self.num_res_blocks = num_res_blocks
+         self.resolution = resolution
+         self.in_channels = in_channels
+
+         # downsampling
+         self.conv_in = torch.nn.Conv2d(
+             in_channels, self.ch, kernel_size=3, stride=1, padding=1
+         )
+
+         curr_res = resolution
+         in_ch_mult = (1,) + tuple(ch_mult)
+         self.down = nn.ModuleList()
+         for i_level in range(self.num_resolutions):
+             block = nn.ModuleList()
+             attn = nn.ModuleList()
+             block_in = ch * in_ch_mult[i_level]
+             block_out = ch * ch_mult[i_level]
+             for i_block in range(self.num_res_blocks):
+                 block.append(
+                     ResnetBlock(
+                         in_channels=block_in,
+                         out_channels=block_out,
+                         temb_channels=self.temb_ch,
+                         dropout=dropout,
+                     )
+                 )
+                 block_in = block_out
+                 if curr_res in attn_resolutions:
+                     attn.append(AttnBlock(block_in))
+             down = nn.Module()
+             down.block = block
+             down.attn = attn
+             if i_level != self.num_resolutions - 1:
+                 down.downsample = Downsample(block_in, resamp_with_conv)
+                 curr_res = curr_res // 2
+             self.down.append(down)
+
+         # middle
+         self.mid = nn.Module()
+         self.mid.block_1 = ResnetBlock(
+             in_channels=block_in,
+             out_channels=block_in,
+             temb_channels=self.temb_ch,
+             dropout=dropout,
+         )
+         self.mid.attn_1 = AttnBlock(block_in)
+         self.mid.block_2 = ResnetBlock(
+             in_channels=block_in,
+             out_channels=block_in,
+             temb_channels=self.temb_ch,
+             dropout=dropout,
+         )
+
+         # end
+         self.norm_out = Normalize(block_in)
+         self.conv_out = torch.nn.Conv2d(
+             block_in,
+             2 * z_channels if double_z else z_channels,
+             kernel_size=3,
+             stride=1,
+             padding=1,
+         )
+
+     def forward(self, x):
+         # assert x.shape[2] == x.shape[3] == self.resolution, "{}, {}, {}".format(x.shape[2], x.shape[3], self.resolution)
+
+         # timestep embedding
+         temb = None
+
+         # downsampling
+         hs = [self.conv_in(x)]
+         for i_level in range(self.num_resolutions):
+             for i_block in range(self.num_res_blocks):
+                 h = self.down[i_level].block[i_block](hs[-1], temb)
+                 if len(self.down[i_level].attn) > 0:
+                     h = self.down[i_level].attn[i_block](h)
+                 hs.append(h)
+             if i_level != self.num_resolutions - 1:
+                 hs.append(self.down[i_level].downsample(hs[-1]))
+
+         # middle
+         h = hs[-1]
+         h = self.mid.block_1(h, temb)
+         h = self.mid.attn_1(h)
+         h = self.mid.block_2(h, temb)
+
+         # end
+         h = self.norm_out(h)
+         h = nonlinearity(h)
+         h = self.conv_out(h)
+         return h
+
+
+ class Decoder(nn.Module):
+     def __init__(
+         self,
+         *,
+         ch=128,
+         out_ch=3,
+         ch_mult=(1, 1, 2, 2, 4),
+         num_res_blocks=2,
+         attn_resolutions=(),
+         dropout=0.0,
+         resamp_with_conv=True,
+         in_channels=3,
+         resolution=256,
+         z_channels=16,
+         give_pre_end=False,
+         **ignore_kwargs,
+     ):
+         super().__init__()
+         self.ch = ch
+         self.temb_ch = 0
+         self.num_resolutions = len(ch_mult)
+         self.num_res_blocks = num_res_blocks
+         self.resolution = resolution
+         self.in_channels = in_channels
+         self.give_pre_end = give_pre_end
+
+         # compute in_ch_mult, block_in and curr_res at lowest res
+         in_ch_mult = (1,) + tuple(ch_mult)
+         block_in = ch * ch_mult[self.num_resolutions - 1]
+         curr_res = resolution // 2 ** (self.num_resolutions - 1)
+         self.z_shape = (1, z_channels, curr_res, curr_res)
+         print(
+             "Working with z of shape {} = {} dimensions.".format(
+                 self.z_shape, np.prod(self.z_shape)
+             )
+         )
+
+         # z to block_in
+         self.conv_in = torch.nn.Conv2d(
+             z_channels, block_in, kernel_size=3, stride=1, padding=1
+         )
+
+         # middle
+         self.mid = nn.Module()
+         self.mid.block_1 = ResnetBlock(
+             in_channels=block_in,
+             out_channels=block_in,
+             temb_channels=self.temb_ch,
+             dropout=dropout,
+         )
+         self.mid.attn_1 = AttnBlock(block_in)
+         self.mid.block_2 = ResnetBlock(
+             in_channels=block_in,
+             out_channels=block_in,
+             temb_channels=self.temb_ch,
+             dropout=dropout,
+         )
+
+         # upsampling
+         self.up = nn.ModuleList()
+         for i_level in reversed(range(self.num_resolutions)):
+             block = nn.ModuleList()
+             attn = nn.ModuleList()
+             block_out = ch * ch_mult[i_level]
+             for i_block in range(self.num_res_blocks + 1):
+                 block.append(
+                     ResnetBlock(
+                         in_channels=block_in,
+                         out_channels=block_out,
+                         temb_channels=self.temb_ch,
+                         dropout=dropout,
+                     )
+                 )
+                 block_in = block_out
+                 if curr_res in attn_resolutions:
+                     attn.append(AttnBlock(block_in))
+             up = nn.Module()
+             up.block = block
+             up.attn = attn
+             if i_level != 0:
+                 up.upsample = Upsample(block_in, resamp_with_conv)
+                 curr_res = curr_res * 2
+             self.up.insert(0, up)  # prepend to get consistent order
+
+         # end
+         self.norm_out = Normalize(block_in)
+         self.conv_out = torch.nn.Conv2d(
+             block_in, out_ch, kernel_size=3, stride=1, padding=1
+         )
+
+     def forward(self, z):
+         # assert z.shape[1:] == self.z_shape[1:]
+         self.last_z_shape = z.shape
+
+         # timestep embedding
+         temb = None
+
+         # z to block_in
+         h = self.conv_in(z)
+
+         # middle
+         h = self.mid.block_1(h, temb)
+         h = self.mid.attn_1(h)
+         h = self.mid.block_2(h, temb)
+
+         # upsampling
+         for i_level in reversed(range(self.num_resolutions)):
+             for i_block in range(self.num_res_blocks + 1):
+                 h = self.up[i_level].block[i_block](h, temb)
+                 if len(self.up[i_level].attn) > 0:
+                     h = self.up[i_level].attn[i_block](h)
+             if i_level != 0:
+                 h = self.up[i_level].upsample(h)
+
+         # end
+         if self.give_pre_end:
+             return h
+
+         h = self.norm_out(h)
+         h = nonlinearity(h)
+         h = self.conv_out(h)
+         return h
+
+
+ class DiagonalGaussianDistribution(object):
+     def __init__(self, parameters, deterministic=False):
+         self.parameters = parameters
+         self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+         self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+         self.deterministic = deterministic
+         self.std = torch.exp(0.5 * self.logvar)
+         self.var = torch.exp(self.logvar)
+         if self.deterministic:
+             self.var = self.std = torch.zeros_like(self.mean).to(
+                 device=self.parameters.device
+             )
+
+     def sample(self):
+         # reparameterised sample: mean + std * eps, eps ~ N(0, I)
+         x = self.mean + self.std * torch.randn(self.mean.shape).to(
+             device=self.parameters.device
+         )
+         return x
+
+     def kl(self, other=None):
+         # closed-form KL of a diagonal Gaussian, against N(0, I) when other is None,
+         # otherwise against another diagonal Gaussian
+         if self.deterministic:
+             return torch.Tensor([0.0])
+         else:
+             if other is None:
+                 return 0.5 * torch.sum(
+                     torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
+                     dim=[1, 2, 3],
+                 )
+             else:
+                 return 0.5 * torch.sum(
+                     torch.pow(self.mean - other.mean, 2) / other.var
+                     + self.var / other.var
+                     - 1.0
+                     - self.logvar
+                     + other.logvar,
+                     dim=[1, 2, 3],
+                 )
+
+     def nll(self, sample, dims=[1, 2, 3]):
+         if self.deterministic:
+             return torch.Tensor([0.0])
+         logtwopi = np.log(2.0 * np.pi)
+         return 0.5 * torch.sum(
+             logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+             dim=dims,
+         )
+
+     def mode(self):
+         return self.mean
+
+
+ class AutoencoderKL(nn.Module):
+     def __init__(self, embed_dim, ch_mult, use_variational=True, ckpt_path=None):
+         super().__init__()
+         self.encoder = Encoder(ch_mult=ch_mult, z_channels=embed_dim)
+         self.decoder = Decoder(ch_mult=ch_mult, z_channels=embed_dim)
+         self.use_variational = use_variational
+         mult = 2 if self.use_variational else 1
+         self.quant_conv = torch.nn.Conv2d(2 * embed_dim, mult * embed_dim, 1)
+         self.post_quant_conv = torch.nn.Conv2d(embed_dim, embed_dim, 1)
+         self.embed_dim = embed_dim
+         if ckpt_path is not None:
+             self.init_from_ckpt(ckpt_path)
+
+     def init_from_ckpt(self, path):
+         sd = torch.load(path, map_location="cpu")["model"]
+         msg = self.load_state_dict(sd, strict=False)
+         print("Loading pre-trained KL-VAE")
+         print("Missing keys:")
+         print(msg.missing_keys)
+         print("Unexpected keys:")
+         print(msg.unexpected_keys)
+         print(f"Restored from {path}")
+
+     def encode(self, x):
+         h = self.encoder(x)
+         moments = self.quant_conv(h)
+         if not self.use_variational:
+             moments = torch.cat((moments, torch.ones_like(moments)), 1)
+         posterior = DiagonalGaussianDistribution(moments)
+         return posterior
+
+     def decode(self, z):
+         z = self.post_quant_conv(z)
+         dec = self.decoder(z)
+         return dec
+
+     def forward(self, inputs, disable=True, train=True, optimizer_idx=0):
+         # NOTE: training_step / validation_step are not defined in this file;
+         # forward() is only usable when those methods are provided elsewhere.
+         if train:
+             return self.training_step(inputs, disable, optimizer_idx)
+         else:
+             return self.validation_step(inputs, disable)
+
+
+ if __name__ == "__main__":
+     from PIL import Image
+     import torch.nn.functional as F
+
+     vae = AutoencoderKL(
+         embed_dim=16, ch_mult=(1, 1, 2, 2, 4),
+         ckpt_path='checkpoints/kl16.ckpt')
+
+     image = Image.open('data/ILSVRC2012_val_00023344.JPEG')
+     image = torch.from_numpy(np.array(image))
+     image = image.permute(2, 0, 1).float() / 255
+     image = 2 * image - 1
+
+     x = F.interpolate(image[None], size=(256, 256), mode='bilinear', align_corners=True)
+
+     print(x.shape)
+
+     with torch.no_grad():
+         z = vae.encode(x).sample()
+         print(z.shape)
+         x_rec = vae.decode(z)[0]
+
+     x_rec = (x_rec + 1.0) * 255 / 2
+     x_rec = torch.clamp(x_rec, min=0, max=255)
+     x_rec = x_rec.to(torch.uint8)
+
+     x_rec = x_rec.permute(1, 2, 0)
+
+     x_rec = Image.fromarray(x_rec.numpy())
+
+     x_rec.show()
+
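The `__main__` block above only round-trips a single image through the KL-VAE. As an illustrative sketch (not part of this repository), the same `AutoencoderKL` interface can also be exercised end to end to form the usual reconstruction-plus-KL objective; the batch, the KL weight, and the use of randomly initialised weights below are assumptions for demonstration only:

```python
import torch
import torch.nn.functional as F

# Hypothetical usage of the AutoencoderKL defined in vae.py above;
# no checkpoint is loaded, so the weights are randomly initialised.
vae = AutoencoderKL(embed_dim=16, ch_mult=(1, 1, 2, 2, 4))

x = torch.randn(2, 3, 256, 256)      # dummy batch scaled to roughly [-1, 1]
posterior = vae.encode(x)            # DiagonalGaussianDistribution
z = posterior.sample()               # reparameterised latent, shape (2, 16, 16, 16)
x_rec = vae.decode(z)                # reconstruction, shape (2, 3, 256, 256)

rec_loss = F.mse_loss(x_rec, x)      # simple pixel reconstruction term
kl_loss = posterior.kl().mean()      # closed-form KL against a standard normal
loss = rec_loss + 1e-6 * kl_loss     # KL weight chosen only for illustration
print(loss.item())
```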
vocab.json ADDED
The diff for this file is too large to render. See raw diff