alexnasa committed
Commit bb65ef0 · verified · 1 Parent(s): 3e48e28

Upload 42 files

Files changed (43)
  1. .gitattributes +6 -0
  2. LICENSE.txt +201 -0
  3. OmniAvatar/base.py +127 -0
  4. OmniAvatar/configs/__init__.py +0 -0
  5. OmniAvatar/configs/model_config.py +17 -0
  6. OmniAvatar/distributed/__init__.py +0 -0
  7. OmniAvatar/distributed/fsdp.py +43 -0
  8. OmniAvatar/distributed/xdit_context_parallel.py +134 -0
  9. OmniAvatar/models/audio_pack.py +40 -0
  10. OmniAvatar/models/model_manager.py +432 -0
  11. OmniAvatar/models/vsa_util.py +232 -0
  12. OmniAvatar/models/wan_video_dit.py +607 -0
  13. OmniAvatar/models/wan_video_text_encoder.py +269 -0
  14. OmniAvatar/models/wan_video_vae.py +807 -0
  15. OmniAvatar/models/wav2vec.py +208 -0
  16. OmniAvatar/prompters/__init__.py +1 -0
  17. OmniAvatar/prompters/base_prompter.py +70 -0
  18. OmniAvatar/prompters/wan_prompter.py +109 -0
  19. OmniAvatar/schedulers/flow_match.py +79 -0
  20. OmniAvatar/utils/args_config.py +123 -0
  21. OmniAvatar/utils/audio_preprocess.py +21 -0
  22. OmniAvatar/utils/io_utils.py +256 -0
  23. OmniAvatar/vram_management/__init__.py +1 -0
  24. OmniAvatar/vram_management/layers.py +95 -0
  25. OmniAvatar/wan_video.py +344 -0
  26. README.md +13 -12
  27. app.py +942 -0
  28. args_config.yaml +71 -0
  29. assets/logo-omniavatar.png +0 -0
  30. assets/material/pipeline.png +3 -0
  31. assets/material/teaser.png +3 -0
  32. configs/inference.yaml +37 -0
  33. configs/inference_1.3B.yaml +37 -0
  34. examples/audios/fox.wav +3 -0
  35. examples/audios/lion.wav +3 -0
  36. examples/audios/ocean.wav +3 -0
  37. examples/audios/script.wav +3 -0
  38. examples/images/female-002.png +0 -0
  39. examples/images/female-003.png +3 -0
  40. examples/images/female-009.png +0 -0
  41. examples/images/male-001.png +3 -0
  42. requirements.txt +18 -0
  43. scripts/inference.py +383 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/audios/fox.wav filter=lfs diff=lfs merge=lfs -text
37
+ examples/audios/lion.wav filter=lfs diff=lfs merge=lfs -text
38
+ examples/audios/ocean.wav filter=lfs diff=lfs merge=lfs -text
39
+ examples/audios/script.wav filter=lfs diff=lfs merge=lfs -text
40
+ examples/images/female-003.png filter=lfs diff=lfs merge=lfs -text
41
+ examples/images/male-001.png filter=lfs diff=lfs merge=lfs -text
LICENSE.txt ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
OmniAvatar/base.py ADDED
@@ -0,0 +1,127 @@
1
+ import torch
2
+ import numpy as np
3
+ from PIL import Image
4
+ from torchvision.transforms import GaussianBlur
5
+
6
+
7
+
8
+ class BasePipeline(torch.nn.Module):
9
+
10
+ def __init__(self, device="cuda", torch_dtype=torch.float16, height_division_factor=64, width_division_factor=64):
11
+ super().__init__()
12
+ self.device = device
13
+ self.torch_dtype = torch_dtype
14
+ self.height_division_factor = height_division_factor
15
+ self.width_division_factor = width_division_factor
16
+ self.cpu_offload = False
17
+ self.model_names = []
18
+
19
+
20
+ def check_resize_height_width(self, height, width):
21
+ if height % self.height_division_factor != 0:
22
+ height = (height + self.height_division_factor - 1) // self.height_division_factor * self.height_division_factor
23
+ print(f"The height cannot be evenly divided by {self.height_division_factor}. We round it up to {height}.")
24
+ if width % self.width_division_factor != 0:
25
+ width = (width + self.width_division_factor - 1) // self.width_division_factor * self.width_division_factor
26
+ print(f"The width cannot be evenly divided by {self.width_division_factor}. We round it up to {width}.")
27
+ return height, width
28
+
29
+
30
+ def preprocess_image(self, image):
31
+ image = torch.Tensor(np.array(image, dtype=np.float16) * (2.0 / 255) - 1.0).permute(2, 0, 1).unsqueeze(0)
32
+ return image
33
+
34
+
35
+ def preprocess_images(self, images):
36
+ return [self.preprocess_image(image) for image in images]
37
+
38
+
39
+ def vae_output_to_image(self, vae_output):
40
+ image = vae_output[0].cpu().float().permute(1, 2, 0).numpy()
41
+ image = Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8"))
42
+ return image
43
+
44
+
45
+ def vae_output_to_video(self, vae_output):
46
+ video = vae_output.cpu().permute(1, 2, 0).numpy()
47
+ video = [Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8")) for image in video]
48
+ return video
49
+
50
+
51
+ def merge_latents(self, value, latents, masks, scales, blur_kernel_size=33, blur_sigma=10.0):
52
+ if len(latents) > 0:
53
+ blur = GaussianBlur(kernel_size=blur_kernel_size, sigma=blur_sigma)
54
+ height, width = value.shape[-2:]
55
+ weight = torch.ones_like(value)
56
+ for latent, mask, scale in zip(latents, masks, scales):
57
+ mask = self.preprocess_image(mask.resize((width, height))).mean(dim=1, keepdim=True) > 0
58
+ mask = mask.repeat(1, latent.shape[1], 1, 1).to(dtype=latent.dtype, device=latent.device)
59
+ mask = blur(mask)
60
+ value += latent * mask * scale
61
+ weight += mask * scale
62
+ value /= weight
63
+ return value
64
+
65
+
66
+ def control_noise_via_local_prompts(self, prompt_emb_global, prompt_emb_locals, masks, mask_scales, inference_callback, special_kwargs=None, special_local_kwargs_list=None):
67
+ if special_kwargs is None:
68
+ noise_pred_global = inference_callback(prompt_emb_global)
69
+ else:
70
+ noise_pred_global = inference_callback(prompt_emb_global, special_kwargs)
71
+ if special_local_kwargs_list is None:
72
+ noise_pred_locals = [inference_callback(prompt_emb_local) for prompt_emb_local in prompt_emb_locals]
73
+ else:
74
+ noise_pred_locals = [inference_callback(prompt_emb_local, special_kwargs) for prompt_emb_local, special_kwargs in zip(prompt_emb_locals, special_local_kwargs_list)]
75
+ noise_pred = self.merge_latents(noise_pred_global, noise_pred_locals, masks, mask_scales)
76
+ return noise_pred
77
+
78
+
79
+ def extend_prompt(self, prompt, local_prompts, masks, mask_scales):
80
+ local_prompts = local_prompts or []
81
+ masks = masks or []
82
+ mask_scales = mask_scales or []
83
+ extended_prompt_dict = self.prompter.extend_prompt(prompt)
84
+ prompt = extended_prompt_dict.get("prompt", prompt)
85
+ local_prompts += extended_prompt_dict.get("prompts", [])
86
+ masks += extended_prompt_dict.get("masks", [])
87
+ mask_scales += [100.0] * len(extended_prompt_dict.get("masks", []))
88
+ return prompt, local_prompts, masks, mask_scales
89
+
90
+
91
+ def enable_cpu_offload(self):
92
+ self.cpu_offload = True
93
+
94
+
95
+ def load_models_to_device(self, loadmodel_names=[]):
96
+ # only load models to device if cpu_offload is enabled
97
+ if not self.cpu_offload:
98
+ return
99
+ # offload the unneeded models to cpu
100
+ for model_name in self.model_names:
101
+ if model_name not in loadmodel_names:
102
+ model = getattr(self, model_name)
103
+ if model is not None:
104
+ if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
105
+ for module in model.modules():
106
+ if hasattr(module, "offload"):
107
+ module.offload()
108
+ else:
109
+ model.cpu()
110
+ # load the needed models to device
111
+ for model_name in loadmodel_names:
112
+ model = getattr(self, model_name)
113
+ if model is not None:
114
+ if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
115
+ for module in model.modules():
116
+ if hasattr(module, "onload"):
117
+ module.onload()
118
+ else:
119
+ model.to(self.device)
120
+ # free the cached CUDA memory
121
+ torch.cuda.empty_cache()
122
+
123
+
124
+ def generate_noise(self, shape, seed=None, device="cpu", dtype=torch.float16):
125
+ generator = None if seed is None else torch.Generator(device).manual_seed(seed)
126
+ noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
127
+ return noise
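
As a quick illustration of the rounding in check_resize_height_width above (a minimal sketch; it assumes the OmniAvatar package added in this commit is importable):

    # minimal sketch: dimensions are rounded up to the nearest multiple of the division factor
    from OmniAvatar.base import BasePipeline

    pipe = BasePipeline(device="cpu")                    # the device string is only stored here
    print(pipe.check_resize_height_width(720, 1000))     # -> (768, 1024) with the default factor of 64
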
OmniAvatar/configs/__init__.py ADDED
File without changes
OmniAvatar/configs/model_config.py ADDED
@@ -0,0 +1,17 @@
1
+ from typing_extensions import Literal, TypeAlias
2
+ from ..models.wan_video_dit import WanModel
3
+ from ..models.wan_video_text_encoder import WanTextEncoder
4
+ from ..models.wan_video_vae import WanVideoVAE
5
+
6
+
7
+ model_loader_configs = [
8
+ # These configs are provided for detecting model type automatically.
9
+ # The format is (state_dict_keys_hash, state_dict_keys_hash_with_shape, model_names, model_classes, model_resource)
10
+ (None, "9269f8db9040a9d860eaca435be61814", ["wan_video_dit"], [WanModel], "civitai"),
11
+ (None, "aafcfd9672c3a2456dc46e1cb6e52c70", ["wan_video_dit"], [WanModel], "civitai"),
12
+ (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
13
+ (None, "cb104773c6c2cb6df4f9529ad5c60d0b", ["wan_video_dit"], [WanModel], "diffusers"),
14
+ (None, "9c8818c2cbea55eca56c7b447df170da", ["wan_video_text_encoder"], [WanTextEncoder], "civitai"),
15
+ (None, "1378ea763357eea97acdef78e65d6d96", ["wan_video_vae"], [WanVideoVAE], "civitai"),
16
+ (None, "ccc42284ea13e1ad04693284c7a09be6", ["wan_video_vae"], [WanVideoVAE], "civitai"),
17
+ ]
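
For context, these tuples are consumed by the detectors in OmniAvatar/models/model_manager.py (added later in this commit), which hash a checkpoint's state-dict keys and look the hash up in this table. A rough sketch of that flow, with a placeholder checkpoint path:

    # rough sketch only; the checkpoint path is a placeholder
    from OmniAvatar.configs.model_config import model_loader_configs
    from OmniAvatar.models.model_manager import ModelDetectorFromSingleFile
    from OmniAvatar.utils.io_utils import load_state_dict, hash_state_dict_keys

    detector = ModelDetectorFromSingleFile(model_loader_configs)
    state_dict = load_state_dict("path/to/checkpoint.safetensors")      # placeholder path
    print(hash_state_dict_keys(state_dict, with_shape=True))            # hash matched against the table above
    print(detector.match(state_dict=state_dict))                        # True if the hash is registered
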
OmniAvatar/distributed/__init__.py ADDED
File without changes
OmniAvatar/distributed/fsdp.py ADDED
@@ -0,0 +1,43 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import gc
3
+ from functools import partial
4
+
5
+ import torch
6
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
7
+ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
8
+ from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
9
+ from torch.distributed.utils import _free_storage
10
+
11
+
12
+ def shard_model(
13
+ model,
14
+ device_id,
15
+ param_dtype=torch.bfloat16,
16
+ reduce_dtype=torch.float32,
17
+ buffer_dtype=torch.float32,
18
+ process_group=None,
19
+ sharding_strategy=ShardingStrategy.FULL_SHARD,
20
+ sync_module_states=True,
21
+ ):
22
+ model = FSDP(
23
+ module=model,
24
+ process_group=process_group,
25
+ sharding_strategy=sharding_strategy,
26
+ auto_wrap_policy=partial(
27
+ lambda_auto_wrap_policy, lambda_fn=lambda m: m in model.blocks),
28
+ mixed_precision=MixedPrecision(
29
+ param_dtype=param_dtype,
30
+ reduce_dtype=reduce_dtype,
31
+ buffer_dtype=buffer_dtype),
32
+ device_id=device_id,
33
+ sync_module_states=sync_module_states)
34
+ return model
35
+
36
+
37
+ def free_model(model):
38
+ for m in model.modules():
39
+ if isinstance(m, FSDP):
40
+ _free_storage(m._handle.flat_param.data)
41
+ del model
42
+ gc.collect()
43
+ torch.cuda.empty_cache()
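
A hypothetical usage sketch for shard_model: it assumes torch.distributed has already been initialized (e.g. via torchrun) and that the wrapped model exposes its transformer blocks as model.blocks, which is what the lambda_auto_wrap_policy above keys on. ToyDiT below is a stand-in, not the real WanModel.

    # hypothetical sketch; ToyDiT stands in for a real DiT that exposes .blocks
    import torch
    import torch.nn as nn
    import torch.distributed as dist
    from OmniAvatar.distributed.fsdp import shard_model

    class ToyDiT(nn.Module):
        def __init__(self):
            super().__init__()
            self.blocks = nn.ModuleList([nn.Linear(64, 64) for _ in range(4)])

    dist.init_process_group("nccl")                          # e.g. launched with torchrun
    local_rank = dist.get_rank() % torch.cuda.device_count()
    torch.cuda.set_device(local_rank)
    sharded = shard_model(ToyDiT(), device_id=local_rank)    # each block becomes its own FSDP unit
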
OmniAvatar/distributed/xdit_context_parallel.py ADDED
@@ -0,0 +1,134 @@
1
+ import torch
2
+ from typing import Optional
3
+ from einops import rearrange
4
+ from xfuser.core.distributed import (get_sequence_parallel_rank,
5
+ get_sequence_parallel_world_size,
6
+ get_sp_group)
7
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
8
+ from yunchang import LongContextAttention
9
+
10
+ def sinusoidal_embedding_1d(dim, position):
11
+ sinusoid = torch.outer(position.type(torch.float64), torch.pow(
12
+ 10000, -torch.arange(dim//2, dtype=torch.float64, device=position.device).div(dim//2)))
13
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
14
+ return x.to(position.dtype)
15
+
16
+ def pad_freqs(original_tensor, target_len):
17
+ seq_len, s1, s2 = original_tensor.shape
18
+ pad_size = target_len - seq_len
19
+ padding_tensor = torch.ones(
20
+ pad_size,
21
+ s1,
22
+ s2,
23
+ dtype=original_tensor.dtype,
24
+ device=original_tensor.device)
25
+ padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
26
+ return padded_tensor
27
+
28
+ def rope_apply(x, freqs, num_heads):
29
+ x = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
30
+ s_per_rank = x.shape[1]
31
+ s_per_rank = get_sp_group().broadcast_object_list([s_per_rank], src=0)[0] # TODO: the size should be divided by sp_size
32
+
33
+ x_out = torch.view_as_complex(x.to(torch.float64).reshape(
34
+ x.shape[0], x.shape[1], x.shape[2], -1, 2))
35
+
36
+ sp_size = get_sequence_parallel_world_size()
37
+ sp_rank = get_sequence_parallel_rank()
38
+ if freqs.shape[0] % sp_size != 0 and freqs.shape[0] // sp_size == s_per_rank:
39
+ s_per_rank = s_per_rank + 1
40
+ freqs = pad_freqs(freqs, s_per_rank * sp_size)
41
+ freqs_rank = freqs[(sp_rank * s_per_rank):((sp_rank + 1) * s_per_rank), :, :]
42
+ freqs_rank = freqs_rank[:x.shape[1]]
43
+ x_out = torch.view_as_real(x_out * freqs_rank).flatten(2)
44
+ return x_out.to(x.dtype)
45
+
46
+ def usp_dit_forward(self,
47
+ x: torch.Tensor,
48
+ timestep: torch.Tensor,
49
+ context: torch.Tensor,
50
+ clip_feature: Optional[torch.Tensor] = None,
51
+ y: Optional[torch.Tensor] = None,
52
+ use_gradient_checkpointing: bool = False,
53
+ use_gradient_checkpointing_offload: bool = False,
54
+ **kwargs,
55
+ ):
56
+ t = self.time_embedding(
57
+ sinusoidal_embedding_1d(self.freq_dim, timestep))
58
+ t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
59
+ context = self.text_embedding(context)
60
+
61
+ if self.has_image_input:
62
+ x = torch.cat([x, y], dim=1) # (b, c_x + c_y, f, h, w)
63
+ clip_embdding = self.img_emb(clip_feature)
64
+ context = torch.cat([clip_embdding, context], dim=1)
65
+
66
+ x, (f, h, w) = self.patchify(x)
67
+
68
+ freqs = torch.cat([
69
+ self.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
70
+ self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
71
+ self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
72
+ ], dim=-1).reshape(f * h * w, 1, -1).to(x.device)
73
+
74
+ def create_custom_forward(module):
75
+ def custom_forward(*inputs):
76
+ return module(*inputs)
77
+ return custom_forward
78
+
79
+ # Context Parallel
80
+ x = torch.chunk(
81
+ x, get_sequence_parallel_world_size(),
82
+ dim=1)[get_sequence_parallel_rank()]
83
+
84
+ for block in self.blocks:
85
+ if self.training and use_gradient_checkpointing:
86
+ if use_gradient_checkpointing_offload:
87
+ with torch.autograd.graph.save_on_cpu():
88
+ x = torch.utils.checkpoint.checkpoint(
89
+ create_custom_forward(block),
90
+ x, context, t_mod, freqs,
91
+ use_reentrant=False,
92
+ )
93
+ else:
94
+ x = torch.utils.checkpoint.checkpoint(
95
+ create_custom_forward(block),
96
+ x, context, t_mod, freqs,
97
+ use_reentrant=False,
98
+ )
99
+ else:
100
+ x = block(x, context, t_mod, freqs)
101
+
102
+ x = self.head(x, t)
103
+
104
+ # Context Parallel
105
+ if x.shape[1] * get_sequence_parallel_world_size() < freqs.shape[0]:
106
+ x = torch.cat([x, x[:, -1:]], 1) # TODO: this may cause some bias; the best fix is to use sp_size=2
107
+ x = get_sp_group().all_gather(x, dim=1) # TODO: the size should be divided by sp_size
108
+ x = x[:, :freqs.shape[0]]
109
+
110
+ # unpatchify
111
+ x = self.unpatchify(x, (f, h, w))
112
+ return x
113
+
114
+
115
+ def usp_attn_forward(self, x, freqs):
116
+ q = self.norm_q(self.q(x))
117
+ k = self.norm_k(self.k(x))
118
+ v = self.v(x)
119
+
120
+ q = rope_apply(q, freqs, self.num_heads)
121
+ k = rope_apply(k, freqs, self.num_heads)
122
+ q = rearrange(q, "b s (n d) -> b s n d", n=self.num_heads)
123
+ k = rearrange(k, "b s (n d) -> b s n d", n=self.num_heads)
124
+ v = rearrange(v, "b s (n d) -> b s n d", n=self.num_heads)
125
+
126
+ x = xFuserLongContextAttention()(
127
+ None,
128
+ query=q,
129
+ key=k,
130
+ value=v,
131
+ )
132
+ x = x.flatten(2)
133
+
134
+ return self.o(x)
OmniAvatar/models/audio_pack.py ADDED
@@ -0,0 +1,40 @@
1
+ import torch
2
+ from typing import Tuple, Union
3
+ import torch
4
+ from einops import rearrange
5
+ from torch import nn
6
+
7
+
8
+ def make_triple(value: Union[int, Tuple[int, int, int]]) -> Tuple[int, int, int]:
9
+ value = (value,) * 3 if isinstance(value, int) else value
10
+ assert len(value) == 3
11
+ return value
12
+
13
+
14
+ class AudioPack(nn.Module):
15
+ def __init__(
16
+ self,
17
+ in_channels: int,
18
+ patch_size: Union[int, Tuple[int, int, int]],
19
+ dim: int,
20
+ layernorm=False,
21
+ ):
22
+ super().__init__()
23
+ t, h, w = make_triple(patch_size)
24
+ self.patch_size = t, h, w
25
+ self.proj = nn.Linear(in_channels * t * h * w, dim)
26
+ if layernorm:
27
+ self.norm_out = nn.LayerNorm(dim)
28
+ else:
29
+ self.norm_out = None
30
+
31
+ def forward(
32
+ self,
33
+ vid: torch.Tensor,
34
+ ) -> torch.Tensor:
35
+ t, h, w = self.patch_size
36
+ vid = rearrange(vid, "b c (T t) (H h) (W w) -> b T H W (t h w c)", t=t, h=h, w=w)
37
+ vid = self.proj(vid)
38
+ if self.norm_out is not None:
39
+ vid = self.norm_out(vid)
40
+ return vid
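
A worked shape example for AudioPack (illustrative sizes only): a (4, 4, 4) patch over a 5-D feature map collapses each patch into one token before the linear projection.

    # illustrative sizes; shapes follow the rearrange pattern "b c (T t) (H h) (W w) -> b T H W (t h w c)"
    import torch
    from OmniAvatar.models.audio_pack import AudioPack

    pack = AudioPack(in_channels=32, patch_size=(4, 4, 4), dim=512, layernorm=True)
    feats = torch.randn(1, 32, 8, 16, 16)        # (b, c, T*t, H*h, W*w)
    print(pack(feats).shape)                     # torch.Size([1, 2, 4, 4, 512])
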
OmniAvatar/models/model_manager.py ADDED
@@ -0,0 +1,432 @@
1
+ import os, torch, json, importlib
2
+ from typing import List
3
+ import torch.nn as nn
4
+ from ..configs.model_config import model_loader_configs
5
+ from ..utils.io_utils import load_state_dict, init_weights_on_device, hash_state_dict_keys, split_state_dict_with_prefix, smart_load_weights
6
+
7
+ class GeneralLoRAFromPeft:
8
+
9
+ def get_name_dict(self, lora_state_dict):
10
+ lora_name_dict = {}
11
+ for key in lora_state_dict:
12
+ if ".lora_B." not in key:
13
+ continue
14
+ keys = key.split(".")
15
+ if len(keys) > keys.index("lora_B") + 2:
16
+ keys.pop(keys.index("lora_B") + 1)
17
+ keys.pop(keys.index("lora_B"))
18
+ if keys[0] == "diffusion_model":
19
+ keys.pop(0)
20
+ target_name = ".".join(keys)
21
+ lora_name_dict[target_name] = (key, key.replace(".lora_B.", ".lora_A."))
22
+ return lora_name_dict
23
+
24
+
25
+ def match(self, model: torch.nn.Module, state_dict_lora):
26
+ lora_name_dict = self.get_name_dict(state_dict_lora)
27
+ model_name_dict = {name: None for name, _ in model.named_parameters()}
28
+ matched_num = sum([i in model_name_dict for i in lora_name_dict])
29
+ if matched_num == len(lora_name_dict):
30
+ return "", ""
31
+ else:
32
+ return None
33
+
34
+
35
+ def fetch_device_and_dtype(self, state_dict):
36
+ device, dtype = None, None
37
+ for name, param in state_dict.items():
38
+ device, dtype = param.device, param.dtype
39
+ break
40
+ computation_device = device
41
+ computation_dtype = dtype
42
+ if computation_device == torch.device("cpu"):
43
+ if torch.cuda.is_available():
44
+ computation_device = torch.device("cuda")
45
+ if computation_dtype == torch.float8_e4m3fn:
46
+ computation_dtype = torch.float32
47
+ return device, dtype, computation_device, computation_dtype
48
+
49
+
50
+ def load(self, model, state_dict_lora, lora_prefix="", alpha=1.0, model_resource=""):
51
+ state_dict_model = model.state_dict()
52
+ device, dtype, computation_device, computation_dtype = self.fetch_device_and_dtype(state_dict_model)
53
+ lora_name_dict = self.get_name_dict(state_dict_lora)
54
+ for name in lora_name_dict:
55
+ weight_up = state_dict_lora[lora_name_dict[name][0]].to(device=computation_device, dtype=computation_dtype)
56
+ weight_down = state_dict_lora[lora_name_dict[name][1]].to(device=computation_device, dtype=computation_dtype)
57
+ if len(weight_up.shape) == 4:
58
+ weight_up = weight_up.squeeze(3).squeeze(2)
59
+ weight_down = weight_down.squeeze(3).squeeze(2)
60
+ weight_lora = alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
61
+ else:
62
+ weight_lora = alpha * torch.mm(weight_up, weight_down)
63
+ weight_model = state_dict_model[name].to(device=computation_device, dtype=computation_dtype)
64
+ weight_patched = weight_model + weight_lora
65
+ state_dict_model[name] = weight_patched.to(device=device, dtype=dtype)
66
+ print(f" {len(lora_name_dict)} tensors are updated.")
67
+ model.load_state_dict(state_dict_model)
68
+
69
+
70
+ def load_model_from_single_file(state_dict, model_names, model_classes, model_resource, torch_dtype, device, infer):
71
+ loaded_model_names, loaded_models = [], []
72
+ for model_name, model_class in zip(model_names, model_classes):
73
+ print(f" model_name: {model_name} model_class: {model_class.__name__}")
74
+ state_dict_converter = model_class.state_dict_converter()
75
+ if model_resource == "civitai":
76
+ state_dict_results = state_dict_converter.from_civitai(state_dict)
77
+ elif model_resource == "diffusers":
78
+ state_dict_results = state_dict_converter.from_diffusers(state_dict)
79
+ if isinstance(state_dict_results, tuple):
80
+ model_state_dict, extra_kwargs = state_dict_results
81
+ print(f" This model is initialized with extra kwargs: {extra_kwargs}")
82
+ else:
83
+ model_state_dict, extra_kwargs = state_dict_results, {}
84
+ torch_dtype = torch.float32 if extra_kwargs.get("upcast_to_float32", False) else torch_dtype
85
+ with init_weights_on_device():
86
+ model = model_class(**extra_kwargs)
87
+ if hasattr(model, "eval"):
88
+ model = model.eval()
89
+ if not infer: # only initialize weights when training
90
+ model = model.to_empty(device=torch.device("cuda"))
91
+ for name, param in model.named_parameters():
92
+ if param.dim() > 1: # usually only weight matrices, not biases, are initialized
93
+ nn.init.xavier_uniform_(param, gain=0.05)
94
+ else:
95
+ nn.init.zeros_(param)
96
+ else:
97
+ model = model.to_empty(device=device)
98
+ model, _, _ = smart_load_weights(model, model_state_dict)
99
+ # model.load_state_dict(model_state_dict, assign=True, strict=False)
100
+ model = model.to(dtype=torch_dtype, device=device)
101
+ loaded_model_names.append(model_name)
102
+ loaded_models.append(model)
103
+ return loaded_model_names, loaded_models
104
+
105
+
106
+ def load_model_from_huggingface_folder(file_path, model_names, model_classes, torch_dtype, device):
107
+ loaded_model_names, loaded_models = [], []
108
+ for model_name, model_class in zip(model_names, model_classes):
109
+ if torch_dtype in [torch.float32, torch.float16, torch.bfloat16]:
110
+ model = model_class.from_pretrained(file_path, torch_dtype=torch_dtype).eval()
111
+ else:
112
+ model = model_class.from_pretrained(file_path).eval().to(dtype=torch_dtype)
113
+ if torch_dtype == torch.float16 and hasattr(model, "half"):
114
+ model = model.half()
115
+ try:
116
+ model = model.to(device=device)
117
+ except:
118
+ pass
119
+ loaded_model_names.append(model_name)
120
+ loaded_models.append(model)
121
+ return loaded_model_names, loaded_models
122
+
123
+
124
+ def load_single_patch_model_from_single_file(state_dict, model_name, model_class, base_model, extra_kwargs, torch_dtype, device):
125
+ print(f" model_name: {model_name} model_class: {model_class.__name__} extra_kwargs: {extra_kwargs}")
126
+ base_state_dict = base_model.state_dict()
127
+ base_model.to("cpu")
128
+ del base_model
129
+ model = model_class(**extra_kwargs)
130
+ model.load_state_dict(base_state_dict, strict=False)
131
+ model.load_state_dict(state_dict, strict=False)
132
+ model.to(dtype=torch_dtype, device=device)
133
+ return model
134
+
135
+
136
+ def load_patch_model_from_single_file(state_dict, model_names, model_classes, extra_kwargs, model_manager, torch_dtype, device):
137
+ loaded_model_names, loaded_models = [], []
138
+ for model_name, model_class in zip(model_names, model_classes):
139
+ while True:
140
+ for model_id in range(len(model_manager.model)):
141
+ base_model_name = model_manager.model_name[model_id]
142
+ if base_model_name == model_name:
143
+ base_model_path = model_manager.model_path[model_id]
144
+ base_model = model_manager.model[model_id]
145
+ print(f" Adding patch model to {base_model_name} ({base_model_path})")
146
+ patched_model = load_single_patch_model_from_single_file(
147
+ state_dict, model_name, model_class, base_model, extra_kwargs, torch_dtype, device)
148
+ loaded_model_names.append(base_model_name)
149
+ loaded_models.append(patched_model)
150
+ model_manager.model.pop(model_id)
151
+ model_manager.model_path.pop(model_id)
152
+ model_manager.model_name.pop(model_id)
153
+ break
154
+ else:
155
+ break
156
+ return loaded_model_names, loaded_models
157
+
158
+
159
+
160
+ class ModelDetectorTemplate:
161
+ def __init__(self):
162
+ pass
163
+
164
+ def match(self, file_path="", state_dict={}):
165
+ return False
166
+
167
+ def load(self, file_path="", state_dict={}, device="cuda", torch_dtype=torch.float16, **kwargs):
168
+ return [], []
169
+
170
+
171
+
172
+ class ModelDetectorFromSingleFile:
173
+ def __init__(self, model_loader_configs=[]):
174
+ self.keys_hash_with_shape_dict = {}
175
+ self.keys_hash_dict = {}
176
+ for metadata in model_loader_configs:
177
+ self.add_model_metadata(*metadata)
178
+
179
+
180
+ def add_model_metadata(self, keys_hash, keys_hash_with_shape, model_names, model_classes, model_resource):
181
+ self.keys_hash_with_shape_dict[keys_hash_with_shape] = (model_names, model_classes, model_resource)
182
+ if keys_hash is not None:
183
+ self.keys_hash_dict[keys_hash] = (model_names, model_classes, model_resource)
184
+
185
+
186
+ def match(self, file_path="", state_dict={}):
187
+ if isinstance(file_path, str) and os.path.isdir(file_path):
188
+ return False
189
+ if len(state_dict) == 0:
190
+ state_dict = load_state_dict(file_path)
191
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
192
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
193
+ return True
194
+ keys_hash = hash_state_dict_keys(state_dict, with_shape=False)
195
+ if keys_hash in self.keys_hash_dict:
196
+ return True
197
+ return False
198
+
199
+
200
+ def load(self, file_path="", state_dict={}, device="cuda", torch_dtype=torch.float16, infer=False, **kwargs):
201
+ if len(state_dict) == 0:
202
+ state_dict = load_state_dict(file_path)
203
+
204
+ # Load models with strict matching
205
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
206
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
207
+ model_names, model_classes, model_resource = self.keys_hash_with_shape_dict[keys_hash_with_shape]
208
+ loaded_model_names, loaded_models = load_model_from_single_file(state_dict, model_names, model_classes, model_resource, torch_dtype, device, infer)
209
+ return loaded_model_names, loaded_models
210
+
211
+ # Load models without strict matching
212
+ # (the shape of parameters may be inconsistent, and the state_dict_converter will modify the model architecture)
213
+ keys_hash = hash_state_dict_keys(state_dict, with_shape=False)
214
+ if keys_hash in self.keys_hash_dict:
215
+ model_names, model_classes, model_resource = self.keys_hash_dict[keys_hash]
216
+ loaded_model_names, loaded_models = load_model_from_single_file(state_dict, model_names, model_classes, model_resource, torch_dtype, device, infer)
217
+ return loaded_model_names, loaded_models
218
+
219
+ return [], [] # no matching hash found
220
+
221
+
222
+
223
+ class ModelDetectorFromSplitedSingleFile(ModelDetectorFromSingleFile):
224
+ def __init__(self, model_loader_configs=[]):
225
+ super().__init__(model_loader_configs)
226
+
227
+
228
+ def match(self, file_path="", state_dict={}):
229
+ if isinstance(file_path, str) and os.path.isdir(file_path):
230
+ return False
231
+ if len(state_dict) == 0:
232
+ state_dict = load_state_dict(file_path)
233
+ splited_state_dict = split_state_dict_with_prefix(state_dict)
234
+ for sub_state_dict in splited_state_dict:
235
+ if super().match(file_path, sub_state_dict):
236
+ return True
237
+ return False
238
+
239
+
240
+ def load(self, file_path="", state_dict={}, device="cuda", torch_dtype=torch.float16, **kwargs):
241
+ # Split the state_dict and load from each component
242
+ splited_state_dict = split_state_dict_with_prefix(state_dict)
243
+ valid_state_dict = {}
244
+ for sub_state_dict in splited_state_dict:
245
+ if super().match(file_path, sub_state_dict):
246
+ valid_state_dict.update(sub_state_dict)
247
+ if super().match(file_path, valid_state_dict):
248
+ loaded_model_names, loaded_models = super().load(file_path, valid_state_dict, device, torch_dtype)
249
+ else:
250
+ loaded_model_names, loaded_models = [], []
251
+ for sub_state_dict in splited_state_dict:
252
+ if super().match(file_path, sub_state_dict):
253
+ loaded_model_names_, loaded_models_ = super().load(file_path, valid_state_dict, device, torch_dtype)
254
+ loaded_model_names += loaded_model_names_
255
+ loaded_models += loaded_models_
256
+ return loaded_model_names, loaded_models
257
+
258
+
259
+
260
+ class ModelDetectorFromPatchedSingleFile:
261
+ def __init__(self, model_loader_configs=[]):
262
+ self.keys_hash_with_shape_dict = {}
263
+ for metadata in model_loader_configs:
264
+ self.add_model_metadata(*metadata)
265
+
266
+
267
+ def add_model_metadata(self, keys_hash_with_shape, model_name, model_class, extra_kwargs):
268
+ self.keys_hash_with_shape_dict[keys_hash_with_shape] = (model_name, model_class, extra_kwargs)
269
+
270
+
271
+ def match(self, file_path="", state_dict={}):
272
+ if not isinstance(file_path, str) or os.path.isdir(file_path):
273
+ return False
274
+ if len(state_dict) == 0:
275
+ state_dict = load_state_dict(file_path)
276
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
277
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
278
+ return True
279
+ return False
280
+
281
+
282
+ def load(self, file_path="", state_dict={}, device="cuda", torch_dtype=torch.float16, model_manager=None, **kwargs):
283
+ if len(state_dict) == 0:
284
+ state_dict = load_state_dict(file_path)
285
+
286
+ # Load models with strict matching
287
+ loaded_model_names, loaded_models = [], []
288
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
289
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
290
+ model_names, model_classes, extra_kwargs = self.keys_hash_with_shape_dict[keys_hash_with_shape]
291
+ loaded_model_names_, loaded_models_ = load_patch_model_from_single_file(
292
+ state_dict, model_names, model_classes, extra_kwargs, model_manager, torch_dtype, device)
293
+ loaded_model_names += loaded_model_names_
294
+ loaded_models += loaded_models_
295
+ return loaded_model_names, loaded_models
296
+
297
+
298
+
299
+ class ModelManager:
300
+ def __init__(
301
+ self,
302
+ torch_dtype=torch.float16,
303
+ device="cuda",
304
+ model_id_list: List = [],
305
+ downloading_priority: List = ["ModelScope", "HuggingFace"],
306
+ file_path_list: List[str] = [],
307
+ infer: bool = False
308
+ ):
309
+ self.torch_dtype = torch_dtype
310
+ self.device = device
311
+ self.model = []
312
+ self.model_path = []
313
+ self.model_name = []
314
+ self.infer = infer
315
+ downloaded_files = []
316
+ self.model_detector = [
317
+ ModelDetectorFromSingleFile(model_loader_configs),
318
+ ModelDetectorFromSplitedSingleFile(model_loader_configs),
319
+ ]
320
+ self.load_models(downloaded_files + file_path_list)
321
+
322
+ def load_lora(self, file_path="", state_dict={}, lora_alpha=1.0):
323
+ if isinstance(file_path, list):
324
+ for file_path_ in file_path:
325
+ self.load_lora(file_path_, state_dict=state_dict, lora_alpha=lora_alpha)
326
+ else:
327
+ print(f"Loading LoRA models from file: {file_path}")
328
+ is_loaded = False
329
+ if len(state_dict) == 0:
330
+ state_dict = load_state_dict(file_path)
331
+ for model_name, model, model_path in zip(self.model_name, self.model, self.model_path):
332
+ lora = GeneralLoRAFromPeft()
333
+ match_results = lora.match(model, state_dict)
334
+ if match_results is not None:
335
+ print(f" Adding LoRA to {model_name} ({model_path}).")
336
+ lora_prefix, model_resource = match_results
337
+ lora.load(model, state_dict, lora_prefix, alpha=lora_alpha, model_resource=model_resource)
338
+
339
+
340
+
341
+ def load_model_from_single_file(self, file_path="", state_dict={}, model_names=[], model_classes=[], model_resource=None):
342
+ print(f"Loading models from file: {file_path}")
343
+ if len(state_dict) == 0:
344
+ state_dict = load_state_dict(file_path)
345
+ model_names, models = load_model_from_single_file(state_dict, model_names, model_classes, model_resource, self.torch_dtype, self.device, self.infer)
346
+ for model_name, model in zip(model_names, models):
347
+ self.model.append(model)
348
+ self.model_path.append(file_path)
349
+ self.model_name.append(model_name)
350
+ print(f" The following models are loaded: {model_names}.")
351
+
352
+
353
+ def load_model_from_huggingface_folder(self, file_path="", model_names=[], model_classes=[]):
354
+ print(f"Loading models from folder: {file_path}")
355
+ model_names, models = load_model_from_huggingface_folder(file_path, model_names, model_classes, self.torch_dtype, self.device)
356
+ for model_name, model in zip(model_names, models):
357
+ self.model.append(model)
358
+ self.model_path.append(file_path)
359
+ self.model_name.append(model_name)
360
+ print(f" The following models are loaded: {model_names}.")
361
+
362
+
363
+ def load_patch_model_from_single_file(self, file_path="", state_dict={}, model_names=[], model_classes=[], extra_kwargs={}):
364
+ print(f"Loading patch models from file: {file_path}")
365
+ model_names, models = load_patch_model_from_single_file(
366
+ state_dict, model_names, model_classes, extra_kwargs, self, self.torch_dtype, self.device)
367
+ for model_name, model in zip(model_names, models):
368
+ self.model.append(model)
369
+ self.model_path.append(file_path)
370
+ self.model_name.append(model_name)
371
+ print(f" The following patched models are loaded: {model_names}.")
372
+
373
+ def load_model(self, file_path, model_names=None, device=None, torch_dtype=None):
374
+ print(f"Loading models from: {file_path}")
375
+ if device is None: device = self.device
376
+ if torch_dtype is None: torch_dtype = self.torch_dtype
377
+ if isinstance(file_path, list):
378
+ state_dict = {}
379
+ for path in file_path:
380
+ state_dict.update(load_state_dict(path))
381
+ elif os.path.isfile(file_path):
382
+ state_dict = load_state_dict(file_path)
383
+ else:
384
+ state_dict = None
385
+ for model_detector in self.model_detector:
386
+ if model_detector.match(file_path, state_dict):
387
+ model_names, models = model_detector.load(
388
+ file_path, state_dict,
389
+ device=device, torch_dtype=torch_dtype,
390
+ allowed_model_names=model_names, model_manager=self, infer=self.infer
391
+ )
392
+ for model_name, model in zip(model_names, models):
393
+ self.model.append(model)
394
+ self.model_path.append(file_path)
395
+ self.model_name.append(model_name)
396
+ print(f" The following models are loaded: {model_names}.")
397
+ break
398
+ else:
399
+ print(f" We cannot detect the model type. No models are loaded.")
400
+
401
+
402
+ def load_models(self, file_path_list, model_names=None, device=None, torch_dtype=None):
403
+ for file_path in file_path_list:
404
+ self.load_model(file_path, model_names, device=device, torch_dtype=torch_dtype)
405
+
406
+
407
+ def fetch_model(self, model_name, file_path=None, require_model_path=False):
408
+ fetched_models = []
409
+ fetched_model_paths = []
410
+ for model, model_path, model_name_ in zip(self.model, self.model_path, self.model_name):
411
+ if file_path is not None and file_path != model_path:
412
+ continue
413
+ if model_name == model_name_:
414
+ fetched_models.append(model)
415
+ fetched_model_paths.append(model_path)
416
+ if len(fetched_models) == 0:
417
+ print(f"No {model_name} models available.")
418
+ return None
419
+ if len(fetched_models) == 1:
420
+ print(f"Using {model_name} from {fetched_model_paths[0]}.")
421
+ else:
422
+ print(f"More than one {model_name} models are loaded in model manager: {fetched_model_paths}. Using {model_name} from {fetched_model_paths[0]}.")
423
+ if require_model_path:
424
+ return fetched_models[0], fetched_model_paths[0]
425
+ else:
426
+ return fetched_models[0]
427
+
428
+
429
+ def to(self, device):
430
+ for model in self.model:
431
+ model.to(device)
432
+
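
The LoRA path above (GeneralLoRAFromPeft.load) reduces to a simple weight update; a standalone sketch of that math with hypothetical tensors:

    # hypothetical tensors; mirrors weight_patched = weight_model + alpha * (lora_B @ lora_A)
    import torch

    weight = torch.randn(768, 768)       # base weight from the model's state dict
    lora_a = torch.randn(16, 768)        # ".lora_A." tensor (rank x in_features)
    lora_b = torch.randn(768, 16)        # ".lora_B." tensor (out_features x rank)
    alpha = 1.0

    patched = weight + alpha * torch.mm(lora_b, lora_a)
    print(patched.shape)                 # torch.Size([768, 768])
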
OmniAvatar/models/vsa_util.py ADDED
@@ -0,0 +1,232 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ import functools
3
+ import math
4
+ from dataclasses import dataclass
5
+ import torch
6
+ from vsa import video_sparse_attn
7
+ from typing import Any
8
+
9
+ VSA_TILE_SIZE = (4, 4, 4)
10
+
11
+
12
+ @functools.lru_cache(maxsize=10)
13
+ def get_tile_partition_indices(
14
+ dit_seq_shape: tuple[int, int, int],
15
+ tile_size: tuple[int, int, int],
16
+ device: torch.device,
17
+ ) -> torch.LongTensor:
18
+ T, H, W = dit_seq_shape
19
+ ts, hs, ws = tile_size
20
+ indices = torch.arange(T * H * W, device=device,
21
+ dtype=torch.long).reshape(T, H, W)
22
+ ls = []
23
+ for t in range(math.ceil(T / ts)):
24
+ for h in range(math.ceil(H / hs)):
25
+ for w in range(math.ceil(W / ws)):
26
+ ls.append(indices[t * ts:min(t * ts + ts, T),
27
+ h * hs:min(h * hs + hs, H),
28
+ w * ws:min(w * ws + ws, W)].flatten())
29
+ index = torch.cat(ls, dim=0)
30
+ return index
31
+
32
+
33
+ @functools.lru_cache(maxsize=10)
34
+ def get_reverse_tile_partition_indices(
35
+ dit_seq_shape: tuple[int, int, int],
36
+ tile_size: tuple[int, int, int],
37
+ device: torch.device,
38
+ ) -> torch.LongTensor:
39
+ return torch.argsort(
40
+ get_tile_partition_indices(dit_seq_shape, tile_size, device))
41
+
42
+
43
+ @functools.lru_cache(maxsize=10)
44
+ def construct_variable_block_sizes(
45
+ dit_seq_shape: tuple[int, int, int],
46
+ num_tiles: tuple[int, int, int],
47
+ device: torch.device,
48
+ ) -> torch.LongTensor:
49
+ """
50
+ Compute the number of valid (non-padded) tokens inside every
51
+ (ts_t × ts_h × ts_w) tile after padding -- flattened in the order
52
+ (t-tile, h-tile, w-tile) that `rearrange` uses.
53
+
54
+ Returns
55
+ -------
56
+ torch.LongTensor # shape: [∏ full_window_size]
57
+ """
58
+ # unpack
59
+ t, h, w = dit_seq_shape
60
+ ts_t, ts_h, ts_w = VSA_TILE_SIZE
61
+ n_t, n_h, n_w = num_tiles
62
+
63
+ def _sizes(dim_len: int, tile: int, n_tiles: int) -> torch.LongTensor:
64
+ """Vector with the size of each tile along one dimension."""
65
+ sizes = torch.full((n_tiles, ), tile, dtype=torch.int, device=device)
66
+ # size of last (possibly partial) tile
67
+ remainder = dim_len - (n_tiles - 1) * tile
68
+ sizes[-1] = remainder if remainder > 0 else tile
69
+ return sizes
70
+
71
+ t_sizes = _sizes(t, ts_t, n_t) # [n_t]
72
+ h_sizes = _sizes(h, ts_h, n_h) # [n_h]
73
+ w_sizes = _sizes(w, ts_w, n_w) # [n_w]
74
+
75
+ # broadcast‑multiply to get voxels per tile, then flatten
76
+ block_sizes = (
77
+ t_sizes[:, None, None] # [n_t, 1, 1]
78
+ * h_sizes[None, :, None] # [1, n_h, 1]
79
+ * w_sizes[None, None, :] # [1, 1, n_w]
80
+ ).reshape(-1) # [n_t * n_h * n_w]
81
+
82
+ return block_sizes
83
+
84
+
85
+ @functools.lru_cache(maxsize=10)
86
+ def get_non_pad_index(
87
+ variable_block_sizes: torch.LongTensor,
88
+ max_block_size: int,
89
+ ):
90
+ n_win = variable_block_sizes.shape[0]
91
+ device = variable_block_sizes.device
92
+ starts_pad = torch.arange(n_win, device=device) * max_block_size
93
+ index_pad = starts_pad[:, None] + torch.arange(max_block_size,
94
+ device=device)[None, :]
95
+ index_mask = torch.arange(
96
+ max_block_size, device=device)[None, :] < variable_block_sizes[:, None]
97
+ return index_pad[index_mask]
98
+
99
+
100
+
101
+ @dataclass
102
+ class VideoSparseAttentionMetadata():
103
+ current_timestep: int
104
+ dit_seq_shape: list[int]
105
+ VSA_sparsity: float
106
+ num_tiles: list[int]
107
+ total_seq_length: int
108
+ tile_partition_indices: torch.LongTensor
109
+ reverse_tile_partition_indices: torch.LongTensor
110
+ variable_block_sizes: torch.LongTensor
111
+ non_pad_index: torch.LongTensor
112
+
113
+
114
+ def build(
115
+ current_timestep: int,
116
+ raw_latent_shape: tuple[int, int, int],
117
+ patch_size: tuple[int, int, int],
118
+ VSA_sparsity: float,
119
+ device: torch.device,
120
+ **kwargs: dict[str, Any],
121
+ ) -> VideoSparseAttentionMetadata:
122
+ patch_size = patch_size
123
+ dit_seq_shape = (raw_latent_shape[0] // patch_size[0],
124
+ raw_latent_shape[1] // patch_size[1],
125
+ raw_latent_shape[2] // patch_size[2])
126
+
127
+ num_tiles = (math.ceil(dit_seq_shape[0] / VSA_TILE_SIZE[0]),
128
+ math.ceil(dit_seq_shape[1] / VSA_TILE_SIZE[1]),
129
+ math.ceil(dit_seq_shape[2] / VSA_TILE_SIZE[2]))
130
+ total_seq_length = math.prod(dit_seq_shape)
131
+
132
+ tile_partition_indices = get_tile_partition_indices(
133
+ dit_seq_shape, VSA_TILE_SIZE, device)
134
+ reverse_tile_partition_indices = get_reverse_tile_partition_indices(
135
+ dit_seq_shape, VSA_TILE_SIZE, device)
136
+ variable_block_sizes = construct_variable_block_sizes(
137
+ dit_seq_shape, num_tiles, device)
138
+ non_pad_index = get_non_pad_index(variable_block_sizes,
139
+ math.prod(VSA_TILE_SIZE))
140
+
141
+ return VideoSparseAttentionMetadata(
142
+ current_timestep=current_timestep,
143
+ dit_seq_shape=dit_seq_shape, # type: ignore
144
+ VSA_sparsity=VSA_sparsity, # type: ignore
145
+ num_tiles=num_tiles, # type: ignore
146
+ total_seq_length=total_seq_length, # type: ignore
147
+ tile_partition_indices=tile_partition_indices, # type: ignore
148
+ reverse_tile_partition_indices=reverse_tile_partition_indices,
149
+ variable_block_sizes=variable_block_sizes,
150
+ non_pad_index=non_pad_index)
151
+
152
+
153
+
154
+ class VideoSparseAttentionImpl():
155
+
156
+ def __init__(
157
+ self,
158
+ num_heads: int,
159
+ head_size: int,
160
+ causal: bool,
161
+ softmax_scale: float,
162
+ num_kv_heads: int | None = None,
163
+ prefix: str = "",
164
+ **extra_impl_args,
165
+ ) -> None:
166
+ self.prefix = prefix
167
+
168
+ def tile(self, x: torch.Tensor, num_tiles: list[int],
169
+ tile_partition_indices: torch.LongTensor,
170
+ non_pad_index: torch.LongTensor) -> torch.Tensor:
171
+ t_padded_size = num_tiles[0] * VSA_TILE_SIZE[0]
172
+ h_padded_size = num_tiles[1] * VSA_TILE_SIZE[1]
173
+ w_padded_size = num_tiles[2] * VSA_TILE_SIZE[2]
174
+
175
+ x_padded = torch.zeros(
176
+ (x.shape[0], t_padded_size * h_padded_size * w_padded_size,
177
+ x.shape[-2], x.shape[-1]),
178
+ device=x.device,
179
+ dtype=x.dtype)
180
+ x_padded[:, non_pad_index] = x[:, tile_partition_indices]
181
+ return x_padded
182
+
183
+ def untile(self, x: torch.Tensor,
184
+ reverse_tile_partition_indices: torch.LongTensor,
185
+ non_pad_index: torch.LongTensor) -> torch.Tensor:
186
+ x = x[:, non_pad_index][:, reverse_tile_partition_indices]
187
+ return x
188
+
189
+ def preprocess_qkv(
190
+ self,
191
+ qkv: torch.Tensor,
192
+ attn_metadata: VideoSparseAttentionMetadata,
193
+ ) -> torch.Tensor:
194
+ return self.tile(qkv, attn_metadata.num_tiles,
195
+ attn_metadata.tile_partition_indices,
196
+ attn_metadata.non_pad_index)
197
+
198
+ def postprocess_output(
199
+ self,
200
+ output: torch.Tensor,
201
+ attn_metadata: VideoSparseAttentionMetadata,
202
+ ) -> torch.Tensor:
203
+ return self.untile(output, attn_metadata.reverse_tile_partition_indices,
204
+ attn_metadata.non_pad_index)
205
+
206
+ def forward( # type: ignore[override]
207
+ self,
208
+ query: torch.Tensor,
209
+ key: torch.Tensor,
210
+ value: torch.Tensor,
211
+ attn_metadata: VideoSparseAttentionMetadata,
212
+ ) -> torch.Tensor:
213
+ query = query.transpose(1, 2).contiguous()
214
+ key = key.transpose(1, 2).contiguous()
215
+ value = value.transpose(1, 2).contiguous()
216
+
217
+ VSA_sparsity = attn_metadata.VSA_sparsity
218
+
219
+ cur_topk = math.ceil(
220
+ (1 - VSA_sparsity) *
221
+ (attn_metadata.total_seq_length / math.prod(VSA_TILE_SIZE)))
222
+
223
+ hidden_states = video_sparse_attn(
224
+ query,
225
+ key,
226
+ value,
227
+ variable_block_sizes=attn_metadata.variable_block_sizes,
228
+ topk=cur_topk,
229
+ block_size=VSA_TILE_SIZE,
230
+ compress_attn_weight=None).transpose(1, 2)
231
+
232
+ return hidden_states
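
For reference, a minimal standalone sketch of how the top-k tile budget computed in forward() follows from the sparsity setting; the (4, 4, 4) tile size is only an assumption standing in for VSA_TILE_SIZE, which is defined elsewhere in vsa_util.py.

import math

def topk_tiles(dit_seq_shape, sparsity, tile_size=(4, 4, 4)):
    # keep (1 - sparsity) of the tiles, rounded up, mirroring cur_topk above
    total_seq_length = math.prod(dit_seq_shape)
    return math.ceil((1 - sparsity) * (total_seq_length / math.prod(tile_size)))

# e.g. a 16 x 32 x 32 token grid at 87.5% sparsity attends to 32 of 256 tiles
assert topk_tiles((16, 32, 32), 0.875) == 32
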
OmniAvatar/models/wan_video_dit.py ADDED
@@ -0,0 +1,607 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import math
5
+ from typing import Tuple, Optional
6
+ from einops import rearrange
7
+ from ..utils.io_utils import hash_state_dict_keys
8
+ from .audio_pack import AudioPack
9
+ from ..utils.args_config import args
10
+
11
+ if args.sp_size > 1:
12
+ # Context Parallel
13
+ from xfuser.core.distributed import (get_sequence_parallel_rank,
14
+ get_sequence_parallel_world_size,
15
+ get_sp_group)
16
+
17
+
18
+ try:
19
+ import flash_attn_interface
20
+ print('using flash_attn_interface')
21
+ FLASH_ATTN_3_AVAILABLE = True
22
+ except ModuleNotFoundError:
23
+ FLASH_ATTN_3_AVAILABLE = False
24
+
25
+ try:
26
+ import flash_attn
27
+ print('using flash_attn')
28
+ FLASH_ATTN_2_AVAILABLE = True
29
+ except ModuleNotFoundError:
30
+ FLASH_ATTN_2_AVAILABLE = False
31
+
32
+ try:
33
+ from sageattention import sageattn
34
+ print('using sageattention')
35
+ SAGE_ATTN_AVAILABLE = True
36
+ except ModuleNotFoundError:
37
+ SAGE_ATTN_AVAILABLE = False
38
+
39
+
40
+ def flash_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, num_heads: int, compatibility_mode=False):
41
+ if compatibility_mode:
42
+ q = rearrange(q, "b s (n d) -> b n s d", n=num_heads)
43
+ k = rearrange(k, "b s (n d) -> b n s d", n=num_heads)
44
+ v = rearrange(v, "b s (n d) -> b n s d", n=num_heads)
45
+ x = F.scaled_dot_product_attention(q, k, v)
46
+ x = rearrange(x, "b n s d -> b s (n d)", n=num_heads)
47
+ elif FLASH_ATTN_3_AVAILABLE:
48
+ q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
49
+ k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
50
+ v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
51
+ x = flash_attn_interface.flash_attn_func(q, k, v)
52
+ x = rearrange(x, "b s n d -> b s (n d)", n=num_heads)
53
+ elif FLASH_ATTN_2_AVAILABLE:
54
+ q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
55
+ k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
56
+ v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
57
+ x = flash_attn.flash_attn_func(q, k, v)
58
+ x = rearrange(x, "b s n d -> b s (n d)", n=num_heads)
59
+ elif SAGE_ATTN_AVAILABLE:
60
+ q = rearrange(q, "b s (n d) -> b n s d", n=num_heads)
61
+ k = rearrange(k, "b s (n d) -> b n s d", n=num_heads)
62
+ v = rearrange(v, "b s (n d) -> b n s d", n=num_heads)
63
+ x = sageattn(q, k, v)
64
+ x = rearrange(x, "b n s d -> b s (n d)", n=num_heads)
65
+ else:
66
+ q = rearrange(q, "b s (n d) -> b n s d", n=num_heads)
67
+ k = rearrange(k, "b s (n d) -> b n s d", n=num_heads)
68
+ v = rearrange(v, "b s (n d) -> b n s d", n=num_heads)
69
+ x = F.scaled_dot_product_attention(q, k, v)
70
+ x = rearrange(x, "b n s d -> b s (n d)", n=num_heads)
71
+ return x
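
A small standalone sketch of the layout flash_attention() expects: q/k/v packed as (batch, seq, num_heads * head_dim). It reproduces only the pure-PyTorch fallback branch; the flash-attn / SageAttention paths are drop-in replacements when those packages are installed.

import torch
import torch.nn.functional as F
from einops import rearrange

def sdpa_fallback(q, k, v, num_heads):
    # unpack heads, run scaled dot-product attention, repack
    q = rearrange(q, "b s (n d) -> b n s d", n=num_heads)
    k = rearrange(k, "b s (n d) -> b n s d", n=num_heads)
    v = rearrange(v, "b s (n d) -> b n s d", n=num_heads)
    x = F.scaled_dot_product_attention(q, k, v)
    return rearrange(x, "b n s d -> b s (n d)", n=num_heads)

q = k = v = torch.randn(1, 128, 12 * 64)          # 12 heads of dim 64
assert sdpa_fallback(q, k, v, num_heads=12).shape == (1, 128, 12 * 64)
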
72
+
73
+ def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor):
74
+ return (x * (1 + scale) + shift)
75
+
76
+
77
+ def sinusoidal_embedding_1d(dim, position):
78
+ sinusoid = torch.outer(position.type(torch.float64), torch.pow(
79
+ 10000, -torch.arange(dim//2, dtype=torch.float64, device=position.device).div(dim//2)))
80
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
81
+ return x.to(position.dtype)
82
+
83
+ def precompute_freqs_cos_sin(dim: int, end: int = 1024, theta: float = 10000.0):
84
+ # dim is the per-head dim
85
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float64)[:(dim//2)] / dim))
86
+ angles = torch.outer(torch.arange(end, dtype=torch.float64, device=freqs.device), freqs) # [end, dim//2]
87
+ return angles.cos().to(torch.float32), angles.sin().to(torch.float32)
88
+
89
+ def precompute_freqs_cos_sin_3d(dim: int, end: int = 1024, theta: float = 10000.0):
90
+ fdim = dim - 2 * (dim // 3)
91
+ hdim = dim // 3
92
+ wdim = dim // 3
93
+ fcos, fsin = precompute_freqs_cos_sin(fdim, end, theta)
94
+ hcos, hsin = precompute_freqs_cos_sin(hdim, end, theta)
95
+ wcos, wsin = precompute_freqs_cos_sin(wdim, end, theta)
96
+ return (fcos, hcos, wcos), (fsin, hsin, wsin)
97
+
98
+ def rope_apply_real(x, cos, sin, num_heads):
99
+ # x: [b, s, n*head_dim]
100
+ x = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
101
+ # split last dim into pairs
102
+ d2 = x.shape[-1] // 2
103
+ x = x.reshape(*x.shape[:-1], d2, 2) # [..., d/2, 2]
104
+ x1, x2 = x[..., 0], x[..., 1] # two real halves
105
+
106
+ # cos/sin are shaped [seq, 1, d/2]; broadcast across batch/heads
107
+ rot_x1 = x1 * cos - x2 * sin
108
+ rot_x2 = x1 * sin + x2 * cos
109
+ out = torch.stack((rot_x1, rot_x2), dim=-1).reshape(*x.shape[:-2], -1)
110
+
111
+ return rearrange(out, "b s n d -> b s (n d)")
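
A quick sanity sketch of the 3D rotary split above: the per-head dimension is divided between time (fdim) and the two spatial axes (hdim, wdim), and the halved cos/sin tables concatenate back to head_dim // 2 (assuming, as with head_dim = 128, that each part is even).

head_dim = 128                              # e.g. 1536 / 12 or 5120 / 40
hdim = wdim = head_dim // 3                 # 42 each
fdim = head_dim - 2 * (head_dim // 3)       # 44
assert fdim + hdim + wdim == head_dim
assert fdim // 2 + hdim // 2 + wdim // 2 == head_dim // 2
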
112
+
113
+ class RMSNorm(nn.Module):
114
+ def __init__(self, dim, eps=1e-5):
115
+ super().__init__()
116
+ self.eps = eps
117
+ self.weight = nn.Parameter(torch.ones(dim))
118
+
119
+ def norm(self, x):
120
+ return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
121
+
122
+ def forward(self, x):
123
+ dtype = x.dtype
124
+ return self.norm(x.float()).to(dtype) * self.weight
125
+
126
+
127
+ class AttentionModule(nn.Module):
128
+ def __init__(self, num_heads):
129
+ super().__init__()
130
+ self.num_heads = num_heads
131
+
132
+
133
+ def forward(self, q, k, v):
134
+
135
+ x = flash_attention(q=q, k=k, v=v, num_heads=self.num_heads)
136
+
137
+ return x
138
+
139
+ class SelfAttention(nn.Module):
140
+ def __init__(self, dim: int, num_heads: int, eps: float = 1e-6):
141
+ super().__init__()
142
+ self.dim = dim
143
+ self.num_heads = num_heads
144
+ self.head_dim = dim // num_heads
145
+
146
+ self.q = nn.Linear(dim, dim)
147
+ self.k = nn.Linear(dim, dim)
148
+ self.v = nn.Linear(dim, dim)
149
+ self.o = nn.Linear(dim, dim)
150
+ self.norm_q = RMSNorm(dim, eps=eps)
151
+ self.norm_k = RMSNorm(dim, eps=eps)
152
+
153
+ self.attn = AttentionModule(self.num_heads)
154
+
155
+ def forward(self, x, freqs):
156
+
157
+ cos, sin = freqs
158
+
159
+ q = self.norm_q(self.q(x))
160
+ k = self.norm_k(self.k(x))
161
+ v = self.v(x)
162
+ # q = rope_apply(q, freqs, self.num_heads)
163
+ # k = rope_apply(k, freqs, self.num_heads)
164
+
165
+ q = rope_apply_real(q, cos, sin, self.num_heads)
166
+ k = rope_apply_real(k, cos, sin, self.num_heads)
167
+ x = self.attn(q, k, v)
168
+ return self.o(x)
169
+
170
+
171
+ class CrossAttention(nn.Module):
172
+ def __init__(self, dim: int, num_heads: int, eps: float = 1e-6, has_image_input: bool = False):
173
+ super().__init__()
174
+ self.dim = dim
175
+ self.num_heads = num_heads
176
+ self.head_dim = dim // num_heads
177
+
178
+ self.q = nn.Linear(dim, dim)
179
+ self.k = nn.Linear(dim, dim)
180
+ self.v = nn.Linear(dim, dim)
181
+ self.o = nn.Linear(dim, dim)
182
+ self.norm_q = RMSNorm(dim, eps=eps)
183
+ self.norm_k = RMSNorm(dim, eps=eps)
184
+ self.has_image_input = has_image_input
185
+ if has_image_input:
186
+ self.k_img = nn.Linear(dim, dim)
187
+ self.v_img = nn.Linear(dim, dim)
188
+ self.norm_k_img = RMSNorm(dim, eps=eps)
189
+
190
+ self.attn = AttentionModule(self.num_heads)
191
+
192
+ def forward(self, x: torch.Tensor, y: torch.Tensor):
193
+ if self.has_image_input:
194
+ img = y[:, :257]
195
+ ctx = y[:, 257:]
196
+ else:
197
+ ctx = y
198
+ q = self.norm_q(self.q(x))
199
+ k = self.norm_k(self.k(ctx))
200
+ v = self.v(ctx)
201
+ x = self.attn(q, k, v)
202
+ if self.has_image_input:
203
+ k_img = self.norm_k_img(self.k_img(img))
204
+ v_img = self.v_img(img)
205
+ y = flash_attention(q, k_img, v_img, num_heads=self.num_heads)
206
+ x = x + y
207
+ return self.o(x)
208
+
209
+
210
+ class GateModule(nn.Module):
211
+ def __init__(self,):
212
+ super().__init__()
213
+
214
+ def forward(self, x, gate, residual):
215
+ return x + gate * residual
216
+
217
+
218
+ class DiTBlock(nn.Module):
219
+ def __init__(self, has_image_input: bool, dim: int, num_heads: int, ffn_dim: int, eps: float = 1e-6):
220
+ super().__init__()
221
+ self.dim = dim
222
+ self.num_heads = num_heads
223
+ self.ffn_dim = ffn_dim
224
+
225
+ self.self_attn = SelfAttention(dim, num_heads, eps)
226
+ self.cross_attn = CrossAttention(
227
+ dim, num_heads, eps, has_image_input=has_image_input)
228
+ self.norm1 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
229
+ self.norm2 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
230
+ self.norm3 = nn.LayerNorm(dim, eps=eps)
231
+ self.ffn = nn.Sequential(nn.Linear(dim, ffn_dim), nn.GELU(
232
+ approximate='tanh'), nn.Linear(ffn_dim, dim))
233
+ self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
234
+ self.gate = GateModule()
235
+
236
+ def forward(self, x, context, t_mod, freqs):
237
+ # msa: multi-head self-attention; mlp: multi-layer perceptron
238
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
239
+ self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod).chunk(6, dim=1)
240
+ input_x = modulate(self.norm1(x), shift_msa, scale_msa)
241
+ x = self.gate(x, gate_msa, self.self_attn(input_x, freqs))
242
+ x = x + self.cross_attn(self.norm3(x), context)
243
+ input_x = modulate(self.norm2(x), shift_mlp, scale_mlp)
244
+ x = self.gate(x, gate_mlp, self.ffn(input_x))
245
+ return x
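
For clarity, a shapes-only sketch of the adaLN-style modulation used in DiTBlock: the projected timestep embedding plus the learned per-block table is split into shift/scale/gate triples for the attention and FFN branches. The sizes below are illustrative.

import torch

dim = 1536
t_mod = torch.randn(1, 6, dim)                    # time_projection output, unflattened
modulation = torch.randn(1, 6, dim) / dim**0.5    # per-block learned table
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = \
    (modulation + t_mod).chunk(6, dim=1)          # each (1, 1, dim)

x = torch.randn(1, 77, dim)
assert (x * (1 + scale_msa) + shift_msa).shape == x.shape   # what modulate() computes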
246
+
247
+ class MLP(nn.Module):
248
+ def __init__(self, in_dim, out_dim):
249
+ super().__init__()
250
+ # keep norms outside the MLP core
251
+ self.ln_in = nn.LayerNorm(in_dim)
252
+ self.fc1 = nn.Linear(in_dim, in_dim)
253
+
254
+ self.activation = nn.GELU()
255
+ self.fc2 = nn.Linear(in_dim, out_dim)
256
+ self.ln_out = nn.LayerNorm(out_dim)
257
+
258
+
259
+ def forward(self, x):
260
+ x = self.ln_in(x)
261
+ x = self.fc2(self.activation(self.fc1(x)))
262
+ x = self.ln_out(x)
263
+ return x
264
+
265
+ class Head(nn.Module):
266
+ def __init__(self, dim: int, out_dim: int, patch_size: Tuple[int, int, int], eps: float):
267
+ super().__init__()
268
+ self.dim = dim
269
+ self.patch_size = patch_size
270
+ self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
271
+ self.head = nn.Linear(dim, out_dim * math.prod(patch_size))
272
+ self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
273
+
274
+ def forward(self, x, t_mod):
275
+ shift, scale = (self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod).chunk(2, dim=1)
276
+ x = (self.head(self.norm(x) * (1 + scale) + shift))
277
+ return x
278
+
279
+
280
+
281
+ class WanModel(torch.nn.Module):
282
+ def __init__(
283
+ self,
284
+ dim: int,
285
+ in_dim: int,
286
+ ffn_dim: int,
287
+ out_dim: int,
288
+ text_dim: int,
289
+ freq_dim: int,
290
+ eps: float,
291
+ patch_size: Tuple[int, int, int],
292
+ num_heads: int,
293
+ num_layers: int,
294
+ has_image_input: bool,
295
+ audio_hidden_size: int=32,
296
+ ):
297
+ super().__init__()
298
+ self.dim = dim
299
+ self.freq_dim = freq_dim
300
+ self.has_image_input = has_image_input
301
+ self.patch_size = patch_size
302
+
303
+ self.patch_embedding = nn.Conv3d(
304
+ in_dim, dim, kernel_size=patch_size, stride=patch_size)
305
+ # nn.LayerNorm(dim)
306
+ self.text_embedding = nn.Sequential(
307
+ nn.Linear(text_dim, dim),
308
+ nn.GELU(approximate='tanh'),
309
+ nn.Linear(dim, dim)
310
+ )
311
+ self.time_embedding = nn.Sequential(
312
+ nn.Linear(freq_dim, dim),
313
+ nn.SiLU(),
314
+ nn.Linear(dim, dim)
315
+ )
316
+ self.time_projection = nn.Sequential(
317
+ nn.SiLU(), nn.Linear(dim, dim * 6))
318
+ self.blocks = nn.ModuleList([
319
+ DiTBlock(has_image_input, dim, num_heads, ffn_dim, eps)
320
+ for _ in range(num_layers)
321
+ ])
322
+ self.head = Head(dim, out_dim, patch_size, eps)
323
+ head_dim = dim // num_heads
324
+ self.freqs = precompute_freqs_cos_sin_3d(head_dim)
325
+
326
+ if has_image_input:
327
+ self.img_emb = MLP(1280, dim) # clip_feature_dim = 1280
328
+
329
+ if 'use_audio' in args:
330
+ self.use_audio = args.use_audio
331
+ else:
332
+ self.use_audio = False
333
+ if self.use_audio:
334
+ audio_input_dim = 10752
335
+ audio_out_dim = dim
336
+ self.audio_proj = AudioPack(audio_input_dim, [4, 1, 1], audio_hidden_size, layernorm=True)
337
+ self.audio_cond_projs = nn.ModuleList()
338
+ for d in range(num_layers // 2 - 1):
339
+ l = nn.Linear(audio_hidden_size, audio_out_dim)
340
+ self.audio_cond_projs.append(l)
341
+
342
+ def patchify(self, x: torch.Tensor):
343
+ grid_size = x.shape[2:]
344
+ x = rearrange(x, 'b c f h w -> b (f h w) c').contiguous()
345
+ return x, grid_size # x, grid_size: (f, h, w)
346
+
347
+ def unpatchify(self, x: torch.Tensor, grid_size: torch.Tensor):
348
+ return rearrange(
349
+ x, 'b (f h w) (x y z c) -> b c (f x) (h y) (w z)',
350
+ f=grid_size[0], h=grid_size[1], w=grid_size[2],
351
+ x=self.patch_size[0], y=self.patch_size[1], z=self.patch_size[2]
352
+ )
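
A standalone check of the unpatchify rearrangement above, using the (1, 2, 2) patch size from the published Wan configs; the token count and latent size are illustrative.

import torch
from einops import rearrange

b, out_dim, (f, h, w), (px, py, pz) = 1, 16, (8, 4, 4), (1, 2, 2)
tokens = torch.randn(b, f * h * w, out_dim * px * py * pz)   # head output
latent = rearrange(tokens, 'b (f h w) (x y z c) -> b c (f x) (h y) (w z)',
                   f=f, h=h, w=w, x=px, y=py, z=pz)
assert latent.shape == (b, out_dim, f * px, h * py, w * pz)  # (1, 16, 8, 8, 8)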
353
+
354
+ def forward(self,
355
+ x: torch.Tensor,
356
+ timestep: torch.Tensor,
357
+ context: torch.Tensor,
358
+ clip_feature: Optional[torch.Tensor] = None,
359
+ y: Optional[torch.Tensor] = None,
360
+ use_gradient_checkpointing: bool = False,
361
+ audio_emb: Optional[torch.Tensor] = None,
362
+ use_gradient_checkpointing_offload: bool = False,
363
+ tea_cache = None,
364
+ **kwargs,
365
+ ):
366
+
367
+ t = self.time_embedding(
368
+ sinusoidal_embedding_1d(self.freq_dim, timestep))
369
+ t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
370
+ context = self.text_embedding(context)
371
+ lat_h, lat_w = x.shape[-2], x.shape[-1]
372
+
373
+ if audio_emb is not None and self.use_audio: # TODO cache
374
+ audio_emb = audio_emb.permute(0, 2, 1)[:, :, :, None, None]
375
+ audio_emb = torch.cat([audio_emb[:, :, :1].repeat(1, 1, 3, 1, 1), audio_emb], 2) # 1, 768, 44, 1, 1
376
+ audio_emb = self.audio_proj(audio_emb)
377
+
378
+ audio_emb = torch.concat([audio_cond_proj(audio_emb) for audio_cond_proj in self.audio_cond_projs], 0)
379
+
380
+ x = torch.cat([x, y], dim=1)
381
+ x = self.patch_embedding(x)
382
+ x, (f, h, w) = self.patchify(x)
383
+
384
+ # freqs = torch.cat([
385
+ # self.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
386
+ # self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
387
+ # self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
388
+ # ], dim=-1).reshape(f * h * w, 1, -1).to(x.device)
389
+
390
+ (fcos, hcos, wcos), (fsin, hsin, wsin) = self.freqs
391
+ cos = torch.cat([
392
+ fcos[:f].view(f, 1, 1, -1).expand(f, h, w, -1),
393
+ hcos[:h].view(1, h, 1, -1).expand(f, h, w, -1),
394
+ wcos[:w].view(1, 1, w, -1).expand(f, h, w, -1),
395
+ ], dim=-1).reshape(f*h*w, 1, -1).to(x.device, dtype=x.dtype)
396
+ sin = torch.cat([
397
+ fsin[:f].view(f, 1, 1, -1).expand(f, h, w, -1),
398
+ hsin[:h].view(1, h, 1, -1).expand(f, h, w, -1),
399
+ wsin[:w].view(1, 1, w, -1).expand(f, h, w, -1),
400
+ ], dim=-1).reshape(f*h*w, 1, -1).to(x.device, dtype=x.dtype)
401
+ freqs = (cos, sin) # pass both
402
+
403
+ def create_custom_forward(module):
404
+ def custom_forward(*inputs):
405
+ return module(*inputs)
406
+ return custom_forward
407
+
408
+ if tea_cache is not None:
409
+ tea_cache_update = tea_cache.check(self, x, t_mod)
410
+ else:
411
+ tea_cache_update = False
412
+ ori_x_len = x.shape[1]
413
+ if tea_cache_update:
414
+ x = tea_cache.update(x)
415
+ else:
416
+ if args.sp_size > 1:
417
+ # Context Parallel
418
+ sp_size = get_sequence_parallel_world_size()
419
+ pad_size = 0
420
+ if ori_x_len % sp_size != 0:
421
+ pad_size = sp_size - ori_x_len % sp_size
422
+ x = torch.cat([x, torch.zeros_like(x[:, -1:]).repeat(1, pad_size, 1)], 1)
423
+ x = torch.chunk(x, sp_size, dim=1)[get_sequence_parallel_rank()]
424
+
425
+ if self.use_audio:
426
+ audio_emb = audio_emb.reshape(x.shape[0], audio_emb.shape[0] // x.shape[0], -1, *audio_emb.shape[2:])
427
+
428
+ for layer_i, block in enumerate(self.blocks):
429
+ # audio cond
430
+ if self.use_audio:
431
+ au_idx = None
432
+ if (layer_i <= len(self.blocks) // 2 and layer_i > 1): # < len(self.blocks) - 1:
433
+ au_idx = layer_i - 2
434
+ audio_emb_tmp = audio_emb[:, au_idx].repeat(1, 1, lat_h // 2, lat_w // 2, 1) # 1, 11, 45, 25, 128
435
+ audio_cond_tmp = self.patchify(audio_emb_tmp.permute(0, 4, 1, 2, 3))[0]
436
+ if args.sp_size > 1:
437
+ if pad_size > 0:
438
+ audio_cond_tmp = torch.cat([audio_cond_tmp, torch.zeros_like(audio_cond_tmp[:, -1:]).repeat(1, pad_size, 1)], 1)
439
+ audio_cond_tmp = torch.chunk(audio_cond_tmp, sp_size, dim=1)[get_sequence_parallel_rank()]
440
+ x = audio_cond_tmp + x
441
+
442
+ if self.training and use_gradient_checkpointing:
443
+ if use_gradient_checkpointing_offload:
444
+ with torch.autograd.graph.save_on_cpu():
445
+ x = torch.utils.checkpoint.checkpoint(
446
+ create_custom_forward(block),
447
+ x, context, t_mod, freqs,
448
+ use_reentrant=False,
449
+ )
450
+ else:
451
+ x = torch.utils.checkpoint.checkpoint(
452
+ create_custom_forward(block),
453
+ x, context, t_mod, freqs,
454
+ use_reentrant=False,
455
+ )
456
+ else:
457
+ x = block(x, context, t_mod, freqs)
458
+ if tea_cache is not None:
459
+ x_cache = get_sp_group().all_gather(x, dim=1) # TODO: the size should be divided by sp_size
460
+ x_cache = x_cache[:, :ori_x_len]
461
+ tea_cache.store(x_cache)
462
+
463
+ x = self.head(x, t)
464
+ if args.sp_size > 1:
465
+ # Context Parallel
466
+ x = get_sp_group().all_gather(x, dim=1) # TODO: the size should be divided by sp_size
467
+ x = x[:, :ori_x_len]
468
+
469
+ x = self.unpatchify(x, (f, h, w))
470
+ return x
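
A minimal sketch of the context-parallel bookkeeping above: pad the token sequence to a multiple of the sequence-parallel world size, give each rank one chunk, and drop the padding again after all_gather. split_for_rank is a hypothetical helper, not part of the repo.

import torch

def split_for_rank(x, sp_size, rank):
    # pad the sequence dim to a multiple of sp_size, then keep this rank's chunk
    ori_len = x.shape[1]
    pad = (sp_size - ori_len % sp_size) % sp_size
    if pad:
        x = torch.cat([x, torch.zeros_like(x[:, -1:]).repeat(1, pad, 1)], dim=1)
    return torch.chunk(x, sp_size, dim=1)[rank], ori_len

x = torch.randn(1, 130, 64)
shard, ori_len = split_for_rank(x, sp_size=4, rank=2)
assert shard.shape[1] == 33 and ori_len == 130    # 130 -> padded to 132 -> 4 chunks of 33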
471
+
472
+ @staticmethod
473
+ def state_dict_converter():
474
+ return WanModelStateDictConverter()
475
+
476
+
477
+ class WanModelStateDictConverter:
478
+ def __init__(self):
479
+ pass
480
+
481
+ def from_diffusers(self, state_dict):
482
+ rename_dict = {
483
+ "blocks.0.attn1.norm_k.weight": "blocks.0.self_attn.norm_k.weight",
484
+ "blocks.0.attn1.norm_q.weight": "blocks.0.self_attn.norm_q.weight",
485
+ "blocks.0.attn1.to_k.bias": "blocks.0.self_attn.k.bias",
486
+ "blocks.0.attn1.to_k.weight": "blocks.0.self_attn.k.weight",
487
+ "blocks.0.attn1.to_out.0.bias": "blocks.0.self_attn.o.bias",
488
+ "blocks.0.attn1.to_out.0.weight": "blocks.0.self_attn.o.weight",
489
+ "blocks.0.attn1.to_q.bias": "blocks.0.self_attn.q.bias",
490
+ "blocks.0.attn1.to_q.weight": "blocks.0.self_attn.q.weight",
491
+ "blocks.0.attn1.to_v.bias": "blocks.0.self_attn.v.bias",
492
+ "blocks.0.attn1.to_v.weight": "blocks.0.self_attn.v.weight",
493
+ "blocks.0.attn2.norm_k.weight": "blocks.0.cross_attn.norm_k.weight",
494
+ "blocks.0.attn2.norm_q.weight": "blocks.0.cross_attn.norm_q.weight",
495
+ "blocks.0.attn2.to_k.bias": "blocks.0.cross_attn.k.bias",
496
+ "blocks.0.attn2.to_k.weight": "blocks.0.cross_attn.k.weight",
497
+ "blocks.0.attn2.to_out.0.bias": "blocks.0.cross_attn.o.bias",
498
+ "blocks.0.attn2.to_out.0.weight": "blocks.0.cross_attn.o.weight",
499
+ "blocks.0.attn2.to_q.bias": "blocks.0.cross_attn.q.bias",
500
+ "blocks.0.attn2.to_q.weight": "blocks.0.cross_attn.q.weight",
501
+ "blocks.0.attn2.to_v.bias": "blocks.0.cross_attn.v.bias",
502
+ "blocks.0.attn2.to_v.weight": "blocks.0.cross_attn.v.weight",
503
+ "blocks.0.ffn.net.0.proj.bias": "blocks.0.ffn.0.bias",
504
+ "blocks.0.ffn.net.0.proj.weight": "blocks.0.ffn.0.weight",
505
+ "blocks.0.ffn.net.2.bias": "blocks.0.ffn.2.bias",
506
+ "blocks.0.ffn.net.2.weight": "blocks.0.ffn.2.weight",
507
+ "blocks.0.norm2.bias": "blocks.0.norm3.bias",
508
+ "blocks.0.norm2.weight": "blocks.0.norm3.weight",
509
+ "blocks.0.scale_shift_table": "blocks.0.modulation",
510
+ "condition_embedder.text_embedder.linear_1.bias": "text_embedding.0.bias",
511
+ "condition_embedder.text_embedder.linear_1.weight": "text_embedding.0.weight",
512
+ "condition_embedder.text_embedder.linear_2.bias": "text_embedding.2.bias",
513
+ "condition_embedder.text_embedder.linear_2.weight": "text_embedding.2.weight",
514
+ "condition_embedder.time_embedder.linear_1.bias": "time_embedding.0.bias",
515
+ "condition_embedder.time_embedder.linear_1.weight": "time_embedding.0.weight",
516
+ "condition_embedder.time_embedder.linear_2.bias": "time_embedding.2.bias",
517
+ "condition_embedder.time_embedder.linear_2.weight": "time_embedding.2.weight",
518
+ "condition_embedder.time_proj.bias": "time_projection.1.bias",
519
+ "condition_embedder.time_proj.weight": "time_projection.1.weight",
520
+ "patch_embedding.bias": "patch_embedding.bias",
521
+ "patch_embedding.weight": "patch_embedding.weight",
522
+ "scale_shift_table": "head.modulation",
523
+ "proj_out.bias": "head.head.bias",
524
+ "proj_out.weight": "head.head.weight",
525
+ }
526
+ state_dict_ = {}
527
+ for name, param in state_dict.items():
528
+ if name in rename_dict:
529
+ state_dict_[rename_dict[name]] = param
530
+ else:
531
+ name_ = ".".join(name.split(".")[:1] + ["0"] + name.split(".")[2:])
532
+ if name_ in rename_dict:
533
+ name_ = rename_dict[name_]
534
+ name_ = ".".join(name_.split(".")[:1] + [name.split(".")[1]] + name_.split(".")[2:])
535
+ state_dict_[name_] = param
536
+ if hash_state_dict_keys(state_dict) == "cb104773c6c2cb6df4f9529ad5c60d0b":
537
+ config = {
538
+ "model_type": "t2v",
539
+ "patch_size": (1, 2, 2),
540
+ "text_len": 512,
541
+ "in_dim": 16,
542
+ "dim": 5120,
543
+ "ffn_dim": 13824,
544
+ "freq_dim": 256,
545
+ "text_dim": 4096,
546
+ "out_dim": 16,
547
+ "num_heads": 40,
548
+ "num_layers": 40,
549
+ "window_size": (-1, -1),
550
+ "qk_norm": True,
551
+ "cross_attn_norm": True,
552
+ "eps": 1e-6,
553
+ }
554
+ else:
555
+ config = {}
556
+ return state_dict_, config
557
+
558
+ def from_civitai(self, state_dict):
559
+ if hash_state_dict_keys(state_dict) == "9269f8db9040a9d860eaca435be61814":
560
+ config = {
561
+ "has_image_input": False,
562
+ "patch_size": [1, 2, 2],
563
+ "in_dim": 16,
564
+ "dim": 1536,
565
+ "ffn_dim": 8960,
566
+ "freq_dim": 256,
567
+ "text_dim": 4096,
568
+ "out_dim": 16,
569
+ "num_heads": 12,
570
+ "num_layers": 30,
571
+ "eps": 1e-6
572
+ }
573
+ elif hash_state_dict_keys(state_dict) == "aafcfd9672c3a2456dc46e1cb6e52c70":
574
+ config = {
575
+ "has_image_input": False,
576
+ "patch_size": [1, 2, 2],
577
+ "in_dim": 16,
578
+ "dim": 5120,
579
+ "ffn_dim": 13824,
580
+ "freq_dim": 256,
581
+ "text_dim": 4096,
582
+ "out_dim": 16,
583
+ "num_heads": 40,
584
+ "num_layers": 40,
585
+ "eps": 1e-6
586
+ }
587
+ elif hash_state_dict_keys(state_dict) == "6bfcfb3b342cb286ce886889d519a77e":
588
+ config = {
589
+ "has_image_input": True,
590
+ "patch_size": [1, 2, 2],
591
+ "in_dim": 36,
592
+ "dim": 5120,
593
+ "ffn_dim": 13824,
594
+ "freq_dim": 256,
595
+ "text_dim": 4096,
596
+ "out_dim": 16,
597
+ "num_heads": 40,
598
+ "num_layers": 40,
599
+ "eps": 1e-6
600
+ }
601
+ else:
602
+ config = {}
603
+ if hasattr(args, "model_config"):
604
+ model_config = args.model_config
605
+ if model_config is not None:
606
+ config.update(model_config)
607
+ return state_dict, config
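
The key-renaming trick in from_diffusers() above only stores the "blocks.0." template; other block indices are mapped onto it and then restored. A standalone illustration with a single dictionary entry:

name = "blocks.7.attn1.to_q.weight"
rename_dict = {"blocks.0.attn1.to_q.weight": "blocks.0.self_attn.q.weight"}

parts = name.split(".")
template = ".".join(parts[:1] + ["0"] + parts[2:])     # map onto the blocks.0. template
renamed = rename_dict[template]
renamed = ".".join(renamed.split(".")[:1] + [parts[1]] + renamed.split(".")[2:])
assert renamed == "blocks.7.self_attn.q.weight"        # block index restored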
OmniAvatar/models/wan_video_text_encoder.py ADDED
@@ -0,0 +1,269 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def fp16_clamp(x):
9
+ if x.dtype == torch.float16 and torch.isinf(x).any():
10
+ clamp = torch.finfo(x.dtype).max - 1000
11
+ x = torch.clamp(x, min=-clamp, max=clamp)
12
+ return x
13
+
14
+
15
+ class GELU(nn.Module):
16
+
17
+ def forward(self, x):
18
+ return 0.5 * x * (1.0 + torch.tanh(
19
+ math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
20
+
21
+
22
+ class T5LayerNorm(nn.Module):
23
+
24
+ def __init__(self, dim, eps=1e-6):
25
+ super(T5LayerNorm, self).__init__()
26
+ self.dim = dim
27
+ self.eps = eps
28
+ self.weight = nn.Parameter(torch.ones(dim))
29
+
30
+ def forward(self, x):
31
+ x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) +
32
+ self.eps)
33
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
34
+ x = x.type_as(self.weight)
35
+ return self.weight * x
36
+
37
+
38
+ class T5Attention(nn.Module):
39
+
40
+ def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
41
+ assert dim_attn % num_heads == 0
42
+ super(T5Attention, self).__init__()
43
+ self.dim = dim
44
+ self.dim_attn = dim_attn
45
+ self.num_heads = num_heads
46
+ self.head_dim = dim_attn // num_heads
47
+
48
+ # layers
49
+ self.q = nn.Linear(dim, dim_attn, bias=False)
50
+ self.k = nn.Linear(dim, dim_attn, bias=False)
51
+ self.v = nn.Linear(dim, dim_attn, bias=False)
52
+ self.o = nn.Linear(dim_attn, dim, bias=False)
53
+ self.dropout = nn.Dropout(dropout)
54
+
55
+ def forward(self, x, context=None, mask=None, pos_bias=None):
56
+ """
57
+ x: [B, L1, C].
58
+ context: [B, L2, C] or None.
59
+ mask: [B, L2] or [B, L1, L2] or None.
60
+ """
61
+ # check inputs
62
+ context = x if context is None else context
63
+ b, n, c = x.size(0), self.num_heads, self.head_dim
64
+
65
+ # compute query, key, value
66
+ q = self.q(x).view(b, -1, n, c)
67
+ k = self.k(context).view(b, -1, n, c)
68
+ v = self.v(context).view(b, -1, n, c)
69
+
70
+ # attention bias
71
+ attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
72
+ if pos_bias is not None:
73
+ attn_bias += pos_bias
74
+ if mask is not None:
75
+ assert mask.ndim in [2, 3]
76
+ mask = mask.view(b, 1, 1,
77
+ -1) if mask.ndim == 2 else mask.unsqueeze(1)
78
+ attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)
79
+
80
+ # compute attention (T5 does not use scaling)
81
+ attn = torch.einsum('binc,bjnc->bnij', q, k) + attn_bias
82
+ attn = F.softmax(attn.float(), dim=-1).type_as(attn)
83
+ x = torch.einsum('bnij,bjnc->binc', attn, v)
84
+
85
+ # output
86
+ x = x.reshape(b, -1, n * c)
87
+ x = self.o(x)
88
+ x = self.dropout(x)
89
+ return x
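
A shapes-only sketch of the attention math above: T5-style attention omits the usual 1/sqrt(d) scaling and instead adds a relative-position bias before a float32 softmax. The sizes are illustrative.

import torch
import torch.nn.functional as F

b, n, lq, lk, c = 1, 8, 16, 16, 64
q, k, v = torch.randn(b, lq, n, c), torch.randn(b, lk, n, c), torch.randn(b, lk, n, c)
pos_bias = torch.randn(1, n, lq, lk)                      # relative-position bias

attn = torch.einsum('binc,bjnc->bnij', q, k) + pos_bias   # note: no 1/sqrt(d) scaling
attn = F.softmax(attn.float(), dim=-1).type_as(attn)
out = torch.einsum('bnij,bjnc->binc', attn, v).reshape(b, lq, n * c)
assert out.shape == (1, 16, 512)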
90
+
91
+
92
+ class T5FeedForward(nn.Module):
93
+
94
+ def __init__(self, dim, dim_ffn, dropout=0.1):
95
+ super(T5FeedForward, self).__init__()
96
+ self.dim = dim
97
+ self.dim_ffn = dim_ffn
98
+
99
+ # layers
100
+ self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
101
+ self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
102
+ self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
103
+ self.dropout = nn.Dropout(dropout)
104
+
105
+ def forward(self, x):
106
+ x = self.fc1(x) * self.gate(x)
107
+ x = self.dropout(x)
108
+ x = self.fc2(x)
109
+ x = self.dropout(x)
110
+ return x
111
+
112
+
113
+ class T5SelfAttention(nn.Module):
114
+
115
+ def __init__(self,
116
+ dim,
117
+ dim_attn,
118
+ dim_ffn,
119
+ num_heads,
120
+ num_buckets,
121
+ shared_pos=True,
122
+ dropout=0.1):
123
+ super(T5SelfAttention, self).__init__()
124
+ self.dim = dim
125
+ self.dim_attn = dim_attn
126
+ self.dim_ffn = dim_ffn
127
+ self.num_heads = num_heads
128
+ self.num_buckets = num_buckets
129
+ self.shared_pos = shared_pos
130
+
131
+ # layers
132
+ self.norm1 = T5LayerNorm(dim)
133
+ self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
134
+ self.norm2 = T5LayerNorm(dim)
135
+ self.ffn = T5FeedForward(dim, dim_ffn, dropout)
136
+ self.pos_embedding = None if shared_pos else T5RelativeEmbedding(
137
+ num_buckets, num_heads, bidirectional=True)
138
+
139
+ def forward(self, x, mask=None, pos_bias=None):
140
+ e = pos_bias if self.shared_pos else self.pos_embedding(
141
+ x.size(1), x.size(1))
142
+ x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
143
+ x = fp16_clamp(x + self.ffn(self.norm2(x)))
144
+ return x
145
+
146
+
147
+ class T5RelativeEmbedding(nn.Module):
148
+
149
+ def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
150
+ super(T5RelativeEmbedding, self).__init__()
151
+ self.num_buckets = num_buckets
152
+ self.num_heads = num_heads
153
+ self.bidirectional = bidirectional
154
+ self.max_dist = max_dist
155
+
156
+ # layers
157
+ self.embedding = nn.Embedding(num_buckets, num_heads)
158
+
159
+ def forward(self, lq, lk):
160
+ device = self.embedding.weight.device
161
+ # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
162
+ # torch.arange(lq).unsqueeze(1).to(device)
163
+ rel_pos = torch.arange(lk, device=device).unsqueeze(0) - \
164
+ torch.arange(lq, device=device).unsqueeze(1)
165
+ rel_pos = self._relative_position_bucket(rel_pos)
166
+ rel_pos_embeds = self.embedding(rel_pos)
167
+ rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(
168
+ 0) # [1, N, Lq, Lk]
169
+ return rel_pos_embeds.contiguous()
170
+
171
+ def _relative_position_bucket(self, rel_pos):
172
+ # preprocess
173
+ if self.bidirectional:
174
+ num_buckets = self.num_buckets // 2
175
+ rel_buckets = (rel_pos > 0).long() * num_buckets
176
+ rel_pos = torch.abs(rel_pos)
177
+ else:
178
+ num_buckets = self.num_buckets
179
+ rel_buckets = 0
180
+ rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))
181
+
182
+ # embeddings for small and large positions
183
+ max_exact = num_buckets // 2
184
+ rel_pos_large = max_exact + (torch.log(rel_pos.float() / max_exact) /
185
+ math.log(self.max_dist / max_exact) *
186
+ (num_buckets - max_exact)).long()
187
+ rel_pos_large = torch.min(
188
+ rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1))
189
+ rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
190
+ return rel_buckets
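
A standalone illustration of the bidirectional bucketing above with the module defaults (num_buckets=32, max_dist=128): offsets smaller than 8 each get their own bucket, larger offsets share log-spaced buckets, and positive offsets are shifted into the upper half. bucket() is a hypothetical free-function restatement of _relative_position_bucket.

import math
import torch

def bucket(rel_pos, num_buckets=32, max_dist=128):
    half = num_buckets // 2                        # bidirectional: one half per direction
    out = (rel_pos > 0).long() * half
    rel_pos = rel_pos.abs()
    max_exact = half // 2                          # offsets < 8 keep their own bucket
    large = max_exact + (torch.log(rel_pos.float() / max_exact) /
                         math.log(max_dist / max_exact) * (half - max_exact)).long()
    large = torch.min(large, torch.full_like(large, half - 1))
    return out + torch.where(rel_pos < max_exact, rel_pos, large)

print(bucket(torch.tensor([-3, 0, 3, 50])))        # tensor([ 3,  0, 19, 29])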
191
+
192
+ def init_weights(m):
193
+ if isinstance(m, T5LayerNorm):
194
+ nn.init.ones_(m.weight)
195
+ elif isinstance(m, T5FeedForward):
196
+ nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
197
+ nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
198
+ nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
199
+ elif isinstance(m, T5Attention):
200
+ nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5)
201
+ nn.init.normal_(m.k.weight, std=m.dim**-0.5)
202
+ nn.init.normal_(m.v.weight, std=m.dim**-0.5)
203
+ nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5)
204
+ elif isinstance(m, T5RelativeEmbedding):
205
+ nn.init.normal_(
206
+ m.embedding.weight, std=(2 * m.num_buckets * m.num_heads)**-0.5)
207
+
208
+
209
+ class WanTextEncoder(torch.nn.Module):
210
+
211
+ def __init__(self,
212
+ vocab=256384,
213
+ dim=4096,
214
+ dim_attn=4096,
215
+ dim_ffn=10240,
216
+ num_heads=64,
217
+ num_layers=24,
218
+ num_buckets=32,
219
+ shared_pos=False,
220
+ dropout=0.1):
221
+ super(WanTextEncoder, self).__init__()
222
+ self.dim = dim
223
+ self.dim_attn = dim_attn
224
+ self.dim_ffn = dim_ffn
225
+ self.num_heads = num_heads
226
+ self.num_layers = num_layers
227
+ self.num_buckets = num_buckets
228
+ self.shared_pos = shared_pos
229
+
230
+ # layers
231
+ self.token_embedding = vocab if isinstance(vocab, nn.Embedding) \
232
+ else nn.Embedding(vocab, dim)
233
+ self.pos_embedding = T5RelativeEmbedding(
234
+ num_buckets, num_heads, bidirectional=True) if shared_pos else None
235
+ self.dropout = nn.Dropout(dropout)
236
+ self.blocks = nn.ModuleList([
237
+ T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
238
+ shared_pos, dropout) for _ in range(num_layers)
239
+ ])
240
+ self.norm = T5LayerNorm(dim)
241
+
242
+ # initialize weights
243
+ self.apply(init_weights)
244
+
245
+ def forward(self, ids, mask=None):
246
+ x = self.token_embedding(ids)
247
+ x = self.dropout(x)
248
+ e = self.pos_embedding(x.size(1),
249
+ x.size(1)) if self.shared_pos else None
250
+ for block in self.blocks:
251
+ x = block(x, mask, pos_bias=e)
252
+ x = self.norm(x)
253
+ x = self.dropout(x)
254
+ return x
255
+
256
+ @staticmethod
257
+ def state_dict_converter():
258
+ return WanTextEncoderStateDictConverter()
259
+
260
+
261
+ class WanTextEncoderStateDictConverter:
262
+ def __init__(self):
263
+ pass
264
+
265
+ def from_diffusers(self, state_dict):
266
+ return state_dict
267
+
268
+ def from_civitai(self, state_dict):
269
+ return state_dict
OmniAvatar/models/wan_video_vae.py ADDED
@@ -0,0 +1,807 @@
1
+ from einops import rearrange, repeat
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from tqdm import tqdm
7
+
8
+ CACHE_T = 2
9
+
10
+
11
+ def check_is_instance(model, module_class):
12
+ if isinstance(model, module_class):
13
+ return True
14
+ if hasattr(model, "module") and isinstance(model.module, module_class):
15
+ return True
16
+ return False
17
+
18
+
19
+ def block_causal_mask(x, block_size):
20
+ # params
21
+ b, n, s, _, device = *x.size(), x.device
22
+ assert s % block_size == 0
23
+ num_blocks = s // block_size
24
+
25
+ # build mask
26
+ mask = torch.zeros(b, n, s, s, dtype=torch.bool, device=device)
27
+ for i in range(num_blocks):
28
+ mask[:, :,
29
+ i * block_size:(i + 1) * block_size, :(i + 1) * block_size] = 1
30
+ return mask
31
+
32
+
33
+ class CausalConv3d(nn.Conv3d):
34
+ """
35
+ Causal 3D convolution.
36
+ """
37
+
38
+ def __init__(self, *args, **kwargs):
39
+ super().__init__(*args, **kwargs)
40
+ self._padding = (self.padding[2], self.padding[2], self.padding[1],
41
+ self.padding[1], 2 * self.padding[0], 0)
42
+ self.padding = (0, 0, 0)
43
+
44
+ def forward(self, x, cache_x=None):
45
+ padding = list(self._padding)
46
+ if cache_x is not None and self._padding[4] > 0:
47
+ cache_x = cache_x.to(x.device)
48
+ x = torch.cat([cache_x, x], dim=2)
49
+ padding[4] -= cache_x.shape[2]
50
+ x = F.pad(x, padding)
51
+
52
+ return super().forward(x)
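
A quick check of the causal padding scheme above: for a temporal kernel of 3 (padding 1), both temporal pad frames go in front of the clip and none after, so frame t never sees future frames.

import torch
import torch.nn.functional as F

pad_t = 1                                          # temporal kernel 3 -> padding 1
x = torch.randn(1, 8, 5, 16, 16)                   # b c t h w
x_padded = F.pad(x, (1, 1, 1, 1, 2 * pad_t, 0))    # (w, w, h, h, t_front, t_back)
assert x_padded.shape[2] == 5 + 2 * pad_t          # all temporal padding sits in front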
53
+
54
+
55
+ class RMS_norm(nn.Module):
56
+
57
+ def __init__(self, dim, channel_first=True, images=True, bias=False):
58
+ super().__init__()
59
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
60
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
61
+
62
+ self.channel_first = channel_first
63
+ self.scale = dim**0.5
64
+ self.gamma = nn.Parameter(torch.ones(shape))
65
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.
66
+
67
+ def forward(self, x):
68
+ return F.normalize(
69
+ x, dim=(1 if self.channel_first else
70
+ -1)) * self.scale * self.gamma + self.bias
71
+
72
+
73
+ class Upsample(nn.Upsample):
74
+
75
+ def forward(self, x):
76
+ """
77
+ Fix bfloat16 support for nearest neighbor interpolation.
78
+ """
79
+ return super().forward(x.float()).type_as(x)
80
+
81
+
82
+ class Resample(nn.Module):
83
+
84
+ def __init__(self, dim, mode):
85
+ assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
86
+ 'downsample3d')
87
+ super().__init__()
88
+ self.dim = dim
89
+ self.mode = mode
90
+
91
+ # layers
92
+ if mode == 'upsample2d':
93
+ self.resample = nn.Sequential(
94
+ Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
95
+ nn.Conv2d(dim, dim // 2, 3, padding=1))
96
+ elif mode == 'upsample3d':
97
+ self.resample = nn.Sequential(
98
+ Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
99
+ nn.Conv2d(dim, dim // 2, 3, padding=1))
100
+ self.time_conv = CausalConv3d(dim,
101
+ dim * 2, (3, 1, 1),
102
+ padding=(1, 0, 0))
103
+
104
+ elif mode == 'downsample2d':
105
+ self.resample = nn.Sequential(
106
+ nn.ZeroPad2d((0, 1, 0, 1)),
107
+ nn.Conv2d(dim, dim, 3, stride=(2, 2)))
108
+ elif mode == 'downsample3d':
109
+ self.resample = nn.Sequential(
110
+ nn.ZeroPad2d((0, 1, 0, 1)),
111
+ nn.Conv2d(dim, dim, 3, stride=(2, 2)))
112
+ self.time_conv = CausalConv3d(dim,
113
+ dim, (3, 1, 1),
114
+ stride=(2, 1, 1),
115
+ padding=(0, 0, 0))
116
+
117
+ else:
118
+ self.resample = nn.Identity()
119
+
120
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
121
+ b, c, t, h, w = x.size()
122
+ if self.mode == 'upsample3d':
123
+ if feat_cache is not None:
124
+ idx = feat_idx[0]
125
+ if feat_cache[idx] is None:
126
+ feat_cache[idx] = 'Rep'
127
+ feat_idx[0] += 1
128
+ else:
129
+
130
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
131
+ if cache_x.shape[2] < 2 and feat_cache[
132
+ idx] is not None and feat_cache[idx] != 'Rep':
133
+ # cache the last frames of the last two chunks
134
+ cache_x = torch.cat([
135
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
136
+ cache_x.device), cache_x
137
+ ],
138
+ dim=2)
139
+ if cache_x.shape[2] < 2 and feat_cache[
140
+ idx] is not None and feat_cache[idx] == 'Rep':
141
+ cache_x = torch.cat([
142
+ torch.zeros_like(cache_x).to(cache_x.device),
143
+ cache_x
144
+ ],
145
+ dim=2)
146
+ if feat_cache[idx] == 'Rep':
147
+ x = self.time_conv(x)
148
+ else:
149
+ x = self.time_conv(x, feat_cache[idx])
150
+ feat_cache[idx] = cache_x
151
+ feat_idx[0] += 1
152
+
153
+ x = x.reshape(b, 2, c, t, h, w)
154
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
155
+ 3)
156
+ x = x.reshape(b, c, t * 2, h, w)
157
+ t = x.shape[2]
158
+ x = rearrange(x, 'b c t h w -> (b t) c h w')
159
+ x = self.resample(x)
160
+ x = rearrange(x, '(b t) c h w -> b c t h w', t=t)
161
+
162
+ if self.mode == 'downsample3d':
163
+ if feat_cache is not None:
164
+ idx = feat_idx[0]
165
+ if feat_cache[idx] is None:
166
+ feat_cache[idx] = x.clone()
167
+ feat_idx[0] += 1
168
+ else:
169
+ cache_x = x[:, :, -1:, :, :].clone()
170
+ x = self.time_conv(
171
+ torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
172
+ feat_cache[idx] = cache_x
173
+ feat_idx[0] += 1
174
+ return x
175
+
176
+ def init_weight(self, conv):
177
+ conv_weight = conv.weight
178
+ nn.init.zeros_(conv_weight)
179
+ c1, c2, t, h, w = conv_weight.size()
180
+ one_matrix = torch.eye(c1, c2)
181
+ init_matrix = one_matrix
182
+ nn.init.zeros_(conv_weight)
183
+ conv_weight.data[:, :, 1, 0, 0] = init_matrix
184
+ conv.weight.data.copy_(conv_weight)
185
+ nn.init.zeros_(conv.bias.data)
186
+
187
+ def init_weight2(self, conv):
188
+ conv_weight = conv.weight.data
189
+ nn.init.zeros_(conv_weight)
190
+ c1, c2, t, h, w = conv_weight.size()
191
+ init_matrix = torch.eye(c1 // 2, c2)
192
+ conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
193
+ conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
194
+ conv.weight.data.copy_(conv_weight)
195
+ nn.init.zeros_(conv.bias.data)
196
+
197
+
198
+ class ResidualBlock(nn.Module):
199
+
200
+ def __init__(self, in_dim, out_dim, dropout=0.0):
201
+ super().__init__()
202
+ self.in_dim = in_dim
203
+ self.out_dim = out_dim
204
+
205
+ # layers
206
+ self.residual = nn.Sequential(
207
+ RMS_norm(in_dim, images=False), nn.SiLU(),
208
+ CausalConv3d(in_dim, out_dim, 3, padding=1),
209
+ RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
210
+ CausalConv3d(out_dim, out_dim, 3, padding=1))
211
+ self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
212
+ if in_dim != out_dim else nn.Identity()
213
+
214
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
215
+ h = self.shortcut(x)
216
+ for layer in self.residual:
217
+ if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
218
+ idx = feat_idx[0]
219
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
220
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
221
+ # cache the last frames of the last two chunks
222
+ cache_x = torch.cat([
223
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
224
+ cache_x.device), cache_x
225
+ ],
226
+ dim=2)
227
+ x = layer(x, feat_cache[idx])
228
+ feat_cache[idx] = cache_x
229
+ feat_idx[0] += 1
230
+ else:
231
+ x = layer(x)
232
+ return x + h
233
+
234
+
235
+ class AttentionBlock(nn.Module):
236
+ """
237
+ Causal self-attention with a single head.
238
+ """
239
+
240
+ def __init__(self, dim):
241
+ super().__init__()
242
+ self.dim = dim
243
+
244
+ # layers
245
+ self.norm = RMS_norm(dim)
246
+ self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
247
+ self.proj = nn.Conv2d(dim, dim, 1)
248
+
249
+ # zero out the last layer params
250
+ nn.init.zeros_(self.proj.weight)
251
+
252
+ def forward(self, x):
253
+ identity = x
254
+ b, c, t, h, w = x.size()
255
+ x = rearrange(x, 'b c t h w -> (b t) c h w')
256
+ x = self.norm(x)
257
+ # compute query, key, value
258
+ q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3, -1).permute(
259
+ 0, 1, 3, 2).contiguous().chunk(3, dim=-1)
260
+
261
+ # apply attention
262
+ x = F.scaled_dot_product_attention(
263
+ q,
264
+ k,
265
+ v,
266
+ #attn_mask=block_causal_mask(q, block_size=h * w)
267
+ )
268
+ x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
269
+
270
+ # output
271
+ x = self.proj(x)
272
+ x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
273
+ return x + identity
274
+
275
+
276
+ class Encoder3d(nn.Module):
277
+
278
+ def __init__(self,
279
+ dim=128,
280
+ z_dim=4,
281
+ dim_mult=[1, 2, 4, 4],
282
+ num_res_blocks=2,
283
+ attn_scales=[],
284
+ temperal_downsample=[True, True, False],
285
+ dropout=0.0):
286
+ super().__init__()
287
+ self.dim = dim
288
+ self.z_dim = z_dim
289
+ self.dim_mult = dim_mult
290
+ self.num_res_blocks = num_res_blocks
291
+ self.attn_scales = attn_scales
292
+ self.temperal_downsample = temperal_downsample
293
+
294
+ # dimensions
295
+ dims = [dim * u for u in [1] + dim_mult]
296
+ scale = 1.0
297
+
298
+ # init block
299
+ self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
300
+
301
+ # downsample blocks
302
+ downsamples = []
303
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
304
+ # residual (+attention) blocks
305
+ for _ in range(num_res_blocks):
306
+ downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
307
+ if scale in attn_scales:
308
+ downsamples.append(AttentionBlock(out_dim))
309
+ in_dim = out_dim
310
+
311
+ # downsample block
312
+ if i != len(dim_mult) - 1:
313
+ mode = 'downsample3d' if temperal_downsample[
314
+ i] else 'downsample2d'
315
+ downsamples.append(Resample(out_dim, mode=mode))
316
+ scale /= 2.0
317
+ self.downsamples = nn.Sequential(*downsamples)
318
+
319
+ # middle blocks
320
+ self.middle = nn.Sequential(ResidualBlock(out_dim, out_dim, dropout),
321
+ AttentionBlock(out_dim),
322
+ ResidualBlock(out_dim, out_dim, dropout))
323
+
324
+ # output blocks
325
+ self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
326
+ CausalConv3d(out_dim, z_dim, 3, padding=1))
327
+
328
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
329
+ if feat_cache is not None:
330
+ idx = feat_idx[0]
331
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
332
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
333
+ # cache the last frames of the last two chunks
334
+ cache_x = torch.cat([
335
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
336
+ cache_x.device), cache_x
337
+ ],
338
+ dim=2)
339
+ x = self.conv1(x, feat_cache[idx])
340
+ feat_cache[idx] = cache_x
341
+ feat_idx[0] += 1
342
+ else:
343
+ x = self.conv1(x)
344
+
345
+ ## downsamples
346
+ for layer in self.downsamples:
347
+ if feat_cache is not None:
348
+ x = layer(x, feat_cache, feat_idx)
349
+ else:
350
+ x = layer(x)
351
+
352
+ ## middle
353
+ for layer in self.middle:
354
+ if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
355
+ x = layer(x, feat_cache, feat_idx)
356
+ else:
357
+ x = layer(x)
358
+
359
+ ## head
360
+ for layer in self.head:
361
+ if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
362
+ idx = feat_idx[0]
363
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
364
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
365
+ # cache the last frames of the last two chunks
366
+ cache_x = torch.cat([
367
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
368
+ cache_x.device), cache_x
369
+ ],
370
+ dim=2)
371
+ x = layer(x, feat_cache[idx])
372
+ feat_cache[idx] = cache_x
373
+ feat_idx[0] += 1
374
+ else:
375
+ x = layer(x)
376
+ return x
377
+
378
+
379
+ class Decoder3d(nn.Module):
380
+
381
+ def __init__(self,
382
+ dim=128,
383
+ z_dim=4,
384
+ dim_mult=[1, 2, 4, 4],
385
+ num_res_blocks=2,
386
+ attn_scales=[],
387
+ temperal_upsample=[False, True, True],
388
+ dropout=0.0):
389
+ super().__init__()
390
+ self.dim = dim
391
+ self.z_dim = z_dim
392
+ self.dim_mult = dim_mult
393
+ self.num_res_blocks = num_res_blocks
394
+ self.attn_scales = attn_scales
395
+ self.temperal_upsample = temperal_upsample
396
+
397
+ # dimensions
398
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
399
+ scale = 1.0 / 2**(len(dim_mult) - 2)
400
+
401
+ # init block
402
+ self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
403
+
404
+ # middle blocks
405
+ self.middle = nn.Sequential(ResidualBlock(dims[0], dims[0], dropout),
406
+ AttentionBlock(dims[0]),
407
+ ResidualBlock(dims[0], dims[0], dropout))
408
+
409
+ # upsample blocks
410
+ upsamples = []
411
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
412
+ # residual (+attention) blocks
413
+ if i == 1 or i == 2 or i == 3:
414
+ in_dim = in_dim // 2
415
+ for _ in range(num_res_blocks + 1):
416
+ upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
417
+ if scale in attn_scales:
418
+ upsamples.append(AttentionBlock(out_dim))
419
+ in_dim = out_dim
420
+
421
+ # upsample block
422
+ if i != len(dim_mult) - 1:
423
+ mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
424
+ upsamples.append(Resample(out_dim, mode=mode))
425
+ scale *= 2.0
426
+ self.upsamples = nn.Sequential(*upsamples)
427
+
428
+ # output blocks
429
+ self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
430
+ CausalConv3d(out_dim, 3, 3, padding=1))
431
+
432
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
433
+ ## conv1
434
+ if feat_cache is not None:
435
+ idx = feat_idx[0]
436
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
437
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
438
+ # cache the last frames of the last two chunks
439
+ cache_x = torch.cat([
440
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
441
+ cache_x.device), cache_x
442
+ ],
443
+ dim=2)
444
+ x = self.conv1(x, feat_cache[idx])
445
+ feat_cache[idx] = cache_x
446
+ feat_idx[0] += 1
447
+ else:
448
+ x = self.conv1(x)
449
+
450
+ ## middle
451
+ for layer in self.middle:
452
+ if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
453
+ x = layer(x, feat_cache, feat_idx)
454
+ else:
455
+ x = layer(x)
456
+
457
+ ## upsamples
458
+ for layer in self.upsamples:
459
+ if feat_cache is not None:
460
+ x = layer(x, feat_cache, feat_idx)
461
+ else:
462
+ x = layer(x)
463
+
464
+ ## head
465
+ for layer in self.head:
466
+ if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
467
+ idx = feat_idx[0]
468
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
469
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
470
+ # cache the last frames of the last two chunks
471
+ cache_x = torch.cat([
472
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
473
+ cache_x.device), cache_x
474
+ ],
475
+ dim=2)
476
+ x = layer(x, feat_cache[idx])
477
+ feat_cache[idx] = cache_x
478
+ feat_idx[0] += 1
479
+ else:
480
+ x = layer(x)
481
+ return x
482
+
483
+
484
+ def count_conv3d(model):
485
+ count = 0
486
+ for m in model.modules():
487
+ if check_is_instance(m, CausalConv3d):
488
+ count += 1
489
+ return count
490
+
491
+
492
+ class VideoVAE_(nn.Module):
493
+
494
+ def __init__(self,
495
+ dim=96,
496
+ z_dim=16,
497
+ dim_mult=[1, 2, 4, 4],
498
+ num_res_blocks=2,
499
+ attn_scales=[],
500
+ temperal_downsample=[False, True, True],
501
+ dropout=0.0):
502
+ super().__init__()
503
+ self.dim = dim
504
+ self.z_dim = z_dim
505
+ self.dim_mult = dim_mult
506
+ self.num_res_blocks = num_res_blocks
507
+ self.attn_scales = attn_scales
508
+ self.temperal_downsample = temperal_downsample
509
+ self.temperal_upsample = temperal_downsample[::-1]
510
+
511
+ # modules
512
+ self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
513
+ attn_scales, self.temperal_downsample, dropout)
514
+ self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
515
+ self.conv2 = CausalConv3d(z_dim, z_dim, 1)
516
+ self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
517
+ attn_scales, self.temperal_upsample, dropout)
518
+
519
+ def forward(self, x):
520
+ mu, log_var = self.encode(x)
521
+ z = self.reparameterize(mu, log_var)
522
+ x_recon = self.decode(z)
523
+ return x_recon, mu, log_var
524
+
525
+ def encode(self, x, scale):
526
+ self.clear_cache()
527
+ ## cache
528
+ t = x.shape[2]
529
+ iter_ = 1 + (t - 1) // 4
530
+
531
+ for i in range(iter_):
532
+ self._enc_conv_idx = [0]
533
+ if i == 0:
534
+ out = self.encoder(x[:, :, :1, :, :],
535
+ feat_cache=self._enc_feat_map,
536
+ feat_idx=self._enc_conv_idx)
537
+ else:
538
+ out_ = self.encoder(x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
539
+ feat_cache=self._enc_feat_map,
540
+ feat_idx=self._enc_conv_idx)
541
+ out = torch.cat([out, out_], 2)
542
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
543
+ if isinstance(scale[0], torch.Tensor):
544
+ scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
545
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
546
+ 1, self.z_dim, 1, 1, 1)
547
+ else:
548
+ scale = scale.to(dtype=mu.dtype, device=mu.device)
549
+ mu = (mu - scale[0]) * scale[1]
550
+ return mu
551
+
552
+ def decode(self, z, scale):
553
+ self.clear_cache()
554
+ # z: [b,c,t,h,w]
555
+ if isinstance(scale[0], torch.Tensor):
556
+ scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
557
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
558
+ 1, self.z_dim, 1, 1, 1)
559
+ else:
560
+ scale = scale.to(dtype=z.dtype, device=z.device)
561
+ z = z / scale[1] + scale[0]
562
+ iter_ = z.shape[2]
563
+ x = self.conv2(z)
564
+ for i in range(iter_):
565
+ self._conv_idx = [0]
566
+ if i == 0:
567
+ out = self.decoder(x[:, :, i:i + 1, :, :],
568
+ feat_cache=self._feat_map,
569
+ feat_idx=self._conv_idx)
570
+ else:
571
+ out_ = self.decoder(x[:, :, i:i + 1, :, :],
572
+ feat_cache=self._feat_map,
573
+ feat_idx=self._conv_idx)
574
+ out = torch.cat([out, out_], 2) # may add tensor offload
575
+ return out
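
The chunked encode/decode above implies the usual Wan-VAE temporal compression: the first latent frame maps to one video frame and every further latent frame to four, so T latent frames decode to 4T - 3 video frames.

T_latent = 11
T_video = 4 * T_latent - 3              # first latent frame -> 1 frame, rest -> 4 each
assert T_video == 41
assert (T_video + 3) // 4 == T_latent   # encoding goes the other way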
576
+
577
+ def reparameterize(self, mu, log_var):
578
+ std = torch.exp(0.5 * log_var)
579
+ eps = torch.randn_like(std)
580
+ return eps * std + mu
581
+
582
+ def sample(self, imgs, deterministic=False):
583
+ mu, log_var = self.encode(imgs)
584
+ if deterministic:
585
+ return mu
586
+ std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
587
+ return mu + std * torch.randn_like(std)
588
+
589
+ def clear_cache(self):
590
+ self._conv_num = count_conv3d(self.decoder)
591
+ self._conv_idx = [0]
592
+ self._feat_map = [None] * self._conv_num
593
+ # cache encode
594
+ self._enc_conv_num = count_conv3d(self.encoder)
595
+ self._enc_conv_idx = [0]
596
+ self._enc_feat_map = [None] * self._enc_conv_num
597
+
598
+
599
+ class WanVideoVAE(nn.Module):
600
+
601
+ def __init__(self, z_dim=16):
602
+ super().__init__()
603
+
604
+ mean = [
605
+ -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
606
+ 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
607
+ ]
608
+ std = [
609
+ 2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
610
+ 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
611
+ ]
612
+ self.mean = torch.tensor(mean)
613
+ self.std = torch.tensor(std)
614
+ self.scale = [self.mean, 1.0 / self.std]
615
+
616
+ # init model
617
+ self.model = VideoVAE_(z_dim=z_dim).eval().requires_grad_(False)
618
+ self.upsampling_factor = 8
619
+
620
+
621
+ def build_1d_mask(self, length, left_bound, right_bound, border_width):
622
+ x = torch.ones((length,))
623
+ if not left_bound:
624
+ x[:border_width] = (torch.arange(border_width) + 1) / border_width
625
+ if not right_bound:
626
+ x[-border_width:] = torch.flip((torch.arange(border_width) + 1) / border_width, dims=(0,))
627
+ return x
628
+
629
+
630
+ def build_mask(self, data, is_bound, border_width):
631
+ _, _, _, H, W = data.shape
632
+ h = self.build_1d_mask(H, is_bound[0], is_bound[1], border_width[0])
633
+ w = self.build_1d_mask(W, is_bound[2], is_bound[3], border_width[1])
634
+
635
+ h = repeat(h, "H -> H W", H=H, W=W)
636
+ w = repeat(w, "W -> H W", H=H, W=W)
637
+
638
+ mask = torch.stack([h, w]).min(dim=0).values
639
+ mask = rearrange(mask, "H W -> 1 1 1 H W")
640
+ return mask
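
A standalone sketch of the 1D ramp build_1d_mask produces: interior tile edges fade linearly over border_width samples so overlapping tiles blend without seams, while edges on the image boundary stay at 1.

import torch

def ramp(length, left_bound, right_bound, border_width):
    x = torch.ones(length)
    if not left_bound:                              # fade in at an interior left edge
        x[:border_width] = (torch.arange(border_width) + 1) / border_width
    if not right_bound:                             # fade out at an interior right edge
        x[-border_width:] = torch.flip((torch.arange(border_width) + 1) / border_width, dims=(0,))
    return x

print(ramp(8, left_bound=False, right_bound=True, border_width=4))
# tensor([0.2500, 0.5000, 0.7500, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])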
641
+
642
+
643
+ def tiled_decode(self, hidden_states, device, tile_size, tile_stride):
644
+ _, _, T, H, W = hidden_states.shape
645
+ size_h, size_w = tile_size
646
+ stride_h, stride_w = tile_stride
647
+
648
+ # Split tasks
649
+ tasks = []
650
+ for h in range(0, H, stride_h):
651
+ if (h-stride_h >= 0 and h-stride_h+size_h >= H): continue
652
+ for w in range(0, W, stride_w):
653
+ if (w-stride_w >= 0 and w-stride_w+size_w >= W): continue
654
+ h_, w_ = h + size_h, w + size_w
655
+ tasks.append((h, h_, w, w_))
656
+
657
+ data_device = "cpu"
658
+ computation_device = device
659
+
660
+ out_T = T * 4 - 3
661
+ weight = torch.zeros((1, 1, out_T, H * self.upsampling_factor, W * self.upsampling_factor), dtype=hidden_states.dtype, device=data_device)
662
+ values = torch.zeros((1, 3, out_T, H * self.upsampling_factor, W * self.upsampling_factor), dtype=hidden_states.dtype, device=data_device)
663
+
664
+ for h, h_, w, w_ in tasks:
665
+ hidden_states_batch = hidden_states[:, :, :, h:h_, w:w_].to(computation_device)
666
+ hidden_states_batch = self.model.decode(hidden_states_batch, self.scale).to(data_device)
667
+
668
+ mask = self.build_mask(
669
+ hidden_states_batch,
670
+ is_bound=(h==0, h_>=H, w==0, w_>=W),
671
+ border_width=((size_h - stride_h) * self.upsampling_factor, (size_w - stride_w) * self.upsampling_factor)
672
+ ).to(dtype=hidden_states.dtype, device=data_device)
673
+
674
+ target_h = h * self.upsampling_factor
675
+ target_w = w * self.upsampling_factor
676
+ values[
677
+ :,
678
+ :,
679
+ :,
680
+ target_h:target_h + hidden_states_batch.shape[3],
681
+ target_w:target_w + hidden_states_batch.shape[4],
682
+ ] += hidden_states_batch * mask
683
+ weight[
684
+ :,
685
+ :,
686
+ :,
687
+ target_h: target_h + hidden_states_batch.shape[3],
688
+ target_w: target_w + hidden_states_batch.shape[4],
689
+ ] += mask
690
+ values = values / weight
691
+ values = values.clamp_(-1, 1)
692
+ return values
693
+
694
+
695
+ def tiled_encode(self, video, device, tile_size, tile_stride):
696
+ _, _, T, H, W = video.shape
697
+ size_h, size_w = tile_size
698
+ stride_h, stride_w = tile_stride
699
+
700
+ # Split tasks
701
+ tasks = []
702
+ for h in range(0, H, stride_h):
703
+ if (h-stride_h >= 0 and h-stride_h+size_h >= H): continue
704
+ for w in range(0, W, stride_w):
705
+ if (w-stride_w >= 0 and w-stride_w+size_w >= W): continue
706
+ h_, w_ = h + size_h, w + size_w
707
+ tasks.append((h, h_, w, w_))
708
+
709
+ data_device = "cpu"
710
+ computation_device = device
711
+
712
+ out_T = (T + 3) // 4
713
+ weight = torch.zeros((1, 1, out_T, H // self.upsampling_factor, W // self.upsampling_factor), dtype=video.dtype, device=data_device)
714
+ values = torch.zeros((1, 16, out_T, H // self.upsampling_factor, W // self.upsampling_factor), dtype=video.dtype, device=data_device)
715
+
716
+ for h, h_, w, w_ in tasks:
717
+ hidden_states_batch = video[:, :, :, h:h_, w:w_].to(computation_device)
718
+ hidden_states_batch = self.model.encode(hidden_states_batch, self.scale).to(data_device)
719
+
720
+ mask = self.build_mask(
721
+ hidden_states_batch,
722
+ is_bound=(h==0, h_>=H, w==0, w_>=W),
723
+ border_width=((size_h - stride_h) // self.upsampling_factor, (size_w - stride_w) // self.upsampling_factor)
724
+ ).to(dtype=video.dtype, device=data_device)
725
+
726
+ target_h = h // self.upsampling_factor
727
+ target_w = w // self.upsampling_factor
728
+ values[
729
+ :,
730
+ :,
731
+ :,
732
+ target_h:target_h + hidden_states_batch.shape[3],
733
+ target_w:target_w + hidden_states_batch.shape[4],
734
+ ] += hidden_states_batch * mask
735
+ weight[
736
+ :,
737
+ :,
738
+ :,
739
+ target_h: target_h + hidden_states_batch.shape[3],
740
+ target_w: target_w + hidden_states_batch.shape[4],
741
+ ] += mask
742
+ values = values / weight
743
+ return values
744
+
745
+
746
+ def single_encode(self, video, device):
747
+ video = video.to(device)
748
+ x = self.model.encode(video, self.scale)
749
+ return x
750
+
751
+
752
+ def single_decode(self, hidden_state, device):
753
+ hidden_state = hidden_state.to(device)
754
+ video = self.model.decode(hidden_state, self.scale)
755
+ return video.clamp_(-1, 1)
756
+
757
+
758
+ def encode(self, videos, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
759
+
760
+ videos = [video.to("cpu") for video in videos]
761
+ hidden_states = []
762
+ for video in videos:
763
+ video = video.unsqueeze(0)
764
+ if tiled:
765
+ tile_size = (tile_size[0] * 8, tile_size[1] * 8)
766
+ tile_stride = (tile_stride[0] * 8, tile_stride[1] * 8)
767
+ hidden_state = self.tiled_encode(video, device, tile_size, tile_stride)
768
+ else:
769
+ hidden_state = self.single_encode(video, device)
770
+ hidden_state = hidden_state.squeeze(0)
771
+ hidden_states.append(hidden_state)
772
+ hidden_states = torch.stack(hidden_states)
773
+ return hidden_states
774
+
775
+
776
+ def decode(self, hidden_states, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
777
+ hidden_states = [hidden_state.to("cpu") for hidden_state in hidden_states]
778
+ videos = []
779
+ for hidden_state in hidden_states:
780
+ hidden_state = hidden_state.unsqueeze(0)
781
+ if tiled:
782
+ video = self.tiled_decode(hidden_state, device, tile_size, tile_stride)
783
+ else:
784
+ video = self.single_decode(hidden_state, device)
785
+ video = video.squeeze(0)
786
+ videos.append(video)
787
+ videos = torch.stack(videos)
788
+ return videos
789
+
790
+
791
+ @staticmethod
792
+ def state_dict_converter():
793
+ return WanVideoVAEStateDictConverter()
794
+
795
+
796
+ class WanVideoVAEStateDictConverter:
797
+
798
+ def __init__(self):
799
+ pass
800
+
801
+ def from_civitai(self, state_dict):
802
+ state_dict_ = {}
803
+ if 'model_state' in state_dict:
804
+ state_dict = state_dict['model_state']
805
+ for name in state_dict:
806
+ state_dict_['model.' + name] = state_dict[name]
807
+ return state_dict_
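
Note on the tiling scheme above: tiled_decode/tiled_encode run the VAE on overlapping spatial tiles and blend the per-tile outputs with the linear feather masks from build_1d_mask/build_mask, accumulating masked results in `values` and the masks in `weight` before dividing. Below is a minimal 1D sketch of just that blending step; the helper function and tile sizes are illustrative, not part of the repository.

# A minimal sketch of the overlap blending used by tiled_encode/tiled_decode,
# reduced to 1D for clarity. Tile results are accumulated into `values` with a
# linear feather mask and normalized by the accumulated `weight`, exactly as the
# 5D version does per (H, W) tile. Sizes here are illustrative only.
import torch

def build_1d_mask(length, left_bound, right_bound, border_width):
    x = torch.ones(length)
    if not left_bound:                      # feather the left edge of interior tiles
        x[:border_width] = (torch.arange(border_width) + 1) / border_width
    if not right_bound:                     # feather the right edge of interior tiles
        x[-border_width:] = torch.flip((torch.arange(border_width) + 1) / border_width, dims=(0,))
    return x

def blend_1d(signal, size=8, stride=4):
    values = torch.zeros_like(signal)
    weight = torch.zeros_like(signal)
    for start in range(0, signal.numel(), stride):
        if start - stride >= 0 and start - stride + size >= signal.numel():
            break                           # same early-exit rule as the tiling loops above
        tile = signal[start:start + size]   # stand-in for a per-tile VAE forward pass
        mask = build_1d_mask(tile.numel(), start == 0, start + size >= signal.numel(), size - stride)
        values[start:start + tile.numel()] += tile * mask
        weight[start:start + tile.numel()] += mask
    return values / weight                  # overlapping regions average back to the input

print(torch.allclose(blend_1d(torch.arange(20.0)), torch.arange(20.0)))  # True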
OmniAvatar/models/wav2vec.py ADDED
@@ -0,0 +1,208 @@
1
+ # pylint: disable=R0901
2
+ # src/models/wav2vec.py
3
+
4
+ """
5
+ This module defines the Wav2Vec model, which is a pre-trained model for speech recognition and understanding.
6
+ It inherits from the Wav2Vec2Model class in the transformers library and provides additional functionalities
7
+ such as feature extraction and encoding.
8
+
9
+ Classes:
10
+ Wav2VecModel: Inherits from Wav2Vec2Model and adds additional methods for feature extraction and encoding.
11
+
12
+ Functions:
13
+ linear_interpolation: Interpolates the features based on the sequence length.
14
+ """
15
+
16
+ import torch.nn.functional as F
17
+ from transformers import Wav2Vec2Model
18
+ from transformers.modeling_outputs import BaseModelOutput
19
+
20
+
21
+ class Wav2VecModel(Wav2Vec2Model):
22
+ """
23
+ Wav2VecModel is a custom model class that extends the Wav2Vec2Model class from the transformers library.
24
+ It inherits all the functionality of the Wav2Vec2Model and adds additional methods for feature extraction and encoding.
25
+ ...
26
+
27
+ Attributes:
28
+ base_model (Wav2Vec2Model): The base Wav2Vec2Model object.
29
+
30
+ Methods:
31
+ forward(input_values, seq_len, attention_mask=None, mask_time_indices=None
32
+ , output_attentions=None, output_hidden_states=None, return_dict=None):
33
+ Forward pass of the Wav2VecModel.
34
+ It takes input_values, seq_len, and other optional parameters as input and returns the output of the base model.
35
+
36
+ feature_extract(input_values, seq_len):
37
+ Extracts features from the input_values using the base model.
38
+
39
+ encode(extract_features, attention_mask=None, mask_time_indices=None, output_attentions=None, output_hidden_states=None, return_dict=None):
40
+ Encodes the extracted features using the base model and returns the encoded features.
41
+ """
42
+ def forward(
43
+ self,
44
+ input_values,
45
+ seq_len,
46
+ attention_mask=None,
47
+ mask_time_indices=None,
48
+ output_attentions=None,
49
+ output_hidden_states=None,
50
+ return_dict=None,
51
+ ):
52
+ """
53
+ Forward pass of the Wav2Vec model.
54
+
55
+ Args:
56
+ self: The instance of the model.
57
+ input_values: The input values (waveform) to the model.
58
+ seq_len: The sequence length of the input values.
59
+ attention_mask: Attention mask to be used for the model.
60
+ mask_time_indices: Mask indices to be used for the model.
61
+ output_attentions: If set to True, returns attentions.
62
+ output_hidden_states: If set to True, returns hidden states.
63
+ return_dict: If set to True, returns a BaseModelOutput instead of a tuple.
64
+
65
+ Returns:
66
+ The output of the Wav2Vec model.
67
+ """
68
+
69
+ output_hidden_states = (
70
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
71
+ )
72
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
73
+
74
+ extract_features = self.feature_extractor(input_values)
75
+ extract_features = extract_features.transpose(1, 2)
76
+ extract_features = linear_interpolation(extract_features, seq_len=seq_len)
77
+
78
+ if attention_mask is not None:
79
+ # compute reduced attention_mask corresponding to feature vectors
80
+ attention_mask = self._get_feature_vector_attention_mask(
81
+ extract_features.shape[1], attention_mask, add_adapter=False
82
+ )
83
+
84
+ hidden_states, extract_features = self.feature_projection(extract_features)
85
+ hidden_states = self._mask_hidden_states(
86
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
87
+ )
88
+
89
+ encoder_outputs = self.encoder(
90
+ hidden_states,
91
+ attention_mask=attention_mask,
92
+ output_attentions=output_attentions,
93
+ output_hidden_states=output_hidden_states,
94
+ return_dict=return_dict,
95
+ )
96
+
97
+ hidden_states = encoder_outputs[0]
98
+
99
+ if self.adapter is not None:
100
+ hidden_states = self.adapter(hidden_states)
101
+
102
+ if not return_dict:
103
+ return (hidden_states, ) + encoder_outputs[1:]
104
+ return BaseModelOutput(
105
+ last_hidden_state=hidden_states,
106
+ hidden_states=encoder_outputs.hidden_states,
107
+ attentions=encoder_outputs.attentions,
108
+ )
109
+
110
+
111
+ def feature_extract(
112
+ self,
113
+ input_values,
114
+ seq_len,
115
+ ):
116
+ """
117
+ Extracts features from the input values and returns the extracted features.
118
+
119
+ Parameters:
120
+ input_values (torch.Tensor): The input values to be processed.
121
+ seq_len (torch.Tensor): The sequence lengths of the input values.
122
+
123
+ Returns:
124
+ extracted_features (torch.Tensor): The extracted features from the input values.
125
+ """
126
+ extract_features = self.feature_extractor(input_values)
127
+ extract_features = extract_features.transpose(1, 2)
128
+ extract_features = linear_interpolation(extract_features, seq_len=seq_len)
129
+
130
+ return extract_features
131
+
132
+ def encode(
133
+ self,
134
+ extract_features,
135
+ attention_mask=None,
136
+ mask_time_indices=None,
137
+ output_attentions=None,
138
+ output_hidden_states=None,
139
+ return_dict=None,
140
+ ):
141
+ """
142
+ Encodes the input features into the output space.
143
+
144
+ Args:
145
+ extract_features (torch.Tensor): The extracted features from the audio signal.
146
+ attention_mask (torch.Tensor, optional): Attention mask to be used for padding.
147
+ mask_time_indices (torch.Tensor, optional): Masked indices for the time dimension.
148
+ output_attentions (bool, optional): If set to True, returns the attention weights.
149
+ output_hidden_states (bool, optional): If set to True, returns all hidden states.
150
+ return_dict (bool, optional): If set to True, returns a BaseModelOutput instead of the tuple.
151
+
152
+ Returns:
153
+ The encoded output features.
154
+ """
155
+ self.config.output_attentions = True
156
+
157
+ output_hidden_states = (
158
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
159
+ )
160
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
161
+
162
+ if attention_mask is not None:
163
+ # compute reduced attention_mask corresponding to feature vectors
164
+ attention_mask = self._get_feature_vector_attention_mask(
165
+ extract_features.shape[1], attention_mask, add_adapter=False
166
+ )
167
+
168
+ hidden_states, extract_features = self.feature_projection(extract_features)
169
+ hidden_states = self._mask_hidden_states(
170
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
171
+ )
172
+
173
+ encoder_outputs = self.encoder(
174
+ hidden_states,
175
+ attention_mask=attention_mask,
176
+ output_attentions=output_attentions,
177
+ output_hidden_states=output_hidden_states,
178
+ return_dict=return_dict,
179
+ )
180
+
181
+ hidden_states = encoder_outputs[0]
182
+
183
+ if self.adapter is not None:
184
+ hidden_states = self.adapter(hidden_states)
185
+
186
+ if not return_dict:
187
+ return (hidden_states, ) + encoder_outputs[1:]
188
+ return BaseModelOutput(
189
+ last_hidden_state=hidden_states,
190
+ hidden_states=encoder_outputs.hidden_states,
191
+ attentions=encoder_outputs.attentions,
192
+ )
193
+
194
+
195
+ def linear_interpolation(features, seq_len):
196
+ """
197
+ Transpose the features to interpolate linearly.
198
+
199
+ Args:
200
+ features (torch.Tensor): The extracted features to be interpolated.
201
+ seq_len (torch.Tensor): The sequence lengths of the features.
202
+
203
+ Returns:
204
+ torch.Tensor: The interpolated features.
205
+ """
206
+ features = features.transpose(1, 2)
207
+ output_features = F.interpolate(features, size=seq_len, align_corners=True, mode='linear')
208
+ return output_features.transpose(1, 2)
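
For reference, linear_interpolation is what aligns the audio feature rate to the video frame rate. A small self-contained sketch of the same transpose/interpolate/transpose round trip follows; the tensor sizes are made up, and "roughly 50 wav2vec feature frames per second of 16 kHz audio" is an assumption about the upstream feature extractor.

# (batch, T_audio, dim) -> (batch, seq_len, dim), resampled along time.
import torch
import torch.nn.functional as F

features = torch.randn(1, 200, 768)           # ~4 s of audio at ~50 feature frames/s (assumed)
seq_len = 100                                 # e.g. 4 s of video at 25 fps

x = features.transpose(1, 2)                  # (B, dim, T) as required by F.interpolate
x = F.interpolate(x, size=seq_len, mode="linear", align_corners=True)
aligned = x.transpose(1, 2)                   # (B, seq_len, dim)

print(aligned.shape)                          # torch.Size([1, 100, 768])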
OmniAvatar/prompters/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .wan_prompter import WanPrompter
OmniAvatar/prompters/base_prompter.py ADDED
@@ -0,0 +1,70 @@
1
+ from ..models.model_manager import ModelManager
2
+ import torch
3
+
4
+
5
+
6
+ def tokenize_long_prompt(tokenizer, prompt, max_length=None):
7
+ # Get model_max_length from self.tokenizer
8
+ length = tokenizer.model_max_length if max_length is None else max_length
9
+
10
+ # To avoid the length warning, temporarily set tokenizer.model_max_length to a very large value.
11
+ tokenizer.model_max_length = 99999999
12
+
13
+ # Tokenize it!
14
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids
15
+
16
+ # Determine the real length.
17
+ max_length = (input_ids.shape[1] + length - 1) // length * length
18
+
19
+ # Restore tokenizer.model_max_length
20
+ tokenizer.model_max_length = length
21
+
22
+ # Tokenize it again with fixed length.
23
+ input_ids = tokenizer(
24
+ prompt,
25
+ return_tensors="pt",
26
+ padding="max_length",
27
+ max_length=max_length,
28
+ truncation=True
29
+ ).input_ids
30
+
31
+ # Reshape input_ids to fit the text encoder.
32
+ num_sentence = input_ids.shape[1] // length
33
+ input_ids = input_ids.reshape((num_sentence, length))
34
+
35
+ return input_ids
36
+
37
+
38
+
39
+ class BasePrompter:
40
+ def __init__(self):
41
+ self.refiners = []
42
+ self.extenders = []
43
+
44
+
45
+ def load_prompt_refiners(self, model_manager: ModelManager, refiner_classes=[]):
46
+ for refiner_class in refiner_classes:
47
+ refiner = refiner_class.from_model_manager(model_manager)
48
+ self.refiners.append(refiner)
49
+
50
+ def load_prompt_extenders(self,model_manager:ModelManager,extender_classes=[]):
51
+ for extender_class in extender_classes:
52
+ extender = extender_class.from_model_manager(model_manager)
53
+ self.extenders.append(extender)
54
+
55
+
56
+ @torch.no_grad()
57
+ def process_prompt(self, prompt, positive=True):
58
+ if isinstance(prompt, list):
59
+ prompt = [self.process_prompt(prompt_, positive=positive) for prompt_ in prompt]
60
+ else:
61
+ for refiner in self.refiners:
62
+ prompt = refiner(prompt, positive=positive)
63
+ return prompt
64
+
65
+ @torch.no_grad()
66
+ def extend_prompt(self, prompt:str, positive=True):
67
+ extended_prompt = dict(prompt=prompt)
68
+ for extender in self.extenders:
69
+ extended_prompt = extender(extended_prompt)
70
+ return extended_prompt
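
tokenize_long_prompt rounds the tokenized length up to the next multiple of model_max_length and reshapes the ids into fixed-length chunks for the text encoder. The arithmetic in isolation, with illustrative numbers:

# Ceil-to-multiple rounding and the (num_sentence, length) split used above.
length = 77                                   # tokenizer.model_max_length (illustrative)
num_tokens = 180                              # length of the un-truncated prompt (illustrative)

max_length = (num_tokens + length - 1) // length * length   # ceil to a multiple of 77 -> 231
num_sentence = max_length // length                          # -> 3 fixed-length chunks

print(max_length, num_sentence)               # 231 3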
OmniAvatar/prompters/wan_prompter.py ADDED
@@ -0,0 +1,109 @@
1
+ from .base_prompter import BasePrompter
2
+ from ..models.wan_video_text_encoder import WanTextEncoder
3
+ from transformers import AutoTokenizer
4
+ import os, torch
5
+ import ftfy
6
+ import html
7
+ import string
8
+ import regex as re
9
+
10
+
11
+ def basic_clean(text):
12
+ text = ftfy.fix_text(text)
13
+ text = html.unescape(html.unescape(text))
14
+ return text.strip()
15
+
16
+
17
+ def whitespace_clean(text):
18
+ text = re.sub(r'\s+', ' ', text)
19
+ text = text.strip()
20
+ return text
21
+
22
+
23
+ def canonicalize(text, keep_punctuation_exact_string=None):
24
+ text = text.replace('_', ' ')
25
+ if keep_punctuation_exact_string:
26
+ text = keep_punctuation_exact_string.join(
27
+ part.translate(str.maketrans('', '', string.punctuation))
28
+ for part in text.split(keep_punctuation_exact_string))
29
+ else:
30
+ text = text.translate(str.maketrans('', '', string.punctuation))
31
+ text = text.lower()
32
+ text = re.sub(r'\s+', ' ', text)
33
+ return text.strip()
34
+
35
+
36
+ class HuggingfaceTokenizer:
37
+
38
+ def __init__(self, name, seq_len=None, clean=None, **kwargs):
39
+ assert clean in (None, 'whitespace', 'lower', 'canonicalize')
40
+ self.name = name
41
+ self.seq_len = seq_len
42
+ self.clean = clean
43
+
44
+ # init tokenizer
45
+ self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
46
+ self.vocab_size = self.tokenizer.vocab_size
47
+
48
+ def __call__(self, sequence, **kwargs):
49
+ return_mask = kwargs.pop('return_mask', False)
50
+
51
+ # arguments
52
+ _kwargs = {'return_tensors': 'pt'}
53
+ if self.seq_len is not None:
54
+ _kwargs.update({
55
+ 'padding': 'max_length',
56
+ 'truncation': True,
57
+ 'max_length': self.seq_len
58
+ })
59
+ _kwargs.update(**kwargs)
60
+
61
+ # tokenization
62
+ if isinstance(sequence, str):
63
+ sequence = [sequence]
64
+ if self.clean:
65
+ sequence = [self._clean(u) for u in sequence]
66
+ ids = self.tokenizer(sequence, **_kwargs)
67
+
68
+ # output
69
+ if return_mask:
70
+ return ids.input_ids, ids.attention_mask
71
+ else:
72
+ return ids.input_ids
73
+
74
+ def _clean(self, text):
75
+ if self.clean == 'whitespace':
76
+ text = whitespace_clean(basic_clean(text))
77
+ elif self.clean == 'lower':
78
+ text = whitespace_clean(basic_clean(text)).lower()
79
+ elif self.clean == 'canonicalize':
80
+ text = canonicalize(basic_clean(text))
81
+ return text
82
+
83
+
84
+ class WanPrompter(BasePrompter):
85
+
86
+ def __init__(self, tokenizer_path=None, text_len=512):
87
+ super().__init__()
88
+ self.text_len = text_len
89
+ self.text_encoder = None
90
+ self.fetch_tokenizer(tokenizer_path)
91
+
92
+ def fetch_tokenizer(self, tokenizer_path=None):
93
+ if tokenizer_path is not None:
94
+ self.tokenizer = HuggingfaceTokenizer(name=tokenizer_path, seq_len=self.text_len, clean='whitespace')
95
+
96
+ def fetch_models(self, text_encoder: WanTextEncoder = None):
97
+ self.text_encoder = text_encoder
98
+
99
+ def encode_prompt(self, prompt, positive=True, device="cuda"):
100
+ prompt = self.process_prompt(prompt, positive=positive)
101
+
102
+ ids, mask = self.tokenizer(prompt, return_mask=True, add_special_tokens=True)
103
+ ids = ids.to(device)
104
+ mask = mask.to(device)
105
+ seq_lens = mask.gt(0).sum(dim=1).long()
106
+ prompt_emb = self.text_encoder(ids, mask)
107
+ for i, v in enumerate(seq_lens):
108
+ prompt_emb[:, v:] = 0
109
+ return prompt_emb
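
encode_prompt pads every prompt to text_len and then zeroes the embedding positions beyond each prompt's true token count so the padding cannot leak into cross-attention. A toy sketch of that masking step; the dimensions are arbitrary, and the per-sample index `[i, v:]` used here matches the `[:, v:]` form above when prompts are encoded one at a time, which is how the pipeline calls it.

import torch

prompt_emb = torch.randn(2, 512, 8)           # (batch, text_len, dim); dim is illustrative
mask = torch.zeros(2, 512, dtype=torch.long)
mask[0, :7] = 1                               # first prompt has 7 real tokens
mask[1, :3] = 1                               # second prompt has 3 real tokens

seq_lens = mask.gt(0).sum(dim=1).long()
for i, v in enumerate(seq_lens):
    prompt_emb[i, v:] = 0                     # zero the padded tail of each sample

print(prompt_emb[0, 7:].abs().sum(), prompt_emb[1, 3:].abs().sum())   # both tensor(0.)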
OmniAvatar/schedulers/flow_match.py ADDED
@@ -0,0 +1,79 @@
1
+ import torch
2
+
3
+
4
+
5
+ class FlowMatchScheduler():
6
+
7
+ def __init__(self, num_inference_steps=100, num_train_timesteps=1000, shift=3.0, sigma_max=1.0, sigma_min=0.003/1.002, inverse_timesteps=False, extra_one_step=False, reverse_sigmas=False):
8
+ self.num_train_timesteps = num_train_timesteps
9
+ self.shift = shift
10
+ self.sigma_max = sigma_max
11
+ self.sigma_min = sigma_min
12
+ self.inverse_timesteps = inverse_timesteps
13
+ self.extra_one_step = extra_one_step
14
+ self.reverse_sigmas = reverse_sigmas
15
+ self.set_timesteps(num_inference_steps)
16
+
17
+
18
+ def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, training=False, shift=None):
19
+ if shift is not None:
20
+ self.shift = shift
21
+ sigma_start = self.sigma_min + (self.sigma_max - self.sigma_min) * denoising_strength
22
+ if self.extra_one_step:
23
+ self.sigmas = torch.linspace(sigma_start, self.sigma_min, num_inference_steps + 1)[:-1]
24
+ else:
25
+ self.sigmas = torch.linspace(sigma_start, self.sigma_min, num_inference_steps)
26
+ if self.inverse_timesteps:
27
+ self.sigmas = torch.flip(self.sigmas, dims=[0])
28
+ self.sigmas = self.shift * self.sigmas / (1 + (self.shift - 1) * self.sigmas)
29
+ if self.reverse_sigmas:
30
+ self.sigmas = 1 - self.sigmas
31
+ self.timesteps = self.sigmas * self.num_train_timesteps
32
+ if training:
33
+ x = self.timesteps
34
+ y = torch.exp(-2 * ((x - num_inference_steps / 2) / num_inference_steps) ** 2)
35
+ y_shifted = y - y.min()
36
+ bsmntw_weighing = y_shifted * (num_inference_steps / y_shifted.sum())
37
+ self.linear_timesteps_weights = bsmntw_weighing
38
+
39
+
40
+ def step(self, model_output, timestep, sample, to_final=False, **kwargs):
41
+ if isinstance(timestep, torch.Tensor):
42
+ timestep = timestep.cpu()
43
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
44
+ sigma = self.sigmas[timestep_id]
45
+ if to_final or timestep_id + 1 >= len(self.timesteps):
46
+ sigma_ = 1 if (self.inverse_timesteps or self.reverse_sigmas) else 0
47
+ else:
48
+ sigma_ = self.sigmas[timestep_id + 1]
49
+ prev_sample = sample + model_output * (sigma_ - sigma)
50
+ return prev_sample
51
+
52
+
53
+ def return_to_timestep(self, timestep, sample, sample_stablized):
54
+ if isinstance(timestep, torch.Tensor):
55
+ timestep = timestep.cpu()
56
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
57
+ sigma = self.sigmas[timestep_id]
58
+ model_output = (sample - sample_stablized) / sigma
59
+ return model_output
60
+
61
+
62
+ def add_noise(self, original_samples, noise, timestep):
63
+ if isinstance(timestep, torch.Tensor):
64
+ timestep = timestep.cpu()
65
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
66
+ sigma = self.sigmas[timestep_id]
67
+ sample = (1 - sigma) * original_samples + sigma * noise
68
+ return sample
69
+
70
+
71
+ def training_target(self, sample, noise, timestep):
72
+ target = noise - sample
73
+ return target
74
+
75
+
76
+ def training_weight(self, timestep):
77
+ timestep_id = torch.argmin((self.timesteps - timestep.to(self.timesteps.device)).abs())
78
+ weights = self.linear_timesteps_weights[timestep_id]
79
+ return weights
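
A quick sanity check of the flow-matching relations implemented above: add_noise builds x_t = (1 - σ)·x0 + σ·ε, training_target returns v = ε - x0, and step moves the sample by (σ_next - σ)·v, so a perfect prediction stepped straight to σ = 0 recovers x0 exactly.

import torch

x0 = torch.randn(4)
noise = torch.randn(4)
sigma = 0.7

x_t = (1 - sigma) * x0 + sigma * noise        # add_noise
v = noise - x0                                # training_target
x_recovered = x_t + v * (0.0 - sigma)         # step() with sigma_ = 0 (final step)

print(torch.allclose(x_recovered, x0))        # True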
OmniAvatar/utils/args_config.py ADDED
@@ -0,0 +1,123 @@
1
+ import json
2
+ import os
3
+ import argparse
4
+ import yaml
5
+ args = None
6
+
7
+ def set_global_args(local_args):
8
+ global args
9
+
10
+ args = local_args
11
+
12
+ def parse_hp_string(hp_string):
13
+ result = {}
14
+ for pair in hp_string.split(','):
15
+ if not pair:
16
+ continue
17
+ key, value = pair.split('=')
18
+ try:
19
+ # Automatically cast to int / float / str
20
+ ori_value = value
21
+ value = float(value)
22
+ if '.' not in str(ori_value):
23
+ value = int(value)
24
+ except ValueError:
25
+ pass
26
+
27
+ if value in ['true', 'True']:
28
+ value = True
29
+ if value in ['false', 'False']:
30
+ value = False
31
+ if '.' in key:
32
+ keys = key.split('.')
33
+ keys = keys
34
+ current = result
35
+ for key in keys[:-1]:
36
+ if key not in current or not isinstance(current[key], dict):
37
+ current[key] = {}
38
+ current = current[key]
39
+ current[keys[-1]] = value
40
+ else:
41
+ result[key.strip()] = value
42
+ return result
43
+
44
+ def parse_args():
45
+ global args
46
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
47
+ parser.add_argument("--config", type=str, required=True, help="Path to YAML config file.")
48
+
49
+ # Define argparse arguments
50
+ parser.add_argument("--exp_path", type=str, help="Path to save the model.")
51
+ parser.add_argument("--input_file", type=str, help="Path to inference txt.")
52
+ parser.add_argument("--debug", action='store_true', default=None)
53
+ parser.add_argument("--infer", action='store_true')
54
+ parser.add_argument("-hp", "--hparams", type=str, default="")
55
+
56
+ args = parser.parse_args()
57
+
58
+ # Read the YAML config (when --config is provided)
60
+ if args.config:
61
+ with open(args.config, "r") as f:
62
+ yaml_config = yaml.safe_load(f)
63
+
64
+ # Walk the YAML config and add its entries to args when argparse does not define them
65
+ for key, value in yaml_config.items():
66
+ if not hasattr(args, key): # argument not defined by argparse
67
+ setattr(args, key, value)
68
+ elif getattr(args, key) is None: # defined by argparse but left unset
68
+ setattr(args, key, value)
69
+
70
+ args.rank = int(os.getenv("RANK", "0"))
71
+ args.world_size = int(os.getenv("WORLD_SIZE", "1"))
72
+ args.local_rank = int(os.getenv("LOCAL_RANK", "0")) # torchrun
73
+ args.device = 'cuda'
74
+ debug = args.debug
75
+ if not os.path.exists(args.exp_path):
76
+ args.exp_path = f'checkpoints/{args.exp_path}'
77
+
78
+ if hasattr(args, 'reload_cfg') and args.reload_cfg:
79
+ # Reload the saved config file
80
+ conf_path = os.path.join(args.exp_path, "config.json")
81
+ if os.path.exists(conf_path):
82
+ print('| Reloading config from:', conf_path)
83
+ args = reload(args, conf_path)
84
+ if len(args.hparams) > 0:
85
+ hp_dict = parse_hp_string(args.hparams)
86
+ for key, value in hp_dict.items():
87
+ if not hasattr(args, key):
88
+ setattr(args, key, value)
89
+ else:
90
+ if isinstance(value, dict):
91
+ ori_v = getattr(args, key)
92
+ ori_v.update(value)
93
+ setattr(args, key, ori_v)
94
+ else:
95
+ setattr(args, key, value)
96
+ args.debug = debug
97
+ dict_args = convert_namespace_to_dict(args)
98
+ if args.local_rank == 0:
99
+ print(dict_args)
100
+ return args
101
+
102
+ def reload(args, conf_path):
103
+ """重新加载配置文件,不覆盖已有的参数"""
104
+ with open(conf_path, "r") as f:
105
+ yaml_config = yaml.safe_load(f)
106
+ # Walk the YAML config and add its entries to args when argparse does not define them
107
+ for key, value in yaml_config.items():
108
+ if not hasattr(args, key): # argument not defined by argparse
109
+ setattr(args, key, value)
110
+ elif getattr(args, key) is None: # defined by argparse but left unset
111
+ setattr(args, key, value)
112
+ return args
113
+
114
+ def convert_namespace_to_dict(namespace):
115
+ """将 argparse.Namespace 转为字典,并处理不可序列化对象"""
116
+ result = {}
117
+ for key, value in vars(namespace).items():
118
+ try:
119
+ json.dumps(value) # check that the value is JSON-serializable
120
+ result[key] = value
121
+ except (TypeError, OverflowError):
122
+ result[key] = str(value) # 将不可序列化的对象转为字符串表示
123
+ return result
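
parse_hp_string turns a comma-separated `key=value` override string into a dict, auto-casting ints/floats/bools and expanding dotted keys into nested dicts. An illustrative call; the keys are made up, not a list of supported hyperparameters.

from OmniAvatar.utils.args_config import parse_hp_string

print(parse_hp_string("num_steps=25,cfg_scale=4.5,use_audio=True,model.sp_size=2"))
# {'num_steps': 25, 'cfg_scale': 4.5, 'use_audio': True, 'model': {'sp_size': 2}}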
OmniAvatar/utils/audio_preprocess.py ADDED
@@ -0,0 +1,21 @@
1
+ import os
2
+ import subprocess
3
+
4
+ def add_silence_to_audio_ffmpeg(audio_path, tmp_audio_path, silence_duration_s=0.5):
5
+ # Use ffmpeg to prepend silence to the audio
6
+ cmd = [
7
+ 'ffmpeg',
8
+ '-i', audio_path, # input audio file path
9
+ '-f', 'lavfi', # use the lavfi virtual input device to generate silence
10
+ '-t', str(silence_duration_s), # silence duration in seconds
11
+ '-i', 'anullsrc=r=16000:cl=stereo', # generate a stereo silence clip at a 16 kHz sample rate
12
+ '-filter_complex', '[1][0]concat=n=2:v=0:a=1[out]', # concatenate the silence and the original audio
13
+ '-map', '[out]', # write the concatenated audio
14
+ '-y', tmp_audio_path, # output file path
15
+ '-loglevel', 'error'
16
+ ]
17
+
18
+ try:
19
+ subprocess.run(cmd, check=True, capture_output=True, text=True)
20
+ except subprocess.CalledProcessError as e:
21
+ raise RuntimeError(f"ffmpeg failed ({e.returncode}): {e.stderr.strip()}")
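
Example usage of the helper above; the paths and duration are placeholders, and ffmpeg must be available on PATH. It writes a copy of the input with 0.5 s of 16 kHz stereo silence prepended.

from OmniAvatar.utils.audio_preprocess import add_silence_to_audio_ffmpeg

add_silence_to_audio_ffmpeg("examples/audios/script.wav", "/tmp/script_padded.wav", silence_duration_s=0.5)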
OmniAvatar/utils/io_utils.py ADDED
@@ -0,0 +1,256 @@
1
+ import subprocess
2
+ import torch, os
3
+ from safetensors import safe_open
4
+ from OmniAvatar.utils.args_config import args
5
+ from contextlib import contextmanager
6
+
7
+ import re
8
+ import tempfile
9
+ import numpy as np
10
+ import imageio
11
+ from glob import glob
12
+ import soundfile as sf
13
+ from einops import rearrange
14
+ import hashlib
15
+
16
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
17
+
18
+ @contextmanager
19
+ def init_weights_on_device(device = torch.device("meta"), include_buffers :bool = False):
20
+
21
+ old_register_parameter = torch.nn.Module.register_parameter
22
+ if include_buffers:
23
+ old_register_buffer = torch.nn.Module.register_buffer
24
+
25
+ def register_empty_parameter(module, name, param):
26
+ old_register_parameter(module, name, param)
27
+ if param is not None:
28
+ param_cls = type(module._parameters[name])
29
+ kwargs = module._parameters[name].__dict__
30
+ kwargs["requires_grad"] = param.requires_grad
31
+ module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
32
+
33
+ def register_empty_buffer(module, name, buffer, persistent=True):
34
+ old_register_buffer(module, name, buffer, persistent=persistent)
35
+ if buffer is not None:
36
+ module._buffers[name] = module._buffers[name].to(device)
37
+
38
+ def patch_tensor_constructor(fn):
39
+ def wrapper(*args, **kwargs):
40
+ kwargs["device"] = device
41
+ return fn(*args, **kwargs)
42
+
43
+ return wrapper
44
+
45
+ if include_buffers:
46
+ tensor_constructors_to_patch = {
47
+ torch_function_name: getattr(torch, torch_function_name)
48
+ for torch_function_name in ["empty", "zeros", "ones", "full"]
49
+ }
50
+ else:
51
+ tensor_constructors_to_patch = {}
52
+
53
+ try:
54
+ torch.nn.Module.register_parameter = register_empty_parameter
55
+ if include_buffers:
56
+ torch.nn.Module.register_buffer = register_empty_buffer
57
+ for torch_function_name in tensor_constructors_to_patch.keys():
58
+ setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
59
+ yield
60
+ finally:
61
+ torch.nn.Module.register_parameter = old_register_parameter
62
+ if include_buffers:
63
+ torch.nn.Module.register_buffer = old_register_buffer
64
+ for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
65
+ setattr(torch, torch_function_name, old_torch_function)
66
+
67
+ def load_state_dict_from_folder(file_path, torch_dtype=None):
68
+ state_dict = {}
69
+ for file_name in os.listdir(file_path):
70
+ if "." in file_name and file_name.split(".")[-1] in [
71
+ "safetensors", "bin", "ckpt", "pth", "pt"
72
+ ]:
73
+ state_dict.update(load_state_dict(os.path.join(file_path, file_name), torch_dtype=torch_dtype))
74
+ return state_dict
75
+
76
+
77
+ def load_state_dict(file_path, torch_dtype=None):
78
+ if file_path.endswith(".safetensors"):
79
+ return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype)
80
+ else:
81
+ return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype)
82
+
83
+
84
+ def load_state_dict_from_safetensors(file_path, torch_dtype=None):
85
+ state_dict = {}
86
+ with safe_open(file_path, framework="pt", device="cpu") as f:
87
+ for k in f.keys():
88
+ state_dict[k] = f.get_tensor(k)
89
+ if torch_dtype is not None:
90
+ state_dict[k] = state_dict[k].to(torch_dtype)
91
+ return state_dict
92
+
93
+
94
+ def load_state_dict_from_bin(file_path, torch_dtype=None):
95
+ state_dict = torch.load(file_path, map_location="cpu", weights_only=True)
96
+ if torch_dtype is not None:
97
+ for i in state_dict:
98
+ if isinstance(state_dict[i], torch.Tensor):
99
+ state_dict[i] = state_dict[i].to(torch_dtype)
100
+ return state_dict
101
+
102
+ def smart_load_weights(model, ckpt_state_dict):
103
+ model_state_dict = model.state_dict()
104
+ new_state_dict = {}
105
+
106
+ for name, param in model_state_dict.items():
107
+ if name in ckpt_state_dict:
108
+ ckpt_param = ckpt_state_dict[name]
109
+ if param.shape == ckpt_param.shape:
110
+ new_state_dict[name] = ckpt_param
111
+ else:
112
+ # 自动修剪维度以匹配
113
+ if all(p >= c for p, c in zip(param.shape, ckpt_param.shape)):
114
+ print(f"[Truncate] {name}: ckpt {ckpt_param.shape} -> model {param.shape}")
115
+ # 创建新张量,拷贝旧数据
116
+ new_param = param.clone()
117
+ slices = tuple(slice(0, s) for s in ckpt_param.shape)
118
+ new_param[slices] = ckpt_param
119
+ new_state_dict[name] = new_param
120
+ else:
121
+ print(f"[Skip] {name}: ckpt {ckpt_param.shape} is larger than model {param.shape}")
122
+
123
+ # 更新 state_dict,只更新那些匹配的
124
+ missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, assign=True, strict=False)
125
+ return model, missing_keys, unexpected_keys
126
+
127
+ def save_wav(audio, audio_path):
128
+ if isinstance(audio, torch.Tensor):
129
+ audio = audio.float().detach().cpu().numpy()
130
+
131
+ if audio.ndim == 1:
132
+ audio = np.expand_dims(audio, axis=0) # (1, samples)
133
+
134
+ sf.write(audio_path, audio.T, 16000)
135
+
136
+ return True
137
+
138
+ def save_video_as_grid_and_mp4(video_batch: torch.Tensor, save_path: str, fps: float = 5,prompt=None, prompt_path=None, audio=None, audio_path=None, prefix=None):
139
+ os.makedirs(save_path, exist_ok=True)
140
+ out_videos = []
141
+
142
+ with tempfile.TemporaryDirectory() as tmp_path:
143
+
144
+ print(f'video batch shape:{video_batch.shape}')
145
+
146
+ for i, vid in enumerate(video_batch):
147
+ gif_frames = []
148
+
149
+ for frame in vid:
150
+ ft = frame.detach().cpu().clone()
151
+ ft = rearrange(ft, "c h w -> h w c")
152
+ arr = (255.0 * ft).numpy().astype(np.uint8)
153
+ gif_frames.append(arr)
154
+
155
+ if prefix is not None:
156
+ now_save_path = os.path.join(save_path, f"{prefix}_{i:03d}.mp4")
157
+ tmp_save_path = os.path.join(tmp_path, f"{prefix}_{i:03d}.mp4")
158
+ else:
159
+ now_save_path = os.path.join(save_path, f"{i:03d}.mp4")
160
+ tmp_save_path = os.path.join(tmp_path, f"{i:03d}.mp4")
161
+ with imageio.get_writer(tmp_save_path, fps=fps) as writer:
162
+ for frame in gif_frames:
163
+ writer.append_data(frame)
164
+ subprocess.run([f"cp {tmp_save_path} {now_save_path}"], check=True, shell=True)
165
+ print(f'save res video to : {now_save_path}')
166
+ final_video_path = now_save_path
167
+
168
+ if audio is not None or audio_path is not None:
169
+ if audio is not None:
170
+ audio_path = os.path.join(tmp_path, f"{i:06d}.mp3")
171
+ save_wav(audio[i], audio_path)
172
+ # cmd = f'/usr/bin/ffmpeg -i {tmp_save_path} -i {audio_path} -v quiet -c:v copy -c:a libmp3lame -strict experimental {tmp_save_path[:-4]}_wav.mp4 -y'
173
+ cmd = f'/usr/bin/ffmpeg -i {tmp_save_path} -i {audio_path} -v quiet -map 0:v:0 -map 1:a:0 -c:v copy -c:a aac {tmp_save_path[:-4]}_wav.mp4 -y'
174
+ subprocess.check_call(cmd, stdout=None, stdin=subprocess.PIPE, shell=True)
175
+ final_video_path = f"{now_save_path[:-4]}_wav.mp4"
176
+ subprocess.run([f"cp {tmp_save_path[:-4]}_wav.mp4 {final_video_path}"], check=True, shell=True)
177
+ os.remove(now_save_path)
178
+ if prompt is not None and prompt_path is not None:
179
+ with open(prompt_path, "w") as f:
180
+ f.write(prompt)
181
+ out_videos.append(final_video_path)
182
+
183
+ return out_videos
184
+
185
+ def is_zero_stage_3(trainer):
186
+ strategy = getattr(trainer, "strategy", None)
187
+ if strategy and hasattr(strategy, "model"):
188
+ ds_engine = strategy.model
189
+ stage = ds_engine.config.get("zero_optimization", {}).get("stage", 0)
190
+ return stage == 3
191
+ return False
192
+
193
+ def hash_state_dict_keys(state_dict, with_shape=True):
194
+ keys_str = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
195
+ keys_str = keys_str.encode(encoding="UTF-8")
196
+ return hashlib.md5(keys_str).hexdigest()
197
+
198
+ def split_state_dict_with_prefix(state_dict):
199
+ keys = sorted([key for key in state_dict if isinstance(key, str)])
200
+ prefix_dict = {}
201
+ for key in keys:
202
+ prefix = key if "." not in key else key.split(".")[0]
203
+ if prefix not in prefix_dict:
204
+ prefix_dict[prefix] = []
205
+ prefix_dict[prefix].append(key)
206
+ state_dicts = []
207
+ for prefix, keys in prefix_dict.items():
208
+ sub_state_dict = {key: state_dict[key] for key in keys}
209
+ state_dicts.append(sub_state_dict)
210
+ return state_dicts
211
+
231
+ def search_for_files(folder, extensions):
232
+ files = []
233
+ if os.path.isdir(folder):
234
+ for file in sorted(os.listdir(folder)):
235
+ files += search_for_files(os.path.join(folder, file), extensions)
236
+ elif os.path.isfile(folder):
237
+ for extension in extensions:
238
+ if folder.endswith(extension):
239
+ files.append(folder)
240
+ break
241
+ return files
242
+
243
+ def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
244
+ keys = []
245
+ for key, value in state_dict.items():
246
+ if isinstance(key, str):
247
+ if isinstance(value, torch.Tensor):
248
+ if with_shape:
249
+ shape = "_".join(map(str, list(value.shape)))
250
+ keys.append(key + ":" + shape)
251
+ keys.append(key)
252
+ elif isinstance(value, dict):
253
+ keys.append(key + "|" + convert_state_dict_keys_to_single_str(value, with_shape=with_shape))
254
+ keys.sort()
255
+ keys_str = ",".join(keys)
256
+ return keys_str
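
hash_state_dict_keys fingerprints a checkpoint by its key names (and optionally their shapes), not by the weight values, which is how the loading code can recognize a file's architecture. A small illustration with dummy tensors; it assumes the repository's requirements are installed, since io_utils imports its media dependencies at module level.

import torch
from OmniAvatar.utils.io_utils import hash_state_dict_keys

sd_a = {"blocks.0.attn.q.weight": torch.zeros(8, 8), "blocks.0.attn.k.weight": torch.zeros(8, 8)}
sd_b = {k: torch.randn_like(v) for k, v in sd_a.items()}   # same keys/shapes, different values

print(hash_state_dict_keys(sd_a) == hash_state_dict_keys(sd_b))   # True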
OmniAvatar/vram_management/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .layers import *
OmniAvatar/vram_management/layers.py ADDED
@@ -0,0 +1,95 @@
1
+ import torch, copy
2
+ from ..utils.io_utils import init_weights_on_device
3
+
4
+
5
+ def cast_to(weight, dtype, device):
6
+ r = torch.empty_like(weight, dtype=dtype, device=device)
7
+ r.copy_(weight)
8
+ return r
9
+
10
+
11
+ class AutoWrappedModule(torch.nn.Module):
12
+ def __init__(self, module: torch.nn.Module, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device):
13
+ super().__init__()
14
+ self.module = module.to(dtype=offload_dtype, device=offload_device)
15
+ self.offload_dtype = offload_dtype
16
+ self.offload_device = offload_device
17
+ self.onload_dtype = onload_dtype
18
+ self.onload_device = onload_device
19
+ self.computation_dtype = computation_dtype
20
+ self.computation_device = computation_device
21
+ self.state = 0
22
+
23
+ def offload(self):
24
+ if self.state == 1 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
25
+ self.module.to(dtype=self.offload_dtype, device=self.offload_device)
26
+ self.state = 0
27
+
28
+ def onload(self):
29
+ if self.state == 0 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
30
+ self.module.to(dtype=self.onload_dtype, device=self.onload_device)
31
+ self.state = 1
32
+
33
+ def forward(self, *args, **kwargs):
34
+ if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
35
+ module = self.module
36
+ else:
37
+ module = copy.deepcopy(self.module).to(dtype=self.computation_dtype, device=self.computation_device)
38
+ return module(*args, **kwargs)
39
+
40
+
41
+ class AutoWrappedLinear(torch.nn.Linear):
42
+ def __init__(self, module: torch.nn.Linear, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device):
43
+ with init_weights_on_device(device=torch.device("meta")):
44
+ super().__init__(in_features=module.in_features, out_features=module.out_features, bias=module.bias is not None, dtype=offload_dtype, device=offload_device)
45
+ self.weight = module.weight
46
+ self.bias = module.bias
47
+ self.offload_dtype = offload_dtype
48
+ self.offload_device = offload_device
49
+ self.onload_dtype = onload_dtype
50
+ self.onload_device = onload_device
51
+ self.computation_dtype = computation_dtype
52
+ self.computation_device = computation_device
53
+ self.state = 0
54
+
55
+ def offload(self):
56
+ if self.state == 1 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
57
+ self.to(dtype=self.offload_dtype, device=self.offload_device)
58
+ self.state = 0
59
+
60
+ def onload(self):
61
+ if self.state == 0 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
62
+ self.to(dtype=self.onload_dtype, device=self.onload_device)
63
+ self.state = 1
64
+
65
+ def forward(self, x, *args, **kwargs):
66
+ if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
67
+ weight, bias = self.weight, self.bias
68
+ else:
69
+ weight = cast_to(self.weight, self.computation_dtype, self.computation_device)
70
+ bias = None if self.bias is None else cast_to(self.bias, self.computation_dtype, self.computation_device)
71
+ return torch.nn.functional.linear(x, weight, bias)
72
+
73
+
74
+ def enable_vram_management_recursively(model: torch.nn.Module, module_map: dict, module_config: dict, max_num_param=None, overflow_module_config: dict = None, total_num_param=0):
75
+ for name, module in model.named_children():
76
+ for source_module, target_module in module_map.items():
77
+ if isinstance(module, source_module):
78
+ num_param = sum(p.numel() for p in module.parameters())
79
+ if max_num_param is not None and total_num_param + num_param > max_num_param:
80
+ module_config_ = overflow_module_config
81
+ else:
82
+ module_config_ = module_config
83
+ module_ = target_module(module, **module_config_)
84
+ setattr(model, name, module_)
85
+ total_num_param += num_param
86
+ break
87
+ else:
88
+ total_num_param = enable_vram_management_recursively(module, module_map, module_config, max_num_param, overflow_module_config, total_num_param)
89
+ return total_num_param
90
+
91
+
92
+ def enable_vram_management(model: torch.nn.Module, module_map: dict, module_config: dict, max_num_param=None, overflow_module_config: dict = None):
93
+ enable_vram_management_recursively(model, module_map, module_config, max_num_param, overflow_module_config, total_num_param=0)
94
+ model.vram_management_enabled = True
95
+
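
enable_vram_management walks the module tree and swaps every matching submodule for its wrapper; a wrapped Linear keeps its weights at the offload dtype/device and casts them to the computation dtype/device only inside forward. A toy, CPU-only sketch follows; the model and the config values are illustrative, not the pipeline's real settings.

import torch
from OmniAvatar.vram_management import enable_vram_management, AutoWrappedLinear

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU(), torch.nn.Linear(16, 4))
enable_vram_management(
    model,
    module_map={torch.nn.Linear: AutoWrappedLinear},
    module_config=dict(
        offload_dtype=torch.float32, offload_device="cpu",
        onload_dtype=torch.float32, onload_device="cpu",
        computation_dtype=torch.float32, computation_device="cpu",   # "cuda" on a GPU machine
    ),
)
# Linear layers are now AutoWrappedLinear instances; the forward pass still works.
print(type(model[0]).__name__, model(torch.randn(2, 16)).shape)   # AutoWrappedLinear torch.Size([2, 4])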
OmniAvatar/wan_video.py ADDED
@@ -0,0 +1,344 @@
1
+ import types
2
+ from .models.model_manager import ModelManager
3
+ from .models.wan_video_dit import WanModel
4
+ from .models.wan_video_text_encoder import WanTextEncoder
5
+ from .models.wan_video_vae import WanVideoVAE
6
+ from .schedulers.flow_match import FlowMatchScheduler
7
+ from .base import BasePipeline
8
+ from .prompters import WanPrompter
9
+ import torch, os
10
+ from einops import rearrange
11
+ import numpy as np
12
+ from PIL import Image
13
+ from tqdm import tqdm
14
+ from typing import Optional
15
+ from .vram_management import enable_vram_management, AutoWrappedModule, AutoWrappedLinear
16
+ from .models.wan_video_text_encoder import T5RelativeEmbedding, T5LayerNorm
17
+ from .models.wan_video_dit import RMSNorm
18
+ from .models.wan_video_vae import RMS_norm, CausalConv3d, Upsample
19
+
20
+
21
+ class WanVideoPipeline(BasePipeline):
22
+
23
+ def __init__(self, device="cuda", torch_dtype=torch.float16, tokenizer_path=None):
24
+ super().__init__(device=device, torch_dtype=torch_dtype)
25
+ self.scheduler = FlowMatchScheduler(shift=5, sigma_min=0.0, extra_one_step=True)
26
+ self.prompter = WanPrompter(tokenizer_path=tokenizer_path)
27
+ self.text_encoder: WanTextEncoder = None
28
+ self.image_encoder = None
29
+ self.dit: WanModel = None
30
+ self.vae: WanVideoVAE = None
31
+ self.model_names = ['text_encoder', 'dit', 'vae', 'image_encoder']
32
+ self.height_division_factor = 16
33
+ self.width_division_factor = 16
34
+ self.use_unified_sequence_parallel = False
35
+ self.sp_size = 1
36
+
37
+
38
+ def enable_vram_management(self, num_persistent_param_in_dit=None):
39
+ dtype = next(iter(self.text_encoder.parameters())).dtype
40
+ enable_vram_management(
41
+ self.text_encoder,
42
+ module_map = {
43
+ torch.nn.Linear: AutoWrappedLinear,
44
+ torch.nn.Embedding: AutoWrappedModule,
45
+ T5RelativeEmbedding: AutoWrappedModule,
46
+ T5LayerNorm: AutoWrappedModule,
47
+ },
48
+ module_config = dict(
49
+ offload_dtype=dtype,
50
+ offload_device="cpu",
51
+ onload_dtype=dtype,
52
+ onload_device="cpu",
53
+ computation_dtype=self.torch_dtype,
54
+ computation_device=self.device,
55
+ ),
56
+ )
57
+ dtype = next(iter(self.dit.parameters())).dtype
58
+ enable_vram_management(
59
+ self.dit,
60
+ module_map = {
61
+ torch.nn.Linear: AutoWrappedLinear,
62
+ torch.nn.Conv3d: AutoWrappedModule,
63
+ torch.nn.LayerNorm: AutoWrappedModule,
64
+ RMSNorm: AutoWrappedModule,
65
+ },
66
+ module_config = dict(
67
+ offload_dtype=dtype,
68
+ offload_device="cpu",
69
+ onload_dtype=dtype,
70
+ onload_device=self.device,
71
+ computation_dtype=self.torch_dtype,
72
+ computation_device=self.device,
73
+ ),
74
+ max_num_param=num_persistent_param_in_dit,
75
+ overflow_module_config = dict(
76
+ offload_dtype=dtype,
77
+ offload_device="cpu",
78
+ onload_dtype=dtype,
79
+ onload_device="cpu",
80
+ computation_dtype=self.torch_dtype,
81
+ computation_device=self.device,
82
+ ),
83
+ )
84
+ dtype = next(iter(self.vae.parameters())).dtype
85
+ enable_vram_management(
86
+ self.vae,
87
+ module_map = {
88
+ torch.nn.Linear: AutoWrappedLinear,
89
+ torch.nn.Conv2d: AutoWrappedModule,
90
+ RMS_norm: AutoWrappedModule,
91
+ CausalConv3d: AutoWrappedModule,
92
+ Upsample: AutoWrappedModule,
93
+ torch.nn.SiLU: AutoWrappedModule,
94
+ torch.nn.Dropout: AutoWrappedModule,
95
+ },
96
+ module_config = dict(
97
+ offload_dtype=dtype,
98
+ offload_device="cpu",
99
+ onload_dtype=dtype,
100
+ onload_device=self.device,
101
+ computation_dtype=self.torch_dtype,
102
+ computation_device=self.device,
103
+ ),
104
+ )
105
+ if self.image_encoder is not None:
106
+ dtype = next(iter(self.image_encoder.parameters())).dtype
107
+ enable_vram_management(
108
+ self.image_encoder,
109
+ module_map = {
110
+ torch.nn.Linear: AutoWrappedLinear,
111
+ torch.nn.Conv2d: AutoWrappedModule,
112
+ torch.nn.LayerNorm: AutoWrappedModule,
113
+ },
114
+ module_config = dict(
115
+ offload_dtype=dtype,
116
+ offload_device="cpu",
117
+ onload_dtype=dtype,
118
+ onload_device="cpu",
119
+ computation_dtype=dtype,
120
+ computation_device=self.device,
121
+ ),
122
+ )
123
+ self.enable_cpu_offload()
124
+
125
+
126
+ def fetch_models(self, model_manager: ModelManager):
127
+ text_encoder_model_and_path = model_manager.fetch_model("wan_video_text_encoder", require_model_path=True)
128
+ if text_encoder_model_and_path is not None:
129
+ self.text_encoder, tokenizer_path = text_encoder_model_and_path
130
+ self.prompter.fetch_models(self.text_encoder)
131
+ self.prompter.fetch_tokenizer(os.path.join(os.path.dirname(tokenizer_path), "google/umt5-xxl"))
132
+ self.dit = model_manager.fetch_model("wan_video_dit")
133
+ self.vae = model_manager.fetch_model("wan_video_vae")
134
+ self.image_encoder = model_manager.fetch_model("wan_video_image_encoder")
135
+
136
+
137
+ @staticmethod
138
+ def from_model_manager(model_manager: ModelManager, torch_dtype=None, device=None, use_usp=False, infer=False):
139
+ if device is None: device = model_manager.device
140
+ if torch_dtype is None: torch_dtype = model_manager.torch_dtype
141
+ pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
142
+ pipe.fetch_models(model_manager)
143
+ if use_usp:
144
+ from xfuser.core.distributed import get_sequence_parallel_world_size, get_sp_group
145
+ from OmniAvatar.distributed.xdit_context_parallel import usp_attn_forward
146
+ for block in pipe.dit.blocks:
147
+ block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn)
148
+ pipe.sp_size = get_sequence_parallel_world_size()
149
+ pipe.use_unified_sequence_parallel = True
150
+ pipe.sp_group = get_sp_group()
151
+ return pipe
152
+
153
+
154
+ def denoising_model(self):
155
+ return self.dit
156
+
157
+
158
+ def encode_prompt(self, prompt, positive=True):
159
+ prompt_emb = self.prompter.encode_prompt(prompt, positive=positive, device=self.device)
160
+ return {"context": prompt_emb}
161
+
162
+
163
+ def encode_image(self, image, num_frames, height, width):
164
+ image = self.preprocess_image(image.resize((width, height))).to(self.device, dtype=self.torch_dtype)
165
+ clip_context = self.image_encoder.encode_image([image])
166
+ clip_context = clip_context.to(dtype=self.torch_dtype)
167
+ msk = torch.ones(1, num_frames, height//8, width//8, device=self.device, dtype=self.torch_dtype)
168
+ msk[:, 1:] = 0
169
+ msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
170
+ msk = msk.view(1, msk.shape[1] // 4, 4, height//8, width//8)
171
+ msk = msk.transpose(1, 2)[0]
172
+
173
+ vae_input = torch.concat([image.transpose(0, 1), torch.zeros(3, num_frames-1, height, width).to(image.device, dtype=self.torch_dtype)], dim=1)
174
+ y = self.vae.encode([vae_input.to(dtype=self.torch_dtype, device=self.device)], device=self.device)[0]
175
+ y = torch.concat([msk, y])
176
+ y = y.unsqueeze(0)
177
+ clip_context = clip_context.to(dtype=self.torch_dtype, device=self.device)
178
+ y = y.to(dtype=self.torch_dtype, device=self.device)
179
+ return {"clip_feature": clip_context, "y": y}
180
+
181
+
182
+ def tensor2video(self, frames):
183
+ frames = rearrange(frames, "C T H W -> T H W C")
184
+ frames = ((frames.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8)
185
+ frames = [Image.fromarray(frame) for frame in frames]
186
+ return frames
187
+
188
+
189
+ def prepare_extra_input(self, latents=None):
190
+ return {}
191
+
192
+
193
+ def encode_video(self, input_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
194
+ latents = self.vae.encode(input_video, device=self.device, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
195
+ return latents
196
+
197
+
198
+ def decode_video(self, latents, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
199
+ frames = self.vae.decode(latents, device=self.device, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
200
+ return frames
201
+
202
+
203
+ def prepare_unified_sequence_parallel(self):
204
+ return {"use_unified_sequence_parallel": self.use_unified_sequence_parallel}
205
+
206
+
207
+ @torch.no_grad()
208
+ def log_video(
209
+ self,
210
+ lat,
211
+ prompt,
212
+ fixed_frame=0, # lat frames
213
+ image_emb={},
214
+ audio_emb={},
215
+ negative_prompt="",
216
+ cfg_scale=5.0,
217
+ audio_cfg_scale=5.0,
218
+ num_inference_steps=50,
219
+ denoising_strength=1.0,
220
+ sigma_shift=5.0,
221
+ tiled=True,
222
+ tile_size=(30, 52),
223
+ tile_stride=(15, 26),
224
+ tea_cache_l1_thresh=None,
225
+ tea_cache_model_id="",
226
+ progress_bar_cmd=None,
227
+ return_latent=False,
228
+ ):
229
+ tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}
230
+ # Scheduler
231
+ self.scheduler.set_timesteps(num_inference_steps, denoising_strength=denoising_strength, shift=sigma_shift)
232
+
233
+ lat = lat.to(dtype=self.torch_dtype)
234
+ latents = lat.clone()
235
+ latents = torch.randn_like(latents, dtype=self.torch_dtype)
236
+
237
+ # Encode prompts
238
+ self.load_models_to_device(["text_encoder"])
239
+ prompt_emb_posi = self.encode_prompt(prompt, positive=True)
240
+ if cfg_scale != 1.0:
241
+ prompt_emb_nega = self.encode_prompt(negative_prompt, positive=False)
242
+
243
+ # Extra input
244
+ extra_input = self.prepare_extra_input(latents)
245
+
246
+ # TeaCache
247
+ tea_cache_posi = {"tea_cache": None}
248
+ tea_cache_nega = {"tea_cache": None}
249
+
250
+ # Denoise
251
+ self.load_models_to_device(["dit"])
252
+ for progress_id, timestep in enumerate(tqdm(self.scheduler.timesteps) if progress_bar_cmd is None else self.scheduler.timesteps ):
253
+ if fixed_frame > 0: # new
254
+ latents[:, :, :fixed_frame] = lat[:, :, :fixed_frame]
255
+ timestep = timestep.unsqueeze(0).to(dtype=self.torch_dtype, device=self.device)
256
+
257
+ # Inference
258
+ noise_pred_posi = self.dit(x=latents, timestep=timestep, **prompt_emb_posi, **image_emb, **audio_emb, **tea_cache_posi, **extra_input)
259
+
260
+ if cfg_scale != 1.0:
261
+ audio_emb_uc = {}
262
+ for key in audio_emb.keys():
263
+ audio_emb_uc[key] = torch.zeros_like(audio_emb[key], dtype=self.torch_dtype)
264
+ if audio_cfg_scale == cfg_scale:
265
+ noise_pred_nega = self.dit(x=latents, timestep=timestep, **prompt_emb_nega, **image_emb, **audio_emb_uc, **tea_cache_nega, **extra_input)
266
+ noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
267
+ else:
268
+ tea_cache_nega_audio = {"tea_cache": None}
269
+ audio_noise_pred_nega = self.dit(x=latents, timestep=timestep, **prompt_emb_posi, **image_emb, **audio_emb_uc, **tea_cache_nega_audio, **extra_input)
270
+ text_noise_pred_nega = self.dit(x=latents, timestep=timestep, **prompt_emb_nega, **image_emb, **audio_emb_uc, **tea_cache_nega, **extra_input)
271
+ noise_pred = text_noise_pred_nega + cfg_scale * (audio_noise_pred_nega - text_noise_pred_nega) + audio_cfg_scale * (noise_pred_posi - audio_noise_pred_nega)
272
+ else:
273
+ noise_pred = noise_pred_posi
274
+ # Scheduler
275
+ latents = self.scheduler.step(noise_pred, self.scheduler.timesteps[progress_id], latents)
276
+
277
+ if progress_bar_cmd is not None:
278
+ progress_bar_cmd.update(1)
279
+
280
+
281
+ if fixed_frame > 0: # new
282
+ latents[:, :, :fixed_frame] = lat[:, :, :fixed_frame]
283
+ # Decode
284
+ self.load_models_to_device(['vae'])
285
+ frames = self.decode_video(latents, **tiler_kwargs)
286
+ recons = self.decode_video(lat, **tiler_kwargs)
287
+ self.load_models_to_device([])
288
+ frames = (frames.permute(0, 2, 1, 3, 4).float() + 1.0) / 2.0
289
+ recons = (recons.permute(0, 2, 1, 3, 4).float() + 1.0) / 2.0
290
+ if return_latent:
291
+ return frames, recons, latents
292
+ return frames, recons
293
+
294
+
295
+ class TeaCache:
296
+ def __init__(self, num_inference_steps, rel_l1_thresh, model_id):
297
+ self.num_inference_steps = num_inference_steps
298
+ self.step = 0
299
+ self.accumulated_rel_l1_distance = 0
300
+ self.previous_modulated_input = None
301
+ self.rel_l1_thresh = rel_l1_thresh
302
+ self.previous_residual = None
303
+ self.previous_hidden_states = None
304
+
305
+ self.coefficients_dict = {
306
+ "Wan2.1-T2V-1.3B": [-5.21862437e+04, 9.23041404e+03, -5.28275948e+02, 1.36987616e+01, -4.99875664e-02],
307
+ "Wan2.1-T2V-14B": [-3.03318725e+05, 4.90537029e+04, -2.65530556e+03, 5.87365115e+01, -3.15583525e-01],
308
+ "Wan2.1-I2V-14B-480P": [2.57151496e+05, -3.54229917e+04, 1.40286849e+03, -1.35890334e+01, 1.32517977e-01],
309
+ "Wan2.1-I2V-14B-720P": [ 8.10705460e+03, 2.13393892e+03, -3.72934672e+02, 1.66203073e+01, -4.17769401e-02],
310
+ }
311
+ if model_id not in self.coefficients_dict:
312
+ supported_model_ids = ", ".join([i for i in self.coefficients_dict])
313
+ raise ValueError(f"{model_id} is not a supported TeaCache model id. Please choose a valid model id in ({supported_model_ids}).")
314
+ self.coefficients = self.coefficients_dict[model_id]
315
+
316
+ def check(self, dit: WanModel, x, t_mod):
317
+ modulated_inp = t_mod.clone()
318
+ if self.step == 0 or self.step == self.num_inference_steps - 1:
319
+ should_calc = True
320
+ self.accumulated_rel_l1_distance = 0
321
+ else:
322
+ coefficients = self.coefficients
323
+ rescale_func = np.poly1d(coefficients)
324
+ self.accumulated_rel_l1_distance += rescale_func(((modulated_inp-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
325
+ if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
326
+ should_calc = False
327
+ else:
328
+ should_calc = True
329
+ self.accumulated_rel_l1_distance = 0
330
+ self.previous_modulated_input = modulated_inp
331
+ self.step += 1
332
+ if self.step == self.num_inference_steps:
333
+ self.step = 0
334
+ if should_calc:
335
+ self.previous_hidden_states = x.clone()
336
+ return not should_calc
337
+
338
+ def store(self, hidden_states):
339
+ self.previous_residual = hidden_states - self.previous_hidden_states
340
+ self.previous_hidden_states = None
341
+
342
+ def update(self, hidden_states):
343
+ hidden_states = hidden_states + self.previous_residual
344
+ return hidden_states
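For context on how the TeaCache class above is driven during sampling: check() decides per denoising step whether the full transformer pass can be skipped, store() caches the residual of a completed pass, and update() replays that residual on later steps. A minimal sketch of the control flow, assuming a run_blocks callable that stands in for the DiT's transformer blocks (the real wiring is inside the model that receives the tea_cache kwarg, e.g. OmniAvatar/models/wan_video_dit.py):

def dit_forward_with_tea_cache(x, t_mod, tea_cache, run_blocks):
    # check() snapshots x and returns True when the accumulated relative L1
    # change of t_mod is still below rel_l1_thresh, i.e. the cached residual
    # can be reused (the dit argument is not used inside check()).
    if tea_cache is not None and tea_cache.check(None, x, t_mod):
        return tea_cache.update(x)      # x + previous_residual, blocks skipped
    x_out = run_blocks(x, t_mod)        # full transformer pass
    if tea_cache is not None:
        tea_cache.store(x_out)          # residual = x_out - snapshot from check()
    return x_out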
README.md CHANGED
@@ -1,12 +1,13 @@
1
- ---
2
- title: OmniAvatar Clay Fast
3
- emoji: 📚
4
- colorFrom: green
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.44.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
+ ---
2
+ title: OmniAvatar-Clay-Fast
3
+ emoji: 🐨
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.36.2
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: Generate a claymation-style avatar to host your podcast
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,942 @@
1
+ import spaces
2
+ import subprocess
3
+ import gradio as gr
4
+
5
+ import os, sys
6
+ from glob import glob
7
+ from datetime import datetime
8
+ import math
9
+ import random
10
+ import librosa
11
+ import numpy as np
12
+ import uuid
13
+ import shutil
14
+ from tqdm import tqdm
15
+
16
+ import importlib, site, sys
17
+ from huggingface_hub import hf_hub_download, snapshot_download
18
+
19
+ # Re-discover all .pth/.egg-link files
20
+ for sitedir in site.getsitepackages():
21
+ site.addsitedir(sitedir)
22
+
23
+ # Clear caches so importlib will pick up new modules
24
+ importlib.invalidate_caches()
25
+
26
+ def sh(cmd): subprocess.check_call(cmd, shell=True)
27
+
28
+ flash_attention_installed = False
29
+
30
+ try:
31
+ print("Attempting to download and install FlashAttention wheel...")
32
+ flash_attention_wheel = hf_hub_download(
33
+ repo_id="alexnasa/flash-attn-3",
34
+ repo_type="model",
35
+ filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
36
+ )
37
+
38
+ sh(f"pip install {flash_attention_wheel}")
39
+
40
+ # tell Python to re-scan site-packages now that the new wheel is installed
41
+ import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
42
+
43
+ flash_attention_installed = True
44
+ print("FlashAttention installed successfully.")
45
+
46
+ except Exception as e:
47
+ print(f"⚠️ Could not install FlashAttention: {e}")
48
+ print("Continuing without FlashAttention...")
49
+
50
+ import torch
51
+ print(f"Torch version: {torch.__version__}")
52
+ # print(f"FlashAttention available: {flash_attention_installed}")
53
+
54
+
55
+
56
+ import torch.nn as nn
57
+ from tqdm import tqdm
58
+ from functools import partial
59
+ from omegaconf import OmegaConf
60
+ from argparse import Namespace
61
+ from gradio_extendedimage import extendedimage
62
+
63
+ import torchaudio
64
+
65
+ # load the one true config you dumped
66
+ _args_cfg = OmegaConf.load("args_config.yaml")
67
+ args = Namespace(**OmegaConf.to_container(_args_cfg, resolve=True))
68
+
69
+ from OmniAvatar.utils.args_config import set_global_args
70
+
71
+ set_global_args(args)
72
+ # args = parse_args()
73
+
74
+ from OmniAvatar.utils.io_utils import load_state_dict
75
+ from peft import LoraConfig, inject_adapter_in_model
76
+ from OmniAvatar.models.model_manager import ModelManager
77
+ from OmniAvatar.schedulers.flow_match import FlowMatchScheduler
78
+ from OmniAvatar.wan_video import WanVideoPipeline
79
+ from OmniAvatar.utils.io_utils import save_video_as_grid_and_mp4
80
+ import torchvision.transforms as TT
81
+ from transformers import Wav2Vec2FeatureExtractor
82
+ import torchvision.transforms as transforms
83
+ import torch.nn.functional as F
84
+ from OmniAvatar.utils.audio_preprocess import add_silence_to_audio_ffmpeg
85
+
86
+ from diffusers import FluxKontextPipeline
87
+ from diffusers.utils import load_image
88
+
89
+ from PIL import Image
90
+
91
+
92
+ os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/processed_results"
93
+
94
+
95
+ flux_pipe = FluxKontextPipeline.from_pretrained("black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16)
96
+ flux_pipe.load_lora_weights("alexnasa/Claymation-Kontext-Dev-Lora")
97
+ flux_pipe.to("cuda")
98
+ flux_inference = 10
99
+
100
+ def tensor_to_pil(tensor):
101
+ """
102
+ Args:
103
+ tensor: torch.Tensor with shape like
104
+ (1, C, H, W), (1, C, 1, H, W), (C, H, W), etc.
105
+ values in [-1, 1], on any device.
106
+ Returns:
107
+ A PIL.Image in RGB mode.
108
+ """
109
+ # 1) Remove batch dim if it exists
110
+ if tensor.dim() > 3 and tensor.shape[0] == 1:
111
+ tensor = tensor[0]
112
+
113
+ # 2) Squeeze out any other singleton dims (e.g. that extra frame axis)
114
+ tensor = tensor.squeeze()
115
+
116
+ # Now we should have exactly 3 dims: (C, H, W)
117
+ if tensor.dim() != 3:
118
+ raise ValueError(f"Expected 3 dims after squeeze, got {tensor.dim()}")
119
+
120
+ # 3) Move to CPU float32
121
+ tensor = tensor.cpu().float()
122
+
123
+ # 4) Undo normalization from [-1,1] -> [0,1]
124
+ tensor = (tensor + 1.0) / 2.0
125
+
126
+ # 5) Clamp to [0,1]
127
+ tensor = torch.clamp(tensor, 0.0, 1.0)
128
+
129
+ # 6) To NumPy H×W×C in [0,255]
130
+ np_img = (tensor.permute(1, 2, 0).numpy() * 255.0).round().astype("uint8")
131
+
132
+ # 7) Build PIL Image
133
+ return Image.fromarray(np_img)
134
+
135
+
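A quick usage sketch for tensor_to_pil (illustrative only; the dummy tensor mimics the (B, C, T, H, W) frames handled elsewhere in this file, and preview.jpg is a made-up output path):

import torch
dummy = torch.rand(1, 3, 1, 400, 720) * 2.0 - 1.0   # (B, C, T, H, W), values in [-1, 1]
tensor_to_pil(dummy).save("preview.jpg")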
136
+ def set_seed(seed: int = 42):
137
+ random.seed(seed)
138
+ np.random.seed(seed)
139
+ torch.manual_seed(seed)
140
+ torch.cuda.manual_seed(seed) # seed the current GPU
141
+ torch.cuda.manual_seed_all(seed) # seed all GPUs
142
+
143
+ def read_from_file(p):
144
+ with open(p, "r") as fin:
145
+ for l in fin:
146
+ yield l.strip()
147
+
148
+ def match_size(image_size, h, w):
149
+ ratio_ = 9999
150
+ size_ = 9999
151
+ select_size = None
152
+ for image_s in image_size:
153
+ ratio_tmp = abs(image_s[0] / image_s[1] - h / w)
154
+ size_tmp = abs(max(image_s) - max(w, h))
155
+ if ratio_tmp < ratio_:
156
+ ratio_ = ratio_tmp
157
+ size_ = size_tmp
158
+ select_size = image_s
159
+ if ratio_ == ratio_tmp:
160
+ if size_ == size_tmp:
161
+ select_size = image_s
162
+ return select_size
163
+
164
+ def resize_pad(image, ori_size, tgt_size):
165
+ h, w = ori_size
166
+ scale_ratio = max(tgt_size[0] / h, tgt_size[1] / w)
167
+ scale_h = int(h * scale_ratio)
168
+ scale_w = int(w * scale_ratio)
169
+
170
+ image = transforms.Resize(size=[scale_h, scale_w])(image)
171
+
172
+ padding_h = tgt_size[0] - scale_h
173
+ padding_w = tgt_size[1] - scale_w
174
+ pad_top = padding_h // 2
175
+ pad_bottom = padding_h - pad_top
176
+ pad_left = padding_w // 2
177
+ pad_right = padding_w - pad_left
178
+
179
+ image = F.pad(image, (pad_left, pad_right, pad_top, pad_bottom), mode='constant', value=0)
180
+ return image
181
+
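A small sketch of how match_size and resize_pad work together (illustrative; the bucket list mirrors image_sizes_720 in configs/inference.yaml):

import torch
buckets = [[400, 720], [720, 720], [720, 400]]
h, w = 1920, 1080                                        # portrait input
tgt = match_size(buckets, h, w)                          # -> [720, 400], closest aspect ratio
out = resize_pad(torch.rand(1, 3, h, w), (h, w), tgt)    # resize to cover, then center pad/crop
print(tgt, out.shape)                                    # [720, 400] torch.Size([1, 3, 720, 400])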
182
+ class WanInferencePipeline(nn.Module):
183
+ def __init__(self, args):
184
+ super().__init__()
185
+ self.args = args
186
+ self.device = torch.device(f"cuda")
187
+ self.dtype = torch.bfloat16
188
+ self.pipe = self.load_model()
189
+ chained_transforms = []
190
+ chained_transforms.append(TT.ToTensor())
191
+ self.transform = TT.Compose(chained_transforms)
192
+
193
+ if self.args.use_audio:
194
+ from OmniAvatar.models.wav2vec import Wav2VecModel
195
+ self.wav_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
196
+ self.args.wav2vec_path
197
+ )
198
+ self.audio_encoder = Wav2VecModel.from_pretrained(self.args.wav2vec_path, local_files_only=True).to(device=self.device, dtype=self.dtype)
199
+ self.audio_encoder.feature_extractor._freeze_parameters()
200
+
201
+
202
+ def load_model(self):
203
+ ckpt_path = f'{self.args.exp_path}/pytorch_model.pt'
204
+ assert os.path.exists(ckpt_path), f"pytorch_model.pt not found in {self.args.exp_path}"
205
+ if self.args.train_architecture == 'lora':
206
+ self.args.pretrained_lora_path = pretrained_lora_path = ckpt_path
207
+ else:
208
+ resume_path = ckpt_path
209
+
210
+ self.step = 0
211
+
212
+ # Load models
213
+ model_manager = ModelManager(device="cuda", infer=True)
214
+
215
+ model_manager.load_models(
216
+ [
217
+ self.args.dit_path.split(","),
218
+ self.args.vae_path,
219
+ self.args.text_encoder_path
220
+ ],
221
+ torch_dtype=self.dtype,
222
+ device='cuda',
223
+ )
224
+
225
+ pipe = WanVideoPipeline.from_model_manager(model_manager,
226
+ torch_dtype=self.dtype,
227
+ device="cuda",
228
+ use_usp=False,
229
+ infer=True)
230
+
231
+ if self.args.train_architecture == "lora":
232
+ print(f'Use LoRA: lora rank: {self.args.lora_rank}, lora alpha: {self.args.lora_alpha}')
233
+ self.add_lora_to_model(
234
+ pipe.denoising_model(),
235
+ lora_rank=self.args.lora_rank,
236
+ lora_alpha=self.args.lora_alpha,
237
+ lora_target_modules=self.args.lora_target_modules,
238
+ init_lora_weights=self.args.init_lora_weights,
239
+ pretrained_lora_path=pretrained_lora_path,
240
+ )
241
+ print(next(pipe.denoising_model().parameters()).device)
242
+ else:
243
+ missing_keys, unexpected_keys = pipe.denoising_model().load_state_dict(load_state_dict(resume_path), strict=True)
244
+ print(f"load from {resume_path}, {len(missing_keys)} missing keys, {len(unexpected_keys)} unexpected keys")
245
+ pipe.requires_grad_(False)
246
+ pipe.eval()
247
+ # pipe.enable_vram_management(num_persistent_param_in_dit=args.num_persistent_param_in_dit)
248
+ return pipe
249
+
250
+ def add_lora_to_model(self, model, lora_rank=4, lora_alpha=4, lora_target_modules="q,k,v,o,ffn.0,ffn.2", init_lora_weights="kaiming", pretrained_lora_path=None, state_dict_converter=None):
251
+ # Add LoRA to the denoising model (DiT)
252
+
253
+ self.lora_alpha = lora_alpha
254
+ if init_lora_weights == "kaiming":
255
+ init_lora_weights = True
256
+
257
+ lora_config = LoraConfig(
258
+ r=lora_rank,
259
+ lora_alpha=lora_alpha,
260
+ init_lora_weights=init_lora_weights,
261
+ target_modules=lora_target_modules.split(","),
262
+ )
263
+ model = inject_adapter_in_model(lora_config, model)
264
+
265
+ # Load pretrained LoRA weights
266
+ if pretrained_lora_path is not None:
267
+ state_dict = load_state_dict(pretrained_lora_path, torch_dtype=self.dtype)
268
+ if state_dict_converter is not None:
269
+ state_dict = state_dict_converter(state_dict)
270
+ missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
271
+ all_keys = [i for i, _ in model.named_parameters()]
272
+ num_updated_keys = len(all_keys) - len(missing_keys)
273
+ num_unexpected_keys = len(unexpected_keys)
274
+
275
+ print(f"{num_updated_keys} parameters are loaded from {pretrained_lora_path}. {num_unexpected_keys} parameters are unexpected.")
276
+
277
+ def get_times(self, prompt,
278
+ image_path=None,
279
+ audio_path=None,
280
+ seq_len=101, # not used when audio_path is provided
281
+ height=720,
282
+ width=720,
283
+ overlap_frame=None,
284
+ num_steps=None,
285
+ negative_prompt=None,
286
+ guidance_scale=None,
287
+ audio_scale=None):
288
+
289
+ overlap_frame = overlap_frame if overlap_frame is not None else self.args.overlap_frame
290
+ num_steps = num_steps if num_steps is not None else self.args.num_steps
291
+ negative_prompt = negative_prompt if negative_prompt is not None else self.args.negative_prompt
292
+ guidance_scale = guidance_scale if guidance_scale is not None else self.args.guidance_scale
293
+ audio_scale = audio_scale if audio_scale is not None else self.args.audio_scale
294
+
295
+ if image_path is not None:
296
+ image = Image.open(image_path).convert("RGB")
297
+
298
+ image = self.transform(image).unsqueeze(0).to(dtype=self.dtype)
299
+
300
+ _, _, h, w = image.shape
301
+ select_size = match_size(getattr(self.args, f'image_sizes_{self.args.max_hw}'), h, w)
302
+ image = resize_pad(image, (h, w), select_size)
303
+ image = image * 2.0 - 1.0
304
+ image = image[:, :, None]
305
+
306
+ else:
307
+ image = None
308
+ select_size = [height, width]
309
+ num = self.args.max_tokens * 16 * 16 * 4
310
+ den = select_size[0] * select_size[1]
311
+ L0 = num // den
312
+ diff = (L0 - 1) % 4
313
+ L = L0 - diff
314
+ if L < 1:
315
+ L = 1
316
+ T = (L + 3) // 4
317
+
318
+
319
+ if self.args.random_prefix_frames:
320
+ fixed_frame = overlap_frame
321
+ assert fixed_frame % 4 == 1
322
+ else:
323
+ fixed_frame = 1
324
+ prefix_lat_frame = (3 + fixed_frame) // 4
325
+ first_fixed_frame = 1
326
+
327
+
328
+ audio, sr = librosa.load(audio_path, sr= self.args.sample_rate)
329
+
330
+ input_values = np.squeeze(
331
+ self.wav_feature_extractor(audio, sampling_rate=16000).input_values
332
+ )
333
+ input_values = torch.from_numpy(input_values).float().to(dtype=self.dtype)
334
+ audio_len = math.ceil(len(input_values) / self.args.sample_rate * self.args.fps)
335
+
336
+ if audio_len < L - first_fixed_frame:
337
+ audio_len = audio_len + ((L - first_fixed_frame) - audio_len % (L - first_fixed_frame))
338
+ elif (audio_len - (L - first_fixed_frame)) % (L - fixed_frame) != 0:
339
+ audio_len = audio_len + ((L - fixed_frame) - (audio_len - (L - first_fixed_frame)) % (L - fixed_frame))
340
+
341
+ seq_len = audio_len
342
+
343
+ times = (seq_len - L + first_fixed_frame) // (L-fixed_frame) + 1
344
+ if times * (L-fixed_frame) + fixed_frame < seq_len:
345
+ times += 1
346
+
347
+ return times
348
+
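+ # Worked example for the padding/chunking arithmetic above (illustrative, using the
+ # defaults shipped in args_config.yaml: 720x400 gives L = 141, first_fixed_frame = 1,
+ # fixed_frame = overlap_frame = 13, fps = 24): 10 s of audio gives audio_len = 240,
+ # padded to 268 so that (268 - 140) % 128 == 0, and then
+ # times = (268 - 141 + 1) // 128 + 1 = 2 generation chunks.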
349
+ @torch.no_grad()
350
+ def forward(self, prompt,
351
+ image_path=None,
352
+ audio_path=None,
353
+ seq_len=101, # not used when audio_path is provided
354
+ height=720,
355
+ width=720,
356
+ overlap_frame=None,
357
+ num_steps=None,
358
+ negative_prompt=None,
359
+ guidance_scale=None,
360
+ audio_scale=None):
361
+ overlap_frame = overlap_frame if overlap_frame is not None else self.args.overlap_frame
362
+ num_steps = num_steps if num_steps is not None else self.args.num_steps
363
+ negative_prompt = negative_prompt if negative_prompt is not None else self.args.negative_prompt
364
+ guidance_scale = guidance_scale if guidance_scale is not None else self.args.guidance_scale
365
+ audio_scale = audio_scale if audio_scale is not None else self.args.audio_scale
366
+
367
+ if image_path is not None:
368
+ image = Image.open(image_path).convert("RGB")
369
+
370
+ image = self.transform(image).unsqueeze(0).to(self.device, dtype=self.dtype)
371
+
372
+ _, _, h, w = image.shape
373
+ select_size = match_size(getattr(self.args, f'image_sizes_{self.args.max_hw}'), h, w)
374
+ image = resize_pad(image, (h, w), select_size)
375
+ image = image * 2.0 - 1.0
376
+ image = image[:, :, None]
377
+
378
+ else:
379
+ image = None
380
+ select_size = [height, width]
381
+ # L = int(self.args.max_tokens * 16 * 16 * 4 / select_size[0] / select_size[1])
382
+ # L = L // 4 * 4 + 1 if L % 4 != 0 else L - 3 # video frames
383
+ # T = (L + 3) // 4 # latent frames
384
+
385
+ # step 1: numerator and denominator as ints
386
+ num = args.max_tokens * 16 * 16 * 4
387
+ den = select_size[0] * select_size[1]
388
+
389
+ # step 2: integer division
390
+ L0 = num // den # exact floor division, no float in sight
391
+
392
+ # step 3: make it ≡ 1 mod 4
393
+ # if L0 % 4 == 1, keep L0;
394
+ # otherwise subtract the difference so that (L0 - diff) % 4 == 1,
395
+ # but ensure the result stays positive.
396
+ diff = (L0 - 1) % 4
397
+ L = L0 - diff
398
+ if L < 1:
399
+ L = 1 # or whatever your minimal frame count is
400
+
401
+ # step 4: latent frames
402
+ T = (L + 3) // 4
403
+
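+ # Worked example (illustrative): with max_tokens = 40000 and select_size = [720, 400]
+ # (the defaults in args_config.yaml), num = 40,960,000 and den = 288,000, so L0 = 142,
+ # diff = (142 - 1) % 4 = 1, L = 141 (141 % 4 == 1) and T = (141 + 3) // 4 = 36 latent frames.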
404
+
405
+ if self.args.i2v:
406
+ if self.args.random_prefix_frames:
407
+ fixed_frame = overlap_frame
408
+ assert fixed_frame % 4 == 1
409
+ else:
410
+ fixed_frame = 1
411
+ prefix_lat_frame = (3 + fixed_frame) // 4
412
+ first_fixed_frame = 1
413
+ else:
414
+ fixed_frame = 0
415
+ prefix_lat_frame = 0
416
+ first_fixed_frame = 0
417
+
418
+
419
+ if audio_path is not None and self.args.use_audio:
420
+ audio, sr = librosa.load(audio_path, sr=self.args.sample_rate)
421
+ input_values = np.squeeze(
422
+ self.wav_feature_extractor(audio, sampling_rate=16000).input_values
423
+ )
424
+ input_values = torch.from_numpy(input_values).float().to(device=self.device, dtype=self.dtype)
425
+ ori_audio_len = audio_len = math.ceil(len(input_values) / self.args.sample_rate * self.args.fps)
426
+ input_values = input_values.unsqueeze(0)
427
+ # padding audio
428
+ if audio_len < L - first_fixed_frame:
429
+ audio_len = audio_len + ((L - first_fixed_frame) - audio_len % (L - first_fixed_frame))
430
+ elif (audio_len - (L - first_fixed_frame)) % (L - fixed_frame) != 0:
431
+ audio_len = audio_len + ((L - fixed_frame) - (audio_len - (L - first_fixed_frame)) % (L - fixed_frame))
432
+ input_values = F.pad(input_values, (0, audio_len * int(self.args.sample_rate / self.args.fps) - input_values.shape[1]), mode='constant', value=0)
433
+ with torch.no_grad():
434
+ hidden_states = self.audio_encoder(input_values, seq_len=audio_len, output_hidden_states=True)
435
+ audio_embeddings = hidden_states.last_hidden_state
436
+ for mid_hidden_states in hidden_states.hidden_states:
437
+ audio_embeddings = torch.cat((audio_embeddings, mid_hidden_states), -1)
438
+ seq_len = audio_len
439
+ audio_embeddings = audio_embeddings.squeeze(0)
440
+ audio_prefix = torch.zeros_like(audio_embeddings[:first_fixed_frame])
441
+ else:
442
+ audio_embeddings = None
443
+
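+ # At this point audio_embeddings holds one feature row per video frame: the wav2vec2
+ # last_hidden_state concatenated with every returned intermediate hidden state along
+ # the feature dimension (roughly 768 * (1 + num_hidden_states) features per frame for
+ # wav2vec2-base-960h), and audio_prefix is a zero placeholder for the first chunk's
+ # fixed prefix frame.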
444
+ # loop
445
+ times = (seq_len - L + first_fixed_frame) // (L-fixed_frame) + 1
446
+ if times * (L-fixed_frame) + fixed_frame < seq_len:
447
+ times += 1
448
+ video = []
449
+ image_emb = {}
450
+ img_lat = None
451
+ if self.args.i2v:
452
+ self.pipe.load_models_to_device(['vae'])
453
+ img_lat = self.pipe.encode_video(image.to(dtype=self.dtype)).to(self.device, dtype=self.dtype)
454
+
455
+ msk = torch.zeros_like(img_lat.repeat(1, 1, T, 1, 1)[:,:1], dtype=self.dtype)
456
+ image_cat = img_lat.repeat(1, 1, T, 1, 1)
457
+ msk[:, :, 1:] = 1
458
+ image_emb["y"] = torch.cat([image_cat, msk], dim=1)
459
+
460
+ total_iterations = times * num_steps
461
+
462
+ with tqdm(total=total_iterations) as pbar:
463
+ for t in range(times):
464
+ print(f"[{t+1}/{times}]")
465
+ audio_emb = {}
466
+ if t == 0:
467
+ overlap = first_fixed_frame
468
+ else:
469
+ overlap = fixed_frame
470
+ image_emb["y"][:, -1:, :prefix_lat_frame] = 0 # 第一次推理是mask只有1,往后都是mask overlap
471
+ prefix_overlap = (3 + overlap) // 4
472
+ if audio_embeddings is not None:
473
+ if t == 0:
474
+ audio_tensor = audio_embeddings[
475
+ :min(L - overlap, audio_embeddings.shape[0])
476
+ ]
477
+ else:
478
+ audio_start = L - first_fixed_frame + (t - 1) * (L - overlap)
479
+ audio_tensor = audio_embeddings[
480
+ audio_start: min(audio_start + L - overlap, audio_embeddings.shape[0])
481
+ ]
482
+
483
+ audio_tensor = torch.cat([audio_prefix, audio_tensor], dim=0)
484
+ audio_prefix = audio_tensor[-fixed_frame:]
485
+ audio_tensor = audio_tensor.unsqueeze(0).to(device=self.device, dtype=self.dtype)
486
+ audio_emb["audio_emb"] = audio_tensor
487
+ else:
488
+ audio_prefix = None
489
+ if image is not None and img_lat is None:
490
+ self.pipe.load_models_to_device(['vae'])
491
+ img_lat = self.pipe.encode_video(image.to(dtype=self.dtype)).to(self.device, dtype=self.dtype)
492
+ assert img_lat.shape[2] == prefix_overlap
493
+ img_lat = torch.cat([img_lat, torch.zeros_like(img_lat[:, :, :1].repeat(1, 1, T - prefix_overlap, 1, 1), dtype=self.dtype)], dim=2)
494
+ frames, _, latents = self.pipe.log_video(img_lat, prompt, prefix_overlap, image_emb, audio_emb,
495
+ negative_prompt, num_inference_steps=num_steps,
496
+ cfg_scale=guidance_scale, audio_cfg_scale=audio_scale if audio_scale is not None else guidance_scale,
497
+ return_latent=True,
498
+ tea_cache_l1_thresh=self.args.tea_cache_l1_thresh,tea_cache_model_id="Wan2.1-T2V-14B", progress_bar_cmd=pbar)
499
+
500
+ torch.cuda.empty_cache()
501
+ img_lat = None
502
+ image = (frames[:, -fixed_frame:].clip(0, 1) * 2.0 - 1.0).permute(0, 2, 1, 3, 4).contiguous()
503
+
504
+ if t == 0:
505
+ video.append(frames)
506
+ else:
507
+ video.append(frames[:, overlap:])
508
+ video = torch.cat(video, dim=1)
509
+ video = video[:, :ori_audio_len + 1]
510
+
511
+ return video
512
+
513
+
514
+ snapshot_download(repo_id="Wan-AI/Wan2.1-T2V-1.3B", local_dir="./pretrained_models/Wan2.1-T2V-1.3B")
515
+ snapshot_download(repo_id="facebook/wav2vec2-base-960h", local_dir="./pretrained_models/wav2vec2-base-960h")
516
+ snapshot_download(repo_id="OmniAvatar/OmniAvatar-1.3B", local_dir="./pretrained_models/OmniAvatar-1.3B")
517
+
518
+ import tempfile
519
+
520
+
521
+ set_seed(args.seed)
522
+ seq_len = args.seq_len
523
+ inferpipe = WanInferencePipeline(args)
524
+
525
+
526
+ ADAPTIVE_PROMPT_TEMPLATES = [
527
+ "A claymation video of a person speaking and moving their head accordingly but without moving their hands.",
528
+ "A claymation video of a person speaking and sometimes looking directly to the camera and moving their eyes and pupils and head accordingly and turning and looking at the camera and looking away from the camera but with subtle hands movement that complements their speech.",
529
+ "A claymation video of a person speaking and sometimes looking directly to the camera and moving their eyes and pupils and head accordingly and turning and looking at the camera and looking away from the camera based on their movements with dynamic and rhythmic and subtle hand gestures that complement their speech and don't disrupt things if they are holding something with their hands. Their hands are clearly visible, independent, and unobstructed. Their facial expressions are expressive and full of emotion, enhancing the delivery. The camera remains steady, capturing sharp, clear movements and a focused, engaging presence."
530
+ ]
531
+
532
+ def slider_value_change(image_path, audio_path, text, num_steps, session_state):
533
+ return update_generate_button(image_path, audio_path, text, num_steps, session_state), text
534
+
535
+
536
+ def update_generate_button(image_path, audio_path, text, num_steps, session_state):
537
+
538
+ if image_path is None or audio_path is None:
539
+ return gr.update(value="⌚ Zero GPU Required: --")
540
+
541
+ duration_s = get_duration(image_path, audio_path, text, num_steps, session_state, None)
542
+ duration_m = duration_s / 60
543
+
544
+ return gr.update(value=f"⌚ Zero GPU Required: ~{duration_s}.0s ({duration_m:.1f} mins)")
545
+
546
+ def get_duration(image_path, audio_path, text, num_steps, session_id, progress):
547
+
548
+ if image_path is None:
549
+ gr.Info("Step1: Please Provide an Image or Choose from Image Samples")
550
+ print("Step1: Please Provide an Image or Choose from Image Samples")
551
+
552
+ return 0
553
+
554
+ if audio_path is None:
555
+ gr.Info("Step2: Please Provide an Audio or Choose from Audio Samples")
556
+ print("Step2: Please Provide an Audio or Choose from Audio Samples")
557
+
558
+ return 0
559
+
560
+
561
+ audio_chunks = inferpipe.get_times(
562
+ prompt=text,
563
+ image_path=image_path,
564
+ audio_path=audio_path,
565
+ seq_len=args.seq_len,
566
+ num_steps=num_steps
567
+ )
568
+
569
+ if session_id is None:
570
+ session_id = uuid.uuid4().hex
571
+
572
+ output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
573
+
574
+ dirpath = os.path.dirname(image_path)
575
+ basename = os.path.basename(image_path)
576
+ name, ext = os.path.splitext(basename)
577
+
578
+ new_basename = f"clay_{name}{ext}"
579
+ clay_image_path = os.path.join(dirpath, new_basename)
580
+
581
+ if os.path.exists(clay_image_path):
582
+ claymation = 0
583
+ else:
584
+ claymation = flux_inference * 2
585
+
586
+ warmup_s = 15
587
+ last_step_s = 20
588
+ duration_s = (4 * (num_steps - 1) + last_step_s)
589
+
590
+ if audio_chunks > 1:
591
+ duration_s = (duration_s * audio_chunks)
592
+
593
+ duration_s = duration_s + warmup_s + claymation
594
+
595
+ print(f'for {audio_chunks} times and {num_steps} steps, {session_id} is preparing for {duration_s}')
596
+
597
+ return int(duration_s)
598
+
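+ # Worked example (illustrative): a new reference image (claymation = 2 * flux_inference = 20 s),
+ # 2 audio chunks and num_steps = 8 give duration_s = (4 * 7 + 20) * 2 + 15 + 20 = 131 s,
+ # which is the ZeroGPU budget requested by the @spaces.GPU(duration=get_duration) call below.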
599
+ def preprocess_img(input_image_path, raw_image_path, session_id = None):
600
+
601
+ if session_id is None:
602
+ session_id = uuid.uuid4().hex
603
+
604
+ if input_image_path is None:
605
+ return None, None
606
+
607
+ if raw_image_path == '':
608
+ raw_image_path = input_image_path
609
+
610
+ image = Image.open(raw_image_path).convert("RGB")
611
+
612
+ img_id = uuid.uuid4().hex
613
+
614
+ image = inferpipe.transform(image).unsqueeze(0).to(dtype=inferpipe.dtype)
615
+
616
+ _, _, h, w = image.shape
617
+ select_size = match_size(getattr(args, f'image_sizes_{args.max_hw}'), h, w)
618
+ image = resize_pad(image, (h, w), select_size)
619
+ image = image * 2.0 - 1.0
620
+ image = image[:, :, None]
621
+
622
+ output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
623
+
624
+ img_dir = output_dir + '/image'
625
+ os.makedirs(img_dir, exist_ok=True)
626
+ input_img_path = os.path.join(img_dir, f"img_{img_id}.jpg")
627
+
628
+ image = tensor_to_pil(image)
629
+ image.save(input_img_path)
630
+
631
+ return input_img_path, raw_image_path
632
+
633
+ def infer_example(image_path, audio_path, num_steps, raw_image_path, session_id = None, progress=gr.Progress(track_tqdm=True),):
634
+
635
+ current_image_size = args.image_sizes_720
636
+ args.image_sizes_720 = [[720, 400]]
637
+ text = ADAPTIVE_PROMPT_TEMPLATES[2]
638
+
639
+ result = infer(image_path, audio_path, text, num_steps, session_id, progress)
640
+
641
+ args.image_sizes_720 = current_image_size
642
+
643
+ return result
644
+
645
+ @spaces.GPU(duration=get_duration)
646
+ def infer(image_path, audio_path, text, num_steps, session_id = None, progress=gr.Progress(track_tqdm=True),):
647
+
648
+ if image_path is None:
649
+
650
+ return None
651
+
652
+ if audio_path is None:
653
+
654
+ return None
655
+
656
+ if session_id is None:
657
+ session_id = uuid.uuid4().hex
658
+
659
+ output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
660
+
661
+ # Decompose the path
662
+ dirpath = os.path.dirname(image_path)
663
+ basename = os.path.basename(image_path) # e.g. "photo.png"
664
+ name, ext = os.path.splitext(basename) # name="photo", ext=".png"
665
+
666
+ # Rebuild with "clay_" prefix
667
+ new_basename = f"clay_{name}{ext}" # "clay_photo.png"
668
+ clay_image_path = os.path.join(dirpath, new_basename)
669
+
670
+ # If the output file already exists, skip inference
671
+ if os.path.exists(clay_image_path):
672
+
673
+ print("using existing image")
674
+
675
+ else:
676
+
677
+ flux_prompt = "in style of omniavatar-claymation"
678
+ raw_image = load_image(image_path)
679
+ w, h = raw_image.size
680
+
681
+ clay_image = flux_pipe(image=raw_image, width=w, height=h, prompt=flux_prompt, negative_prompt=args.negative_prompt, num_inference_steps=flux_inference, true_cfg_scale=2.5).images[0]
682
+ clay_image.save(clay_image_path)
683
+
684
+
685
+ audio_dir = output_dir + '/audio'
686
+ os.makedirs(audio_dir, exist_ok=True)
687
+ if args.silence_duration_s > 0:
688
+ input_audio_path = os.path.join(audio_dir, f"audio_input.wav")
689
+ else:
690
+ input_audio_path = audio_path
691
+ prompt_dir = output_dir + '/prompt'
692
+ os.makedirs(prompt_dir, exist_ok=True)
693
+
694
+ if args.silence_duration_s > 0:
695
+ add_silence_to_audio_ffmpeg(audio_path, input_audio_path, args.silence_duration_s)
696
+
697
+ tmp2_audio_path = os.path.join(audio_dir, f"audio_out.wav")
698
+ prompt_path = os.path.join(prompt_dir, f"prompt.txt")
699
+
700
+ video = inferpipe(
701
+ prompt=text,
702
+ image_path=clay_image_path,
703
+ audio_path=input_audio_path,
704
+ seq_len=args.seq_len,
705
+ num_steps=num_steps
706
+ )
707
+
708
+ torch.cuda.empty_cache()
709
+
710
+ add_silence_to_audio_ffmpeg(audio_path, tmp2_audio_path, 1.0 / args.fps + args.silence_duration_s)
711
+ video_paths = save_video_as_grid_and_mp4(video,
712
+ output_dir,
713
+ args.fps,
714
+ prompt=text,
715
+ prompt_path = prompt_path,
716
+ audio_path=tmp2_audio_path if args.use_audio else None,
717
+ prefix=f'result')
718
+
719
+ return video_paths[0]
720
+
721
+ def apply_image(request):
722
+ print('image applied')
723
+ return request, None
724
+
725
+ def apply_audio(request):
726
+ print('audio applied')
727
+ return request
728
+
729
+ def cleanup(request: gr.Request):
730
+
731
+ sid = request.session_hash
732
+ if sid:
733
+ d1 = os.path.join(os.environ["PROCESSED_RESULTS"], sid)
734
+ shutil.rmtree(d1, ignore_errors=True)
735
+
736
+ def start_session(request: gr.Request):
737
+
738
+ return request.session_hash
739
+
740
+ def orientation_changed(session_id, evt: gr.EventData):
741
+
742
+ detail = getattr(evt, "data", None) or getattr(evt, "_data", {}) or {}
743
+
744
+ if detail['value'] == "9:16":
745
+ args.image_sizes_720 = [[720, 400]]
746
+ elif detail['value'] == "1:1":
747
+ args.image_sizes_720 = [[720, 720]]
748
+ elif detail['value'] == "16:9":
749
+ args.image_sizes_720 = [[400, 720]]
750
+
751
+ print(f'{session_id} has {args.image_sizes_720} orientation')
752
+
753
+ def clear_raw_image():
754
+ return ''
755
+
756
+ def preprocess_audio_first_nseconds_librosa(audio_path, limit_in_seconds, session_id=None):
757
+
758
+ if not audio_path:
759
+ return None
760
+
761
+ # Robust duration check (librosa changed arg name across versions)
762
+ try:
763
+ dur = librosa.get_duration(path=audio_path)
764
+ except TypeError:
765
+ dur = librosa.get_duration(filename=audio_path)
766
+
767
+ # Small tolerance to avoid re-encoding 4.9999s files
768
+ if dur < 5.0 - 1e-3:
769
+ return audio_path
770
+
771
+ if session_id is None:
772
+ session_id = uuid.uuid4().hex
773
+
774
+ # Where we'll store per-session processed audio
775
+ output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
776
+ audio_dir = os.path.join(output_dir, "audio")
777
+ os.makedirs(audio_dir, exist_ok=True)
778
+
779
+ trimmed_path = os.path.join(audio_dir, f"audio_input_{limit_in_seconds}s.wav")
780
+ sr = getattr(args, "sample_rate", 16000)
781
+
782
+ y, _ = librosa.load(audio_path, sr=sr, mono=True, duration=float(limit_in_seconds))
783
+
784
+ # Save as 16-bit PCM mono WAV
785
+ waveform = torch.from_numpy(y).unsqueeze(0) # [1, num_samples]
786
+ torchaudio.save(
787
+ trimmed_path,
788
+ waveform,
789
+ sr,
790
+ encoding="PCM_S",
791
+ bits_per_sample=16,
792
+ format="wav",
793
+ )
794
+
795
+ return trimmed_path
796
+
797
+
798
+ css = """
799
+ #col-container {
800
+ margin: 0 auto;
801
+ max-width: 1560px;
802
+ }
803
+
804
+ /* editable vs locked, reusing theme variables that adapt to dark/light */
805
+ .stateful textarea:not(:disabled):not([readonly]) {
806
+ color: var(--color-text) !important; /* accent in both modes */
807
+ }
808
+ .stateful textarea:disabled,
809
+ .stateful textarea[readonly]{
810
+ color: var(--body-text-color-subdued) !important; /* subdued in both modes */
811
+ }
812
+ """
813
+
814
+ with gr.Blocks(css=css) as demo:
815
+
816
+ session_state = gr.State()
817
+ demo.load(start_session, outputs=[session_state])
818
+
819
+
820
+ with gr.Column(elem_id="col-container"):
821
+ gr.HTML(
822
+ """
823
+ <div style="text-align: center;">
824
+ <div style="display: flex; justify-content: center;">
825
+ <img src="https://huggingface.co/spaces/alexnasa/OmniAvatar-Clay-Fast/resolve/main/assets/logo-omniavatar.png" alt="Logo">
826
+ </div>
827
+ </div>
828
+ <div style="text-align: center;">
829
+ <p style="font-size:16px; display: inline; margin: 0;">
830
+ <strong>OmniAvatar</strong> – Efficient Audio-Driven Avatar Video Generation with Adaptive Body Animation
831
+ </p>
832
+ <a href="https://huggingface.co/OmniAvatar/OmniAvatar-1.3B" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
833
+ [model]
834
+ </a>
835
+ </div>
836
+
837
+ <div style="text-align: center;">
838
+ <strong>HF Space by:</strong>
839
+ <a href="https://twitter.com/alexandernasa/" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
840
+ <img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow Me" alt="GitHub Repo">
841
+ </a>
842
+ </div>
843
+
844
+ <div style="text-align: center;">
845
+ <p style="font-size:16px; display: inline; margin: 0;">
846
+ If you are looking for realism, please try the other HF Space:
847
+ </p>
848
+ <a href="https://huggingface.co/spaces/alexnasa/OmniAvatar" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
849
+ <img src="https://img.shields.io/badge/🤗-HF Demo-yellow.svg">
850
+ </a>
851
+ </div>
852
+
853
+ """
854
+ )
855
+
856
+ with gr.Row():
857
+
858
+ with gr.Column(scale=1):
859
+
860
+ image_input = extendedimage(label="Reference Image", type="filepath", height=512)
861
+ audio_input = gr.Audio(label="Input Audio", type="filepath")
862
+ gr.Markdown("*Change the duration limit in Advanced Settings*")
863
+
864
+
865
+ with gr.Column(scale=1):
866
+
867
+ output_video = gr.Video(label="Avatar", height=512)
868
+ num_steps = gr.Slider(8, 50, value=8, step=1, label="Steps")
869
+ time_required = gr.Text(value="⌚ Zero GPU Required: --", show_label=False)
870
+ infer_btn = gr.Button("🗿 Clay Me", variant="primary")
871
+ with gr.Accordion("Advanced Settings", open=False):
872
+ raw_img_text = gr.Text(show_label=False, label="", value='', visible=False)
873
+ limit_in_seconds = gr.Slider(5, 180, value=5, step=5, label="Duration")
874
+ text_input = gr.Textbox(label="Prompt", lines=6, value= ADAPTIVE_PROMPT_TEMPLATES[2])
875
+
876
+ with gr.Column(scale=1):
877
+
878
+ cached_examples = gr.Examples(
879
+ examples=[
880
+
881
+ [
882
+ "examples/images/female-003.png",
883
+ "examples/audios/fox.wav",
884
+ 8,
885
+ ''
886
+ ],
887
+
888
+
889
+ [
890
+ "examples/images/male-001.png",
891
+ "examples/audios/ocean.wav",
892
+ 8,
893
+ ''
894
+ ],
895
+
896
+ [
897
+ "examples/images/female-002.png",
898
+ "examples/audios/lion.wav",
899
+ 16,
900
+ ''
901
+ ],
902
+
903
+
904
+ [
905
+ "examples/images/female-009.png",
906
+ "examples/audios/script.wav",
907
+ 8,
908
+ ''
909
+ ],
910
+
911
+ ],
912
+ label="Cached Examples",
913
+ inputs=[image_input, audio_input, num_steps, raw_img_text],
914
+ outputs=[output_video],
915
+ fn=infer_example,
916
+ cache_examples=True
917
+ )
918
+
919
+
920
+ infer_btn.click(
921
+ fn=infer,
922
+ inputs=[image_input, audio_input, text_input, num_steps, session_state],
923
+ outputs=[output_video]
924
+ )
925
+
926
+ image_input.orientation(fn=orientation_changed, inputs=[session_state]).then(fn=preprocess_img, inputs=[image_input, raw_img_text, session_state], outputs=[image_input, raw_img_text])
927
+ image_input.clear(fn=clear_raw_image, outputs=[raw_img_text])
928
+ image_input.upload(fn=preprocess_img, inputs=[image_input, raw_img_text, session_state], outputs=[image_input, raw_img_text])
929
+ image_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
930
+ audio_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
931
+ num_steps.change(fn=slider_value_change, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required, text_input])
932
+ audio_input.upload(fn=apply_audio, inputs=[audio_input], outputs=[audio_input]
933
+ ).then(
934
+ fn=preprocess_audio_first_nseconds_librosa,
935
+ inputs=[audio_input, limit_in_seconds, session_state],
936
+ outputs=[audio_input],
937
+ )
938
+
939
+ if __name__ == "__main__":
940
+ demo.unload(cleanup)
941
+ demo.queue()
942
+ demo.launch(ssr_mode=False)
args_config.yaml ADDED
@@ -0,0 +1,71 @@
1
+ config: configs/inference.yaml
2
+
3
+ input_file: examples/infer_samples.txt
4
+ debug: null
5
+ infer: false
6
+ hparams: ''
7
+ dtype: bf16
8
+
9
+ exp_path: pretrained_models/OmniAvatar-1.3B
10
+ text_encoder_path: pretrained_models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
11
+ image_encoder_path: None
12
+ dit_path: pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
13
+ vae_path: pretrained_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
14
+
15
+ wav2vec_path: pretrained_models/wav2vec2-base-960h
16
+ num_persistent_param_in_dit:
17
+ reload_cfg: true
18
+ sp_size: 1
19
+ seed: 42
20
+ image_sizes_720:
21
+ # - - 400
22
+ # - 720
23
+ # - - 720 commented out due to the duration needed on HF
24
+ # - 720
25
+ - - 720
26
+ - 400
27
+ image_sizes_1280:
28
+ - - 720
29
+ - 720
30
+ - - 528
31
+ - 960
32
+ - - 960
33
+ - 528
34
+ - - 720
35
+ - 1280
36
+ - - 1280
37
+ - 720
38
+ max_hw: 720
39
+ max_tokens: 40000
40
+ seq_len: 200
41
+ overlap_frame: 13
42
+ guidance_scale: 4.5
43
+ audio_scale: null
44
+ num_steps: 8
45
+ fps: 24
46
+ sample_rate: 16000
47
+ negative_prompt: Vivid color tones, background/camera moving quickly, screen switching,
48
+ subtitles and special effects, mutation, overexposed, static, blurred details, subtitles,
49
+ style, work, painting, image, still, overall grayish, worst quality, low quality,
50
+ JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly
51
+ drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image,
52
+ chaotic background, three legs, crowded background with many people, walking backward
53
+ silence_duration_s: 0.0
54
+ use_fsdp: false
55
+ tea_cache_l1_thresh: 0
56
+ rank: 0
57
+ world_size: 1
58
+ local_rank: 0
59
+ device: cuda
60
+ num_nodes: 1
61
+ i2v: true
62
+ use_audio: true
63
+ random_prefix_frames: true
64
+ model_config:
65
+ in_dim: 33
66
+ audio_hidden_size: 32
67
+ train_architecture: lora
68
+ lora_target_modules: q,k,v,o,ffn.0,ffn.2
69
+ init_lora_weights: kaiming
70
+ lora_rank: 128
71
+ lora_alpha: 64.0
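A minimal sketch of how this file is consumed (it mirrors the OmegaConf/Namespace loading at the top of app.py shown above; set_global_args is called before the rest of the OmniAvatar modules are imported):

from argparse import Namespace
from omegaconf import OmegaConf
from OmniAvatar.utils.args_config import set_global_args

cfg = OmegaConf.load("args_config.yaml")
args = Namespace(**OmegaConf.to_container(cfg, resolve=True))
set_global_args(args)
print(args.max_tokens, args.image_sizes_720)   # 40000 [[720, 400]]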
assets/logo-omniavatar.png ADDED
assets/material/pipeline.png ADDED
assets/material/teaser.png ADDED
configs/inference.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # Pretrained model paths
2
+ dtype: "bf16"
3
+ text_encoder_path: pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth
4
+ image_encoder_path: None
5
+ dit_path: pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
6
+ vae_path: pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth
7
+ wav2vec_path: pretrained_models/wav2vec2-base-960h
8
+ exp_path: pretrained_models/OmniAvatar-14B
9
+ num_persistent_param_in_dit: # You can set `num_persistent_param_in_dit` to a small number to reduce VRAM required.
10
+
11
+ reload_cfg: True
12
+ sp_size: 1
13
+
14
+ # Data parameters
15
+ seed: 42
16
+ image_sizes_720: [[400, 720],
17
+ [720, 720],
18
+ [720, 400]]
19
+ image_sizes_1280: [
20
+ [720, 720],
21
+ [528, 960],
22
+ [960, 528],
23
+ [720, 1280],
24
+ [1280, 720]]
25
+ max_hw: 720 # 720: 480p; 1280: 720p
26
+ max_tokens: 30000
27
+ seq_len: 200
28
+ overlap_frame: 13 # must be 1 + 4*n
29
+ guidance_scale: 4.5
30
+ audio_scale:
31
+ num_steps: 16
32
+ fps: 25
33
+ sample_rate: 16000
34
+ negative_prompt: "Vivid color tones, background/camera moving quickly, screen switching, subtitles and special effects, mutation, overexposed, static, blurred details, subtitles, style, work, painting, image, still, overall grayish, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image, chaotic background, three legs, crowded background with many people, walking backward"
35
+ silence_duration_s: 0.3
36
+ use_fsdp: False
37
+ tea_cache_l1_thresh: 0 # e.g. 0.14; the larger this value, the faster the inference but the worse the visual quality. TODO: verify this value
configs/inference_1.3B.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # Pretrained model paths
2
+ dtype: "bf16"
3
+ text_encoder_path: pretrained_models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
4
+ image_encoder_path: None
5
+ dit_path: pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
6
+ vae_path: pretrained_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
7
+ wav2vec_path: pretrained_models/wav2vec2-base-960h
8
+ exp_path: pretrained_models/OmniAvatar-1.3B
9
+ num_persistent_param_in_dit: # You can set `num_persistent_param_in_dit` to a small number to reduce VRAM required.
10
+
11
+ reload_cfg: True
12
+ sp_size: 1
13
+
14
+ # Data parameters
15
+ seed: 42
16
+ image_sizes_720: [[400, 720],
17
+ [720, 720],
18
+ [720, 400]]
19
+ image_sizes_1280: [
20
+ [720, 720],
21
+ [528, 960],
22
+ [960, 528],
23
+ [720, 1280],
24
+ [1280, 720]]
25
+ max_hw: 720 # 720: 480p; 1280: 720p
26
+ max_tokens: 30000
27
+ seq_len: 200
28
+ overlap_frame: 13 # must be 1 + 4*n
29
+ guidance_scale: 4.5
30
+ audio_scale:
31
+ num_steps: 10
32
+ fps: 25
33
+ sample_rate: 16000
34
+ negative_prompt: "Vivid color tones, background/camera moving quickly, screen switching, subtitles and special effects, mutation, overexposed, static, blurred details, subtitles, style, work, painting, image, still, overall grayish, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image, chaotic background, three legs, crowded background with many people, walking backward"
35
+ silence_duration_s: 0.3
36
+ use_fsdp: False
37
+ tea_cache_l1_thresh: 0 # e.g. 0.14; the larger this value, the faster the inference but the worse the visual quality. TODO: verify this value
examples/audios/fox.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4de974b2b0b46ae66a545c04ed98d54e18dc9d67dd8cd8d50aad91dfa978624e
3
+ size 2060268
examples/audios/lion.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76d6b7292da45406ee5b6c7e10dbedcbbb6647a5b0872a3f506419c014696e72
3
+ size 1633964
examples/audios/ocean.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cdb50fd2bc117cbe8e4b37bb3ac7d257511ecba90cb26641a75fe569390c41f
3
+ size 1749164
examples/audios/script.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:deb654269051d935d85b6e573805a88466e2f7f791f44b77da16467c5207eeec
3
+ size 259244
examples/images/female-002.png ADDED
examples/images/female-003.png ADDED

Git LFS Details

  • SHA256: 1626385413c07a8ff4931c897c78f3861addc99fb66e12f4150ddfe39d0efca6
  • Pointer size: 132 Bytes
  • Size of remote file: 2.25 MB
examples/images/female-009.png ADDED
examples/images/male-001.png ADDED

Git LFS Details

  • SHA256: f8b88789fe691d92d843327cb22d61ca6628d147bfef7e3e8a020de876db017b
  • Pointer size: 132 Bytes
  • Size of remote file: 2.43 MB
requirements.txt ADDED
@@ -0,0 +1,18 @@
1
+ pytest
2
+ diffusers
3
+ torchao
4
+ tqdm
5
+ librosa==0.10.2.post1
6
+ peft>=0.17.0
7
+ transformers==4.52.3
8
+ scipy==1.14.0
9
+ numpy==1.26.4
10
+ ftfy
11
+ einops
12
+ omegaconf
13
+ torchvision
14
+ ninja
15
+ imageio[ffmpeg]
16
+ sentencepiece
17
+ torchaudio
18
+ gradio_extendedimage @ https://github.com/OutofAi/gradio-extendedimage/releases/download/0.0.2/gradio_extendedimage-0.0.2-py3-none-any.whl
scripts/inference.py ADDED
@@ -0,0 +1,383 @@
1
+ import subprocess
2
+ import os, sys
3
+ from glob import glob
4
+ from datetime import datetime
5
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
6
+ import math
7
+ import random
8
+ import librosa
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ from tqdm import tqdm
13
+ from functools import partial
14
+ from omegaconf import OmegaConf
15
+ from argparse import Namespace
16
+
17
+ # # load the one true config you dumped
18
+ # _args_cfg = OmegaConf.load("demo_out/config/args_config.yaml")
19
+ # args = Namespace(**OmegaConf.to_container(_args_cfg, resolve=True))
20
+
21
+ # from OmniAvatar.utils.args_config import set_global_args
22
+
23
+ # set_global_args(args)
24
+
25
+ from OmniAvatar.utils.args_config import parse_args
26
+ args = parse_args()
27
+
28
+ from OmniAvatar.utils.io_utils import load_state_dict
29
+ from peft import LoraConfig, inject_adapter_in_model
30
+ from OmniAvatar.models.model_manager import ModelManager
31
+ from OmniAvatar.wan_video import WanVideoPipeline
32
+ from OmniAvatar.utils.io_utils import save_video_as_grid_and_mp4
33
+ import torchvision.transforms as TT
34
+ from transformers import Wav2Vec2FeatureExtractor
35
+ import torchvision.transforms as transforms
36
+ import torch.nn.functional as F
37
+ from OmniAvatar.utils.audio_preprocess import add_silence_to_audio_ffmpeg
38
+ from huggingface_hub import hf_hub_download
39
+
40
+ def set_seed(seed: int = 42):
41
+ random.seed(seed)
42
+ np.random.seed(seed)
43
+ torch.manual_seed(seed)
44
+ torch.cuda.manual_seed(seed) # seed the current GPU
45
+ torch.cuda.manual_seed_all(seed) # seed all GPUs
46
+
47
+ def read_from_file(p):
48
+ with open(p, "r") as fin:
49
+ for l in fin:
50
+ yield l.strip()
51
+
52
+ def match_size(image_size, h, w):
53
+ ratio_ = 9999
54
+ size_ = 9999
55
+ select_size = None
56
+ for image_s in image_size:
57
+ ratio_tmp = abs(image_s[0] / image_s[1] - h / w)
58
+ size_tmp = abs(max(image_s) - max(w, h))
59
+ if ratio_tmp < ratio_:
60
+ ratio_ = ratio_tmp
61
+ size_ = size_tmp
62
+ select_size = image_s
63
+ if ratio_ == ratio_tmp:
64
+ if size_ == size_tmp:
65
+ select_size = image_s
66
+ return select_size
67
+
68
+ def resize_pad(image, ori_size, tgt_size):
69
+ h, w = ori_size
70
+ scale_ratio = max(tgt_size[0] / h, tgt_size[1] / w)
71
+ scale_h = int(h * scale_ratio)
72
+ scale_w = int(w * scale_ratio)
73
+
74
+ image = transforms.Resize(size=[scale_h, scale_w])(image)
75
+
76
+ padding_h = tgt_size[0] - scale_h
77
+ padding_w = tgt_size[1] - scale_w
78
+ pad_top = padding_h // 2
79
+ pad_bottom = padding_h - pad_top
80
+ pad_left = padding_w // 2
81
+ pad_right = padding_w - pad_left
82
+
83
+ image = F.pad(image, (pad_left, pad_right, pad_top, pad_bottom), mode='constant', value=0)
84
+ return image
85
+
86
+ class WanInferencePipeline(nn.Module):
87
+ def __init__(self, args):
88
+ super().__init__()
89
+ self.args = args
90
+ self.device = torch.device(f"cuda")
91
+ if self.args.dtype=='bf16':
92
+ self.dtype = torch.bfloat16
93
+ elif self.args.dtype=='fp16':
94
+ self.dtype = torch.float16
95
+ else:
96
+ self.dtype = torch.float32
97
+ self.pipe = self.load_model()
98
+ if self.args.i2v:
99
+ chained_transforms = []
100
+ chained_transforms.append(TT.ToTensor())
101
+ self.transform = TT.Compose(chained_transforms)
102
+ if self.args.use_audio:
103
+ from OmniAvatar.models.wav2vec import Wav2VecModel
104
+ self.wav_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
105
+ self.args.wav2vec_path
106
+ )
107
+ self.audio_encoder = Wav2VecModel.from_pretrained(self.args.wav2vec_path, local_files_only=True).to(device=self.device)
108
+ self.audio_encoder.feature_extractor._freeze_parameters()
109
+
110
+ def load_model(self):
111
+ torch.cuda.set_device(0)
112
+ ckpt_path = f'{self.args.exp_path}/pytorch_model.pt'
113
+ assert os.path.exists(ckpt_path), f"pytorch_model.pt not found in {self.args.exp_path}"
114
+ if self.args.train_architecture == 'lora':
115
+ self.args.pretrained_lora_path = pretrained_lora_path = ckpt_path
116
+ else:
117
+ resume_path = ckpt_path
118
+
119
+ self.step = 0
120
+
121
+ # Load models
122
+ model_manager = ModelManager(device="cpu", infer=True)
123
+ model_manager.load_models(
124
+ [
125
+ self.args.dit_path.split(","),
126
+ self.args.text_encoder_path,
127
+ self.args.vae_path
128
+ ],
129
+ torch_dtype=self.dtype, # You can set `torch_dtype=torch.bfloat16` to disable FP8 quantization.
130
+ device='cpu',
131
+ )
132
+ LORA_REPO_ID = "Kijai/WanVideo_comfy"
133
+ LORA_FILENAME = "Wan21_CausVid_14B_T2V_lora_rank32.safetensors"
134
+ causvid_path = hf_hub_download(repo_id=LORA_REPO_ID, filename=LORA_FILENAME)
135
+ model_manager.load_lora(causvid_path, lora_alpha=1.0)
136
+ pipe = WanVideoPipeline.from_model_manager(model_manager,
137
+ torch_dtype=self.dtype,
138
+ device=f"cuda",
139
+ use_usp=True if self.args.sp_size > 1 else False,
140
+ infer=True)
141
+ if self.args.train_architecture == "lora":
142
+ print(f'Use LoRA: lora rank: {self.args.lora_rank}, lora alpha: {self.args.lora_alpha}')
143
+ self.add_lora_to_model(
144
+ pipe.denoising_model(),
145
+ lora_rank=self.args.lora_rank,
146
+ lora_alpha=self.args.lora_alpha,
147
+ lora_target_modules=self.args.lora_target_modules,
148
+ init_lora_weights=self.args.init_lora_weights,
149
+ pretrained_lora_path=pretrained_lora_path,
150
+ )
151
+ else:
152
+ missing_keys, unexpected_keys = pipe.denoising_model().load_state_dict(load_state_dict(resume_path), strict=True)
153
+ print(f"load from {resume_path}, {len(missing_keys)} missing keys, {len(unexpected_keys)} unexpected keys")
154
+ pipe.requires_grad_(False)
155
+ pipe.eval()
156
+ pipe.enable_vram_management(num_persistent_param_in_dit=self.args.num_persistent_param_in_dit) # You can set `num_persistent_param_in_dit` to a small number to reduce VRAM required.
157
+ return pipe
158
+
159
+ def add_lora_to_model(self, model, lora_rank=4, lora_alpha=4, lora_target_modules="q,k,v,o,ffn.0,ffn.2", init_lora_weights="kaiming", pretrained_lora_path=None, state_dict_converter=None):
160
+ # Add LoRA to the denoising model (DiT)
161
+ self.lora_alpha = lora_alpha
162
+ if init_lora_weights == "kaiming":
163
+ init_lora_weights = True
164
+
165
+ lora_config = LoraConfig(
166
+ r=lora_rank,
167
+ lora_alpha=lora_alpha,
168
+ init_lora_weights=init_lora_weights,
169
+ target_modules=lora_target_modules.split(","),
170
+ )
171
+ model = inject_adapter_in_model(lora_config, model)
172
+
173
+ # Load pretrained LoRA weights
174
+ if pretrained_lora_path is not None:
175
+ state_dict = load_state_dict(pretrained_lora_path)
176
+ if state_dict_converter is not None:
177
+ state_dict = state_dict_converter(state_dict)
178
+ missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
179
+ all_keys = [i for i, _ in model.named_parameters()]
180
+ num_updated_keys = len(all_keys) - len(missing_keys)
181
+ num_unexpected_keys = len(unexpected_keys)
182
+ print(f"{num_updated_keys} parameters are loaded from {pretrained_lora_path}. {num_unexpected_keys} parameters are unexpected.")
183
+
184
+
185
+ def forward(self, prompt,
186
+ image_path=None,
187
+ audio_path=None,
188
+ seq_len=101, # not used when audio_path is provided
189
+ height=720,
190
+ width=720,
191
+ overlap_frame=None,
192
+ num_steps=None,
193
+ negative_prompt=None,
194
+ guidance_scale=None,
195
+ audio_scale=None):
196
+ overlap_frame = overlap_frame if overlap_frame is not None else self.args.overlap_frame
197
+ num_steps = num_steps if num_steps is not None else self.args.num_steps
198
+ negative_prompt = negative_prompt if negative_prompt is not None else self.args.negative_prompt
199
+ guidance_scale = guidance_scale if guidance_scale is not None else self.args.guidance_scale
200
+ audio_scale = audio_scale if audio_scale is not None else self.args.audio_scale
201
+
202
+ if image_path is not None:
203
+ from PIL import Image
204
+ image = Image.open(image_path).convert("RGB")
205
+ image = self.transform(image).unsqueeze(0).to(self.device)
206
+ _, _, h, w = image.shape
207
+ select_size = match_size(getattr(self.args, f'image_sizes_{self.args.max_hw}'), h, w)
208
+ image = resize_pad(image, (h, w), select_size)
209
+ image = image * 2.0 - 1.0
210
+ image = image[:, :, None]
211
+ else:
212
+ image = None
213
+ select_size = [height, width]
214
+ L = int(self.args.max_tokens * 16 * 16 * 4 / select_size[0] / select_size[1])
215
+ L = L // 4 * 4 + 1 if L % 4 != 0 else L - 3 # video frames
216
+ T = (L + 3) // 4 # latent frames
217
+
218
+ if self.args.i2v:
219
+ if self.args.random_prefix_frames:
220
+ fixed_frame = overlap_frame
221
+ assert fixed_frame % 4 == 1
222
+ else:
223
+ fixed_frame = 1
224
+ prefix_lat_frame = (3 + fixed_frame) // 4
225
+ first_fixed_frame = 1
226
+ else:
227
+ fixed_frame = 0
228
+ prefix_lat_frame = 0
229
+ first_fixed_frame = 0
230
+
231
+
232
+ if audio_path is not None and self.args.use_audio:
233
+ audio, sr = librosa.load(audio_path, sr=self.args.sample_rate)
234
+ input_values = np.squeeze(
235
+ self.wav_feature_extractor(audio, sampling_rate=16000).input_values
236
+ )
237
+ input_values = torch.from_numpy(input_values).float().to(device=self.device)
238
+ ori_audio_len = audio_len = math.ceil(len(input_values) / self.args.sample_rate * self.args.fps)
239
+ input_values = input_values.unsqueeze(0)
240
+ # padding audio
241
+ if audio_len < L - first_fixed_frame:
242
+ audio_len = audio_len + ((L - first_fixed_frame) - audio_len % (L - first_fixed_frame))
243
+ elif (audio_len - (L - first_fixed_frame)) % (L - fixed_frame) != 0:
244
+ audio_len = audio_len + ((L - fixed_frame) - (audio_len - (L - first_fixed_frame)) % (L - fixed_frame))
245
+ input_values = F.pad(input_values, (0, audio_len * int(self.args.sample_rate / self.args.fps) - input_values.shape[1]), mode='constant', value=0)
246
+ with torch.no_grad():
247
+ hidden_states = self.audio_encoder(input_values, seq_len=audio_len, output_hidden_states=True)
248
+ audio_embeddings = hidden_states.last_hidden_state
249
+ for mid_hidden_states in hidden_states.hidden_states:
250
+ audio_embeddings = torch.cat((audio_embeddings, mid_hidden_states), -1)
251
+ seq_len = audio_len
252
+ audio_embeddings = audio_embeddings.squeeze(0)
253
+ audio_prefix = torch.zeros_like(audio_embeddings[:first_fixed_frame])
254
+ else:
255
+ audio_embeddings = None
256
+
257
+         # sliding-window generation loop: how many windows are needed to cover the sequence
+         times = (seq_len - L + first_fixed_frame) // (L - fixed_frame) + 1
+         if times * (L - fixed_frame) + fixed_frame < seq_len:
+             times += 1
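+         # Purely illustrative numbers: L=57, fixed_frame=13, first_fixed_frame=1 and seq_len=188
+         # give (188 - 57 + 1) // 44 + 1 = 4 windows; the check above adds one more window only
+         # when that count would still fall short of seq_len.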
+         video = []
+         image_emb = {}
+         img_lat = None
+         if self.args.i2v:
+             self.pipe.load_models_to_device(['vae'])
+             img_lat = self.pipe.encode_video(image.to(dtype=self.dtype)).to(self.device)
+
+             msk = torch.zeros_like(img_lat.repeat(1, 1, T, 1, 1)[:, :1])
+             image_cat = img_lat.repeat(1, 1, T, 1, 1)
+             msk[:, :, 1:] = 1
+             image_emb["y"] = torch.cat([image_cat, msk], dim=1)
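+             # image_emb["y"] packs the reference latent repeated across all T latent frames plus
+             # a one-channel mask (0 marks conditioning frames to keep, 1 marks frames to be
+             # generated); judging by how it is consumed downstream, this follows the Wan-style
+             # i2v conditioning layout.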
+         for t in range(times):
+             print(f"[{t+1}/{times}]")
+             audio_emb = {}
+             if t == 0:
+                 overlap = first_fixed_frame
+             else:
+                 overlap = fixed_frame
+                 image_emb["y"][:, -1:, :prefix_lat_frame] = 0  # first window: only 1 frame (the reference) is masked as fixed; later windows fix the whole overlap
+             prefix_overlap = (3 + overlap) // 4
+             if audio_embeddings is not None:
+                 if t == 0:
+                     audio_tensor = audio_embeddings[
+                         :min(L - overlap, audio_embeddings.shape[0])
+                     ]
+                 else:
+                     audio_start = L - first_fixed_frame + (t - 1) * (L - overlap)
+                     audio_tensor = audio_embeddings[
+                         audio_start: min(audio_start + L - overlap, audio_embeddings.shape[0])
+                     ]
+
+                 audio_tensor = torch.cat([audio_prefix, audio_tensor], dim=0)
+                 audio_prefix = audio_tensor[-fixed_frame:]
+                 audio_tensor = audio_tensor.unsqueeze(0).to(device=self.device, dtype=self.dtype)
+                 audio_emb["audio_emb"] = audio_tensor
+             else:
+                 audio_prefix = None
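+             # The last `fixed_frame` audio embeddings are carried over as `audio_prefix`, so the
+             # overlapped frames of the next window are driven by the same audio context they
+             # were generated with.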
+             if image is not None and img_lat is None:
+                 self.pipe.load_models_to_device(['vae'])
+                 img_lat = self.pipe.encode_video(image.to(dtype=self.dtype)).to(self.device)
+                 assert img_lat.shape[2] == prefix_overlap
+             img_lat = torch.cat([img_lat, torch.zeros_like(img_lat[:, :, :1].repeat(1, 1, T - prefix_overlap, 1, 1))], dim=2)
+             frames, _, latents = self.pipe.log_video(img_lat, prompt, prefix_overlap, image_emb, audio_emb,
+                                                      negative_prompt, num_inference_steps=num_steps,
+                                                      cfg_scale=guidance_scale, audio_cfg_scale=audio_scale if audio_scale is not None else guidance_scale,
+                                                      return_latent=True,
+                                                      tea_cache_l1_thresh=self.args.tea_cache_l1_thresh, tea_cache_model_id="Wan2.1-T2V-14B")
+             img_lat = None
+             image = (frames[:, -fixed_frame:].clip(0, 1) * 2 - 1).permute(0, 2, 1, 3, 4).contiguous()  # keep the tail frames as the next window's prefix
+             if t == 0:
+                 video.append(frames)
+             else:
+                 video.append(frames[:, overlap:])  # drop frames already emitted by the previous window
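+         # Each appended chunk already excludes its overlapped prefix, so a plain concat along
+         # the frame axis yields a continuous clip; it is then trimmed to the original audio
+         # length plus one extra frame for the reference image.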
+         video = torch.cat(video, dim=1)
+         video = video[:, :ori_audio_len + 1]
+         return video
+
+
+ def main():
+
+     # os.makedirs("demo_out/config", exist_ok=True)
+     # OmegaConf.save(config=OmegaConf.create(vars(args)),
+     #                f="demo_out/config/args_config.yaml")
+     # print("Saved merged args to demo_out/config/args_config.yaml")
+
+     set_seed(args.seed)
+     # load data
+     data_iter = read_from_file(args.input_file)
+     exp_name = os.path.basename(args.exp_path)
+     seq_len = args.seq_len
+
+     # build the audio-driven avatar inference pipeline
+     inferpipe = WanInferencePipeline(args)
+
+     output_dir = 'demo_out'
+
+     idx = 0
+     text = "A realistic video of a man speaking directly to the camera on a sofa, with dynamic and rhythmic hand gestures that complement his speech. His hands are clearly visible, independent, and unobstructed. His facial expressions are expressive and full of emotion, enhancing the delivery. The camera remains steady, capturing sharp, clear movements and a focused, engaging presence."
+     image_path = "examples/images/0000.jpeg"
+     audio_path = "examples/audios/0000.MP3"
+     audio_dir = output_dir + '/audio'
+     os.makedirs(audio_dir, exist_ok=True)
+     if args.silence_duration_s > 0:
+         input_audio_path = os.path.join(audio_dir, f"audio_input_{idx:03d}.wav")
+     else:
+         input_audio_path = audio_path
+     prompt_dir = output_dir + '/prompt'
+     os.makedirs(prompt_dir, exist_ok=True)
+
+     if args.silence_duration_s > 0:
+         add_silence_to_audio_ffmpeg(audio_path, input_audio_path, args.silence_duration_s)
+
+     video = inferpipe(
+         prompt=text,
+         image_path=image_path,
+         audio_path=input_audio_path,
+         seq_len=seq_len
+     )
+     tmp2_audio_path = os.path.join(audio_dir, f"audio_out_{idx:03d}.wav")  # the first frame is the reference frame, so the audio is shifted by one frame (1/fps s) of prepended silence
+     prompt_path = os.path.join(prompt_dir, f"prompt_{idx:03d}.txt")
+
+
+     add_silence_to_audio_ffmpeg(audio_path, tmp2_audio_path, 1.0 / args.fps + args.silence_duration_s)
+     save_video_as_grid_and_mp4(video,
+                                output_dir,
+                                args.fps,
+                                prompt=text,
+                                prompt_path=prompt_path,
+                                audio_path=tmp2_audio_path if args.use_audio else None,
+                                prefix=f'result_{idx:03d}')
+
+
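+ # Outputs land under demo_out/: the shifted audio in demo_out/audio/, the prompt text in
+ # demo_out/prompt/, and the rendered video saved with the prefix result_000 (the exact file
+ # naming is up to save_video_as_grid_and_mp4).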
+ class NoPrint:
+     def write(self, x):
+         pass
+     def flush(self):
+         pass
+
+ if __name__ == '__main__':
+     if not args.debug:
+         if args.local_rank != 0:  # silence stdout on every rank except rank 0
+             sys.stdout = NoPrint()
+     main()