Commit 79cc00b · Parent(s): b530233

update

Files changed:

- hy3dgen/shapegen/__init__.py +1 -1
- hy3dgen/shapegen/models/__init__.py +1 -1
- hy3dgen/shapegen/models/conditioner.py +104 -12
- hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py +12 -3
- hy3dgen/shapegen/pipelines.py +181 -65
- hy3dgen/shapegen/postprocessors.py +4 -1
- hy3dgen/shapegen/preprocessors.py +55 -6
    	
hy3dgen/shapegen/__init__.py
CHANGED

@@ -13,5 +13,5 @@
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
 from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
-from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
+from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover, MeshSimplifier
 from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
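The only change here re-exports the new MeshSimplifier postprocessor from the package root. A minimal usage sketch, assuming the package is importable and that MeshSimplifier follows the same callable-over-trimesh.Trimesh convention as the existing postprocessors; the input path and chain order are illustrative:

import trimesh
from hy3dgen.shapegen import FloaterRemover, DegenerateFaceRemover, FaceReducer, MeshSimplifier

mesh = trimesh.load('demo.glb')        # illustrative input mesh
mesh = FloaterRemover()(mesh)          # drop disconnected fragments
mesh = DegenerateFaceRemover()(mesh)   # drop degenerate faces
mesh = MeshSimplifier()(mesh)          # newly re-exported simplification pass (assumed callable)
mesh = FaceReducer()(mesh)             # cap the final face count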
    	
hy3dgen/shapegen/models/__init__.py
CHANGED

@@ -25,4 +25,4 @@
 
 from .autoencoders import ShapeVAE
 from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
-from .denoisers import
+from .denoisers import Hunyuan3DDiT
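This completes the truncated denoiser re-export, so the DiT can be imported from the models package directly. A trivial import check, assuming hy3dgen is installed:

from hy3dgen.shapegen.models import ShapeVAE, Hunyuan3DDiT, DinoImageEncoder  # re-exported names
print(Hunyuan3DDiT.__name__)  # the DiT denoiser class used by the shape pipelines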
    	
hy3dgen/shapegen/models/conditioner.py
CHANGED

@@ -22,6 +22,7 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
+import numpy as np
 import torch
 import torch.nn as nn
 from torchvision import transforms
@@ -33,6 +34,26 @@ from transformers import (
 )
 
 
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000 ** omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    return np.concatenate([emb_sin, emb_cos], axis=1)
+
+
 class ImageEncoder(nn.Module):
     def __init__(
         self,
@@ -67,7 +88,7 @@ class ImageEncoder(nn.Module):
             ]
         )
 
-    def forward(self, image, mask=None, value_range=(-1, 1)):
+    def forward(self, image, mask=None, value_range=(-1, 1), **kwargs):
         if value_range is not None:
             low, high = value_range
             image = (image - low) / (high - low)
@@ -82,7 +103,7 @@ class ImageEncoder(nn.Module):
 
         return last_hidden_state
 
-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
         device = next(self.model.parameters()).device
         dtype = next(self.model.parameters()).dtype
         zero = torch.zeros(
@@ -110,11 +131,82 @@ class DinoImageEncoder(ImageEncoder):
     std = [0.229, 0.224, 0.225]
 
 
+class DinoImageEncoderMV(DinoImageEncoder):
+    def __init__(
+        self,
+        version=None,
+        config=None,
+        use_cls_token=True,
+        image_size=224,
+        view_num=4,
+        **kwargs,
+    ):
+        super().__init__(version, config, use_cls_token, image_size, **kwargs)
+        self.view_num = view_num
+        self.num_patches = self.num_patches
+        pos = np.arange(self.view_num, dtype=np.float32)
+        view_embedding = torch.from_numpy(
+            get_1d_sincos_pos_embed_from_grid(self.model.config.hidden_size, pos)).float()
+
+        view_embedding = view_embedding.unsqueeze(1).repeat(1, self.num_patches, 1)
+        self.view_embed = view_embedding.unsqueeze(0)
+
+    def forward(self, image, mask=None, value_range=(-1, 1), view_idxs=None):
+        if value_range is not None:
+            low, high = value_range
+            image = (image - low) / (high - low)
+
+        image = image.to(self.model.device, dtype=self.model.dtype)
+
+        bs, num_views, c, h, w = image.shape
+        image = image.view(bs * num_views, c, h, w)
+
+        inputs = self.transform(image)
+        outputs = self.model(inputs)
+
+        last_hidden_state = outputs.last_hidden_state
+        last_hidden_state = last_hidden_state.view(
+            bs, num_views, last_hidden_state.shape[-2],
+            last_hidden_state.shape[-1]
+        )
+
+        view_embedding = self.view_embed.to(last_hidden_state.dtype).to(last_hidden_state.device)
+        if view_idxs is not None:
+            assert len(view_idxs) == bs
+            view_embeddings = []
+            for i in range(bs):
+                view_idx = view_idxs[i]
+                assert num_views == len(view_idx)
+                view_embeddings.append(self.view_embed[:, view_idx, ...])
+            view_embedding = torch.cat(view_embeddings, 0).to(last_hidden_state.dtype).to(last_hidden_state.device)
+
+        if num_views != self.view_num:
+            view_embedding = view_embedding[:, :num_views, ...]
+        last_hidden_state = last_hidden_state + view_embedding
+        last_hidden_state = last_hidden_state.view(bs, num_views * last_hidden_state.shape[-2],
+                                                   last_hidden_state.shape[-1])
+        return last_hidden_state
+
+    def unconditional_embedding(self, batch_size, view_idxs=None, **kwargs):
+        device = next(self.model.parameters()).device
+        dtype = next(self.model.parameters()).dtype
+        zero = torch.zeros(
+            batch_size,
+            self.num_patches * len(view_idxs[0]),
+            self.model.config.hidden_size,
+            device=device,
+            dtype=dtype,
+        )
+        return zero
+
+
 def build_image_encoder(config):
     if config['type'] == 'CLIPImageEncoder':
         return CLIPImageEncoder(**config['kwargs'])
     elif config['type'] == 'DinoImageEncoder':
         return DinoImageEncoder(**config['kwargs'])
+    elif config['type'] == 'DinoImageEncoderMV':
+        return DinoImageEncoderMV(**config['kwargs'])
     else:
         raise ValueError(f'Unknown image encoder type: {config["type"]}')
 
@@ -129,17 +221,17 @@ class DualImageEncoder(nn.Module):
         self.main_image_encoder = build_image_encoder(main_image_encoder)
         self.additional_image_encoder = build_image_encoder(additional_image_encoder)
 
-    def forward(self, image, mask=None):
+    def forward(self, image, mask=None, **kwargs):
         outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
-            'additional': self.additional_image_encoder(image, mask=mask),
+            'main': self.main_image_encoder(image, mask=mask, **kwargs),
+            'additional': self.additional_image_encoder(image, mask=mask, **kwargs),
         }
         return outputs
 
-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
         outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
-            'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
+            'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs),
+            'additional': self.additional_image_encoder.unconditional_embedding(batch_size, **kwargs),
         }
         return outputs
 
@@ -152,14 +244,14 @@ class SingleImageEncoder(nn.Module):
         super().__init__()
         self.main_image_encoder = build_image_encoder(main_image_encoder)
 
-    def forward(self, image, mask=None):
+    def forward(self, image, mask=None, **kwargs):
         outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
+            'main': self.main_image_encoder(image, mask=mask, **kwargs),
         }
         return outputs
 
-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
         outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
+            'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs),
         }
         return outputs
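A self-contained sketch of what the new helper computes and how DinoImageEncoderMV uses it: one sinusoidal code per view, repeated across all patch tokens and added to the DINO features before the view and patch axes are flattened. The sizes below are illustrative, not read from any model config:

import numpy as np

def sincos_1d(embed_dim, pos):
    # same construction as get_1d_sincos_pos_embed_from_grid in the diff above
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega                                  # (D/2,)
    out = np.einsum('m,d->md', pos.reshape(-1), omega)           # (M, D/2)
    return np.concatenate([np.sin(out), np.cos(out)], axis=1)    # (M, D)

hidden_size, num_patches, view_num = 1024, 257, 4                # illustrative sizes
pos = np.arange(view_num, dtype=np.float32)
view_embed = sincos_1d(hidden_size, pos)                         # (4, 1024): one code per view
view_embed = np.repeat(view_embed[:, None, :], num_patches, axis=1)  # (4, 257, 1024)
# forward() adds this (broadcast over the batch) to tokens of shape
# (bs, num_views, num_patches, hidden), then reshapes to (bs, num_views * num_patches, hidden).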
    	
hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py
CHANGED

@@ -60,6 +60,15 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
     return embedding
 
 
+class GELU(nn.Module):
+    def __init__(self, approximate='tanh'):
+        super().__init__()
+        self.approximate = approximate
+
+    def forward(self, x: Tensor) -> Tensor:
+        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)
+
+
 class MLPEmbedder(nn.Module):
     def __init__(self, in_dim: int, hidden_dim: int):
         super().__init__()
@@ -162,7 +171,7 @@ class DoubleStreamBlock(nn.Module):
         self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.img_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-
+            GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -173,7 +182,7 @@ class DoubleStreamBlock(nn.Module):
         self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.txt_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-
+            GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -239,7 +248,7 @@ class SingleStreamBlock(nn.Module):
         self.hidden_size = hidden_size
         self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
 
-        self.mlp_act =
+        self.mlp_act = GELU(approximate="tanh")
         self.modulation = Modulation(hidden_size, double=False)
 
     def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
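The three MLP activations now go through a small in-repo GELU module; apart from forcing a contiguous input it matches PyTorch's built-in tanh-approximated GELU. A quick equivalence sketch (requires PyTorch >= 1.12 for the approximate argument):

import torch
import torch.nn as nn

class GELU(nn.Module):  # same definition as the class added above
    def __init__(self, approximate='tanh'):
        super().__init__()
        self.approximate = approximate

    def forward(self, x):
        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)

x = torch.randn(2, 8)
assert torch.allclose(GELU()(x), nn.GELU(approximate='tanh')(x))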
    	
        hy3dgen/shapegen/pipelines.py
    CHANGED
    
    | @@ -24,11 +24,12 @@ import trimesh | |
| 24 | 
             
            import yaml
         | 
| 25 | 
             
            from PIL import Image
         | 
| 26 | 
             
            from diffusers.utils.torch_utils import randn_tensor
         | 
|  | |
| 27 | 
             
            from tqdm import tqdm
         | 
| 28 |  | 
| 29 | 
             
            from .models.autoencoders import ShapeVAE
         | 
| 30 | 
             
            from .models.autoencoders import SurfaceExtractors
         | 
| 31 | 
            -
            from .utils import logger, synchronize_timer
         | 
| 32 |  | 
| 33 |  | 
| 34 | 
             
            def retrieve_timesteps(
         | 
| @@ -127,6 +128,9 @@ def instantiate_from_config(config, **kwargs): | |
| 127 |  | 
| 128 |  | 
| 129 | 
             
            class Hunyuan3DDiTPipeline:
         | 
|  | |
|  | |
|  | |
| 130 | 
             
                @classmethod
         | 
| 131 | 
             
                @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
         | 
| 132 | 
             
                def from_single_file(
         | 
| @@ -207,34 +211,12 @@ class Hunyuan3DDiTPipeline: | |
| 207 | 
             
                        dtype=dtype,
         | 
| 208 | 
             
                        device=device,
         | 
| 209 | 
             
                    )
         | 
| 210 | 
            -
                     | 
| 211 | 
            -
             | 
| 212 | 
            -
             | 
| 213 | 
            -
             | 
| 214 | 
            -
             | 
| 215 | 
            -
                     | 
| 216 | 
            -
                        logger.info('Model path not exists, try to download from huggingface')
         | 
| 217 | 
            -
                        try:
         | 
| 218 | 
            -
                            import huggingface_hub
         | 
| 219 | 
            -
                            # download from huggingface
         | 
| 220 | 
            -
                            path = huggingface_hub.snapshot_download(repo_id=original_model_path)
         | 
| 221 | 
            -
                            model_path = os.path.join(path, subfolder)
         | 
| 222 | 
            -
                        except ImportError:
         | 
| 223 | 
            -
                            logger.warning(
         | 
| 224 | 
            -
                                "You need to install HuggingFace Hub to load models from the hub."
         | 
| 225 | 
            -
                            )
         | 
| 226 | 
            -
                            raise RuntimeError(f"Model path {model_path} not found")
         | 
| 227 | 
            -
                        except Exception as e:
         | 
| 228 | 
            -
                            raise e
         | 
| 229 | 
            -
             | 
| 230 | 
            -
                    if not os.path.exists(model_path):
         | 
| 231 | 
            -
                        raise FileNotFoundError(f"Model path {original_model_path} not found")
         | 
| 232 | 
            -
             | 
| 233 | 
            -
                    extension = 'ckpt' if not use_safetensors else 'safetensors'
         | 
| 234 | 
            -
                    variant = '' if variant is None else f'.{variant}'
         | 
| 235 | 
            -
                    ckpt_name = f'model{variant}.{extension}'
         | 
| 236 | 
            -
                    config_path = os.path.join(model_path, 'config.yaml')
         | 
| 237 | 
            -
                    ckpt_path = os.path.join(model_path, ckpt_name)
         | 
| 238 | 
             
                    return cls.from_single_file(
         | 
| 239 | 
             
                        ckpt_path,
         | 
| 240 | 
             
                        config_path,
         | 
| @@ -279,12 +261,18 @@ class Hunyuan3DDiTPipeline: | |
| 279 | 
             
                    if enabled:
         | 
| 280 | 
             
                        model_path = self.kwargs['from_pretrained_kwargs']['model_path']
         | 
| 281 | 
             
                        turbo_vae_mapping = {
         | 
| 282 | 
            -
                            'Hunyuan3D-2': 'hunyuan3d-vae-v2-0-turbo',
         | 
| 283 | 
            -
                            'Hunyuan3D- | 
|  | |
| 284 | 
             
                        }
         | 
| 285 | 
             
                        model_name = model_path.split('/')[-1]
         | 
| 286 | 
             
                        if replace_vae and model_name in turbo_vae_mapping:
         | 
| 287 | 
            -
                             | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 288 | 
             
                        self.vae.enable_flashvdm_decoder(
         | 
| 289 | 
             
                            enabled=enabled,
         | 
| 290 | 
             
                            adaptive_kv_selection=adaptive_kv_selection,
         | 
| @@ -294,33 +282,146 @@ class Hunyuan3DDiTPipeline: | |
| 294 | 
             
                    else:
         | 
| 295 | 
             
                        model_path = self.kwargs['from_pretrained_kwargs']['model_path']
         | 
| 296 | 
             
                        vae_mapping = {
         | 
| 297 | 
            -
                            'Hunyuan3D-2': 'hunyuan3d-vae-v2-0',
         | 
| 298 | 
            -
                            'Hunyuan3D- | 
|  | |
| 299 | 
             
                        }
         | 
| 300 | 
             
                        model_name = model_path.split('/')[-1]
         | 
| 301 | 
             
                        if model_name in vae_mapping:
         | 
| 302 | 
            -
                             | 
|  | |
| 303 | 
             
                        self.vae.enable_flashvdm_decoder(enabled=False)
         | 
| 304 |  | 
| 305 | 
             
                def to(self, device=None, dtype=None):
         | 
| 306 | 
            -
                    if device is not None:
         | 
| 307 | 
            -
                        self.device = torch.device(device)
         | 
| 308 | 
            -
                        self.vae.to(device)
         | 
| 309 | 
            -
                        self.model.to(device)
         | 
| 310 | 
            -
                        self.conditioner.to(device)
         | 
| 311 | 
             
                    if dtype is not None:
         | 
| 312 | 
             
                        self.dtype = dtype
         | 
| 313 | 
             
                        self.vae.to(dtype=dtype)
         | 
| 314 | 
             
                        self.model.to(dtype=dtype)
         | 
| 315 | 
             
                        self.conditioner.to(dtype=dtype)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 316 |  | 
| 317 | 
             
                @synchronize_timer('Encode cond')
         | 
| 318 | 
            -
                def encode_cond(self, image,  | 
| 319 | 
             
                    bsz = image.shape[0]
         | 
| 320 | 
            -
                    cond = self.conditioner(image=image,  | 
| 321 |  | 
| 322 | 
             
                    if do_classifier_free_guidance:
         | 
| 323 | 
            -
                        un_cond = self.conditioner.unconditional_embedding(bsz)
         | 
| 324 |  | 
| 325 | 
             
                        if dual_guidance:
         | 
| 326 | 
             
                            un_cond_drop_main = copy.deepcopy(un_cond)
         | 
| @@ -336,8 +437,6 @@ class Hunyuan3DDiTPipeline: | |
| 336 |  | 
| 337 | 
             
                            cond = cat_recursive(cond, un_cond_drop_main, un_cond)
         | 
| 338 | 
             
                        else:
         | 
| 339 | 
            -
                            un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
         | 
| 340 | 
            -
             | 
| 341 | 
             
                            def cat_recursive(a, b):
         | 
| 342 | 
             
                                if isinstance(a, torch.Tensor):
         | 
| 343 | 
             
                                    return torch.cat([a, b], dim=0).to(self.dtype)
         | 
| @@ -383,25 +482,27 @@ class Hunyuan3DDiTPipeline: | |
| 383 | 
             
                    latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
         | 
| 384 | 
             
                    return latents
         | 
| 385 |  | 
| 386 | 
            -
                def prepare_image(self, image):
         | 
| 387 | 
             
                    if isinstance(image, str) and not os.path.exists(image):
         | 
| 388 | 
             
                        raise FileNotFoundError(f"Couldn't find image at path {image}")
         | 
| 389 |  | 
| 390 | 
             
                    if not isinstance(image, list):
         | 
| 391 | 
             
                        image = [image]
         | 
| 392 | 
            -
             | 
| 393 | 
            -
                     | 
| 394 | 
             
                    for img in image:
         | 
| 395 | 
            -
                         | 
| 396 | 
            -
                         | 
| 397 | 
            -
                        mask_pts.append(mask_pt)
         | 
| 398 |  | 
| 399 | 
            -
                     | 
| 400 | 
            -
                     | 
| 401 | 
            -
                         | 
| 402 | 
            -
             | 
| 403 | 
            -
             | 
| 404 | 
            -
             | 
|  | |
|  | |
|  | |
| 405 |  | 
| 406 | 
             
                def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
         | 
| 407 | 
             
                    """
         | 
| @@ -474,10 +575,14 @@ class Hunyuan3DDiTPipeline: | |
| 474 | 
             
                                                  getattr(self.model, 'guidance_cond_proj_dim', None) is None
         | 
| 475 | 
             
                    dual_guidance = dual_guidance_scale >= 0 and dual_guidance
         | 
| 476 |  | 
| 477 | 
            -
                     | 
| 478 | 
            -
                     | 
| 479 | 
            -
             | 
| 480 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
| 481 | 
             
                    batch_size = image.shape[0]
         | 
| 482 |  | 
| 483 | 
             
                    t_dtype = torch.long
         | 
| @@ -535,7 +640,17 @@ class Hunyuan3DDiTPipeline: | |
| 535 | 
             
                        box_v, mc_level, num_chunks, octree_resolution, mc_algo,
         | 
| 536 | 
             
                    )
         | 
| 537 |  | 
| 538 | 
            -
                def _export( | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 539 | 
             
                    if not output_type == "latent":
         | 
| 540 | 
             
                        latents = 1. / self.vae.scale_factor * latents
         | 
| 541 | 
             
                        latents = self.vae(latents)
         | 
| @@ -562,7 +677,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): | |
| 562 | 
             
                @torch.inference_mode()
         | 
| 563 | 
             
                def __call__(
         | 
| 564 | 
             
                    self,
         | 
| 565 | 
            -
                    image: Union[str, List[str], Image.Image] = None,
         | 
| 566 | 
             
                    num_inference_steps: int = 50,
         | 
| 567 | 
             
                    timesteps: List[int] = None,
         | 
| 568 | 
             
                    sigmas: List[float] = None,
         | 
| @@ -590,10 +705,11 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): | |
| 590 | 
             
                        self.model.guidance_embed is True
         | 
| 591 | 
             
                    )
         | 
| 592 |  | 
| 593 | 
            -
                     | 
|  | |
| 594 | 
             
                    cond = self.encode_cond(
         | 
| 595 | 
             
                        image=image,
         | 
| 596 | 
            -
                         | 
| 597 | 
             
                        do_classifier_free_guidance=do_classifier_free_guidance,
         | 
| 598 | 
             
                        dual_guidance=False,
         | 
| 599 | 
             
                    )
         | 
|  | |
| 24 | 
             
            import yaml
         | 
| 25 | 
             
            from PIL import Image
         | 
| 26 | 
             
            from diffusers.utils.torch_utils import randn_tensor
         | 
| 27 | 
            +
            from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available
         | 
| 28 | 
             
            from tqdm import tqdm
         | 
| 29 |  | 
| 30 | 
             
            from .models.autoencoders import ShapeVAE
         | 
| 31 | 
             
            from .models.autoencoders import SurfaceExtractors
         | 
| 32 | 
            +
            from .utils import logger, synchronize_timer, smart_load_model
         | 
| 33 |  | 
| 34 |  | 
| 35 | 
             
            def retrieve_timesteps(
         | 
|  | |
| 128 |  | 
| 129 |  | 
| 130 | 
             
            class Hunyuan3DDiTPipeline:
         | 
| 131 | 
            +
                model_cpu_offload_seq = "conditioner->model->vae"
         | 
| 132 | 
            +
                _exclude_from_cpu_offload = []
         | 
| 133 | 
            +
             | 
| 134 | 
             
                @classmethod
         | 
| 135 | 
             
                @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
         | 
| 136 | 
             
                def from_single_file(
         | 
|  | |
| 211 | 
             
                        dtype=dtype,
         | 
| 212 | 
             
                        device=device,
         | 
| 213 | 
             
                    )
         | 
| 214 | 
            +
                    config_path, ckpt_path = smart_load_model(
         | 
| 215 | 
            +
                        model_path,
         | 
| 216 | 
            +
                        subfolder=subfolder,
         | 
| 217 | 
            +
                        use_safetensors=use_safetensors,
         | 
| 218 | 
            +
                        variant=variant
         | 
| 219 | 
            +
                    )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 220 | 
             
                    return cls.from_single_file(
         | 
| 221 | 
             
                        ckpt_path,
         | 
| 222 | 
             
                        config_path,
         | 
|  | |
| 261 | 
             
                    if enabled:
         | 
| 262 | 
             
                        model_path = self.kwargs['from_pretrained_kwargs']['model_path']
         | 
| 263 | 
             
                        turbo_vae_mapping = {
         | 
| 264 | 
            +
                            'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
         | 
| 265 | 
            +
                            'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
         | 
| 266 | 
            +
                            'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'),
         | 
| 267 | 
             
                        }
         | 
| 268 | 
             
                        model_name = model_path.split('/')[-1]
         | 
| 269 | 
             
                        if replace_vae and model_name in turbo_vae_mapping:
         | 
| 270 | 
            +
                            model_path, subfolder = turbo_vae_mapping[model_name]
         | 
| 271 | 
            +
                            self.vae = ShapeVAE.from_pretrained(
         | 
| 272 | 
            +
                                model_path, subfolder=subfolder,
         | 
| 273 | 
            +
                                use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'],
         | 
| 274 | 
            +
                                device=self.device,
         | 
| 275 | 
            +
                            )
         | 
| 276 | 
             
                        self.vae.enable_flashvdm_decoder(
         | 
| 277 | 
             
                            enabled=enabled,
         | 
| 278 | 
             
                            adaptive_kv_selection=adaptive_kv_selection,
         | 
|  | |
| 282 | 
             
                    else:
         | 
| 283 | 
             
                        model_path = self.kwargs['from_pretrained_kwargs']['model_path']
         | 
| 284 | 
             
                        vae_mapping = {
         | 
| 285 | 
            +
                            'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
         | 
| 286 | 
            +
                            'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
         | 
| 287 | 
            +
                            'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'),
         | 
| 288 | 
             
                        }
         | 
| 289 | 
             
                        model_name = model_path.split('/')[-1]
         | 
| 290 | 
             
                        if model_name in vae_mapping:
         | 
| 291 | 
            +
                            model_path, subfolder = vae_mapping[model_name]
         | 
| 292 | 
            +
                            self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder)
         | 
| 293 | 
             
                        self.vae.enable_flashvdm_decoder(enabled=False)
         | 
| 294 |  | 
| 295 | 
             
                def to(self, device=None, dtype=None):
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 296 | 
             
                    if dtype is not None:
         | 
| 297 | 
             
                        self.dtype = dtype
         | 
| 298 | 
             
                        self.vae.to(dtype=dtype)
         | 
| 299 | 
             
                        self.model.to(dtype=dtype)
         | 
| 300 | 
             
                        self.conditioner.to(dtype=dtype)
         | 
| 301 | 
            +
                    if device is not None:
         | 
| 302 | 
            +
                        self.device = torch.device(device)
         | 
| 303 | 
            +
                        self.vae.to(device)
         | 
| 304 | 
            +
                        self.model.to(device)
         | 
| 305 | 
            +
                        self.conditioner.to(device)
         | 
| 306 | 
            +
             | 
| 307 | 
            +
                @property
         | 
| 308 | 
            +
                def _execution_device(self):
         | 
| 309 | 
            +
                    r"""
         | 
| 310 | 
            +
                    Returns the device on which the pipeline's models will be executed. After calling
         | 
| 311 | 
            +
                    [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
         | 
| 312 | 
            +
                    Accelerate's module hooks.
         | 
| 313 | 
            +
                    """
         | 
| 314 | 
            +
                    for name, model in self.components.items():
         | 
| 315 | 
            +
                        if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
         | 
| 316 | 
            +
                            continue
         | 
| 317 | 
            +
             | 
| 318 | 
            +
                        if not hasattr(model, "_hf_hook"):
         | 
| 319 | 
            +
                            return self.device
         | 
| 320 | 
            +
                        for module in model.modules():
         | 
| 321 | 
            +
                            if (
         | 
| 322 | 
            +
                                hasattr(module, "_hf_hook")
         | 
| 323 | 
            +
                                and hasattr(module._hf_hook, "execution_device")
         | 
| 324 | 
            +
                                and module._hf_hook.execution_device is not None
         | 
| 325 | 
            +
                            ):
         | 
| 326 | 
            +
                                return torch.device(module._hf_hook.execution_device)
         | 
| 327 | 
            +
                    return self.device
         | 
| 328 | 
            +
             | 
| 329 | 
            +
                def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         | 
| 330 | 
            +
                    r"""
         | 
| 331 | 
            +
                    Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         | 
| 332 | 
            +
                    to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
         | 
| 333 | 
            +
                    method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         | 
| 334 | 
            +
                    `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         | 
| 335 | 
            +
             | 
| 336 | 
            +
                    Arguments:
         | 
| 337 | 
            +
                        gpu_id (`int`, *optional*):
         | 
| 338 | 
            +
                            The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
         | 
| 339 | 
            +
                        device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
         | 
| 340 | 
            +
                            The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
         | 
| 341 | 
            +
                            default to "cuda".
         | 
| 342 | 
            +
                    """
         | 
| 343 | 
            +
                    if self.model_cpu_offload_seq is None:
         | 
| 344 | 
            +
                        raise ValueError(
         | 
| 345 | 
            +
                            "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
         | 
| 346 | 
            +
                        )
         | 
| 347 | 
            +
             | 
| 348 | 
            +
                    if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
         | 
| 349 | 
            +
                        from accelerate import cpu_offload_with_hook
         | 
| 350 | 
            +
                    else:
         | 
| 351 | 
            +
                        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
         | 
| 352 | 
            +
             | 
| 353 | 
            +
                    torch_device = torch.device(device)
         | 
| 354 | 
            +
                    device_index = torch_device.index
         | 
| 355 | 
            +
             | 
| 356 | 
            +
                    if gpu_id is not None and device_index is not None:
         | 
| 357 | 
            +
                        raise ValueError(
         | 
| 358 | 
            +
                            f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
         | 
| 359 | 
            +
                            f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
         | 
| 360 | 
            +
                        )
         | 
| 361 | 
            +
             | 
| 362 | 
            +
                    # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
         | 
| 363 | 
            +
                    self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
         | 
| 364 | 
            +
             | 
| 365 | 
            +
                    device_type = torch_device.type
         | 
| 366 | 
            +
                    device = torch.device(f"{device_type}:{self._offload_gpu_id}")
         | 
| 367 | 
            +
             | 
| 368 | 
            +
                    if self.device.type != "cpu":
         | 
| 369 | 
            +
                        self.to("cpu")
         | 
| 370 | 
            +
                        device_mod = getattr(torch, self.device.type, None)
         | 
| 371 | 
            +
                        if hasattr(device_mod, "empty_cache") and device_mod.is_available():
         | 
| 372 | 
            +
                            device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
         | 
| 373 | 
            +
             | 
| 374 | 
            +
                    all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
         | 
| 375 | 
            +
             | 
| 376 | 
            +
                    self._all_hooks = []
         | 
| 377 | 
            +
                    hook = None
         | 
| 378 | 
+        for model_str in self.model_cpu_offload_seq.split("->"):
+            model = all_model_components.pop(model_str, None)
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
+            self._all_hooks.append(hook)
+
+        # CPU offload models that are not in the seq chain unless they are explicitly excluded.
+        # These models will stay on CPU until maybe_free_model_hooks is called.
+        # Some models cannot be in the seq chain because they are called iteratively, such as a controlnet.
+        for name, model in all_model_components.items():
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            if name in self._exclude_from_cpu_offload:
+                model.to(device)
+            else:
+                _, hook = cpu_offload_with_hook(model, device)
+                self._all_hooks.append(hook)
+
+    def maybe_free_model_hooks(self):
+        r"""
+        Offload all components, remove all model hooks that were added by
+        `enable_model_cpu_offload`, and then apply them again. If the model has not been offloaded,
+        this function is a no-op. Make sure to add this function to the end of your pipeline's
+        `__call__` so that it works correctly when `enable_model_cpu_offload` is enabled.
+        """
+        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
+            # `enable_model_cpu_offload` has not been called, so silently do nothing
+            return
+
+        for hook in self._all_hooks:
+            # offload the model and remove the hook from it
+            hook.offload()
+            hook.remove()
+
+        # make sure the model is in the same state as before calling it
+        self.enable_model_cpu_offload()
 
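For orientation, here is a minimal, self-contained sketch of the hook-chaining pattern used above, built with accelerate's `cpu_offload_with_hook`; the two `nn.Linear` modules are hypothetical stand-ins for the pipeline's components, and the final loop mirrors what `maybe_free_model_hooks` does.

```python
# Sketch only: assumes `accelerate` is installed; the modules are placeholders.
import torch
import torch.nn as nn
from accelerate import cpu_offload_with_hook

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

conditioner = nn.Linear(8, 8)   # hypothetical stand-in for the image conditioner
denoiser = nn.Linear(8, 8)      # hypothetical stand-in for the denoiser

hooks = []
prev_hook = None
for module in (conditioner, denoiser):
    # Each module is moved to `device` right before its forward pass, and the
    # previous module in the chain is offloaded back to CPU at that point.
    _, prev_hook = cpu_offload_with_hook(module, device, prev_module_hook=prev_hook)
    hooks.append(prev_hook)

x = torch.randn(1, 8)
y = denoiser(conditioner(x))

# Equivalent of maybe_free_model_hooks(): offload everything and detach the hooks.
for hook in hooks:
    hook.offload()
    hook.remove()
```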
     @synchronize_timer('Encode cond')
+    def encode_cond(self, image, additional_cond_inputs, do_classifier_free_guidance, dual_guidance):
         bsz = image.shape[0]
+        cond = self.conditioner(image=image, **additional_cond_inputs)
 
         if do_classifier_free_guidance:
+            un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
 
             if dual_guidance:
                 un_cond_drop_main = copy.deepcopy(un_cond)
…
 
                 cond = cat_recursive(cond, un_cond_drop_main, un_cond)
             else:
                 def cat_recursive(a, b):
                     if isinstance(a, torch.Tensor):
                         return torch.cat([a, b], dim=0).to(self.dtype)
…
         latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
         return latents
 
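The classifier-free-guidance branch above doubles the batch by concatenating conditional and unconditional embeddings so both can be denoised in one forward pass. A small illustrative sketch (the dict key and tensor shapes are hypothetical):

```python
import torch

def cat_recursive(a, b):
    # Concatenate conditional and unconditional embeddings along the batch
    # dimension, recursing into dict-valued conditions.
    if isinstance(a, torch.Tensor):
        return torch.cat([a, b], dim=0)
    return {k: cat_recursive(a[k], b[k]) for k in a}

cond = {"main": torch.randn(2, 257, 1024)}     # hypothetical image tokens
un_cond = {"main": torch.zeros(2, 257, 1024)}  # hypothetical unconditional tokens

cfg_cond = cat_recursive(cond, un_cond)
print(cfg_cond["main"].shape)  # torch.Size([4, 257, 1024])
```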
+    def prepare_image(self, image) -> dict:
         if isinstance(image, str) and not os.path.exists(image):
             raise FileNotFoundError(f"Couldn't find image at path {image}")
 
         if not isinstance(image, list):
             image = [image]
+
+        outputs = []
         for img in image:
+            output = self.image_processor(img)
+            outputs.append(output)
 
+        cond_input = {k: [] for k in outputs[0].keys()}
+        for output in outputs:
+            for key, value in output.items():
+                cond_input[key].append(value)
+        for key, value in cond_input.items():
+            if isinstance(value[0], torch.Tensor):
+                cond_input[key] = torch.cat(value, dim=0)
+
+        return cond_input
 
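`prepare_image` now delegates per-image work to the image processor and collates the resulting dicts into one batched dict, so auxiliary conditioning tensors (masks, for instance) travel alongside the image. A sketch of the collation step with hypothetical keys and shapes:

```python
import torch

# Two hypothetical per-image processor outputs.
outputs = [
    {"image": torch.randn(1, 3, 512, 512), "mask": torch.randn(1, 1, 512, 512)},
    {"image": torch.randn(1, 3, 512, 512), "mask": torch.randn(1, 1, 512, 512)},
]

# Same collation as prepare_image: gather per-key lists, then concatenate tensors.
cond_input = {k: [] for k in outputs[0].keys()}
for output in outputs:
    for key, value in output.items():
        cond_input[key].append(value)
for key, value in cond_input.items():
    if isinstance(value[0], torch.Tensor):
        cond_input[key] = torch.cat(value, dim=0)

print(cond_input["image"].shape)  # torch.Size([2, 3, 512, 512])
```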
     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
         """
…
                                       getattr(self.model, 'guidance_cond_proj_dim', None) is None
         dual_guidance = dual_guidance_scale >= 0 and dual_guidance
 
+        cond_inputs = self.prepare_image(image)
+        image = cond_inputs.pop('image')
+        cond = self.encode_cond(
+            image=image,
+            additional_cond_inputs=cond_inputs,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            dual_guidance=False,
+        )
         batch_size = image.shape[0]
 
         t_dtype = torch.long
…
             box_v, mc_level, num_chunks, octree_resolution, mc_algo,
         )
 
+    def _export(
+        self,
+        latents,
+        output_type='trimesh',
+        box_v=1.01,
+        mc_level=0.0,
+        num_chunks=20000,
+        octree_resolution=256,
+        mc_algo='mc',
+        enable_pbar=True
+    ):
         if not output_type == "latent":
             latents = 1. / self.vae.scale_factor * latents
             latents = self.vae(latents)
…
     @torch.inference_mode()
     def __call__(
         self,
+        image: Union[str, List[str], Image.Image, dict, List[dict]] = None,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         sigmas: List[float] = None,
…
             self.model.guidance_embed is True
         )
 
+        cond_inputs = self.prepare_image(image)
+        image = cond_inputs.pop('image')
         cond = self.encode_cond(
             image=image,
+            additional_cond_inputs=cond_inputs,
             do_classifier_free_guidance=do_classifier_free_guidance,
             dual_guidance=False,
         )
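With the new `image` signature, the pipeline's `__call__` accepts a file path, a PIL image, or a dict of views, and routes everything through `prepare_image` and `encode_cond`. A usage sketch follows; the checkpoint id and file paths are placeholders from the public release rather than from this commit, and the multi-view call assumes a checkpoint whose image processor is the 'mv_v2' processor registered below.

```python
from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline

# Single-image conditioning: a path or a PIL.Image both work.
pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2')
mesh = pipeline(image='assets/demo.png', num_inference_steps=50)[0]

# Multi-view conditioning: a dict of view tag -> image, collated into batched
# tensors plus view indices by MVImageProcessorV2 (requires a multi-view checkpoint).
mesh_mv = pipeline(
    image={'front': 'assets/front.png', 'left': 'assets/left.png', 'back': 'assets/back.png'},
    num_inference_steps=50,
)[0]
```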
    	
hy3dgen/shapegen/postprocessors.py
CHANGED

@@ -12,13 +12,16 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
+import os
 import tempfile
 from typing import Union
 
+import numpy as np
 import pymeshlab
+import torch
 import trimesh
 
-from .models.…
+from .models.autoencoders import Latent2MeshOutput
 from .utils import synchronize_timer
 
 
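The added imports back the numpy/torch mesh handling and the trimesh/pymeshlab conversions the post-processors rely on. As a self-contained illustration (the box mesh is a placeholder, not from this module), a round trip between the two libraries looks like this:

```python
import numpy as np
import pymeshlab
import trimesh

# Build a simple mesh with trimesh and hand it to pymeshlab...
tm = trimesh.creation.box(extents=(1.0, 1.0, 1.0))
ms = pymeshlab.MeshSet()
ms.add_mesh(pymeshlab.Mesh(vertex_matrix=np.asarray(tm.vertices),
                           face_matrix=np.asarray(tm.faces)))

# ...then read it back as a trimesh.Trimesh.
m = ms.current_mesh()
tm_back = trimesh.Trimesh(vertices=m.vertex_matrix(), faces=m.face_matrix())
print(tm_back.vertices.shape, tm_back.faces.shape)
```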
    	
hy3dgen/shapegen/preprocessors.py
CHANGED

@@ -87,9 +87,7 @@ class ImageProcessorV2:
         mask = mask.clip(0, 255).astype(np.uint8)
         return result, mask
 
-    def …
-        if self.border_ratio is not None:
-            border_ratio = self.border_ratio
+    def load_image(self, image, border_ratio=0.15, to_tensor=True):
         if isinstance(image, str):
             image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
             image, mask = self.recenter(image, border_ratio=border_ratio)
@@ -106,13 +104,64 @@ class ImageProcessorV2:
         if to_tensor:
             image = array_to_tensor(image)
             mask = array_to_tensor(mask)
-        …
-
-
+        return image, mask
+
+    def __call__(self, image, border_ratio=0.15, to_tensor=True, **kwargs):
+        if self.border_ratio is not None:
+            border_ratio = self.border_ratio
+        image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
+        outputs = {
+            'image': image,
+            'mask': mask
+        }
+        return outputs
+
+
+class MVImageProcessorV2(ImageProcessorV2):
+    """
+    view order: front, front clockwise 90, back, front clockwise 270
+    """
+    return_view_idx = True
+
+    def __init__(self, size=512, border_ratio=None):
+        super().__init__(size, border_ratio)
+        self.view2idx = {
+            'front': 0,
+            'left': 1,
+            'back': 2,
+            'right': 3
+        }
+
+    def __call__(self, image_dict, border_ratio=0.15, to_tensor=True, **kwargs):
+        if self.border_ratio is not None:
+            border_ratio = self.border_ratio
+
+        images = []
+        masks = []
+        view_idxs = []
+        for idx, (view_tag, image) in enumerate(image_dict.items()):
+            view_idxs.append(self.view2idx[view_tag])
+            image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
+            images.append(image)
+            masks.append(mask)
+
+        zipped_lists = zip(view_idxs, images, masks)
+        sorted_zipped_lists = sorted(zipped_lists)
+        view_idxs, images, masks = zip(*sorted_zipped_lists)
+
+        image = torch.cat(images, 0).unsqueeze(0)
+        mask = torch.cat(masks, 0).unsqueeze(0)
+        outputs = {
+            'image': image,
+            'mask': mask,
+            'view_idxs': view_idxs
+        }
+        return outputs
 
 
 IMAGE_PROCESSORS = {
     "v2": ImageProcessorV2,
+    'mv_v2': MVImageProcessorV2,
 }
 
 DEFAULT_IMAGEPROCESSOR = 'v2'
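A short usage sketch of the new multi-view processor via the registry (file paths are placeholders; each view goes through `load_image` just like a single image):

```python
from hy3dgen.shapegen.preprocessors import IMAGE_PROCESSORS

processor = IMAGE_PROCESSORS['mv_v2']()  # MVImageProcessorV2 with default size=512

cond = processor({
    'front': 'assets/front.png',
    'left': 'assets/left.png',
    'back': 'assets/back.png',
})

print(cond['view_idxs'])     # (0, 1, 2): views re-sorted by their view index
print(cond['image'].shape)   # batched image tensor with a leading batch dim of 1
```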

