Commit 79cc00b · Parent(s): b530233

update

Files changed:

- hy3dgen/shapegen/__init__.py +1 -1
- hy3dgen/shapegen/models/__init__.py +1 -1
- hy3dgen/shapegen/models/conditioner.py +104 -12
- hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py +12 -3
- hy3dgen/shapegen/pipelines.py +181 -65
- hy3dgen/shapegen/postprocessors.py +4 -1
- hy3dgen/shapegen/preprocessors.py +55 -6
    	
hy3dgen/shapegen/__init__.py
CHANGED

@@ -13,5 +13,5 @@
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
 from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
-from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
+from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover, MeshSimplifier
 from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
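The only change here re-exports the new MeshSimplifier postprocessor from the package root. A minimal usage sketch, assuming the package is importable and that MeshSimplifier follows the same callable-over-trimesh.Trimesh convention as the existing postprocessors; the input path and chain order are illustrative:

import trimesh
from hy3dgen.shapegen import FloaterRemover, DegenerateFaceRemover, FaceReducer, MeshSimplifier

mesh = trimesh.load('demo.glb')        # illustrative input mesh
mesh = FloaterRemover()(mesh)          # drop disconnected fragments
mesh = DegenerateFaceRemover()(mesh)   # drop degenerate faces
mesh = MeshSimplifier()(mesh)          # newly re-exported simplification pass (assumed callable)
mesh = FaceReducer()(mesh)             # cap the final face count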
    	
hy3dgen/shapegen/models/__init__.py
CHANGED

@@ -25,4 +25,4 @@
 
 from .autoencoders import ShapeVAE
 from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
-from .denoisers import
+from .denoisers import Hunyuan3DDiT
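This completes the truncated denoiser re-export, so the DiT can be imported from the models package directly. A trivial import check, assuming hy3dgen is installed:

from hy3dgen.shapegen.models import ShapeVAE, Hunyuan3DDiT, DinoImageEncoder  # re-exported names
print(Hunyuan3DDiT.__name__)  # the DiT denoiser class used by the shape pipelines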
    	
hy3dgen/shapegen/models/conditioner.py
CHANGED

@@ -22,6 +22,7 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
+import numpy as np
 import torch
 import torch.nn as nn
 from torchvision import transforms
@@ -33,6 +34,26 @@ from transformers import (
 )
 
 
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000 ** omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    return np.concatenate([emb_sin, emb_cos], axis=1)
+
+
 class ImageEncoder(nn.Module):
     def __init__(
         self,
@@ -67,7 +88,7 @@ class ImageEncoder(nn.Module):
             ]
         )
 
-    def forward(self, image, mask=None, value_range=(-1, 1)):
+    def forward(self, image, mask=None, value_range=(-1, 1), **kwargs):
         if value_range is not None:
             low, high = value_range
             image = (image - low) / (high - low)
@@ -82,7 +103,7 @@ class ImageEncoder(nn.Module):
 
         return last_hidden_state
 
-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
         device = next(self.model.parameters()).device
         dtype = next(self.model.parameters()).dtype
         zero = torch.zeros(
@@ -110,11 +131,82 @@ class DinoImageEncoder(ImageEncoder):
     std = [0.229, 0.224, 0.225]
 
 
+class DinoImageEncoderMV(DinoImageEncoder):
+    def __init__(
+        self,
+        version=None,
+        config=None,
+        use_cls_token=True,
+        image_size=224,
+        view_num=4,
+        **kwargs,
+    ):
+        super().__init__(version, config, use_cls_token, image_size, **kwargs)
+        self.view_num = view_num
+        self.num_patches = self.num_patches
+        pos = np.arange(self.view_num, dtype=np.float32)
+        view_embedding = torch.from_numpy(
+            get_1d_sincos_pos_embed_from_grid(self.model.config.hidden_size, pos)).float()
+
+        view_embedding = view_embedding.unsqueeze(1).repeat(1, self.num_patches, 1)
+        self.view_embed = view_embedding.unsqueeze(0)
+
+    def forward(self, image, mask=None, value_range=(-1, 1), view_idxs=None):
+        if value_range is not None:
+            low, high = value_range
+            image = (image - low) / (high - low)
+
+        image = image.to(self.model.device, dtype=self.model.dtype)
+
+        bs, num_views, c, h, w = image.shape
+        image = image.view(bs * num_views, c, h, w)
+
+        inputs = self.transform(image)
+        outputs = self.model(inputs)
+
+        last_hidden_state = outputs.last_hidden_state
+        last_hidden_state = last_hidden_state.view(
+            bs, num_views, last_hidden_state.shape[-2],
+            last_hidden_state.shape[-1]
+        )
+
+        view_embedding = self.view_embed.to(last_hidden_state.dtype).to(last_hidden_state.device)
+        if view_idxs is not None:
+            assert len(view_idxs) == bs
+            view_embeddings = []
+            for i in range(bs):
+                view_idx = view_idxs[i]
+                assert num_views == len(view_idx)
+                view_embeddings.append(self.view_embed[:, view_idx, ...])
+            view_embedding = torch.cat(view_embeddings, 0).to(last_hidden_state.dtype).to(last_hidden_state.device)
+
+        if num_views != self.view_num:
+            view_embedding = view_embedding[:, :num_views, ...]
+        last_hidden_state = last_hidden_state + view_embedding
+        last_hidden_state = last_hidden_state.view(bs, num_views * last_hidden_state.shape[-2],
+                                                   last_hidden_state.shape[-1])
+        return last_hidden_state
+
+    def unconditional_embedding(self, batch_size, view_idxs=None, **kwargs):
+        device = next(self.model.parameters()).device
+        dtype = next(self.model.parameters()).dtype
+        zero = torch.zeros(
+            batch_size,
+            self.num_patches * len(view_idxs[0]),
+            self.model.config.hidden_size,
+            device=device,
+            dtype=dtype,
+        )
+        return zero
+
+
 def build_image_encoder(config):
     if config['type'] == 'CLIPImageEncoder':
         return CLIPImageEncoder(**config['kwargs'])
     elif config['type'] == 'DinoImageEncoder':
         return DinoImageEncoder(**config['kwargs'])
+    elif config['type'] == 'DinoImageEncoderMV':
+        return DinoImageEncoderMV(**config['kwargs'])
     else:
         raise ValueError(f'Unknown image encoder type: {config["type"]}')
 
@@ -129,17 +221,17 @@ class DualImageEncoder(nn.Module):
         self.main_image_encoder = build_image_encoder(main_image_encoder)
         self.additional_image_encoder = build_image_encoder(additional_image_encoder)
 
-    def forward(self, image, mask=None):
+    def forward(self, image, mask=None, **kwargs):
         outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
-            'additional': self.additional_image_encoder(image, mask=mask),
+            'main': self.main_image_encoder(image, mask=mask, **kwargs),
+            'additional': self.additional_image_encoder(image, mask=mask, **kwargs),
         }
         return outputs
 
-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
         outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
-            'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
+            'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs),
+            'additional': self.additional_image_encoder.unconditional_embedding(batch_size, **kwargs),
         }
         return outputs
 
@@ -152,14 +244,14 @@ class SingleImageEncoder(nn.Module):
         super().__init__()
         self.main_image_encoder = build_image_encoder(main_image_encoder)
 
-    def forward(self, image, mask=None):
+    def forward(self, image, mask=None, **kwargs):
         outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
+            'main': self.main_image_encoder(image, mask=mask, **kwargs),
         }
         return outputs
 
-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
         outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
+            'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs),
         }
         return outputs
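A self-contained sketch of what the new helper computes and how DinoImageEncoderMV uses it: one sinusoidal code per view, repeated across all patch tokens and added to the DINO features before the view and patch axes are flattened. The sizes below are illustrative, not read from any model config:

import numpy as np

def sincos_1d(embed_dim, pos):
    # same construction as get_1d_sincos_pos_embed_from_grid in the diff above
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega                                  # (D/2,)
    out = np.einsum('m,d->md', pos.reshape(-1), omega)           # (M, D/2)
    return np.concatenate([np.sin(out), np.cos(out)], axis=1)    # (M, D)

hidden_size, num_patches, view_num = 1024, 257, 4                # illustrative sizes
pos = np.arange(view_num, dtype=np.float32)
view_embed = sincos_1d(hidden_size, pos)                         # (4, 1024): one code per view
view_embed = np.repeat(view_embed[:, None, :], num_patches, axis=1)  # (4, 257, 1024)
# forward() adds this (broadcast over the batch) to tokens of shape
# (bs, num_views, num_patches, hidden), then reshapes to (bs, num_views * num_patches, hidden).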
    	
hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py
CHANGED

@@ -60,6 +60,15 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
     return embedding
 
 
+class GELU(nn.Module):
+    def __init__(self, approximate='tanh'):
+        super().__init__()
+        self.approximate = approximate
+
+    def forward(self, x: Tensor) -> Tensor:
+        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)
+
+
 class MLPEmbedder(nn.Module):
     def __init__(self, in_dim: int, hidden_dim: int):
         super().__init__()
@@ -162,7 +171,7 @@ class DoubleStreamBlock(nn.Module):
         self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.img_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-
+            GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -173,7 +182,7 @@ class DoubleStreamBlock(nn.Module):
         self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.txt_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-
+            GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -239,7 +248,7 @@ class SingleStreamBlock(nn.Module):
         self.hidden_size = hidden_size
         self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
 
-        self.mlp_act =
+        self.mlp_act = GELU(approximate="tanh")
         self.modulation = Modulation(hidden_size, double=False)
 
     def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
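The three MLP activations now go through a small in-repo GELU module; apart from forcing a contiguous input it matches PyTorch's built-in tanh-approximated GELU. A quick equivalence sketch (requires PyTorch >= 1.12 for the approximate argument):

import torch
import torch.nn as nn

class GELU(nn.Module):  # same definition as the class added above
    def __init__(self, approximate='tanh'):
        super().__init__()
        self.approximate = approximate

    def forward(self, x):
        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)

x = torch.randn(2, 8)
assert torch.allclose(GELU()(x), nn.GELU(approximate='tanh')(x))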
    	
        hy3dgen/shapegen/pipelines.py
    CHANGED
    
    | @@ -24,11 +24,12 @@ import trimesh | |
| 24 | 
             
            import yaml
         | 
| 25 | 
             
            from PIL import Image
         | 
| 26 | 
             
            from diffusers.utils.torch_utils import randn_tensor
         | 
|  | |
| 27 | 
             
            from tqdm import tqdm
         | 
| 28 |  | 
| 29 | 
             
            from .models.autoencoders import ShapeVAE
         | 
| 30 | 
             
            from .models.autoencoders import SurfaceExtractors
         | 
| 31 | 
            -
            from .utils import logger, synchronize_timer
         | 
| 32 |  | 
| 33 |  | 
| 34 | 
             
            def retrieve_timesteps(
         | 
| @@ -127,6 +128,9 @@ def instantiate_from_config(config, **kwargs): | |
| 127 |  | 
| 128 |  | 
| 129 | 
             
            class Hunyuan3DDiTPipeline:
         | 
|  | |
|  | |
|  | |
| 130 | 
             
                @classmethod
         | 
| 131 | 
             
                @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
         | 
| 132 | 
             
                def from_single_file(
         | 
| @@ -207,34 +211,12 @@ class Hunyuan3DDiTPipeline: | |
| 207 | 
             
                        dtype=dtype,
         | 
| 208 | 
             
                        device=device,
         | 
| 209 | 
             
                    )
         | 
| 210 | 
            -
                     | 
| 211 | 
            -
             | 
| 212 | 
            -
             | 
| 213 | 
            -
             | 
| 214 | 
            -
             | 
| 215 | 
            -
                     | 
| 216 | 
            -
                        logger.info('Model path not exists, try to download from huggingface')
         | 
| 217 | 
            -
                        try:
         | 
| 218 | 
            -
                            import huggingface_hub
         | 
| 219 | 
            -
                            # download from huggingface
         | 
| 220 | 
            -
                            path = huggingface_hub.snapshot_download(repo_id=original_model_path)
         | 
| 221 | 
            -
                            model_path = os.path.join(path, subfolder)
         | 
| 222 | 
            -
                        except ImportError:
         | 
| 223 | 
            -
                            logger.warning(
         | 
| 224 | 
            -
                                "You need to install HuggingFace Hub to load models from the hub."
         | 
| 225 | 
            -
                            )
         | 
| 226 | 
            -
                            raise RuntimeError(f"Model path {model_path} not found")
         | 
| 227 | 
            -
                        except Exception as e:
         | 
| 228 | 
            -
                            raise e
         | 
| 229 | 
            -
             | 
| 230 | 
            -
                    if not os.path.exists(model_path):
         | 
| 231 | 
            -
                        raise FileNotFoundError(f"Model path {original_model_path} not found")
         | 
| 232 | 
            -
             | 
| 233 | 
            -
                    extension = 'ckpt' if not use_safetensors else 'safetensors'
         | 
| 234 | 
            -
                    variant = '' if variant is None else f'.{variant}'
         | 
| 235 | 
            -
                    ckpt_name = f'model{variant}.{extension}'
         | 
| 236 | 
            -
                    config_path = os.path.join(model_path, 'config.yaml')
         | 
| 237 | 
            -
                    ckpt_path = os.path.join(model_path, ckpt_name)
         | 
| 238 | 
             
                    return cls.from_single_file(
         | 
| 239 | 
             
                        ckpt_path,
         | 
| 240 | 
             
                        config_path,
         | 
| @@ -279,12 +261,18 @@ class Hunyuan3DDiTPipeline: | |
| 279 | 
             
                    if enabled:
         | 
| 280 | 
             
                        model_path = self.kwargs['from_pretrained_kwargs']['model_path']
         | 
| 281 | 
             
                        turbo_vae_mapping = {
         | 
| 282 | 
            -
                            'Hunyuan3D-2': 'hunyuan3d-vae-v2-0-turbo',
         | 
| 283 | 
            -
                            'Hunyuan3D- | 
|  | |
| 284 | 
             
                        }
         | 
| 285 | 
             
                        model_name = model_path.split('/')[-1]
         | 
| 286 | 
             
                        if replace_vae and model_name in turbo_vae_mapping:
         | 
| 287 | 
            -
                             | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 288 | 
             
                        self.vae.enable_flashvdm_decoder(
         | 
| 289 | 
             
                            enabled=enabled,
         | 
| 290 | 
             
                            adaptive_kv_selection=adaptive_kv_selection,
         | 
| @@ -294,33 +282,146 @@ class Hunyuan3DDiTPipeline: | |
| 294 | 
             
                    else:
         | 
| 295 | 
             
                        model_path = self.kwargs['from_pretrained_kwargs']['model_path']
         | 
| 296 | 
             
                        vae_mapping = {
         | 
| 297 | 
            -
                            'Hunyuan3D-2': 'hunyuan3d-vae-v2-0',
         | 
| 298 | 
            -
                            'Hunyuan3D- | 
|  | |
| 299 | 
             
                        }
         | 
| 300 | 
             
                        model_name = model_path.split('/')[-1]
         | 
| 301 | 
             
                        if model_name in vae_mapping:
         | 
| 302 | 
            -
                             | 
|  | |
| 303 | 
             
                        self.vae.enable_flashvdm_decoder(enabled=False)
         | 
| 304 |  | 
| 305 | 
             
                def to(self, device=None, dtype=None):
         | 
| 306 | 
            -
                    if device is not None:
         | 
| 307 | 
            -
                        self.device = torch.device(device)
         | 
| 308 | 
            -
                        self.vae.to(device)
         | 
| 309 | 
            -
                        self.model.to(device)
         | 
| 310 | 
            -
                        self.conditioner.to(device)
         | 
| 311 | 
             
                    if dtype is not None:
         | 
| 312 | 
             
                        self.dtype = dtype
         | 
| 313 | 
             
                        self.vae.to(dtype=dtype)
         | 
| 314 | 
             
                        self.model.to(dtype=dtype)
         | 
| 315 | 
             
                        self.conditioner.to(dtype=dtype)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 316 |  | 
| 317 | 
             
                @synchronize_timer('Encode cond')
         | 
| 318 | 
            -
                def encode_cond(self, image,  | 
| 319 | 
             
                    bsz = image.shape[0]
         | 
| 320 | 
            -
                    cond = self.conditioner(image=image,  | 
| 321 |  | 
| 322 | 
             
                    if do_classifier_free_guidance:
         | 
| 323 | 
            -
                        un_cond = self.conditioner.unconditional_embedding(bsz)
         | 
| 324 |  | 
| 325 | 
             
                        if dual_guidance:
         | 
| 326 | 
             
                            un_cond_drop_main = copy.deepcopy(un_cond)
         | 
| @@ -336,8 +437,6 @@ class Hunyuan3DDiTPipeline: | |
| 336 |  | 
| 337 | 
             
                            cond = cat_recursive(cond, un_cond_drop_main, un_cond)
         | 
| 338 | 
             
                        else:
         | 
| 339 | 
            -
                            un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
         | 
| 340 | 
            -
             | 
| 341 | 
             
                            def cat_recursive(a, b):
         | 
| 342 | 
             
                                if isinstance(a, torch.Tensor):
         | 
| 343 | 
             
                                    return torch.cat([a, b], dim=0).to(self.dtype)
         | 
| @@ -383,25 +482,27 @@ class Hunyuan3DDiTPipeline: | |
| 383 | 
             
                    latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
         | 
| 384 | 
             
                    return latents
         | 
| 385 |  | 
| 386 | 
            -
                def prepare_image(self, image):
         | 
| 387 | 
             
                    if isinstance(image, str) and not os.path.exists(image):
         | 
| 388 | 
             
                        raise FileNotFoundError(f"Couldn't find image at path {image}")
         | 
| 389 |  | 
| 390 | 
             
                    if not isinstance(image, list):
         | 
| 391 | 
             
                        image = [image]
         | 
| 392 | 
            -
             | 
| 393 | 
            -
                     | 
| 394 | 
             
                    for img in image:
         | 
| 395 | 
            -
                         | 
| 396 | 
            -
                         | 
| 397 | 
            -
                        mask_pts.append(mask_pt)
         | 
| 398 |  | 
| 399 | 
            -
                     | 
| 400 | 
            -
                     | 
| 401 | 
            -
                         | 
| 402 | 
            -
             | 
| 403 | 
            -
             | 
| 404 | 
            -
             | 
|  | |
|  | |
|  | |
| 405 |  | 
| 406 | 
             
                def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
         | 
| 407 | 
             
                    """
         | 
| @@ -474,10 +575,14 @@ class Hunyuan3DDiTPipeline: | |
| 474 | 
             
                                                  getattr(self.model, 'guidance_cond_proj_dim', None) is None
         | 
| 475 | 
             
                    dual_guidance = dual_guidance_scale >= 0 and dual_guidance
         | 
| 476 |  | 
| 477 | 
            -
                     | 
| 478 | 
            -
                     | 
| 479 | 
            -
             | 
| 480 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
| 481 | 
             
                    batch_size = image.shape[0]
         | 
| 482 |  | 
| 483 | 
             
                    t_dtype = torch.long
         | 
| @@ -535,7 +640,17 @@ class Hunyuan3DDiTPipeline: | |
| 535 | 
             
                        box_v, mc_level, num_chunks, octree_resolution, mc_algo,
         | 
| 536 | 
             
                    )
         | 
| 537 |  | 
| 538 | 
            -
                def _export( | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 539 | 
             
                    if not output_type == "latent":
         | 
| 540 | 
             
                        latents = 1. / self.vae.scale_factor * latents
         | 
| 541 | 
             
                        latents = self.vae(latents)
         | 
| @@ -562,7 +677,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): | |
| 562 | 
             
                @torch.inference_mode()
         | 
| 563 | 
             
                def __call__(
         | 
| 564 | 
             
                    self,
         | 
| 565 | 
            -
                    image: Union[str, List[str], Image.Image] = None,
         | 
| 566 | 
             
                    num_inference_steps: int = 50,
         | 
| 567 | 
             
                    timesteps: List[int] = None,
         | 
| 568 | 
             
                    sigmas: List[float] = None,
         | 
| @@ -590,10 +705,11 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): | |
| 590 | 
             
                        self.model.guidance_embed is True
         | 
| 591 | 
             
                    )
         | 
| 592 |  | 
| 593 | 
            -
                     | 
|  | |
| 594 | 
             
                    cond = self.encode_cond(
         | 
| 595 | 
             
                        image=image,
         | 
| 596 | 
            -
                         | 
| 597 | 
             
                        do_classifier_free_guidance=do_classifier_free_guidance,
         | 
| 598 | 
             
                        dual_guidance=False,
         | 
| 599 | 
             
                    )
         | 
|  | |
| 24 | 
             
            import yaml
         | 
| 25 | 
             
            from PIL import Image
         | 
| 26 | 
             
            from diffusers.utils.torch_utils import randn_tensor
         | 
| 27 | 
            +
            from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available
         | 
| 28 | 
             
            from tqdm import tqdm
         | 
| 29 |  | 
| 30 | 
             
            from .models.autoencoders import ShapeVAE
         | 
| 31 | 
             
            from .models.autoencoders import SurfaceExtractors
         | 
| 32 | 
            +
            from .utils import logger, synchronize_timer, smart_load_model
         | 
| 33 |  | 
| 34 |  | 
| 35 | 
             
            def retrieve_timesteps(
         | 
|  | |
| 128 |  | 
| 129 |  | 
| 130 | 
             
            class Hunyuan3DDiTPipeline:
         | 
| 131 | 
            +
                model_cpu_offload_seq = "conditioner->model->vae"
         | 
| 132 | 
            +
                _exclude_from_cpu_offload = []
         | 
| 133 | 
            +
             | 
| 134 | 
             
                @classmethod
         | 
| 135 | 
             
                @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
         | 
| 136 | 
             
                def from_single_file(
         | 
|  | |
| 211 | 
             
                        dtype=dtype,
         | 
| 212 | 
             
                        device=device,
         | 
| 213 | 
             
                    )
         | 
| 214 | 
            +
                    config_path, ckpt_path = smart_load_model(
         | 
| 215 | 
            +
                        model_path,
         | 
| 216 | 
            +
                        subfolder=subfolder,
         | 
| 217 | 
            +
                        use_safetensors=use_safetensors,
         | 
| 218 | 
            +
                        variant=variant
         | 
| 219 | 
            +
                    )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 220 | 
             
                    return cls.from_single_file(
         | 
| 221 | 
             
                        ckpt_path,
         | 
| 222 | 
             
                        config_path,
         | 
|  | |
| 261 | 
             
                    if enabled:
         | 
| 262 | 
             
                        model_path = self.kwargs['from_pretrained_kwargs']['model_path']
         | 
| 263 | 
             
                        turbo_vae_mapping = {
         | 
| 264 | 
            +
                            'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
         | 
| 265 | 
            +
                            'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
         | 
| 266 | 
            +
                            'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'),
         | 
| 267 | 
             
                        }
         | 
| 268 | 
             
                        model_name = model_path.split('/')[-1]
         | 
| 269 | 
             
                        if replace_vae and model_name in turbo_vae_mapping:
         | 
| 270 | 
            +
                            model_path, subfolder = turbo_vae_mapping[model_name]
         | 
| 271 | 
            +
                            self.vae = ShapeVAE.from_pretrained(
         | 
| 272 | 
            +
                                model_path, subfolder=subfolder,
         | 
| 273 | 
            +
                                use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'],
         | 
| 274 | 
            +
                                device=self.device,
         | 
| 275 | 
            +
                            )
         | 
| 276 | 
             
                        self.vae.enable_flashvdm_decoder(
         | 
| 277 | 
             
                            enabled=enabled,
         | 
| 278 | 
             
                            adaptive_kv_selection=adaptive_kv_selection,
         | 
|  | |
| 282 | 
             
                    else:
         | 
| 283 | 
             
                        model_path = self.kwargs['from_pretrained_kwargs']['model_path']
         | 
| 284 | 
             
                        vae_mapping = {
         | 
| 285 | 
            +
                            'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
         | 
| 286 | 
            +
                            'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
         | 
| 287 | 
            +
                            'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'),
         | 
| 288 | 
             
                        }
         | 
| 289 | 
             
                        model_name = model_path.split('/')[-1]
         | 
| 290 | 
             
                        if model_name in vae_mapping:
         | 
| 291 | 
            +
                            model_path, subfolder = vae_mapping[model_name]
         | 
| 292 | 
            +
                            self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder)
         | 
| 293 | 
             
                        self.vae.enable_flashvdm_decoder(enabled=False)
         | 
| 294 |  | 
| 295 | 
             
                def to(self, device=None, dtype=None):
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 296 | 
             
                    if dtype is not None:
         | 
| 297 | 
             
                        self.dtype = dtype
         | 
| 298 | 
             
                        self.vae.to(dtype=dtype)
         | 
| 299 | 
             
                        self.model.to(dtype=dtype)
         | 
| 300 | 
             
                        self.conditioner.to(dtype=dtype)
         | 
| 301 | 
            +
                    if device is not None:
         | 
| 302 | 
            +
                        self.device = torch.device(device)
         | 
| 303 | 
            +
                        self.vae.to(device)
         | 
| 304 | 
            +
                        self.model.to(device)
         | 
| 305 | 
            +
                        self.conditioner.to(device)
         | 
| 306 | 
            +
             | 
| 307 | 
            +
                @property
         | 
| 308 | 
            +
                def _execution_device(self):
         | 
| 309 | 
            +
                    r"""
         | 
| 310 | 
            +
                    Returns the device on which the pipeline's models will be executed. After calling
         | 
| 311 | 
            +
                    [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
         | 
| 312 | 
            +
                    Accelerate's module hooks.
         | 
| 313 | 
            +
                    """
         | 
| 314 | 
            +
                    for name, model in self.components.items():
         | 
| 315 | 
            +
                        if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
         | 
| 316 | 
            +
                            continue
         | 
| 317 | 
            +
             | 
| 318 | 
            +
                        if not hasattr(model, "_hf_hook"):
         | 
| 319 | 
            +
                            return self.device
         | 
| 320 | 
            +
                        for module in model.modules():
         | 
| 321 | 
            +
                            if (
         | 
| 322 | 
            +
                                hasattr(module, "_hf_hook")
         | 
| 323 | 
            +
                                and hasattr(module._hf_hook, "execution_device")
         | 
| 324 | 
            +
                                and module._hf_hook.execution_device is not None
         | 
| 325 | 
            +
                            ):
         | 
| 326 | 
            +
                                return torch.device(module._hf_hook.execution_device)
         | 
| 327 | 
            +
                    return self.device
         | 
| 328 | 
            +
             | 
| 329 | 
            +
                def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         | 
| 330 | 
            +
                    r"""
         | 
| 331 | 
            +
                    Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         | 
| 332 | 
            +
                    to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
         | 
| 333 | 
            +
                    method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         | 
| 334 | 
            +
                    `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         | 
| 335 | 
            +
             | 
| 336 | 
            +
                    Arguments:
         | 
| 337 | 
            +
                        gpu_id (`int`, *optional*):
         | 
| 338 | 
            +
                            The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
         | 
| 339 | 
            +
                        device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
         | 
| 340 | 
            +
                            The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
         | 
| 341 | 
            +
                            default to "cuda".
         | 
| 342 | 
            +
                    """
         | 
| 343 | 
            +
                    if self.model_cpu_offload_seq is None:
         | 
| 344 | 
            +
                        raise ValueError(
         | 
| 345 | 
            +
                            "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
         | 
| 346 | 
            +
                        )
         | 
| 347 | 
            +
             | 
| 348 | 
            +
                    if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
         | 
| 349 | 
            +
                        from accelerate import cpu_offload_with_hook
         | 
| 350 | 
            +
                    else:
         | 
| 351 | 
            +
                        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
         | 
| 352 | 
            +
             | 
| 353 | 
            +
                    torch_device = torch.device(device)
         | 
| 354 | 
            +
                    device_index = torch_device.index
         | 
| 355 | 
            +
             | 
| 356 | 
            +
                    if gpu_id is not None and device_index is not None:
         | 
| 357 | 
            +
                        raise ValueError(
         | 
| 358 | 
            +
                            f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
         | 
| 359 | 
            +
                            f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
         | 
| 360 | 
            +
                        )
         | 
| 361 | 
            +
             | 
| 362 | 
            +
                    # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
         | 
| 363 | 
            +
                    self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
         | 
| 364 | 
            +
             | 
| 365 | 
            +
                    device_type = torch_device.type
         | 
| 366 | 
            +
                    device = torch.device(f"{device_type}:{self._offload_gpu_id}")
         | 
| 367 | 
            +
             | 
| 368 | 
            +
                    if self.device.type != "cpu":
         | 
| 369 | 
            +
                        self.to("cpu")
         | 
| 370 | 
            +
                        device_mod = getattr(torch, self.device.type, None)
         | 
| 371 | 
            +
                        if hasattr(device_mod, "empty_cache") and device_mod.is_available():
         | 
| 372 | 
            +
                            device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
         | 
| 373 | 
            +
             | 
| 374 | 
            +
                    all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
         | 
| 375 | 
            +
             | 
| 376 | 
            +
                    self._all_hooks = []
         | 
| 377 | 
            +
                    hook = None
         | 
| 378 | 
+        for model_str in self.model_cpu_offload_seq.split("->"):
+            model = all_model_components.pop(model_str, None)
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
+            self._all_hooks.append(hook)
+
+        # CPU offload models that are not in the seq chain unless they are explicitly excluded.
+        # These models will stay on CPU until maybe_free_model_hooks is called.
+        # Some models cannot be in the seq chain because they are called iteratively, such as a controlnet.
+        for name, model in all_model_components.items():
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            if name in self._exclude_from_cpu_offload:
+                model.to(device)
+            else:
+                _, hook = cpu_offload_with_hook(model, device)
+                self._all_hooks.append(hook)
+
+    def maybe_free_model_hooks(self):
+        r"""
+        Offload all components, remove all model hooks that were added by
+        `enable_model_cpu_offload`, and then apply them again. If the model has not been offloaded,
+        this function is a no-op. Make sure to add this function to the end of your pipeline's
+        `__call__` so that it works correctly when `enable_model_cpu_offload` is enabled.
+        """
+        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
+            # `enable_model_cpu_offload` has not been called, so silently do nothing
+            return
+
+        for hook in self._all_hooks:
+            # offload the model and remove the hook from it
+            hook.offload()
+            hook.remove()
+
+        # make sure the model is in the same state as before calling it
+        self.enable_model_cpu_offload()
 
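For orientation, here is a minimal, self-contained sketch of the hook-chaining pattern used above, built with accelerate's `cpu_offload_with_hook`; the two `nn.Linear` modules are hypothetical stand-ins for the pipeline's components, and the final loop mirrors what `maybe_free_model_hooks` does.

```python
# Sketch only: assumes `accelerate` is installed; the modules are placeholders.
import torch
import torch.nn as nn
from accelerate import cpu_offload_with_hook

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

conditioner = nn.Linear(8, 8)   # hypothetical stand-in for the image conditioner
denoiser = nn.Linear(8, 8)      # hypothetical stand-in for the denoiser

hooks = []
prev_hook = None
for module in (conditioner, denoiser):
    # Each module is moved to `device` right before its forward pass, and the
    # previous module in the chain is offloaded back to CPU at that point.
    _, prev_hook = cpu_offload_with_hook(module, device, prev_module_hook=prev_hook)
    hooks.append(prev_hook)

x = torch.randn(1, 8)
y = denoiser(conditioner(x))

# Equivalent of maybe_free_model_hooks(): offload everything and detach the hooks.
for hook in hooks:
    hook.offload()
    hook.remove()
```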
     @synchronize_timer('Encode cond')
+    def encode_cond(self, image, additional_cond_inputs, do_classifier_free_guidance, dual_guidance):
         bsz = image.shape[0]
+        cond = self.conditioner(image=image, **additional_cond_inputs)
 
         if do_classifier_free_guidance:
+            un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
 
             if dual_guidance:
                 un_cond_drop_main = copy.deepcopy(un_cond)
…
 
                 cond = cat_recursive(cond, un_cond_drop_main, un_cond)
             else:
                 def cat_recursive(a, b):
                     if isinstance(a, torch.Tensor):
                         return torch.cat([a, b], dim=0).to(self.dtype)
…
         latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
         return latents
 
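The classifier-free-guidance branch above doubles the batch by concatenating conditional and unconditional embeddings so both can be denoised in one forward pass. A small illustrative sketch (the dict key and tensor shapes are hypothetical):

```python
import torch

def cat_recursive(a, b):
    # Concatenate conditional and unconditional embeddings along the batch
    # dimension, recursing into dict-valued conditions.
    if isinstance(a, torch.Tensor):
        return torch.cat([a, b], dim=0)
    return {k: cat_recursive(a[k], b[k]) for k in a}

cond = {"main": torch.randn(2, 257, 1024)}     # hypothetical image tokens
un_cond = {"main": torch.zeros(2, 257, 1024)}  # hypothetical unconditional tokens

cfg_cond = cat_recursive(cond, un_cond)
print(cfg_cond["main"].shape)  # torch.Size([4, 257, 1024])
```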
+    def prepare_image(self, image) -> dict:
         if isinstance(image, str) and not os.path.exists(image):
             raise FileNotFoundError(f"Couldn't find image at path {image}")
 
         if not isinstance(image, list):
             image = [image]
+
+        outputs = []
         for img in image:
+            output = self.image_processor(img)
+            outputs.append(output)
 
+        cond_input = {k: [] for k in outputs[0].keys()}
+        for output in outputs:
+            for key, value in output.items():
+                cond_input[key].append(value)
+        for key, value in cond_input.items():
+            if isinstance(value[0], torch.Tensor):
+                cond_input[key] = torch.cat(value, dim=0)
+
+        return cond_input
 
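`prepare_image` now delegates per-image work to the image processor and collates the resulting dicts into one batched dict, so auxiliary conditioning tensors (masks, for instance) travel alongside the image. A sketch of the collation step with hypothetical keys and shapes:

```python
import torch

# Two hypothetical per-image processor outputs.
outputs = [
    {"image": torch.randn(1, 3, 512, 512), "mask": torch.randn(1, 1, 512, 512)},
    {"image": torch.randn(1, 3, 512, 512), "mask": torch.randn(1, 1, 512, 512)},
]

# Same collation as prepare_image: gather per-key lists, then concatenate tensors.
cond_input = {k: [] for k in outputs[0].keys()}
for output in outputs:
    for key, value in output.items():
        cond_input[key].append(value)
for key, value in cond_input.items():
    if isinstance(value[0], torch.Tensor):
        cond_input[key] = torch.cat(value, dim=0)

print(cond_input["image"].shape)  # torch.Size([2, 3, 512, 512])
```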
     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
         """
…
                                       getattr(self.model, 'guidance_cond_proj_dim', None) is None
         dual_guidance = dual_guidance_scale >= 0 and dual_guidance
 
+        cond_inputs = self.prepare_image(image)
+        image = cond_inputs.pop('image')
+        cond = self.encode_cond(
+            image=image,
+            additional_cond_inputs=cond_inputs,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            dual_guidance=False,
+        )
         batch_size = image.shape[0]
 
         t_dtype = torch.long
…
             box_v, mc_level, num_chunks, octree_resolution, mc_algo,
         )
 
+    def _export(
+        self,
+        latents,
+        output_type='trimesh',
+        box_v=1.01,
+        mc_level=0.0,
+        num_chunks=20000,
+        octree_resolution=256,
+        mc_algo='mc',
+        enable_pbar=True
+    ):
         if not output_type == "latent":
             latents = 1. / self.vae.scale_factor * latents
             latents = self.vae(latents)
…
     @torch.inference_mode()
     def __call__(
         self,
+        image: Union[str, List[str], Image.Image, dict, List[dict]] = None,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         sigmas: List[float] = None,
…
             self.model.guidance_embed is True
         )
 
+        cond_inputs = self.prepare_image(image)
+        image = cond_inputs.pop('image')
         cond = self.encode_cond(
             image=image,
+            additional_cond_inputs=cond_inputs,
             do_classifier_free_guidance=do_classifier_free_guidance,
             dual_guidance=False,
         )
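With the new `image` signature, the pipeline's `__call__` accepts a file path, a PIL image, or a dict of views, and routes everything through `prepare_image` and `encode_cond`. A usage sketch follows; the checkpoint id and file paths are placeholders from the public release rather than from this commit, and the multi-view call assumes a checkpoint whose image processor is the 'mv_v2' processor registered below.

```python
from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline

# Single-image conditioning: a path or a PIL.Image both work.
pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2')
mesh = pipeline(image='assets/demo.png', num_inference_steps=50)[0]

# Multi-view conditioning: a dict of view tag -> image, collated into batched
# tensors plus view indices by MVImageProcessorV2 (requires a multi-view checkpoint).
mesh_mv = pipeline(
    image={'front': 'assets/front.png', 'left': 'assets/left.png', 'back': 'assets/back.png'},
    num_inference_steps=50,
)[0]
```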
    	
hy3dgen/shapegen/postprocessors.py
CHANGED

@@ -12,13 +12,16 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
+import os
 import tempfile
 from typing import Union
 
+import numpy as np
 import pymeshlab
+import torch
 import trimesh
 
-from .models.…
+from .models.autoencoders import Latent2MeshOutput
 from .utils import synchronize_timer
 
 
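The added imports back the numpy/torch mesh handling and the trimesh/pymeshlab conversions the post-processors rely on. As a self-contained illustration (the box mesh is a placeholder, not from this module), a round trip between the two libraries looks like this:

```python
import numpy as np
import pymeshlab
import trimesh

# Build a simple mesh with trimesh and hand it to pymeshlab...
tm = trimesh.creation.box(extents=(1.0, 1.0, 1.0))
ms = pymeshlab.MeshSet()
ms.add_mesh(pymeshlab.Mesh(vertex_matrix=np.asarray(tm.vertices),
                           face_matrix=np.asarray(tm.faces)))

# ...then read it back as a trimesh.Trimesh.
m = ms.current_mesh()
tm_back = trimesh.Trimesh(vertices=m.vertex_matrix(), faces=m.face_matrix())
print(tm_back.vertices.shape, tm_back.faces.shape)
```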
    	
hy3dgen/shapegen/preprocessors.py
CHANGED

@@ -87,9 +87,7 @@ class ImageProcessorV2:
         mask = mask.clip(0, 255).astype(np.uint8)
         return result, mask
 
-    def …
-        if self.border_ratio is not None:
-            border_ratio = self.border_ratio
+    def load_image(self, image, border_ratio=0.15, to_tensor=True):
         if isinstance(image, str):
             image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
             image, mask = self.recenter(image, border_ratio=border_ratio)
@@ -106,13 +104,64 @@ class ImageProcessorV2:
         if to_tensor:
             image = array_to_tensor(image)
             mask = array_to_tensor(mask)
-        …
-
-
+        return image, mask
+
+    def __call__(self, image, border_ratio=0.15, to_tensor=True, **kwargs):
+        if self.border_ratio is not None:
+            border_ratio = self.border_ratio
+        image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
+        outputs = {
+            'image': image,
+            'mask': mask
+        }
+        return outputs
+
+
+class MVImageProcessorV2(ImageProcessorV2):
+    """
+    view order: front, front clockwise 90, back, front clockwise 270
+    """
+    return_view_idx = True
+
+    def __init__(self, size=512, border_ratio=None):
+        super().__init__(size, border_ratio)
+        self.view2idx = {
+            'front': 0,
+            'left': 1,
+            'back': 2,
+            'right': 3
+        }
+
+    def __call__(self, image_dict, border_ratio=0.15, to_tensor=True, **kwargs):
+        if self.border_ratio is not None:
+            border_ratio = self.border_ratio
+
+        images = []
+        masks = []
+        view_idxs = []
+        for idx, (view_tag, image) in enumerate(image_dict.items()):
+            view_idxs.append(self.view2idx[view_tag])
+            image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
+            images.append(image)
+            masks.append(mask)
+
+        zipped_lists = zip(view_idxs, images, masks)
+        sorted_zipped_lists = sorted(zipped_lists)
+        view_idxs, images, masks = zip(*sorted_zipped_lists)
+
+        image = torch.cat(images, 0).unsqueeze(0)
+        mask = torch.cat(masks, 0).unsqueeze(0)
+        outputs = {
+            'image': image,
+            'mask': mask,
+            'view_idxs': view_idxs
+        }
+        return outputs
 
 
 IMAGE_PROCESSORS = {
     "v2": ImageProcessorV2,
+    'mv_v2': MVImageProcessorV2,
 }
 
 DEFAULT_IMAGEPROCESSOR = 'v2'
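A short usage sketch of the new multi-view processor via the registry (file paths are placeholders; each view goes through `load_image` just like a single image):

```python
from hy3dgen.shapegen.preprocessors import IMAGE_PROCESSORS

processor = IMAGE_PROCESSORS['mv_v2']()  # MVImageProcessorV2 with default size=512

cond = processor({
    'front': 'assets/front.png',
    'left': 'assets/left.png',
    'back': 'assets/back.png',
})

print(cond['view_idxs'])     # (0, 1, 2): views re-sorted by their view index
print(cond['image'].shape)   # batched image tensor with a leading batch dim of 1
```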

