MeshAnythingV2

Running on Zero

App Files Files Community

Yiwen-ntu committed on Aug 5, 2024

Commit

14af97a

verified ·

1 Parent(s): cf814c2

Upload 2 files

Browse files

Files changed (2) hide show

MeshAnything/models/meshanything_v2.py +162 -0
MeshAnything/models/shape_opt.py +63 -55

MeshAnything/models/meshanything_v2.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import torch
+import torch.nn.functional as nnf
+from torch import nn
+import random
+from transformers import AutoModelForCausalLM
+from MeshAnything.miche.encode import load_model
+from MeshAnything.models.shape_opt import ShapeOPTConfig
+from einops import repeat, reduce, rearrange, pack, unpack
class MeshAnythingV2(nn.Module):
    """Inference wrapper that generates triangle meshes from point clouds.

    A point-cloud encoder (``load_model``) produces conditioning features that
    are projected into the embedding space of an OPT-style causal language
    model. The language model then autoregressively emits a token sequence
    which ``adjacent_detokenize`` turns back into triangle coordinates.

    Token layout used by this class (after the ``- 3`` shift in ``forward``):
    ``0 .. n_discrete_size - 1`` are quantized coordinates,
    ``n_discrete_size`` is the sub-sequence separator and ``pad_id`` (-1)
    marks padding / special tokens.
    """

    def __init__(self):
        super().__init__()
        self.point_encoder = load_model(ckpt_path=None)

        # Quantization / sequence hyper-parameters.
        self.n_discrete_size = 128
        self.max_seq_ratio = 0.70
        self.face_per_token = 9
        self.cond_length = 257
        self.cond_dim = 768
        self.pad_id = -1
        self.n_max_triangles = 1600
        # Token budget for the faces plus 3 extra slots plus the conditioning
        # prefix. NOTE(review): the original comment here said "add 1" while
        # the code adds 3 -- confirm which tokens the 3 slots stand for.
        self.max_length = int(
            self.n_max_triangles * self.face_per_token * self.max_seq_ratio
            + 3
            + self.cond_length
        )
        self.coor_continuous_range = (-0.5, 0.5)

        # vocab_size = coordinate bins + 4 (three special tokens and the
        # separator, see the ``- 3`` shift in ``forward``).
        self.config = ShapeOPTConfig.from_pretrained(
            "facebook/opt-350m",
            n_positions=self.max_length,
            max_position_embeddings=self.max_length,
            vocab_size=self.n_discrete_size + 4,
            _attn_implementation="flash_attention_2"
        )
        self.bos_token_id = 0
        self.eos_token_id = 1
        self.pad_token_id = 2
        self.config.bos_token_id = self.bos_token_id
        self.config.eos_token_id = self.eos_token_id
        self.config.pad_token_id = self.pad_token_id
        # NOTE(review): flash-attention is requested here, in from_pretrained
        # above and again in from_config below; kept as-is because which of
        # them takes effect depends on the installed transformers version.
        self.config._attn_implementation = "flash_attention_2"
        self.config.n_discrete_size = self.n_discrete_size
        self.config.face_per_token = self.face_per_token
        self.config.cond_length = self.cond_length

        # Force the embedding width to equal the hidden width.
        if self.config.word_embed_proj_dim != self.config.hidden_size:
            self.config.word_embed_proj_dim = self.config.hidden_size
        self.transformer = AutoModelForCausalLM.from_config(
            config=self.config, use_flash_attention_2=True
        )
        self.transformer.to_bettertransformer()

        # Projections from encoder feature space into the LM embedding space.
        self.cond_head_proj = nn.Linear(self.cond_dim, self.config.word_embed_proj_dim)
        self.cond_proj = nn.Linear(self.cond_dim * 2, self.config.word_embed_proj_dim)

        # The module is inference-only: switch to eval mode immediately.
        self.eval()

    def adjacent_detokenize(self, input_ids):
        """Decode token ids into triangle vertex coordinates.

        Args:
            input_ids: integer tensor whose first dimension is the batch; it
                is flattened to ``(B, L)``. Values are coordinate bins,
                ``self.n_discrete_size`` (separator) or ``self.pad_id``.

        Returns:
            Float tensor of shape ``(B, n_max_triangles * 10, 3, 3)``
            (batch, face, vertex, xyz). Slots that were never written are NaN.
            The first face of every sample is pre-filled with a fixed
            placeholder triangle, which decoded coordinates overwrite.

        Decoding stops for a sample at the first pad token, or at a separator
        that arrives before 9 coordinates were read or in the middle of a
        vertex. After the first 9 coordinates of a sub-sequence, each further
        vertex forms a new face together with the two preceding vertices,
        which are copied in front of it.
        """
        input_ids = input_ids.reshape(input_ids.shape[0], -1)  # B x L
        batch_size = input_ids.shape[0]
        device = input_ids.device
        low, high = self.coor_continuous_range

        continuous_coors = torch.full(
            (batch_size, self.n_max_triangles * 3 * 10, 3), float('nan'), device=device
        )
        # Built once instead of once per sample.
        placeholder = torch.tensor(
            [[-0.1, 0.0, 0.1], [-0.1, 0.1, 0.2], [-0.3, 0.3, 0.2]], device=device
        )

        for i in range(batch_size):
            cur_ids = input_ids[i]
            coor_loop_check = 0  # coordinates read since the last separator
            vertice_count = 0    # next vertex slot to fill
            continuous_coors[i, :3, :] = placeholder

            # Control flow runs on Python numbers (one transfer per sample)
            # rather than on per-element tensor comparisons.
            for position, token in enumerate(cur_ids.tolist()):
                if token == self.pad_id:
                    break
                if token == self.n_discrete_size:
                    # A separator is only valid after at least one complete
                    # face and on a vertex boundary.
                    if coor_loop_check < 9 or coor_loop_check % 3 != 0:
                        break
                    coor_loop_check = 0
                    continue

                if coor_loop_check % 3 == 0 and coor_loop_check >= 9:
                    # Start a new face that shares the previous two vertices.
                    continuous_coors[i, vertice_count] = continuous_coors[i, vertice_count - 2]
                    continuous_coors[i, vertice_count + 1] = continuous_coors[i, vertice_count - 1]
                    vertice_count += 2
                continuous_coors[i, vertice_count, coor_loop_check % 3] = undiscretize(
                    cur_ids[position], low, high, self.n_discrete_size
                )
                if coor_loop_check % 3 == 2:
                    vertice_count += 1
                coor_loop_check += 1

        # (B, faces * 3, 3) -> (B, faces, 3, 3)
        return continuous_coors.reshape(batch_size, -1, 3, 3)

    def process_point_feature(self, point_feature):
        """Project encoder features into the LM embedding space.

        Position 0 of ``point_feature`` goes through ``cond_head_proj``; the
        remaining positions are concatenated with their shape latents (from
        ``point_encoder.to_shape_latents``) along the last dimension and go
        through ``cond_proj``.

        Args:
            point_feature: encoder output; assumed to be
                ``(B, cond_length, cond_dim)`` -- TODO confirm against
                ``encode_latents``.

        Returns:
            Tensor of shape ``(B, cond_length, word_embed_proj_dim)`` on the
            device / dtype of ``cond_head_proj``.
        """
        encode_feature = torch.zeros(
            point_feature.shape[0],
            self.cond_length,
            self.config.word_embed_proj_dim,
            device=self.cond_head_proj.weight.device,
            dtype=self.cond_head_proj.weight.dtype,
        )
        encode_feature[:, 0] = self.cond_head_proj(point_feature[:, 0])
        shape_latents = self.point_encoder.to_shape_latents(point_feature[:, 1:])
        encode_feature[:, 1:] = self.cond_proj(torch.cat([point_feature[:, 1:], shape_latents], dim=-1))
        return encode_feature

    @torch.no_grad()
    def forward(self, pc_normal, sampling=False) -> torch.Tensor:
        """Generate meshes for a batch of point clouds.

        Args:
            pc_normal: batched input handed to ``point_encoder.encode_latents``
                (presumably points with normals -- verify against the encoder).
            sampling: if true, sample with ``top_k=50`` / ``top_p=0.95``;
                otherwise decode with ``num_beams=1``.

        Returns:
            The tensor produced by ``adjacent_detokenize``:
            ``(B, n_max_triangles * 10, 3, 3)`` with NaN in unused slots.
        """
        batch_size = pc_normal.shape[0]
        point_feature = self.point_encoder.encode_latents(pc_normal)
        processed_point_feature = self.process_point_feature(point_feature)
        generate_length = self.max_length - self.cond_length
        net_device = next(self.parameters()).device

        generate_kwargs = dict(
            inputs_embeds=processed_point_feature,
            max_new_tokens=generate_length,  # all faces plus two
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
        )
        if sampling:
            generate_kwargs.update(do_sample=True, top_k=50, top_p=0.95)
        else:
            generate_kwargs["num_beams"] = 1
        results = self.transformer.generate(**generate_kwargs)
        # B x ID; bos is not included since it's predicted
        assert results.shape[1] <= generate_length

        # Right-pad every sequence with EOS up to the fixed length.
        outputs = torch.full(
            (batch_size, generate_length), self.eos_token_id, dtype=torch.long, device=net_device
        )
        outputs[:, :results.shape[1]] = results

        # Drop the first and last positions, then map the three special
        # tokens to pad_id and shift the remaining ids down by 3 so that
        # coordinate bins start at 0.
        outputs = outputs[:, 1:-1]
        outputs[outputs == self.bos_token_id] = self.pad_id
        outputs[outputs == self.eos_token_id] = self.pad_id
        outputs[outputs == self.pad_token_id] = self.pad_id
        outputs[outputs != self.pad_id] -= 3
        gen_mesh = self.adjacent_detokenize(outputs)
        return gen_mesh
def undiscretize(
    t,
    low,  # -0.5
    high,  # 0.5
    num_discrete
):
    """Map discrete bin indices back to continuous values in ``[low, high)``.

    Args:
        t: tensor (or plain number) of bin indices in ``[0, num_discrete - 1]``.
        low: lower bound of the continuous range.
        high: upper bound of the continuous range.
        num_discrete: number of bins.

    Returns:
        A new float32 tensor ``t / num_discrete * (high - low) + low``.
        The input is never modified: the arithmetic is out-of-place, because
        ``Tensor.float()`` returns the very same tensor when the input is
        already float32 and an in-place division would then leak to the caller.
    """
    t = torch.as_tensor(t).to(torch.float32)  # [0, num_discrete-1]
    t = t / num_discrete  # 0 <= t < 1
    return t * (high - low) + low  # low <= t < high

MeshAnything/models/shape_opt.py CHANGED Viewed

@@ -8,9 +8,8 @@ from transformers.modeling_outputs import (
 import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
-from transformers.utils import replace_return_docstrings, logging
 from transformers.modeling_outputs import BaseModelOutputWithPast
-# from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 class ShapeOPTConfig(OPTConfig):
     model_type = "shape_opt"
@@ -26,23 +25,6 @@ class ShapeOPT(OPTForCausalLM):
         # Initialize weights and apply final processing
         self.post_init()
-    def tie_weights(self):
-        """
-        Tie the weights between the input embeddings and the output embeddings.
-        If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the
-        weights instead.
-        """
-        if getattr(self.config, "is_encoder_decoder", False) and getattr(self.config, "tie_encoder_decoder", False):
-            if hasattr(self, self.base_model_prefix):
-                self = getattr(self, self.base_model_prefix)
-            self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix)
-        for module in self.modules():
-            if hasattr(module, "_tie_weights"):
-                module._tie_weights()
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class="OPTConfig")
     def forward(
         self,
@@ -140,7 +122,7 @@ class ShapeOPT(OPTForCausalLM):
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model.decoder(
-            input_ids=input_ids,
             face_ids = face_ids,
             attention_mask=attention_mask,
             head_mask=head_mask,
@@ -195,28 +177,18 @@ class ShapeOPTDecoder(OPTDecoder):
         self.padding_idx = config.pad_token_id
         self.max_target_positions = config.max_position_embeddings
         self.vocab_size = config.vocab_size
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx) # not used
         self.hidden_size = config.hidden_size
         self.word_embed_proj_dim = config.word_embed_proj_dim
-        self.extra_embeds = nn.Embedding(3, config.word_embed_proj_dim) #padding_idx=self.padding_idx)
-        self.input_layer = nn.Linear(config.quantize_codebook_dim, config.word_embed_proj_dim)
         self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
-        self.token_embed_positions =  OPTFacePositionalEmbedding(config.face_per_token + 3, config.word_embed_proj_dim) #padding_idx=self.padding_idx)
         self.face_per_token = config.face_per_token
         self.cond_length = config.cond_length
         self.cond_embed = nn.Embedding(2, config.word_embed_proj_dim)
-        if config.word_embed_proj_dim != config.hidden_size:
-            self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
-        else:
-            self.project_out = None
-        if config.word_embed_proj_dim != config.hidden_size:
-            self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
-        else:
-            self.project_in = None
         # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
         # with checkpoints that have been fine-tuned before transformers v4.20.1
         # see https://github.com/facebookresearch/metaseq/pull/164
@@ -234,17 +206,6 @@ class ShapeOPTDecoder(OPTDecoder):
         # Initialize weights and apply final processing
         self.post_init()
-    def embed_with_vae(self, input_ids):
-        inputs_embeds = repeat(torch.zeros(input_ids.shape, device=input_ids.device), 'b n -> b n d',
-                               d=self.word_embed_proj_dim).clone().detach()
-        idx_in_extra = torch.isin(input_ids, torch.LongTensor([0, 1, 2]).to(input_ids.device))
-        inputs_embeds[idx_in_extra] += self.extra_embeds(input_ids[idx_in_extra])
-        self.quantize_codebooks = self.quantize_codebooks.to(input_ids.device)
-        inputs_embeds[~idx_in_extra] += self.input_layer(self.quantize_codebooks[0][input_ids[~idx_in_extra] - 3])
-        return inputs_embeds
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -315,11 +276,13 @@ class ShapeOPTDecoder(OPTDecoder):
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         # Transformer Decoder
-        if input_ids is not None:
             input_shape = input_ids.size()
             input_ids = input_ids.view(-1, input_shape[-1])
-            inputs_embeds = self.embed_with_vae(input_ids) # nothing to do with position
             face_embeds = self.token_embed_positions(attention_mask[:, self.cond_length:], face_ids, input_ids,
                                                      self.face_per_token)
             inputs_embeds += face_embeds
@@ -329,7 +292,8 @@ class ShapeOPTDecoder(OPTDecoder):
         elif inputs_embeds is not None:
             # assert self.cond and not self.training
             total_length = inputs_embeds.shape[1] # B x length x embeding
             cond_embed_query = torch.zeros((inputs_embeds.shape[0], total_length), device=inputs_embeds.device,
                                             dtype=inputs_embeds.dtype).long()
@@ -357,9 +321,6 @@ class ShapeOPTDecoder(OPTDecoder):
         pos_embeds = self.embed_positions(attention_mask, past_key_values_length)
-        if self.project_in is not None:
-            inputs_embeds = self.project_in(inputs_embeds)
         hidden_states = inputs_embeds + pos_embeds
         # decoder layers
@@ -419,9 +380,6 @@ class ShapeOPTDecoder(OPTDecoder):
         if self.final_layer_norm is not None:
             hidden_states = self.final_layer_norm(hidden_states)
-        if self.project_out is not None:
-            hidden_states = self.project_out(hidden_states)
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
@@ -436,6 +394,56 @@ class ShapeOPTDecoder(OPTDecoder):
             attentions=all_self_attns,
         )
 class OPTFacePositionalEmbedding(nn.Embedding):
     """
     This module learns positional embeddings up to a fixed maximum size.

 import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
+from transformers.utils import replace_return_docstrings
 from transformers.modeling_outputs import BaseModelOutputWithPast
 class ShapeOPTConfig(OPTConfig):
     model_type = "shape_opt"
         # Initialize weights and apply final processing
         self.post_init()
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class="OPTConfig")
     def forward(
         self,
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model.decoder(
+            input_ids = input_ids,
             face_ids = face_ids,
             attention_mask=attention_mask,
             head_mask=head_mask,
         self.padding_idx = config.pad_token_id
         self.max_target_positions = config.max_position_embeddings
         self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
         self.hidden_size = config.hidden_size
         self.word_embed_proj_dim = config.word_embed_proj_dim
+        self.n_discrete_size = config.n_discrete_size
         self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
+        self.token_embed_positions = OPTLoopEmbedding(10, config.word_embed_proj_dim, self.n_discrete_size) #padding_idx=self.padding_idx)
         self.face_per_token = config.face_per_token
         self.cond_length = config.cond_length
         self.cond_embed = nn.Embedding(2, config.word_embed_proj_dim)
         # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
         # with checkpoints that have been fine-tuned before transformers v4.20.1
         # see https://github.com/facebookresearch/metaseq/pull/164
         # Initialize weights and apply final processing
         self.post_init()
     def forward(
         self,
         input_ids: torch.LongTensor = None,
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         # Transformer Decoder
+        if input_ids is not None and inputs_embeds is not None: # when  train and first generate
+            assert False
+        elif input_ids is not None:
+            assert not self.training
             input_shape = input_ids.size()
             input_ids = input_ids.view(-1, input_shape[-1])
+            inputs_embeds = self.embed_tokens(input_ids)
             face_embeds = self.token_embed_positions(attention_mask[:, self.cond_length:], face_ids, input_ids,
                                                      self.face_per_token)
             inputs_embeds += face_embeds
         elif inputs_embeds is not None:
             # assert self.cond and not self.training
+            assert not self.training
+            self.token_embed_positions.init_state(inputs_embeds)
             total_length = inputs_embeds.shape[1] # B x length x embeding
             cond_embed_query = torch.zeros((inputs_embeds.shape[0], total_length), device=inputs_embeds.device,
                                             dtype=inputs_embeds.dtype).long()
         pos_embeds = self.embed_positions(attention_mask, past_key_values_length)
         hidden_states = inputs_embeds + pos_embeds
         # decoder layers
         if self.final_layer_norm is not None:
             hidden_states = self.final_layer_norm(hidden_states)
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
             attentions=all_self_attns,
         )
class OPTLoopEmbedding(nn.Embedding):
    """Learned embedding of each token's role in the generated sequence.

    During step-by-step decoding the module tracks, per batch row, where the
    current token sits in the stream and looks up one of these rows:

    * ``0-2``: the token itself, when it is one of the special ids 0, 1, 2;
    * ``3``: the separator token (id ``n_discrete_size + 3``);
    * ``4-6``: one of the first nine coordinate tokens after a reset,
      selected by ``loop_state % 3``;
    * ``7-9``: any later coordinate token, selected by ``loop_state % 3``.

    The tracking state lives on the module (``state`` / ``loop_state``), so
    ``init_state`` must be called before each new generation.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, n_discrete_size: int):
        super().__init__(num_embeddings, embedding_dim)
        # Per-row countdown of the "first nine coordinates" phase.
        self.state = None
        # Per-row count of coordinate tokens seen since the last reset.
        self.loop_state = None
        # Id of the separator token: coordinate bins are shifted by the three
        # special tokens, so the separator is n_discrete_size + 3.
        self.n_discrete_size = n_discrete_size + 3

    def forward(self, attention_mask=None, face_ids=None, input_ids=None, face_per_token=None):
        """Return role embeddings for explicit ``face_ids`` or for one decode step.

        Args:
            attention_mask: unused; accepted for call compatibility.
            face_ids: if given, these ids are embedded directly and the
                tracking state is left untouched.
            input_ids: ``(B, 1)`` tensor with the token of the current decode
                step; required when ``face_ids`` is ``None``.
            face_per_token: unused; accepted for call compatibility.

        Returns:
            Embeddings of shape ``face_ids.shape + (embedding_dim,)``.
        """
        if face_ids is not None:
            return super().forward(face_ids)
        assert input_ids.shape[1] == 1, "Only one token is allowed for loop embedding"
        assert self.state is not None, "State is not initialized"

        batch_size = input_ids.shape[0]
        # Views, so the in-place updates below write into the stored state.
        state = self.state[:batch_size]
        loop_state = self.loop_state[:batch_size]

        special_ids = torch.tensor([0, 1, 2], device=input_ids.device)
        is_special = torch.isin(input_ids, special_ids)
        is_separator = (input_ids == self.n_discrete_size) & ~is_special
        is_reset = is_special | is_separator
        is_coord = ~is_reset
        in_first_nine = is_coord & (state != 0)
        after_first_nine = is_coord & (state == 0)

        # Role ids are computed from the state *before* it is advanced.
        axis = loop_state % 3
        face_ids = input_ids.clone().detach()  # special tokens keep their id
        face_ids[is_separator] = 3
        face_ids[in_first_nine] = (4 + axis)[in_first_nine]
        face_ids[after_first_nine] = (7 + axis)[after_first_nine]

        # Advance the per-row state machine.
        state[in_first_nine] -= 1
        loop_state[is_coord] += 1
        state[is_reset] = 9
        loop_state[is_reset] = 0

        return super().forward(face_ids)

    def init_state(self, template_tensor):
        """Reset the tracking state for a batch of ``template_tensor.shape[0]`` rows.

        Both state tensors are created on ``template_tensor``'s device with
        shape ``(B, 1)``: ``state`` starts at 9 and ``loop_state`` at 0.
        """
        batch_size = template_tensor.shape[0]
        self.state = torch.zeros((batch_size, 1), dtype=torch.long, device=template_tensor.device)
        self.state[...] = 9
        self.loop_state = torch.zeros((batch_size, 1), dtype=torch.long, device=template_tensor.device)
 class OPTFacePositionalEmbedding(nn.Embedding):
     """
     This module learns positional embeddings up to a fixed maximum size.