kmchiti committed
Commit 88fad5b · verified · 1 Parent(s): bbbbad3

Update model on main, checkpoint
README.md ADDED
@@ -0,0 +1,43 @@
+ ---
+ license: mit
+ datasets:
+ - ZINC-22
+ language:
+ - en
+ tags:
+ - molecular-generation
+ - drug-discovery
+ - llama
+ - flash-attention
+ pipeline_tag: text-generation
+ ---
+
+ # NovoMolGen
+
+ NovoMolGen is a family of molecular foundation models trained on 1.5 billion ZINC‑22 molecules using Llama architectures and FlashAttention. It achieves state‑of‑the‑art performance on both unconstrained and goal‑directed molecule generation tasks.
+
+ ## How to load
+
+ ```python
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ tokenizer = AutoTokenizer.from_pretrained("chandar-lab/NovoMolGen_300M_SMILES_AtomWise", trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained("chandar-lab/NovoMolGen_300M_SMILES_AtomWise", trust_remote_code=True)
+ ```
+
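+ Note: the custom modeling code (`modeling_novomolgen.py`) subclasses `GPTLMHeadModel` from the `flash-attn` package, so loading with `trust_remote_code=True` also requires `flash-attn` to be installed and, in practice, a CUDA GPU.
+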
+ ## Quickstart
+
+ ```python
+ model = model.to("cuda")  # `sample` runs generation on CUDA by default
+ outputs = model.sample(tokenizer=tokenizer, batch_size=4)
+ print(outputs['SMILES'])
+ ```
+
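+ The strings returned by `sample` are not guaranteed to be chemically valid. A minimal sketch for filtering them (assuming RDKit is installed; it is not a dependency of this repository):
+
+ ```python
+ from rdkit import Chem
+
+ outputs = model.sample(tokenizer=tokenizer, batch_size=64)
+ # MolFromSmiles returns None for strings that do not parse as valid SMILES
+ valid = [s for s in outputs["SMILES"] if Chem.MolFromSmiles(s) is not None]
+ print(f"{len(valid)}/{len(outputs['SMILES'])} valid molecules")
+ ```
+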
+ ## Citation
+
+ ```bibtex
+ @article{chitsaz2025novomolgen,
+   title={NovoMolGen: Rethinking Molecular Language Model Pretraining},
+   author={Chitsaz, Kamran and Balaji, Roshan and Fournier, Quentin and Bhatt, Nirav Pravinbhai and Chandar, Sarath},
+   journal={arXiv preprint},
+   year={2025},
+ }
+ ```
config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoModelForCausalLM": "modeling_novomolgen.NovoMolGen"
+   },
+   "bos_token_id": 2,
+   "eos_token_id": 3,
+   "fused_bias_fc": false,
+   "fused_dropout_add_ln": false,
+   "fused_mlp": false,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "loss_type": "ForCausalLM",
+   "max_position_embeddings": 2048,
+   "max_seq_length": 64,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 12,
+   "pretraining_tp": 1,
+   "residual_in_fp32": true,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "transformers_version": "4.46.2",
+   "use_cache": true,
+   "use_flash_attn": true,
+   "vocab_size": 84
+ }
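
As a rough cross-check of the "300M" in the model name, here is a back-of-the-envelope parameter count from the config above (a sketch assuming the standard Llama layout: untied embeddings, no linear biases, SwiGLU MLP):

```python
hidden, inter, layers, vocab = 768, 3072, 32, 84  # from config.json above

attn = 4 * hidden * hidden   # q, k, v, o projections
mlp = 3 * hidden * inter     # gate, up, down projections
norms = 2 * hidden           # two RMSNorm weights per layer
embed = 2 * vocab * hidden   # input embeddings + untied lm_head

total = layers * (attn + mlp + norms) + embed + hidden  # + final norm
print(f"{total:,}")  # ~302M parameters, ~1.2 GB in fp32 (matching pytorch_model.bin below)
```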
modeling_novomolgen.py ADDED
@@ -0,0 +1,341 @@
+ import copy
+ import json
+ import os.path
+ import re
+ import shutil
+ import inspect
+ from typing import Optional, Union
+
+ import torch
+ import torch.nn.functional as F
+ from transformers import LlamaConfig
+ from transformers.loss.loss_utils import LOSS_MAPPING
+ from transformers.modeling_outputs import CausalLMOutput
+ from transformers.utils.hub import cached_file, get_checkpoint_shard_files
+ from transformers.utils import (
+     SAFE_WEIGHTS_NAME,
+     WEIGHTS_INDEX_NAME,
+     WEIGHTS_NAME,
+ )
+ from transformers.modeling_utils import unwrap_model, logger
+ from functools import partial
+ from safetensors.torch import load_file as safe_load_file
+
+ try:
+     from flash_attn.models.gpt import GPTLMHeadModel
+ except ImportError:
+     GPTLMHeadModel = None
+
+ try:
+     from flash_attn.models.llama import llama_config_to_gpt2_config, inv_remap_state_dict_hf_llama
+ except ImportError:
+     llama_config_to_gpt2_config = None
+     inv_remap_state_dict_hf_llama = None
+
+
+ def state_dict_from_pretrained(model_name, checkpoint_path: str = "", device=None, dtype=None):
+     """
+     Code modified from: https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/pretrained.py
+     """
+
+     # If not fp32, then we don't want to load directly to the GPU
+     mapped_device = "cpu" if dtype not in [torch.float32, None] else device
+     is_sharded = False
+     load_safe = False  # safetensors loading is not wired up yet; the .bin weights are used
+
+     # Try loading from the HF hub instead of from local files
+     resolved_archive_file = cached_file(model_name, os.path.join(checkpoint_path, WEIGHTS_NAME),
+                                         _raise_exceptions_for_missing_entries=False)
+     if resolved_archive_file is None:
+         resolved_archive_file = cached_file(model_name, os.path.join(checkpoint_path, WEIGHTS_INDEX_NAME),
+                                             _raise_exceptions_for_missing_entries=False)
+         if resolved_archive_file is not None:
+             is_sharded = True
+
+     if resolved_archive_file is None:
+         raise EnvironmentError(f"Model name {model_name} was not found.")
+
+     if load_safe:
+         loader = partial(safe_load_file, device=mapped_device)
+     else:
+         loader = partial(torch.load, map_location=mapped_device)
+
+     if is_sharded:
+         # resolved_archive_file becomes a list of files that point to the different
+         # checkpoint shards in this case.
+         resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
+             model_name, resolved_archive_file
+         )
+         state_dict = {}
+         for sharded_file in resolved_archive_file:
+             state_dict.update(loader(sharded_file))
+     else:
+         state_dict = loader(resolved_archive_file)
+     # Convert dtype before moving to GPU to save memory
+     if dtype is not None:
+         state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()}
+     state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
+
+     return state_dict
+
+
+ class NovoMolGenConfig(LlamaConfig):
+     # model_type = "NovoMolGen"
+
+     def __init__(self,
+                  use_flash_attn: bool = True,
+                  fused_bias_fc: bool = True,
+                  fused_mlp: bool = False,
+                  fused_dropout_add_ln: bool = True,
+                  residual_in_fp32: bool = True,
+                  loss_type: str = 'ForCausalLM',
+                  **kwargs
+                  ):
+         super().__init__(**kwargs)
+         self.use_flash_attn = use_flash_attn
+         self.fused_bias_fc = fused_bias_fc
+         self.fused_mlp = fused_mlp
+         self.fused_dropout_add_ln = fused_dropout_add_ln
+         self.residual_in_fp32 = residual_in_fp32
+         self.loss_type = loss_type
+         self.auto_map = {"AutoModelForCausalLM": "modeling_novomolgen.NovoMolGen"}
+
+     @classmethod
+     def from_pretrained(
+         cls,
+         pretrained_model_name_or_path: Union[str, os.PathLike],
+         checkpoint_path: str = "",
+         cache_dir: Optional[Union[str, os.PathLike]] = None,
+         force_download: bool = False,
+         local_files_only: bool = False,
+         token: Optional[Union[str, bool]] = None,
+         revision: str = "main",
+         **kwargs,
+     ):
+         resolved_archive_config_file = cached_file(pretrained_model_name_or_path,
+                                                    os.path.join(checkpoint_path, "config.json"),
+                                                    _raise_exceptions_for_missing_entries=False)
+
+         if resolved_archive_config_file is not None:
+             with open(resolved_archive_config_file, "r", encoding="utf-8") as reader:
+                 config_dict = json.loads(reader.read())
+         else:
+             raise EnvironmentError(f"config for {pretrained_model_name_or_path} was not found.")
+
+         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+             logger.warning(
+                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+             )
+
+         return cls.from_dict(config_dict, **kwargs)
+
+
+ class NovoMolGen(GPTLMHeadModel):
+     def __init__(
+         self,
+         config: NovoMolGenConfig,
+         mol_type: str = "SMILES",
+     ):
+         self.base_config = config
+         self.mol_type = mol_type
+         config = llama_config_to_gpt2_config(config)
+         config.use_flash_attn = self.base_config.use_flash_attn
+         config.fused_bias_fc = self.base_config.fused_bias_fc
+         config.fused_mlp = self.base_config.fused_mlp
+         config.fused_dropout_add_ln = self.base_config.fused_dropout_add_ln
+         config.residual_in_fp32 = self.base_config.residual_in_fp32
+         GPTLMHeadModel.__init__(self, config)
+
+     # TODO: `attention_mask` is ignored here to stay compatible with the HF trainer. The MHA in
+     # flash-attention should be reimplemented to integrate `attention_mask`, as done in:
+     # https://github.com/huggingface/transformers/blob/0864dd3beb238b7bec3528a3d1d6c17a28f51a51/src/transformers/models/llama/modeling_llama.py#L536
+     def forward(self, input_ids, attention_mask: Optional[torch.FloatTensor] = None,
+                 labels: Optional[torch.LongTensor] = None, return_dict: Optional[bool] = None,
+                 position_ids=None, inference_params=None, num_last_tokens=0, **loss_kwargs):
+         """
+         input_ids: (batch, seqlen) int tensor
+         inference_params: for generation. Adapted from Megatron-LM (and Apex)
+             https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
+         num_last_tokens: if > 0, only return the logits for the last n tokens
+         """
+         assert (
+             input_ids.ndim == 2
+         ), f"Expected `input_ids` to have shape [b, slen], but got shape {input_ids.shape}"
+         b, slen = input_ids.shape
+         hidden_states = self.transformer(
+             input_ids, position_ids=position_ids, inference_params=inference_params
+         )
+         if inference_params is not None:
+             assert hidden_states.ndim == 3, "sequence_parallel is not supported in generation mode"
+         if num_last_tokens > 0:
+             hidden_states = hidden_states[:, -num_last_tokens:]
+         if self.project_out is not None:
+             hidden_states = self.project_out(hidden_states)
+         if self.output_scale != 1.0:
+             hidden_states = hidden_states * self.output_scale
+         if not self.norm_head:
+             lm_logits = self.lm_head(hidden_states)
+         else:
+             lm_head_weight = F.normalize(self.lm_head.weight)
+             # if isinstance(self.lm_head, ColumnParallelLinear) and self.lm_head.sequence_parallel:
+             #     hidden_states = all_gather(hidden_states, self.lm_head.process_group)
+             lm_logits = F.linear(hidden_states, lm_head_weight, bias=self.lm_head.bias)
+             # During inference, we want the full logit for sampling
+             # if isinstance(self.lm_head, ColumnParallelLinear) and inference_params is not None:
+             #     lm_logits, _ = all_gather_raw(lm_logits, self.lm_head.process_group)
+             #     lm_logits = rearrange(lm_logits, "(n b) ... d -> b ... (n d)", b=b)
+
+         loss = None
+         if labels is not None:
+             loss = self.loss_function(logits=lm_logits, labels=labels,
+                                       vocab_size=self.base_config.vocab_size, **loss_kwargs)
+
+         return CausalLMOutput(
+             loss=loss,
+             logits=lm_logits,
+             hidden_states=hidden_states,
+         )
+
+     @property
+     def loss_function(self):
+         if getattr(self.base_config, "loss_type", None) is not None:
+             loss_type = self.base_config.loss_type
+         else:
+             loss_type = self.__class__.__name__
+             if loss_type not in LOSS_MAPPING:
+                 loss_groups = f"({'|'.join(LOSS_MAPPING)})"
+                 loss_type = re.findall(loss_groups, self.__class__.__name__)
+                 if len(loss_type) > 0:
+                     loss_type = loss_type[0]
+                 else:
+                     loss_type = None
+         if loss_type is None or (
+             loss_type not in LOSS_MAPPING and getattr(self.base_config, "loss_type", None) is not None
+         ):
+             logger.warning(
+                 f"`loss_type={loss_type}` was set in the base_config but it is unrecognised. "
+                 f"Using the default loss: `ForCausalLMLoss`."
+             )
+             loss_type = "ForCausalLM"
+         return LOSS_MAPPING[loss_type]
+
+     def save_pretrained(
+         self,
+         save_directory: Union[str, os.PathLike],
+         is_main_process: bool = True,
+         state_dict: Optional[dict] = None,
+         safe_serialization: bool = False,
+         **kwargs,
+     ):
+
+         if safe_serialization:
+             raise NotImplementedError("`safe_serialization` is not implemented yet.")
+
+         if os.path.isfile(save_directory):
+             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+             return
+         os.makedirs(save_directory, exist_ok=True)
+         # Save the config
+         if is_main_process:
+             self.base_config.save_pretrained(save_directory)
+
+         # Save the model
+         if state_dict is None:
+             # Only save the model itself if we are using distributed training
+             model_to_save = unwrap_model(self)
+             state_dict = model_to_save.state_dict()
+
+         weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
+         torch.save(state_dict, os.path.join(save_directory, weights_name))
+
+         # Copy the file where NovoMolGen is defined into the checkpoint so that
+         # `trust_remote_code` loading keeps working
+         src = inspect.getsourcefile(type(self))
+         if src:
+             dst = os.path.join(save_directory, os.path.basename(src))
+             shutil.copy(src, dst)
+
+     @classmethod
+     def from_pretrained(
+         cls,
+         pretrained_model_name_or_path,
+         checkpoint_path: str = "",
+         config: Optional[Union[NovoMolGenConfig, str, os.PathLike]] = None,
+         **kwargs,
+     ):
+         if config is None:
+             config = NovoMolGenConfig.from_pretrained(pretrained_model_name_or_path, checkpoint_path=checkpoint_path)
+         model = cls(config)
+
+         if os.path.exists(pretrained_model_name_or_path):
+             state_dict = torch.load(os.path.join(pretrained_model_name_or_path, checkpoint_path, WEIGHTS_NAME))
+         else:
+             state_dict = state_dict_from_pretrained(pretrained_model_name_or_path, checkpoint_path=checkpoint_path)
+         model.load_state_dict(state_dict)
+         return model
+
+     def sample(
+         self,
+         tokenizer,
+         batch_size: int = 4,
+         max_length: int = 64,
+         temperature: float = 1.0,
+         top_k: int = 50,
+         top_p: float = 0.95,
+         device: torch.device = torch.device("cuda"),
+     ):
+         """
+         Generate a batch of sequences from the model.
+
+         Returns a dictionary with two keys:
+             {
+                 "<mol_type>": <list of raw sequences in that mol_type>,
+                 "sequences": <torch.LongTensor of generated token IDs>
+             }
+         """
+         input_ids = tokenizer.encode("", return_tensors="pt").to(device)
+         # Repeat the prompt for the desired batch size
+         input_ids = input_ids.repeat_interleave(batch_size, dim=0)
+         # If the tokenizer appends an EOS token to the empty prompt, remove it
+         if input_ids.shape[1] > 1:
+             input_ids = input_ids[:, :-1]
+
+         generation_output = self.generate(
+             input_ids,
+             max_length=max_length,
+             temperature=temperature,
+             top_k=top_k,
+             top_p=top_p,
+             eos_token_id=tokenizer.eos_token_id,
+             return_dict_in_generate=True,
+         )
+
+         sequences = self._filter_tokens_after_eos(
+             generation_output.sequences, eos_id=tokenizer.eos_token_id
+         )
+
+         decoded_strings = tokenizer.batch_decode(sequences, skip_special_tokens=True)
+         decoded_strings = [s.replace(" ", "") for s in decoded_strings]
+
+         result = {
+             self.mol_type: decoded_strings,
+             "sequences": sequences,
+         }
+         return result
+
+     @staticmethod
+     def _filter_tokens_after_eos(sequences, eos_id):
+         output = copy.deepcopy(sequences)
+         for i in range(sequences.size(0)):
+             row = sequences[i]
+             eos_position = (row == eos_id).nonzero()
+             if eos_position.numel() > 0:
+                 eos_position = eos_position[0, 0].item()  # index of the first occurrence
+                 # Overwrite everything after the first EOS so decoding stops there
+                 output[i, eos_position + 1:] = eos_id
+         return output
+
+     def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
+         # HF's GenerationMixin would normally do more, but for a basic LM this usually suffices:
+         return {"input_ids": input_ids, "attention_mask": attention_mask}
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6149e167f5797173f70c6defc0bd585556889eb7a9d232d12d8dcea575475838
+ size 1208773654
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<eos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,203 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "<unk>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "<pad>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "<bos>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "<eos>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": {
+     "type": "Split",
+     "pattern": {
+       "Regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|[0-9])"
+     },
+     "behavior": "Isolated",
+     "invert": false
+   },
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "<bos>",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "<eos>",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "<bos>": {
+         "id": "<bos>",
+         "ids": [
+           2
+         ],
+         "tokens": [
+           "<bos>"
+         ]
+       },
+       "<eos>": {
+         "id": "<eos>",
+         "ids": [
+           3
+         ],
+         "tokens": [
+           "<eos>"
+         ]
+       }
+     }
+   },
+   "decoder": {
+     "type": "BPEDecoder",
+     "suffix": "</w>"
+   },
+   "model": {
+     "type": "WordLevel",
+     "vocab": {
+       "<unk>": 0,
+       "<pad>": 1,
+       "<bos>": 2,
+       "<eos>": 3,
+       "C": 4,
+       "(": 5,
+       ")": 6,
+       "c": 7,
+       "1": 8,
+       "O": 9,
+       "=": 10,
+       "N": 11,
+       "2": 12,
+       "n": 13,
+       "[C@H]": 14,
+       "[C@@H]": 15,
+       "3": 16,
+       "F": 17,
+       "S": 18,
+       "s": 19,
+       "4": 20,
+       "Cl": 21,
+       "[nH]": 22,
+       "o": 23,
+       "[C@]": 24,
+       "[C@@]": 25,
+       "#": 26,
+       "Br": 27,
+       "-": 28,
+       "/": 29,
+       "[N+]": 30,
+       "[O-]": 31,
+       "5": 32,
+       "I": 33,
+       "[N-]": 34,
+       "P": 35,
+       "[S@]": 36,
+       "[S@@]": 37,
+       "[n+]": 38,
+       "[Si]": 39,
+       "6": 40,
+       "[S+]": 41,
+       "B": 42,
+       "[P@]": 43,
+       "7": 44,
+       "[P@@]": 45,
+       "[N@]": 46,
+       "8": 47,
+       "[N@@]": 48,
+       "[B-]": 49,
+       "[NH+]": 50,
+       "[N@@H+]": 51,
+       "[NH2+]": 52,
+       "[N@H+]": 53,
+       "[O]": 54,
+       "[NH3+]": 55,
+       "[PH]": 56,
+       "[Si@]": 57,
+       "[Si@@]": 58,
+       "[n-]": 59,
+       "9": 60,
+       "[N@+]": 61,
+       "[nH+]": 62,
+       "[N@@+]": 63,
+       "[Sn]": 64,
+       "[s+]": 65,
+       "[Se]": 66,
+       ".": 67,
+       "[Cl-]": 68,
+       "[N]": 69,
+       "[C-]": 70,
+       "[C]": 71,
+       "[S@@+]": 72,
+       "%10": 73,
+       "%11": 74,
+       "[O+]": 75,
+       "[SH]": 76,
+       "[Si@H]": 77,
+       "[NH]": 78,
+       "[P+]": 79,
+       "[P@@H]": 80,
+       "[Si@@H]": 81,
+       "[c-]": 82,
+       "[o+]": 83
+     },
+     "unk_token": "<unk>"
+   }
+ }
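
For illustration, a minimal sketch of how the atom-wise "Regex" pattern in the `Split` pre-tokenizer above segments a SMILES string (using Python's `re` module; the pattern is hand-unescaped from its JSON form, so treat it as an approximation):

```python
import re

# Atom-wise SMILES pattern, adapted from the "Regex" entry above
pattern = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p"
    r"|\(|\)|\.|=|#|-|\+|\\|/|:|~|@|\?|>>?|\*|\$|%[0-9]{2}|[0-9])"
)

print(pattern.findall("CC(=O)Oc1ccccc1C(=O)O"))  # aspirin
# ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c',
#  '1', 'C', '(', '=', 'O', ')', 'O']
```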
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<bos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<eos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<bos>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<eos>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "<unk>"
+ }