kmchiti committed (verified)
Commit 37077c6 · 1 Parent(s): ae1c883

Update model on main, checkpoint
README.md ADDED
@@ -0,0 +1,43 @@
1
+ ---
2
+ license: mit
3
+ datasets:
4
+ - ZINC-22
5
+ language:
6
+ - en
7
+ tags:
8
+ - molecular-generation
9
+ - drug-discovery
10
+ - llama
11
+ - flash-attention
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # NovoMolGen
16
+
17
+ NovoMolGen is a family of molecular foundation models trained on 1.5 billion ZINC‑22 molecules using Llama architectures and FlashAttention. It achieves state‑of‑the‑art performance on both unconstrained and goal‑directed molecule generation tasks.
18
+
19
+ ## How to load
20
+
21
+ ```python
22
+ from transformers import AutoTokenizer, AutoModelForCausalLM
23
+ tokenizer = AutoTokenizer.from_pretrained("chandar-lab/NovoMolGen_300M_SMILES_BPE", trust_remote_code=True)
24
+ model = AutoModelForCausalLM.from_pretrained("chandar-lab/NovoMolGen_300M_SMILES_BPE", trust_remote_code=True)
25
+ ```
26
+
27
+ ## Quickstart
28
+
29
+ ```python
30
+ outputs = model.sample(tokenizer=tokenizer, batch_size=4)
31
+ print(outputs['SMILES'])
32
+ ```
33
+
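+ `sample` moves its prompt to `device`, which defaults to `torch.device("cuda")`, and this checkpoint is configured with `use_flash_attn: true`, so generation expects a GPU. A minimal sketch that makes the placement explicit (same checkpoint and objects as above):
+
+ ```python
+ import torch
+
+ device = torch.device("cuda")  # flash-attn kernels require a CUDA device
+ model = model.to(device)
+ outputs = model.sample(tokenizer=tokenizer, batch_size=4, device=device)
+ print(outputs["SMILES"])  # list of generated SMILES strings
+ ```
+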
34
+ ## Citation
35
+
36
+ ```bibtex
37
+ @article{chitsaz2024novomolgen,
38
+ title={NovoMolGen: Rethinking Molecular Language Model Pretraining},
39
+ author={Chitsaz, Kamran and Balaji, Roshan and Fournier, Quentin and Bhatt, Nirav Pravinbhai and Chandar, Sarath},
40
+ journal={arXiv preprint},
41
+ year={2025},
42
+ }
43
+ ```
config.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "attention_bias": false,
3
+ "attention_dropout": 0.0,
4
+ "auto_map": {
5
+ "AutoModelForCausalLM": "modeling_novomolgen.NovoMolGen"
6
+ },
7
+ "bos_token_id": 2,
8
+ "eos_token_id": 3,
9
+ "fused_bias_fc": false,
10
+ "fused_dropout_add_ln": false,
11
+ "fused_mlp": false,
12
+ "head_dim": 64,
13
+ "hidden_act": "silu",
14
+ "hidden_size": 768,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 3072,
17
+ "loss_type": "ForCausalLM",
18
+ "max_position_embeddings": 2048,
19
+ "max_seq_length": 64,
20
+ "mlp_bias": false,
21
+ "model_type": "llama",
22
+ "num_attention_heads": 12,
23
+ "num_hidden_layers": 32,
24
+ "num_key_value_heads": 12,
25
+ "pretraining_tp": 1,
26
+ "residual_in_fp32": true,
27
+ "rms_norm_eps": 1e-06,
28
+ "rope_scaling": null,
29
+ "rope_theta": 10000.0,
30
+ "tie_word_embeddings": false,
31
+ "transformers_version": "4.46.2",
32
+ "use_cache": true,
33
+ "use_flash_attn": true,
34
+ "vocab_size": 500
35
+ }
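The `auto_map` entry above is why the README passes `trust_remote_code=True`: it routes `AutoModelForCausalLM` to the custom `NovoMolGen` class defined in `modeling_novomolgen.py` below. A small sketch for inspecting the loaded config (repo id as in the README; extra keys such as `use_flash_attn` are kept as plain attributes):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "chandar-lab/NovoMolGen_300M_SMILES_BPE", trust_remote_code=True
)
# Consistency check on the attention geometry declared above:
assert config.hidden_size // config.num_attention_heads == config.head_dim  # 768 // 12 == 64
print(config.use_flash_attn, config.vocab_size)  # True 500
```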
modeling_novomolgen.py ADDED
@@ -0,0 +1,341 @@
1
+ import copy
2
+ import json
3
+ import os.path
4
+ import re
5
+ import shutil
6
+ import inspect
7
+ from typing import Optional, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from transformers import LlamaConfig
12
+ from transformers.loss.loss_utils import LOSS_MAPPING
13
+ from transformers.modeling_outputs import CausalLMOutput
14
+ from transformers.utils.hub import cached_file, get_checkpoint_shard_files
15
+ from transformers.utils import (
16
+ SAFE_WEIGHTS_NAME,
17
+ WEIGHTS_INDEX_NAME,
18
+ WEIGHTS_NAME,
19
+ )
20
+ from transformers.modeling_utils import unwrap_model, logger
21
+ from functools import partial
22
+ from safetensors.torch import load_file as safe_load_file
23
+
24
+ try:
25
+ from flash_attn.models.gpt import GPTLMHeadModel
26
+ except ImportError:
27
+ GPTLMHeadModel = None
28
+
29
+ try:
30
+ from flash_attn.models.llama import llama_config_to_gpt2_config, inv_remap_state_dict_hf_llama
31
+ except ImportError:
32
+ llama_config_to_gpt2_config = None
33
+ inv_remap_state_dict_hf_llama = None
34
+
35
+
36
+ def state_dict_from_pretrained(model_name, checkpoint_path: str = "", device=None, dtype=None):
37
+ """
38
+ code modified from: https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/pretrained.py
39
+ """
40
+
41
+ # If not fp32, then we don't want to load directly to the GPU
42
+ mapped_device = "cpu" if dtype not in [torch.float32, None] else device
43
+ is_sharded = False
44
+ load_safe = False  # safetensors loading is never enabled below; only *.bin weights are resolved
45
+
46
+ # Try loading from HF hub instead of from local files
47
+ resolved_archive_file = cached_file(model_name, os.path.join(checkpoint_path, WEIGHTS_NAME),
48
+ _raise_exceptions_for_missing_entries=False)
49
+ if resolved_archive_file is None:
50
+ resolved_archive_file = cached_file(model_name, os.path.join(checkpoint_path, WEIGHTS_INDEX_NAME),
51
+ _raise_exceptions_for_missing_entries=False)
52
+ if resolved_archive_file is not None:
53
+ is_sharded = True
54
+
55
+ if resolved_archive_file is None:
56
+ raise EnvironmentError(f"Model name {model_name} was not found.")
57
+
58
+ if load_safe:
59
+ loader = partial(safe_load_file, device=mapped_device)
60
+ else:
61
+ loader = partial(torch.load, map_location=mapped_device)
62
+
63
+ if is_sharded:
64
+ # resolved_archive_file becomes a list of files that point to the different
65
+ # checkpoint shards in this case.
66
+ resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
67
+ model_name, resolved_archive_file
68
+ )
69
+ state_dict = {}
70
+ for sharded_file in resolved_archive_file:
71
+ state_dict.update(loader(sharded_file))
72
+ else:
73
+ state_dict = loader(resolved_archive_file)
74
+ # Convert dtype before moving to GPU to save memory
75
+ if dtype is not None:
76
+ state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()}
77
+ state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
78
+
79
+ return state_dict
80
+
81
+
82
+ class NovoMolGenConfig(LlamaConfig):
83
+ # model_type = "NovoMolGen"
84
+
85
+ def __init__(self,
86
+ use_flash_attn: bool = True,
87
+ fused_bias_fc: bool = True,
88
+ fused_mlp: bool = False,
89
+ fused_dropout_add_ln: bool = True,
90
+ residual_in_fp32: bool = True,
91
+ loss_type: str = 'ForCausalLM',
92
+ **kwargs
93
+ ):
94
+ super().__init__(**kwargs)
95
+ self.use_flash_attn = use_flash_attn
96
+ self.fused_bias_fc = fused_bias_fc
97
+ self.fused_mlp = fused_mlp
98
+ self.fused_dropout_add_ln = fused_dropout_add_ln
99
+ self.residual_in_fp32 = residual_in_fp32
100
+ self.loss_type = loss_type
101
+ self.auto_map = {"AutoModelForCausalLM": "modeling_novomolgen.NovoMolGen"}
102
+
103
+ @classmethod
104
+ def from_pretrained(
105
+ cls,
106
+ pretrained_model_name_or_path: Union[str, os.PathLike],
107
+ checkpoint_path: str = "",
108
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
109
+ force_download: bool = False,
110
+ local_files_only: bool = False,
111
+ token: Optional[Union[str, bool]] = None,
112
+ revision: str = "main",
113
+ **kwargs,
114
+ ):
115
+
116
+ resolved_archive_config_file = cached_file(pretrained_model_name_or_path,
117
+ os.path.join(checkpoint_path, "config.json"),
118
+ _raise_exceptions_for_missing_entries=False)
119
+
120
+ if resolved_archive_config_file is not None:
121
+ with open(resolved_archive_config_file, "r", encoding="utf-8") as reader:
122
+ text = reader.read()
123
+ config_dict = json.loads(text)
124
+
125
+ else:
126
+ raise EnvironmentError(f"config for {pretrained_model_name_or_path} was not found.")
127
+
128
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
129
+ print(
130
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
131
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
132
+ )
133
+
134
+ return cls.from_dict(config_dict, **kwargs)
135
+
136
+
137
+ class NovoMolGen(GPTLMHeadModel):
138
+ def __init__(
139
+ self,
140
+ config: NovoMolGenConfig,
141
+ mol_type: str = "SMILES",
142
+ ):
143
+ self.base_config = config
144
+ self.mol_type = mol_type
145
+ config = llama_config_to_gpt2_config(config)
146
+ config.use_flash_attn = self.base_config.use_flash_attn
147
+ config.fused_bias_fc = self.base_config.fused_bias_fc
148
+ config.fused_mlp = self.base_config.fused_mlp
149
+ config.fused_dropout_add_ln = self.base_config.fused_dropout_add_ln
150
+ config.residual_in_fp32 = self.base_config.residual_in_fp32
151
+ GPTLMHeadModel.__init__(self, config)
152
+
153
+ # TODO: here we ignore attention_mask to make it compatible with HF trainer. The MHA in flash-attention should
154
+ # be reimplemented to integrate attention_mask, as done here:
155
+ # https://github.com/huggingface/transformers/blob/0864dd3beb238b7bec3528a3d1d6c17a28f51a51/src/transformers/models/llama/modeling_llama.py#L536
156
+ def forward(self, input_ids, attention_mask: Optional[torch.FloatTensor] = None,
157
+ labels: Optional[torch.LongTensor] = None, return_dict: Optional[bool] = None,
158
+ position_ids=None, inference_params=None, num_last_tokens=0, **loss_kwargs):
159
+ """
160
+ input_ids: (batch, seqlen) int tensor
161
+ inference_params: for generation. Adapted from Megatron-LM (and Apex)
162
+ https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
163
+ num_last_tokens: if > 0, only return the logits for the last n tokens
164
+ """
165
+ assert (
166
+ input_ids.ndim == 2
167
+ ), f"Expected `input_ids` to have shape [b, slen], but got shape {input_ids.shape}"
168
+ b, slen = input_ids.shape
169
+ hidden_states = self.transformer(
170
+ input_ids, position_ids=position_ids, inference_params=inference_params
171
+ )
172
+ if inference_params is not None:
173
+ assert hidden_states.ndim == 3, "sequence_parallel is not supported in generation mode"
174
+ if num_last_tokens > 0:
175
+ hidden_states = hidden_states[:, -num_last_tokens:]
176
+ if self.project_out is not None:
177
+ hidden_states = self.project_out(hidden_states)
178
+ if self.output_scale != 1.0:
179
+ hidden_states = hidden_states * self.output_scale
180
+ if not self.norm_head:
181
+ lm_logits = self.lm_head(hidden_states)
182
+ else:
183
+ lm_head_weight = F.normalize(self.lm_head.weight)
184
+ # if isinstance(self.lm_head, ColumnParallelLinear) and self.lm_head.sequence_parallel:
185
+ # hidden_states = all_gather(hidden_states, self.lm_head.process_group)
186
+ lm_logits = F.linear(hidden_states, lm_head_weight, bias=self.lm_head.bias)
187
+ # During inference, we want the full logit for sampling
188
+ # if isinstance(self.lm_head, ColumnParallelLinear) and inference_params is not None:
189
+ # lm_logits, _ = all_gather_raw(lm_logits, self.lm_head.process_group)
190
+ # lm_logits = rearrange(lm_logits, "(n b) ... d -> b ... (n d)", b=b)
191
+
192
+ loss = None
193
+ if labels is not None:
194
+ loss = self.loss_function(logits=lm_logits, labels=labels, vocab_size=self.base_config.vocab_size,
195
+ **loss_kwargs)
196
+
197
+ return CausalLMOutput(
198
+ loss=loss,
199
+ logits=lm_logits,
200
+ hidden_states=hidden_states
201
+ )
202
+
203
+ @property
204
+ def loss_function(self):
205
+ if getattr(self.base_config, "loss_type", None) is not None:
206
+ loss_type = self.base_config.loss_type
207
+ else:
208
+ loss_type = self.__class__.__name__
209
+ if loss_type not in LOSS_MAPPING:
210
+ loss_groups = f"({'|'.join(LOSS_MAPPING)})"
211
+ loss_type = re.findall(loss_groups, self.__class__.__name__)
212
+ if len(loss_type) > 0:
213
+ loss_type = loss_type[0]
214
+ else:
215
+ loss_type = None
216
+ if loss_type is None or loss_type not in LOSS_MAPPING and getattr(self.base_config, "loss_type",
217
+ None) is not None:
218
+ print(
219
+ f"`loss_type={loss_type}` was set in the base_config but it is unrecognised."
220
+ f"Using the default loss: `ForCausalLMLoss`."
221
+ )
222
+ loss_type = "ForCausalLM"
223
+ return LOSS_MAPPING[loss_type]
224
+
225
+ def save_pretrained(
226
+ self,
227
+ save_directory: Union[str, os.PathLike],
228
+ is_main_process: bool = True,
229
+ state_dict: Optional[dict] = None,
230
+ safe_serialization: bool = False,
231
+ **kwargs,
232
+ ):
233
+
234
+ if safe_serialization:
235
+ raise NotImplementedError("`safe_serialization` is not implemented yet.")
236
+
237
+ if os.path.isfile(save_directory):
238
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
239
+ return
240
+ os.makedirs(save_directory, exist_ok=True)
241
+ # Save the config
242
+ if is_main_process:
243
+ self.base_config.save_pretrained(save_directory)
244
+
245
+ # Save the model
246
+ if state_dict is None:
247
+ # Only save the model itself if we are using distributed training
248
+ model_to_save = unwrap_model(self)
249
+ state_dict = model_to_save.state_dict()
250
+
251
+ weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
252
+ torch.save(state_dict, os.path.join(save_directory, weights_name))
253
+
254
+ # find the file where NovoMolGen is defined
255
+ src = inspect.getsourcefile(type(self))
256
+ if src:
257
+ dst = os.path.join(save_directory, os.path.basename(src))
258
+ shutil.copy(src, dst)
259
+
260
+ @classmethod
261
+ def from_pretrained(
262
+ cls,
263
+ pretrained_model_name_or_path,
264
+ checkpoint_path: str = "",
265
+ config: Optional[Union[NovoMolGenConfig, str, os.PathLike]] = None,
266
+ **kwargs,
267
+ ):
268
+ if config is None:
269
+ config = NovoMolGenConfig.from_pretrained(pretrained_model_name_or_path, checkpoint_path=checkpoint_path)
270
+ model = cls(config)
271
+
272
+ if os.path.exists(pretrained_model_name_or_path):
273
+ state_dict = torch.load(os.path.join(pretrained_model_name_or_path, checkpoint_path, WEIGHTS_NAME))
274
+ else:
275
+ state_dict = state_dict_from_pretrained(pretrained_model_name_or_path, checkpoint_path=checkpoint_path)
276
+ model.load_state_dict(state_dict)
277
+ return model
278
+
279
+ def sample(
280
+ self,
281
+ tokenizer,
282
+ batch_size: int = 4,
283
+ max_length: int = 64,
284
+ temperature: float = 1.0,
285
+ top_k: int = 50,
286
+ top_p: float = 0.95,
287
+ device: torch.device = torch.device("cuda"),
288
+ ):
289
+ """
290
+ Generate a batch of sequences from the model.
291
+
292
+ Returns a dictionary with two keys:
293
+ {
294
+ "<mol_type>": <list of raw sequences in that moltype>,
295
+ "sequences": <torch.LongTensor of valid token IDs>
296
+ }
297
+ """
298
+ input_ids = tokenizer.encode("", return_tensors="pt").to(device)
299
+ # Repeat the prompt for the desired batch size
300
+ input_ids = input_ids.repeat_interleave(batch_size, dim=0)
301
+ # If the tokenizer includes an EOS token for an empty prompt, we remove it.
302
+ if input_ids.shape[1] > 1:
303
+ input_ids = input_ids[:, :-1]
304
+
305
+ generation_output = self.generate(
306
+ input_ids,
307
+ max_length=max_length,
308
+ temperature=temperature,
309
+ top_k=top_k,
310
+ top_p=top_p,
311
+ eos_token_id=tokenizer.eos_token_id,
312
+ return_dict_in_generate=True,
313
+ )
314
+
315
+ sequences = self._filter_tokens_after_eos(
316
+ generation_output.sequences, eos_id=tokenizer.eos_token_id
317
+ )
318
+
319
+ decoded_strings = tokenizer.batch_decode(sequences, skip_special_tokens=True)
320
+ decoded_strings = [s.replace(" ", "") for s in decoded_strings]
321
+
322
+ result = {
323
+ self.mol_type: decoded_strings,
324
+ "sequences": sequences,
325
+ }
326
+ return result
327
+
328
+ @staticmethod
329
+ def _filter_tokens_after_eos(sequences, eos_id):
330
+ output = copy.deepcopy(sequences)
331
+ for i in range(sequences.size(0)):
332
+ row = sequences[i]
333
+ eos_position = (row == eos_id).nonzero()
334
+ if eos_position.numel() > 0:
335
+ eos_position = eos_position[0, 0].item() # Get the index of the first occurrence
336
+ output[i, eos_position + 1:] = eos_id
337
+ return output
338
+
339
+ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
340
+ # HF’s GenerationMixin would normally do more, but for a basic LM this usually suffices:
341
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bbb07cc62cec4185d2b79ef6e60d1dec5627ad138283fa7063d4c2371959fd9
3
+ size 1211329558
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<bos>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<eos>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
@@ -0,0 +1,2456 @@
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<pad>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<bos>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<eos>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": null,
44
+ "pre_tokenizer": null,
45
+ "post_processor": {
46
+ "type": "TemplateProcessing",
47
+ "single": [
48
+ {
49
+ "SpecialToken": {
50
+ "id": "<bos>",
51
+ "type_id": 0
52
+ }
53
+ },
54
+ {
55
+ "Sequence": {
56
+ "id": "A",
57
+ "type_id": 0
58
+ }
59
+ },
60
+ {
61
+ "SpecialToken": {
62
+ "id": "<eos>",
63
+ "type_id": 0
64
+ }
65
+ }
66
+ ],
67
+ "pair": [
68
+ {
69
+ "Sequence": {
70
+ "id": "A",
71
+ "type_id": 0
72
+ }
73
+ },
74
+ {
75
+ "Sequence": {
76
+ "id": "B",
77
+ "type_id": 1
78
+ }
79
+ }
80
+ ],
81
+ "special_tokens": {
82
+ "<bos>": {
83
+ "id": "<bos>",
84
+ "ids": [
85
+ 2
86
+ ],
87
+ "tokens": [
88
+ "<bos>"
89
+ ]
90
+ },
91
+ "<eos>": {
92
+ "id": "<eos>",
93
+ "ids": [
94
+ 3
95
+ ],
96
+ "tokens": [
97
+ "<eos>"
98
+ ]
99
+ }
100
+ }
101
+ },
102
+ "decoder": {
103
+ "type": "BPEDecoder",
104
+ "suffix": "</w>"
105
+ },
106
+ "model": {
107
+ "type": "BPE",
108
+ "dropout": 0.1,
109
+ "unk_token": "<unk>",
110
+ "continuing_subword_prefix": null,
111
+ "end_of_word_suffix": null,
112
+ "fuse_unk": false,
113
+ "byte_fallback": false,
114
+ "ignore_merges": false,
115
+ "vocab": {
116
+ "<unk>": 0,
117
+ "<pad>": 1,
118
+ "<bos>": 2,
119
+ "<eos>": 3,
120
+ "#": 4,
121
+ "(": 5,
122
+ ")": 6,
123
+ "+": 7,
124
+ "-": 8,
125
+ ".": 9,
126
+ "/": 10,
127
+ "1": 11,
128
+ "2": 12,
129
+ "3": 13,
130
+ "4": 14,
131
+ "5": 15,
132
+ "6": 16,
133
+ "7": 17,
134
+ "8": 18,
135
+ "9": 19,
136
+ "=": 20,
137
+ "@": 21,
138
+ "B": 22,
139
+ "C": 23,
140
+ "F": 24,
141
+ "H": 25,
142
+ "I": 26,
143
+ "N": 27,
144
+ "O": 28,
145
+ "P": 29,
146
+ "S": 30,
147
+ "[": 31,
148
+ "]": 32,
149
+ "c": 33,
150
+ "e": 34,
151
+ "i": 35,
152
+ "l": 36,
153
+ "n": 37,
154
+ "o": 38,
155
+ "r": 39,
156
+ "s": 40,
157
+ "CC": 41,
158
+ "O)": 42,
159
+ "(C": 43,
160
+ "=O)": 44,
161
+ "(=O)": 45,
162
+ "[C": 46,
163
+ "[C@": 47,
164
+ "H]": 48,
165
+ "cc": 49,
166
+ ")C": 50,
167
+ "[C@H]": 51,
168
+ "[C@@": 52,
169
+ "c1": 53,
170
+ "C(=O)": 54,
171
+ "[C@@H]": 55,
172
+ "(C)": 56,
173
+ "(=O)N": 57,
174
+ "NC(=O)": 58,
175
+ "CCC": 59,
176
+ "c2": 60,
177
+ "N(C": 61,
178
+ "F)": 62,
179
+ "1CC": 63,
180
+ "c1cc": 64,
181
+ "2)": 65,
182
+ "1C": 66,
183
+ "(C)C": 67,
184
+ "N(C(=O)": 68,
185
+ "(F)": 69,
186
+ "(C(=O)N": 70,
187
+ "cn": 71,
188
+ "2CC": 72,
189
+ "(CC": 73,
190
+ ")C1": 74,
191
+ "2C": 75,
192
+ "c3": 76,
193
+ "CC1": 77,
194
+ "OC": 78,
195
+ "C(=O)N": 79,
196
+ "=C": 80,
197
+ "c2cc": 81,
198
+ "c(": 82,
199
+ "NC(=O)C": 83,
200
+ "n1": 84,
201
+ "[C@H](": 85,
202
+ "[n": 86,
203
+ "[nH]": 87,
204
+ "[C@@H](": 88,
205
+ "2)C1": 89,
206
+ "CC(=O)N": 90,
207
+ "3)": 91,
208
+ "CO": 92,
209
+ "1CCC": 93,
210
+ "c(C": 94,
211
+ "C[C@H]": 95,
212
+ "c1n": 96,
213
+ "C[C@@H]": 97,
214
+ "l)": 98,
215
+ "[C@]": 99,
216
+ "N(C)C": 100,
217
+ "CCCC": 101,
218
+ "[C@@]": 102,
219
+ "(N": 103,
220
+ "Cc1cc": 104,
221
+ "2CCC": 105,
222
+ "cc1": 106,
223
+ "nn": 107,
224
+ "[C@H]1C": 108,
225
+ ")C2)": 109,
226
+ "CC(C)": 110,
227
+ "CCO": 111,
228
+ "O=C": 112,
229
+ "N(C(=O)C": 113,
230
+ "O)C": 114,
231
+ "c3cc": 115,
232
+ "c(C(=O)N": 116,
233
+ "(CNC(=O)": 117,
234
+ "[C@@H]1C": 118,
235
+ "CCN": 119,
236
+ "1)": 120,
237
+ "[C@H]2C": 121,
238
+ "c(C)": 122,
239
+ "ccc1": 123,
240
+ "c1cn": 124,
241
+ "3CC": 125,
242
+ "F)C": 126,
243
+ "[C@H](C)": 127,
244
+ "(C)(C)C": 128,
245
+ "2)C": 129,
246
+ "[C@@H]2C": 130,
247
+ "[C@@H](C)": 131,
248
+ "Br": 132,
249
+ "(F)(F)": 133,
250
+ "[C@H]1": 134,
251
+ "3C": 135,
252
+ "cs": 136,
253
+ "c(Cl)": 137,
254
+ "1)C": 138,
255
+ "N(C)C(=O)": 139,
256
+ "[C@H]1CC": 140,
257
+ "(O)": 141,
258
+ "c(F)": 142,
259
+ "(CC(=O)N": 143,
260
+ ")CC": 144,
261
+ "[C@@H]1CC": 145,
262
+ "c2cccc": 146,
263
+ "1C(=O)": 147,
264
+ "[C@H](C": 148,
265
+ "C=C": 149,
266
+ "=CC": 150,
267
+ "CCC1": 151,
268
+ "C1": 152,
269
+ "ncc": 153,
270
+ "[C@@H]1": 154,
271
+ "[C@@H](C": 155,
272
+ "[C@H]2CC": 156,
273
+ "Br)": 157,
274
+ "(NC(=O)": 158,
275
+ "OCC": 159,
276
+ "CN": 160,
277
+ "[C@H](C)C": 161,
278
+ "3)C": 162,
279
+ "3)C2)": 163,
280
+ "c4": 164,
281
+ "c2cn": 165,
282
+ "Cc1n": 166,
283
+ "(C)CC": 167,
284
+ "2)CC1": 168,
285
+ "S(=O)": 169,
286
+ ")C(=O)N": 170,
287
+ "(Cl)": 171,
288
+ "=N": 172,
289
+ "[C@H](NC(=O)": 173,
290
+ "[C@]1": 174,
291
+ "c1C": 175,
292
+ "[C@@H](C)C": 176,
293
+ "O=C(N": 177,
294
+ "(C)C)": 178,
295
+ "[C@H](C(=O)N": 179,
296
+ "[C@@H](NC(=O)": 180,
297
+ "(N)": 181,
298
+ "c2)": 182,
299
+ "C#": 183,
300
+ "[C@@H]2CC": 184,
301
+ "C(=O)N1CC": 185,
302
+ "c[nH]": 186,
303
+ "[C@H]2": 187,
304
+ "[C@@]1": 188,
305
+ "c1cccc": 189,
306
+ "n1cc": 190,
307
+ "c(O)": 191,
308
+ "OC)": 192,
309
+ "#N": 193,
310
+ "co": 194,
311
+ "nc1": 195,
312
+ "[C@@H](C(=O)N": 196,
313
+ "3CCC": 197,
314
+ "NC": 198,
315
+ "n2": 199,
316
+ "(C)(C)": 200,
317
+ "SC": 201,
318
+ "c1ccccc1": 202,
319
+ "[C@@H]2": 203,
320
+ "c2C": 204,
321
+ "(CCC": 205,
322
+ "c2n": 206,
323
+ "[N": 207,
324
+ "N1C(=O)": 208,
325
+ "(F)F)": 209,
326
+ "COC": 210,
327
+ "NC(=O)c1cc": 211,
328
+ "N(C(=O)c2cc": 212,
329
+ "3)CC": 213,
330
+ "N1CC": 214,
331
+ "S(=O)(=O)": 215,
332
+ "Cc1": 216,
333
+ "(F)(F)F)": 217,
334
+ "2C(=O)": 218,
335
+ "2(CC": 219,
336
+ "(F)C": 220,
337
+ "2(C": 221,
338
+ "=O": 222,
339
+ "cc2": 223,
340
+ "+]": 224,
341
+ "COc1cc": 225,
342
+ "-]": 226,
343
+ "[C@H](O)": 227,
344
+ "F)C1": 228,
345
+ "n(C)": 229,
346
+ "1CC1": 230,
347
+ "[C@@H]1CCC": 231,
348
+ "[N+]": 232,
349
+ "c(N": 233,
350
+ "cn1": 234,
351
+ "c3cccc": 235,
352
+ "[C@H]3": 236,
353
+ "C[C@@H]1C": 237,
354
+ ")C(=O)": 238,
355
+ "(N)=O)": 239,
356
+ "[C@H]1CCC": 240,
357
+ "c1C(=O)N": 241,
358
+ "(C(=O)N2CC": 242,
359
+ "CC(=O)N1CC": 243,
360
+ "CS": 244,
361
+ "c(Br)": 245,
362
+ "[C@@H]3": 246,
363
+ "s1": 247,
364
+ "C[C@H]1C": 248,
365
+ "2CC2)": 249,
366
+ "[C@H](CC": 250,
367
+ "c1ncc": 251,
368
+ "nn1": 252,
369
+ "[C@@H](O)": 253,
370
+ "N(C(=O)CC": 254,
371
+ "[C@H](CNC(=O)": 255,
372
+ "[O": 256,
373
+ "[O-]": 257,
374
+ "N2C(=O)": 258,
375
+ "COC(=O)": 259,
376
+ "[C@](C)": 260,
377
+ "C=": 261,
378
+ "[C@H](O)C": 262,
379
+ "c2)C1": 263,
380
+ "o1": 264,
381
+ "3)C1": 265,
382
+ "NC(=O)C1": 266,
383
+ "CCN1C(=O)": 267,
384
+ "O=C(C": 268,
385
+ "#N)": 269,
386
+ "[C@@](C)": 270,
387
+ "[C@@H](O)C": 271,
388
+ "(CO)": 272,
389
+ "[C@@H](CC": 273,
390
+ "(=O)[O-]": 274,
391
+ "[N+](=O)[O-]": 275,
392
+ "c1)": 276,
393
+ "CCC(=O)N": 277,
394
+ "4)C": 278,
395
+ "[C@]2": 279,
396
+ "(Br)": 280,
397
+ "CC2": 281,
398
+ "(=O)=O)": 282,
399
+ "[nH]1": 283,
400
+ "cc1C": 284,
401
+ "O1": 285,
402
+ "[C@@]2": 286,
403
+ "C=CC": 287,
404
+ "N(": 288,
405
+ "c(-": 289,
406
+ "c3cn": 290,
407
+ "[C@H]3C": 291,
408
+ "CC(C)(C)C": 292,
409
+ "(NC(=O)C": 293,
410
+ "[C@H](NC(=O)C": 294,
411
+ "cc1C(=O)N": 295,
412
+ "2)CC": 296,
413
+ "/C=C": 297,
414
+ "2)c1": 298,
415
+ "(CC)": 299,
416
+ "/C=C/": 300,
417
+ "[C@@H]2CCC": 301,
418
+ "[C@H](C)C(=O)N": 302,
419
+ "c(=N": 303,
420
+ "[C@@H](CNC(=O)": 304,
421
+ "NC(=O)CC": 305,
422
+ "(C(=O)NC": 306,
423
+ "[C@@H](NC(=O)C": 307,
424
+ "[C@H]2CCC": 308,
425
+ "4)": 309,
426
+ "CC(C)(C)": 310,
427
+ "c2ncc": 311,
428
+ "c2ccccc2": 312,
429
+ "(CNC(=O)C": 313,
430
+ "N=": 314,
431
+ "(F)(F)C": 315,
432
+ "N(CC": 316,
433
+ "CC(C)C": 317,
434
+ "OC1": 318,
435
+ "cccc": 319,
436
+ "n3": 320,
437
+ "CCOCC": 321,
438
+ "(-": 322,
439
+ "(F)F": 323,
440
+ ")C2": 324,
441
+ "c3C": 325,
442
+ "C[C@@H]1CC": 326,
443
+ "c(C(=O)N2CC": 327,
444
+ "CC1)": 328,
445
+ "Cc1cn": 329,
446
+ "CCc1cc": 330,
447
+ "[C@H]3CC": 331,
448
+ "c(F)cc": 332,
449
+ "c3n": 333,
450
+ "[C@@H](C)C(=O)N": 334,
451
+ "C[C@H]1CC": 335,
452
+ "[N+](=O)[O-])": 336,
453
+ "n2cc": 337,
454
+ "(C)(C)C)": 338,
455
+ "CCN(C(=O)": 339,
456
+ "[C@@H]3C": 340,
457
+ "(C)CCC": 341,
458
+ "C(=O)NC": 342,
459
+ "ccc2": 343,
460
+ "nc2": 344,
461
+ "=CC(=O)N": 345,
462
+ "3(CC": 346,
463
+ "4CC": 347,
464
+ "1)N": 348,
465
+ "S(=O)(=O)N": 349,
466
+ "N2CC": 350,
467
+ "(O)C": 351,
468
+ "cnc1": 352,
469
+ "S(C)": 353,
470
+ "1)C2": 354,
471
+ "CCNC(=O)": 355,
472
+ "l)C": 356,
473
+ "Cn1": 357,
474
+ "N(C(=O)c3cc": 358,
475
+ "[C@H]1C(=O)N": 359,
476
+ "n2)": 360,
477
+ "NC(=O)c2cc": 361,
478
+ "c3ccccc3": 362,
479
+ "3CC3)": 363,
480
+ "c2nn": 364,
481
+ "2)C1)": 365,
482
+ "[C@@H]1C(=O)N": 366,
483
+ "(F)(F)F": 367,
484
+ "COCC": 368,
485
+ "CC2)": 369,
486
+ "CCCCC": 370,
487
+ "c1cs": 371,
488
+ "C[C@]1": 372,
489
+ "3)C2)C1": 373,
490
+ "n1)": 374,
491
+ "(C)=O)": 375,
492
+ "O=C(": 376,
493
+ "CCN(CC": 377,
494
+ "CCS": 378,
495
+ "cs1": 379,
496
+ "F)C(=O)N": 380,
497
+ "c(C)n": 381,
498
+ "N(C)": 382,
499
+ "[C@@H]3CC": 383,
500
+ "S(C)(=O)=O)": 384,
501
+ "N(C)C)": 385,
502
+ "CC1(C)C": 386,
503
+ "OCC(=O)N": 387,
504
+ "N(C(=O)C2": 388,
505
+ "OC(C)(C)C": 389,
506
+ "CCc1n": 390,
507
+ "c4cc": 391,
508
+ "(C(=O)N2C": 392,
509
+ "N(C(=O)c2": 393,
510
+ "N1CCC": 394,
511
+ "C[C@H]2C": 395,
512
+ "c1ccc(": 396,
513
+ "C1CC": 397,
514
+ "3(C": 398,
515
+ "n1C": 399,
516
+ "c1ccn": 400,
517
+ "CCO1": 401,
518
+ "N1C": 402,
519
+ "Cc1cccc": 403,
520
+ "CCOC": 404,
521
+ "c(Cl)cc": 405,
522
+ "=CCC": 406,
523
+ "c2c(": 407,
524
+ "2)cc1": 408,
525
+ "[C@H](CNC(=O)C": 409,
526
+ "N3": 410,
527
+ "N(C(=O)c2cn": 411,
528
+ "cnc2": 412,
529
+ "1)C(=O)N": 413,
530
+ "c(C(=O)NC": 414,
531
+ "NC(=O)c1cn": 415,
532
+ "Cn1cc": 416,
533
+ "[C@H](CC(=O)N": 417,
534
+ "c1[nH]": 418,
535
+ "OC)C": 419,
536
+ "C[C@@]1": 420,
537
+ "n2)C1": 421,
538
+ "cc2C": 422,
539
+ "O)C1": 423,
540
+ "2)n1": 424,
541
+ "(C)C)C": 425,
542
+ "(C(=O)N2CCC": 426,
543
+ "O=C(CC": 427,
544
+ "C#CC": 428,
545
+ "1)C(=O)": 429,
546
+ "OC)C(=O)N": 430,
547
+ "CN(CC": 431,
548
+ "1CCCC1": 432,
549
+ "c1ccc(F)": 433,
550
+ "c(CC(=O)N": 434,
551
+ "CC(=O)N1C": 435,
552
+ "c3)": 436,
553
+ "4C": 437,
554
+ "c1ccc(C": 438,
555
+ "CCC2": 439,
556
+ "c3ncc": 440,
557
+ "CCOC(=O)": 441,
558
+ "c(N)": 442,
559
+ "CC(=O)": 443,
560
+ "c(=O)": 444,
561
+ "CN1CC": 445,
562
+ "Cc1ncc": 446,
563
+ "c1Cl": 447,
564
+ "CCn1": 448,
565
+ "CC[C@H](C)": 449,
566
+ "C[C@@H]2C": 450,
567
+ "c(CC": 451,
568
+ "c(OC)": 452,
569
+ "(OC)": 453,
570
+ "NC(=O)c1": 454,
571
+ "CC(=O)N1CCC": 455,
572
+ "NC(=O)N": 456,
573
+ "C(=O)N1C": 457,
574
+ "CN(C": 458,
575
+ "C[C@H](C": 459,
576
+ ")CC1": 460,
577
+ "c2ccccc2)": 461,
578
+ "N#": 462,
579
+ "(C)(C)C(=O)N": 463,
580
+ "2CCCC": 464,
581
+ "3)CC1": 465,
582
+ "ncn": 466,
583
+ "3)CC2)": 467,
584
+ "(C#N)": 468,
585
+ "C[C@@H](C": 469,
586
+ "c(=N)": 470,
587
+ "(C)C)C1": 471,
588
+ "(F)cc": 472,
589
+ "[C@@H](CNC(=O)C": 473,
590
+ "[C@H]1CN(C(=O)": 474,
591
+ "c1ccc(Cl)": 475,
592
+ "C[C@H]1": 476,
593
+ "[C@@H]1CN(C(=O)": 477,
594
+ "(F)(": 478,
595
+ "CC1CC": 479,
596
+ "Cc1cc(C(=O)N": 480,
597
+ "C[C@@H]1": 481,
598
+ "c(C)c1": 482,
599
+ "C(=O)N1CCC": 483,
600
+ "c(C(=O)N2C": 484,
601
+ "CCCO": 485,
602
+ "4)CC": 486,
603
+ "(=N": 487,
604
+ "2CC2)C1": 488,
605
+ "(CCNC(=O)": 489,
606
+ "n[nH]": 490,
607
+ "cc3": 491,
608
+ "(O)CC": 492,
609
+ "2(CCC": 493,
610
+ "[S": 494,
611
+ "S(N)": 495,
612
+ "ccc1C": 496,
613
+ "N(C)C(=O)C": 497,
614
+ "F)C2)": 498,
615
+ "(F)F)C": 499
616
+ },
617
+ "merges": [
618
+ [
619
+ "C",
620
+ "C"
621
+ ],
622
+ [
623
+ "O",
624
+ ")"
625
+ ],
626
+ [
627
+ "(",
628
+ "C"
629
+ ],
630
+ [
631
+ "=",
632
+ "O)"
633
+ ],
634
+ [
635
+ "(",
636
+ "=O)"
637
+ ],
638
+ [
639
+ "[",
640
+ "C"
641
+ ],
642
+ [
643
+ "[C",
644
+ "@"
645
+ ],
646
+ [
647
+ "H",
648
+ "]"
649
+ ],
650
+ [
651
+ "c",
652
+ "c"
653
+ ],
654
+ [
655
+ ")",
656
+ "C"
657
+ ],
658
+ [
659
+ "[C@",
660
+ "H]"
661
+ ],
662
+ [
663
+ "[C@",
664
+ "@"
665
+ ],
666
+ [
667
+ "c",
668
+ "1"
669
+ ],
670
+ [
671
+ "C",
672
+ "(=O)"
673
+ ],
674
+ [
675
+ "[C@@",
676
+ "H]"
677
+ ],
678
+ [
679
+ "(C",
680
+ ")"
681
+ ],
682
+ [
683
+ "(=O)",
684
+ "N"
685
+ ],
686
+ [
687
+ "N",
688
+ "C(=O)"
689
+ ],
690
+ [
691
+ "CC",
692
+ "C"
693
+ ],
694
+ [
695
+ "c",
696
+ "2"
697
+ ],
698
+ [
699
+ "N",
700
+ "(C"
701
+ ],
702
+ [
703
+ "F",
704
+ ")"
705
+ ],
706
+ [
707
+ "1",
708
+ "CC"
709
+ ],
710
+ [
711
+ "c1",
712
+ "cc"
713
+ ],
714
+ [
715
+ "2",
716
+ ")"
717
+ ],
718
+ [
719
+ "1",
720
+ "C"
721
+ ],
722
+ [
723
+ "(C",
724
+ ")C"
725
+ ],
726
+ [
727
+ "N(C",
728
+ "(=O)"
729
+ ],
730
+ [
731
+ "(",
732
+ "F)"
733
+ ],
734
+ [
735
+ "(C",
736
+ "(=O)N"
737
+ ],
738
+ [
739
+ "c",
740
+ "n"
741
+ ],
742
+ [
743
+ "2",
744
+ "CC"
745
+ ],
746
+ [
747
+ "(",
748
+ "CC"
749
+ ],
750
+ [
751
+ ")C",
752
+ "1"
753
+ ],
754
+ [
755
+ "2",
756
+ "C"
757
+ ],
758
+ [
759
+ "c",
760
+ "3"
761
+ ],
762
+ [
763
+ "CC",
764
+ "1"
765
+ ],
766
+ [
767
+ "O",
768
+ "C"
769
+ ],
770
+ [
771
+ "C(=O)",
772
+ "N"
773
+ ],
774
+ [
775
+ "=",
776
+ "C"
777
+ ],
778
+ [
779
+ "c2",
780
+ "cc"
781
+ ],
782
+ [
783
+ "c",
784
+ "("
785
+ ],
786
+ [
787
+ "NC(=O)",
788
+ "C"
789
+ ],
790
+ [
791
+ "n",
792
+ "1"
793
+ ],
794
+ [
795
+ "[C@H]",
796
+ "("
797
+ ],
798
+ [
799
+ "[",
800
+ "n"
801
+ ],
802
+ [
803
+ "[n",
804
+ "H]"
805
+ ],
806
+ [
807
+ "[C@@H]",
808
+ "("
809
+ ],
810
+ [
811
+ "2",
812
+ ")C1"
813
+ ],
814
+ [
815
+ "CC",
816
+ "(=O)N"
817
+ ],
818
+ [
819
+ "3",
820
+ ")"
821
+ ],
822
+ [
823
+ "C",
824
+ "O"
825
+ ],
826
+ [
827
+ "1",
828
+ "CCC"
829
+ ],
830
+ [
831
+ "c",
832
+ "(C"
833
+ ],
834
+ [
835
+ "C",
836
+ "[C@H]"
837
+ ],
838
+ [
839
+ "c1",
840
+ "n"
841
+ ],
842
+ [
843
+ "C",
844
+ "[C@@H]"
845
+ ],
846
+ [
847
+ "l",
848
+ ")"
849
+ ],
850
+ [
851
+ "[C@",
852
+ "]"
853
+ ],
854
+ [
855
+ "N(C",
856
+ ")C"
857
+ ],
858
+ [
859
+ "CC",
860
+ "CC"
861
+ ],
862
+ [
863
+ "[C@@",
864
+ "]"
865
+ ],
866
+ [
867
+ "(",
868
+ "N"
869
+ ],
870
+ [
871
+ "C",
872
+ "c1cc"
873
+ ],
874
+ [
875
+ "2",
876
+ "CCC"
877
+ ],
878
+ [
879
+ "cc",
880
+ "1"
881
+ ],
882
+ [
883
+ "n",
884
+ "n"
885
+ ],
886
+ [
887
+ "[C@H]",
888
+ "1C"
889
+ ],
890
+ [
891
+ ")C",
892
+ "2)"
893
+ ],
894
+ [
895
+ "CC",
896
+ "(C)"
897
+ ],
898
+ [
899
+ "CC",
900
+ "O"
901
+ ],
902
+ [
903
+ "O",
904
+ "=C"
905
+ ],
906
+ [
907
+ "N(C(=O)",
908
+ "C"
909
+ ],
910
+ [
911
+ "O)",
912
+ "C"
913
+ ],
914
+ [
915
+ "c3",
916
+ "cc"
917
+ ],
918
+ [
919
+ "c",
920
+ "(C(=O)N"
921
+ ],
922
+ [
923
+ "(C",
924
+ "NC(=O)"
925
+ ],
926
+ [
927
+ "[C@@H]",
928
+ "1C"
929
+ ],
930
+ [
931
+ "CC",
932
+ "N"
933
+ ],
934
+ [
935
+ "1",
936
+ ")"
937
+ ],
938
+ [
939
+ "[C@H]",
940
+ "2C"
941
+ ],
942
+ [
943
+ "c",
944
+ "(C)"
945
+ ],
946
+ [
947
+ "cc",
948
+ "c1"
949
+ ],
950
+ [
951
+ "c1",
952
+ "cn"
953
+ ],
954
+ [
955
+ "3",
956
+ "CC"
957
+ ],
958
+ [
959
+ "F",
960
+ ")C"
961
+ ],
962
+ [
963
+ "[C@H]",
964
+ "(C)"
965
+ ],
966
+ [
967
+ "(C)",
968
+ "(C)C"
969
+ ],
970
+ [
971
+ "2",
972
+ ")C"
973
+ ],
974
+ [
975
+ "[C@@H]",
976
+ "2C"
977
+ ],
978
+ [
979
+ "[C@@H]",
980
+ "(C)"
981
+ ],
982
+ [
983
+ "B",
984
+ "r"
985
+ ],
986
+ [
987
+ "(F)",
988
+ "(F)"
989
+ ],
990
+ [
991
+ "[C@H]",
992
+ "1"
993
+ ],
994
+ [
995
+ "3",
996
+ "C"
997
+ ],
998
+ [
999
+ "c",
1000
+ "s"
1001
+ ],
1002
+ [
1003
+ "c(C",
1004
+ "l)"
1005
+ ],
1006
+ [
1007
+ "1",
1008
+ ")C"
1009
+ ],
1010
+ [
1011
+ "N(C)C",
1012
+ "(=O)"
1013
+ ],
1014
+ [
1015
+ "[C@H]",
1016
+ "1CC"
1017
+ ],
1018
+ [
1019
+ "(",
1020
+ "O)"
1021
+ ],
1022
+ [
1023
+ "c",
1024
+ "(F)"
1025
+ ],
1026
+ [
1027
+ "(CC",
1028
+ "(=O)N"
1029
+ ],
1030
+ [
1031
+ ")",
1032
+ "CC"
1033
+ ],
1034
+ [
1035
+ "[C@@H]",
1036
+ "1CC"
1037
+ ],
1038
+ [
1039
+ "c2cc",
1040
+ "cc"
1041
+ ],
1042
+ [
1043
+ "1",
1044
+ "C(=O)"
1045
+ ],
1046
+ [
1047
+ "[C@H]",
1048
+ "(C"
1049
+ ],
1050
+ [
1051
+ "C",
1052
+ "=C"
1053
+ ],
1054
+ [
1055
+ "=",
1056
+ "CC"
1057
+ ],
1058
+ [
1059
+ "CCC",
1060
+ "1"
1061
+ ],
1062
+ [
1063
+ "C",
1064
+ "1"
1065
+ ],
1066
+ [
1067
+ "n",
1068
+ "cc"
1069
+ ],
1070
+ [
1071
+ "[C@@H]",
1072
+ "1"
1073
+ ],
1074
+ [
1075
+ "[C@@H]",
1076
+ "(C"
1077
+ ],
1078
+ [
1079
+ "[C@H]",
1080
+ "2CC"
1081
+ ],
1082
+ [
1083
+ "Br",
1084
+ ")"
1085
+ ],
1086
+ [
1087
+ "(",
1088
+ "NC(=O)"
1089
+ ],
1090
+ [
1091
+ "O",
1092
+ "CC"
1093
+ ],
1094
+ [
1095
+ "C",
1096
+ "N"
1097
+ ],
1098
+ [
1099
+ "[C@H]",
1100
+ "(C)C"
1101
+ ],
1102
+ [
1103
+ "3",
1104
+ ")C"
1105
+ ],
1106
+ [
1107
+ "3",
1108
+ ")C2)"
1109
+ ],
1110
+ [
1111
+ "c",
1112
+ "4"
1113
+ ],
1114
+ [
1115
+ "c2",
1116
+ "cn"
1117
+ ],
1118
+ [
1119
+ "C",
1120
+ "c1n"
1121
+ ],
1122
+ [
1123
+ "(C)",
1124
+ "CC"
1125
+ ],
1126
+ [
1127
+ "2)",
1128
+ "CC1"
1129
+ ],
1130
+ [
1131
+ "S",
1132
+ "(=O)"
1133
+ ],
1134
+ [
1135
+ ")C",
1136
+ "(=O)N"
1137
+ ],
1138
+ [
1139
+ "(C",
1140
+ "l)"
1141
+ ],
1142
+ [
1143
+ "=",
1144
+ "N"
1145
+ ],
1146
+ [
1147
+ "[C@H](",
1148
+ "NC(=O)"
1149
+ ],
1150
+ [
1151
+ "[C@]",
1152
+ "1"
1153
+ ],
1154
+ [
1155
+ "c1",
1156
+ "C"
1157
+ ],
1158
+ [
1159
+ "[C@@H]",
1160
+ "(C)C"
1161
+ ],
1162
+ [
1163
+ "O=C",
1164
+ "(N"
1165
+ ],
1166
+ [
1167
+ "(C)C",
1168
+ ")"
1169
+ ],
1170
+ [
1171
+ "[C@H]",
1172
+ "(C(=O)N"
1173
+ ],
1174
+ [
1175
+ "[C@@H](",
1176
+ "NC(=O)"
1177
+ ],
1178
+ [
1179
+ "(N",
1180
+ ")"
1181
+ ],
1182
+ [
1183
+ "c2",
1184
+ ")"
1185
+ ],
1186
+ [
1187
+ "C",
1188
+ "#"
1189
+ ],
1190
+ [
1191
+ "[C@@H]",
1192
+ "2CC"
1193
+ ],
1194
+ [
1195
+ "C(=O)N",
1196
+ "1CC"
1197
+ ],
1198
+ [
1199
+ "c",
1200
+ "[nH]"
1201
+ ],
1202
+ [
1203
+ "[C@H]",
1204
+ "2"
1205
+ ],
1206
+ [
1207
+ "[C@@]",
1208
+ "1"
1209
+ ],
1210
+ [
1211
+ "c1cc",
1212
+ "cc"
1213
+ ],
1214
+ [
1215
+ "n1",
1216
+ "cc"
1217
+ ],
1218
+ [
1219
+ "c(",
1220
+ "O)"
1221
+ ],
1222
+ [
1223
+ "OC",
1224
+ ")"
1225
+ ],
1226
+ [
1227
+ "#",
1228
+ "N"
1229
+ ],
1230
+ [
1231
+ "c",
1232
+ "o"
1233
+ ],
1234
+ [
1235
+ "n",
1236
+ "c1"
1237
+ ],
1238
+ [
1239
+ "[C@@H]",
1240
+ "(C(=O)N"
1241
+ ],
1242
+ [
1243
+ "3",
1244
+ "CCC"
1245
+ ],
1246
+ [
1247
+ "N",
1248
+ "C"
1249
+ ],
1250
+ [
1251
+ "n",
1252
+ "2"
1253
+ ],
1254
+ [
1255
+ "(C)",
1256
+ "(C)"
1257
+ ],
1258
+ [
1259
+ "S",
1260
+ "C"
1261
+ ],
1262
+ [
1263
+ "c1cc",
1264
+ "ccc1"
1265
+ ],
1266
+ [
1267
+ "[C@@H]",
1268
+ "2"
1269
+ ],
1270
+ [
1271
+ "c2",
1272
+ "C"
1273
+ ],
1274
+ [
1275
+ "(",
1276
+ "CCC"
1277
+ ],
1278
+ [
1279
+ "c2",
1280
+ "n"
1281
+ ],
1282
+ [
1283
+ "[",
1284
+ "N"
1285
+ ],
1286
+ [
1287
+ "N",
1288
+ "1C(=O)"
1289
+ ],
1290
+ [
1291
+ "(F)",
1292
+ "F)"
1293
+ ],
1294
+ [
1295
+ "C",
1296
+ "OC"
1297
+ ],
1298
+ [
1299
+ "NC(=O)",
1300
+ "c1cc"
1301
+ ],
1302
+ [
1303
+ "N(C(=O)",
1304
+ "c2cc"
1305
+ ],
1306
+ [
1307
+ "3)",
1308
+ "CC"
1309
+ ],
1310
+ [
1311
+ "N",
1312
+ "1CC"
1313
+ ],
1314
+ [
1315
+ "S(=O)",
1316
+ "(=O)"
1317
+ ],
1318
+ [
1319
+ "C",
1320
+ "c1"
1321
+ ],
1322
+ [
1323
+ "(F)(F)",
1324
+ "F)"
1325
+ ],
1326
+ [
1327
+ "2",
1328
+ "C(=O)"
1329
+ ],
1330
+ [
1331
+ "2",
1332
+ "(CC"
1333
+ ],
1334
+ [
1335
+ "(",
1336
+ "F)C"
1337
+ ],
1338
+ [
1339
+ "2",
1340
+ "(C"
1341
+ ],
1342
+ [
1343
+ "=",
1344
+ "O"
1345
+ ],
1346
+ [
1347
+ "cc",
1348
+ "2"
1349
+ ],
1350
+ [
1351
+ "+",
1352
+ "]"
1353
+ ],
1354
+ [
1355
+ "CO",
1356
+ "c1cc"
1357
+ ],
1358
+ [
1359
+ "-",
1360
+ "]"
1361
+ ],
1362
+ [
1363
+ "[C@H](",
1364
+ "O)"
1365
+ ],
1366
+ [
1367
+ "F",
1368
+ ")C1"
1369
+ ],
1370
+ [
1371
+ "n",
1372
+ "(C)"
1373
+ ],
1374
+ [
1375
+ "1CC",
1376
+ "1"
1377
+ ],
1378
+ [
1379
+ "[C@@H]",
1380
+ "1CCC"
1381
+ ],
1382
+ [
1383
+ "[N",
1384
+ "+]"
1385
+ ],
1386
+ [
1387
+ "c(",
1388
+ "N"
1389
+ ],
1390
+ [
1391
+ "cn",
1392
+ "1"
1393
+ ],
1394
+ [
1395
+ "c3cc",
1396
+ "cc"
1397
+ ],
1398
+ [
1399
+ "[C@H]",
1400
+ "3"
1401
+ ],
1402
+ [
1403
+ "C[C@@H]",
1404
+ "1C"
1405
+ ],
1406
+ [
1407
+ ")C",
1408
+ "(=O)"
1409
+ ],
1410
+ [
1411
+ "(N)",
1412
+ "=O)"
1413
+ ],
1414
+ [
1415
+ "[C@H]",
1416
+ "1CCC"
1417
+ ],
1418
+ [
1419
+ "c1",
1420
+ "C(=O)N"
1421
+ ],
1422
+ [
1423
+ "(C(=O)N",
1424
+ "2CC"
1425
+ ],
1426
+ [
1427
+ "CC(=O)N",
1428
+ "1CC"
1429
+ ],
1430
+ [
1431
+ "C",
1432
+ "S"
1433
+ ],
1434
+ [
1435
+ "c(",
1436
+ "Br)"
1437
+ ],
1438
+ [
1439
+ "[C@@H]",
1440
+ "3"
1441
+ ],
1442
+ [
1443
+ "s",
1444
+ "1"
1445
+ ],
1446
+ [
1447
+ "C[C@H]",
1448
+ "1C"
1449
+ ],
1450
+ [
1451
+ "2CC",
1452
+ "2)"
1453
+ ],
1454
+ [
1455
+ "[C@H]",
1456
+ "(CC"
1457
+ ],
1458
+ [
1459
+ "c1n",
1460
+ "cc"
1461
+ ],
1462
+ [
1463
+ "n",
1464
+ "n1"
1465
+ ],
1466
+ [
1467
+ "[C@@H](",
1468
+ "O)"
1469
+ ],
1470
+ [
1471
+ "N(C(=O)",
1472
+ "CC"
1473
+ ],
1474
+ [
1475
+ "[C@H]",
1476
+ "(CNC(=O)"
1477
+ ],
1478
+ [
1479
+ "[",
1480
+ "O"
1481
+ ],
1482
+ [
1483
+ "[O",
1484
+ "-]"
1485
+ ],
1486
+ [
1487
+ "N",
1488
+ "2C(=O)"
1489
+ ],
1490
+ [
1491
+ "CO",
1492
+ "C(=O)"
1493
+ ],
1494
+ [
1495
+ "[C@]",
1496
+ "(C)"
1497
+ ],
1498
+ [
1499
+ "C",
1500
+ "="
1501
+ ],
1502
+ [
1503
+ "[C@H](",
1504
+ "O)C"
1505
+ ],
1506
+ [
1507
+ "c2",
1508
+ ")C1"
1509
+ ],
1510
+ [
1511
+ "o",
1512
+ "1"
1513
+ ],
1514
+ [
1515
+ "3",
1516
+ ")C1"
1517
+ ],
1518
+ [
1519
+ "NC(=O)C",
1520
+ "1"
1521
+ ],
1522
+ [
1523
+ "CCN",
1524
+ "1C(=O)"
1525
+ ],
1526
+ [
1527
+ "O=C",
1528
+ "(C"
1529
+ ],
1530
+ [
1531
+ "#N",
1532
+ ")"
1533
+ ],
1534
+ [
1535
+ "[C@@]",
1536
+ "(C)"
1537
+ ],
1538
+ [
1539
+ "[C@@H](",
1540
+ "O)C"
1541
+ ],
1542
+ [
1543
+ "(C",
1544
+ "O)"
1545
+ ],
1546
+ [
1547
+ "[C@@H]",
1548
+ "(CC"
1549
+ ],
1550
+ [
1551
+ "(=O)",
1552
+ "[O-]"
1553
+ ],
1554
+ [
1555
+ "[N+]",
1556
+ "(=O)[O-]"
1557
+ ],
1558
+ [
1559
+ "c1",
1560
+ ")"
1561
+ ],
1562
+ [
1563
+ "CC",
1564
+ "C(=O)N"
1565
+ ],
1566
+ [
1567
+ "4",
1568
+ ")C"
1569
+ ],
1570
+ [
1571
+ "[C@]",
1572
+ "2"
1573
+ ],
1574
+ [
1575
+ "(",
1576
+ "Br)"
1577
+ ],
1578
+ [
1579
+ "CC",
1580
+ "2"
1581
+ ],
1582
+ [
1583
+ "(=O)",
1584
+ "=O)"
1585
+ ],
1586
+ [
1587
+ "[nH]",
1588
+ "1"
1589
+ ],
1590
+ [
1591
+ "cc",
1592
+ "1C"
1593
+ ],
1594
+ [
1595
+ "O",
1596
+ "1"
1597
+ ],
1598
+ [
1599
+ "[C@@]",
1600
+ "2"
1601
+ ],
1602
+ [
1603
+ "C",
1604
+ "=CC"
1605
+ ],
1606
+ [
1607
+ "N",
1608
+ "("
1609
+ ],
1610
+ [
1611
+ "c(",
1612
+ "-"
1613
+ ],
1614
+ [
1615
+ "c3",
1616
+ "cn"
1617
+ ],
1618
+ [
1619
+ "[C@H]",
1620
+ "3C"
1621
+ ],
1622
+ [
1623
+ "CC(C)",
1624
+ "(C)C"
1625
+ ],
1626
+ [
1627
+ "(",
1628
+ "NC(=O)C"
1629
+ ],
1630
+ [
1631
+ "[C@H](",
1632
+ "NC(=O)C"
1633
+ ],
1634
+ [
1635
+ "cc1",
1636
+ "C(=O)N"
1637
+ ],
1638
+ [
1639
+ "2)",
1640
+ "CC"
1641
+ ],
1642
+ [
1643
+ "/",
1644
+ "C=C"
1645
+ ],
1646
+ [
1647
+ "2)",
1648
+ "c1"
1649
+ ],
1650
+ [
1651
+ "(CC",
1652
+ ")"
1653
+ ],
1654
+ [
1655
+ "/C=C",
1656
+ "/"
1657
+ ],
1658
+ [
1659
+ "[C@@H]",
1660
+ "2CCC"
1661
+ ],
1662
+ [
1663
+ "[C@H](C)C",
1664
+ "(=O)N"
1665
+ ],
1666
+ [
1667
+ "c(",
1668
+ "=N"
1669
+ ],
1670
+ [
1671
+ "[C@@H]",
1672
+ "(CNC(=O)"
1673
+ ],
1674
+ [
1675
+ "NC(=O)",
1676
+ "CC"
1677
+ ],
1678
+ [
1679
+ "(C(=O)N",
1680
+ "C"
1681
+ ],
1682
+ [
1683
+ "[C@@H](",
1684
+ "NC(=O)C"
1685
+ ],
1686
+ [
1687
+ "[C@H]",
1688
+ "2CCC"
1689
+ ],
1690
+ [
1691
+ "4",
1692
+ ")"
1693
+ ],
1694
+ [
1695
+ "CC(C)",
1696
+ "(C)"
1697
+ ],
1698
+ [
1699
+ "c2",
1700
+ "ncc"
1701
+ ],
1702
+ [
1703
+ "c2cccc",
1704
+ "c2"
1705
+ ],
1706
+ [
1707
+ "(C",
1708
+ "NC(=O)C"
1709
+ ],
1710
+ [
1711
+ "N",
1712
+ "="
1713
+ ],
1714
+ [
1715
+ "(F)",
1716
+ "(F)C"
1717
+ ],
1718
+ [
1719
+ "N",
1720
+ "(CC"
1721
+ ],
1722
+ [
1723
+ "CC",
1724
+ "(C)C"
1725
+ ],
1726
+ [
1727
+ "OC",
1728
+ "1"
1729
+ ],
1730
+ [
1731
+ "cc",
1732
+ "cc"
1733
+ ],
1734
+ [
1735
+ "n",
1736
+ "3"
1737
+ ],
1738
+ [
1739
+ "CCO",
1740
+ "CC"
1741
+ ],
1742
+ [
1743
+ "(",
1744
+ "-"
1745
+ ],
1746
+ [
1747
+ "(F)",
1748
+ "F"
1749
+ ],
1750
+ [
1751
+ ")C",
1752
+ "2"
1753
+ ],
1754
+ [
1755
+ "c3",
1756
+ "C"
1757
+ ],
1758
+ [
1759
+ "C[C@@H]",
1760
+ "1CC"
1761
+ ],
1762
+ [
1763
+ "c(C(=O)N",
1764
+ "2CC"
1765
+ ],
1766
+ [
1767
+ "CC1",
1768
+ ")"
1769
+ ],
1770
+ [
1771
+ "C",
1772
+ "c1cn"
1773
+ ],
1774
+ [
1775
+ "CC",
1776
+ "c1cc"
1777
+ ],
1778
+ [
1779
+ "[C@H]",
1780
+ "3CC"
1781
+ ],
1782
+ [
1783
+ "c(F)",
1784
+ "cc"
1785
+ ],
1786
+ [
1787
+ "c3",
1788
+ "n"
1789
+ ],
1790
+ [
1791
+ "[C@@H](C)C",
1792
+ "(=O)N"
1793
+ ],
1794
+ [
1795
+ "C[C@H]",
1796
+ "1CC"
1797
+ ],
1798
+ [
1799
+ "[N+](=O)[O-]",
1800
+ ")"
1801
+ ],
1802
+ [
1803
+ "n2",
1804
+ "cc"
1805
+ ],
1806
+ [
1807
+ "(C)(C)C",
1808
+ ")"
1809
+ ],
1810
+ [
1811
+ "CC",
1812
+ "N(C(=O)"
1813
+ ],
1814
+ [
1815
+ "[C@@H]",
1816
+ "3C"
1817
+ ],
1818
+ [
1819
+ "(C)",
1820
+ "CCC"
1821
+ ],
1822
+ [
1823
+ "C(=O)N",
1824
+ "C"
1825
+ ],
1826
+ [
1827
+ "cc",
1828
+ "c2"
1829
+ ],
1830
+ [
1831
+ "n",
1832
+ "c2"
1833
+ ],
1834
+ [
1835
+ "=",
1836
+ "CC(=O)N"
1837
+ ],
1838
+ [
1839
+ "3",
1840
+ "(CC"
1841
+ ],
1842
+ [
1843
+ "4",
1844
+ "CC"
1845
+ ],
1846
+ [
1847
+ "1)",
1848
+ "N"
1849
+ ],
1850
+ [
1851
+ "S(=O)",
1852
+ "(=O)N"
1853
+ ],
1854
+ [
1855
+ "N",
1856
+ "2CC"
1857
+ ],
1858
+ [
1859
+ "(",
1860
+ "O)C"
1861
+ ],
1862
+ [
1863
+ "cn",
1864
+ "c1"
1865
+ ],
1866
+ [
1867
+ "S",
1868
+ "(C)"
1869
+ ],
1870
+ [
1871
+ "1)C",
1872
+ "2"
1873
+ ],
1874
+ [
1875
+ "CC",
1876
+ "NC(=O)"
1877
+ ],
1878
+ [
1879
+ "l",
1880
+ ")C"
1881
+ ],
1882
+ [
1883
+ "C",
1884
+ "n1"
1885
+ ],
1886
+ [
1887
+ "N(C(=O)",
1888
+ "c3cc"
1889
+ ],
1890
+ [
1891
+ "[C@H]1",
1892
+ "C(=O)N"
1893
+ ],
1894
+ [
1895
+ "n",
1896
+ "2)"
1897
+ ],
1898
+ [
1899
+ "NC(=O)",
1900
+ "c2cc"
1901
+ ],
1902
+ [
1903
+ "c3cccc",
1904
+ "c3"
1905
+ ],
1906
+ [
1907
+ "3CC",
1908
+ "3)"
1909
+ ],
1910
+ [
1911
+ "c2",
1912
+ "nn"
1913
+ ],
1914
+ [
1915
+ "2)C1",
1916
+ ")"
1917
+ ],
1918
+ [
1919
+ "[C@@H]1",
1920
+ "C(=O)N"
1921
+ ],
1922
+ [
1923
+ "(F)(F)",
1924
+ "F"
1925
+ ],
1926
+ [
1927
+ "CO",
1928
+ "CC"
1929
+ ],
1930
+ [
1931
+ "CC",
1932
+ "2)"
1933
+ ],
1934
+ [
1935
+ "CC",
1936
+ "CCC"
1937
+ ],
1938
+ [
1939
+ "c1",
1940
+ "cs"
1941
+ ],
1942
+ [
1943
+ "C",
1944
+ "[C@]1"
1945
+ ],
1946
+ [
1947
+ "3)C",
1948
+ "2)C1"
1949
+ ],
1950
+ [
1951
+ "n1",
1952
+ ")"
1953
+ ],
1954
+ [
1955
+ "(C)",
1956
+ "=O)"
1957
+ ],
1958
+ [
1959
+ "O=C",
1960
+ "("
1961
+ ],
1962
+ [
1963
+ "CCN",
1964
+ "(CC"
1965
+ ],
1966
+ [
1967
+ "CC",
1968
+ "S"
1969
+ ],
1970
+ [
1971
+ "cs",
1972
+ "1"
1973
+ ],
1974
+ [
1975
+ "F)C",
1976
+ "(=O)N"
1977
+ ],
1978
+ [
1979
+ "c(C)",
1980
+ "n"
1981
+ ],
1982
+ [
1983
+ "N",
1984
+ "(C)"
1985
+ ],
1986
+ [
1987
+ "[C@@H]",
1988
+ "3CC"
1989
+ ],
1990
+ [
1991
+ "S(C)",
1992
+ "(=O)=O)"
1993
+ ],
1994
+ [
1995
+ "N(C)C",
1996
+ ")"
1997
+ ],
1998
+ [
1999
+ "CC1",
2000
+ "(C)C"
2001
+ ],
2002
+ [
2003
+ "O",
2004
+ "CC(=O)N"
2005
+ ],
2006
+ [
2007
+ "N(C(=O)C",
2008
+ "2"
2009
+ ],
2010
+ [
2011
+ "OC",
2012
+ "(C)(C)C"
2013
+ ],
2014
+ [
2015
+ "CC",
2016
+ "c1n"
2017
+ ],
2018
+ [
2019
+ "c4",
2020
+ "cc"
2021
+ ],
2022
+ [
2023
+ "(C(=O)N",
2024
+ "2C"
2025
+ ],
2026
+ [
2027
+ "N(C(=O)",
2028
+ "c2"
2029
+ ],
2030
+ [
2031
+ "N",
2032
+ "1CCC"
2033
+ ],
2034
+ [
2035
+ "C[C@H]",
2036
+ "2C"
2037
+ ],
2038
+ [
2039
+ "c1cc",
2040
+ "c("
2041
+ ],
2042
+ [
2043
+ "C",
2044
+ "1CC"
2045
+ ],
2046
+ [
2047
+ "3",
2048
+ "(C"
2049
+ ],
2050
+ [
2051
+ "n",
2052
+ "1C"
2053
+ ],
2054
+ [
2055
+ "c1cc",
2056
+ "n"
2057
+ ],
2058
+ [
2059
+ "CCO",
2060
+ "1"
2061
+ ],
2062
+ [
2063
+ "N",
2064
+ "1C"
2065
+ ],
2066
+ [
2067
+ "Cc1cc",
2068
+ "cc"
2069
+ ],
2070
+ [
2071
+ "CC",
2072
+ "OC"
2073
+ ],
2074
+ [
2075
+ "c(Cl)",
2076
+ "cc"
2077
+ ],
2078
+ [
2079
+ "=",
2080
+ "CCC"
2081
+ ],
2082
+ [
2083
+ "c2",
2084
+ "c("
2085
+ ],
2086
+ [
2087
+ "2)",
2088
+ "cc1"
2089
+ ],
2090
+ [
2091
+ "[C@H](C",
2092
+ "NC(=O)C"
2093
+ ],
2094
+ [
2095
+ "N",
2096
+ "3"
2097
+ ],
2098
+ [
2099
+ "N(C(=O)",
2100
+ "c2cn"
2101
+ ],
2102
+ [
2103
+ "cn",
2104
+ "c2"
2105
+ ],
2106
+ [
2107
+ "1)C",
2108
+ "(=O)N"
2109
+ ],
2110
+ [
2111
+ "c(C(=O)N",
2112
+ "C"
2113
+ ],
2114
+ [
2115
+ "NC(=O)",
2116
+ "c1cn"
2117
+ ],
2118
+ [
2119
+ "C",
2120
+ "n1cc"
2121
+ ],
2122
+ [
2123
+ "[C@H]",
2124
+ "(CC(=O)N"
2125
+ ],
2126
+ [
2127
+ "c1",
2128
+ "[nH]"
2129
+ ],
2130
+ [
2131
+ "OC",
2132
+ ")C"
2133
+ ],
2134
+ [
2135
+ "C",
2136
+ "[C@@]1"
2137
+ ],
2138
+ [
2139
+ "n",
2140
+ "2)C1"
2141
+ ],
2142
+ [
2143
+ "cc",
2144
+ "2C"
2145
+ ],
2146
+ [
2147
+ "O)C",
2148
+ "1"
2149
+ ],
2150
+ [
2151
+ "2)",
2152
+ "n1"
2153
+ ],
2154
+ [
2155
+ "(C)C",
2156
+ ")C"
2157
+ ],
2158
+ [
2159
+ "(C(=O)N",
2160
+ "2CCC"
2161
+ ],
2162
+ [
2163
+ "O=C",
2164
+ "(CC"
2165
+ ],
2166
+ [
2167
+ "C#",
2168
+ "CC"
2169
+ ],
2170
+ [
2171
+ "1)C",
2172
+ "(=O)"
2173
+ ],
2174
+ [
2175
+ "OC",
2176
+ ")C(=O)N"
2177
+ ],
2178
+ [
2179
+ "CN",
2180
+ "(CC"
2181
+ ],
2182
+ [
2183
+ "1CC",
2184
+ "CC1"
2185
+ ],
2186
+ [
2187
+ "c1cc",
2188
+ "c(F)"
2189
+ ],
2190
+ [
2191
+ "c",
2192
+ "(CC(=O)N"
2193
+ ],
2194
+ [
2195
+ "CC(=O)N",
2196
+ "1C"
2197
+ ],
2198
+ [
2199
+ "c3",
2200
+ ")"
2201
+ ],
2202
+ [
2203
+ "4",
2204
+ "C"
2205
+ ],
2206
+ [
2207
+ "c1cc",
2208
+ "c(C"
2209
+ ],
2210
+ [
2211
+ "CCC",
2212
+ "2"
2213
+ ],
2214
+ [
2215
+ "c3",
2216
+ "ncc"
2217
+ ],
2218
+ [
2219
+ "CCO",
2220
+ "C(=O)"
2221
+ ],
2222
+ [
2223
+ "c(N",
2224
+ ")"
2225
+ ],
2226
+ [
2227
+ "CC",
2228
+ "(=O)"
2229
+ ],
2230
+ [
2231
+ "c",
2232
+ "(=O)"
2233
+ ],
2234
+ [
2235
+ "CN",
2236
+ "1CC"
2237
+ ],
2238
+ [
2239
+ "Cc1n",
2240
+ "cc"
2241
+ ],
2242
+ [
2243
+ "c1C",
2244
+ "l"
2245
+ ],
2246
+ [
2247
+ "CC",
2248
+ "n1"
2249
+ ],
2250
+ [
2251
+ "CC",
2252
+ "[C@H](C)"
2253
+ ],
2254
+ [
2255
+ "C[C@@H]",
2256
+ "2C"
2257
+ ],
2258
+ [
2259
+ "c",
2260
+ "(CC"
2261
+ ],
2262
+ [
2263
+ "c(",
2264
+ "OC)"
2265
+ ],
2266
+ [
2267
+ "(",
2268
+ "OC)"
2269
+ ],
2270
+ [
2271
+ "NC(=O)",
2272
+ "c1"
2273
+ ],
2274
+ [
2275
+ "CC(=O)N",
2276
+ "1CCC"
2277
+ ],
2278
+ [
2279
+ "NC(=O)",
2280
+ "N"
2281
+ ],
2282
+ [
2283
+ "C(=O)N",
2284
+ "1C"
2285
+ ],
2286
+ [
2287
+ "C",
2288
+ "N(C"
2289
+ ],
2290
+ [
2291
+ "C[C@H]",
2292
+ "(C"
2293
+ ],
2294
+ [
2295
+ ")",
2296
+ "CC1"
2297
+ ],
2298
+ [
2299
+ "c2cccc",
2300
+ "c2)"
2301
+ ],
2302
+ [
2303
+ "N",
2304
+ "#"
2305
+ ],
2306
+ [
2307
+ "(C)(C)C",
2308
+ "(=O)N"
2309
+ ],
2310
+ [
2311
+ "2CC",
2312
+ "CC"
2313
+ ],
2314
+ [
2315
+ "3)",
2316
+ "CC1"
2317
+ ],
2318
+ [
2319
+ "n",
2320
+ "cn"
2321
+ ],
2322
+ [
2323
+ "3)CC",
2324
+ "2)"
2325
+ ],
2326
+ [
2327
+ "(C",
2328
+ "#N)"
2329
+ ],
2330
+ [
2331
+ "C[C@@H]",
2332
+ "(C"
2333
+ ],
2334
+ [
2335
+ "c(=N",
2336
+ ")"
2337
+ ],
2338
+ [
2339
+ "(C)C",
2340
+ ")C1"
2341
+ ],
2342
+ [
2343
+ "(F)",
2344
+ "cc"
2345
+ ],
2346
+ [
2347
+ "[C@@H](C",
2348
+ "NC(=O)C"
2349
+ ],
2350
+ [
2351
+ "[C@H]1C",
2352
+ "N(C(=O)"
2353
+ ],
2354
+ [
2355
+ "c1cc",
2356
+ "c(Cl)"
2357
+ ],
2358
+ [
2359
+ "C[C@H]",
2360
+ "1"
2361
+ ],
2362
+ [
2363
+ "[C@@H]1C",
2364
+ "N(C(=O)"
2365
+ ],
2366
+ [
2367
+ "(F)",
2368
+ "("
2369
+ ],
2370
+ [
2371
+ "CC",
2372
+ "1CC"
2373
+ ],
2374
+ [
2375
+ "Cc1cc",
2376
+ "(C(=O)N"
2377
+ ],
2378
+ [
2379
+ "C[C@@H]",
2380
+ "1"
2381
+ ],
2382
+ [
2383
+ "c(C)",
2384
+ "c1"
2385
+ ],
2386
+ [
2387
+ "C(=O)N",
2388
+ "1CCC"
2389
+ ],
2390
+ [
2391
+ "c(C(=O)N",
2392
+ "2C"
2393
+ ],
2394
+ [
2395
+ "CCC",
2396
+ "O"
2397
+ ],
2398
+ [
2399
+ "4",
2400
+ ")CC"
2401
+ ],
2402
+ [
2403
+ "(",
2404
+ "=N"
2405
+ ],
2406
+ [
2407
+ "2CC",
2408
+ "2)C1"
2409
+ ],
2410
+ [
2411
+ "(CC",
2412
+ "NC(=O)"
2413
+ ],
2414
+ [
2415
+ "n",
2416
+ "[nH]"
2417
+ ],
2418
+ [
2419
+ "cc",
2420
+ "3"
2421
+ ],
2422
+ [
2423
+ "(O)",
2424
+ "CC"
2425
+ ],
2426
+ [
2427
+ "2",
2428
+ "(CCC"
2429
+ ],
2430
+ [
2431
+ "[",
2432
+ "S"
2433
+ ],
2434
+ [
2435
+ "S",
2436
+ "(N)"
2437
+ ],
2438
+ [
2439
+ "ccc1",
2440
+ "C"
2441
+ ],
2442
+ [
2443
+ "N(C)C(=O)",
2444
+ "C"
2445
+ ],
2446
+ [
2447
+ "F",
2448
+ ")C2)"
2449
+ ],
2450
+ [
2451
+ "(F)",
2452
+ "F)C"
2453
+ ]
2454
+ ]
2455
+ }
2456
+ }
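The `TemplateProcessing` post-processor above wraps every encoded sequence as `<bos> A <eos>` (IDs 2 and 3), there is no normalizer or pre-tokenizer, and the BPE merges fuse recurring SMILES fragments such as `CC` and `(=O)N` into single tokens. Note that `"dropout": 0.1` makes merge application stochastic at encode time, so the exact split can vary between calls. A sketch with the standalone `tokenizers` library (assuming this file is saved locally):

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
enc = tok.encode("CC(=O)N")  # acetamide
print(enc.ids)     # starts with 2 (<bos>) and ends with 3 (<eos>)
print(enc.tokens)  # e.g. ['<bos>', 'CC', '(=O)N', '<eos>'] -- split varies with BPE dropout
```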
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<bos>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<eos>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<bos>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "<eos>",
39
+ "model_max_length": 1000000000000000019884624838656,
40
+ "pad_token": "<pad>",
41
+ "tokenizer_class": "PreTrainedTokenizerFast",
42
+ "unk_token": "<unk>"
43
+ }