Training in progress, step 60000, checkpoint
checkpoint-60000/config.json
ADDED
@@ -0,0 +1,35 @@
{
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoModelForCausalLM": "modeling_novomolgen.NovoMolGen"
  },
  "bos_token_id": 2,
  "eos_token_id": 3,
  "fused_bias_fc": false,
  "fused_dropout_add_ln": false,
  "fused_mlp": false,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "loss_type": "ForCausalLM",
  "max_position_embeddings": 2048,
  "max_seq_length": 64,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 12,
  "num_hidden_layers": 32,
  "num_key_value_heads": 12,
  "pretraining_tp": 1,
  "residual_in_fp32": true,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.2",
  "use_cache": true,
  "use_flash_attn": true,
  "vocab_size": 84
}
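The auto_map entry routes AutoModelForCausalLM to the custom NovoMolGen class shipped with this checkpoint, so loading goes through remote-code execution. A minimal loading sketch, assuming the placeholder path "checkpoint-60000" points at this directory and that flash-attn is installed so the custom class can import:

    from transformers import AutoModelForCausalLM

    # trust_remote_code=True executes modeling_novomolgen.py from the repo,
    # which provides the NovoMolGen causal-LM implementation named in auto_map.
    model = AutoModelForCausalLM.from_pretrained(
        "checkpoint-60000",      # placeholder local path or Hub repo id
        trust_remote_code=True,
    )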
checkpoint-60000/modeling_novomolgen.py
ADDED
@@ -0,0 +1,341 @@
import copy
import json
import os.path
import re
import shutil
import inspect
from typing import Optional, Union

import torch
import torch.nn.functional as F
from transformers import LlamaConfig
from transformers.loss.loss_utils import LOSS_MAPPING
from transformers.modeling_outputs import CausalLMOutput
from transformers.utils.hub import cached_file, get_checkpoint_shard_files
from transformers.utils import (
    SAFE_WEIGHTS_NAME,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
)
from transformers.modeling_utils import unwrap_model, logger
from functools import partial
from safetensors.torch import load_file as safe_load_file

try:
    from flash_attn.models.gpt import GPTLMHeadModel
except ImportError:
    GPTLMHeadModel = None

try:
    from flash_attn.models.llama import llama_config_to_gpt2_config, inv_remap_state_dict_hf_llama
except ImportError:
    llama_config_to_gpt2_config = None
    inv_remap_state_dict_hf_llama = None


def state_dict_from_pretrained(model_name, checkpoint_path: str = "", device=None, dtype=None):
    """
    Code modified from: https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/pretrained.py
    """
    # If not fp32, then we don't want to load directly to the GPU
    mapped_device = "cpu" if dtype not in [torch.float32, None] else device
    is_sharded = False
    load_safe = False

    # Try loading from HF hub instead of from local files
    resolved_archive_file = cached_file(model_name, os.path.join(checkpoint_path, WEIGHTS_NAME),
                                        _raise_exceptions_for_missing_entries=False)
    if resolved_archive_file is None:
        resolved_archive_file = cached_file(model_name, os.path.join(checkpoint_path, WEIGHTS_INDEX_NAME),
                                            _raise_exceptions_for_missing_entries=False)
        if resolved_archive_file is not None:
            is_sharded = True

    if resolved_archive_file is None:
        raise EnvironmentError(f"Model name {model_name} was not found.")

    if load_safe:
        loader = partial(safe_load_file, device=mapped_device)
    else:
        loader = partial(torch.load, map_location=mapped_device)

    if is_sharded:
        # resolved_archive_file becomes a list of files that point to the different
        # checkpoint shards in this case.
        resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
            model_name, resolved_archive_file
        )
        state_dict = {}
        for sharded_file in resolved_archive_file:
            state_dict.update(loader(sharded_file))
    else:
        state_dict = loader(resolved_archive_file)
    # Convert dtype before moving to GPU to save memory
    if dtype is not None:
        state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()}
    state_dict = {k: v.to(device=device) for k, v in state_dict.items()}

    return state_dict


class NovoMolGenConfig(LlamaConfig):
    # model_type = "NovoMolGen"

    def __init__(self,
                 use_flash_attn: bool = True,
                 fused_bias_fc: bool = True,
                 fused_mlp: bool = False,
                 fused_dropout_add_ln: bool = True,
                 residual_in_fp32: bool = True,
                 loss_type: str = 'ForCausalLM',
                 **kwargs
                 ):
        super().__init__(**kwargs)
        self.use_flash_attn = use_flash_attn
        self.fused_bias_fc = fused_bias_fc
        self.fused_mlp = fused_mlp
        self.fused_dropout_add_ln = fused_dropout_add_ln
        self.residual_in_fp32 = residual_in_fp32
        self.loss_type = loss_type
        self.auto_map = {"AutoModelForCausalLM": "modeling_novomolgen.NovoMolGen"}

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        checkpoint_path: str = "",
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        **kwargs,
    ):
        resolved_archive_config_file = cached_file(pretrained_model_name_or_path,
                                                   os.path.join(checkpoint_path, "config.json"),
                                                   _raise_exceptions_for_missing_entries=False)

        if resolved_archive_config_file is not None:
            with open(resolved_archive_config_file, "r", encoding="utf-8") as reader:
                text = reader.read()
            config_dict = json.loads(text)
        else:
            raise EnvironmentError(f"config for {pretrained_model_name_or_path} was not found.")

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            print(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class NovoMolGen(GPTLMHeadModel):
    def __init__(
        self,
        config: NovoMolGenConfig,
        mol_type: str = "SMILES",
    ):
        self.base_config = config
        self.mol_type = mol_type
        config = llama_config_to_gpt2_config(config)
        config.use_flash_attn = self.base_config.use_flash_attn
        config.fused_bias_fc = self.base_config.fused_bias_fc
        config.fused_mlp = self.base_config.fused_mlp
        config.fused_dropout_add_ln = self.base_config.fused_dropout_add_ln
        config.residual_in_fp32 = self.base_config.residual_in_fp32
        GPTLMHeadModel.__init__(self, config)

    # TODO: here we ignore attention_mask to make it compatible with the HF trainer. The MHA in flash-attention
    # should be reimplemented to integrate attention_mask as done here:
    # https://github.com/huggingface/transformers/blob/0864dd3beb238b7bec3528a3d1d6c17a28f51a51/src/transformers/models/llama/modeling_llama.py#L536
    def forward(self, input_ids, attention_mask: Optional[torch.FloatTensor] = None,
                labels: Optional[torch.LongTensor] = None, return_dict: Optional[bool] = None,
                position_ids=None, inference_params=None, num_last_tokens=0, **loss_kwargs):
        """
        input_ids: (batch, seqlen) int tensor
        inference_params: for generation. Adapted from Megatron-LM (and Apex)
            https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
        num_last_tokens: if > 0, only return the logits for the last n tokens
        """
        assert (
            input_ids.ndim == 2
        ), f"Expected `input_ids` to have shape [b, slen], but got shape {input_ids.shape}"
        b, slen = input_ids.shape
        hidden_states = self.transformer(
            input_ids, position_ids=position_ids, inference_params=inference_params
        )
        if inference_params is not None:
            assert hidden_states.ndim == 3, "sequence_parallel is not supported in generation mode"
        if num_last_tokens > 0:
            hidden_states = hidden_states[:, -num_last_tokens:]
        if self.project_out is not None:
            hidden_states = self.project_out(hidden_states)
        if self.output_scale != 1.0:
            hidden_states = hidden_states * self.output_scale
        if not self.norm_head:
            lm_logits = self.lm_head(hidden_states)
        else:
            lm_head_weight = F.normalize(self.lm_head.weight)
            # if isinstance(self.lm_head, ColumnParallelLinear) and self.lm_head.sequence_parallel:
            #     hidden_states = all_gather(hidden_states, self.lm_head.process_group)
            lm_logits = F.linear(hidden_states, lm_head_weight, bias=self.lm_head.bias)
        # During inference, we want the full logit for sampling
        # if isinstance(self.lm_head, ColumnParallelLinear) and inference_params is not None:
        #     lm_logits, _ = all_gather_raw(lm_logits, self.lm_head.process_group)
        #     lm_logits = rearrange(lm_logits, "(n b) ... d -> b ... (n d)", b=b)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=lm_logits, labels=labels, vocab_size=self.base_config.vocab_size,
                                      **loss_kwargs)

        return CausalLMOutput(
            loss=loss,
            logits=lm_logits,
            hidden_states=hidden_states,
        )

    @property
    def loss_function(self):
        if getattr(self.base_config, "loss_type", None) is not None:
            loss_type = self.base_config.loss_type
        else:
            loss_type = self.__class__.__name__
            if loss_type not in LOSS_MAPPING:
                loss_groups = f"({'|'.join(LOSS_MAPPING)})"
                loss_type = re.findall(loss_groups, self.__class__.__name__)
                if len(loss_type) > 0:
                    loss_type = loss_type[0]
                else:
                    loss_type = None
        if loss_type is None or (loss_type not in LOSS_MAPPING
                                 and getattr(self.base_config, "loss_type", None) is not None):
            print(
                f"`loss_type={loss_type}` was set in the base_config but it is unrecognised. "
                f"Using the default loss: `ForCausalLMLoss`."
            )
            loss_type = "ForCausalLM"
        return LOSS_MAPPING[loss_type]

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        is_main_process: bool = True,
        state_dict: Optional[dict] = None,
        safe_serialization: bool = False,
        **kwargs,
    ):
        if safe_serialization:
            raise NotImplementedError("`safe_serialization` is not implemented yet.")

        if os.path.isfile(save_directory):
            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
            return
        os.makedirs(save_directory, exist_ok=True)
        # Save the config
        if is_main_process:
            self.base_config.save_pretrained(save_directory)

        # Save the model
        if state_dict is None:
            # Only save the model itself if we are using distributed training
            model_to_save = unwrap_model(self)
            state_dict = model_to_save.state_dict()

        weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
        torch.save(state_dict, os.path.join(save_directory, weights_name))

        # Find the file where NovoMolGen is defined and copy it next to the weights
        src = inspect.getsourcefile(type(self))
        if src:
            dst = os.path.join(save_directory, os.path.basename(src))
            shutil.copy(src, dst)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        checkpoint_path: str = "",
        config: Optional[Union[NovoMolGenConfig, str, os.PathLike]] = None,
        **kwargs,
    ):
        if config is None:
            config = NovoMolGenConfig.from_pretrained(pretrained_model_name_or_path, checkpoint_path=checkpoint_path)
        model = cls(config)

        if os.path.exists(pretrained_model_name_or_path):
            state_dict = torch.load(os.path.join(pretrained_model_name_or_path, checkpoint_path, WEIGHTS_NAME))
        else:
            state_dict = state_dict_from_pretrained(pretrained_model_name_or_path, checkpoint_path=checkpoint_path)
        model.load_state_dict(state_dict)
        return model

    def sample(
        self,
        tokenizer,
        batch_size: int = 4,
        max_length: int = 64,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 0.95,
        device: torch.device = torch.device("cuda"),
    ):
        """
        Generate a batch of sequences from the model.

        Returns a dictionary with two keys:
            {
                "<mol_type>": <list of raw sequences in that mol type>,
                "sequences": <torch.LongTensor of valid token IDs>
            }
        """
        input_ids = tokenizer.encode("", return_tensors="pt").to(device)
        # Repeat the prompt for the desired batch size
        input_ids = input_ids.repeat_interleave(batch_size, dim=0)
        # If the tokenizer includes an EOS token for an empty prompt, we remove it.
        if input_ids.shape[1] > 1:
            input_ids = input_ids[:, :-1]

        generation_output = self.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            eos_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True,
        )

        sequences = self._filter_tokens_after_eos(
            generation_output.sequences, eos_id=tokenizer.eos_token_id
        )

        decoded_strings = tokenizer.batch_decode(sequences, skip_special_tokens=True)
        decoded_strings = [s.replace(" ", "") for s in decoded_strings]

        result = {
            self.mol_type: decoded_strings,
            "sequences": sequences,
        }
        return result

    @staticmethod
    def _filter_tokens_after_eos(sequences, eos_id):
        output = copy.deepcopy(sequences)
        for i in range(sequences.size(0)):
            row = sequences[i]
            eos_position = (row == eos_id).nonzero()
            if eos_position.numel() > 0:
                eos_position = eos_position[0, 0].item()  # index of the first occurrence
                output[i, eos_position + 1:] = eos_id
        return output

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
        # HF's GenerationMixin would normally do more, but for a basic LM this usually suffices:
        return {"input_ids": input_ids, "attention_mask": attention_mask}
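A usage sketch for the sample API defined above, assuming a CUDA machine with flash-attn installed and the checkpoint files available at the placeholder path "checkpoint-60000":

    import torch
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("checkpoint-60000")  # placeholder path
    model = NovoMolGen.from_pretrained("checkpoint-60000").to("cuda").eval()

    with torch.no_grad():
        out = model.sample(tokenizer, batch_size=4, max_length=64)

    print(out["SMILES"])     # generated molecules as raw SMILES strings
    print(out["sequences"])  # the corresponding token-ID tensor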
checkpoint-60000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7c7e764411c733d928a5392b324e700061379401ba66a80bfa3bebaed18c15d7
size 1208773654
checkpoint-60000/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<bos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<eos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
checkpoint-60000/tokenizer.json
ADDED
@@ -0,0 +1,203 @@
{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 0,
      "content": "<unk>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 1,
      "content": "<pad>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 2,
      "content": "<bos>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 3,
      "content": "<eos>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    }
  ],
  "normalizer": null,
  "pre_tokenizer": {
    "type": "Split",
    "pattern": {
      "Regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|[0-9])"
    },
    "behavior": "Isolated",
    "invert": false
  },
  "post_processor": {
    "type": "TemplateProcessing",
    "single": [
      {
        "SpecialToken": {
          "id": "<bos>",
          "type_id": 0
        }
      },
      {
        "Sequence": {
          "id": "A",
          "type_id": 0
        }
      },
      {
        "SpecialToken": {
          "id": "<eos>",
          "type_id": 0
        }
      }
    ],
    "pair": [
      {
        "Sequence": {
          "id": "A",
          "type_id": 0
        }
      },
      {
        "Sequence": {
          "id": "B",
          "type_id": 1
        }
      }
    ],
    "special_tokens": {
      "<bos>": {
        "id": "<bos>",
        "ids": [
          2
        ],
        "tokens": [
          "<bos>"
        ]
      },
      "<eos>": {
        "id": "<eos>",
        "ids": [
          3
        ],
        "tokens": [
          "<eos>"
        ]
      }
    }
  },
  "decoder": {
    "type": "BPEDecoder",
    "suffix": "</w>"
  },
  "model": {
    "type": "WordLevel",
    "vocab": {
      "<unk>": 0,
      "<pad>": 1,
      "<bos>": 2,
      "<eos>": 3,
      "C": 4,
      "(": 5,
      ")": 6,
      "c": 7,
      "1": 8,
      "O": 9,
      "=": 10,
      "N": 11,
      "2": 12,
      "n": 13,
      "[C@H]": 14,
      "[C@@H]": 15,
      "3": 16,
      "F": 17,
      "S": 18,
      "s": 19,
      "4": 20,
      "Cl": 21,
      "[nH]": 22,
      "o": 23,
      "[C@]": 24,
      "[C@@]": 25,
      "#": 26,
      "Br": 27,
      "-": 28,
      "/": 29,
      "[N+]": 30,
      "[O-]": 31,
      "5": 32,
      "I": 33,
      "[N-]": 34,
      "P": 35,
      "[S@]": 36,
      "[S@@]": 37,
      "[n+]": 38,
      "[Si]": 39,
      "6": 40,
      "[S+]": 41,
      "B": 42,
      "[P@]": 43,
      "7": 44,
      "[P@@]": 45,
      "[N@]": 46,
      "8": 47,
      "[N@@]": 48,
      "[B-]": 49,
      "[NH+]": 50,
      "[N@@H+]": 51,
      "[NH2+]": 52,
      "[N@H+]": 53,
      "[O]": 54,
      "[NH3+]": 55,
      "[PH]": 56,
      "[Si@]": 57,
      "[Si@@]": 58,
      "[n-]": 59,
      "9": 60,
      "[N@+]": 61,
      "[nH+]": 62,
      "[N@@+]": 63,
      "[Sn]": 64,
      "[s+]": 65,
      "[Se]": 66,
      ".": 67,
      "[Cl-]": 68,
      "[N]": 69,
      "[C-]": 70,
      "[C]": 71,
      "[S@@+]": 72,
      "%10": 73,
      "%11": 74,
      "[O+]": 75,
      "[SH]": 76,
      "[Si@H]": 77,
      "[NH]": 78,
      "[P+]": 79,
      "[P@@H]": 80,
      "[Si@@H]": 81,
      "[c-]": 82,
      "[o+]": 83
    },
    "unk_token": "<unk>"
  }
}
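The Split pre-tokenizer applies a standard SMILES regex: bracket atoms like [C@@H] or [nH+] and two-letter halogens (Cl, Br) stay whole, while everything else splits into single characters. A standalone sketch of the same pattern in plain Python (the tokenizer itself applies it through the tokenizers library):

    import re

    # Same pattern as the "Regex" field above, after JSON unescaping.
    SMILES_REGEX = re.compile(
        r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
    )

    print(SMILES_REGEX.findall("C[C@@H](N)Cl"))
    # ['C', '[C@@H]', '(', 'N', ')', 'Cl']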
checkpoint-60000/tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "<bos>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<eos>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<bos>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<eos>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "<unk>"
}
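A quick end-to-end check, assuming the checkpoint files are available locally at the placeholder path: the fast tokenizer reads tokenizer.json plus this config, and the TemplateProcessing post-processor wraps every sequence in <bos> ... <eos>:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("checkpoint-60000")  # placeholder path
    ids = tok.encode("CCO")  # ethanol
    print(ids)  # [2, 4, 4, 9, 3] given the vocab above: <bos> C C O <eos>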