cang committed on
Commit
ad7d40d
·
1 Parent(s): bb238e7
Files changed (5) hide show
  1. config.json +33 -0
  2. pytorch_model.bin +3 -0
  3. tokenization_sky.py +515 -0
  4. tokenizer_config.json +17 -0
  5. vocab.json +0 -0
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 6,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 1,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "max_length": 2048,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 2048,
15
+ "n_embd": 2560,
16
+ "n_head": 32,
17
+ "n_inner": null,
18
+ "n_layer": 32,
19
+ "n_positions": 2048,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "torch_dtype": "float16",
30
+ "transformers_version": "4.16.0",
31
+ "use_cache": true,
32
+ "vocab_size": 57600
33
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef3d582ee8433ed6e9b7efc01165f75b7a7a0218a91c12cba0b1a225dc5d6704
3
+ size 5475079165
tokenization_sky.py ADDED
@@ -0,0 +1,515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for OpenAI GPT."""
16
+
17
+ import json
18
+ import os
19
+ from typing import TYPE_CHECKING, List, Optional, Tuple, Union
20
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
21
+ from transformers.utils import logging, to_py_obj
22
+ from transformers.tokenization_utils_base import BatchEncoding
23
+
24
+ import bisect
25
+ import itertools
26
+ import re
27
+ import unicodedata
28
+ from collections import OrderedDict
29
+ from typing import Any, Dict, List, Optional, Tuple, Union, overload
30
+
31
+ from transformers.tokenization_utils_base import (
32
+ ENCODE_KWARGS_DOCSTRING,
33
+ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
34
+ INIT_TOKENIZER_DOCSTRING,
35
+ AddedToken,
36
+ BatchEncoding,
37
+ EncodedInput,
38
+ EncodedInputPair,
39
+ PreTokenizedInput,
40
+ PreTokenizedInputPair,
41
+ PreTrainedTokenizerBase,
42
+ TextInput,
43
+ TextInputPair,
44
+ TruncationStrategy,
45
+ )
46
+ from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging
47
+
48
+
49
+ if TYPE_CHECKING:
50
+ from transformers.pipelines.conversational import Conversation
51
+
52
+ logger = logging.get_logger(__name__)
53
+
54
# Maps the "vocab_file" key to the on-disk vocabulary file name; the same
# name is used when the vocabulary is written back out.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
57
+
58
+
59
class DATrie:
    """Character-level trie that maps inserted words to an attached payload.

    Besides the trie itself, two plain dictionaries are maintained:
    ``data`` (word -> payload) and ``r_data`` (payload -> word). The reverse
    map is what the ``id2str*`` helpers use to turn payloads back into text.
    """

    class Node:
        """One trie node; children are keyed by a single character."""

        def __init__(self, is_leaf=False, leaf_data=None, tail=""):
            self._is_leaf = is_leaf
            self._leaf_data = leaf_data
            # Stored and exposed through get_tail(); DATrie never reads it.
            self._tail = tail
            self._next_map = {}

        def is_leaf(self):
            """Return True when an inserted word ends at this node."""
            return self._is_leaf

        def set_leaf(self):
            """Mark this node as the end of an inserted word."""
            self._is_leaf = True

        def has_next(self, w):
            """Return True when a child exists for character ``w``."""
            return w in self._next_map

        def add_node(self, w, node):
            """Attach ``node`` as the child for character ``w``."""
            self._next_map[w] = node

        def get_node(self, w):
            """Return the child for character ``w``, or None if absent."""
            return self._next_map.get(w)

        def get_tail(self):
            """Return the ``tail`` value given at construction."""
            return self._tail

        def get_data(self):
            """Return the payload stored on this node (None if not set)."""
            return self._leaf_data

        def set_data(self, data):
            """Store ``data`` as this node's payload."""
            self._leaf_data = data

    def __init__(self):
        self.root = self.Node()
        self.data = {}
        self.r_data = {}

    def insert(self, word, data):
        """Insert ``word`` with payload ``data``.

        Re-inserting an existing word overwrites the payload stored in the
        trie so that lookups agree with ``self.data``. The old payload's entry
        in ``r_data`` is left in place.
        """
        self.data[word] = data
        self.r_data[data] = word
        node = self.root
        for ch in word:
            child = node.get_node(ch)
            if child is None:
                child = self.Node()
                node.add_node(ch, child)
            node = child
        # Always refresh the terminal node: the previous implementation only
        # did this for nodes that were not yet leaves, leaving stale payloads
        # behind when a word was inserted a second time.
        node.set_leaf()
        node.set_data(data)

    def findStrict(self, word):
        """Return the payload for an exact match of ``word``, else None."""
        node = self.root
        for ch in word:
            node = node.get_node(ch)
            if node is None:
                return None
        return node.get_data() if node.is_leaf() else None

    def prefix(self, word):
        """Return ``[prefix, payload]`` for every inserted word that is a
        prefix of ``word``, shortest first."""
        found = []
        node = self.root
        for end, ch in enumerate(word, start=1):
            node = node.get_node(ch)
            if node is None:
                break
            if node.is_leaf():
                found.append([word[:end], node.get_data()])
        return found

    def max_prefix(self, content, start_idx):
        """Return ``[word, payload]`` for the longest inserted word that starts
        at ``content[start_idx]``.

        When nothing matches, the one-element list ``[""]`` is returned.
        """
        best = ["", ]
        node = self.root
        for idx in range(start_idx, len(content)):
            node = node.get_node(content[idx])
            if node is None:
                break
            if node.is_leaf():
                # Later (longer) matches replace earlier ones.
                best = [content[start_idx:idx + 1], node.get_data()]
        return best

    def max_score(self, content, start_idx):
        """Return the ``[word, payload]`` candidate starting at ``start_idx``
        whose ``payload[1]`` is largest.

        Payloads are indexed with ``[1]``, so they must be sequences here. The
        sentinel ``["", (3, 0)]`` takes part in the ranking and is returned
        when nothing matches. Ties go to the later (longer) candidate because
        the sort is stable.
        """
        candidates = [["", (3, 0)], ]
        node = self.root
        for idx in range(start_idx, len(content)):
            node = node.get_node(content[idx])
            if node is None:
                break
            if node.is_leaf():
                candidates.append([content[start_idx:idx + 1], node.get_data()])
        if len(candidates) > 1:
            candidates = sorted(candidates, key=lambda item: item[1][1])
        return candidates[-1]

    def match(self, content, add_unk=True, unk_id=-1, **kwargs):
        """Greedily split ``content`` into longest matches and return payloads.

        A character that starts no inserted word contributes ``unk_id`` when
        ``add_unk`` is true and is otherwise skipped. Extra keyword arguments
        are accepted and ignored.
        """
        total = len(content)
        pos = 0
        matched = []
        while pos < total:
            hit = self.max_prefix(content=content, start_idx=pos)
            word = hit[0]
            if len(word) > 0:
                matched.append(hit[1])
                pos += len(word)
            else:
                if add_unk:
                    matched.append(unk_id)
                pos += 1
        return matched

    @staticmethod
    def _looks_special(token):
        """Return True for bracketed tokens with no lowercase letters, e.g. "[EOS]"."""
        return token.startswith("[") and token.endswith("]") and token.upper() == token

    def _ids_to_str(self, ids, escape_special_ids, end_ids, stop_at_special):
        """Shared decoder behind ``id2str`` and ``id2str_v2``."""
        pieces = []
        for rid in ids:
            if rid not in self.r_data:
                # %s instead of %d: identical output for integers, and no
                # TypeError when the unknown id is None or another non-number.
                print("ERROR unknown id %s" % (rid,))
                continue
            if rid in end_ids:
                break
            token = self.r_data[rid]
            if escape_special_ids is True and self._looks_special(token):
                if stop_at_special:
                    break
                continue
            pieces.append(token)
        return "".join(pieces)

    def id2str(self, ids, escape_special_ids=True, end_ids=(), **kwargs):
        """Concatenate the words for ``ids``.

        Decoding stops at the first id found in ``end_ids``. When
        ``escape_special_ids`` is exactly ``True``, bracketed upper-case tokens
        are skipped. Unknown ids are reported on stdout and skipped. Extra
        keyword arguments are accepted and ignored.
        """
        return self._ids_to_str(ids, escape_special_ids, end_ids, stop_at_special=False)

    def id2str_v2(self, ids, escape_special_ids=True, end_ids=(), **kwargs):
        """Like ``id2str``, but a bracketed upper-case token ends decoding
        instead of being skipped."""
        return self._ids_to_str(ids, escape_special_ids, end_ids, stop_at_special=True)
232
+
233
+
234
class SkyTokenizer(PreTrainedTokenizer):
    """Slow (pure Python) tokenizer backed by a ``DATrie`` over ``vocab.json``.

    Text is encoded by greedy longest-prefix matching against the vocabulary;
    characters that start no vocabulary entry are encoded as the unknown-token
    id. Decoding concatenates the vocabulary strings of the ids.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        errors="replace",
        unk_token="[UNK]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        pad_token="[PAD]",
        add_bos_token=False,
        **kwargs
    ):
        """Load the vocabulary from ``vocab_file`` and build the lookup trie.

        Args:
            vocab_file: Path to a JSON file mapping token strings to ids.
            errors: Stored on the instance and forwarded to the base class.
            unk_token, bos_token, eos_token, pad_token: Special tokens; plain
                strings are wrapped in ``AddedToken`` without stripping.
            add_bos_token: When true, ``build_inputs_with_special_tokens``
                prepends the BOS id to every sequence.
        """
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

        # Build the vocabulary state BEFORE calling the base constructor:
        # newer transformers releases look up tokens (get_vocab /
        # _convert_token_to_id) from inside PreTrainedTokenizer.__init__, which
        # fails if self.encoder does not exist yet. Older releases are
        # unaffected by the ordering.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.trie = DATrie()
        for token, token_id in self.encoder.items():
            self.trie.insert(token, token_id)
        self.errors = errors  # how to handle errors in decoding
        self.cache = {}
        self.add_bos_token = add_bos_token

        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary (added tokens excluded)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the base vocabulary merged with the added tokens."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Prefix each sequence with the BOS id when ``add_bos_token`` is set.

        No separator or EOS id is inserted; with a second sequence the result
        is ``[bos] + ids_0 + [bos] + ids_1`` (without ``[bos]`` when disabled).
        """
        bos_token_ids = [self.bos_token_id] if self.add_bos_token else []
        output = bos_token_ids + token_ids_0
        if token_ids_1 is None:
            return output
        return output + bos_token_ids + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # Without BOS insertion (or when the caller already added special
        # tokens) the generic base-class implementation applies.
        if already_has_special_tokens or not self.add_bos_token:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=bool(already_has_special_tokens),
            )

        mask = [1] + ([0] * len(token_ids_0))
        if token_ids_1 is None:
            return mask
        return mask + [1] + ([0] * len(token_ids_1))

    def _tokenize(self, text, **kwargs):
        """Split ``text`` with the trie.

        NOTE(review): this returns the values stored in the trie (the vocabulary
        ids), not string tokens, which differs from what the base class
        `tokenize` contract usually expects. Left as-is because callers may
        rely on it; confirm before changing.
        """
        return self.trie.match(text, unk_id=self.unk_token_id, **kwargs)

    def _decode(self,
                token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
                skip_special_tokens: bool = False,
                **kwargs
                ) -> str:
        """Turn one id or a list of ids back into text.

        A single int is looked up directly (unknown ids yield the unk token).
        A list is decoded through the trie's reverse map, skipping special
        tokens when ``skip_special_tokens`` is true. Anything else is returned
        unchanged after conversion to a Python object.
        """
        # Convert inputs to python lists
        token_ids = to_py_obj(token_ids)
        if isinstance(token_ids, int):
            return self.decoder.get(token_ids, self.unk_token)
        if isinstance(token_ids, list):
            return self.trie.id2str(
                token_ids,
                escape_special_ids=skip_special_tokens,
                **kwargs
            )
        return token_ids

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Write the base vocabulary as JSON into ``save_directory``.

        Returns a one-element tuple with the written path. If
        ``save_directory`` exists but is not a directory, an error is logged
        and ``None`` is returned.
        """
        if not os.path.exists(save_directory):
            # makedirs (not mkdir) so that nested target paths work too.
            os.makedirs(save_directory, exist_ok=True)
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        return (vocab_file,)

    def prepare_for_tokenization(self, text, **kwargs):
        """Return ``text`` and ``kwargs`` untouched (no pre-processing)."""
        return (text, kwargs)

    def _get_input_ids(self, text):
        """Convert one input item to a flat list of ids.

        Accepts a string, a list/tuple of strings (each is matched separately
        and the ids are concatenated), or a list/tuple of ints (returned
        as-is).

        Raises:
            ValueError: for any other input, including empty sequences.
        """
        if isinstance(text, str):
            return self.trie.match(text, unk_id=self.unk_token_id)
        if isinstance(text, (list, tuple)) and len(text) > 0:
            if isinstance(text[0], str):
                # Flatten: downstream prepare_for_model expects a flat id list.
                return [
                    token_id
                    for piece in text
                    for token_id in self.trie.match(piece, unk_id=self.unk_token_id)
                ]
            if isinstance(text[0], int):
                return text
        raise ValueError(
            "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
        )

    def _encode_plus(
        self,
        text: Union[TextInput, EncodedInput],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """Encode a single input and hand it to ``prepare_for_model``.

        Only a single sequence is supported here (``pair_ids`` is always
        ``None``). Extra keyword arguments are accepted but not used.

        Raises:
            NotImplementedError: if ``return_offsets_mapping`` is requested.
            ValueError: if ``text`` has an unsupported type.
        """
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        first_ids = self._get_input_ids(text)

        return self.prepare_for_model(
            first_ids,
            pair_ids=None,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[EncodedInput],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """Encode a batch of inputs and hand them to ``_batch_prepare_for_model``.

        Each batch item may be a string, an already-encoded list/tuple of
        ints, or a two-element ``(first, second)`` pair. Extra keyword
        arguments are accepted but not used.

        Raises:
            NotImplementedError: if ``return_offsets_mapping`` is requested.
            ValueError: if an item has an unsupported type or shape.
        """
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        input_ids = []
        for ids_or_pair_ids in batch_text_or_text_pairs:
            if not isinstance(ids_or_pair_ids, (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            elif len(ids_or_pair_ids) > 0 and isinstance(ids_or_pair_ids[0], int):
                # Already-encoded single sequence: must not be unpacked as a
                # (first, second) pair.
                ids, pair_ids = ids_or_pair_ids, None
            else:
                ids, pair_ids = ids_or_pair_ids

            first_ids = self._get_input_ids(ids)
            second_ids = self._get_input_ids(pair_ids) if pair_ids is not None else None
            input_ids.append((first_ids, second_ids))

        batch_outputs = self._batch_prepare_for_model(
            input_ids,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        return BatchEncoding(batch_outputs)

    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
        """Encode every turn of ``conversation`` followed by the EOS id.

        If the result exceeds ``model_max_length``, only the most recent ids
        are kept.
        """
        input_ids = []
        for _, text in conversation.iter_texts():
            input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
        if len(input_ids) > self.model_max_length:
            input_ids = input_ids[-self.model_max_length:]
        return input_ids
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name_or_path": "sky-text",
3
+ "bos_token": "[BOS]",
4
+ "eos_token": "[EOS]",
5
+ "pad_token": "[PAD]",
6
+ "mask_token": "[MASK]",
7
+ "unk_token": "[UNK]",
8
+ "add_prefix_space": false,
9
+ "tokenizer_class": "SkyTokenizer",
10
+ "use_fast": false,
11
+ "auto_map": {
12
+ "AutoTokenizer": [
13
+ "tokenization_sky.SkyTokenizer",
14
+ null
15
+ ]
16
+ }
17
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff