faidrap committed
Commit 7ca565c · verified · 1 Parent(s): 7636ede

Upload finetuned dclm-german model

Files changed (3)
  1. special_tokens_map.json +30 -0
  2. tiktoken.py +391 -0
  3. tokenizer_config.json +34 -0
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
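
All four special-token roles in this map point at the same <|endoftext|> string, so the model starts, ends, pads, and replaces unknowns with one token (id 100257, per tokenizer_config.json below). A minimal sanity check, assuming the file has been downloaded locally:

    import json

    # Load the map uploaded in this commit and confirm every role is bound to
    # the same special token.
    with open('special_tokens_map.json') as f:
        roles = json.load(f)

    assert {v['content'] for v in roles.values()} == {'<|endoftext|>'}
    print(sorted(roles))  # ['bos_token', 'eos_token', 'pad_token', 'unk_token']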
tiktoken.py ADDED
@@ -0,0 +1,391 @@
+ # Copyright 2024 MosaicML LLM Foundry authors
+ # SPDX-License-Identifier: Apache-2.0
+
+ from functools import lru_cache
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from transformers import PreTrainedTokenizer
+
+ __all__ = [
+     'TiktokenTokenizerWrapper',
+ ]
+
+ DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
+
+
+ # Taken from
+ # https://github.com/huggingface/transformers/blob/8aca43bdb3cb9a5020f6d57589d85679dc873b1c/src/transformers/models/gpt2/tokenization_gpt2.py#L62-L84
+ @lru_cache()
+ def bytes_to_unicode():
+     """Returns a mapping between utf-8 bytes and unicode strings.
+
+     We specifically avoid mapping to whitespace/control characters the bpe
+     code barfs on.
+
+     The reversible bpe codes work on unicode strings. This means you need a
+     large # of unicode characters in your vocab if you want to avoid UNKs.
+     When you're at something like a 10B token dataset you end up needing
+     around 5K for decent coverage. This is a significant percentage of your
+     normal, say, 32K bpe vocab. To avoid that, we want lookup tables between
+     utf-8 bytes and unicode strings.
+     """
+     bs = (
+         list(range(ord('!'), ord('~') + 1)) +
+         list(range(ord('¡'), ord('¬') + 1)) +
+         list(range(ord('®'), ord('ÿ') + 1))
+     )
+     cs = bs[:]
+     n = 0
+     for b in range(2**8):
+         if b not in bs:
+             bs.append(b)
+             cs.append(2**8 + n)
+             n += 1
+     cs = [chr(n) for n in cs]
+     return dict(zip(bs, cs))
+
+
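+ # For example: printable latin-1 bytes map to themselves, while the rest are
+ # remapped above 255 in order, so the space byte 0x20 is the 33rd missing
+ # byte and bytes_to_unicode()[ord(' ')] == chr(256 + 32) == 'Ġ'. This is why
+ # byte-level BPE vocabularies render a leading space as 'Ġ'.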
+ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
+     """A thin wrapper around tiktoken to make it compatible with Hugging Face
+     tokenizers.
+
+     See HuggingFace for further documentation on general tokenizer methods.
+     """
+
+     model_input_names = ['input_ids', 'attention_mask']
+
+     def __init__(
+         self,
+         model_name: Optional[str] = None,
+         encoding_name: Optional[str] = None,
+         add_bos_token: bool = False,
+         add_eos_token: bool = False,
+         use_default_system_prompt: bool = False,
+         unk_token: Optional[str] = '<|endoftext|>',
+         eos_token: Optional[str] = '<|endoftext|>',
+         bos_token: Optional[str] = '<|endoftext|>',
+         pad_token: Optional[str] = None,
+         errors: str = 'replace',
+         **kwargs: Any,
+     ):
+         """Constructor creates a tiktoken tokenizer to use as the underlying
+         tokenizer.
+
+         Args:
+             model_name (Optional[str], optional): The name of the model to load from tiktoken. Defaults to None.
+                 Either model_name or encoding_name must be set, but not both.
+             encoding_name (Optional[str], optional): The name of the encoding to load from tiktoken. Defaults to None.
+                 Either model_name or encoding_name must be set, but not both.
+             add_bos_token (bool, optional): Whether to add bos tokens. Defaults to False.
+             add_eos_token (bool, optional): Whether to add eos tokens. Defaults to False.
+             use_default_system_prompt (bool, optional): Use the default system prompt or not. Defaults to False.
+             unk_token (Optional[str], optional): The unk token. Defaults to '<|endoftext|>'.
+             eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'.
+             bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'.
+             pad_token (Optional[str], optional): The pad token. Defaults to None.
+             errors (str, optional): Paradigm to follow when decoding bytes to UTF-8. See
+                 [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+                 Defaults to `"replace"`.
+         """
+         try:
+             import tiktoken
+         except ImportError:
+             raise ImportError(
+                 'You need to install tiktoken to use TiktokenTokenizerWrapper.',
+             )
+
+         # Workaround to make the tiktoken Encoding picklable.
+         # https://github.com/huggingface/datasets/issues/5536#issuecomment-1682309347
+         # There is an open PR from HF to add this to tiktoken: https://github.com/openai/tiktoken/pull/181
+         import copyreg
+         import functools
+
+         from tiktoken import Encoding  # type: ignore (thirdParty)
+
+         def pickle_Encoding(enc: Encoding):
+             return (
+                 functools.partial(
+                     Encoding,
+                     enc.name,
+                     pat_str=enc._pat_str,
+                     mergeable_ranks=enc._mergeable_ranks,
+                     special_tokens=enc._special_tokens,
+                 ),
+                 (),
+             )
+
+         copyreg.pickle(Encoding, pickle_Encoding)
+
+         if model_name is not None and encoding_name is not None:
+             raise ValueError(
+                 'You need to specify either model_name or encoding_name, not both.',
+             )
+
+         self.model_name = model_name
+         self.encoding_name = encoding_name
+
+         if self.model_name is not None:
+             self.encoding = tiktoken.encoding_for_model(  # type: ignore (thirdParty)
+                 self.model_name)
+         elif self.encoding_name is not None:
+             self.encoding = tiktoken.get_encoding(  # type: ignore (thirdParty)
+                 self.encoding_name)
+         else:
+             raise ValueError(
+                 'You need to specify either model_name or encoding_name.',
+             )
+
+         self.add_bos_token = add_bos_token
+         self.add_eos_token = add_eos_token
+         self.use_default_system_prompt = use_default_system_prompt
+
+         self.byte_encoder = bytes_to_unicode()
+         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+         self.errors = errors
+
+         self.decoder: Dict[int, str] = {}
+         for i in range(self.encoding.n_vocab):
+             try:
+                 self.encoding.decode_single_token_bytes(i)
+             except KeyError:
+                 continue
+             # Taken from
+             # https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
+             decoding = ''.join([
+                 bytes_to_unicode()[ord(char)] for char in
+                 self.encoding.decode_single_token_bytes(i).decode('latin-1')
+             ])
+             self.decoder[i] = decoding
+
+         self.encoder: Dict[str, int] = {}
+         for i in range(self.encoding.n_vocab):
+             if i in self.decoder:
+                 self.encoder[self.decoder[i]] = i
+
+         super().__init__(
+             model_name=model_name,
+             encoding_name=encoding_name,
+             add_bos_token=add_bos_token,
+             add_eos_token=add_eos_token,
+             use_default_system_prompt=use_default_system_prompt,
+             unk_token=unk_token,
+             eos_token=eos_token,
+             bos_token=bos_token,
+             pad_token=pad_token,
+             errors=errors,
+             **kwargs,
+         )
+
+     @property
+     def vocab_size(self) -> int:
+         """Returns vocab size."""
+         return self.encoding.n_vocab
+
+     @property
+     def is_fast(self) -> bool:
+         return False
+
+     @property
+     def default_chat_template(self):
+         """Chat ML Template for User/Assistant.
+
+         Pinning default Chat ML template in case defaults change.
+         """
+         template = (
+             "{% if messages[0]['role'] == 'system' %}"
+             '{% set loop_messages = messages[1:] %}'
+             "{% set system_message = messages[0]['content'] %}"
+             "{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}"
+             '{% set loop_messages = messages %}'
+             "{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}"
+             '{% else %}'
+             '{% set loop_messages = messages %}'
+             '{% set system_message = false %}'
+             '{% endif %}'
+             '{% for message in loop_messages %}'
+             '{% if loop.index0 == 0 %}'
+             '{% if system_message != false %}'
+             "{{ '<|im_start|>system\n' + system_message.strip() + '<|im_end|>\n'}}"
+             '{% endif %}'
+             "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+             '{% else %}'
+             "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+             '{% endif %}'
+             '{% if (add_generation_prompt == true and loop.last) %}'
+             "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
+             '{% endif %}'
+             '{% endfor %}'
+         )
+         template = template.replace(
+             'USE_DEFAULT_PROMPT',
+             'true' if self.use_default_system_prompt else 'false',
+         )
+         template = template.replace(
+             'DEFAULT_SYSTEM_PROMPT',
+             DEFAULT_SYSTEM_PROMPT,
+         )
+         return template
+
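+     # For example, with use_default_system_prompt=False and messages
+     # [{'role': 'user', 'content': 'Hallo!'}], apply_chat_template(...,
+     # add_generation_prompt=True) renders the template above as:
+     #   <|im_start|>user
+     #   Hallo!<|im_end|>
+     #   <|im_start|>assistant
+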
+     def get_vocab(self) -> Dict[str, int]:
+         """Returns vocab as a dict."""
+         # As far as I can tell, we don't require get_vocab to completely work,
+         # but when using additional_special_tokens, Hugging Face determines the
+         # next token index to add with len(self.get_vocab()), so we need the
+         # _size_ of this dictionary to be correct.
+         vocab_clone = self.encoder.copy()
+         extra_id_index = 0
+         candidate_extra_id = f'<extra_id_{extra_id_index}>'
+         indices_to_fill_in = (
+             set(range(self.vocab_size)) - set(vocab_clone.values())
+         )
+
+         # Add enough indices to make get_vocab() the right length.
+         for index_to_add in indices_to_fill_in:
+             # Make sure we don't overwrite a token that already exists.
+             while candidate_extra_id in vocab_clone:
+                 extra_id_index += 1
+                 candidate_extra_id = f'<extra_id_{extra_id_index}>'
+
+             # Get an index to add and add the item.
+             vocab_clone[candidate_extra_id] = index_to_add
+
+         return dict(vocab_clone, **self.added_tokens_encoder)
+
+     def _tokenize(self, text: str) -> List[str]:
+         """Returns a list of tokens for the given string."""
+         if not isinstance(text, str):
+             raise ValueError(
+                 f'Expected a string input to _tokenize but got {type(text)}.',
+             )
+
+         tokens = [
+             self.decoder[t]
+             for t in self.encoding.encode(text, allowed_special='all')
+         ]
+
+         return tokens
+
+     def _convert_token_to_id(self, token: str) -> Optional[int]:
+         """Converts a token (str) to an id using the vocab."""
+         return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+     def _convert_id_to_token(self, index: int) -> Optional[str]:
+         """Converts an index (integer) to a token (str) using the vocab."""
+         # For ids in the gaps of the tokenizer's vocabulary, or beyond its
+         # range, we return the empty string. This matches the behavior of
+         # Hugging Face fast tokenizers, but not slow tokenizers.
+         return self.decoder.get(index, '')
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         """Converts a sequence of tokens (string) to a single string."""
+         text = ''.join(tokens)
+         text = bytearray([self.byte_decoder[c] for c in text
+                          ]).decode('utf-8', errors=self.errors)
+         return text
+
+     def build_inputs_with_special_tokens(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+     ) -> List[int]:
+         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+         output = bos_token_id + token_ids_0 + eos_token_id
+
+         if token_ids_1 is not None:
+             output = output + bos_token_id + token_ids_1 + eos_token_id
+
+         return output
+
+     def get_special_tokens_mask(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+         already_has_special_tokens: bool = False,
+     ) -> List[int]:
+         """Retrieves sequence ids from a token list that has no special tokens
+         added. This method is called when adding special tokens using the
+         tokenizer `prepare_for_model` or `encode_plus` methods.
+
+         Function copied from
+         https://github.com/huggingface/transformers/blob/e3a4bd2bee212a2d0fd9f03b27fe7bfc1debe42d/src/transformers/models/gpt2/tokenization_gpt2.py#L265-L295
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0,
+                 token_ids_1=token_ids_1,
+                 already_has_special_tokens=True,
+             )
+
+         bos_token_id = [1] if self.add_bos_token else []
+         eos_token_id = [1] if self.add_eos_token else []
+
+         if token_ids_1 is None:
+             return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+         return (
+             bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
+             bos_token_id + ([0] * len(token_ids_1)) + eos_token_id
+         )
+
+     def create_token_type_ids_from_sequences(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+     ) -> List[int]:
+         sep = [self.sep_token_id]
+
+         if token_ids_1 is None:
+             return len(token_ids_0 + sep) * [0]
+         return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+     def save_vocabulary(
+         self,
+         save_directory: str,
+         filename_prefix: Optional[str] = None,
+     ) -> Tuple[str]:
+         # We ignore the type below to keep the original signature. We are
+         # knowingly breaking the signature here, although not 100% certain
+         # it doesn't have side effects. There is some code in Hugging Face
+         # that calls this function to get the vocab files, but it doesn't
+         # seem to access them (or at least checks for their existence before
+         # accessing them).
+         return (None, None)  # type: ignore
+
+     def sanitize_special_tokens(self) -> int:
+         """Make sure that all the special tokens attributes of the tokenizer
+         (`tokenizer.mask_token`, `tokenizer.cls_token`, etc.) are in the
+         vocabulary.
+
+         Add the missing ones to the vocabulary if needed.
+
+         Return:
+             `int`: The number of tokens added to the vocabulary during the operation.
+         """
+         actual_new_tokens = []
+         for token in self.all_special_tokens_extended:
+             encoded = self.encoding.encode(token, allowed_special='all')
+             if len(encoded) > 1:
+                 actual_new_tokens.append(token)
+
+         return self.add_tokens(actual_new_tokens, special_tokens=True)
+
+
+ TiktokenTokenizerWrapper.register_for_auto_class()
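
A minimal usage sketch of the wrapper above, driven directly rather than through AutoTokenizer. The encoding name 'cl100k_base' is an illustrative choice; the shipped tokenizer_config.json below pins model_name "gpt-4", which resolves to the same encoding:

    # Sketch only: instantiate the wrapper standalone. In practice the class is
    # loaded via AutoTokenizer's auto_map, since the module file is named
    # tiktoken.py and would shadow the real tiktoken package on a plain import.
    tok = TiktokenTokenizerWrapper(encoding_name='cl100k_base')

    ids = tok('Hallo Welt!')['input_ids']  # standard HF __call__ API
    print(ids)
    print(tok.decode(ids))  # round-trips to 'Hallo Welt!'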
tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "100257": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tiktoken.TiktokenTokenizerWrapper",
+       null
+     ]
+   },
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "encoding_name": null,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 1000000000000000019884624838656,
+   "model_name": "gpt-4",
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "TiktokenTokenizerWrapper",
+   "unk_token": "<|endoftext|>",
+   "use_default_system_prompt": false
+ }
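
The auto_map entry above is what wires this configuration to the wrapper: AutoTokenizer imports TiktokenTokenizerWrapper from the repo's tiktoken.py and instantiates it with model_name "gpt-4" (i.e. tiktoken's cl100k_base encoding). A minimal loading sketch; the repo id is a placeholder, substitute the actual Hub path:

    from transformers import AutoTokenizer

    # trust_remote_code is required so transformers will execute the repo's
    # tiktoken.py, as declared in auto_map. 'faidrap/dclm-german' is a
    # hypothetical repo id.
    tok = AutoTokenizer.from_pretrained(
        'faidrap/dclm-german',
        trust_remote_code=True,
    )

    print(tok.eos_token_id)        # 100257, the <|endoftext|> id registered above
    print(tok.encode('Guten Tag'))  # ids from the tiktoken vocabulary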