SVECTOR-OFFICIAL committed (verified)
Commit e717e47 · 1 Parent(s): f23df61

Update tessar_tokenizer.py

Files changed (1):
  1. tessar_tokenizer.py +175 -16
tessar_tokenizer.py CHANGED
@@ -1,9 +1,12 @@
 import json
 import os
-from typing import List, Optional, Union
+from typing import List, Optional, Union, Dict, Any, Tuple
 
 from transformers import PreTrainedTokenizerFast
+from transformers.tokenization_utils_base import AddedToken
+from transformers.utils import logging
 
+logger = logging.get_logger(__name__)
 
 class TessarTokenizer(PreTrainedTokenizerFast):
     """
@@ -14,6 +17,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
     """
 
     model_input_names = ['input_ids', 'attention_mask']
+    vocab_files_names = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"}
 
     def __init__(
         self,
@@ -40,7 +44,7 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
         """
         # Prepare special tokens
-        special_tokens = {
+        special_tokens_dict = {
             "unk_token": unk_token,
             "sep_token": sep_token,
             "pad_token": pad_token,
@@ -50,15 +54,20 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             "eos_token": eos_token,
         }
 
-        # Remove None values
-        special_tokens = {k: v for k, v in special_tokens.items() if v is not None}
+        # Convert string tokens to AddedToken objects if they're not already
+        for token_name, token_value in special_tokens_dict.items():
+            if isinstance(token_value, str):
+                special_tokens_dict[token_name] = AddedToken(token_value,
+                                                             lstrip=False,
+                                                             rstrip=False,
+                                                             normalized=True,
+                                                             special=True)
 
         # Call parent constructor
         super().__init__(
             vocab_file=vocab_file,
             tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            **special_tokens,
+            **special_tokens_dict,
             **kwargs
         )
 
@@ -66,7 +75,26 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         self.do_lower_case = do_lower_case
         self.max_cell_length = max_cell_length
 
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
+    @property
+    def vocab_size(self) -> int:
+        """
+        Return the size of vocabulary
+
+        Returns:
+            int: The vocabulary size
+        """
+        return len(self.vocab)
+
+    def get_vocab(self) -> Dict[str, int]:
+        """
+        Return the vocabulary mapping
+
+        Returns:
+            Dict[str, int]: The vocabulary mapping
+        """
+        return dict(self.vocab)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]:
         """
         Save the tokenizer vocabulary and special tokens file
 
@@ -86,15 +114,28 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             f"{filename_prefix + '-' if filename_prefix else ''}vocab.json"
         )
 
+        # Save tokenizer file
+        tokenizer_file = os.path.join(
+            save_directory,
+            f"{filename_prefix + '-' if filename_prefix else ''}tokenizer.json"
+        )
+
         # Save special tokens configuration
         special_tokens_file = os.path.join(
            save_directory,
            f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json"
         )
 
+        # Get vocabulary from tokenizer
+        vocab_dict = self.get_vocab()
+
         # Save vocabulary
         with open(vocab_file, 'w', encoding='utf-8') as f:
-            json.dump(self.vocab, f, ensure_ascii=False, indent=2)
+            json.dump(vocab_dict, f, ensure_ascii=False, indent=2)
+
+        # Save the tokenizer file if it exists
+        if hasattr(self, "backend_tokenizer") and hasattr(self.backend_tokenizer, "save"):
+            self.backend_tokenizer.save(tokenizer_file)
 
         # Save special tokens configuration
         special_tokens_config = {
@@ -109,10 +150,15 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             "max_cell_length": self.max_cell_length
         }
 
+        # Convert token objects to strings for JSON serialization
+        for key, token in special_tokens_config.items():
+            if hasattr(token, "content"):
+                special_tokens_config[key] = token.content
+
         with open(special_tokens_file, 'w', encoding='utf-8') as f:
             json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)
 
-        return (vocab_file, special_tokens_file)
+        return (vocab_file, tokenizer_file, special_tokens_file)
 
     def _tokenize(self, text: str) -> List[str]:
         """
@@ -132,7 +178,8 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         tokens = super()._tokenize(text)
 
         # Optional: Add custom cell-length truncation
-        tokens = tokens[:self.max_cell_length]
+        if self.max_cell_length > 0:
+            tokens = tokens[:self.max_cell_length]
 
         return tokens
 
@@ -140,8 +187,22 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         self,
         ids: List[int],
         pair_ids: Optional[List[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str] = False,
+        truncation: Union[bool, str] = False,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
         **kwargs
-    ) -> dict:
+    ) -> Dict[str, Any]:
         """
         Prepare tokenized inputs for the model
 
@@ -153,33 +214,131 @@ class TessarTokenizer(PreTrainedTokenizerFast):
             dict: Prepared model inputs
         """
         # Implement any Tessar-specific model preparation logic
-        # This method can be extended to add Tessar-specific preprocessing
-        return super().prepare_for_model(ids, pair_ids, **kwargs)
+        # For example, you might want to handle table data differently
+
+        return super().prepare_for_model(
+            ids,
+            pair_ids=pair_ids,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs
+        )
+
+    def batch_encode_tables(
+        self,
+        tables: List[List[List[str]]],
+        max_length: Optional[int] = None,
+        padding: Union[bool, str] = True,
+        truncation: Union[bool, str] = True,
+        return_tensors: Optional[str] = "pt",
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Encode a batch of tables for table question answering
+
+        Args:
+            tables (List[List[List[str]]]): List of tables, where each table is a list of rows,
+                and each row is a list of cell values
+            max_length (Optional[int], optional): Maximum sequence length
+            padding (Union[bool, str], optional): Padding strategy
+            truncation (Union[bool, str], optional): Truncation strategy
+            return_tensors (Optional[str], optional): Type of tensors to return
+
+        Returns:
+            Dict[str, Any]: Encoded table batch
+        """
+        # Flatten tables into text sequences with appropriate format
+        flattened_inputs = []
+
+        for table in tables:
+            # Convert table to a flattened text representation
+            # This is a simplified example - real implementation would depend on your specific format
+            table_text = ""
+
+            for row_idx, row in enumerate(table):
+                for col_idx, cell in enumerate(row):
+                    # Apply cell-level processing
+                    if self.do_lower_case:
+                        cell = cell.lower()
+
+                    # Add cell with position information
+                    table_text += f"[CELL_{row_idx}_{col_idx}] {cell} "
+
+                # Add row separator
+                table_text += "[ROW_END] "
+
+            flattened_inputs.append(table_text.strip())
+
+        # Encode the flattened text inputs
+        return self(
+            flattened_inputs,
+            max_length=max_length,
+            padding=padding,
+            truncation=truncation,
+            return_tensors=return_tensors,
+            **kwargs
+        )
 
 
-def load_tessar_tokenizer(pretrained_model_name_or_path: str):
+def load_tessar_tokenizer(pretrained_model_name_or_path: str, **kwargs):
     """
     Load a pretrained Tessar tokenizer
 
     Args:
         pretrained_model_name_or_path (str): Path to the pretrained model
+        **kwargs: Additional arguments to pass to from_pretrained
 
     Returns:
         TessarTokenizer: Initialized tokenizer
     """
-    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
+    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+
+# Register the tokenizer with the Transformers library
+from transformers import AutoTokenizer
+AutoTokenizer.register("SVECTOR-CORPORATION/Tessar-largest", TessarTokenizer)
 
 
-# Optionally, add some example usage
+# Example usage
 if __name__ == "__main__":
     # Example of loading a pretrained tokenizer
     try:
+        # Method 1: Direct loading with the class
         tokenizer = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")
         print("Tokenizer loaded successfully!")
 
+        # Method 2: Loading through AutoTokenizer
+        # This will work after the registration above
+        auto_tokenizer = AutoTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
+        print("AutoTokenizer loaded successfully!")
+
         # Basic tokenization example
         text = "Hello, how are you doing today?"
        encoded = tokenizer(text, return_tensors="pt")
        print("Encoded Input:", encoded)
+
+        # Example with table data
+        table = [
+            ["Header1", "Header2", "Header3"],
+            ["Value1", "Value2", "Value3"],
+            ["Value4", "Value5", "Value6"]
+        ]
+
+        # Example of batch encoding tables
+        encoded_table = tokenizer.batch_encode_tables([table], return_tensors="pt")
+        print("Encoded Table:", encoded_table)
+
     except Exception as e:
         print(f"Error loading tokenizer: {e}")