import json
import os
from typing import List, Optional, Union, Dict, Any, Tuple

from transformers import PreTrainedTokenizerFast
from transformers.tokenization_utils_base import AddedToken
from transformers.utils import logging

logger = logging.get_logger(__name__)


class TessarTokenizer(PreTrainedTokenizerFast):
    """
    Tessar tokenizer implementation for Hugging Face Transformers.

    This custom tokenizer extends ``PreTrainedTokenizerFast`` with specialized
    configuration and tokenization methods for the Tessar model.
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"}

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="<unk>",
        sep_token="</s>",
        pad_token="<pad>",
        cls_token="<s>",
        mask_token="<mask>",
        bos_token="<s>",
        eos_token="</s>",
        max_cell_length=15,
        **kwargs
    ):
        """
        Initialize the Tessar tokenizer with its special-token configuration.

        Args:
            vocab_file (str, optional): Path to the vocabulary file.
            tokenizer_file (str, optional): Path to the pre-trained tokenizer file.
            do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
            max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
        """
        special_tokens_dict = {
            "unk_token": unk_token,
            "sep_token": sep_token,
            "pad_token": pad_token,
            "cls_token": cls_token,
            "mask_token": mask_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
        }

        # Wrap plain strings in AddedToken so they are registered as special tokens.
        for token_name, token_value in special_tokens_dict.items():
            if isinstance(token_value, str):
                special_tokens_dict[token_name] = AddedToken(
                    token_value, lstrip=False, rstrip=False, normalized=True, special=True
                )

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            **special_tokens_dict,
            **kwargs
        )

        self.do_lower_case = do_lower_case
        self.max_cell_length = max_cell_length

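    # Construction sketch (assumes a locally available tokenizer.json produced by
    # the `tokenizers` library; the file path is hypothetical):
    #   tok = TessarTokenizer(tokenizer_file="tokenizer.json", max_cell_length=15)
    #   tok("hello world")
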
    @property
    def vocab_size(self) -> int:
        """
        Return the size of the vocabulary.

        Returns:
            int: The vocabulary size
        """
        return len(self.get_vocab())

    def get_vocab(self) -> Dict[str, int]:
        """
        Return the vocabulary mapping, including added tokens.

        Returns:
            Dict[str, int]: Mapping from token string to token id
        """
        return self.backend_tokenizer.get_vocab(with_added_tokens=True)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]:
        """
        Save the tokenizer vocabulary and special tokens files.

        Args:
            save_directory (str): Directory to save the vocabulary
            filename_prefix (str, optional): Prefix for the saved files

        Returns:
            tuple: Paths to the saved files
        """
        os.makedirs(save_directory, exist_ok=True)

        prefix = f"{filename_prefix}-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, f"{prefix}vocab.json")
        tokenizer_file = os.path.join(save_directory, f"{prefix}tokenizer.json")
        special_tokens_file = os.path.join(save_directory, f"{prefix}special_tokens.json")

        # Write the vocabulary mapping.
        vocab_dict = self.get_vocab()
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(vocab_dict, f, ensure_ascii=False, indent=2)

        # Serialize the backend (fast) tokenizer if one is available.
        if hasattr(self, "backend_tokenizer") and hasattr(self.backend_tokenizer, "save"):
            self.backend_tokenizer.save(tokenizer_file)

        # Collect special tokens and Tessar-specific settings.
        special_tokens_config = {
            "unk_token": self.unk_token,
            "sep_token": self.sep_token,
            "pad_token": self.pad_token,
            "cls_token": self.cls_token,
            "mask_token": self.mask_token,
            "bos_token": self.bos_token,
            "eos_token": self.eos_token,
            "do_lower_case": self.do_lower_case,
            "max_cell_length": self.max_cell_length,
        }

        # AddedToken instances are not JSON-serializable; store their string content.
        for key, token in special_tokens_config.items():
            if hasattr(token, "content"):
                special_tokens_config[key] = token.content

        with open(special_tokens_file, "w", encoding="utf-8") as f:
            json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)

        return (vocab_file, tokenizer_file, special_tokens_file)

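    # Usage sketch (hypothetical output directory and prefix):
    #   tokenizer.save_vocabulary("./tessar_tokenizer", filename_prefix="tessar")
    #   -> ("./tessar_tokenizer/tessar-vocab.json",
    #       "./tessar_tokenizer/tessar-tokenizer.json",
    #       "./tessar_tokenizer/tessar-special_tokens.json")
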
    def _tokenize(self, text: str) -> List[str]:
        """
        Custom tokenization method.

        Args:
            text (str): Input text to tokenize

        Returns:
            List[str]: List of tokens
        """
        if self.do_lower_case:
            text = text.lower()

        # Fast tokenizers do not implement ``_tokenize``; delegate to ``tokenize``,
        # which runs the backend tokenizer without adding special tokens.
        tokens = super().tokenize(text)

        # Truncate overly long cells to the configured maximum.
        if self.max_cell_length > 0:
            tokens = tokens[: self.max_cell_length]

        return tokens

    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Prepare tokenized inputs for the model.

        This is a thin wrapper that delegates to the parent implementation.

        Args:
            ids (List[int]): List of input token ids
            pair_ids (Optional[List[int]], optional): List of pair token ids

        Returns:
            dict: Prepared model inputs
        """
        return super().prepare_for_model(
            ids,
            pair_ids=pair_ids,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs
        )

    def batch_encode_tables(
        self,
        tables: List[List[List[str]]],
        max_length: Optional[int] = None,
        padding: Union[bool, str] = True,
        truncation: Union[bool, str] = True,
        return_tensors: Optional[str] = "pt",
        **kwargs
    ) -> Dict[str, Any]:
        """
        Encode a batch of tables for table question answering.

        Each table is flattened into a single string: every cell is prefixed with a
        ``[CELL_{row}_{col}]`` marker and every row is terminated with ``[ROW_END]``.
        These markers are plain text; they are only treated as single tokens if they
        have been added to the vocabulary.

        Args:
            tables (List[List[List[str]]]): List of tables, where each table is a list of rows,
                and each row is a list of cell values
            max_length (Optional[int], optional): Maximum sequence length
            padding (Union[bool, str], optional): Padding strategy
            truncation (Union[bool, str], optional): Truncation strategy
            return_tensors (Optional[str], optional): Type of tensors to return

        Returns:
            Dict[str, Any]: Encoded table batch
        """
        flattened_inputs = []

        for table in tables:
            table_text = ""
            for row_idx, row in enumerate(table):
                for col_idx, cell in enumerate(row):
                    if self.do_lower_case:
                        cell = cell.lower()
                    table_text += f"[CELL_{row_idx}_{col_idx}] {cell} "
                # Mark the end of each row.
                table_text += "[ROW_END] "
            flattened_inputs.append(table_text.strip())

        return self(
            flattened_inputs,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            return_tensors=return_tensors,
            **kwargs
        )


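# Illustrative flattening (derived from batch_encode_tables above): for the table
# [["name", "age"], ["alice", "30"]] with do_lower_case=True, the string handed to
# the tokenizer is:
#   "[CELL_0_0] name [CELL_0_1] age [ROW_END] [CELL_1_0] alice [CELL_1_1] 30 [ROW_END]"

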
def load_tessar_tokenizer(pretrained_model_name_or_path: str, **kwargs):
    """
    Load a pretrained Tessar tokenizer.

    Args:
        pretrained_model_name_or_path (str): Path to the pretrained model
        **kwargs: Additional arguments to pass to from_pretrained

    Returns:
        TessarTokenizer: Initialized tokenizer
    """
    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)


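# Usage sketch: if this class ships as custom code on the Hub, loading it through
# the auto classes may additionally require trust_remote_code=True (an assumption;
# it depends on how the repository is configured):
#   tokenizer = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")

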
from transformers import AutoTokenizer

# AutoTokenizer.register expects a configuration class (not a repository id), and a
# fast tokenizer must be passed via ``fast_tokenizer_class``. ``TessarConfig`` is
# assumed to live in a companion configuration module (hypothetical import path).
try:
    from configuration_tessar import TessarConfig

    AutoTokenizer.register(TessarConfig, fast_tokenizer_class=TessarTokenizer)
except ImportError:
    logger.warning("TessarConfig is unavailable; skipping AutoTokenizer registration.")


if __name__ == "__main__":
    try:
        tokenizer = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")
        print("Tokenizer loaded successfully!")

        auto_tokenizer = AutoTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
        print("AutoTokenizer loaded successfully!")

        text = "Hello, how are you doing today?"
        encoded = tokenizer(text, return_tensors="pt")
        print("Encoded Input:", encoded)

        table = [
            ["Header1", "Header2", "Header3"],
            ["Value1", "Value2", "Value3"],
            ["Value4", "Value5", "Value6"],
        ]

        encoded_table = tokenizer.batch_encode_tables([table], return_tensors="pt")
        print("Encoded Table:", encoded_table)

    except Exception as e:
        print(f"Error loading tokenizer: {e}")