"""tessar_tokenizer.py

Custom Tessar tokenizer for Hugging Face Transformers.
"""
import json
import os
from typing import List, Optional, Union, Dict, Any, Tuple
from transformers import PreTrainedTokenizerFast
from transformers.tokenization_utils_base import AddedToken
from transformers.utils import logging
logger = logging.get_logger(__name__)
class TessarTokenizer(PreTrainedTokenizerFast):
"""
Tessar Tokenizer implementation for Hugging Face Transformers
This custom tokenizer extends the PreTrainedTokenizerFast with specialized
configuration and tokenization methods for the Tessar model.
"""
model_input_names = ['input_ids', 'attention_mask']
vocab_files_names = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"}
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="<unk>",
sep_token="</s>",
pad_token="<pad>",
cls_token="<s>",
mask_token="<mask>",
bos_token="<s>",
eos_token="</s>",
max_cell_length=15,
**kwargs
):
"""
Initialize the Tessar Tokenizer with specific token configurations
Args:
vocab_file (str, optional): Path to the vocabulary file
tokenizer_file (str, optional): Path to the pre-trained tokenizer file
do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
"""
# Prepare special tokens
special_tokens_dict = {
"unk_token": unk_token,
"sep_token": sep_token,
"pad_token": pad_token,
"cls_token": cls_token,
"mask_token": mask_token,
"bos_token": bos_token,
"eos_token": eos_token,
}
# Convert string tokens to AddedToken objects if they're not already
for token_name, token_value in special_tokens_dict.items():
if isinstance(token_value, str):
special_tokens_dict[token_name] = AddedToken(token_value,
lstrip=False,
rstrip=False,
normalized=True,
special=True)
# Call parent constructor
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
**special_tokens_dict,
**kwargs
)
# Custom Tessar-specific attributes
self.do_lower_case = do_lower_case
self.max_cell_length = max_cell_length
    @property
    def vocab_size(self) -> int:
        """
        Return the size of the vocabulary
        Returns:
            int: The vocabulary size
        """
        return len(self.get_vocab())
    def get_vocab(self) -> Dict[str, int]:
        """
        Return the vocabulary mapping
        Returns:
            Dict[str, int]: The vocabulary mapping
        """
        # Query the backend tokenizer directly; returning `dict(self.vocab)` would
        # recurse, because the base `vocab` property is defined in terms of `get_vocab()`.
        return self.backend_tokenizer.get_vocab(with_added_tokens=True)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]:
"""
Save the tokenizer vocabulary and special tokens file
Args:
save_directory (str): Directory to save the vocabulary
filename_prefix (str, optional): Prefix for the saved files
Returns:
tuple: Paths to the saved files
"""
# Ensure the save directory exists
os.makedirs(save_directory, exist_ok=True)
# Prepare file paths
vocab_file = os.path.join(
save_directory,
f"{filename_prefix + '-' if filename_prefix else ''}vocab.json"
)
# Save tokenizer file
tokenizer_file = os.path.join(
save_directory,
f"{filename_prefix + '-' if filename_prefix else ''}tokenizer.json"
)
# Save special tokens configuration
special_tokens_file = os.path.join(
save_directory,
f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json"
)
# Get vocabulary from tokenizer
vocab_dict = self.get_vocab()
# Save vocabulary
with open(vocab_file, 'w', encoding='utf-8') as f:
json.dump(vocab_dict, f, ensure_ascii=False, indent=2)
# Save the tokenizer file if it exists
if hasattr(self, "backend_tokenizer") and hasattr(self.backend_tokenizer, "save"):
self.backend_tokenizer.save(tokenizer_file)
# Save special tokens configuration
special_tokens_config = {
"unk_token": self.unk_token,
"sep_token": self.sep_token,
"pad_token": self.pad_token,
"cls_token": self.cls_token,
"mask_token": self.mask_token,
"bos_token": self.bos_token,
"eos_token": self.eos_token,
"do_lower_case": self.do_lower_case,
"max_cell_length": self.max_cell_length
}
# Convert token objects to strings for JSON serialization
for key, token in special_tokens_config.items():
if hasattr(token, "content"):
special_tokens_config[key] = token.content
with open(special_tokens_file, 'w', encoding='utf-8') as f:
json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)
return (vocab_file, tokenizer_file, special_tokens_file)
    def tokenize(self, text: str, **kwargs) -> List[str]:
        """
        Custom tokenization method
        Args:
            text (str): Input text to tokenize
        Returns:
            List[str]: List of tokens
        """
        # Apply lowercasing if required
        if self.do_lower_case:
            text = text.lower()
        # Use the parent tokenizer's tokenization method. Note that
        # PreTrainedTokenizerFast does not implement the slow-tokenizer
        # `_tokenize` hook, so `tokenize` is the method to override here.
        tokens = super().tokenize(text, **kwargs)
        # Optional: apply cell-length truncation
        if self.max_cell_length > 0:
            tokens = tokens[:self.max_cell_length]
        return tokens
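    # For example, with the defaults (do_lower_case=True, max_cell_length=15),
    # tokenizer.tokenize("Gross Domestic Product") lowercases the text first and
    # returns at most the first 15 sub-word tokens produced by the backend
    # tokenizer (the exact tokens depend on the trained vocabulary).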
def prepare_for_model(
self,
ids: List[int],
pair_ids: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str] = False,
truncation: Union[bool, str] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> Dict[str, Any]:
"""
Prepare tokenized inputs for the model
Args:
ids (List[int]): List of input token ids
pair_ids (Optional[List[int]], optional): List of pair token ids
Returns:
dict: Prepared model inputs
"""
# Implement any Tessar-specific model preparation logic
# For example, you might want to handle table data differently
return super().prepare_for_model(
ids,
pair_ids=pair_ids,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs
)
def batch_encode_tables(
self,
tables: List[List[List[str]]],
max_length: Optional[int] = None,
padding: Union[bool, str] = True,
truncation: Union[bool, str] = True,
return_tensors: Optional[str] = "pt",
**kwargs
) -> Dict[str, Any]:
"""
Encode a batch of tables for table question answering
Args:
tables (List[List[List[str]]]): List of tables, where each table is a list of rows,
and each row is a list of cell values
max_length (Optional[int], optional): Maximum sequence length
padding (Union[bool, str], optional): Padding strategy
truncation (Union[bool, str], optional): Truncation strategy
return_tensors (Optional[str], optional): Type of tensors to return
Returns:
Dict[str, Any]: Encoded table batch
"""
# Flatten tables into text sequences with appropriate format
flattened_inputs = []
for table in tables:
# Convert table to a flattened text representation
            # This is a simplified example; the real implementation depends on your specific format
table_text = ""
for row_idx, row in enumerate(table):
for col_idx, cell in enumerate(row):
# Apply cell-level processing
if self.do_lower_case:
cell = cell.lower()
# Add cell with position information
table_text += f"[CELL_{row_idx}_{col_idx}] {cell} "
# Add row separator
table_text += "[ROW_END] "
flattened_inputs.append(table_text.strip())
# Encode the flattened text inputs
return self(
flattened_inputs,
max_length=max_length,
padding=padding,
truncation=truncation,
return_tensors=return_tensors,
**kwargs
)
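    # Example of the flattened text produced above (with do_lower_case=True) for
    # the single-row table [["Name", "Age"]]:
    #   "[CELL_0_0] name [CELL_0_1] age [ROW_END]"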
def load_tessar_tokenizer(pretrained_model_name_or_path: str, **kwargs):
"""
Load a pretrained Tessar tokenizer
Args:
pretrained_model_name_or_path (str): Path to the pretrained model
**kwargs: Additional arguments to pass to from_pretrained
Returns:
TessarTokenizer: Initialized tokenizer
"""
return TessarTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
# Register the tokenizer with the Transformers library.
# NOTE: AutoTokenizer.register expects a configuration class as its first
# argument, not a repository id string. The import below assumes a companion
# TessarConfig class exists (the module name is an assumption about the
# surrounding project layout).
from transformers import AutoTokenizer

try:
    from configuration_tessar import TessarConfig  # assumed companion config module
    AutoTokenizer.register(TessarConfig, fast_tokenizer_class=TessarTokenizer)
except ImportError:
    logger.warning("TessarConfig not found; skipping AutoTokenizer registration.")
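# When this file ships as custom code inside a Hub repository, an alternative to
# explicit registration is an "auto_map" entry in tokenizer_config.json, loaded
# with AutoTokenizer.from_pretrained(..., trust_remote_code=True).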
# Example usage
if __name__ == "__main__":
# Example of loading a pretrained tokenizer
try:
# Method 1: Direct loading with the class
tokenizer = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")
print("Tokenizer loaded successfully!")
        # Method 2: Loading through AutoTokenizer
        # This works once the tokenizer class has been registered above
        # (or exposed to AutoTokenizer via the repository configuration)
auto_tokenizer = AutoTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
print("AutoTokenizer loaded successfully!")
# Basic tokenization example
text = "Hello, how are you doing today?"
encoded = tokenizer(text, return_tensors="pt")
print("Encoded Input:", encoded)
# Example with table data
table = [
["Header1", "Header2", "Header3"],
["Value1", "Value2", "Value3"],
["Value4", "Value5", "Value6"]
]
# Example of batch encoding tables
encoded_table = tokenizer.batch_encode_tables([table], return_tensors="pt")
print("Encoded Table:", encoded_table)
except Exception as e:
print(f"Error loading tokenizer: {e}")