# File size: 12,289 Bytes

import json
import os
from typing import List, Optional, Union, Dict, Any, Tuple

from transformers import PreTrainedTokenizerFast
from transformers.tokenization_utils_base import AddedToken
from transformers.utils import logging

# Module-level logger, created through the Transformers logging utilities
# imported above and named after this module.
logger = logging.get_logger(__name__)

class TessarTokenizer(PreTrainedTokenizerFast):
    """
    Tessar Tokenizer implementation for Hugging Face Transformers

    Extends ``PreTrainedTokenizerFast`` with a default set of special tokens,
    two Tessar-specific settings (``do_lower_case`` and ``max_cell_length``)
    and a helper, ``batch_encode_tables``, that flattens tables into text
    before encoding them.
    """

    model_input_names = ['input_ids', 'attention_mask']
    vocab_files_names = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"}

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="<unk>",
        sep_token="</s>",
        pad_token="<pad>",
        cls_token="<s>",
        mask_token="<mask>",
        bos_token="<s>",
        eos_token="</s>",
        max_cell_length=15,
        **kwargs
    ):
        """
        Initialize the Tessar Tokenizer with specific token configurations

        Args:
            vocab_file (str, optional): Path to the vocabulary file
            tokenizer_file (str, optional): Path to the pre-trained tokenizer file
            do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
            unk_token, sep_token, pad_token, cls_token, mask_token, bos_token, eos_token
                (str or AddedToken, optional): Special tokens. Plain strings are
                wrapped in ``AddedToken`` before being handed to the parent class.
            max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizerFast.__init__``.
        """
        raw_special_tokens = {
            "unk_token": unk_token,
            "sep_token": sep_token,
            "pad_token": pad_token,
            "cls_token": cls_token,
            "mask_token": mask_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
        }

        # Wrap plain strings in AddedToken; values that are already
        # AddedToken objects (or None) are passed through untouched.
        special_tokens = {
            name: (
                AddedToken(value, lstrip=False, rstrip=False, normalized=True, special=True)
                if isinstance(value, str)
                else value
            )
            for name, value in raw_special_tokens.items()
        }

        # The Tessar-specific settings are forwarded to the parent as well so
        # that they become part of the recorded init kwargs and survive a
        # save_pretrained / from_pretrained round trip.
        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            max_cell_length=max_cell_length,
            **special_tokens,
            **kwargs
        )

        # Custom Tessar-specific attributes
        self.do_lower_case = do_lower_case
        self.max_cell_length = max_cell_length

    @property
    def vocab_size(self) -> int:
        """
        Return the size of vocabulary

        The count is taken from ``get_vocab()`` and therefore includes added
        tokens.

        Returns:
            int: The vocabulary size
        """
        # Go through get_vocab() rather than the inherited ``vocab`` property
        # so the lookup cannot loop back into this class.
        return len(self.get_vocab())

    def get_vocab(self) -> Dict[str, int]:
        """
        Return the vocabulary mapping

        Returns:
            Dict[str, int]: A fresh copy of the token -> id mapping
        """
        # The parent's ``vocab`` property is implemented on top of
        # ``get_vocab()``; reading ``self.vocab`` here would recurse forever,
        # so ask the parent implementation directly.
        return dict(super().get_vocab())

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]:
        """
        Save the tokenizer vocabulary and special tokens file

        Writes ``vocab.json`` (token -> id mapping), ``tokenizer.json`` (the
        backend tokenizer, when it exposes ``save``) and
        ``special_tokens.json`` (special tokens plus ``do_lower_case`` and
        ``max_cell_length``) into ``save_directory``.

        Args:
            save_directory (str): Directory to save the vocabulary; created if missing
            filename_prefix (str, optional): Prefix for the saved files, joined with "-"

        Returns:
            tuple: Paths to the vocab, tokenizer and special-tokens files
        """
        # Ensure the save directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Shared "<prefix>-" part of every file name (empty when no prefix)
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, f"{prefix}vocab.json")
        tokenizer_file = os.path.join(save_directory, f"{prefix}tokenizer.json")
        special_tokens_file = os.path.join(save_directory, f"{prefix}special_tokens.json")

        # Save vocabulary
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(self.get_vocab(), f, ensure_ascii=False, indent=2)

        # Save the tokenizer file if the backend supports it
        if hasattr(self, "backend_tokenizer") and hasattr(self.backend_tokenizer, "save"):
            self.backend_tokenizer.save(tokenizer_file)

        # Collect the special tokens configuration
        raw_config = {
            "unk_token": self.unk_token,
            "sep_token": self.sep_token,
            "pad_token": self.pad_token,
            "cls_token": self.cls_token,
            "mask_token": self.mask_token,
            "bos_token": self.bos_token,
            "eos_token": self.eos_token,
            "do_lower_case": self.do_lower_case,
            "max_cell_length": self.max_cell_length
        }

        # Token objects carry their text in ``content``; unwrap them so the
        # configuration is JSON serializable. Other values are kept as-is.
        special_tokens_config = {
            key: getattr(value, "content", value) for key, value in raw_config.items()
        }

        with open(special_tokens_file, 'w', encoding='utf-8') as f:
            json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)

        return (vocab_file, tokenizer_file, special_tokens_file)

    def _tokenize(self, text: str) -> List[str]:
        """
        Custom tokenization method

        Lower-cases ``text`` when ``do_lower_case`` is set, tokenizes it and,
        when ``max_cell_length`` is positive, keeps at most that many tokens.

        Args:
            text (str): Input text to tokenize

        Returns:
            List[str]: List of tokens
        """
        # Apply lowercase if required
        if self.do_lower_case:
            text = text.lower()

        # PreTrainedTokenizerFast provides no ``_tokenize`` to delegate to, so
        # use the parent's public ``tokenize`` instead.
        tokens = super().tokenize(text)

        # Optional: Add custom cell-length truncation
        if self.max_cell_length > 0:
            tokens = tokens[:self.max_cell_length]

        return tokens

    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Prepare tokenized inputs for the model

        Currently a pure pass-through: every argument is forwarded unchanged
        to the parent implementation. It exists as the hook where
        Tessar-specific preparation (e.g. table handling) can be added.

        Args:
            ids (List[int]): List of input token ids
            pair_ids (Optional[List[int]], optional): List of pair token ids
            **kwargs: Any further arguments accepted by the parent method

        Returns:
            dict: Prepared model inputs, as returned by the parent class
        """
        return super().prepare_for_model(
            ids,
            pair_ids=pair_ids,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs
        )

    def batch_encode_tables(
        self,
        tables: List[List[List[str]]],
        max_length: Optional[int] = None,
        padding: Union[bool, str] = True,
        truncation: Union[bool, str] = True,
        return_tensors: Optional[str] = "pt",
        **kwargs
    ) -> Dict[str, Any]:
        """
        Encode a batch of tables for table question answering

        Each table is flattened into one string of the form
        ``[CELL_<row>_<col>] <cell> ... [ROW_END] ...`` and the resulting
        strings are encoded together by calling the tokenizer itself.

        Args:
            tables (List[List[List[str]]]): List of tables, where each table is a list of rows,
                                          and each row is a list of cell values.
                                          Non-string cell values are converted with ``str()``.
            max_length (Optional[int], optional): Maximum sequence length
            padding (Union[bool, str], optional): Padding strategy
            truncation (Union[bool, str], optional): Truncation strategy
            return_tensors (Optional[str], optional): Type of tensors to return
            **kwargs: Forwarded to the tokenizer call

        Returns:
            Dict[str, Any]: Encoded table batch
        """
        flattened_inputs = []

        for table in tables:
            # Collect the pieces and join once instead of growing a string
            # with repeated ``+=``.
            pieces = []

            for row_idx, row in enumerate(table):
                for col_idx, cell in enumerate(row):
                    # Coerce to text first so numeric cells behave the same
                    # whether or not lower-casing is enabled.
                    cell_text = cell if isinstance(cell, str) else str(cell)
                    if self.do_lower_case:
                        cell_text = cell_text.lower()

                    # Add cell with position information
                    pieces.append(f"[CELL_{row_idx}_{col_idx}] {cell_text}")

                # Add row separator
                pieces.append("[ROW_END]")

            flattened_inputs.append(" ".join(pieces).strip())

        # Encode the flattened text inputs
        return self(
            flattened_inputs,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            return_tensors=return_tensors,
            **kwargs
        )


def load_tessar_tokenizer(pretrained_model_name_or_path: str, **kwargs):
    """
    Build a TessarTokenizer from pretrained files

    Thin convenience wrapper around ``TessarTokenizer.from_pretrained``.

    Args:
        pretrained_model_name_or_path (str): Identifier or path handed to
            ``from_pretrained``
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``

    Returns:
        TessarTokenizer: The tokenizer returned by ``from_pretrained``
    """
    tokenizer = TessarTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        **kwargs,
    )
    return tokenizer


# Register the tokenizer with the Transformers library.
# AutoTokenizer.register is keyed by a configuration class (not by a Hub
# repository id), and a fast tokenizer must be supplied through the
# ``fast_tokenizer_class`` keyword; passing it positionally puts it in the
# slow-tokenizer slot, which the library rejects with a ValueError.
from transformers import AutoTokenizer, PretrainedConfig


class TessarConfig(PretrainedConfig):
    """
    Minimal configuration class used as the registry key for TessarTokenizer.

    NOTE(review): the ``model_type`` value is an assumption - if the project
    already ships a Tessar configuration class, register that one instead.
    """

    model_type = "tessar"


AutoTokenizer.register(TessarConfig, fast_tokenizer_class=TessarTokenizer)


# Demonstration: runs only when this file is executed as a script
if __name__ == "__main__":
    # Inputs used by the demonstration below
    sample_text = "Hello, how are you doing today?"
    sample_table = [
        ["Header1", "Header2", "Header3"],
        ["Value1", "Value2", "Value3"],
        ["Value4", "Value5", "Value6"]
    ]

    try:
        # Route 1: load through the helper defined in this module
        tessar_tok = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")
        print("Tokenizer loaded successfully!")

        # Route 2: load through AutoTokenizer, relying on the registration
        # performed at import time
        tessar_auto_tok = AutoTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
        print("AutoTokenizer loaded successfully!")

        # Encode a plain sentence
        text_encoding = tessar_tok(sample_text, return_tensors="pt")
        print("Encoded Input:", text_encoding)

        # Encode a one-table batch
        table_encoding = tessar_tok.batch_encode_tables([sample_table], return_tensors="pt")
        print("Encoded Table:", table_encoding)

    except Exception as e:
        # Any failure in the demonstration is reported instead of raised
        print(f"Error loading tokenizer: {e}")