"""tessar_tokenizer.py

Custom Tessar tokenizer for Hugging Face Transformers.
"""
import json
import os
from typing import List, Optional, Union, Dict, Any, Tuple
from transformers import PreTrainedTokenizerFast
from transformers.tokenization_utils_base import AddedToken
from transformers.utils import logging
logger = logging.get_logger(__name__)
class TessarTokenizer(PreTrainedTokenizerFast):
"""
Tessar Tokenizer implementation for Hugging Face Transformers
This custom tokenizer extends the PreTrainedTokenizerFast with specialized
configuration and tokenization methods for the Tessar model.
"""
model_input_names = ['input_ids', 'attention_mask']
vocab_files_names = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"}
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="<unk>",
sep_token="</s>",
pad_token="<pad>",
cls_token="<s>",
mask_token="<mask>",
bos_token="<s>",
eos_token="</s>",
max_cell_length=15,
**kwargs
):
"""
Initialize the Tessar Tokenizer with specific token configurations
Args:
vocab_file (str, optional): Path to the vocabulary file
tokenizer_file (str, optional): Path to the pre-trained tokenizer file
do_lower_case (bool, optional): Whether to lowercase the input. Defaults to True.
max_cell_length (int, optional): Maximum length for cell tokenization. Defaults to 15.
"""
# Prepare special tokens
special_tokens_dict = {
"unk_token": unk_token,
"sep_token": sep_token,
"pad_token": pad_token,
"cls_token": cls_token,
"mask_token": mask_token,
"bos_token": bos_token,
"eos_token": eos_token,
}
# Convert string tokens to AddedToken objects if they're not already
for token_name, token_value in special_tokens_dict.items():
if isinstance(token_value, str):
special_tokens_dict[token_name] = AddedToken(token_value,
lstrip=False,
rstrip=False,
normalized=True,
special=True)
# Call parent constructor
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
**special_tokens_dict,
**kwargs
)
# Custom Tessar-specific attributes
self.do_lower_case = do_lower_case
self.max_cell_length = max_cell_length
    @property
    def vocab_size(self) -> int:
        """
        Return the size of the vocabulary
        Returns:
            int: The vocabulary size
        """
        return len(self.get_vocab())
    def get_vocab(self) -> Dict[str, int]:
        """
        Return the vocabulary mapping
        Returns:
            Dict[str, int]: The vocabulary mapping
        """
        # Query the backend tokenizer directly; returning `dict(self.vocab)` would
        # recurse, because the base `vocab` property is defined in terms of `get_vocab()`.
        return self.backend_tokenizer.get_vocab(with_added_tokens=True)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]:
"""
Save the tokenizer vocabulary and special tokens file
Args:
save_directory (str): Directory to save the vocabulary
filename_prefix (str, optional): Prefix for the saved files
Returns:
tuple: Paths to the saved files
"""
# Ensure the save directory exists
os.makedirs(save_directory, exist_ok=True)
# Prepare file paths
vocab_file = os.path.join(
save_directory,
f"{filename_prefix + '-' if filename_prefix else ''}vocab.json"
)
# Save tokenizer file
tokenizer_file = os.path.join(
save_directory,
f"{filename_prefix + '-' if filename_prefix else ''}tokenizer.json"
)
# Save special tokens configuration
special_tokens_file = os.path.join(
save_directory,
f"{filename_prefix + '-' if filename_prefix else ''}special_tokens.json"
)
# Get vocabulary from tokenizer
vocab_dict = self.get_vocab()
# Save vocabulary
with open(vocab_file, 'w', encoding='utf-8') as f:
json.dump(vocab_dict, f, ensure_ascii=False, indent=2)
# Save the tokenizer file if it exists
if hasattr(self, "backend_tokenizer") and hasattr(self.backend_tokenizer, "save"):
self.backend_tokenizer.save(tokenizer_file)
# Save special tokens configuration
special_tokens_config = {
"unk_token": self.unk_token,
"sep_token": self.sep_token,
"pad_token": self.pad_token,
"cls_token": self.cls_token,
"mask_token": self.mask_token,
"bos_token": self.bos_token,
"eos_token": self.eos_token,
"do_lower_case": self.do_lower_case,
"max_cell_length": self.max_cell_length
}
# Convert token objects to strings for JSON serialization
for key, token in special_tokens_config.items():
if hasattr(token, "content"):
special_tokens_config[key] = token.content
with open(special_tokens_file, 'w', encoding='utf-8') as f:
json.dump(special_tokens_config, f, ensure_ascii=False, indent=2)
return (vocab_file, tokenizer_file, special_tokens_file)
    def tokenize(self, text: str, **kwargs) -> List[str]:
        """
        Custom tokenization method
        Args:
            text (str): Input text to tokenize
        Returns:
            List[str]: List of tokens
        """
        # Apply lowercasing if required
        if self.do_lower_case:
            text = text.lower()
        # Use the parent tokenizer's tokenization method. Note that
        # PreTrainedTokenizerFast does not implement the slow-tokenizer
        # `_tokenize` hook, so `tokenize` is the method to override here.
        tokens = super().tokenize(text, **kwargs)
        # Optional: apply cell-length truncation
        if self.max_cell_length > 0:
            tokens = tokens[:self.max_cell_length]
        return tokens
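    # For example, with the defaults (do_lower_case=True, max_cell_length=15),
    # tokenizer.tokenize("Gross Domestic Product") lowercases the text first and
    # returns at most the first 15 sub-word tokens produced by the backend
    # tokenizer (the exact tokens depend on the trained vocabulary).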
def prepare_for_model(
self,
ids: List[int],
pair_ids: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str] = False,
truncation: Union[bool, str] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> Dict[str, Any]:
"""
Prepare tokenized inputs for the model
Args:
ids (List[int]): List of input token ids
pair_ids (Optional[List[int]], optional): List of pair token ids
Returns:
dict: Prepared model inputs
"""
# Implement any Tessar-specific model preparation logic
# For example, you might want to handle table data differently
return super().prepare_for_model(
ids,
pair_ids=pair_ids,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs
)
def batch_encode_tables(
self,
tables: List[List[List[str]]],
max_length: Optional[int] = None,
padding: Union[bool, str] = True,
truncation: Union[bool, str] = True,
return_tensors: Optional[str] = "pt",
**kwargs
) -> Dict[str, Any]:
"""
Encode a batch of tables for table question answering
Args:
tables (List[List[List[str]]]): List of tables, where each table is a list of rows,
and each row is a list of cell values
max_length (Optional[int], optional): Maximum sequence length
padding (Union[bool, str], optional): Padding strategy
truncation (Union[bool, str], optional): Truncation strategy
return_tensors (Optional[str], optional): Type of tensors to return
Returns:
Dict[str, Any]: Encoded table batch
"""
# Flatten tables into text sequences with appropriate format
flattened_inputs = []
for table in tables:
# Convert table to a flattened text representation
            # This is a simplified example; the real implementation depends on your specific format
table_text = ""
for row_idx, row in enumerate(table):
for col_idx, cell in enumerate(row):
# Apply cell-level processing
if self.do_lower_case:
cell = cell.lower()
# Add cell with position information
table_text += f"[CELL_{row_idx}_{col_idx}] {cell} "
# Add row separator
table_text += "[ROW_END] "
flattened_inputs.append(table_text.strip())
# Encode the flattened text inputs
return self(
flattened_inputs,
max_length=max_length,
padding=padding,
truncation=truncation,
return_tensors=return_tensors,
**kwargs
)
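    # Example of the flattened text produced above (with do_lower_case=True) for
    # the single-row table [["Name", "Age"]]:
    #   "[CELL_0_0] name [CELL_0_1] age [ROW_END]"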
def load_tessar_tokenizer(pretrained_model_name_or_path: str, **kwargs):
"""
Load a pretrained Tessar tokenizer
Args:
pretrained_model_name_or_path (str): Path to the pretrained model
**kwargs: Additional arguments to pass to from_pretrained
Returns:
TessarTokenizer: Initialized tokenizer
"""
return TessarTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
# Register the tokenizer with the Transformers library.
# NOTE: AutoTokenizer.register expects a configuration class as its first
# argument, not a repository id string. The import below assumes a companion
# TessarConfig class exists (the module name is an assumption about the
# surrounding project layout).
from transformers import AutoTokenizer

try:
    from configuration_tessar import TessarConfig  # assumed companion config module
    AutoTokenizer.register(TessarConfig, fast_tokenizer_class=TessarTokenizer)
except ImportError:
    logger.warning("TessarConfig not found; skipping AutoTokenizer registration.")
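# When this file ships as custom code inside a Hub repository, an alternative to
# explicit registration is an "auto_map" entry in tokenizer_config.json, loaded
# with AutoTokenizer.from_pretrained(..., trust_remote_code=True).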
# Example usage
if __name__ == "__main__":
# Example of loading a pretrained tokenizer
try:
# Method 1: Direct loading with the class
tokenizer = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")
print("Tokenizer loaded successfully!")
        # Method 2: Loading through AutoTokenizer
        # This works once the tokenizer class has been registered above
        # (or exposed to AutoTokenizer via the repository configuration)
auto_tokenizer = AutoTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
print("AutoTokenizer loaded successfully!")
# Basic tokenization example
text = "Hello, how are you doing today?"
encoded = tokenizer(text, return_tensors="pt")
print("Encoded Input:", encoded)
# Example with table data
table = [
["Header1", "Header2", "Header3"],
["Value1", "Value2", "Value3"],
["Value4", "Value5", "Value6"]
]
# Example of batch encoding tables
encoded_table = tokenizer.batch_encode_tables([table], return_tensors="pt")
print("Encoded Table:", encoded_table)
except Exception as e:
print(f"Error loading tokenizer: {e}")