# multi_task_bert_model/data_processing.py
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import BertTokenizerFast, BertModel
from transformers import BertConfig, BertPreTrainedModel
import numpy as np
from typing import Dict, List, Union, Tuple
from utils import ner_labels_to_ids, intent_labels_to_ids, structure_data
class tokenized_dataset(Dataset):
"""
A Pytorch Dataset for tokenizing and encoding text data for a BERT-based model.
Args:
dataset (dict): A dictionary containing 'text', 'ner', and 'intent' keys.
tokenizer (BertTokenizerFast): A tokenizer for processing text input.
max_len (int, optionl): Maximum length of tokenized sequences (default: 128).
Attributes:
len (int): Number of samples in the dataset.
Methods:
__getitem__(self, index: int) -> Dict[str, torch.Tensor]:
Retrieve and preprocess a single sample from the dataset.
__len__(self) -> int:
Get the total number of samples int the dataset.
Returns:
Dict[str, torch.Tensor]: A dictionary containing tokenized and encoded text, NER and intent labels.
"""
    def __init__(self, dataset: Dict[str, List[str]], tokenizer: BertTokenizerFast, max_len: int = 128):
        self.len = len(dataset['text'])
        self.ner_labels_to_ids = ner_labels_to_ids()
        self.intent_labels_to_ids = intent_labels_to_ids()
        self.text = dataset['text']
        self.intent = dataset['intent']
        self.ner = dataset['entities']
        # keep the tokenizer instance as passed in; it is called per sample in __getitem__
        self.tokenizer = tokenizer
        self.max_len = max_len
def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        # step 1: get the sentence, its NER labels, and its intent label
sentence = self.text[index].strip()
intent_label = self.intent[index].strip()
ner_labels = self.ner[index]
# step 2: use tokenizer to encode a sentence (includes padding/truncation up to max length)
# BertTokenizerFast provides a handy "return_offsets_mapping" which highlights where each token starts and ends
encoding = self.tokenizer(
sentence,
return_offsets_mapping=True,
padding='max_length',
truncation=True,
max_length=self.max_len
)
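        # (the fast tokenizer returns input_ids, token_type_ids, attention_mask and
        # offset_mapping; each offset is a (start, end) character span in the original
        # sentence, and special/padding tokens get the span (0, 0))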
# step 3: create ner token labels only for first word pieces of each tokenized word
tokenized_ner_labels = [self.ner_labels_to_ids[label] for label in ner_labels]
        # initialise every token position with the ignore index -100 (one entry per word piece)
        encoded_ner_labels = np.full(len(encoding['offset_mapping']), -100, dtype=int)
        # assign a label only to the first word piece of each word; continuation pieces
        # and special/padding tokens (offset (0, 0)) keep -100 so the loss ignores them
        i = 0       # index into the per-word NER labels
        prev = -1   # character offset where the previous word piece ended
        for idx, mapping in enumerate(encoding['offset_mapping']):
            if mapping[0] == mapping[1] == 0:
                # special tokens and padding have a (0, 0) offset; skip them
                continue
            if mapping[0] != prev:
                # first word piece of a new word: assign the word's label
                encoded_ner_labels[idx] = tokenized_ner_labels[i]
                prev = mapping[1]
                i += 1
            else:
                # continuation word piece of the same word: keep -100
                prev = mapping[1]
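        # worked illustration (hypothetical split): if a word "tokenizing" were split into
        # "token" (0, 5) and "##izing" (5, 10), "token" starts where no previous piece
        # ended, so it takes the word's label; "##izing" starts exactly at prev == 5,
        # so it keeps -100 and is ignored by the loss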
        # map the intent label to its id
        tokenized_intent_label = self.intent_labels_to_ids[intent_label]
        # step 4: turn everything into PyTorch tensors
item = {key: torch.as_tensor(val) for key, val in encoding.items()}
item['ner_labels'] = torch.as_tensor(encoded_ner_labels)
item['intent_labels'] = torch.as_tensor(tokenized_intent_label)
return item
def __len__(self) -> int:
return self.len
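

# --- Usage sketch (illustrative only, not part of the original training pipeline) ---
# A minimal example of how this dataset could be consumed, assuming that the toy label
# names below exist in utils.ner_labels_to_ids / intent_labels_to_ids (the real label
# sets live in utils and are not shown here) and that 'bert-base-uncased' stands in for
# whichever checkpoint the model actually uses.
if __name__ == "__main__":
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

    # one sample: a list of per-word NER tags per sentence plus one intent per sentence
    toy_data = {
        "text": ["book a flight to paris"],
        "entities": [["O", "O", "O", "O", "B-city"]],  # hypothetical tag names
        "intent": ["book_flight"],                     # hypothetical intent name
    }

    dataset = tokenized_dataset(toy_data, tokenizer, max_len=128)
    loader = DataLoader(dataset, batch_size=1, shuffle=False)

    batch = next(iter(loader))
    # each batch holds input_ids, attention_mask, token_type_ids, offset_mapping,
    # ner_labels (one id per first word piece, -100 elsewhere) and intent_labels
    print({key: value.shape for key, value in batch.items()})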