# NOTE(review): the wildcard imports are kept as-is; presumably importing
# these modules registers tokenizers/embeddings with ``configmapper`` —
# confirm before narrowing them.
from src.modules.tokenizers import *
from src.modules.embeddings import *
from src.utils.mapper import configmapper


def _build_datasets(data_config, tokenizer):
    """Build the train and validation datasets described by ``data_config``.

    The dataset class is fetched from the ``"datasets"`` registry under
    ``data_config.main.name`` and instantiated twice: first with
    ``data_config.train`` and then with ``data_config.val``, each time
    together with ``tokenizer``.

    Args:
        data_config: configuration exposing ``main.name``, ``train`` and
            ``val``.
        tokenizer: tokenizer object forwarded to the dataset constructor.

    Returns:
        tuple: ``(train_dataset, val_dataset)``.
    """
    # NOTE(review): the class is looked up once instead of once per split;
    # this assumes ``configmapper.get_object`` is a plain registry lookup.
    dataset_cls = configmapper.get_object("datasets", data_config.main.name)
    train_dataset = dataset_cls(data_config.train, tokenizer)
    val_dataset = dataset_cls(data_config.val, tokenizer)
    return train_dataset, val_dataset


class Preprocessor:
    """Base class for the preprocessors defined in this module."""

    def preprocess(self, model_config=None, data_config=None):
        """Placeholder that does nothing and returns ``None``.

        The arguments mirror the signature used by the subclasses so that an
        override is call-compatible with the base method; both are optional
        so existing zero-argument calls keep working.

        Args:
            model_config: model configuration (unused here).
            data_config: data configuration (unused here).
        """
        pass


@configmapper.map("preprocessors", "glove")
class GlovePreprocessor(Preprocessor):
    """Preprocessor that builds a tokenizer, its vectors and an embedding."""

    def __init__(self, config):
        """
        Args:
            config (src.utils.module.Config): configuration for preprocessor
        """
        super(GlovePreprocessor, self).__init__()
        self.config = config
        preprocessor_config = self.config.main.preprocessor

        # Instantiate the registered tokenizer class with its init params.
        tokenizer_cls = configmapper.get_object(
            "tokenizers", preprocessor_config.tokenizer.name
        )
        self.tokenizer = tokenizer_cls(
            **preprocessor_config.tokenizer.init_params.as_dict()
        )

        # Vectors must be initialised before the embedding is built, because
        # the embedding reads them from the tokenizer's vocabulary.
        self.tokenizer_params = (
            preprocessor_config.tokenizer.init_vector_params.as_dict()
        )
        self.tokenizer.initialize_vectors(**self.tokenizer_params)

        # The embedding receives the vocabulary vectors and the index of the
        # padding token in the vocabulary.
        vocab = self.tokenizer.text_field.vocab
        embedding_cls = configmapper.get_object(
            "embeddings", preprocessor_config.embedding.name
        )
        self.embeddings = embedding_cls(
            vocab.vectors,
            vocab.stoi[self.tokenizer.text_field.pad_token],
        )

    def preprocess(self, model_config, data_config):
        """Create the datasets and the model.

        Args:
            model_config: configuration exposing ``name`` and ``params``.
            data_config: configuration exposing ``main.name``, ``train`` and
                ``val``.

        Returns:
            tuple: ``(model, train_dataset, val_dataset)``; the model is
            constructed with ``self.embeddings`` followed by the keyword
            arguments from ``model_config.params``.
        """
        train_dataset, val_dataset = _build_datasets(data_config, self.tokenizer)
        model_cls = configmapper.get_object("models", model_config.name)
        model = model_cls(self.embeddings, **model_config.params.as_dict())
        return model, train_dataset, val_dataset


@configmapper.map("preprocessors", "clozePreprocessor")
class ClozePreprocessor(Preprocessor):
    """ClozePreprocessor."""

    def __init__(self, config):
        """
        Args:
            config (src.utils.module.Config): configuration for preprocessor
        """
        super(ClozePreprocessor, self).__init__()
        self.config = config
        tokenizer_config = self.config.main.preprocessor.tokenizer

        # The tokenizer is created through ``from_pretrained`` rather than
        # the plain constructor.
        tokenizer_cls = configmapper.get_object(
            "tokenizers", tokenizer_config.name
        )
        self.tokenizer = tokenizer_cls.from_pretrained(
            **tokenizer_config.init_params.as_dict()
        )

    def preprocess(self, model_config, data_config):
        """Create the datasets and the model.

        Args:
            model_config: configuration exposing ``name`` and ``params``.
            data_config: configuration exposing ``main.name``, ``train`` and
                ``val``.

        Returns:
            tuple: ``(model, train_dataset, val_dataset)``; the model is
            obtained via ``from_pretrained`` with the keyword arguments from
            ``model_config.params``.
        """
        train_dataset, val_dataset = _build_datasets(data_config, self.tokenizer)
        model_cls = configmapper.get_object("models", model_config.name)
        model = model_cls.from_pretrained(**model_config.params.as_dict())
        return model, train_dataset, val_dataset


@configmapper.map("preprocessors", "transformersConcretenessPreprocessor")
class TransformersConcretenessPreprocessor(Preprocessor):
    """TransformersConcretenessPreprocessor."""

    def __init__(self, config):
        """
        Args:
            config (src.utils.module.Config): configuration for preprocessor
        """
        super(TransformersConcretenessPreprocessor, self).__init__()
        self.config = config
        tokenizer_config = self.config.main.preprocessor.tokenizer

        # The tokenizer is created through ``from_pretrained`` rather than
        # the plain constructor.
        tokenizer_cls = configmapper.get_object(
            "tokenizers", tokenizer_config.name
        )
        self.tokenizer = tokenizer_cls.from_pretrained(
            **tokenizer_config.init_params.as_dict()
        )

    def preprocess(self, model_config, data_config):
        """Create the datasets and the model.

        Args:
            model_config: configuration exposing ``name`` and ``params``.
            data_config: configuration exposing ``main.name``, ``train`` and
                ``val``.

        Returns:
            tuple: ``(model, train_dataset, val_dataset)``; unlike
            ``ClozePreprocessor``, the model is built by calling the
            registered class directly with ``model_config.params``.
        """
        train_dataset, val_dataset = _build_datasets(data_config, self.tokenizer)
        model_cls = configmapper.get_object("models", model_config.name)
        model = model_cls(**model_config.params.as_dict())
        return model, train_dataset, val_dataset