Commit b4f2b16 (parent: e36c994): rename to jina bert

1 changed file: modeling_bert.py (+75, -129)

modeling_bert.py (CHANGED)
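This commit is a mechanical rename of the custom BERT implementation: every MyBert* class becomes JinaBert*, the config class becomes JinaBertConfig, a commented-out copy of the ALiBi slope helper is deleted, and two inherited doctests are dropped. For context, a minimal loading sketch is shown below; it assumes the repository's config.json maps the renamed classes via auto_map so they resolve with trust_remote_code, and the repo id is a placeholder.

```python
# Hypothetical usage sketch: the repo id is a placeholder and the auto_map wiring
# in config.json is assumed; it is not part of this diff.
from transformers import AutoModel, AutoTokenizer

repo_id = "org/jina-bert-checkpoint"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)  # resolves to JinaBertModel

inputs = tokenizer("Hello, world", return_tensors="pt")
outputs = model(**inputs)  # standard BERT-style outputs are assumed
print(outputs.last_hidden_state.shape)
```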
@@ -54,7 +54,7 @@ from transformers.utils import (
    logging,
    replace_return_docstrings,
)
-from .configuration_bert import MyBertConfig
+from .configuration_bert import JinaBertConfig

try:
    from tqdm.autonotebook import trange
@@ -66,7 +66,7 @@ except ImportError:
logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "bert-base-uncased"
-_CONFIG_FOR_DOC = "MyBertConfig"
+_CONFIG_FOR_DOC = "JinaBertConfig"

# TokenClassification docstring
_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = (
@@ -197,10 +197,10 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    return model


-class MyBertEmbeddings(nn.Module):
+class JinaBertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

-    def __init__(self, config: MyBertConfig):
+    def __init__(self, config: JinaBertConfig):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
@@ -280,7 +280,7 @@ class MyBertEmbeddings(nn.Module):
        return embeddings


-class MyBertSelfAttention(nn.Module):
+class JinaBertSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
@@ -448,7 +448,7 @@ class MyBertSelfAttention(nn.Module):
        return outputs


-class MyBertSelfOutput(nn.Module):
+class JinaBertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
@@ -464,13 +464,13 @@ class MyBertSelfOutput(nn.Module):
        return hidden_states


-class MyBertAttention(nn.Module):
+class JinaBertAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
-        self.self = MyBertSelfAttention(
+        self.self = JinaBertSelfAttention(
            config, position_embedding_type=position_embedding_type
        )
-        self.output = MyBertSelfOutput(config)
+        self.output = JinaBertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
@@ -524,7 +524,7 @@ class MyBertAttention(nn.Module):
        return outputs


-class MyBertIntermediate(nn.Module):
+class JinaBertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
@@ -539,8 +539,8 @@ class MyBertIntermediate(nn.Module):
        return hidden_states


-class MyBertOutput(nn.Module):
-    def __init__(self, config: MyBertConfig):
+class JinaBertOutput(nn.Module):
+    def __init__(self, config: JinaBertConfig):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
@@ -555,8 +555,8 @@ class MyBertOutput(nn.Module):
        return hidden_states


-class MyBertGLUMLP(nn.Module):
-    def __init__(self, config: MyBertConfig):
+class JinaBertGLUMLP(nn.Module):
+    def __init__(self, config: JinaBertConfig):
        super().__init__()
        self.config = config
        self.gated_layers = nn.Linear(
@@ -589,12 +589,12 @@ class MyBertGLUMLP(nn.Module):
        return hidden_states


-class MyBertLayer(nn.Module):
-    def __init__(self, config: MyBertConfig):
+class JinaBertLayer(nn.Module):
+    def __init__(self, config: JinaBertConfig):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
-        self.attention = MyBertAttention(config)
+        self.attention = JinaBertAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        self.feed_forward_type = config.feed_forward_type
@@ -603,14 +603,14 @@ class MyBertLayer(nn.Module):
                raise ValueError(
                    f"{self} should be used as a decoder model if cross attention is added"
                )
-            self.crossattention = MyBertAttention(
+            self.crossattention = JinaBertAttention(
                config, position_embedding_type="absolute"
            )
        if self.feed_forward_type.endswith('glu'):
-            self.mlp = MyBertGLUMLP(config)
+            self.mlp = JinaBertGLUMLP(config)
        else:
-            self.intermediate = MyBertIntermediate(config)
-            self.output = MyBertOutput(config)
+            self.intermediate = JinaBertIntermediate(config)
+            self.output = JinaBertOutput(config)

    def forward(
        self,
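JinaBertLayer keeps both feed-forward variants: when config.feed_forward_type ends with 'glu' it builds JinaBertGLUMLP, otherwise the classic intermediate/output pair. The GLU block's body is outside this hunk; the sketch below only illustrates the gated-MLP idea suggested by the `gated_layers` projection in JinaBertGLUMLP's __init__, and is not the repository's implementation.

```python
# Illustrative GEGLU-style feed-forward block; not the repo's JinaBertGLUMLP.
import torch
import torch.nn as nn


class GatedMLPSketch(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        # One fused projection produces both the "value" half and the "gate" half.
        self.gated_layers = nn.Linear(hidden_size, intermediate_size * 2, bias=False)
        self.act = nn.GELU()
        self.wo = nn.Linear(intermediate_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        value, gate = self.gated_layers(x).chunk(2, dim=-1)
        return self.wo(value * self.act(gate))
```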
@@ -699,12 +699,12 @@ class MyBertLayer(nn.Module):
        return layer_output


-class MyBertEncoder(nn.Module):
-    def __init__(self, config: MyBertConfig):
+class JinaBertEncoder(nn.Module):
+    def __init__(self, config: JinaBertConfig):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
-            [MyBertLayer(config) for _ in range(config.num_hidden_layers)]
+            [JinaBertLayer(config) for _ in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False
        self.num_attention_heads = config.num_attention_heads
@@ -724,26 +724,6 @@ class MyBertEncoder(nn.Module):
        # will be applied, it is necessary to construct the diagonal mask.
        n_heads = self.num_attention_heads

-        # Mosaics one
-        # def _get_alibi_head_slopes(n_heads: int) -> List[float]:
-        #     def get_slopes_power_of_2(n_heads: int) -> List[float]:
-        #         start = 2 ** (-(2 ** -(math.log2(n_heads) - 3)))
-        #         ratio = start
-        #         return [start * ratio**i for i in range(n_heads)]
-
-        #     # In the paper, they only train models that have 2^a heads for some a. This function
-        #     # has some good properties that only occur when the input is a power of 2. To
-        #     # maintain that even when the number of heads is not a power of 2, we use a
-        #     # workaround.
-        #     if math.log2(n_heads).is_integer():
-        #         return get_slopes_power_of_2(n_heads)
-
-        #     closest_power_of_2 = 2 ** math.floor(math.log2(n_heads))
-        #     slopes_a = get_slopes_power_of_2(closest_power_of_2)
-        #     slopes_b = _get_alibi_head_slopes(2 * closest_power_of_2)
-        #     slopes_b = slopes_b[0::2][: n_heads - closest_power_of_2]
-        #     return slopes_a + slopes_b
-
        def _get_alibi_head_slopes(n_heads: int) -> List[float]:
            def get_slopes_power_of_2(n):
                start = 2 ** (-(2 ** -(math.log2(n) - 3)))
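The second hunk above removes a commented-out duplicate of the ALiBi slope helper (labelled "Mosaics one") and keeps a single `_get_alibi_head_slopes` defined inside the encoder. For reference, a standalone version reconstructed from the deleted comment is sketched below: each head gets a geometric slope that scales a linear distance penalty on attention scores, which is what lets the model drop learned position embeddings.

```python
import math
from typing import List


def get_alibi_head_slopes(n_heads: int) -> List[float]:
    """ALiBi head slopes, reconstructed from the commented-out helper removed here."""

    def get_slopes_power_of_2(n: int) -> List[float]:
        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
        ratio = start
        return [start * ratio**i for i in range(n)]

    # Powers of two get the exact geometric sequence from the ALiBi paper.
    if math.log2(n_heads).is_integer():
        return get_slopes_power_of_2(n_heads)

    # Otherwise interleave slopes from the surrounding powers of two as a workaround.
    closest_power_of_2 = 2 ** math.floor(math.log2(n_heads))
    slopes_a = get_slopes_power_of_2(closest_power_of_2)
    slopes_b = get_alibi_head_slopes(2 * closest_power_of_2)[0::2][: n_heads - closest_power_of_2]
    return slopes_a + slopes_b


# get_alibi_head_slopes(8) -> [0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625]
```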
@@ -893,7 +873,7 @@ class MyBertEncoder(nn.Module):
        )


-class MyBertPooler(nn.Module):
+class JinaBertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
@@ -908,7 +888,7 @@ class MyBertPooler(nn.Module):
        return pooled_output


-class MyBertPredictionHeadTransform(nn.Module):
+class JinaBertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
@@ -925,10 +905,10 @@ class MyBertPredictionHeadTransform(nn.Module):
        return hidden_states


-class MyBertLMPredictionHead(nn.Module):
+class JinaBertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
-        self.transform = MyBertPredictionHeadTransform(config)
+        self.transform = JinaBertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
@@ -945,17 +925,17 @@ class MyBertLMPredictionHead(nn.Module):
        return hidden_states


-class MyBertOnlyMLMHead(nn.Module):
+class JinaBertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
-        self.predictions = MyBertLMPredictionHead(config)
+        self.predictions = JinaBertLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


-class MyBertOnlyNSPHead(nn.Module):
+class JinaBertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)
@@ -965,10 +945,10 @@ class MyBertOnlyNSPHead(nn.Module):
        return seq_relationship_score


-class MyBertPreTrainingHeads(nn.Module):
+class JinaBertPreTrainingHeads(nn.Module):
    def __init__(self, config):
        super().__init__()
-        self.predictions = MyBertLMPredictionHead(config)
+        self.predictions = JinaBertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
@@ -977,13 +957,13 @@ class MyBertPreTrainingHeads(nn.Module):
        return prediction_scores, seq_relationship_score


-class MyBertPreTrainedModel(PreTrainedModel):
+class JinaBertPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

-    config_class = MyBertConfig
+    config_class = JinaBertConfig
    load_tf_weights = load_tf_weights_in_bert
    base_model_prefix = "bert"
    supports_gradient_checkpointing = True
@@ -1005,12 +985,12 @@ class MyBertPreTrainedModel(PreTrainedModel):
            module.weight.data.fill_(1.0)

    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, MyBertEncoder):
+        if isinstance(module, JinaBertEncoder):
            module.gradient_checkpointing = value


@dataclass
-class MyBertForPreTrainingOutput(ModelOutput):
+class JinaBertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`BertForPreTraining`].
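Because config_class is now JinaBertConfig, from_pretrained on any of the classes above will parse the checkpoint's config into the renamed config class. If the classes should also be reachable from the Auto* factories in plain Python (rather than only through auto_map in config.json), they would have to be registered explicitly. A hedged sketch follows; the "jina_bert" model_type string and the local import paths are assumptions, not something this diff establishes.

```python
# Hypothetical wiring sketch. Assumes JinaBertConfig.model_type == "jina_bert" and that
# this script runs next to the repo's configuration_bert.py / modeling_bert.py.
from transformers import AutoConfig, AutoModel

from configuration_bert import JinaBertConfig
from modeling_bert import JinaBertModel

AutoConfig.register("jina_bert", JinaBertConfig)
AutoModel.register(JinaBertConfig, JinaBertModel)
```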
@@ -1113,7 +1093,7 @@ BERT_INPUTS_DOCSTRING = r"""
    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
    BERT_START_DOCSTRING,
)
-class MyBertModel(MyBertPreTrainedModel):
+class JinaBertModel(JinaBertPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
@@ -1126,7 +1106,7 @@ class MyBertModel(MyBertPreTrainedModel):
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """

-    def __init__(self, config: MyBertConfig, add_pooling_layer=True):
+    def __init__(self, config: JinaBertConfig, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
@@ -1137,17 +1117,17 @@ class MyBertModel(MyBertPreTrainedModel):

        self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)

-        self.embeddings = MyBertEmbeddings(config)
-        self.encoder = MyBertEncoder(config)
+        self.embeddings = JinaBertEmbeddings(config)
+        self.encoder = JinaBertEncoder(config)

-        self.pooler = MyBertPooler(config) if add_pooling_layer else None
+        self.pooler = JinaBertPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    @torch.inference_mode()
    def encode(
-        self: 'MyBertModel',
+        self: 'JinaBertModel',
        sentences: Union[str, List[str]],
        batch_size: int = 32,
        show_progress_bar: Optional[bool] = None,
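Beyond the rename, the visible surface of JinaBertModel in this hunk is the sentence-embedding helper: __init__ loads a tokenizer from config._name_or_path, and encode() (wrapped in @torch.inference_mode()) embeds raw sentences in batches. A usage sketch follows, using only the parameters visible in the diff; the repo id is a placeholder and the return shape (one embedding per sentence) is assumed.

```python
# Hypothetical usage sketch of JinaBertModel.encode(); the repo id is a placeholder.
from transformers import AutoModel

model = AutoModel.from_pretrained("org/jina-bert-checkpoint", trust_remote_code=True)
embeddings = model.encode(
    ["How is the weather today?", "What is the current weather like today?"],
    batch_size=32,
    show_progress_bar=False,
)
print(len(embeddings))  # one embedding per input sentence (assumed)
```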
@@ -1479,14 +1459,14 @@ class MyBertModel(MyBertPreTrainedModel):
    """,
    BERT_START_DOCSTRING,
)
-class MyBertForPreTraining(MyBertPreTrainedModel):
+class JinaBertForPreTraining(JinaBertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

-        self.bert = MyBertModel(config)
-        self.cls = MyBertPreTrainingHeads(config)
+        self.bert = JinaBertModel(config)
+        self.cls = JinaBertPreTrainingHeads(config)

        # Initialize weights and apply final processing
        self.post_init()
@@ -1501,7 +1481,7 @@ class MyBertForPreTraining(MyBertPreTrainedModel):
        BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")
    )
    @replace_return_docstrings(
-        output_type=MyBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC
+        output_type=JinaBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
@@ -1516,7 +1496,7 @@ class MyBertForPreTraining(MyBertPreTrainedModel):
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], MyBertForPreTrainingOutput]:
+    ) -> Union[Tuple[torch.Tensor], JinaBertForPreTrainingOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -1532,22 +1512,6 @@ class MyBertForPreTraining(MyBertPreTrainedModel):
            Used to hide legacy arguments that have been deprecated.

        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import AutoTokenizer, MyBertForPreTraining
-        >>> import torch
-
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-        >>> model = MyBertForPreTraining.from_pretrained("bert-base-uncased")
-
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-        >>> outputs = model(**inputs)
-
-        >>> prediction_logits = outputs.prediction_logits
-        >>> seq_relationship_logits = outputs.seq_relationship_logits
-        ```
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
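The block deleted above is the BERT doctest inherited with the original file. Under the new class name it would read roughly as sketched below; whether a checkpoint with the pre-training heads is actually published for this model is not settled by the diff, so the checkpoint id is a placeholder.

```python
# Sketch of the removed doctest, renamed; the checkpoint id is a placeholder.
from transformers import AutoTokenizer

from modeling_bert import JinaBertForPreTraining

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = JinaBertForPreTraining.from_pretrained("org/jina-bert-checkpoint")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

prediction_logits = outputs.prediction_logits
seq_relationship_logits = outputs.seq_relationship_logits
```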
@@ -1585,7 +1549,7 @@ class MyBertForPreTraining(MyBertPreTrainedModel):
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

-        return MyBertForPreTrainingOutput(
+        return JinaBertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
@@ -1595,10 +1559,10 @@ class MyBertForPreTraining(MyBertPreTrainedModel):


@add_start_docstrings(
-    """MyBert Model with a `language modeling` head on top for CLM fine-tuning.""",
+    """JinaBert Model with a `language modeling` head on top for CLM fine-tuning.""",
    BERT_START_DOCSTRING,
)
-class MyBertLMHeadModel(MyBertPreTrainedModel):
+class JinaBertLMHeadModel(JinaBertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
@@ -1606,11 +1570,11 @@ class MyBertLMHeadModel(MyBertPreTrainedModel):

        if not config.is_decoder:
            logger.warning(
-                "If you want to use `MyBertLMHeadModel` as a standalone, add `is_decoder=True.`"
+                "If you want to use `JinaBertLMHeadModel` as a standalone, add `is_decoder=True.`"
            )

-        self.bert = MyBertModel(config, add_pooling_layer=False)
-        self.cls = MyBertOnlyMLMHead(config)
+        self.bert = JinaBertModel(config, add_pooling_layer=False)
+        self.cls = JinaBertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()
@@ -1755,9 +1719,9 @@ class MyBertLMHeadModel(MyBertPreTrainedModel):


@add_start_docstrings(
-    """MyBert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING
+    """JinaBert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING
)
-class MyBertForMaskedLM(MyBertPreTrainedModel):
+class JinaBertForMaskedLM(JinaBertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
@@ -1765,12 +1729,12 @@ class MyBertForMaskedLM(MyBertPreTrainedModel):

        if config.is_decoder:
            logger.warning(
-                "If you want to use `MyBertForMaskedLM` make sure `config.is_decoder=False` for "
+                "If you want to use `JinaBertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

-        self.bert = MyBertModel(config, add_pooling_layer=False)
-        self.cls = MyBertOnlyMLMHead(config)
+        self.bert = JinaBertModel(config, add_pooling_layer=False)
+        self.cls = JinaBertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()
@@ -1880,15 +1844,15 @@ class MyBertForMaskedLM(MyBertPreTrainedModel):


@add_start_docstrings(
-    """MyBert Model with a `next sentence prediction (classification)` head on top.""",
+    """JinaBert Model with a `next sentence prediction (classification)` head on top.""",
    BERT_START_DOCSTRING,
)
-class MyBertForNextSentencePrediction(MyBertPreTrainedModel):
+class JinaBertForNextSentencePrediction(JinaBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

-        self.bert = MyBertModel(config)
-        self.cls = MyBertOnlyNSPHead(config)
+        self.bert = JinaBertModel(config)
+        self.cls = JinaBertOnlyNSPHead(config)

        # Initialize weights and apply final processing
        self.post_init()
@@ -1922,24 +1886,6 @@ class MyBertForNextSentencePrediction(MyBertPreTrainedModel):
            - 1 indicates sequence B is a random sequence.

        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import AutoTokenizer, MyBertForNextSentencePrediction
-        >>> import torch
-
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-        >>> model = MyBertForNextSentencePrediction.from_pretrained("bert-base-uncased")
-
-        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
-
-        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
-        >>> logits = outputs.logits
-        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
-        ```
        """

        if "next_sentence_label" in kwargs:
@@ -1995,18 +1941,18 @@ class MyBertForNextSentencePrediction(MyBertPreTrainedModel):

@add_start_docstrings(
    """
-    MyBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    JinaBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    BERT_START_DOCSTRING,
)
-class MyBertForSequenceClassification(MyBertPreTrainedModel):
+class JinaBertForSequenceClassification(JinaBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

-        self.bert = MyBertModel(config)
+        self.bert = JinaBertModel(config)
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
@@ -2106,16 +2052,16 @@ class MyBertForSequenceClassification(MyBertPreTrainedModel):

@add_start_docstrings(
    """
-    MyBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    JinaBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    BERT_START_DOCSTRING,
)
-class MyBertForMultipleChoice(MyBertPreTrainedModel):
+class JinaBertForMultipleChoice(JinaBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

-        self.bert = MyBertModel(config)
+        self.bert = JinaBertModel(config)
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
@@ -2222,17 +2168,17 @@ class MyBertForMultipleChoice(MyBertPreTrainedModel):

@add_start_docstrings(
    """
-    MyBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    JinaBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    BERT_START_DOCSTRING,
)
-class MyBertForTokenClassification(MyBertPreTrainedModel):
+class JinaBertForTokenClassification(JinaBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

-        self.bert = MyBertModel(config, add_pooling_layer=False)
+        self.bert = JinaBertModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
@@ -2311,17 +2257,17 @@ class MyBertForTokenClassification(MyBertPreTrainedModel):

@add_start_docstrings(
    """
-    MyBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    JinaBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    BERT_START_DOCSTRING,
)
-class MyBertForQuestionAnswering(MyBertPreTrainedModel):
+class JinaBertForQuestionAnswering(JinaBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

-        self.bert = MyBertModel(config, add_pooling_layer=False)
+        self.bert = JinaBertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
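All of the task heads above (sequence classification, multiple choice, token classification, question answering) keep the stock BERT interfaces and only swap JinaBertModel in as the backbone. A fine-tuning-style sketch follows; it assumes the repository also exposes these heads through auto_map so the task-specific Auto* classes resolve to them, and the repo id is a placeholder.

```python
# Hypothetical sketch: repo id is a placeholder and auto_map entries for the task
# heads are assumed, not shown in this diff.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo_id = "org/jina-bert-checkpoint"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(
    repo_id, num_labels=3, trust_remote_code=True
)  # would resolve to JinaBertForSequenceClassification if mapped

batch = tokenizer(["great movie", "terrible plot"], padding=True, return_tensors="pt")
logits = model(**batch).logits
print(logits.shape)  # torch.Size([2, 3])
```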