ccdv committed
Commit a316d2e · 1 Parent(s): 7a01dba
Files changed (1):
  1. modeling_lsg_camembert.py +36 -74
modeling_lsg_camembert.py CHANGED
@@ -55,7 +55,8 @@ class LSGCamembertConfig(CamembertConfig):
 
         if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
             logger.warning(
-                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], setting sparsity_type=None, computation will skip sparse attention")
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], \
+                setting sparsity_type=None, computation will skip sparse attention")
             self.sparsity_type = None
 
         if self.sparsity_type in ["stride", "block_stride"]:
@@ -71,7 +72,7 @@ class LSGCamembertConfig(CamembertConfig):
             self.num_global_tokens = 1
         elif self.num_global_tokens > 512:
             logger.warning(
-                "[WARNING CONFIG]: num_global_tokens > 512 is not compatible, setting num_global_tokens=512"
+                "[WARNING CONFIG]: num_global_tokens > 512 is not allowed, setting num_global_tokens=512"
             )
             self.num_global_tokens = 512
 
@@ -79,6 +80,16 @@ class LSGCamembertConfig(CamembertConfig):
         assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
         assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
 
+        if self.mask_first_token and not pool_with_global:
+            logger.warning(
+                "[WARNING CONFIG]: pool_with_global==False is not compatible with mask_first_token==True. Setting pool_with_global to True.")
+            self.pool_with_global = True
+
+        if hasattr(self, "position_embedding_type"):
+            if self.position_embedding_type != "absolute":
+                logger.warning(
+                    "[WARNING CONFIG]: LSG Attention is not compatible with relative positional embedding and will skip its computation. Set position_embedding_type='absolute' to remove this warning.")
+
 
 class BaseSelfAttention(nn.Module):
 
@@ -436,39 +447,13 @@ class LSGCamembertEmbeddings(RobertaEmbeddings):
         return embeddings
 
 
-class LSGCamembertSelfOutput(RobertaSelfOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGAttention(RobertaAttention):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
         self.self = LSGSelfAttention(config)
-        self.output = LSGCamembertSelfOutput(config)
-        self.pruned_heads = set()
-
-
-class LSGCamembertIntermediate(RobertaIntermediate):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGCamembertOutput(RobertaOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGCamembertPooler(RobertaPooler):
-
-    def __init__(self, config):
-        super().__init__(config)
 
 
 class LSGSelfAttention(BaseSelfAttention):
@@ -898,29 +883,21 @@ class LSGCamembertLayer(RobertaLayer):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
-        self.chunk_size_feed_forward = config.chunk_size_feed_forward
-        self.seq_len_dim = 1
         self.attention = LSGAttention(config)
-        self.is_decoder = config.is_decoder
-        self.add_cross_attention = config.add_cross_attention
         if self.add_cross_attention:
             assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
             self.crossattention = LSGAttention(config)
-        self.intermediate = LSGCamembertIntermediate(config)
-        self.output = LSGCamembertOutput(config)
 
 
 class LSGCamembertEncoder(RobertaEncoder):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
-        self.config = config
         self.layer = nn.ModuleList([LSGCamembertLayer(config) for _ in range(config.num_hidden_layers)])
-        self.gradient_checkpointing = False
 
 
 class LSGCamembertPreTrainedModel(RobertaPreTrainedModel):
@@ -945,7 +922,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
     config_class = LSGCamembertConfig
 
 
-    def __init__(self, config, add_pooling_layer=False):
+    def __init__(self, config, add_pooling_layer=True):
 
         LSGCamembertPreTrainedModel.__init__(self, config)
 
@@ -961,7 +938,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
 
         self.embeddings = LSGCamembertEmbeddings(config)
        self.encoder = LSGCamembertEncoder(config)
-        self.pooler = LSGCamembertPooler(config) if add_pooling_layer else None
+        self.pooler = RobertaPooler(config) if add_pooling_layer else None
 
         if config.add_cross_attention:
             logger.warning(
@@ -988,6 +965,12 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
         return_dict=None
     ):
 
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         inputs_ = input_ids if input_ids is not None else inputs_embeds
         n, t = inputs_.size()[:2]
 
@@ -1032,33 +1015,26 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
             return_dict=return_dict
         )
 
-        context = encoder_outputs[0]
+        sequence_output = encoder_outputs[0]
         if self.pool_with_global:
-            context[:, self.num_global_tokens] = context[:, 0]
+            sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
 
         diff = t - t_
-        n, _, d = context.size()
-        context = context[..., self.num_global_tokens:, :]
+        n, _, d = sequence_output.size()
+        sequence_output = sequence_output[..., self.num_global_tokens:, :]
 
         # Adapt sequence to initial shape
         if diff < 0:
-            context = context[:, :t]
+            sequence_output = sequence_output[:, :t]
 
-        encoder_outputs.last_hidden_state = context
-        sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
 
         if not return_dict:
             return (sequence_output, pooled_output) + encoder_outputs[1:]
-
-        return BaseModelOutputWithPoolingAndCrossAttentions(
-            last_hidden_state=sequence_output,
-            pooler_output=pooled_output,
-            past_key_values=encoder_outputs.past_key_values,
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
-            cross_attentions=encoder_outputs.cross_attentions,
-        )
+
+        encoder_outputs.last_hidden_state = sequence_output
+        encoder_outputs.pooler_output = pooled_output
+        return encoder_outputs
 
     def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
 
@@ -1092,7 +1068,7 @@ class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
         logger.warning("If you want to use `LSGCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
 
         self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
-        self.lm_head = LSGCamembertLMHead(config)
+        self.lm_head = RobertaLMHead(config)
 
         # The LM head weights require special treatment only when they are tied with the word embeddings
         self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1122,7 +1098,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
         )
 
         self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
-        self.lm_head = LSGCamembertLMHead(config)
+        self.lm_head = RobertaLMHead(config)
 
         # The LM head weights require special treatment only when they are tied with the word embeddings
         self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1131,13 +1107,6 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
         self.post_init()
 
 
-class LSGCamembertLMHead(RobertaLMHead):
-    """LSG Head for masked language modeling."""
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, RobertaForSequenceClassification):
     """
     This class overrides :class:`~transformers.CamembertForSequenceClassification`. Please check the superclass for the
@@ -1154,19 +1123,12 @@ class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, Roberta
         self.config = config
 
         self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
-        self.classifier = LSGCamembertClassificationHead(config)
+        self.classifier = RobertaClassificationHead(config)
 
         # Initialize weights and apply final processing
        self.post_init()
 
 
-class LSGCamembertClassificationHead(RobertaClassificationHead):
-    """Head for sentence-level classification tasks."""
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, RobertaForMultipleChoice):
     """
     This class overrides :class:`~transformers.CamembertForMultipleChoice`. Please check the superclass for the
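
Taken together, the commit removes the thin LSGCamembert* wrapper classes (SelfOutput, Intermediate, Output, Pooler, LMHead, ClassificationHead) and relies on the stock RoBERTa modules instead, switches the remaining custom constructors to plain super().__init__(config) calls, enables the pooling layer by default, and adds config guards for mask_first_token/pool_with_global and position_embedding_type. As a rough illustration of how the config checks visible in this diff behave, here is a minimal sketch; it assumes the updated modeling_lsg_camembert.py is importable locally and that LSGCamembertConfig accepts the keyword arguments used in the diff (sparsity_type, num_global_tokens, mask_first_token, pool_with_global). It is not part of the commit.

# Hypothetical usage, not part of the commit: exercises the config checks shown above.
from modeling_lsg_camembert import LSGCamembertConfig

config = LSGCamembertConfig(
    sparsity_type="fancy",    # not in the allowed list -> reset to None, sparse attention is skipped
    num_global_tokens=1024,   # > 512 -> clamped to 512
    mask_first_token=True,
    pool_with_global=False,   # incompatible with mask_first_token=True -> forced back to True
)

assert config.sparsity_type is None
assert config.num_global_tokens == 512
assert config.pool_with_global

In practice, checkpoints that ship this modeling file are typically loaded with AutoModel.from_pretrained(..., trust_remote_code=True), which is what pulls these custom classes in.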