merge changes
Signed-off-by: Meow <[email protected]>
- embedding.py +0 -2
- mha.py +1 -0
- modeling_xlm_roberta.py +0 -1
embedding.py CHANGED

@@ -59,7 +59,6 @@ class XLMRobertaEmbeddings(nn.Module):
             embeddings[task_indices] = task_embeddings
         else:
             embeddings = self.word_embeddings(input_ids)
-
         if self.max_position_embeddings > 0:
             if position_ids is None:
                 position_ids = create_position_ids_from_input_ids(input_ids, padding_idx=self.word_embeddings.padding_idx).to(input_ids.device)
@@ -79,5 +78,4 @@ class XLMRobertaEmbeddings(nn.Module):
             else:
                 token_type_embeddings = self.token_type_embeddings(token_type_ids)
                 embeddings = embeddings + token_type_embeddings
-
         return embeddings
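The context lines above show the adapter-aware lookup: when a task mask is present, the rows belonging to that task are overwritten with task-specific embeddings, otherwise the plain word-embedding lookup is used. Below is a minimal, self-contained sketch of that selection pattern; the `adapter_mask` tensor and the reuse of `word_embeddings` as the per-task table are assumptions for illustration, not the module's actual code.

import torch
import torch.nn as nn

# Sketch of the pattern visible in the hunk above (assumed names except
# `word_embeddings`, `task_indices`, `task_embeddings`, `embeddings`).
word_embeddings = nn.Embedding(250002, 768, padding_idx=1)
input_ids = torch.randint(2, 250002, (4, 16))      # (batch, seqlen)
adapter_mask = torch.tensor([0, 1, 1, 0])           # hypothetical per-sample task ids

embeddings = word_embeddings(input_ids)
for task_id in adapter_mask.unique():
    task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
    # A real module would use a task-specific embedding table here; reusing
    # word_embeddings just keeps the sketch runnable on its own.
    task_embeddings = word_embeddings(input_ids[task_indices])
    embeddings[task_indices] = task_embeddings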
mha.py CHANGED

@@ -643,6 +643,7 @@ class MHA(nn.Module):
             inference_params.max_sequence_len if inference_params is not None else max_seqlen
         )
         batch, seqlen = x.shape[:2]
+        lora_kwargs = {}
         if not self.cross_attn and self.num_heads_kv == self.num_heads:
             assert x_kv is None and mixer_subset is None
 
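The only functional change in this file is the new `lora_kwargs = {}` initialisation before the projection branches. A minimal sketch of the usual pattern, conditionally populating the dict and splatting it into a LoRA-capable projection call, is below; the `adapter_mask` field and the `TinyProjection` stand-in are assumptions, not this repository's API.

import torch
import torch.nn as nn

class TinyProjection(nn.Module):
    """Stand-in for a LoRA-capable projection that accepts extra keyword args."""
    def __init__(self, dim: int):
        super().__init__()
        self.linear = nn.Linear(dim, 3 * dim)

    def forward(self, x, adapter_mask=None):
        # A real LoRA projection would pick per-task low-rank updates from adapter_mask.
        return self.linear(x)

dim = 64
proj = TinyProjection(dim)
x = torch.randn(2, 8, dim)              # (batch, seqlen, dim)
adapter_mask = torch.tensor([0, 1])     # hypothetical per-sample task ids

# Pattern sketched by the hunk: start from an empty kwargs dict and only
# populate it when adapter information is available.
lora_kwargs = {}
if adapter_mask is not None:
    lora_kwargs["adapter_mask"] = adapter_mask

qkv = proj(x, **lora_kwargs)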
modeling_xlm_roberta.py CHANGED

@@ -213,7 +213,6 @@ class XLMRobertaEncoder(nn.Module):
             mixer_kwargs = {'adapter_mask': adapter_mask}
             if key_padding_mask is not None:
                 mixer_kwargs['key_padding_mask'] = key_padding_mask.bool()
-
             for layer in self.layers:
                 if self._grad_checkpointing:
                     hidden_states = torch.utils.checkpoint.checkpoint(
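The surrounding lines assemble `mixer_kwargs` (adapter mask plus an optional key-padding mask) and hand it to every encoder layer, routing through activation checkpointing when gradient checkpointing is enabled. A minimal sketch of that dispatch loop with a stub block is below; `StubBlock` and the concrete tensor shapes are assumptions for illustration, not the encoder's real layer class.

import torch
import torch.nn as nn
import torch.utils.checkpoint

class StubBlock(nn.Module):
    """Stand-in for an encoder block that accepts mixer_kwargs."""
    def __init__(self, dim: int):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, hidden_states, mixer_kwargs=None):
        # A real block would forward mixer_kwargs (adapter_mask, key_padding_mask)
        # to its attention module.
        return self.proj(hidden_states)

layers = nn.ModuleList([StubBlock(32) for _ in range(2)])
hidden_states = torch.randn(2, 8, 32, requires_grad=True)
mixer_kwargs = {"adapter_mask": torch.tensor([0, 1])}
grad_checkpointing = True

for layer in layers:
    if grad_checkpointing:
        # Non-reentrant checkpointing supports keyword arguments.
        hidden_states = torch.utils.checkpoint.checkpoint(
            layer, hidden_states, mixer_kwargs=mixer_kwargs, use_reentrant=False
        )
    else:
        hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)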