add mask_first_token
- README.md +5 -1
- config.json +1 -0
- modeling_lsg_roberta.py +5 -0
README.md CHANGED

@@ -50,13 +50,17 @@ You can change various parameters like :
 Default parameters work well in practice. If you are short on memory, reduce block sizes, increase sparsity factor and remove dropout in the attention score matrix.
 
 ```python:
+from transformers import AutoModel
+
 model = AutoModel.from_pretrained("ccdv/lsg-base-4096",
     trust_remote_code=True,
     num_global_tokens=16,
     block_size=64,
     sparse_block_size=64,
-    sparsity_factor=4,
     attention_probs_dropout_prob=0.0
+    sparsity_factor=4,
+    sparsity_type="none",
+    mask_first_token=True
 )
 ```
 
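For convenience, a runnable version of the updated README snippet is sketched below. It is not part of the commit: it adds the comma after `attention_probs_dropout_prob=0.0` that the snippet omits, and it assumes the `ccdv/lsg-base-4096` checkpoint referenced in the example is available together with its remote modeling code.

```python
# Sketch of the updated README example (not part of the commit itself).
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "ccdv/lsg-base-4096",
    trust_remote_code=True,
    num_global_tokens=16,
    block_size=64,
    sparse_block_size=64,
    sparsity_factor=4,
    sparsity_type="none",
    attention_probs_dropout_prob=0.0,  # comma added; the README snippet omits it
    mask_first_token=True,             # new option introduced by this commit
)
```

With `mask_first_token=True`, the attention mask is zeroed at position 0, so the first input token (typically `<s>`) is masked out, presumably because it is redundant with the model's global tokens.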
config.json CHANGED

@@ -28,6 +28,7 @@
   "intermediate_size": 3072,
   "layer_norm_eps": 1e-12,
   "lsh_num_pre_rounds": 1,
+  "mask_first_token": false,
   "max_position_embeddings": 4098,
   "model_type": "roberta",
   "num_attention_heads": 12,
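The new flag defaults to `false` in the shipped `config.json`, so existing behaviour is unchanged unless it is overridden at load time. A quick sanity check, sketched under the assumption that the model is loaded from the same `ccdv/lsg-base-4096` repo:

```python
# Sketch: inspect the config.json default and override it per load (not part of the commit).
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("ccdv/lsg-base-4096", trust_remote_code=True)
print(config.mask_first_token)  # expected: False, the config.json default

# Keyword arguments passed to from_pretrained override values stored in config.json.
model = AutoModel.from_pretrained(
    "ccdv/lsg-base-4096",
    trust_remote_code=True,
    mask_first_token=True,
)
print(model.config.mask_first_token)  # expected: True
```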
modeling_lsg_roberta.py CHANGED

@@ -30,6 +30,7 @@ class LSGRobertaConfig(RobertaConfig):
         base_model_prefix="lsg",
         block_size=128,
         lsh_num_pre_rounds=1,
+        mask_first_token=False,
         num_global_tokens=1,
         pool_with_global=True,
         sparse_block_size=128,
@@ -45,6 +46,7 @@ class LSGRobertaConfig(RobertaConfig):
         self.base_model_prefix = base_model_prefix
         self.block_size = block_size
         self.lsh_num_pre_rounds = lsh_num_pre_rounds
+        self.mask_first_token = mask_first_token
         self.num_global_tokens = num_global_tokens
         self.pool_with_global = pool_with_global
         self.sparse_block_size = sparse_block_size
@@ -950,6 +952,7 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
         assert hasattr(config, "block_size") and hasattr(config, "adaptive")
         self.block_size = config.block_size
         self.adaptive = config.adaptive
+        self.mask_first_token = config.mask_first_token
         self.pool_with_global = config.pool_with_global
 
         self.embeddings = LSGRobertaEmbeddings(config)
@@ -986,6 +989,8 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
 
         if attention_mask is None:
             attention_mask = torch.ones(n, t, device=inputs_.device)
+        if self.mask_first_token:
+            attention_mask[:,0] = 0
 
         b = self.block_size * 2
         pad = t % self.block_size
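To make the behavioural change concrete, the two lines added to `LSGRobertaModel` can be reproduced in isolation. The snippet below is illustrative only; `n` and `t` stand for batch size and sequence length as in the diff, and the values are made up:

```python
import torch

# Mirrors the added logic: when mask_first_token is set, position 0 of every
# sequence is zeroed in the attention mask, so the first token (typically <s>)
# is excluded from attention by the rest of the sequence.
n, t = 2, 8                        # batch size, sequence length (example values)
attention_mask = torch.ones(n, t)  # same default the model builds when no mask is given
mask_first_token = True            # corresponds to config.mask_first_token

if mask_first_token:
    attention_mask[:, 0] = 0

print(attention_mask[0])  # tensor([0., 1., 1., 1., 1., 1., 1., 1.])
```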