Commit
·
8d72d2d
1
Parent(s):
643c1b2
model improved
Browse files
- maker.py +2 -0
- pytorch_model.bin +1 -1
- tokenizer.json +13 -4
maker.py
CHANGED
@@ -100,7 +100,9 @@ class UDEmbedsDataset(object):
|
|
100 |
emb=torch.stack(m)
|
101 |
return{"inputs_embeds":emb[ids[:8192],:],"labels":[self.label2id[p] for p in upos[:8192]]}
|
102 |
from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
|
|
|
103 |
tkz=AutoTokenizer.from_pretrained(src)
|
|
|
104 |
trainDS=UDEmbedsDataset("train.conllu",tkz)
|
105 |
devDS=UDEmbedsDataset("dev.conllu",tkz)
|
106 |
testDS=UDEmbedsDataset("test.conllu",tkz)
|
|
|
100 |
emb=torch.stack(m)
|
101 |
return{"inputs_embeds":emb[ids[:8192],:],"labels":[self.label2id[p] for p in upos[:8192]]}
|
102 |
from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
|
103 |
+
from tokenizers.pre_tokenizers import Sequence,Punctuation
|
104 |
tkz=AutoTokenizer.from_pretrained(src)
|
105 |
+
tkz.backend_tokenizer.pre_tokenizer=Sequence([Punctuation(),tkz.backend_tokenizer.pre_tokenizer])
|
106 |
trainDS=UDEmbedsDataset("train.conllu",tkz)
|
107 |
devDS=UDEmbedsDataset("dev.conllu",tkz)
|
108 |
testDS=UDEmbedsDataset("test.conllu",tkz)
|
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 663202290
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:622ac1217a6e379a516f12900f773f83fd05c5a2c2cf9b87b3f848a6d7810162
|
3 |
size 663202290
|
tokenizer.json
CHANGED
@@ -1052,10 +1052,19 @@
|
|
1052 |
"type": "NFC"
|
1053 |
},
|
1054 |
"pre_tokenizer": {
|
1055 |
-
"type": "ByteLevel",
|
1056 |
-
"add_prefix_space": false,
|
1057 |
-
"trim_offsets": true,
|
1058 |
-
"use_regex": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1059 |
},
|
1060 |
"post_processor": {
|
1061 |
"type": "TemplateProcessing",
|
|
|
1052 |
"type": "NFC"
|
1053 |
},
|
1054 |
"pre_tokenizer": {
|
1055 |
+
"type": "Sequence",
|
1056 |
+
"pretokenizers": [
|
1057 |
+
{
|
1058 |
+
"type": "Punctuation",
|
1059 |
+
"behavior": "Isolated"
|
1060 |
+
},
|
1061 |
+
{
|
1062 |
+
"type": "ByteLevel",
|
1063 |
+
"add_prefix_space": false,
|
1064 |
+
"trim_offsets": true,
|
1065 |
+
"use_regex": true
|
1066 |
+
}
|
1067 |
+
]
|
1068 |
},
|
1069 |
"post_processor": {
|
1070 |
"type": "TemplateProcessing",
|