KoichiYasuoka committed on
Commit
8d72d2d
·
1 Parent(s): 643c1b2

model improved

Browse files
Files changed (3) hide show
  1. maker.py +2 -0
  2. pytorch_model.bin +1 -1
  3. tokenizer.json +13 -4
maker.py CHANGED
@@ -100,7 +100,9 @@ class UDEmbedsDataset(object):
100
  emb=torch.stack(m)
101
  return{"inputs_embeds":emb[ids[:8192],:],"labels":[self.label2id[p] for p in upos[:8192]]}
102
  from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
 
103
  tkz=AutoTokenizer.from_pretrained(src)
 
104
  trainDS=UDEmbedsDataset("train.conllu",tkz)
105
  devDS=UDEmbedsDataset("dev.conllu",tkz)
106
  testDS=UDEmbedsDataset("test.conllu",tkz)
 
100
  emb=torch.stack(m)
101
  return{"inputs_embeds":emb[ids[:8192],:],"labels":[self.label2id[p] for p in upos[:8192]]}
102
  from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
103
+ from tokenizers.pre_tokenizers import Sequence,Punctuation
104
  tkz=AutoTokenizer.from_pretrained(src)
105
+ tkz.backend_tokenizer.pre_tokenizer=Sequence([Punctuation(),tkz.backend_tokenizer.pre_tokenizer])
106
  trainDS=UDEmbedsDataset("train.conllu",tkz)
107
  devDS=UDEmbedsDataset("dev.conllu",tkz)
108
  testDS=UDEmbedsDataset("test.conllu",tkz)
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:830c264f28fba9953e623b9f3dd35e58d30bd6469e297dfaf1d2220e037e9ce4
3
  size 663202290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:622ac1217a6e379a516f12900f773f83fd05c5a2c2cf9b87b3f848a6d7810162
3
  size 663202290
tokenizer.json CHANGED
@@ -1052,10 +1052,19 @@
1052
  "type": "NFC"
1053
  },
1054
  "pre_tokenizer": {
1055
- "type": "ByteLevel",
1056
- "add_prefix_space": false,
1057
- "trim_offsets": true,
1058
- "use_regex": true
 
 
 
 
 
 
 
 
 
1059
  },
1060
  "post_processor": {
1061
  "type": "TemplateProcessing",
 
1052
  "type": "NFC"
1053
  },
1054
  "pre_tokenizer": {
1055
+ "type": "Sequence",
1056
+ "pretokenizers": [
1057
+ {
1058
+ "type": "Punctuation",
1059
+ "behavior": "Isolated"
1060
+ },
1061
+ {
1062
+ "type": "ByteLevel",
1063
+ "add_prefix_space": false,
1064
+ "trim_offsets": true,
1065
+ "use_regex": true
1066
+ }
1067
+ ]
1068
  },
1069
  "post_processor": {
1070
  "type": "TemplateProcessing",