lfoppiano committed (verified)
Commit ca0c310 · 1 Parent(s): 110d75b

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.

Files changed (50); a layout-check sketch follows the file list below.
  1. .gitattributes +4 -0
  2. README.md +16 -0
  3. context_bert/config.json +20 -0
  4. context_bert/model_weights.hdf5 +3 -0
  5. context_bert/transformer-config.json +25 -0
  6. context_bert/transformer-tokenizer/special_tokens_map.json +1 -0
  7. context_bert/transformer-tokenizer/tokenizer.json +0 -0
  8. context_bert/transformer-tokenizer/tokenizer_config.json +1 -0
  9. context_bert/transformer-tokenizer/vocab.txt +0 -0
  10. context_creation_bert/config.json +19 -0
  11. context_creation_bert/transformer-config.json +25 -0
  12. context_creation_bert/transformer-tokenizer/special_tokens_map.json +1 -0
  13. context_creation_bert/transformer-tokenizer/tokenizer.json +0 -0
  14. context_creation_bert/transformer-tokenizer/tokenizer_config.json +1 -0
  15. context_creation_bert/transformer-tokenizer/vocab.txt +0 -0
  16. context_shared_bert/config.json +19 -0
  17. context_shared_bert/transformer-config.json +25 -0
  18. context_shared_bert/transformer-tokenizer/special_tokens_map.json +1 -0
  19. context_shared_bert/transformer-tokenizer/tokenizer.json +0 -0
  20. context_shared_bert/transformer-tokenizer/tokenizer_config.json +1 -0
  21. context_shared_bert/transformer-tokenizer/vocab.txt +0 -0
  22. context_used_bert/config.json +19 -0
  23. context_used_bert/model_weights.hdf5 +3 -0
  24. context_used_bert/transformer-config.json +25 -0
  25. context_used_bert/transformer-tokenizer/special_tokens_map.json +1 -0
  26. context_used_bert/transformer-tokenizer/tokenizer.json +0 -0
  27. context_used_bert/transformer-tokenizer/tokenizer_config.json +1 -0
  28. context_used_bert/transformer-tokenizer/vocab.txt +0 -0
  29. software-BERT/config.json +38 -0
  30. software-BERT/model_weights.hdf5 +3 -0
  31. software-BERT/preprocessor.json +596 -0
  32. software-BERT/transformer-config.json +21 -0
  33. software-BERT/transformer-tokenizer/special_tokens_map.json +7 -0
  34. software-BERT/transformer-tokenizer/tokenizer.json +0 -0
  35. software-BERT/transformer-tokenizer/tokenizer_config.json +17 -0
  36. software-BERT/transformer-tokenizer/vocab.txt +0 -0
  37. software-BERT_CRF/config.json +38 -0
  38. software-BERT_CRF/model_weights.hdf5 +3 -0
  39. software-BERT_CRF/preprocessor.json +592 -0
  40. software-BERT_CRF/transformer-config.json +21 -0
  41. software-BERT_CRF/transformer-tokenizer/special_tokens_map.json +7 -0
  42. software-BERT_CRF/transformer-tokenizer/tokenizer.json +0 -0
  43. software-BERT_CRF/transformer-tokenizer/tokenizer_config.json +17 -0
  44. software-BERT_CRF/transformer-tokenizer/vocab.txt +0 -0
  45. software-type-BERT_CRF/config.json +38 -0
  46. software-type-BERT_CRF/preprocessor.json +343 -0
  47. software-type-BERT_CRF/transformer-config.json +25 -0
  48. software-type-BERT_CRF/transformer-tokenizer/special_tokens_map.json +7 -0
  49. software-type-BERT_CRF/transformer-tokenizer/tokenizer.json +0 -0
  50. software-type-BERT_CRF/transformer-tokenizer/tokenizer_config.json +15 -0
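Each model directory in this commit follows the same layout: a DeLFT-style `config.json`, a `transformer-config.json`, a `transformer-tokenizer/` folder (special tokens map, tokenizer config, vocabulary), and for some models a Git LFS-tracked `model_weights.hdf5`. The sketch below is only a hedged sanity check built from the file names listed above; the clone path is an assumption.

```python
# Hedged sanity-check sketch: confirm a local clone of this commit contains the
# per-model files listed above. Directory and file names are taken from the
# "Files changed" list; the clone path is an assumption.
from pathlib import Path

MODELS = [
    "context_bert", "context_creation_bert", "context_shared_bert", "context_used_bert",
    "software-BERT", "software-BERT_CRF", "software-type-BERT_CRF",
]
EXPECTED = [
    "config.json",
    "transformer-config.json",
    "transformer-tokenizer/special_tokens_map.json",
    "transformer-tokenizer/tokenizer_config.json",
]

repo_root = Path(".")  # path to the local clone (assumption)
for model in MODELS:
    missing = [f for f in EXPECTED if not (repo_root / model / f).exists()]
    print(f"{model}: {'ok' if not missing else 'missing ' + ', '.join(missing)}")
```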
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ software-BERT_CRF/model_weights.hdf5 filter=lfs diff=lfs merge=lfs -text
+ context_bert/model_weights.hdf5 filter=lfs diff=lfs merge=lfs -text
+ software-BERT/model_weights.hdf5 filter=lfs diff=lfs merge=lfs -text
+ context_used_bert/model_weights.hdf5 filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ license: apache-2.0
+ ---
+
+ # Softcite models developed in the SoFAIR EU Project
+ The goal of this GROBID module is to recognize software mentions in scholarly textual documents, both publisher XML and PDF.
+ It uses as training data the Softcite Dataset, developed by the James Howison Lab at the University of Texas at Austin.
+ This annotated corpus and the present software text-mining component were developed with the support of a grant from the Alfred P. Sloan Foundation to improve credit for research software.
+
+ GitHub: https://github.com/softcite/software-mentions
+
+ Original author: Patrice Lopez
+ Current authors: SoFAIR Project
+
+
+ These models have been migrated from AWS S3.
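To pull one of these model folders locally and inspect its configuration, a minimal sketch is shown below. It assumes `huggingface_hub` is installed; the repository ID is a placeholder (the actual Hub repo name is not stated in this commit view), and in normal use these folders are consumed by the GROBID software-mentions module rather than loaded by hand.

```python
# Minimal download-and-inspect sketch. REPO_ID is a placeholder, not confirmed
# by this commit view; substitute the actual Hugging Face repository name.
import json
from pathlib import Path

from huggingface_hub import snapshot_download  # pip install huggingface_hub

REPO_ID = "softcite/software-mentions-models"  # placeholder (assumption)

local_dir = snapshot_download(repo_id=REPO_ID, allow_patterns=["software-BERT_CRF/*"])
config = json.loads((Path(local_dir) / "software-BERT_CRF" / "config.json").read_text())

# The sequence-labelling models use BIO labels over creator, software, url, version.
print(config["architecture"])    # e.g. "BERT_CRF"
print(sorted(config["labels"]))  # <PAD>, B-/I-<creator|software|url|version>, O
```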
context_bert/config.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "model_name": "context_bert",
+ "architecture": "bert",
+ "embeddings_name": null,
+ "char_embedding_size": 25,
+ "word_embedding_size": 0,
+ "dropout": 0.5,
+ "recurrent_dropout": 0.25,
+ "maxlen": 100,
+ "dense_size": 32,
+ "use_char_feature": false,
+ "list_classes": [
+ "used",
+ "creation",
+ "shared"
+ ],
+ "fold_number": 1,
+ "batch_size": 32,
+ "transformer_name": "michiyasunaga/LinkBERT-base"
+ }
context_bert/model_weights.hdf5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e0ea2189fa380300c38d697ee7edf59881177ec8fa844275c7718152b66be449
+ size 433524128
context_bert/transformer-config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "michiyasunaga/LinkBERT-base",
+ "architectures": [
+ "BertModel"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "transformers_version": "4.15.0",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 28996
+ }
context_bert/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
context_bert/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
context_bert/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "michiyasunaga/LinkBERT-base", "add_special_tokens": true, "max_length": 100, "add_prefix_space": true, "tokenizer_class": "BertTokenizer"}
context_bert/transformer-tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_creation_bert/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "model_name": "context_creation_bert",
+ "architecture": "bert",
+ "embeddings_name": null,
+ "char_embedding_size": 25,
+ "word_embedding_size": 0,
+ "dropout": 0.5,
+ "recurrent_dropout": 0.25,
+ "maxlen": 100,
+ "dense_size": 32,
+ "use_char_feature": false,
+ "list_classes": [
+ "creation",
+ "not_creation"
+ ],
+ "fold_number": 1,
+ "batch_size": 32,
+ "transformer_name": "michiyasunaga/LinkBERT-base"
+ }
context_creation_bert/transformer-config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "michiyasunaga/LinkBERT-base",
+ "architectures": [
+ "BertModel"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "transformers_version": "4.15.0",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 28996
+ }
context_creation_bert/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
context_creation_bert/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
context_creation_bert/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "michiyasunaga/LinkBERT-base", "add_special_tokens": true, "max_length": 100, "add_prefix_space": true, "tokenizer_class": "BertTokenizer"}
context_creation_bert/transformer-tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_shared_bert/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "model_name": "context_shared_bert",
+ "architecture": "bert",
+ "embeddings_name": null,
+ "char_embedding_size": 25,
+ "word_embedding_size": 0,
+ "dropout": 0.5,
+ "recurrent_dropout": 0.25,
+ "maxlen": 100,
+ "dense_size": 32,
+ "use_char_feature": false,
+ "list_classes": [
+ "shared",
+ "not_shared"
+ ],
+ "fold_number": 1,
+ "batch_size": 32,
+ "transformer_name": "michiyasunaga/LinkBERT-base"
+ }
context_shared_bert/transformer-config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "michiyasunaga/LinkBERT-base",
+ "architectures": [
+ "BertModel"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "transformers_version": "4.15.0",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 28996
+ }
context_shared_bert/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
context_shared_bert/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
context_shared_bert/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "michiyasunaga/LinkBERT-base", "add_special_tokens": true, "max_length": 100, "add_prefix_space": true, "tokenizer_class": "BertTokenizer"}
context_shared_bert/transformer-tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_used_bert/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "model_name": "context_used_bert",
+ "architecture": "bert",
+ "embeddings_name": null,
+ "char_embedding_size": 25,
+ "word_embedding_size": 0,
+ "dropout": 0.5,
+ "recurrent_dropout": 0.25,
+ "maxlen": 100,
+ "dense_size": 32,
+ "use_char_feature": false,
+ "list_classes": [
+ "used",
+ "not_used"
+ ],
+ "fold_number": 1,
+ "batch_size": 32,
+ "transformer_name": "michiyasunaga/LinkBERT-base"
+ }
context_used_bert/model_weights.hdf5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95ea068e2ab8ed28b22c984e32ac8ad0f8b6105c100d00ef9cad8b4dffb1be75
+ size 433521056
context_used_bert/transformer-config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "michiyasunaga/LinkBERT-base",
+ "architectures": [
+ "BertModel"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "transformers_version": "4.15.0",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 28996
+ }
context_used_bert/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
context_used_bert/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
context_used_bert/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "michiyasunaga/LinkBERT-base", "add_special_tokens": true, "max_length": 100, "add_prefix_space": true, "tokenizer_class": "BertTokenizer"}
context_used_bert/transformer-tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
software-BERT/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_name": "software-BERT",
+ "architecture": "BERT",
+ "embeddings_name": null,
+ "char_vocab_size": 549,
+ "case_vocab_size": 8,
+ "char_embedding_size": 25,
+ "num_char_lstm_units": 25,
+ "max_char_length": 30,
+ "features_vocabulary_size": 12,
+ "features_indices": null,
+ "features_embedding_size": 4,
+ "features_lstm_units": 4,
+ "max_sequence_length": 512,
+ "word_embedding_size": 0,
+ "num_word_lstm_units": 100,
+ "case_embedding_size": 5,
+ "dropout": 0.5,
+ "recurrent_dropout": 0.5,
+ "use_crf": false,
+ "use_chain_crf": false,
+ "fold_number": 1,
+ "batch_size": 8,
+ "transformer_name": "allenai/scibert_scivocab_cased/dir",
+ "use_ELMo": false,
+ "labels": {
+ "<PAD>": 0,
+ "B-<creator>": 1,
+ "B-<software>": 2,
+ "B-<url>": 3,
+ "B-<version>": 4,
+ "I-<creator>": 5,
+ "I-<software>": 6,
+ "I-<url>": 7,
+ "I-<version>": 8,
+ "O": 9
+ }
+ }
software-BERT/model_weights.hdf5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14c716b7ab2d945de3a838e009a5ebb4303292ec454d5c078e9dfb441a987ae8
+ size 440060072
software-BERT/preprocessor.json ADDED
@@ -0,0 +1,596 @@
1
+ {
2
+ "padding": true,
3
+ "return_lengths": false,
4
+ "return_word_embeddings": false,
5
+ "return_casing": false,
6
+ "return_features": false,
7
+ "return_chars": false,
8
+ "return_bert_embeddings": true,
9
+ "vocab_char": {
10
+ "<PAD>": 0,
11
+ "<UNK>": 1,
12
+ "!": 2,
13
+ "\"": 3,
14
+ "#": 4,
15
+ "$": 5,
16
+ "%": 6,
17
+ "&": 7,
18
+ "'": 8,
19
+ "(": 9,
20
+ ")": 10,
21
+ "*": 11,
22
+ "+": 12,
23
+ ",": 13,
24
+ "-": 14,
25
+ ".": 15,
26
+ "/": 16,
27
+ "0": 17,
28
+ "1": 18,
29
+ "2": 19,
30
+ "3": 20,
31
+ "4": 21,
32
+ "5": 22,
33
+ "6": 23,
34
+ "7": 24,
35
+ "8": 25,
36
+ "9": 26,
37
+ ":": 27,
38
+ ";": 28,
39
+ "<": 29,
40
+ "=": 30,
41
+ ">": 31,
42
+ "?": 32,
43
+ "@": 33,
44
+ "A": 34,
45
+ "B": 35,
46
+ "C": 36,
47
+ "D": 37,
48
+ "E": 38,
49
+ "F": 39,
50
+ "G": 40,
51
+ "H": 41,
52
+ "I": 42,
53
+ "J": 43,
54
+ "K": 44,
55
+ "L": 45,
56
+ "M": 46,
57
+ "N": 47,
58
+ "O": 48,
59
+ "P": 49,
60
+ "Q": 50,
61
+ "R": 51,
62
+ "S": 52,
63
+ "T": 53,
64
+ "U": 54,
65
+ "V": 55,
66
+ "W": 56,
67
+ "X": 57,
68
+ "Y": 58,
69
+ "Z": 59,
70
+ "[": 60,
71
+ "\\": 61,
72
+ "]": 62,
73
+ "^": 63,
74
+ "_": 64,
75
+ "`": 65,
76
+ "a": 66,
77
+ "b": 67,
78
+ "c": 68,
79
+ "d": 69,
80
+ "e": 70,
81
+ "f": 71,
82
+ "g": 72,
83
+ "h": 73,
84
+ "i": 74,
85
+ "j": 75,
86
+ "k": 76,
87
+ "l": 77,
88
+ "m": 78,
89
+ "n": 79,
90
+ "o": 80,
91
+ "p": 81,
92
+ "q": 82,
93
+ "r": 83,
94
+ "s": 84,
95
+ "t": 85,
96
+ "u": 86,
97
+ "v": 87,
98
+ "w": 88,
99
+ "x": 89,
100
+ "y": 90,
101
+ "z": 91,
102
+ "{": 92,
103
+ "|": 93,
104
+ "}": 94,
105
+ "~": 95,
106
+ "\u00a1": 96,
107
+ "\u00a2": 97,
108
+ "\u00a3": 98,
109
+ "\u00a5": 99,
110
+ "\u00a7": 100,
111
+ "\u00a8": 101,
112
+ "\u00a9": 102,
113
+ "\u00ab": 103,
114
+ "\u00ac": 104,
115
+ "\u00ae": 105,
116
+ "\u00af": 106,
117
+ "\u00b0": 107,
118
+ "\u00b1": 108,
119
+ "\u00b2": 109,
120
+ "\u00b3": 110,
121
+ "\u00b4": 111,
122
+ "\u00b5": 112,
123
+ "\u00b6": 113,
124
+ "\u00b7": 114,
125
+ "\u00b8": 115,
126
+ "\u00b9": 116,
127
+ "\u00ba": 117,
128
+ "\u00bb": 118,
129
+ "\u00bc": 119,
130
+ "\u00bd": 120,
131
+ "\u00be": 121,
132
+ "\u00bf": 122,
133
+ "\u00c0": 123,
134
+ "\u00c1": 124,
135
+ "\u00c2": 125,
136
+ "\u00c3": 126,
137
+ "\u00c4": 127,
138
+ "\u00c5": 128,
139
+ "\u00c8": 129,
140
+ "\u00c9": 130,
141
+ "\u00ca": 131,
142
+ "\u00cd": 132,
143
+ "\u00d0": 133,
144
+ "\u00d2": 134,
145
+ "\u00d3": 135,
146
+ "\u00d4": 136,
147
+ "\u00d5": 137,
148
+ "\u00d6": 138,
149
+ "\u00d7": 139,
150
+ "\u00d8": 140,
151
+ "\u00d9": 141,
152
+ "\u00da": 142,
153
+ "\u00dc": 143,
154
+ "\u00de": 144,
155
+ "\u00df": 145,
156
+ "\u00e0": 146,
157
+ "\u00e1": 147,
158
+ "\u00e2": 148,
159
+ "\u00e3": 149,
160
+ "\u00e4": 150,
161
+ "\u00e5": 151,
162
+ "\u00e7": 152,
163
+ "\u00e8": 153,
164
+ "\u00e9": 154,
165
+ "\u00ea": 155,
166
+ "\u00eb": 156,
167
+ "\u00ec": 157,
168
+ "\u00ed": 158,
169
+ "\u00ee": 159,
170
+ "\u00ef": 160,
171
+ "\u00f0": 161,
172
+ "\u00f1": 162,
173
+ "\u00f2": 163,
174
+ "\u00f3": 164,
175
+ "\u00f4": 165,
176
+ "\u00f5": 166,
177
+ "\u00f6": 167,
178
+ "\u00f7": 168,
179
+ "\u00f8": 169,
180
+ "\u00f9": 170,
181
+ "\u00fa": 171,
182
+ "\u00fb": 172,
183
+ "\u00fc": 173,
184
+ "\u00fd": 174,
185
+ "\u00fe": 175,
186
+ "\u0100": 176,
187
+ "\u0101": 177,
188
+ "\u0103": 178,
189
+ "\u0105": 179,
190
+ "\u0106": 180,
191
+ "\u0107": 181,
192
+ "\u0108": 182,
193
+ "\u0109": 183,
194
+ "\u010c": 184,
195
+ "\u010d": 185,
196
+ "\u0113": 186,
197
+ "\u0117": 187,
198
+ "\u0118": 188,
199
+ "\u011b": 189,
200
+ "\u012b": 190,
201
+ "\u0130": 191,
202
+ "\u0131": 192,
203
+ "\u013e": 193,
204
+ "\u0142": 194,
205
+ "\u0144": 195,
206
+ "\u0148": 196,
207
+ "\u014d": 197,
208
+ "\u0151": 198,
209
+ "\u0159": 199,
210
+ "\u015a": 200,
211
+ "\u015c": 201,
212
+ "\u015d": 202,
213
+ "\u015e": 203,
214
+ "\u015f": 204,
215
+ "\u0160": 205,
216
+ "\u0161": 206,
217
+ "\u0168": 207,
218
+ "\u0169": 208,
219
+ "\u016b": 209,
220
+ "\u016f": 210,
221
+ "\u0175": 211,
222
+ "\u0176": 212,
223
+ "\u0177": 213,
224
+ "\u017d": 214,
225
+ "\u017e": 215,
226
+ "\u0192": 216,
227
+ "\u01eb": 217,
228
+ "\u021b": 218,
229
+ "\u0251": 219,
230
+ "\u025b": 220,
231
+ "\u0263": 221,
232
+ "\u02a6": 222,
233
+ "\u02b9": 223,
234
+ "\u02bc": 224,
235
+ "\u02c2": 225,
236
+ "\u02c3": 226,
237
+ "\u02c6": 227,
238
+ "\u02c7": 228,
239
+ "\u02c9": 229,
240
+ "\u02d8": 230,
241
+ "\u02d9": 231,
242
+ "\u02da": 232,
243
+ "\u02db": 233,
244
+ "\u02dc": 234,
245
+ "\u0303": 235,
246
+ "\u030a": 236,
247
+ "\u0313": 237,
248
+ "\u0314": 238,
249
+ "\u031d": 239,
250
+ "\u034c": 240,
251
+ "\u0350": 241,
252
+ "\u0351": 242,
253
+ "\u0352": 243,
254
+ "\u0354": 244,
255
+ "\u0357": 245,
256
+ "\u0358": 246,
257
+ "\u0371": 247,
258
+ "\u0374": 248,
259
+ "\u0392": 249,
260
+ "\u0393": 250,
261
+ "\u0394": 251,
262
+ "\u0397": 252,
263
+ "\u0398": 253,
264
+ "\u039b": 254,
265
+ "\u03a0": 255,
266
+ "\u03a3": 256,
267
+ "\u03a4": 257,
268
+ "\u03a5": 258,
269
+ "\u03a6": 259,
270
+ "\u03a7": 260,
271
+ "\u03a8": 261,
272
+ "\u03a9": 262,
273
+ "\u03b1": 263,
274
+ "\u03b2": 264,
275
+ "\u03b3": 265,
276
+ "\u03b4": 266,
277
+ "\u03b5": 267,
278
+ "\u03b6": 268,
279
+ "\u03b7": 269,
280
+ "\u03b8": 270,
281
+ "\u03b9": 271,
282
+ "\u03ba": 272,
283
+ "\u03bb": 273,
284
+ "\u03bc": 274,
285
+ "\u03bd": 275,
286
+ "\u03be": 276,
287
+ "\u03c0": 277,
288
+ "\u03c1": 278,
289
+ "\u03c3": 279,
290
+ "\u03c4": 280,
291
+ "\u03c5": 281,
292
+ "\u03c6": 282,
293
+ "\u03c7": 283,
294
+ "\u03c8": 284,
295
+ "\u03c9": 285,
296
+ "\u03d1": 286,
297
+ "\u03d2": 287,
298
+ "\u03d5": 288,
299
+ "\u03e9": 289,
300
+ "\u03ea": 290,
301
+ "\u03eb": 291,
302
+ "\u03ed": 292,
303
+ "\u03ee": 293,
304
+ "\u03f1": 294,
305
+ "\u03f3": 295,
306
+ "\u03f5": 296,
307
+ "\u03fd": 297,
308
+ "\u03fe": 298,
309
+ "\u0408": 299,
310
+ "\u0412": 300,
311
+ "\u0413": 301,
312
+ "\u041a": 302,
313
+ "\u041d": 303,
314
+ "\u0424": 304,
315
+ "\u0430": 305,
316
+ "\u0545": 306,
317
+ "\u0546": 307,
318
+ "\u0609": 308,
319
+ "\u060a": 309,
320
+ "\u060c": 310,
321
+ "\u060d": 311,
322
+ "\u060e": 312,
323
+ "\u0621": 313,
324
+ "\u0623": 314,
325
+ "\u0626": 315,
326
+ "\u0627": 316,
327
+ "\u0628": 317,
328
+ "\u0629": 318,
329
+ "\u062a": 319,
330
+ "\u062b": 320,
331
+ "\u062d": 321,
332
+ "\u062e": 322,
333
+ "\u062f": 323,
334
+ "\u0631": 324,
335
+ "\u0633": 325,
336
+ "\u0634": 326,
337
+ "\u0637": 327,
338
+ "\u0639": 328,
339
+ "\u0641": 329,
340
+ "\u0642": 330,
341
+ "\u0643": 331,
342
+ "\u0644": 332,
343
+ "\u0645": 333,
344
+ "\u0646": 334,
345
+ "\u0648": 335,
346
+ "\u0649": 336,
347
+ "\u064a": 337,
348
+ "\u064b": 338,
349
+ "\u1e54": 339,
350
+ "\u1e63": 340,
351
+ "\u1e7c": 341,
352
+ "\u1e83": 342,
353
+ "\u1e91": 343,
354
+ "\u1ef9": 344,
355
+ "\u1fb1": 345,
356
+ "\u1fbd": 346,
357
+ "\u2016": 347,
358
+ "\u2020": 348,
359
+ "\u2021": 349,
360
+ "\u2022": 350,
361
+ "\u2026": 351,
362
+ "\u202b": 352,
363
+ "\u202c": 353,
364
+ "\u2030": 354,
365
+ "\u2032": 355,
366
+ "\u2033": 356,
367
+ "\u2034": 357,
368
+ "\u203e": 358,
369
+ "\u2044": 359,
370
+ "\u2074": 360,
371
+ "\u2075": 361,
372
+ "\u2081": 362,
373
+ "\u20a9": 363,
374
+ "\u20ac": 364,
375
+ "\u2103": 365,
376
+ "\u2119": 366,
377
+ "\u211c": 367,
378
+ "\u211d": 368,
379
+ "\u2122": 369,
380
+ "\u2126": 370,
381
+ "\u2150": 371,
382
+ "\u2190": 372,
383
+ "\u2191": 373,
384
+ "\u2192": 374,
385
+ "\u2193": 375,
386
+ "\u21a6": 376,
387
+ "\u21c4": 377,
388
+ "\u21d2": 378,
389
+ "\u21d4": 379,
390
+ "\u2200": 380,
391
+ "\u2202": 381,
392
+ "\u2205": 382,
393
+ "\u2206": 383,
394
+ "\u2208": 384,
395
+ "\u220e": 385,
396
+ "\u2211": 386,
397
+ "\u2212": 387,
398
+ "\u2213": 388,
399
+ "\u2215": 389,
400
+ "\u221a": 390,
401
+ "\u221d": 391,
402
+ "\u221e": 392,
403
+ "\u2229": 393,
404
+ "\u222a": 394,
405
+ "\u222b": 395,
406
+ "\u223c": 396,
407
+ "\u2243": 397,
408
+ "\u2248": 398,
409
+ "\u2260": 399,
410
+ "\u2261": 400,
411
+ "\u2264": 401,
412
+ "\u2265": 402,
413
+ "\u226b": 403,
414
+ "\u227a": 404,
415
+ "\u2282": 405,
416
+ "\u2286": 406,
417
+ "\u2295": 407,
418
+ "\u22a5": 408,
419
+ "\u22c5": 409,
420
+ "\u22ef": 410,
421
+ "\u2329": 411,
422
+ "\u232a": 412,
423
+ "\u232c": 413,
424
+ "\u2338": 414,
425
+ "\u233a": 415,
426
+ "\u233d": 416,
427
+ "\u239b": 417,
428
+ "\u239d": 418,
429
+ "\u239e": 419,
430
+ "\u23a0": 420,
431
+ "\u2423": 421,
432
+ "\u2424": 422,
433
+ "\u2425": 423,
434
+ "\u2426": 424,
435
+ "\u2440": 425,
436
+ "\u24c7": 426,
437
+ "\u2500": 427,
438
+ "\u2502": 428,
439
+ "\u25a0": 429,
440
+ "\u25a1": 430,
441
+ "\u25aa": 431,
442
+ "\u25ab": 432,
443
+ "\u25b2": 433,
444
+ "\u25b3": 434,
445
+ "\u25b5": 435,
446
+ "\u25b6": 436,
447
+ "\u25b8": 437,
448
+ "\u25c6": 438,
449
+ "\u25c7": 439,
450
+ "\u2605": 440,
451
+ "\u2610": 441,
452
+ "\u2713": 442,
453
+ "\u274f": 443,
454
+ "\u29cb": 444,
455
+ "\u2a7d": 445,
456
+ "\u2a7e": 446,
457
+ "\u3008": 447,
458
+ "\u3009": 448,
459
+ "\ue023": 449,
460
+ "\ue024": 450,
461
+ "\ue02c": 451,
462
+ "\ue02e": 452,
463
+ "\ue032": 453,
464
+ "\ue039": 454,
465
+ "\ue044": 455,
466
+ "\ue061": 456,
467
+ "\ue062": 457,
468
+ "\ue067": 458,
469
+ "\ue06b": 459,
470
+ "\ue06c": 460,
471
+ "\ue06d": 461,
472
+ "\ue06e": 462,
473
+ "\ue073": 463,
474
+ "\ue074": 464,
475
+ "\ue07a": 465,
476
+ "\ue093": 466,
477
+ "\ue09d": 467,
478
+ "\ue103": 468,
479
+ "\ue104": 469,
480
+ "\uf03c": 470,
481
+ "\uf043": 471,
482
+ "\uf061": 472,
483
+ "\uf062": 473,
484
+ "\uf063": 474,
485
+ "\uf065": 475,
486
+ "\uf067": 476,
487
+ "\uf06b": 477,
488
+ "\uf06c": 478,
489
+ "\uf06d": 479,
490
+ "\uf070": 480,
491
+ "\uf073": 481,
492
+ "\uf074": 482,
493
+ "\uf077": 483,
494
+ "\uf0a2": 484,
495
+ "\uf0a3": 485,
496
+ "\uf0b0": 486,
497
+ "\uf0b1": 487,
498
+ "\uf0b4": 488,
499
+ "\uf0b7": 489,
500
+ "\uf0b9": 490,
501
+ "\uf0bb": 491,
502
+ "\uf0e2": 492,
503
+ "\uf0e4": 493,
504
+ "\uf0fc": 494,
505
+ "\uf643": 495,
506
+ "\uf644": 496,
507
+ "\uf645": 497,
508
+ "\uf646": 498,
509
+ "\uf647": 499,
510
+ "\uf648": 500,
511
+ "\uf649": 501,
512
+ "\uf64a": 502,
513
+ "\uf64b": 503,
514
+ "\uf64c": 504,
515
+ "\uf6f6": 505,
516
+ "\uf76a": 506,
517
+ "\uf76d": 507,
518
+ "\uf775": 508,
519
+ "\uf777": 509,
520
+ "\uf8e8": 510,
521
+ "\uff0b": 511,
522
+ "\uff0c": 512,
523
+ "\uff1a": 513,
524
+ "\uff1c": 514,
525
+ "\uff1e": 515,
526
+ "\ufffd": 516,
527
+ "\ud835\udc34": 517,
528
+ "\ud835\udc36": 518,
529
+ "\ud835\udc37": 519,
530
+ "\ud835\udc39": 520,
531
+ "\ud835\udc3a": 521,
532
+ "\ud835\udc3b": 522,
533
+ "\ud835\udc3e": 523,
534
+ "\ud835\udc3f": 524,
535
+ "\ud835\udc42": 525,
536
+ "\ud835\udc43": 526,
537
+ "\ud835\udc51": 527,
538
+ "\ud835\udc53": 528,
539
+ "\ud835\udc54": 529,
540
+ "\ud835\udc56": 530,
541
+ "\ud835\udc57": 531,
542
+ "\ud835\udc58": 532,
543
+ "\ud835\udc5e": 533,
544
+ "\ud835\udc5f": 534,
545
+ "\ud835\udc61": 535,
546
+ "\ud835\udc63": 536,
547
+ "\ud835\udd3c": 537,
548
+ "\ud835\udd40": 538,
549
+ "\ud835\udefc": 539,
550
+ "\ud835\udefd": 540,
551
+ "\ud835\udefe": 541,
552
+ "\ud835\udeff": 542,
553
+ "\ud835\udf03": 543,
554
+ "\ud835\udf06": 544,
555
+ "\ud835\udf07": 545,
556
+ "\ud835\udf0b": 546,
557
+ "\ud835\udf0d": 547,
558
+ "\ud835\udf15": 548
559
+ },
560
+ "vocab_tag": {
561
+ "<PAD>": 0,
562
+ "B-<creator>": 1,
563
+ "B-<software>": 2,
564
+ "B-<url>": 3,
565
+ "B-<version>": 4,
566
+ "I-<creator>": 5,
567
+ "I-<software>": 6,
568
+ "I-<url>": 7,
569
+ "I-<version>": 8,
570
+ "O": 9
571
+ },
572
+ "vocab_case": [
573
+ "<PAD>",
574
+ "numeric",
575
+ "allLower",
576
+ "allUpper",
577
+ "initialUpper",
578
+ "other",
579
+ "mainly_numeric",
580
+ "contains_digit"
581
+ ],
582
+ "max_char_length": 30,
583
+ "feature_preprocessor": null,
584
+ "indice_tag": {
585
+ "0": "<PAD>",
586
+ "1": "B-<creator>",
587
+ "2": "B-<software>",
588
+ "3": "B-<url>",
589
+ "4": "B-<version>",
590
+ "5": "I-<creator>",
591
+ "6": "I-<software>",
592
+ "7": "I-<url>",
593
+ "8": "I-<version>",
594
+ "9": "O"
595
+ }
596
+ }
software-BERT/transformer-config.json ADDED
@@ -0,0 +1,21 @@
+ {
+ "_name_or_path": "/srv/storage/[email protected]/lfoppiano/embeddings/scibert_scivocab_cased",
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "transformers_version": "4.33.2",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 31116
+ }
software-BERT/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
software-BERT/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
software-BERT/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
+ {
+ "add_prefix_space": true,
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_basic_tokenize": true,
+ "do_lower_case": true,
+ "mask_token": "[MASK]",
+ "max_length": 512,
+ "model_max_length": 1000000000000000019884624838656,
+ "never_split": null,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "BertTokenizer",
+ "unk_token": "[UNK]"
+ }
software-BERT/transformer-tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
software-BERT_CRF/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_name": "software-BERT_CRF",
+ "architecture": "BERT_CRF",
+ "embeddings_name": null,
+ "char_vocab_size": 545,
+ "case_vocab_size": 8,
+ "char_embedding_size": 25,
+ "num_char_lstm_units": 25,
+ "max_char_length": 30,
+ "features_vocabulary_size": 12,
+ "features_indices": null,
+ "features_embedding_size": 4,
+ "features_lstm_units": 4,
+ "max_sequence_length": 512,
+ "word_embedding_size": 0,
+ "num_word_lstm_units": 100,
+ "case_embedding_size": 5,
+ "dropout": 0.5,
+ "recurrent_dropout": 0.5,
+ "use_crf": true,
+ "use_chain_crf": false,
+ "fold_number": 1,
+ "batch_size": 8,
+ "transformer_name": "allenai/scibert_scivocab_cased/dir",
+ "use_ELMo": false,
+ "labels": {
+ "<PAD>": 0,
+ "B-<creator>": 1,
+ "B-<software>": 2,
+ "B-<url>": 3,
+ "B-<version>": 4,
+ "I-<creator>": 5,
+ "I-<software>": 6,
+ "I-<url>": 7,
+ "I-<version>": 8,
+ "O": 9
+ }
+ }
software-BERT_CRF/model_weights.hdf5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a71a32c7f372dad4f51c3018114cbaa2dd0575a50e9f1cc5c4797b04d5f6614d
+ size 440058504
software-BERT_CRF/preprocessor.json ADDED
@@ -0,0 +1,592 @@
1
+ {
2
+ "padding": true,
3
+ "return_lengths": false,
4
+ "return_word_embeddings": false,
5
+ "return_casing": false,
6
+ "return_features": false,
7
+ "return_chars": false,
8
+ "return_bert_embeddings": true,
9
+ "vocab_char": {
10
+ "<PAD>": 0,
11
+ "<UNK>": 1,
12
+ "!": 2,
13
+ "\"": 3,
14
+ "#": 4,
15
+ "$": 5,
16
+ "%": 6,
17
+ "&": 7,
18
+ "'": 8,
19
+ "(": 9,
20
+ ")": 10,
21
+ "*": 11,
22
+ "+": 12,
23
+ ",": 13,
24
+ "-": 14,
25
+ ".": 15,
26
+ "/": 16,
27
+ "0": 17,
28
+ "1": 18,
29
+ "2": 19,
30
+ "3": 20,
31
+ "4": 21,
32
+ "5": 22,
33
+ "6": 23,
34
+ "7": 24,
35
+ "8": 25,
36
+ "9": 26,
37
+ ":": 27,
38
+ ";": 28,
39
+ "<": 29,
40
+ "=": 30,
41
+ ">": 31,
42
+ "?": 32,
43
+ "@": 33,
44
+ "A": 34,
45
+ "B": 35,
46
+ "C": 36,
47
+ "D": 37,
48
+ "E": 38,
49
+ "F": 39,
50
+ "G": 40,
51
+ "H": 41,
52
+ "I": 42,
53
+ "J": 43,
54
+ "K": 44,
55
+ "L": 45,
56
+ "M": 46,
57
+ "N": 47,
58
+ "O": 48,
59
+ "P": 49,
60
+ "Q": 50,
61
+ "R": 51,
62
+ "S": 52,
63
+ "T": 53,
64
+ "U": 54,
65
+ "V": 55,
66
+ "W": 56,
67
+ "X": 57,
68
+ "Y": 58,
69
+ "Z": 59,
70
+ "[": 60,
71
+ "\\": 61,
72
+ "]": 62,
73
+ "^": 63,
74
+ "_": 64,
75
+ "`": 65,
76
+ "a": 66,
77
+ "b": 67,
78
+ "c": 68,
79
+ "d": 69,
80
+ "e": 70,
81
+ "f": 71,
82
+ "g": 72,
83
+ "h": 73,
84
+ "i": 74,
85
+ "j": 75,
86
+ "k": 76,
87
+ "l": 77,
88
+ "m": 78,
89
+ "n": 79,
90
+ "o": 80,
91
+ "p": 81,
92
+ "q": 82,
93
+ "r": 83,
94
+ "s": 84,
95
+ "t": 85,
96
+ "u": 86,
97
+ "v": 87,
98
+ "w": 88,
99
+ "x": 89,
100
+ "y": 90,
101
+ "z": 91,
102
+ "{": 92,
103
+ "|": 93,
104
+ "}": 94,
105
+ "~": 95,
106
+ "\u00a1": 96,
107
+ "\u00a2": 97,
108
+ "\u00a3": 98,
109
+ "\u00a5": 99,
110
+ "\u00a7": 100,
111
+ "\u00a8": 101,
112
+ "\u00a9": 102,
113
+ "\u00ab": 103,
114
+ "\u00ac": 104,
115
+ "\u00ae": 105,
116
+ "\u00b0": 106,
117
+ "\u00b1": 107,
118
+ "\u00b2": 108,
119
+ "\u00b3": 109,
120
+ "\u00b4": 110,
121
+ "\u00b5": 111,
122
+ "\u00b6": 112,
123
+ "\u00b7": 113,
124
+ "\u00b8": 114,
125
+ "\u00b9": 115,
126
+ "\u00ba": 116,
127
+ "\u00bb": 117,
128
+ "\u00bc": 118,
129
+ "\u00bd": 119,
130
+ "\u00be": 120,
131
+ "\u00bf": 121,
132
+ "\u00c0": 122,
133
+ "\u00c1": 123,
134
+ "\u00c2": 124,
135
+ "\u00c3": 125,
136
+ "\u00c4": 126,
137
+ "\u00c5": 127,
138
+ "\u00c8": 128,
139
+ "\u00c9": 129,
140
+ "\u00ca": 130,
141
+ "\u00cd": 131,
142
+ "\u00d0": 132,
143
+ "\u00d2": 133,
144
+ "\u00d3": 134,
145
+ "\u00d4": 135,
146
+ "\u00d5": 136,
147
+ "\u00d6": 137,
148
+ "\u00d7": 138,
149
+ "\u00d8": 139,
150
+ "\u00d9": 140,
151
+ "\u00da": 141,
152
+ "\u00dc": 142,
153
+ "\u00de": 143,
154
+ "\u00df": 144,
155
+ "\u00e0": 145,
156
+ "\u00e1": 146,
157
+ "\u00e2": 147,
158
+ "\u00e3": 148,
159
+ "\u00e4": 149,
160
+ "\u00e5": 150,
161
+ "\u00e7": 151,
162
+ "\u00e8": 152,
163
+ "\u00e9": 153,
164
+ "\u00ea": 154,
165
+ "\u00eb": 155,
166
+ "\u00ec": 156,
167
+ "\u00ed": 157,
168
+ "\u00ee": 158,
169
+ "\u00ef": 159,
170
+ "\u00f0": 160,
171
+ "\u00f1": 161,
172
+ "\u00f2": 162,
173
+ "\u00f3": 163,
174
+ "\u00f4": 164,
175
+ "\u00f5": 165,
176
+ "\u00f6": 166,
177
+ "\u00f7": 167,
178
+ "\u00f8": 168,
179
+ "\u00f9": 169,
180
+ "\u00fa": 170,
181
+ "\u00fb": 171,
182
+ "\u00fc": 172,
183
+ "\u00fd": 173,
184
+ "\u00fe": 174,
185
+ "\u0100": 175,
186
+ "\u0101": 176,
187
+ "\u0103": 177,
188
+ "\u0105": 178,
189
+ "\u0106": 179,
190
+ "\u0107": 180,
191
+ "\u0108": 181,
192
+ "\u0109": 182,
193
+ "\u010c": 183,
194
+ "\u010d": 184,
195
+ "\u0113": 185,
196
+ "\u0117": 186,
197
+ "\u0118": 187,
198
+ "\u011b": 188,
199
+ "\u012b": 189,
200
+ "\u0130": 190,
201
+ "\u0131": 191,
202
+ "\u013e": 192,
203
+ "\u0142": 193,
204
+ "\u0144": 194,
205
+ "\u0148": 195,
206
+ "\u014d": 196,
207
+ "\u0151": 197,
208
+ "\u0159": 198,
209
+ "\u015a": 199,
210
+ "\u015c": 200,
211
+ "\u015d": 201,
212
+ "\u015e": 202,
213
+ "\u015f": 203,
214
+ "\u0160": 204,
215
+ "\u0161": 205,
216
+ "\u0168": 206,
217
+ "\u0169": 207,
218
+ "\u016b": 208,
219
+ "\u016f": 209,
220
+ "\u0175": 210,
221
+ "\u0176": 211,
222
+ "\u0177": 212,
223
+ "\u017d": 213,
224
+ "\u017e": 214,
225
+ "\u0192": 215,
226
+ "\u01eb": 216,
227
+ "\u021b": 217,
228
+ "\u0251": 218,
229
+ "\u025b": 219,
230
+ "\u0263": 220,
231
+ "\u02a6": 221,
232
+ "\u02b9": 222,
233
+ "\u02bc": 223,
234
+ "\u02c2": 224,
235
+ "\u02c3": 225,
236
+ "\u02c6": 226,
237
+ "\u02c7": 227,
238
+ "\u02c9": 228,
239
+ "\u02d8": 229,
240
+ "\u02d9": 230,
241
+ "\u02da": 231,
242
+ "\u02db": 232,
243
+ "\u02dc": 233,
244
+ "\u0303": 234,
245
+ "\u030a": 235,
246
+ "\u0313": 236,
247
+ "\u0314": 237,
248
+ "\u031d": 238,
249
+ "\u034c": 239,
250
+ "\u0350": 240,
251
+ "\u0351": 241,
252
+ "\u0352": 242,
253
+ "\u0354": 243,
254
+ "\u0357": 244,
255
+ "\u0358": 245,
256
+ "\u0371": 246,
257
+ "\u0374": 247,
258
+ "\u0392": 248,
259
+ "\u0393": 249,
260
+ "\u0394": 250,
261
+ "\u0397": 251,
262
+ "\u0398": 252,
263
+ "\u039b": 253,
264
+ "\u03a0": 254,
265
+ "\u03a3": 255,
266
+ "\u03a4": 256,
267
+ "\u03a5": 257,
268
+ "\u03a6": 258,
269
+ "\u03a7": 259,
270
+ "\u03a8": 260,
271
+ "\u03a9": 261,
272
+ "\u03b1": 262,
273
+ "\u03b2": 263,
274
+ "\u03b3": 264,
275
+ "\u03b4": 265,
276
+ "\u03b5": 266,
277
+ "\u03b6": 267,
278
+ "\u03b7": 268,
279
+ "\u03b8": 269,
280
+ "\u03b9": 270,
281
+ "\u03ba": 271,
282
+ "\u03bb": 272,
283
+ "\u03bc": 273,
284
+ "\u03bd": 274,
285
+ "\u03be": 275,
286
+ "\u03c0": 276,
287
+ "\u03c1": 277,
288
+ "\u03c3": 278,
289
+ "\u03c4": 279,
290
+ "\u03c5": 280,
291
+ "\u03c6": 281,
292
+ "\u03c7": 282,
293
+ "\u03c8": 283,
294
+ "\u03c9": 284,
295
+ "\u03d1": 285,
296
+ "\u03d2": 286,
297
+ "\u03d5": 287,
298
+ "\u03e9": 288,
299
+ "\u03ea": 289,
300
+ "\u03eb": 290,
301
+ "\u03ed": 291,
302
+ "\u03ee": 292,
303
+ "\u03f1": 293,
304
+ "\u03f3": 294,
305
+ "\u03f5": 295,
306
+ "\u03fd": 296,
307
+ "\u03fe": 297,
308
+ "\u0408": 298,
309
+ "\u0413": 299,
310
+ "\u041a": 300,
311
+ "\u041d": 301,
312
+ "\u0424": 302,
313
+ "\u0430": 303,
314
+ "\u0545": 304,
315
+ "\u0546": 305,
316
+ "\u0609": 306,
317
+ "\u060a": 307,
318
+ "\u060c": 308,
319
+ "\u060d": 309,
320
+ "\u0621": 310,
321
+ "\u0623": 311,
322
+ "\u0626": 312,
323
+ "\u0627": 313,
324
+ "\u0628": 314,
325
+ "\u0629": 315,
326
+ "\u062a": 316,
327
+ "\u062b": 317,
328
+ "\u062d": 318,
329
+ "\u062e": 319,
330
+ "\u062f": 320,
331
+ "\u0631": 321,
332
+ "\u0633": 322,
333
+ "\u0634": 323,
334
+ "\u0637": 324,
335
+ "\u0639": 325,
336
+ "\u0641": 326,
337
+ "\u0642": 327,
338
+ "\u0643": 328,
339
+ "\u0644": 329,
340
+ "\u0645": 330,
341
+ "\u0646": 331,
342
+ "\u0648": 332,
343
+ "\u0649": 333,
344
+ "\u064a": 334,
345
+ "\u064b": 335,
346
+ "\u1e54": 336,
347
+ "\u1e63": 337,
348
+ "\u1e7c": 338,
349
+ "\u1e83": 339,
350
+ "\u1e91": 340,
351
+ "\u1ef9": 341,
352
+ "\u1fb1": 342,
353
+ "\u1fbd": 343,
354
+ "\u2016": 344,
355
+ "\u2020": 345,
356
+ "\u2021": 346,
357
+ "\u2022": 347,
358
+ "\u2026": 348,
359
+ "\u202b": 349,
360
+ "\u202c": 350,
361
+ "\u2030": 351,
362
+ "\u2032": 352,
363
+ "\u2033": 353,
364
+ "\u2034": 354,
365
+ "\u203e": 355,
366
+ "\u2044": 356,
367
+ "\u2074": 357,
368
+ "\u2075": 358,
369
+ "\u2081": 359,
370
+ "\u20a9": 360,
371
+ "\u20ac": 361,
372
+ "\u2103": 362,
373
+ "\u2119": 363,
374
+ "\u211c": 364,
375
+ "\u211d": 365,
376
+ "\u2122": 366,
377
+ "\u2126": 367,
378
+ "\u2150": 368,
379
+ "\u2190": 369,
380
+ "\u2191": 370,
381
+ "\u2192": 371,
382
+ "\u2193": 372,
383
+ "\u21a6": 373,
384
+ "\u21c4": 374,
385
+ "\u21d2": 375,
386
+ "\u21d4": 376,
387
+ "\u2200": 377,
388
+ "\u2202": 378,
389
+ "\u2205": 379,
390
+ "\u2206": 380,
391
+ "\u2208": 381,
392
+ "\u220e": 382,
393
+ "\u2211": 383,
394
+ "\u2212": 384,
395
+ "\u2213": 385,
396
+ "\u2215": 386,
397
+ "\u221a": 387,
398
+ "\u221d": 388,
399
+ "\u221e": 389,
400
+ "\u2229": 390,
401
+ "\u222a": 391,
402
+ "\u222b": 392,
403
+ "\u223c": 393,
404
+ "\u2243": 394,
405
+ "\u2248": 395,
406
+ "\u2260": 396,
407
+ "\u2261": 397,
408
+ "\u2264": 398,
409
+ "\u2265": 399,
410
+ "\u226b": 400,
411
+ "\u227a": 401,
412
+ "\u2282": 402,
413
+ "\u2286": 403,
414
+ "\u2295": 404,
415
+ "\u22a5": 405,
416
+ "\u22c5": 406,
417
+ "\u22ef": 407,
418
+ "\u2329": 408,
419
+ "\u232a": 409,
420
+ "\u232c": 410,
421
+ "\u2338": 411,
422
+ "\u233a": 412,
423
+ "\u233d": 413,
424
+ "\u239b": 414,
425
+ "\u239d": 415,
426
+ "\u239e": 416,
427
+ "\u23a0": 417,
428
+ "\u2423": 418,
429
+ "\u2424": 419,
430
+ "\u2425": 420,
431
+ "\u2426": 421,
432
+ "\u2440": 422,
433
+ "\u24c7": 423,
434
+ "\u2500": 424,
435
+ "\u2502": 425,
436
+ "\u25a0": 426,
437
+ "\u25a1": 427,
438
+ "\u25aa": 428,
439
+ "\u25ab": 429,
440
+ "\u25b2": 430,
441
+ "\u25b3": 431,
442
+ "\u25b5": 432,
443
+ "\u25b6": 433,
444
+ "\u25b8": 434,
445
+ "\u25c6": 435,
446
+ "\u25c7": 436,
447
+ "\u2605": 437,
448
+ "\u2610": 438,
449
+ "\u2713": 439,
450
+ "\u274f": 440,
451
+ "\u29cb": 441,
452
+ "\u2a7d": 442,
453
+ "\u2a7e": 443,
454
+ "\u3008": 444,
455
+ "\u3009": 445,
456
+ "\ue023": 446,
457
+ "\ue024": 447,
458
+ "\ue02c": 448,
459
+ "\ue02e": 449,
460
+ "\ue032": 450,
461
+ "\ue039": 451,
462
+ "\ue044": 452,
463
+ "\ue061": 453,
464
+ "\ue062": 454,
465
+ "\ue067": 455,
466
+ "\ue06b": 456,
467
+ "\ue06c": 457,
468
+ "\ue06d": 458,
469
+ "\ue06e": 459,
470
+ "\ue073": 460,
471
+ "\ue074": 461,
472
+ "\ue07a": 462,
473
+ "\ue093": 463,
474
+ "\ue09d": 464,
475
+ "\ue103": 465,
476
+ "\ue104": 466,
477
+ "\uf03c": 467,
478
+ "\uf043": 468,
479
+ "\uf061": 469,
480
+ "\uf062": 470,
481
+ "\uf063": 471,
482
+ "\uf065": 472,
483
+ "\uf067": 473,
484
+ "\uf06b": 474,
485
+ "\uf06c": 475,
486
+ "\uf06d": 476,
487
+ "\uf070": 477,
488
+ "\uf073": 478,
489
+ "\uf074": 479,
490
+ "\uf0a2": 480,
491
+ "\uf0a3": 481,
492
+ "\uf0b0": 482,
493
+ "\uf0b1": 483,
494
+ "\uf0b4": 484,
495
+ "\uf0b7": 485,
496
+ "\uf0b9": 486,
497
+ "\uf0bb": 487,
498
+ "\uf0e2": 488,
499
+ "\uf0e4": 489,
500
+ "\uf0fc": 490,
501
+ "\uf643": 491,
502
+ "\uf644": 492,
503
+ "\uf645": 493,
504
+ "\uf646": 494,
505
+ "\uf647": 495,
506
+ "\uf648": 496,
507
+ "\uf649": 497,
508
+ "\uf64a": 498,
509
+ "\uf64b": 499,
510
+ "\uf64c": 500,
511
+ "\uf6f6": 501,
512
+ "\uf76a": 502,
513
+ "\uf76d": 503,
514
+ "\uf775": 504,
515
+ "\uf777": 505,
516
+ "\uf8e8": 506,
517
+ "\uff0b": 507,
518
+ "\uff0c": 508,
519
+ "\uff1a": 509,
520
+ "\uff1c": 510,
521
+ "\uff1e": 511,
522
+ "\ufffd": 512,
523
+ "\ud835\udc34": 513,
524
+ "\ud835\udc36": 514,
525
+ "\ud835\udc37": 515,
526
+ "\ud835\udc39": 516,
527
+ "\ud835\udc3a": 517,
528
+ "\ud835\udc3b": 518,
529
+ "\ud835\udc3e": 519,
530
+ "\ud835\udc3f": 520,
531
+ "\ud835\udc42": 521,
532
+ "\ud835\udc43": 522,
533
+ "\ud835\udc51": 523,
534
+ "\ud835\udc53": 524,
535
+ "\ud835\udc54": 525,
536
+ "\ud835\udc56": 526,
537
+ "\ud835\udc57": 527,
538
+ "\ud835\udc58": 528,
539
+ "\ud835\udc5e": 529,
540
+ "\ud835\udc5f": 530,
541
+ "\ud835\udc61": 531,
542
+ "\ud835\udc63": 532,
543
+ "\ud835\udd3c": 533,
544
+ "\ud835\udd40": 534,
545
+ "\ud835\udefc": 535,
546
+ "\ud835\udefd": 536,
547
+ "\ud835\udefe": 537,
548
+ "\ud835\udeff": 538,
549
+ "\ud835\udf03": 539,
550
+ "\ud835\udf06": 540,
551
+ "\ud835\udf07": 541,
552
+ "\ud835\udf0b": 542,
553
+ "\ud835\udf0d": 543,
554
+ "\ud835\udf15": 544
555
+ },
556
+ "vocab_tag": {
557
+ "<PAD>": 0,
558
+ "B-<creator>": 1,
559
+ "B-<software>": 2,
560
+ "B-<url>": 3,
561
+ "B-<version>": 4,
562
+ "I-<creator>": 5,
563
+ "I-<software>": 6,
564
+ "I-<url>": 7,
565
+ "I-<version>": 8,
566
+ "O": 9
567
+ },
568
+ "vocab_case": [
569
+ "<PAD>",
570
+ "numeric",
571
+ "allLower",
572
+ "allUpper",
573
+ "initialUpper",
574
+ "other",
575
+ "mainly_numeric",
576
+ "contains_digit"
577
+ ],
578
+ "max_char_length": 30,
579
+ "feature_preprocessor": null,
580
+ "indice_tag": {
581
+ "0": "<PAD>",
582
+ "1": "B-<creator>",
583
+ "2": "B-<software>",
584
+ "3": "B-<url>",
585
+ "4": "B-<version>",
586
+ "5": "I-<creator>",
587
+ "6": "I-<software>",
588
+ "7": "I-<url>",
589
+ "8": "I-<version>",
590
+ "9": "O"
591
+ }
592
+ }
software-BERT_CRF/transformer-config.json ADDED
@@ -0,0 +1,21 @@
+ {
+ "_name_or_path": "/srv/storage/[email protected]/lfoppiano/embeddings/scibert_scivocab_cased",
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "transformers_version": "4.33.2",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 31116
+ }
software-BERT_CRF/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
software-BERT_CRF/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
software-BERT_CRF/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
+ {
+ "add_prefix_space": true,
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_basic_tokenize": true,
+ "do_lower_case": true,
+ "mask_token": "[MASK]",
+ "max_length": 512,
+ "model_max_length": 1000000000000000019884624838656,
+ "never_split": null,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "BertTokenizer",
+ "unk_token": "[UNK]"
+ }
software-BERT_CRF/transformer-tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
software-type-BERT_CRF/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_name": "software-type-BERT_CRF",
+ "architecture": "BERT_CRF",
+ "embeddings_name": null,
+ "char_vocab_size": 296,
+ "case_vocab_size": 8,
+ "char_embedding_size": 25,
+ "num_char_lstm_units": 25,
+ "max_char_length": 30,
+ "features_vocabulary_size": 12,
+ "features_indices": null,
+ "features_embedding_size": 4,
+ "features_lstm_units": 4,
+ "max_sequence_length": 512,
+ "word_embedding_size": 0,
+ "num_word_lstm_units": 100,
+ "case_embedding_size": 5,
+ "dropout": 0.5,
+ "recurrent_dropout": 0.5,
+ "use_crf": true,
+ "use_chain_crf": false,
+ "fold_number": 1,
+ "batch_size": 8,
+ "transformer_name": "michiyasunaga/LinkBERT-basecased",
+ "use_ELMo": false,
+ "labels": {
+ "<PAD>": 0,
+ "B-<component>": 1,
+ "B-<environment>": 2,
+ "B-<implicit>": 3,
+ "B-<language>": 4,
+ "I-<component>": 5,
+ "I-<environment>": 6,
+ "I-<implicit>": 7,
+ "I-<language>": 8,
+ "O": 9
+ }
+ }
software-type-BERT_CRF/preprocessor.json ADDED
@@ -0,0 +1,343 @@
1
+ {
2
+ "padding": true,
3
+ "return_lengths": false,
4
+ "return_word_embeddings": false,
5
+ "return_casing": false,
6
+ "return_features": false,
7
+ "return_chars": false,
8
+ "return_bert_embeddings": true,
9
+ "vocab_char": {
10
+ "<PAD>": 0,
11
+ "<UNK>": 1,
12
+ "!": 2,
13
+ "\"": 3,
14
+ "#": 4,
15
+ "$": 5,
16
+ "%": 6,
17
+ "&": 7,
18
+ "'": 8,
19
+ "(": 9,
20
+ ")": 10,
21
+ "*": 11,
22
+ "+": 12,
23
+ ",": 13,
24
+ "-": 14,
25
+ ".": 15,
26
+ "/": 16,
27
+ "0": 17,
28
+ "1": 18,
29
+ "2": 19,
30
+ "3": 20,
31
+ "4": 21,
32
+ "5": 22,
33
+ "6": 23,
34
+ "7": 24,
35
+ "8": 25,
36
+ "9": 26,
37
+ ":": 27,
38
+ ";": 28,
39
+ "<": 29,
40
+ "=": 30,
41
+ ">": 31,
42
+ "?": 32,
43
+ "@": 33,
44
+ "A": 34,
45
+ "B": 35,
46
+ "C": 36,
47
+ "D": 37,
48
+ "E": 38,
49
+ "F": 39,
50
+ "G": 40,
51
+ "H": 41,
52
+ "I": 42,
53
+ "J": 43,
54
+ "K": 44,
55
+ "L": 45,
56
+ "M": 46,
57
+ "N": 47,
58
+ "O": 48,
59
+ "P": 49,
60
+ "Q": 50,
61
+ "R": 51,
62
+ "S": 52,
63
+ "T": 53,
64
+ "U": 54,
65
+ "V": 55,
66
+ "W": 56,
67
+ "X": 57,
68
+ "Y": 58,
69
+ "Z": 59,
70
+ "[": 60,
71
+ "]": 61,
72
+ "^": 62,
73
+ "_": 63,
74
+ "`": 64,
75
+ "a": 65,
76
+ "b": 66,
77
+ "c": 67,
78
+ "d": 68,
79
+ "e": 69,
80
+ "f": 70,
81
+ "g": 71,
82
+ "h": 72,
83
+ "i": 73,
84
+ "j": 74,
85
+ "k": 75,
86
+ "l": 76,
87
+ "m": 77,
88
+ "n": 78,
89
+ "o": 79,
90
+ "p": 80,
91
+ "q": 81,
92
+ "r": 82,
93
+ "s": 83,
94
+ "t": 84,
95
+ "u": 85,
96
+ "v": 86,
97
+ "w": 87,
98
+ "x": 88,
99
+ "y": 89,
100
+ "z": 90,
101
+ "{": 91,
102
+ "|": 92,
103
+ "}": 93,
104
+ "~": 94,
105
+ "\u00a1": 95,
106
+ "\u00a2": 96,
107
+ "\u00a3": 97,
108
+ "\u00a7": 98,
109
+ "\u00a8": 99,
110
+ "\u00a9": 100,
111
+ "\u00ae": 101,
112
+ "\u00af": 102,
113
+ "\u00b0": 103,
114
+ "\u00b1": 104,
115
+ "\u00b2": 105,
116
+ "\u00b4": 106,
117
+ "\u00b5": 107,
118
+ "\u00b7": 108,
119
+ "\u00b8": 109,
120
+ "\u00b9": 110,
121
+ "\u00ba": 111,
122
+ "\u00bb": 112,
123
+ "\u00bc": 113,
124
+ "\u00bd": 114,
125
+ "\u00c0": 115,
126
+ "\u00c1": 116,
127
+ "\u00c2": 117,
128
+ "\u00c3": 118,
129
+ "\u00c5": 119,
130
+ "\u00c9": 120,
131
+ "\u00d2": 121,
132
+ "\u00d4": 122,
133
+ "\u00d5": 123,
134
+ "\u00d7": 124,
135
+ "\u00dc": 125,
136
+ "\u00de": 126,
137
+ "\u00df": 127,
138
+ "\u00e1": 128,
139
+ "\u00e2": 129,
140
+ "\u00e3": 130,
141
+ "\u00e4": 131,
142
+ "\u00e5": 132,
143
+ "\u00e7": 133,
144
+ "\u00e8": 134,
145
+ "\u00e9": 135,
146
+ "\u00ea": 136,
147
+ "\u00ed": 137,
148
+ "\u00ef": 138,
149
+ "\u00f0": 139,
150
+ "\u00f3": 140,
151
+ "\u00f4": 141,
152
+ "\u00f6": 142,
153
+ "\u00f8": 143,
154
+ "\u00fa": 144,
155
+ "\u00fc": 145,
156
+ "\u00fe": 146,
157
+ "\u0107": 147,
158
+ "\u010c": 148,
159
+ "\u010d": 149,
160
+ "\u0113": 150,
161
+ "\u0117": 151,
162
+ "\u0131": 152,
163
+ "\u0144": 153,
164
+ "\u0160": 154,
165
+ "\u0161": 155,
166
+ "\u0177": 156,
167
+ "\u017e": 157,
168
+ "\u0192": 158,
169
+ "\u01eb": 159,
170
+ "\u0251": 160,
171
+ "\u025b": 161,
172
+ "\u02c2": 162,
173
+ "\u02da": 163,
174
+ "\u02dc": 164,
175
+ "\u030a": 165,
176
+ "\u031d": 166,
177
+ "\u0357": 167,
178
+ "\u0358": 168,
179
+ "\u0374": 169,
180
+ "\u0394": 170,
181
+ "\u0397": 171,
182
+ "\u03a6": 172,
183
+ "\u03a9": 173,
184
+ "\u03b1": 174,
185
+ "\u03b2": 175,
186
+ "\u03b3": 176,
187
+ "\u03b4": 177,
188
+ "\u03b5": 178,
189
+ "\u03b6": 179,
190
+ "\u03b7": 180,
191
+ "\u03b8": 181,
192
+ "\u03ba": 182,
193
+ "\u03bb": 183,
194
+ "\u03bc": 184,
195
+ "\u03bd": 185,
196
+ "\u03be": 186,
197
+ "\u03c0": 187,
198
+ "\u03c1": 188,
199
+ "\u03c3": 189,
200
+ "\u03c4": 190,
201
+ "\u03c5": 191,
202
+ "\u03c6": 192,
203
+ "\u03c7": 193,
204
+ "\u03c8": 194,
205
+ "\u03c9": 195,
206
+ "\u03d2": 196,
207
+ "\u03d5": 197,
208
+ "\u03e9": 198,
209
+ "\u03ea": 199,
210
+ "\u03eb": 200,
211
+ "\u03ed": 201,
212
+ "\u03ee": 202,
213
+ "\u03f3": 203,
214
+ "\u03fd": 204,
215
+ "\u03fe": 205,
216
+ "\u0408": 206,
217
+ "\u0412": 207,
218
+ "\u0545": 208,
219
+ "\u0546": 209,
220
+ "\u0609": 210,
221
+ "\u060a": 211,
222
+ "\u060e": 212,
223
+ "\u1e91": 213,
224
+ "\u2020": 214,
225
+ "\u2021": 215,
226
+ "\u2022": 216,
227
+ "\u2026": 217,
228
+ "\u2032": 218,
229
+ "\u2033": 219,
230
+ "\u2044": 220,
231
+ "\u2074": 221,
232
+ "\u2075": 222,
233
+ "\u2081": 223,
234
+ "\u20ac": 224,
235
+ "\u2103": 225,
236
+ "\u2122": 226,
237
+ "\u2150": 227,
238
+ "\u2192": 228,
239
+ "\u2202": 229,
240
+ "\u2206": 230,
241
+ "\u2208": 231,
242
+ "\u2212": 232,
243
+ "\u2215": 233,
244
+ "\u221a": 234,
245
+ "\u221e": 235,
246
+ "\u223c": 236,
247
+ "\u2248": 237,
248
+ "\u2260": 238,
249
+ "\u2261": 239,
250
+ "\u2264": 240,
251
+ "\u2265": 241,
252
+ "\u22c5": 242,
253
+ "\u232c": 243,
254
+ "\u233a": 244,
255
+ "\u233d": 245,
256
+ "\u2423": 246,
257
+ "\u2424": 247,
258
+ "\u2425": 248,
259
+ "\u24c7": 249,
260
+ "\u2500": 250,
261
+ "\u2502": 251,
262
+ "\u25b3": 252,
263
+ "\u25b8": 253,
264
+ "\u2a7e": 254,
265
+ "\ue024": 255,
266
+ "\ue02c": 256,
267
+ "\ue02e": 257,
268
+ "\ue032": 258,
269
+ "\ue039": 259,
270
+ "\ue061": 260,
271
+ "\ue062": 261,
272
+ "\ue067": 262,
273
+ "\ue06d": 263,
274
+ "\ue074": 264,
275
+ "\ue07a": 265,
276
+ "\ue093": 266,
277
+ "\ue09d": 267,
278
+ "\ue103": 268,
279
+ "\ue104": 269,
280
+ "\uf062": 270,
281
+ "\uf06c": 271,
282
+ "\uf06d": 272,
283
+ "\uf077": 273,
284
+ "\uf0a3": 274,
285
+ "\uf0b0": 275,
286
+ "\uf0b1": 276,
287
+ "\uf0b4": 277,
288
+ "\uf0b7": 278,
289
+ "\uf0b9": 279,
290
+ "\uf643": 280,
291
+ "\uf644": 281,
292
+ "\uf645": 282,
293
+ "\uf64b": 283,
294
+ "\uf64c": 284,
295
+ "\uf76d": 285,
296
+ "\uff1a": 286,
297
+ "\uff1c": 287,
298
+ "\uff1e": 288,
299
+ "\ufffd": 289,
300
+ "\ud835\udc42": 290,
301
+ "\ud835\udc56": 291,
302
+ "\ud835\udc57": 292,
303
+ "\ud835\udc61": 293,
304
+ "\ud835\udefc": 294,
305
+ "\ud835\udeff": 295
306
+ },
307
+ "vocab_tag": {
308
+ "<PAD>": 0,
309
+ "B-<component>": 1,
310
+ "B-<environment>": 2,
311
+ "B-<implicit>": 3,
312
+ "B-<language>": 4,
313
+ "I-<component>": 5,
314
+ "I-<environment>": 6,
315
+ "I-<implicit>": 7,
316
+ "I-<language>": 8,
317
+ "O": 9
318
+ },
319
+ "vocab_case": [
320
+ "<PAD>",
321
+ "numeric",
322
+ "allLower",
323
+ "allUpper",
324
+ "initialUpper",
325
+ "other",
326
+ "mainly_numeric",
327
+ "contains_digit"
328
+ ],
329
+ "max_char_length": 30,
330
+ "feature_preprocessor": null,
331
+ "indice_tag": {
332
+ "0": "<PAD>",
333
+ "1": "B-<component>",
334
+ "2": "B-<environment>",
335
+ "3": "B-<implicit>",
336
+ "4": "B-<language>",
337
+ "5": "I-<component>",
338
+ "6": "I-<environment>",
339
+ "7": "I-<implicit>",
340
+ "8": "I-<language>",
341
+ "9": "O"
342
+ }
343
+ }
software-type-BERT_CRF/transformer-config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "/srv/storage/[email protected]/lfoppiano/embeddings/LinkBERT-base",
+ "architectures": [
+ "BertModel"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "transformers_version": "4.33.2",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 28996
+ }
software-type-BERT_CRF/transformer-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
software-type-BERT_CRF/transformer-tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
software-type-BERT_CRF/transformer-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "add_prefix_space": true,
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_lower_case": false,
+ "mask_token": "[MASK]",
+ "max_length": 512,
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "BertTokenizer",
+ "unk_token": "[UNK]"
+ }