Simon Tang committed on
Commit
fb4253c
·
1 Parent(s): 24c3942

commit files to HF hub

Browse files
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
3
+ "architectures": [
4
+ "DebertaV2ForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "custom_pipelines": {
8
+ "entailment-classification": {
9
+ "impl": "pipeline.MyPipeline",
10
+ "pt": [
11
+ "AutoModelForSequenceClassification"
12
+ ],
13
+ "tf": []
14
+ }
15
+ },
16
+ "hidden_act": "gelu",
17
+ "hidden_dropout_prob": 0.1,
18
+ "hidden_size": 768,
19
+ "id2label": {
20
+ "0": "entailment",
21
+ "1": "neutral",
22
+ "2": "contradiction"
23
+ },
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 3072,
26
+ "label2id": {
27
+ "contradiction": 2,
28
+ "entailment": 0,
29
+ "neutral": 1
30
+ },
31
+ "layer_norm_eps": 1e-07,
32
+ "max_position_embeddings": 512,
33
+ "max_relative_positions": -1,
34
+ "model_type": "deberta-v2",
35
+ "norm_rel_ebd": "layer_norm",
36
+ "num_attention_heads": 12,
37
+ "num_hidden_layers": 12,
38
+ "pad_token_id": 0,
39
+ "pooler_dropout": 0,
40
+ "pooler_hidden_act": "gelu",
41
+ "pooler_hidden_size": 768,
42
+ "pos_att_type": [
43
+ "p2c",
44
+ "c2p"
45
+ ],
46
+ "position_biased_input": false,
47
+ "position_buckets": 256,
48
+ "relative_attention": true,
49
+ "share_att_key": true,
50
+ "torch_dtype": "float32",
51
+ "transformers_version": "4.30.2",
52
+ "type_vocab_size": 0,
53
+ "vocab_size": 128100
54
+ }
pipeline.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import Repository
2
+ from typing import List, Union
3
+ from transformers import pipeline
4
+ from transformers.pipelines import PIPELINE_REGISTRY
5
+ from transformers import AutoModelForSequenceClassification, Pipeline
6
+ import torch
7
+
8
+
9
+ # from loguru import logger
10
+
11
+
12
class MyPipeline(Pipeline):
    """Premise/hypothesis classification pipeline.

    A premise (and optionally a hypothesis) is tokenized as a text pair, run
    through the wrapped sequence-classification model, and the resulting
    logits are converted into per-label percentages.
    """

    # Label order used when the model config does not provide ``id2label``.
    _DEFAULT_LABEL_NAMES = ("entailment", "neutral", "contradiction")

    def _sanitize_parameters(self, **kwargs):
        """Split call-time keyword arguments between the pipeline stages.

        Only ``hypothesis`` is recognised, and it is handed to
        :meth:`preprocess`. The two remaining dictionaries of the returned
        triple are always empty.
        """
        preprocess_kwargs = {}
        if "hypothesis" in kwargs:
            preprocess_kwargs["hypothesis"] = kwargs["hypothesis"]
        return preprocess_kwargs, {}, {}

    def __call__(
        self,
        sequences: Union[str, List[str]],
        *args,
        **kwargs,
    ):
        """Run the pipeline on ``sequences``.

        The hypothesis may be given either as the single extra positional
        argument or as the ``hypothesis`` keyword, but not both.

        Raises:
            ValueError: if more than one extra positional argument is given,
                or if a positional hypothesis is combined with the
                ``hypothesis`` keyword.
        """
        if args:
            if len(args) != 1 or "hypothesis" in kwargs:
                raise ValueError(f"Unable to understand extra arguments {args}")
            kwargs["hypothesis"] = args[0]

        return super().__call__(sequences, **kwargs)

    def preprocess(self, premise, hypothesis=None):
        """Tokenize ``premise`` (paired with ``hypothesis`` when given).

        Returns a dict of PyTorch tensors holding ``input_ids`` and, when the
        tokenizer produces one, ``attention_mask``.
        """
        encoded = self.tokenizer(
            premise,
            hypothesis,
            truncation=True,
            return_tensors="pt",
        )

        model_inputs = {"input_ids": encoded["input_ids"]}
        # Forward the mask as well so the model does not have to assume that
        # every position is a real token.
        if "attention_mask" in encoded:
            model_inputs["attention_mask"] = encoded["attention_mask"]
        return model_inputs

    def _forward(self, input_ids):
        """Run the model on the dict produced by :meth:`preprocess`.

        Despite its name, ``input_ids`` is that whole dict, not a bare tensor.
        """
        return self.model(
            input_ids["input_ids"],
            attention_mask=input_ids.get("attention_mask"),
        )

    def postprocess(self, model_outputs):
        """Convert the logits into a ``{label: percentage}`` dict.

        Only the first row of ``model_outputs["logits"]`` is used. Each
        probability is scaled to a percentage and rounded to one decimal.
        Label names come from the model config's ``id2label`` when available,
        otherwise from ``_DEFAULT_LABEL_NAMES``.
        """
        probabilities = torch.softmax(model_outputs["logits"][0], -1).tolist()

        id2label = getattr(self.model.config, "id2label", None)
        if id2label:
            label_names = [
                id2label.get(index, f"LABEL_{index}")
                for index in range(len(probabilities))
            ]
        else:
            label_names = self._DEFAULT_LABEL_NAMES

        return {
            name: round(float(probability) * 100, 1)
            for probability, name in zip(probabilities, label_names)
        }
58
+
59
+
60
+ # PIPELINE_REGISTRY.register_pipeline(
61
+ # "test",
62
+ # pipeline_class=MyPipeline,
63
+ # pt_model=AutoModelForSequenceClassification,
64
+ # # default={"pt": ("MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", "retina")},
65
+ # # type="text",
66
+ # )
67
+
68
+
69
+ # classifier = pipeline("test",
70
+ # model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
71
+ # # tokenizer="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
72
+ # )
73
+
74
+ # output = classifier(
75
+ # "Angela Merkel is a politician in Germany and leader of the CDU",
76
+ # hypothesis="this is a test"
77
+ # )
78
+ # # logger.info(output)
79
+
80
+
81
+ # # repo = Repository("entailment-classifier",
82
+ # # clone_from="Tverous/entailment-classifier")
83
+ # classifier.save_pretrained("entailment-classifier")
84
+ # # repo.push_to_hub()
85
+
86
+
87
+ # logger.info("Finished")
88
+
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be8178bff2c2a45c2c893a6584cd2b09ce98493f762c591175df839b9b38e6a4
3
+ size 737769017
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
9
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "[CLS]",
5
+ "do_lower_case": false,
6
+ "eos_token": "[SEP]",
7
+ "mask_token": "[MASK]",
8
+ "model_max_length": 512,
9
+ "pad_token": "[PAD]",
10
+ "sep_token": "[SEP]",
11
+ "sp_model_kwargs": {},
12
+ "split_by_punct": false,
13
+ "tokenizer_class": "DebertaV2Tokenizer",
14
+ "unk_token": "[UNK]",
15
+ "vocab_type": "spm"
16
+ }