Dawid Motyka committed
Commit 834d42f · 1 Parent(s): fd1b237

app and model

Files changed (4)
  1. app.py +56 -0
  2. inference_utils.py +11 -0
  3. models.py +85 -0
  4. preprocessing.py +22 -0
app.py ADDED
@@ -0,0 +1,56 @@
+ import gradio as gr
+
+ import numpy as np
+ import torch
+ from transformers import AutoTokenizer, Pipeline
+
+ from inference_utils import prepare_stance_texts
+ from models import StanceEncoderModel
+
+ CLASS_DICT = {0: 'FAVOR', 1: 'AGAINST', 2: 'NEITHER'}
+
+ params = {'lang': 'pl',
+           'masked_lm_prompt': 4}
+
+
+ class StancePipeline(Pipeline):
+
+     def _sanitize_parameters(self, **pipeline_parameters):
+         return pipeline_parameters, {}, {}
+
+     def preprocess(self, input):
+         # Wrap the text/target pair in the masked-LM prompt and tokenize it as a sentence pair.
+         prompt_text, prompt_target = prepare_stance_texts([input['text']], [input['target']], params, self.tokenizer)
+         inputs = self.tokenizer(prompt_text, prompt_target, return_tensors="pt", padding=True, truncation='only_first')
+         # sequence_ids marks the tokens that belong to the second (prompt/target) segment.
+         return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'],
+                 'sequence_ids': torch.tensor((np.array(inputs.sequence_ids()) == 1).astype(int)).unsqueeze(0)}
+
+     def _forward(self, model_inputs):
+         outputs = self.model(**model_inputs)
+         return outputs
+
+     def postprocess(self, model_outputs):
+         probas = model_outputs["logits"].softmax(-1)
+         score = probas.max(-1)[0].item()
+         return {'stance': CLASS_DICT[probas.argmax(-1).item()], 'score': score}
+
+
+ pipeline = StancePipeline(model=StanceEncoderModel.from_pretrained('clarin-knext/stance-pl-1'),
+                           tokenizer=AutoTokenizer.from_pretrained('clarin-knext/stance-pl-1'),
+                           batch_size=1)
+
+
+ def predict(text, target):
+     predictions = pipeline({'text': text, 'target': target})
+     return predictions['stance'], predictions['score']
+
+
+ gradio_app = gr.Interface(
+     predict,
+     inputs=[gr.TextArea(label="Text", placeholder="text"),
+             gr.Textbox(label="Target", placeholder="stance target")],
+     outputs=[gr.Label(label="Stance"), gr.Label(label="Score")],
+     title="Polish stance detection",
+ )
+
+ if __name__ == "__main__":
+     gradio_app.launch()
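
Outside the Gradio UI, the same predict function can be called directly. A minimal sketch, assuming this repository is on PYTHONPATH; the Polish sentence and target below are illustrative only:

from app import predict

# Hypothetical input; any text/target pair works.
stance, score = predict("Uważam, że energia jądrowa jest bezpieczna.", "energia jądrowa")
print(stance, score)  # one of FAVOR / AGAINST / NEITHER plus its softmax probability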
inference_utils.py ADDED
@@ -0,0 +1,11 @@
+ from typing import Any, Dict, List
+ import pandas as pd
+
+ from preprocessing import format_masked_lm_prompt
+
+
+ def prepare_stance_texts(texts: List[str], targets: List[str], params: Dict[str, Any], tokenizer):
+     # Build the prompt-formatted (text, prompt) pairs expected by the masked-LM stance model.
+     texts_df = pd.DataFrame({'text': texts, 'target': targets})
+     prompt_text, prompt_target = format_masked_lm_prompt(texts_df, tokenizer=tokenizer,
+                                                          prompt_type=params['masked_lm_prompt'], lang='pl')
+     return prompt_text, prompt_target
models.py ADDED
@@ -0,0 +1,85 @@
+ import logging
+ from typing import Optional, Tuple, Union
+
+ import torch
+ from torch.nn import CrossEntropyLoss
+ from transformers import PreTrainedModel, BertForMaskedLM, BertConfig
+ from transformers.modeling_outputs import SequenceClassifierOutput
+
+
+ class StanceEncoderModel(PreTrainedModel):
+
+     config_class = BertConfig
+     logger = logging.getLogger("StanceEncoderModel")
+
+     def __init__(self, config):
+         super().__init__(config)
+         # All stance-specific hyperparameters are read from config.task_specific_params.
+         task_specific_params = config.task_specific_params
+         self.num_labels = task_specific_params.get('num_labels', 3)
+         self.mask_token_id = task_specific_params['mask_token_id']
+         self.verbalizer_token_ids = task_specific_params['verbalizer_token_ids']
+         self.clf_hidden_dim = task_specific_params.get('clf_hidden_dim', 300)
+         self.clf_drop_prob = task_specific_params.get('clf_drop_prob', 0.2)
+         self.clf_gelu_head = task_specific_params.get('clf_gelu_head', False)
+         self.masked_lm = task_specific_params.get('masked_lm', True)
+         self.masked_lm_n_tokens = task_specific_params.get('masked_lm_tokens', 1)
+         self.masked_lm_verbalizer = task_specific_params.get('masked_lm_verbalizer', False)
+
+         # Reuse the BERT encoder and MLM head from a standard masked-LM model.
+         base_model = BertForMaskedLM(config)
+         self.base_enc_model = base_model.bert
+         self.lm_head = base_model.cls
+
+         hidden_size_multiplier = 1
+
+         if not self.masked_lm_verbalizer:
+             if self.clf_gelu_head:
+                 self.logger.info('using 2 layer gelu classifier head')
+                 self.classifier = torch.nn.Sequential(
+                     torch.nn.Linear(self.config.hidden_size * hidden_size_multiplier, self.clf_hidden_dim),
+                     torch.nn.Dropout(self.clf_drop_prob),
+                     torch.nn.GELU(),
+                     torch.nn.Linear(self.clf_hidden_dim, self.num_labels)
+                 )
+             else:
+                 raise ValueError('classification type head not specified')
+
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         sequence_ids: Optional[torch.Tensor] = None,
+     ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+
+         outputs = self.base_enc_model(
+             input_ids=input_ids,
+             attention_mask=attention_mask
+         )
+
+         # Gather the hidden states at the [MASK] positions (one mask per example is assumed).
+         masked_token_filter = input_ids == self.mask_token_id
+         masked_repr = outputs.last_hidden_state[masked_token_filter].reshape(len(input_ids), -1)
+
+         if self.masked_lm_verbalizer:
+             # Score only the verbalizer tokens with the MLM head.
+             logits = self.lm_head(masked_repr)[:, self.verbalizer_token_ids]
+         else:
+             logits = self.classifier(masked_repr)
+
+         loss = None
+         if labels is not None:
+             loss_fct = CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
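
StanceEncoderModel reads all of its stance-specific settings from config.task_specific_params, so the checkpoint's config has to carry that block. A minimal sketch of such a block, with illustrative values only (the actual values ship with clarin-knext/stance-pl-1); the keys mirror exactly what __init__ reads:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('clarin-knext/stance-pl-1')

# Illustrative values, not the shipped configuration.
task_specific_params = {
    'num_labels': 3,                     # FAVOR / AGAINST / NEITHER
    'mask_token_id': tok.mask_token_id,  # forward() locates the [MASK] position by this id
    'verbalizer_token_ids': [],          # one vocab id per class; used only when masked_lm_verbalizer=True
    'clf_hidden_dim': 300,
    'clf_drop_prob': 0.2,
    'clf_gelu_head': True,               # __init__ raises ValueError unless this or the verbalizer head is enabled
    'masked_lm_verbalizer': False,
}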
preprocessing.py ADDED
@@ -0,0 +1,22 @@
+ import logging
+
+ logger = logging.getLogger('stancedatasets')
+
+
+ def format_masked_lm_prompt(stance_df, tokenizer, prompt_type, lang='en', masked_lm_tokens=1):
+     # Appends a Polish prompt ("[So] my stance towards <target> is <mask>.") after the text.
+     masks_str = "".join([tokenizer.mask_token] * masked_lm_tokens)
+     if lang == 'pl':
+         if prompt_type == 1:
+             return list(stance_df['text']), \
+                 list('Moja postawa w kierunku ' + stance_df['target'] + ' jest: ' + masks_str + '.')
+         elif prompt_type == 2:
+             return list(stance_df['text']), \
+                 list('Moja postawa w kierunku ' + stance_df['target'] + ' jest ' + masks_str + '.')
+         elif prompt_type == 3:
+             return list(stance_df['text']), \
+                 list('Więc moja postawa w kierunku ' + stance_df['target'] + ' jest: ' + masks_str + '.')
+         elif prompt_type == 4:
+             return list(stance_df['text']), \
+                 list('Więc moja postawa w kierunku ' + stance_df['target'] + ' jest ' + masks_str + '.')
+
+     raise ValueError(f'unknown prompt_type: {prompt_type} for language {lang}')
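
For prompt_type 4, the setting used in app.py, the prompt reads roughly "So my stance towards <target> is <mask>." in Polish. A small sketch of the expected output, with an illustrative input row and assuming the tokenizer renders its mask token as [MASK]:

import pandas as pd
from transformers import AutoTokenizer
from preprocessing import format_masked_lm_prompt

tok = AutoTokenizer.from_pretrained('clarin-knext/stance-pl-1')
df = pd.DataFrame({'text': ['Energia jądrowa jest bezpieczna.'],  # illustrative example
                   'target': ['energia jądrowa']})
texts, prompts = format_masked_lm_prompt(df, tokenizer=tok, prompt_type=4, lang='pl')
# texts   -> ['Energia jądrowa jest bezpieczna.']
# prompts -> ['Więc moja postawa w kierunku energia jądrowa jest [MASK].']  (mask token depends on the tokenizer)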