xyh1756 committed
Commit ad16774 · Parent(s): f35fe56

first commit
app.py ADDED
@@ -0,0 +1,16 @@
+ import gradio as gr
+ from predictOnce import Estimator
+
+
+ def predict(inputText):
+     global e
+     res = e.predict(inputText)
+     return res[0], res[1]
+
+
+ if __name__ == '__main__':
+     e = Estimator()
+     iface = gr.Interface(fn=e.predict, inputs=gr.inputs.Textbox(lines=2, label="输入语句", placeholder="输入要识别的语句..."),
+                          outputs=[gr.outputs.Textbox(label="意图"), gr.outputs.Textbox(label="槽值")], live=True,
+                          theme="huggingface", allow_screenshot=False, allow_flagging=False)
+     iface.launch(share=True)
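The unused predict wrapper and the gr.inputs / gr.outputs classes come from the Gradio 2.x API (allow_screenshot and theme="huggingface" were dropped in later releases). A rough equivalent on Gradio 3.x, as a sketch assuming gradio>=3.0 and keeping the Chinese labels (输入语句 = "input sentence", 意图 = "intent", 槽值 = "slot values"):

import gradio as gr
from predictOnce import Estimator

e = Estimator()
iface = gr.Interface(fn=e.predict,
                     inputs=gr.Textbox(lines=2, label="输入语句", placeholder="输入要识别的语句..."),
                     outputs=[gr.Textbox(label="意图"), gr.Textbox(label="槽值")],
                     live=True, allow_flagging="never")
iface.launch()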
bert-base-chinese/README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ language: zh
+ ---
bert-base-chinese/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "BertForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "directionality": "bidi",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "type_vocab_size": 2,
+   "vocab_size": 21128
+ }
bert-base-chinese/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76df8425215fb9ede22e0393e356f82a99d84e79f078cd141afbbf9277460c8e
+ size 409168515
bert-base-chinese/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a693db616eaf647ed2bfe531e1fa446637358fc108a8bf04e8d4db17e837ee9
+ size 411577189
bert-base-chinese/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:612acd33db45677c3d6ba70615336619dc65cddf1ecf9d39a22dd1934af4aff2
+ size 478309336
bert-base-chinese/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
bert-base-chinese/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "do_lower_case": false
3
+ }
bert-base-chinese/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
bert/__init__.py ADDED
@@ -0,0 +1,15 @@
+ # coding=utf-8
+ # Copyright 2018 The Google AI Language Team Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
bert/modeling_jointbert.py ADDED
@@ -0,0 +1,63 @@
+ import torch
+ import torch.nn as nn
+ from transformers.modeling_bert import BertPreTrainedModel, BertModel, BertConfig
+ from torchcrf import CRF
+ from .module import IntentClassifier, SlotClassifier
+
+
+ class JointBERT(BertPreTrainedModel):
+     def __init__(self, config, args, intent_label_lst, slot_label_lst):
+         super(JointBERT, self).__init__(config)
+         self.args = args
+         self.num_intent_labels = len(intent_label_lst)
+         self.num_slot_labels = len(slot_label_lst)
+         self.bert = BertModel(config=config)  # Load pretrained bert
+
+         self.intent_classifier = IntentClassifier(config.hidden_size, self.num_intent_labels, args.dropout_rate)
+         self.slot_classifier = SlotClassifier(config.hidden_size, self.num_slot_labels, args.dropout_rate)
+
+         if args.use_crf:
+             self.crf = CRF(num_tags=self.num_slot_labels, batch_first=True)
+
+     def forward(self, input_ids, attention_mask, token_type_ids, intent_label_ids, slot_labels_ids):
+         outputs = self.bert(input_ids, attention_mask=attention_mask,
+                             token_type_ids=token_type_ids)  # sequence_output, pooled_output, (hidden_states), (attentions)
+         sequence_output = outputs[0]
+         pooled_output = outputs[1]  # [CLS]
+
+         intent_logits = self.intent_classifier(pooled_output)
+         slot_logits = self.slot_classifier(sequence_output)
+
+         total_loss = 0
+         # 1. Intent Softmax
+         if intent_label_ids is not None:
+             if self.num_intent_labels == 1:
+                 intent_loss_fct = nn.MSELoss()
+                 intent_loss = intent_loss_fct(intent_logits.view(-1), intent_label_ids.view(-1))
+             else:
+                 intent_loss_fct = nn.CrossEntropyLoss()
+                 intent_loss = intent_loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label_ids.view(-1))
+             total_loss += intent_loss
+
+         # 2. Slot Softmax
+         if slot_labels_ids is not None:
+             if self.args.use_crf:
+                 slot_loss = self.crf(slot_logits, slot_labels_ids, mask=attention_mask.byte(), reduction='mean')
+                 slot_loss = -1 * slot_loss  # negative log-likelihood
+             else:
+                 slot_loss_fct = nn.CrossEntropyLoss(ignore_index=self.args.ignore_index)
+                 # Only keep active parts of the loss
+                 if attention_mask is not None:
+                     active_loss = attention_mask.view(-1) == 1
+                     active_logits = slot_logits.view(-1, self.num_slot_labels)[active_loss]
+                     active_labels = slot_labels_ids.view(-1)[active_loss]
+                     slot_loss = slot_loss_fct(active_logits, active_labels)
+                 else:
+                     slot_loss = slot_loss_fct(slot_logits.view(-1, self.num_slot_labels), slot_labels_ids.view(-1))
+             total_loss += self.args.slot_loss_coef * slot_loss
+
+         outputs = ((intent_logits, slot_logits),) + outputs[2:]  # add hidden states and attention if they are here
+
+         outputs = (total_loss,) + outputs
+
+         return outputs  # (loss), logits, (hidden_states), (attentions)  # Logits is a tuple of intent and slot logits
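At inference time the two label-id arguments can be passed as None, in which case total_loss stays at 0 and only the logits matter. predictOnce.py unpacks the return value like this (shapes shown for a single sentence padded to max_seq_len=50, with the 3 intent and 13 slot labels from data/):

outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                intent_label_ids=None, slot_labels_ids=None)
loss, (intent_logits, slot_logits) = outputs[:2]   # loss is the plain int 0 when no labels are given
# intent_logits: (1, num_intent_labels)            -> (1, 3)
# slot_logits:   (1, max_seq_len, num_slot_labels) -> (1, 50, 13)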
bert/module.py ADDED
@@ -0,0 +1,23 @@
+ import torch.nn as nn
+
+
+ class IntentClassifier(nn.Module):
+     def __init__(self, input_dim, num_intent_labels, dropout_rate=0.):
+         super(IntentClassifier, self).__init__()
+         self.dropout = nn.Dropout(dropout_rate)
+         self.linear = nn.Linear(input_dim, num_intent_labels)
+
+     def forward(self, x):
+         x = self.dropout(x)
+         return self.linear(x)
+
+
+ class SlotClassifier(nn.Module):
+     def __init__(self, input_dim, num_slot_labels, dropout_rate=0.):
+         super(SlotClassifier, self).__init__()
+         self.dropout = nn.Dropout(dropout_rate)
+         self.linear = nn.Linear(input_dim, num_slot_labels)
+
+     def forward(self, x):
+         x = self.dropout(x)
+         return self.linear(x)
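Both heads are just dropout followed by a single linear layer. Because nn.Linear acts on the last dimension, the same module works for the pooled [CLS] vector and for the full token sequence; a minimal shape check (768, 3 and 13 match bert-base-chinese and the label files under data/):

import torch
from bert.module import IntentClassifier, SlotClassifier

pooled = torch.randn(2, 768)        # (batch, hidden_size), the [CLS] representation
sequence = torch.randn(2, 50, 768)  # (batch, max_seq_len, hidden_size)

print(IntentClassifier(768, 3)(pooled).shape)    # torch.Size([2, 3])
print(SlotClassifier(768, 13)(sequence).shape)   # torch.Size([2, 50, 13])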
book_model/config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "architectures": [
+     "JointBERT"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "directionality": "bidi",
+   "finetuning_task": "book",
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "type_vocab_size": 2,
+   "vocab_size": 21128
+ }
book_model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6389ddb0d25ffbbea13eae6adbeb3a8e9dde3dd71ad811abd019862f51570ede
+ size 409203155
book_model/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e2a51024072e0d7bf7f5c695ade9f2bf7b52f85696d12e73389e95a8d63fe9c
+ size 1199
data/intent_label.txt ADDED
@@ -0,0 +1,3 @@
+ UNK
+ query
+ chat
data/slot_label.txt ADDED
@@ -0,0 +1,13 @@
+ PAD
+ UNK
+ O
+ B-Author
+ I-Author
+ B-Book
+ I-Book
+ B-Press
+ I-Press
+ B-Tag
+ I-Tag
+ B-Topic
+ I-Topic
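The slot labels follow a character-level BIO scheme (predictOnce.py splits the input into characters with list(input)); PAD and UNK are reserved for padding positions and unknown tags. A hypothetical tagging of 鲁迅的朝花夕拾 ("Lu Xun's Dawn Blossoms Plucked at Dusk"), purely to illustrate how labels line up with characters:

鲁 → B-Author
迅 → I-Author
的 → O
朝 → B-Book
花 → I-Book
夕 → I-Book
拾 → I-Book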
predictOnce.py ADDED
@@ -0,0 +1,180 @@
+ import os
+ import time
+
+ import numpy as np
+ import torch
+ from transformers import BertTokenizer
+ from bert.modeling_jointbert import JointBERT
+
+
+ class Estimator:
+     class Args:
+         adam_epsilon = 1e-08
+         batch_size = 16
+         data_dir = 'data'
+         device = 'cpu'
+         do_eval = True
+         do_train = False
+         dropout_rate = 0.1
+         eval_batch_size = 64
+         gradient_accumulation_steps = 1
+         ignore_index = 0
+         intent_label_file = 'data/intent_label.txt'
+         learning_rate = 5e-05
+         logging_steps = 50
+         max_grad_norm = 1.0
+         max_seq_len = 50
+         max_steps = -1
+         model_dir = 'book_model'
+         model_name_or_path = 'bert-base-chinese'
+         model_type = 'bert-chinese'
+         no_cuda = False
+         num_train_epochs = 5.0
+         save_steps = 200
+         seed = 1234
+         slot_label_file = 'data/slot_label.txt'
+         slot_loss_coef = 1.0
+         slot_pad_label = 'PAD'
+         task = 'book'
+         train_batch_size = 32
+         use_crf = False
+         warmup_steps = 0
+         weight_decay = 0.0
+
+     def __init__(self, args=Args):
+         self.intent_label_lst = [label.strip() for label in open(args.intent_label_file, 'r', encoding='utf-8')]
+         self.slot_label_lst = [label.strip() for label in open(args.slot_label_file, 'r', encoding='utf-8')]
+
+         # Check whether model exists
+         if not os.path.exists(args.model_dir):
+             raise Exception("Model doesn't exist! Train first!")
+
+         self.model = JointBERT.from_pretrained(args.model_dir,
+                                                args=args,
+                                                intent_label_lst=self.intent_label_lst,
+                                                slot_label_lst=self.slot_label_lst)
+         self.model.to(args.device)
+         self.model.eval()
+         self.args = args
+         self.tokenizer = BertTokenizer.from_pretrained(self.args.model_name_or_path)
+
+     def convert_input_to_tensor_data(self, input, tokenizer, pad_token_label_id,
+                                      cls_token_segment_id=0,
+                                      pad_token_segment_id=0,
+                                      sequence_a_segment_id=0,
+                                      mask_padding_with_zero=True):
+         # Setting based on the current model type
+         cls_token = tokenizer.cls_token
+         sep_token = tokenizer.sep_token
+         unk_token = tokenizer.unk_token
+         pad_token_id = tokenizer.pad_token_id
+
+         slot_label_mask = []
+
+         words = list(input)
+         tokens = []
+         for word in words:
+             word_tokens = tokenizer.tokenize(word)
+             if not word_tokens:
+                 word_tokens = [unk_token]  # For handling the bad-encoded word
+             tokens.extend(word_tokens)
+             # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+             slot_label_mask.extend([pad_token_label_id + 1] + [pad_token_label_id] * (len(word_tokens) - 1))
+
+         # Account for [CLS] and [SEP]
+         special_tokens_count = 2
+         if len(tokens) > self.args.max_seq_len - special_tokens_count:
+             tokens = tokens[: (self.args.max_seq_len - special_tokens_count)]
+             slot_label_mask = slot_label_mask[:(self.args.max_seq_len - special_tokens_count)]
+
+         # Add [SEP] token
+         tokens += [sep_token]
+         token_type_ids = [sequence_a_segment_id] * len(tokens)
+         slot_label_mask += [pad_token_label_id]
+
+         # Add [CLS] token
+         tokens = [cls_token] + tokens
+         token_type_ids = [cls_token_segment_id] + token_type_ids
+         slot_label_mask = [pad_token_label_id] + slot_label_mask
+
+         input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+         # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
+         attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+         # Zero-pad up to the sequence length.
+         padding_length = self.args.max_seq_len - len(input_ids)
+         input_ids = input_ids + ([pad_token_id] * padding_length)
+         attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+         token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
+         slot_label_mask = slot_label_mask + ([pad_token_label_id] * padding_length)
+
+         # Change to Tensor
+         input_ids = torch.tensor([input_ids], dtype=torch.long)
+         attention_mask = torch.tensor([attention_mask], dtype=torch.long)
+         token_type_ids = torch.tensor([token_type_ids], dtype=torch.long)
+         slot_label_mask = torch.tensor([slot_label_mask], dtype=torch.long)
+
+         data = [input_ids, attention_mask, token_type_ids, slot_label_mask]
+
+         return data
+
+     def predict(self, input):
+         # Convert input file to TensorDataset
+         pad_token_label_id = self.args.ignore_index
+         batch = self.convert_input_to_tensor_data(input, self.tokenizer, pad_token_label_id)
+
+         # Predict
+         batch = tuple(t.to(self.args.device) for t in batch)
+         with torch.no_grad():
+             inputs = {"input_ids": batch[0],
+                       "attention_mask": batch[1],
+                       "token_type_ids": batch[2],
+                       "intent_label_ids": None,
+                       "slot_labels_ids": None}
+             outputs = self.model(**inputs)
+             _, (intent_logits, slot_logits) = outputs[:2]
+
+             # Intent Prediction
+             intent_pred = intent_logits.detach().cpu().numpy()
+
+             # Slot prediction
+             if self.args.use_crf:
+                 # decode() in `torchcrf` returns list with best index directly
+                 slot_preds = np.array(self.model.crf.decode(slot_logits))
+             else:
+                 slot_preds = slot_logits.detach().cpu().numpy()
+             all_slot_label_mask = batch[3].detach().cpu().numpy()
+
+         intent_pred = np.argmax(intent_pred, axis=1)[0]
+
+         if not self.args.use_crf:
+             slot_preds = np.argmax(slot_preds, axis=2)
+
+         slot_label_map = {i: label for i, label in enumerate(self.slot_label_lst)}
+         slot_preds_list = []
+
+         for i in range(slot_preds.shape[1]):
+             if all_slot_label_mask[0, i] != pad_token_label_id:
+                 slot_preds_list.append(slot_label_map[slot_preds[0][i]])
+
+         words = list(input)
+         slots = dict()
+         slot = str()
+         for i in range(len(words)):
+             if slot_preds_list[i] == 'O':
+                 if slot == '':
+                     continue
+                 slots[slot_preds_list[i - 1].split('-')[1]] = slot
+                 slot = str()
+             else:
+                 slot += words[i]
+         if slot != '':
+             slots[slot_preds_list[len(words) - 1].split('-')[1]] = slot
+         return self.intent_label_lst[intent_pred], slots
+
+
+ if __name__ == "__main__":
+     e = Estimator()
+     while True:
+         print(e.predict(input(">>")))
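predict() returns an (intent, slots) pair: the intent string comes from data/intent_label.txt, and slots is a dict mapping each slot type to the character span recovered from the BIO tags. A usage sketch; the example sentence and printed values are illustrative, not taken from an actual run:

from predictOnce import Estimator

e = Estimator()                      # loads book_model/ and the bert-base-chinese tokenizer
intent, slots = e.predict("我想看鲁迅的朝花夕拾")
print(intent)   # e.g. 'query'
print(slots)    # e.g. {'Author': '鲁迅', 'Book': '朝花夕拾'}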