Spaces:

mc0c0z
/

btl-nlp

Runtime error

App Files Files Community

mc0c0z committed on Aug 10, 2024

Commit

a307848

1 Parent(s): ec6055f

Update space

Browse files

Files changed (4) hide show

app.py +286 -59
sa_model/bert_model_sentiment_analysis.pth +3 -0
sa_model/phobert_sentiment_analysis.pth +3 -0
sa_model/phobertv2_sentiment_analysis.pth +3 -0

app.py CHANGED Viewed

@@ -1,63 +1,290 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

+import os
+import sys
 import gradio as gr
+import html
+from tqdm import tqdm
+import torch
+from transformers import MBartForConditionalGeneration, AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, AutoModelForTokenClassification, pipeline
+from torch import nn
+import torch.nn.functional as F
+from underthesea import word_tokenize
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+# Load multi task model
+bartpho_mt_base = MBartForConditionalGeneration.from_pretrained("mc0c0z/BARTPho-multi-task")
+bartpho_mt_base_tokenizer = AutoTokenizer.from_pretrained("mc0c0z/BARTPho-multi-task")
+bartpho_mt_base.to(device)
+bartpho_mt = MBartForConditionalGeneration.from_pretrained("mc0c0z/BARTPho-Large-multi-task")
+bartpho_mt_tokenizer = AutoTokenizer.from_pretrained("mc0c0z/BARTPho-Large-multi-task")
+bartpho_mt.to(device)
+def segmenter(text):
+    text = html.unescape(text)
+    tokens = word_tokenize(text)
+    result = []
+    for token in tokens:
+        if ' ' in token:
+            result.append(token.replace(' ', '_'))
+        else:
+            result.append(token)
+    return result
+class MultiTaskModel:
+    def __init__(self, model, tokenizer, device):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.device = device
+    def get_prompt(self, task):
+        if task == 'sa':
+            return "Classify the sentiment: "
+        elif task == 'mt-en-vi':
+            return "Translate English to Vietnamese: "
+        elif task == 'mt-vi-en':
+            return "Translate Vietnamese to English: "
+        else:
+            return ""
+    def inference(self, task, sentence, device):
+        # Tiền xử lý câu đầu vào tương tự như trong CustomDataset
+        tokenized_text = segmenter(sentence)
+        source = self.get_prompt(task) + " ".join(tokenized_text)
+        # Tokenize input
+        inputs = self.tokenizer(source, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
+        # Di chuyển input sang device
+        input_ids = inputs["input_ids"].to(device)
+        attention_mask = inputs["attention_mask"].to(device)
+        # Sinh dự đoán
+        self.model.eval()
+        with torch.no_grad():
+            generated_output = self.model.generate(input_ids, attention_mask=attention_mask, max_length=128)
+        # Giải mã dự đoán
+        prediction = self.tokenizer.decode(generated_output[0], skip_special_tokens=True)
+        if task == 'sa':
+            class_names = ["Negative", "Positive"]
+            return class_names[int(prediction[0])]
+        return html.unescape(prediction)
+#Load SA model
+class CustomModel(nn.Module):
+    def __init__(self, bert_model):
+        super(CustomModel, self).__init__()
+        self.bert = bert_model
+        self.mlp = nn.Sequential(
+            nn.Linear(768 * 5, 512),  # 768*5 cho BERT
+            nn.ReLU(),
+            nn.Linear(512, 256),
+            nn.ReLU(),
+            nn.Linear(256, 3)  # num_classes là số lượng lớp trong bài toán
+        )
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        # Lấy 5 lớp ẩn cuối cùng của token [CLS]
+        last_hidden_states = outputs.hidden_states[-5:]
+        cls_embeddings = torch.cat([state[:, 0, :] for state in last_hidden_states], dim=1)
+        # Đưa qua MLP
+        logits = self.mlp(cls_embeddings)
+        return logits
+## PhoBERT
+phobert_sa = AutoModel.from_pretrained("vinai/phobert-base", output_hidden_states=True)
+phobert_sa_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
+phobert_sa = CustomModel(phobert_sa)
+phobert_sa.load_state_dict(torch.load('sa_model\phobert_sentiment_analysis.pth', map_location=device))
+phobert_sa.to(device)
+## PhoBERTv2
+phobertv2_sa = AutoModel.from_pretrained("vinai/phobert-base-v2", output_hidden_states=True)
+phobertv2_sa_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
+phobertv2_sa = CustomModel(phobertv2_sa)
+phobertv2_sa.load_state_dict(torch.load('sa_model\phobertv2_sentiment_analysis.pth', map_location=device))
+phobertv2_sa.to(device)
+## Multilingual BERT
+m_bert_sa = AutoModel.from_pretrained("google-bert/bert-base-multilingual-cased", output_hidden_states=True)
+m_bert_sa_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")
+m_bert_sa = CustomModel(m_bert_sa)
+m_bert_sa.load_state_dict(torch.load('sa_model\\bert_model_sentiment_analysis.pth', map_location=device))
+m_bert_sa.to(device)
+# Load Q&A model
+roberta_qa = AutoModelForQuestionAnswering.from_pretrained("HungLV2512/Vietnamese-QA-fine-tuned")
+roberta_qa_tokenizer = AutoTokenizer.from_pretrained("HungLV2512/Vietnamese-QA-fine-tuned")
+roberta_qa.to(device)
+# Load NER model
+label_map = {
+    'B-LOC': 0,
+    'B-MISC': 1,
+    'B-ORG': 2,
+    'B-PER': 3,
+    'I-LOC': 4,
+    'I-MISC': 5,
+    'I-ORG': 6,
+    'I-PER': 7,
+    'O': 8
+}
+## PhoBERT
+phobert_ner = AutoModelForTokenClassification.from_pretrained("DrRinS/NER-PhoBERT", num_labels=len(label_map))
+phobert_ner_tokenizer = AutoTokenizer.from_pretrained("DrRinS/NER-PhoBERT")
+phobert_ner.to(device)
+## PhoBERTv2
+phobertv2_ner = AutoModelForTokenClassification.from_pretrained("DrRinS/NER-PhoBERTv2", num_labels=len(label_map))
+phobertv2_ner_tokenizer = AutoTokenizer.from_pretrained("DrRinS/NER-PhoBERTv2")
+phobertv2_ner.to(device)
+## Multilingual BERT
+m_bert_ner = AutoModelForTokenClassification.from_pretrained("DrRinS/NER_MultilingualBERT", num_labels=len(label_map))
+m_bert_ner_tokenizer = AutoTokenizer.from_pretrained("DrRinS/NER_MultilingualBERT")
+m_bert_ner.to(device)
+# Inference function
+def sentiment_inference(model, tokenizer, text, device):
+    # Segment the input text
+    text = " ".join(segmenter(text))
+    # Tokenize the segmented text
+    inputs = tokenizer(
+        text,
+        padding='max_length',
+        truncation=True,
+        max_length=128,
+        return_tensors='pt'
+    )
+    # Move inputs to the correct device
+    input_ids = inputs['input_ids'].to(device)
+    attention_mask = inputs['attention_mask'].to(device)
+    # Ensure inputs have the correct shape
+    input_ids = input_ids.unsqueeze(0) if input_ids.dim() == 1 else input_ids
+    attention_mask = attention_mask.unsqueeze(0) if attention_mask.dim() == 1 else attention_mask
+    # Perform inference
+    model.eval()
+    with torch.no_grad():
+        outputs = model(input_ids, attention_mask)
+        _, preds = torch.max(outputs, dim=1)
+    # Map predictions to class names
+    class_names = ["Negative", "Positive", "Neutral"]
+    return class_names[preds.cpu().item()]
+def multitask_inference(model, tokenizer, text, task, device):
+    multitask_model = MultiTaskModel(model, tokenizer, device)
+    return multitask_model.inference(task, text, device)
+def qa_inference(model, tokenizer, question, context, device):
+    qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)
+    res = qa_pipeline(question=question, context=context)
+    return res['answer']
+def ner_inference(model, tokenizer, text, device):
+    predictions = []
+    # Tokenize the segmented text
+    inputs = tokenizer(
+        text,
+        padding='max_length',
+        truncation=True,
+        max_length=128,
+        return_tensors='pt'
+    )
+    # Move inputs to the correct device
+    input_ids = inputs['input_ids'].to(device)
+    attention_mask = inputs['attention_mask'].to(device)
+    # Perform inference
+    model.eval()
+    with torch.no_grad():
+        outputs = model(input_ids, attention_mask)
+        _, preds = torch.max(outputs.logits, dim=2)
+    # Convert predictions to labels
+    id_to_label = {v: k for k, v in label_map.items()}
+    predictions = preds[attention_mask.bool()].cpu().numpy().flatten()
+    labels = [id_to_label[p] for p in predictions]
+    # Decode the input ids to tokens
+    tokens = tokenizer.convert_ids_to_tokens(input_ids[0], skip_special_tokens=True)
+    # Combine tokens with their NER labels
+    ner_tags = list(zip(tokens, labels))
+    return ner_tags
+def process_input(input_text, context, task):
+    results = {}
+    if task == "Sentiment Analysis":
+        results["PhoBERT"] = sentiment_inference(phobert_sa, phobert_sa_tokenizer, input_text, device)
+        results["PhoBERTv2"] = sentiment_inference(phobertv2_sa, phobertv2_sa_tokenizer, input_text, device)
+        results["Multilingual BERT"] = sentiment_inference(m_bert_sa, m_bert_sa_tokenizer, input_text, device)
+        results["BARTPho Base"] = multitask_inference(bartpho_mt_base, bartpho_mt_base_tokenizer, input_text, "sa", device)
+        results["BARTPho Large"] = multitask_inference(bartpho_mt, bartpho_mt_tokenizer, input_text, "sa", device)
+    elif task == "English to Vietnamese":
+        results["BARTPho Base"] = multitask_inference(bartpho_mt_base, bartpho_mt_base_tokenizer, input_text, "mt-en-vi", device)
+        results["BARTPho Large"] = multitask_inference(bartpho_mt, bartpho_mt_tokenizer, input_text, "mt-en-vi", device)
+    elif task == "Vietnamese to English":
+        results["BARTPho Base"] = multitask_inference(bartpho_mt_base, bartpho_mt_base_tokenizer, input_text, "mt-vi-en", device)
+        results["BARTPho Large"] = multitask_inference(bartpho_mt, bartpho_mt_tokenizer, input_text, "mt-vi-en", device)
+    elif task == "Question Answering":
+        results["RoBERTa"] = qa_inference(roberta_qa, roberta_qa_tokenizer, input_text, context, device)
+    elif task == "Named Entity Recognition":
+        results["PhoBERT"] = ner_inference(phobert_ner, phobert_ner_tokenizer, input_text, device)
+        results["PhoBERTv2"] = ner_inference(phobertv2_ner, phobertv2_ner_tokenizer, input_text, device)
+        results["Multilingual BERT"] = ner_inference(m_bert_ner, m_bert_ner_tokenizer, input_text, device)
+    return results
+with gr.Blocks() as iface:
+    gr.Markdown("# Multi-task NLP Demo")
+    gr.Markdown("Perform sentiment analysis, machine translation, question answering, or named entity recognition using various models.")
+    with gr.Row():
+        task = gr.Radio(["Sentiment Analysis", "Question Answering", "Named Entity Recognition", "English to Vietnamese", "Vietnamese to English"], label="Task")
+    with gr.Row():
+        input_text = gr.Textbox(label="Input Text")
+        context = gr.Textbox(label="Context", visible=False)
+    output = gr.JSON(label="Results")
+    submit = gr.Button("Submit")
+    def on_task_change(task):
+        if task == "Question Answering":
+            return {
+                input_text: gr.update(label="Question", visible=True),
+                context: gr.update(visible=True)
+            }
+        else:
+            return {
+                input_text: gr.update(label="Input Text", visible=True),
+                context: gr.update(visible=False)
+            }
+    task.change(on_task_change, task, [input_text, context])
+    submit.click(
+        process_input,
+        inputs=[input_text, context, task],
+        outputs=output
+    )
 if __name__ == "__main__":
+    iface.launch(share=True)

sa_model/bert_model_sentiment_analysis.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:852b53ae6d6f1db4129b1de8a87eee9d12b3a2407ec2c9c827d523194103e879
+size 719896142

sa_model/phobert_sentiment_analysis.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e98704ecef05aaef8209231fdaf73040a5ca01ca9dc3baad9a2a31d20c257c3
+size 548474843

sa_model/phobertv2_sentiment_analysis.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b96944fa9531778d34bbd299c5d1ba6581dbb638486867002edc31c3ce15696
+size 548475261