Commit · 4e38daf1
Parent(s): d60b7fd
Upload 18 files
- .gitattributes +7 -0
- args_model_utils.py +210 -0
- argument_model_state_dict.pth +3 -0
- configuration.py +1 -0
- event_arg_predict.py +280 -0
- event_arg_role_dataloader.py +100 -0
- event_arg_role_predict.py +113 -0
- event_nugget_predict.py +250 -0
- event_realis_predict.py +270 -0
- model_59.pt +3 -0
- model_64_pos_ner.pt +3 -0
- model_66.pt +3 -0
- model_97.pt +3 -0
- nugget_model_state_dict.pth +3 -0
- nugget_model_utils.py +151 -0
- realis_model_state_dict.pth +3 -0
- realis_model_utils.py +146 -0
- utils.py +196 -0
.gitattributes
CHANGED
@@ -1 +1,8 @@
 pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
+argument_model_state_dict.pth filter=lfs diff=lfs merge=lfs -text
+model_59.pt filter=lfs diff=lfs merge=lfs -text
+model_64_pos_ner.pt filter=lfs diff=lfs merge=lfs -text
+model_66.pt filter=lfs diff=lfs merge=lfs -text
+model_97.pt filter=lfs diff=lfs merge=lfs -text
+nugget_model_state_dict.pth filter=lfs diff=lfs merge=lfs -text
+realis_model_state_dict.pth filter=lfs diff=lfs merge=lfs -text
args_model_utils.py
ADDED
@@ -0,0 +1,210 @@
import torch
import spacy
import en_core_web_sm
from torch import nn
import math


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

from transformers import AutoModel, TrainingArguments, Trainer, RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer

model_checkpoint = "ehsanaghaei/SecureBERT"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
roberta_model = RobertaModel.from_pretrained(model_checkpoint).to(device)

nlp = en_core_web_sm.load()
pos_spacy_tag_list = ["ADJ","ADP","ADV","AUX","CCONJ","DET","INTJ","NOUN","NUM","PART","PRON","PROPN","PUNCT","SCONJ","SYM","VERB","SPACE","X"]
ner_spacy_tag_list = [bio + entity for entity in list(nlp.get_pipe('ner').labels) for bio in ["B-", "I-"]] + ["O"]
dep_spacy_tag_list = list(nlp.get_pipe("parser").labels)
event_nugget_tag_list = ["Databreach", "Ransom", "PatchVulnerability", "Phishing", "DiscoverVulnerability"]
arg_nugget_relative_pos_tag_list = ["before-same-sentence", "before-differ-sentence", "after-same-sentence", "after-differ-sentence"]

class CustomRobertaWithPOS(nn.Module):
    def __init__(self, num_classes):
        super(CustomRobertaWithPOS, self).__init__()
        self.num_classes = num_classes

        self.pos_embed = nn.Embedding(len(pos_spacy_tag_list), 16)
        self.ner_embed = nn.Embedding(len(ner_spacy_tag_list), 8)
        self.dep_embed = nn.Embedding(len(dep_spacy_tag_list), 8)
        self.depth_embed = nn.Embedding(17, 8)
        self.subtype_embed = nn.Embedding(len(event_nugget_tag_list), 2)
        self.dist_embed = nn.Embedding(11, 6)
        self.relative_pos_embed = nn.Embedding(len(arg_nugget_relative_pos_tag_list), 2)

        self.roberta = roberta_model
        self.dropout1 = nn.Dropout(0.2)
        self.fc1 = nn.Linear(self.roberta.config.hidden_size + 50, num_classes)

    def forward(self, input_ids, attention_mask, pos_spacy, ner_spacy, dep_spacy, depth_spacy, nearest_nugget_subtype, nearest_nugget_dist, arg_nugget_relative_pos):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_output = outputs.last_hidden_state

        pooler_output = outputs.pooler_output
        pooler_output_unsqz = pooler_output.unsqueeze(1)
        pooler_output_fin = pooler_output_unsqz.expand(-1, last_hidden_output.shape[1], -1)


        pos_mask = pos_spacy != -100
        pos_embed_masked = self.pos_embed(pos_spacy[pos_mask])
        pos_embed = torch.zeros((pos_spacy.shape[0], pos_spacy.shape[1], 16), dtype=torch.float).to(device)
        pos_embed[pos_mask] = pos_embed_masked

        ner_mask = ner_spacy != -100
        ner_embed_masked = self.ner_embed(ner_spacy[ner_mask])
        ner_embed = torch.zeros((ner_spacy.shape[0], ner_spacy.shape[1], 8), dtype=torch.float).to(device)
        ner_embed[ner_mask] = ner_embed_masked

        dep_mask = dep_spacy != -100
        dep_embed_masked = self.dep_embed(dep_spacy[dep_mask])
        dep_embed = torch.zeros((dep_spacy.shape[0], dep_spacy.shape[1], 8), dtype=torch.float).to(device)
        dep_embed[dep_mask] = dep_embed_masked

        depth_mask = depth_spacy != -100
        depth_embed_masked = self.depth_embed(depth_spacy[depth_mask])
        depth_embed = torch.zeros((depth_spacy.shape[0], depth_spacy.shape[1], 8), dtype=torch.float).to(device)
        depth_embed[dep_mask] = depth_embed_masked

        nearest_nugget_subtype_mask = nearest_nugget_subtype != -100
        nearest_nugget_subtype_embed_masked = self.subtype_embed(nearest_nugget_subtype[nearest_nugget_subtype_mask])
        nearest_nugget_subtype_embed = torch.zeros((nearest_nugget_subtype.shape[0], nearest_nugget_subtype.shape[1], 2), dtype=torch.float).to(device)
        nearest_nugget_subtype_embed[dep_mask] = nearest_nugget_subtype_embed_masked

        nearest_nugget_dist_mask = nearest_nugget_dist != -100
        nearest_nugget_dist_embed_masked = self.dist_embed(nearest_nugget_dist[nearest_nugget_dist_mask])
        nearest_nugget_dist_embed = torch.zeros((nearest_nugget_dist.shape[0], nearest_nugget_dist.shape[1], 6), dtype=torch.float).to(device)
        nearest_nugget_dist_embed[dep_mask] = nearest_nugget_dist_embed_masked

        arg_nugget_relative_pos_mask = arg_nugget_relative_pos != -100
        arg_nugget_relative_pos_embed_masked = self.relative_pos_embed(arg_nugget_relative_pos[arg_nugget_relative_pos_mask])
        arg_nugget_relative_pos_embed = torch.zeros((arg_nugget_relative_pos.shape[0], arg_nugget_relative_pos.shape[1], 2), dtype=torch.float).to(device)
        arg_nugget_relative_pos_embed[dep_mask] = arg_nugget_relative_pos_embed_masked

        features_concat = torch.cat((last_hidden_output, pos_embed, ner_embed, dep_embed, depth_embed, nearest_nugget_subtype_embed, nearest_nugget_dist_embed, arg_nugget_relative_pos_embed), 2).to(device)
        features_concat = self.dropout1(features_concat)

        logits = self.fc1(features_concat)

        return logits


def tokenize_and_align_labels_with_pos_ner_dep(examples, tokenizer, label_all_tokens = True):
    tokenized_inputs = tokenizer(examples["tokens"], padding='max_length', truncation=True, is_split_into_words=True)
    #tokenized_inputs.pop('input_ids')
    ner_spacy = []
    pos_spacy = []
    dep_spacy = []
    depth_spacy = []
    nearest_nugget_subtype = []
    nearest_nugget_dist = []
    arg_nugget_relative_pos = []

    for i, (pos, ner, dep, depth, subtype, dist, relative_pos) in enumerate(zip(examples["pos_spacy"],
                                                                                examples["ner_spacy"],
                                                                                examples["dep_spacy"],
                                                                                examples["depth_spacy"],
                                                                                examples["nearest_nugget_subtype"],
                                                                                examples["nearest_nugget_dist"],
                                                                                examples["arg_nugget_relative_pos"])):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        ner_spacy_ids = []
        pos_spacy_ids = []
        dep_spacy_ids = []
        depth_spacy_ids = []
        nearest_nugget_subtype_ids = []
        nearest_nugget_dist_ids = []
        arg_nugget_relative_pos_ids = []

        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                ner_spacy_ids.append(-100)
                pos_spacy_ids.append(-100)
                dep_spacy_ids.append(-100)
                depth_spacy_ids.append(-100)
                nearest_nugget_subtype_ids.append(-100)
                nearest_nugget_dist_ids.append(-100)
                arg_nugget_relative_pos_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                ner_spacy_ids.append(ner[word_idx])
                pos_spacy_ids.append(pos[word_idx])
                dep_spacy_ids.append(dep[word_idx])
                depth_spacy_ids.append(depth[word_idx])
                nearest_nugget_subtype_ids.append(subtype[word_idx])
                nearest_nugget_dist_ids.append(dist[word_idx])
                arg_nugget_relative_pos_ids.append(relative_pos[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                ner_spacy_ids.append(ner[word_idx] if label_all_tokens else -100)
                pos_spacy_ids.append(pos[word_idx] if label_all_tokens else -100)
                dep_spacy_ids.append(dep[word_idx] if label_all_tokens else -100)
                depth_spacy_ids.append(depth[word_idx] if label_all_tokens else -100)
                nearest_nugget_subtype_ids.append(subtype[word_idx] if label_all_tokens else -100)
                nearest_nugget_dist_ids.append(dist[word_idx] if label_all_tokens else -100)
                arg_nugget_relative_pos_ids.append(relative_pos[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        ner_spacy.append(ner_spacy_ids)
        pos_spacy.append(pos_spacy_ids)
        dep_spacy.append(dep_spacy_ids)
        depth_spacy.append(depth_spacy_ids)
        nearest_nugget_subtype.append(nearest_nugget_subtype_ids)
        nearest_nugget_dist.append(nearest_nugget_dist_ids)
        arg_nugget_relative_pos.append(arg_nugget_relative_pos_ids)

    tokenized_inputs["pos_spacy"] = pos_spacy
    tokenized_inputs["ner_spacy"] = ner_spacy
    tokenized_inputs["dep_spacy"] = dep_spacy
    tokenized_inputs["depth_spacy"] = depth_spacy
    tokenized_inputs["nearest_nugget_subtype"] = nearest_nugget_subtype
    tokenized_inputs["nearest_nugget_dist"] = nearest_nugget_dist
    tokenized_inputs["arg_nugget_relative_pos"] = arg_nugget_relative_pos
    return tokenized_inputs

def find_nearest_nugget_features(doc, start_idx, end_idx, event_nuggets):
    nearest_subtype = None
    nearest_dist = math.inf
    relative_pos = None

    mid_idx = (end_idx + start_idx) / 2
    for nugget in event_nuggets:
        mid_nugget_idx = (nugget["startOffset"] + nugget["endOffset"]) / 2
        dist = abs(mid_nugget_idx - mid_idx)

        if dist < nearest_dist:
            nearest_dist = dist
            nearest_subtype = nugget["subtype"]
            for sent in doc.sents:
                if between_idxs(mid_idx, sent.start_char, sent.end_char) and between_idxs(mid_nugget_idx, sent.start_char, sent.end_char):
                    if mid_idx < mid_nugget_idx:
                        relative_pos = "before-same-sentence"
                    else:
                        relative_pos = "after-same-sentence"
                    break
                elif between_idxs(mid_nugget_idx, sent.start_char, sent.end_char) and mid_idx > mid_nugget_idx:
                    relative_pos = "after-differ-sentence"
                    break
                elif between_idxs(mid_idx, sent.start_char, sent.end_char) and mid_idx < mid_nugget_idx:
                    relative_pos = "before-differ-sentence"
                    break

    nearest_dist = int(min(10, nearest_dist // 20))
    return nearest_subtype, nearest_dist, relative_pos

def find_dep_depth(token):
    depth = 0
    current_token = token
    while current_token.head != current_token:
        depth += 1
        current_token = current_token.head
    return min(depth, 16)

def between_idxs(idx, start_idx, end_idx):
    return idx >= start_idx and idx <= end_idx
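The forward pass concatenates the RoBERTa hidden states with seven auxiliary embeddings whose widths sum to 50 (16 + 8 + 8 + 8 + 2 + 6 + 2), which is why fc1 expects hidden_size + 50 inputs per token. A minimal shape smoke test of that wiring might look like the sketch below; it is illustrative only, not part of the commit, and it assumes a CPU-only run, a randomly initialized head, an arbitrary sample sentence, and all-zero feature indices.

# Illustrative sketch (assumptions noted above): check the per-token logit shape.
import torch
from cybersecurity_knowledge_graph.args_model_utils import CustomRobertaWithPOS, tokenizer

model = CustomRobertaWithPOS(num_classes=43)  # 43 matches event_arg_predict.py
model.eval()

enc = tokenizer(["Attackers stole customer records."], padding="max_length",
                truncation=True, return_tensors="pt")
seq_len = enc["input_ids"].shape[1]
zeros = torch.zeros((1, seq_len), dtype=torch.long)  # index 0 is valid in every embedding table

with torch.no_grad():
    logits = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"],
                   pos_spacy=zeros, ner_spacy=zeros, dep_spacy=zeros, depth_spacy=zeros,
                   nearest_nugget_subtype=zeros, nearest_nugget_dist=zeros,
                   arg_nugget_relative_pos=zeros)
print(logits.shape)  # (1, seq_len, 43): one label distribution per subword position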
argument_model_state_dict.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:185e22992430c80ec1eb1fca7f3ba4ebe801163c3ba13bed00abc6dc24072712
size 498813605
configuration.py
CHANGED
@@ -5,6 +5,7 @@ from cybersecurity_knowledge_graph.utils import event_args_list, event_nugget_li
 
 
 class CybersecurityKnowledgeGraphConfig(PretrainedConfig):
+    model_type = "cybersecurity_knowledge_graph"
 
     def __init__(
         self,
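Declaring model_type on the PretrainedConfig subclass is what allows the config to be registered with and resolved by the transformers Auto* machinery. A hedged sketch of that round trip follows; the AutoConfig.register call is standard transformers usage, while the import path, the output directory name, and the assumption that the remaining __init__ parameters have defaults are mine.

# Illustrative sketch (not part of the commit): register and round-trip the config.
from transformers import AutoConfig
from cybersecurity_knowledge_graph.configuration import CybersecurityKnowledgeGraphConfig

AutoConfig.register("cybersecurity_knowledge_graph", CybersecurityKnowledgeGraphConfig)

cfg = CybersecurityKnowledgeGraphConfig()          # assumes default __init__ arguments
cfg.save_pretrained("ckg-config")                   # writes config.json with "model_type"
same_cfg = AutoConfig.from_pretrained("ckg-config") # resolved via the registered model_type
print(type(same_cfg).__name__)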
event_arg_predict.py
ADDED
@@ -0,0 +1,280 @@
import streamlit as st
from annotated_text import annotated_text
import torch
from torch.utils.data import DataLoader

from cybersecurity_knowledge_graph.args_model_utils import tokenize_and_align_labels_with_pos_ner_dep, find_nearest_nugget_features, find_dep_depth
from cybersecurity_knowledge_graph.nugget_model_utils import CustomRobertaWithPOS
from cybersecurity_knowledge_graph.utils import get_content, get_event_nugget, get_idxs_from_text, get_entity_from_idx, list_of_pos_tags, event_args_list

from cybersecurity_knowledge_graph.event_nugget_predict import get_event_nuggets
import spacy
from transformers import AutoTokenizer
from datasets import load_dataset, Features, ClassLabel, Value, Sequence, Dataset
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

def find_dep_depth(token):
    depth = 0
    current_token = token
    while current_token.head != current_token:
        depth += 1
        current_token = current_token.head
    return min(depth, 16)


nlp = spacy.load('en_core_web_sm')

pos_spacy_tag_list = ["ADJ","ADP","ADV","AUX","CCONJ","DET","INTJ","NOUN","NUM","PART","PRON","PROPN","PUNCT","SCONJ","SYM","VERB","SPACE","X"]
ner_spacy_tag_list = [bio + entity for entity in list(nlp.get_pipe('ner').labels) for bio in ["B-", "I-"]] + ["O"]
dep_spacy_tag_list = list(nlp.get_pipe("parser").labels)
event_nugget_tag_list = ["Databreach", "Ransom", "PatchVulnerability", "Phishing", "DiscoverVulnerability"]
arg_nugget_relative_pos_tag_list = ["before-same-sentence", "before-differ-sentence", "after-same-sentence", "after-differ-sentence"]

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_checkpoint = "ehsanaghaei/SecureBERT"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

from cybersecurity_knowledge_graph.args_model_utils import CustomRobertaWithPOS as ArgumentModel
model_nugget = ArgumentModel(num_classes=43)
model_nugget.load_state_dict(torch.load("cybersecurity_knowledge_graph/argument_model_state_dict.pth", map_location=device))
model_nugget.eval()

"""
Function: create_dataloader(text_input)
Description: This function creates a DataLoader for processing text data, tokenizes it, and organizes it into batches.
Inputs:
- text_input: The input text to be processed.
Output:
- dataloader: A DataLoader for the tokenized and batched text data.
- tokenized_dataset_ner: The tokenized dataset used for training.
"""
def create_dataloader(text_input):

    event_nuggets = get_event_nuggets(text_input)
    doc = nlp(text_input)

    content_as_words_emdash = [tok.text for tok in doc]
    content_as_words_emdash = [word.replace("``", '"').replace("''", '"').replace("$", "") for word in content_as_words_emdash]
    content_idx_dict = get_idxs_from_text(text_input, content_as_words_emdash)

    data = []

    words = []
    arg_nugget_nearest_subtype = []
    arg_nugget_nearest_dist = []
    arg_nugget_relative_pos = []

    pos_spacy = [tok.pos_ for tok in doc]
    ner_spacy = [ent.ent_iob_ + "-" + ent.ent_type_ if ent.ent_iob_ != "O" else ent.ent_iob_ for ent in doc]
    dep_spacy = [tok.dep_ for tok in doc]
    depth_spacy = [find_dep_depth(tok) for tok in doc]

    for content_dict in content_idx_dict:
        start_idx, end_idx = content_dict["start_idx"], content_dict["end_idx"]
        nearest_subtype, nearest_dist, relative_pos = find_nearest_nugget_features(doc, content_dict["start_idx"], content_dict["end_idx"], event_nuggets)
        words.append(content_dict["word"])

        arg_nugget_nearest_subtype.append(nearest_subtype)
        arg_nugget_nearest_dist.append(nearest_dist)
        arg_nugget_relative_pos.append(relative_pos)


    content_token_len = len(tokenizer(words, truncation=False, is_split_into_words=True)["input_ids"])
    if content_token_len > tokenizer.model_max_length:
        no_split = (content_token_len // tokenizer.model_max_length) + 2
        split_len = (len(words) // no_split) + 1

        last_id = 0
        threshold = split_len

        for id, token in enumerate(words):
            if token == "." and id > threshold:
                data.append(
                    {
                        "tokens" : words[last_id : id + 1],
                        "pos_spacy" : pos_spacy[last_id : id + 1],
                        "ner_spacy" : ner_spacy[last_id : id + 1],
                        "dep_spacy" : dep_spacy[last_id : id + 1],
                        "depth_spacy" : depth_spacy[last_id : id + 1],
                        "nearest_nugget_subtype" : arg_nugget_nearest_subtype[last_id : id + 1],
                        "nearest_nugget_dist" : arg_nugget_nearest_dist[last_id : id + 1],
                        "arg_nugget_relative_pos" : arg_nugget_relative_pos[last_id : id + 1]
                    }
                )
                last_id = id + 1
                threshold += split_len
        data.append({"tokens" : words[last_id : ],
                     "pos_spacy" : pos_spacy[last_id : ],
                     "ner_spacy" : ner_spacy[last_id : ],
                     "dep_spacy" : dep_spacy[last_id : ],
                     "depth_spacy" : depth_spacy[last_id : ],
                     "nearest_nugget_subtype" : arg_nugget_nearest_subtype[last_id : ],
                     "nearest_nugget_dist" : arg_nugget_nearest_dist[last_id : ],
                     "arg_nugget_relative_pos" : arg_nugget_relative_pos[last_id : ]})
    else:
        data.append(
            {
                "tokens" : words,
                "pos_spacy" : pos_spacy,
                "ner_spacy" : ner_spacy,
                "dep_spacy" : dep_spacy,
                "depth_spacy" : depth_spacy,
                "nearest_nugget_subtype" : arg_nugget_nearest_subtype,
                "nearest_nugget_dist" : arg_nugget_nearest_dist,
                "arg_nugget_relative_pos" : arg_nugget_relative_pos
            }
        )


    ner_features = Features({'tokens' : Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
                             'pos_spacy' : Sequence(feature=ClassLabel(num_classes=len(pos_spacy_tag_list), names=pos_spacy_tag_list, names_file=None, id=None), length=-1, id=None),
                             'ner_spacy' : Sequence(feature=ClassLabel(num_classes=len(ner_spacy_tag_list), names=ner_spacy_tag_list, names_file=None, id=None), length=-1, id=None),
                             'dep_spacy' : Sequence(feature=ClassLabel(num_classes=len(dep_spacy_tag_list), names=dep_spacy_tag_list, names_file=None, id=None), length=-1, id=None),
                             'depth_spacy' : Sequence(feature=ClassLabel(num_classes=17, names= list(range(17)), names_file=None, id=None), length=-1, id=None),
                             'nearest_nugget_subtype' : Sequence(feature=ClassLabel(num_classes=len(event_nugget_tag_list), names=event_nugget_tag_list, names_file=None, id=None), length=-1, id=None),
                             'nearest_nugget_dist' : Sequence(feature=ClassLabel(num_classes=11, names=list(range(11)), names_file=None, id=None), length=-1, id=None),
                             'arg_nugget_relative_pos' : Sequence(feature=ClassLabel(num_classes=len(arg_nugget_relative_pos_tag_list), names=arg_nugget_relative_pos_tag_list, names_file=None, id=None), length=-1, id=None),
                             })

    dataset = Dataset.from_list(data, features=ner_features)
    tokenized_dataset_ner = dataset.map(tokenize_and_align_labels_with_pos_ner_dep, fn_kwargs={'tokenizer' : tokenizer}, batched=True, load_from_cache_file=False)
    tokenized_dataset_ner = tokenized_dataset_ner.with_format("torch")

    tokenized_dataset_ner = tokenized_dataset_ner.remove_columns("tokens")

    batch_size = 4 # Number of input texts
    dataloader = DataLoader(tokenized_dataset_ner, batch_size=batch_size)
    return dataloader, tokenized_dataset_ner

"""
Function: predict(dataloader)
Description: This function performs prediction on a given dataloader using a trained model for label classification.
Inputs:
- dataloader: A DataLoader containing the input data for prediction.
Output:
- predicted_label: A tensor containing the predicted labels for each input in the dataloader.
"""
def predict(dataloader):
    predicted_label = []
    for batch in dataloader:
        with torch.no_grad():
            logits = model_nugget(**batch)

        batch_predicted_label = logits.argmax(-1)
        predicted_label.append(batch_predicted_label)
    return torch.cat(predicted_label, dim=-1)

"""
Function: show_annotations(text_input)
Description: This function displays annotated event arguments in the provided input text.
Inputs:
- text_input: The input text containing event arguments to be annotated and displayed.
Output:
- An interactive display of annotated event arguments within the input text.
"""
def show_annotations(text_input):
    st.title("Event Arguments")

    dataloader, tokenized_dataset_ner = create_dataloader(text_input)
    predicted_label = predict(dataloader)

    for idx, labels in enumerate(predicted_label):
        token_mask = [token > 2 for token in tokenized_dataset_ner[idx]["input_ids"]]

        tokens = tokenizer.convert_ids_to_tokens(tokenized_dataset_ner[idx]["input_ids"][token_mask], skip_special_tokens=True)
        tokens = [token.replace("Ġ", "").replace("Ċ", "").replace("âĢĻ", "'") for token in tokens]

        text = tokenizer.decode(tokenized_dataset_ner[idx]["input_ids"][token_mask])
        idxs = get_idxs_from_text(text, tokens)

        labels = labels[token_mask]

        annotated_text_list = []
        last_label = ""
        cumulative_tokens = ""
        last_id = 0

        for idx, label in zip(idxs, labels):
            to_label = event_args_list[label]
            label_short = to_label.split("-")[1] if "-" in to_label else to_label
            if last_label == label_short:
                cumulative_tokens += text[last_id : idx["end_idx"]]
                last_id = idx["end_idx"]
            else:
                if last_label != "":
                    if last_label == "O":
                        annotated_text_list.append(cumulative_tokens)
                    else:
                        annotated_text_list.append((cumulative_tokens, last_label))
                last_label = label_short
                cumulative_tokens = idx["word"]
                last_id = idx["end_idx"]
        if last_label == "O":
            annotated_text_list.append(cumulative_tokens)
        else:
            annotated_text_list.append((cumulative_tokens, last_label))

        annotated_text(annotated_text_list)

"""
Function: get_event_args(text_input)
Description: This function extracts predicted event arguments (event nuggets) from the provided input text.
Inputs:
- text_input: The input text containing event nuggets to be extracted.
Output:
- predicted_event_nuggets: A list of dictionaries, each representing an extracted event nugget with start and end offsets,
subtype, and text content.
"""
def get_event_args(text_input):
    dataloader, tokenized_dataset_ner = create_dataloader(text_input)
    predicted_label = predict(dataloader)

    predicted_event_nuggets = []
    text_length = 0
    for idx, labels in enumerate(predicted_label):
        token_mask = [token > 2 for token in tokenized_dataset_ner[idx]["input_ids"]]

        tokens = tokenizer.convert_ids_to_tokens(tokenized_dataset_ner[idx]["input_ids"][token_mask], skip_special_tokens=True)
        tokens = [token.replace("Ġ", "").replace("Ċ", "").replace("âĢĻ", "'") for token in tokens]

        text = tokenizer.decode(tokenized_dataset_ner[idx]["input_ids"][token_mask])
        idxs = get_idxs_from_text(text_input[text_length : ], tokens)

        labels = labels[token_mask]

        start_idx = 0
        end_idx = 0
        last_label = ""

        for idx, label in zip(idxs, labels):
            to_label = event_args_list[label]
            if "-" in to_label:
                label_split = to_label.split("-")[1]
            else:
                label_split = to_label

            if label_split == last_label:
                end_idx = idx["end_idx"]
            else:
                if text_input[start_idx : end_idx] != "" and last_label != "O":
                    predicted_event_nuggets.append(
                        {
                            "startOffset" : text_length + start_idx,
                            "endOffset" : text_length + end_idx,
                            "subtype" : last_label,
                            "text" : text_input[text_length + start_idx : text_length + end_idx]
                        }
                    )
                start_idx = idx["start_idx"]
                end_idx = idx["start_idx"] + len(idx["word"])
                last_label = label_split
        text_length += idx["end_idx"]
    return predicted_event_nuggets
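A short usage sketch for the argument extractor follows; the sample text is an assumption, and the output fields are the ones built inside get_event_args above.

# Illustrative sketch (not part of the commit): extract event arguments as character-offset spans.
from cybersecurity_knowledge_graph.event_arg_predict import get_event_args

text = ("Hackers breached the vendor's servers and stole customer data, "
        "then demanded a ransom payment in Bitcoin.")
for arg in get_event_args(text):
    print(arg["startOffset"], arg["endOffset"], arg["subtype"], repr(arg["text"]))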
event_arg_role_dataloader.py
ADDED
@@ -0,0 +1,100 @@
import os, json
from cybersecurity_knowledge_graph.utils import get_content, get_event_args, get_event_nugget, get_idxs_from_text, get_args_entity_from_idx, find_dict_by_overlap
from tqdm import tqdm
import spacy
import jsonlines
from sklearn.model_selection import train_test_split
import math
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import numpy as np

embed_model = SentenceTransformer('all-MiniLM-L6-v2')

pipe = pipeline("token-classification", model="CyberPeace-Institute/SecureBERT-NER")

nlp = spacy.load('en_core_web_sm')

"""
Class: EventArgumentRoleDataset
Description: This class represents a dataset for training and evaluating event argument role classifiers.
Attributes:
- path: The path to the folder containing JSON files with event data.
- tokenizer: A tokenizer for encoding text data.
- arg: The specific argument type (subtype) for which the dataset is created.
- data: A list to store data samples, each consisting of an embedding and a label.
- train_data, val_data, test_data: Lists to store the split training, validation, and test data samples.
- datapoint_id: An identifier for tracking data samples.
Methods:
- __len__(): Returns the total number of data samples in the dataset.
- __getitem__(index): Retrieves a data sample at a specified index.
- to_jsonlines(train_path, val_path, test_path): Writes the dataset to JSON files for train, validation, and test sets.
- train_val_test_split(): Splits the data into training and test sets.
- load_data(): Loads and preprocesses event data from JSON files, creating embeddings for argument-role classification.
"""
class EventArgumentRoleDataset():
    def __init__(self, path, tokenizer, arg):
        self.path = path
        self.tokenizer = tokenizer
        self.arg = arg
        self.data = []
        self.train_data, self.val_data, self.test_data = None, None, None
        self.datapoint_id = 0

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return sample

    def to_jsonlines(self, train_path, val_path, test_path):
        if self.train_data is None or self.test_data is None:
            raise ValueError("Do the train-val-test split")
        with jsonlines.open(train_path, "w") as f:
            f.write_all(self.train_data)
        # with jsonlines.open(val_path, "w") as f:
        #     f.write_all(self.val_data)
        with jsonlines.open(test_path, "w") as f:
            f.write_all(self.test_data)

    def train_val_test_split(self):
        self.train_data, self.test_data = train_test_split(self.data, test_size=0.1, random_state=42, shuffle=True)
        # self.val_data, self.test_data = train_test_split(test_val, test_size=0.5, random_state=42, shuffle=True)

    def load_data(self):
        folder_path = self.path
        json_files = [file for file in os.listdir(folder_path) if file.endswith('.json')]

        # Load the nuggets
        for idx, file_path in enumerate(tqdm(json_files)):
            try:
                with open(self.path + file_path, "r") as f:
                    file_json = json.load(f)
            except:
                print("Error in ", file_path)
            content = get_content(file_json)
            content = content.replace("\xa0", " ")

            event_args = get_event_args(file_json)
            doc = nlp(content)

            sentence_indexes = []
            for sent in doc.sents:
                start_index = sent[0].idx
                end_index = sent[-1].idx + len(sent[-1].text)
                sentence_indexes.append((start_index, end_index))

            for idx, (start, end) in enumerate(sentence_indexes):
                sentence = content[start:end]
                is_arg_sentence = [event_arg["startOffset"] >= start and event_arg["endOffset"] <= end for event_arg in event_args]
                args = [event_args[idx] for idx, boolean in enumerate(is_arg_sentence) if boolean]
                if args != []:
                    sentence_doc = nlp(sentence)
                    sentence_embed = embed_model.encode(sentence)
                    for arg in args:
                        if arg["type"] == self.arg:
                            arg_embed = embed_model.encode(arg["text"])
                            embedding = np.concatenate((sentence_embed, arg_embed))

                            self.data.append({"embedding" : embedding, "label" : arg["role"]["type"]})
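The dataset pairs a sentence embedding with an argument-span embedding and keeps the annotated role as the label. A hedged sketch of how it appears to be used during training follows; the annotation folder path and the "File" argument type are assumptions for the example.

# Illustrative sketch (not part of the commit): build the role dataset for one argument type.
from transformers import AutoTokenizer
from cybersecurity_knowledge_graph.event_arg_role_dataloader import EventArgumentRoleDataset

tokenizer = AutoTokenizer.from_pretrained("ehsanaghaei/SecureBERT", add_prefix_space=True)
dataset = EventArgumentRoleDataset(path="./data/annotation/", tokenizer=tokenizer, arg="File")  # "File" is assumed
dataset.load_data()            # builds (sentence embedding ++ argument embedding, role label) samples
dataset.train_val_test_split()
print(len(dataset), dataset[0]["label"])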
event_arg_role_predict.py
ADDED
@@ -0,0 +1,113 @@
from cybersecurity_knowledge_graph.event_arg_role_dataloader import EventArgumentRoleDataset
from cybersecurity_knowledge_graph.utils import arg_2_role

import os
from transformers import AutoTokenizer
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from joblib import dump, load
from sentence_transformers import SentenceTransformer
import numpy as np

embed_model = SentenceTransformer('all-MiniLM-L6-v2')

model_checkpoint = "ehsanaghaei/SecureBERT"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

classifiers = {}
folder_path = '/cybersecurity_knowledge_graph/arg_role_models'

for filename in os.listdir(os.getcwd() + folder_path):
    if filename.endswith('.joblib'):
        file_path = os.getcwd() + os.path.join(folder_path, filename)
        clf = load(file_path)
        arg = filename.split(".")[0]
        classifiers[arg] = clf

"""
Function: fit()
Description: This function performs a machine learning task to train and evaluate classifiers for multiple argument roles.
It utilizes Optuna for hyperparameter optimization and creates a Voting Classifier.
The trained classifiers are saved as joblib files.
"""
def fit():
    for arg, roles in arg_2_role.items():
        if len(roles) > 1:

            dataset = EventArgumentRoleDataset(path="./data/annotation/", tokenizer=tokenizer, arg=arg)
            dataset.load_data()
            dataset.train_val_test_split()


            X = [datapoint["embedding"] for datapoint in dataset.data]
            y = [roles.index(datapoint["label"]) for datapoint in dataset.data]


            # FYI: Objective functions can take additional arguments
            # (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
            def objective(trial):

                classifier_name = trial.suggest_categorical("classifier", ["voting"])
                if classifier_name == "voting":
                    svc_c = trial.suggest_float("svc_c", 1e-3, 1e3, log=True)
                    svc_kernel = trial.suggest_categorical("kernel", ['rbf'])
                    classifier_obj = VotingClassifier(estimators=[
                        ('Logistic Regression', LogisticRegression()),
                        ('Neural Network', MLPClassifier(max_iter=500)),
                        ('Support Vector Machine', SVC(C=svc_c, kernel=svc_kernel))
                    ], voting='hard')

                f1_scorer = make_scorer(f1_score, average = "weighted")
                stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                cv_scores = cross_val_score(classifier_obj, X, y, cv=stratified_kfold, scoring=f1_scorer)
                return cv_scores.mean()


            study = optuna.create_study(direction="maximize")
            study.optimize(objective, n_trials=20)
            print(f"{arg} : {study.best_trial.values[0]}")

            best_clf = VotingClassifier(estimators=[
                ('Logistic Regression', LogisticRegression()),
                ('Neural Network', MLPClassifier(max_iter=500)),
                ('Support Vector Machine', SVC(C=study.best_trial.params["svc_c"], kernel=study.best_trial.params["kernel"]))
            ], voting='hard')

            best_clf.fit(X, y)
            dump(best_clf, f'{arg}.joblib')

"""
Function: get_arg_roles(event_args, doc)
Description: This function assigns argument roles to a list of event arguments within a document.
Inputs:
- event_args: A list of event argument dictionaries, each containing information about an argument.
- doc: A spaCy document representing the analyzed text.
Output:
- The input 'event_args' list with updated 'role' values assigned to each argument.
"""
def get_arg_roles(event_args, doc):
    for arg in event_args:
        if len(arg_2_role[arg["subtype"]]) > 1:
            sent = next(filter(lambda x : arg["startOffset"] >= x.start_char and arg["endOffset"] <= x.end_char, doc.sents))

            sent_embed = embed_model.encode(sent.text)
            arg_embed = embed_model.encode(arg["text"])
            embed = np.concatenate((sent_embed, arg_embed))

            arg_clf = classifiers[arg["subtype"]]
            role_id = arg_clf.predict(embed.reshape(1, -1))
            role = arg_2_role[arg["subtype"]][role_id[0]]

            arg["role"] = role
        else:
            arg["role"] = arg_2_role[arg["subtype"]][0]
    return event_args
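Role assignment sits on top of argument extraction: each extracted span is embedded together with its sentence and routed to the per-subtype classifier. A hedged sketch of that chaining follows; the input text and the combination with get_event_args are assumptions based on the modules above.

# Illustrative sketch (not part of the commit): assign roles to extracted event arguments.
import spacy
from cybersecurity_knowledge_graph.event_arg_predict import get_event_args
from cybersecurity_knowledge_graph.event_arg_role_predict import get_arg_roles

nlp = spacy.load("en_core_web_sm")
text = "Attackers exploited a flaw in the payment portal and exfiltrated card numbers."
doc = nlp(text)
event_args = get_event_args(text)
for arg in get_arg_roles(event_args, doc):
    print(arg["subtype"], "->", arg["role"], ":", arg["text"])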
event_nugget_predict.py
ADDED
@@ -0,0 +1,250 @@
import streamlit as st
from annotated_text import annotated_text
import torch
from torch import nn
from torch.utils.data import DataLoader
from cybersecurity_knowledge_graph.nugget_model_utils import CustomRobertaWithPOS as NuggetModel
from cybersecurity_knowledge_graph.nugget_model_utils import tokenize_and_align_labels_with_pos_ner_dep, find_nearest_nugget_features, find_dep_depth
from cybersecurity_knowledge_graph.utils import get_idxs_from_text, event_nugget_list
import spacy
from transformers import AutoTokenizer
from datasets import load_dataset, Features, ClassLabel, Value, Sequence, Dataset
import os


os.environ["TOKENIZERS_PARALLELISM"] = "true"

def find_dep_depth(token):
    depth = 0
    current_token = token
    while current_token.head != current_token:
        depth += 1
        current_token = current_token.head
    return min(depth, 16)


nlp = spacy.load('en_core_web_sm')

pos_spacy_tag_list = ["ADJ","ADP","ADV","AUX","CCONJ","DET","INTJ","NOUN","NUM","PART","PRON","PROPN","PUNCT","SCONJ","SYM","VERB","SPACE","X"]
ner_spacy_tag_list = [bio + entity for entity in list(nlp.get_pipe('ner').labels) for bio in ["B-", "I-"]] + ["O"]
dep_spacy_tag_list = list(nlp.get_pipe("parser").labels)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_checkpoint = "ehsanaghaei/SecureBERT"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

model_nugget = NuggetModel(num_classes = 11)
model_nugget.load_state_dict(torch.load("cybersecurity_knowledge_graph/nugget_model_state_dict.pth", map_location=device))
model_nugget.eval()

"""
Function: create_dataloader(text_input)
Description: This function prepares a DataLoader for processing text input, including tokenization and alignment of labels.
Inputs:
- text_input: The input text to be processed.
Output:
- dataloader: A DataLoader for the tokenized and batched text data.
- tokenized_dataset_ner: The tokenized dataset used for training.
"""
def create_dataloader(text_input):

    doc = nlp(text_input)

    content_as_words_emdash = [tok.text for tok in doc]
    content_as_words_emdash = [word.replace("``", '"').replace("''", '"').replace("$", "") for word in content_as_words_emdash]
    content_idx_dict = get_idxs_from_text(text_input, content_as_words_emdash)

    data = []

    words = []

    pos_spacy = [tok.pos_ for tok in doc]
    ner_spacy = [ent.ent_iob_ + "-" + ent.ent_type_ if ent.ent_iob_ != "O" else ent.ent_iob_ for ent in doc]
    dep_spacy = [tok.dep_ for tok in doc]
    depth_spacy = [find_dep_depth(tok) for tok in doc]

    for content_dict in content_idx_dict:
        start_idx, end_idx = content_dict["start_idx"], content_dict["end_idx"]
        words.append(content_dict["word"])


    content_token_len = len(tokenizer(words, truncation=False, is_split_into_words=True)["input_ids"])
    if content_token_len > tokenizer.model_max_length:
        no_split = (content_token_len // tokenizer.model_max_length) + 2
        split_len = (len(words) // no_split) + 1

        last_id = 0
        threshold = split_len

        for id, token in enumerate(words):
            if token == "." and id > threshold:
                data.append(
                    {
                        "tokens" : words[last_id : id + 1],
                        "pos_spacy" : pos_spacy[last_id : id + 1],
                        "ner_spacy" : ner_spacy[last_id : id + 1],
                        "dep_spacy" : dep_spacy[last_id : id + 1],
                        "depth_spacy" : depth_spacy[last_id : id + 1],
                    }
                )
                last_id = id + 1
                threshold += split_len
        data.append({"tokens" : words[last_id : ],
                     "pos_spacy" : pos_spacy[last_id : ],
                     "ner_spacy" : ner_spacy[last_id : ],
                     "dep_spacy" : dep_spacy[last_id : ],
                     "depth_spacy" : depth_spacy[last_id : ]})
    else:
        data.append(
            {
                "tokens" : words,
                "pos_spacy" : pos_spacy,
                "ner_spacy" : ner_spacy,
                "dep_spacy" : dep_spacy,
                "depth_spacy" : depth_spacy
            }
        )


    ner_features = Features({'tokens' : Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
                             'pos_spacy' : Sequence(feature=ClassLabel(num_classes=len(pos_spacy_tag_list), names=pos_spacy_tag_list, names_file=None, id=None), length=-1, id=None),
                             'ner_spacy' : Sequence(feature=ClassLabel(num_classes=len(ner_spacy_tag_list), names=ner_spacy_tag_list, names_file=None, id=None), length=-1, id=None),
                             'dep_spacy' : Sequence(feature=ClassLabel(num_classes=len(dep_spacy_tag_list), names=dep_spacy_tag_list, names_file=None, id=None), length=-1, id=None),
                             'depth_spacy' : Sequence(feature=ClassLabel(num_classes=17, names= list(range(17)), names_file=None, id=None), length=-1, id=None)
                             })

    dataset = Dataset.from_list(data, features=ner_features)
    tokenized_dataset_ner = dataset.map(tokenize_and_align_labels_with_pos_ner_dep, fn_kwargs={'tokenizer' : tokenizer}, batched=True, load_from_cache_file=False)
    tokenized_dataset_ner = tokenized_dataset_ner.with_format("torch")

    tokenized_dataset_ner = tokenized_dataset_ner.remove_columns("tokens")

    batch_size = 4 # Number of input texts
    dataloader = DataLoader(tokenized_dataset_ner, batch_size=batch_size)
    # TODO : context_idx_dict should be used to index the words
    return dataloader, tokenized_dataset_ner

"""
Function: predict(dataloader)
Description: This function performs inference on a given DataLoader using a trained model and returns the predicted labels.
Inputs:
- dataloader: A DataLoader containing input data for prediction.
Output:
- predicted_label: A tensor containing the predicted labels for the input data.
"""
def predict(dataloader):
    predicted_label = []
    for batch in dataloader:
        with torch.no_grad():
            logits = model_nugget(**batch)
        batch_predicted_label = logits.argmax(-1)
        predicted_label.append(batch_predicted_label)
    return torch.cat(predicted_label, dim=-1)

"""
Function: show_annotations(text_input)
Description: This function displays annotated event nuggets in the provided input text using the Streamlit library.
Inputs:
- text_input: The input text containing event nuggets to be annotated and displayed.
Output:
- An interactive display of annotated event nuggets within the input text.
"""
def show_annotations(text_input):
    st.title("Event Nuggets")

    dataloader, tokenized_dataset_ner = create_dataloader(text_input)
    predicted_label = predict(dataloader)

    for idx, labels in enumerate(predicted_label):
        token_mask = [token > 2 for token in tokenized_dataset_ner[idx]["input_ids"]]

        tokens = tokenizer.convert_ids_to_tokens(tokenized_dataset_ner[idx]["input_ids"][token_mask], skip_special_tokens=True)
        tokens = [token.replace("Ġ", "").replace("Ċ", "").replace("âĢĻ", "'") for token in tokens]

        text = tokenizer.decode(tokenized_dataset_ner[idx]["input_ids"][token_mask])
        idxs = get_idxs_from_text(text, tokens)

        labels = labels[token_mask]

        annotated_text_list = []
        last_label = ""
        cumulative_tokens = ""
        last_id = 0

        for idx, label in zip(idxs, labels):
            to_label = event_nugget_list[label]
            label_short = to_label.split("-")[1] if "-" in to_label else to_label
            if last_label == label_short:
                cumulative_tokens += text[last_id : idx["end_idx"]]
                last_id = idx["end_idx"]
            else:
                if last_label != "":
                    if last_label == "O":
                        annotated_text_list.append(cumulative_tokens)
                    else:
                        annotated_text_list.append((cumulative_tokens, last_label))
                last_label = label_short
                cumulative_tokens = idx["word"]
                last_id = idx["end_idx"]
        if last_label == "O":
            annotated_text_list.append(cumulative_tokens)
        else:
            annotated_text_list.append((cumulative_tokens, last_label))
        annotated_text(annotated_text_list)

"""
Function: get_event_nuggets(text_input)
Description: This function extracts predicted event nuggets (event entities) from the provided input text.
Inputs:
- text_input: The input text containing event nuggets to be extracted.
Output:
- predicted_event_nuggets: A list of dictionaries, each representing an extracted event nugget with start and end offsets,
subtype, and text content.
"""
def get_event_nuggets(text_input):
    dataloader, tokenized_dataset_ner = create_dataloader(text_input)
    predicted_label = predict(dataloader)

    predicted_event_nuggets = []
    text_length = 0
    for idx, labels in enumerate(predicted_label):
        token_mask = [token > 2 for token in tokenized_dataset_ner[idx]["input_ids"]]

        tokens = tokenizer.convert_ids_to_tokens(tokenized_dataset_ner[idx]["input_ids"][token_mask], skip_special_tokens=True)
        tokens = [token.replace("Ġ", "").replace("Ċ", "").replace("âĢĻ", "'") for token in tokens]

        text = tokenizer.decode(tokenized_dataset_ner[idx]["input_ids"][token_mask])
        idxs = get_idxs_from_text(text_input[text_length : ], tokens)

        labels = labels[token_mask]

        start_idx = 0
        end_idx = 0
        last_label = ""

        for idx, label in zip(idxs, labels):
            to_label = event_nugget_list[label]
            label_short = to_label.split("-")[1] if "-" in to_label else to_label

            if label_short == last_label:
                end_idx = idx["end_idx"]
            else:
                if text_input[start_idx : end_idx] != "" and last_label != "O":
                    predicted_event_nuggets.append(
                        {
                            "startOffset" : text_length + start_idx,
                            "endOffset" : text_length + end_idx,
                            "subtype" : last_label,
                            "text" : text_input[text_length + start_idx : text_length + end_idx]
                        }
                    )
                start_idx = idx["start_idx"]
                end_idx = idx["start_idx"] + len(idx["word"])
                last_label = label_short

        text_length += idx["end_idx"]
    return predicted_event_nuggets
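Nugget detection is the entry point the argument and realis modules build on. A short usage sketch follows; the sample text is an assumption, and the output fields are the ones built inside get_event_nuggets above.

# Illustrative sketch (not part of the commit): detect event nuggets and their subtypes.
from cybersecurity_knowledge_graph.event_nugget_predict import get_event_nuggets

text = ("A phishing campaign tricked employees into revealing credentials, "
        "leading to a data breach.")
for nugget in get_event_nuggets(text):
    print(nugget["startOffset"], nugget["endOffset"], nugget["subtype"], repr(nugget["text"]))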
event_realis_predict.py
ADDED
|
@@ -0,0 +1,270 @@
|
| 1 |
+
import os
|
| 2 |
+
import spacy
|
| 3 |
+
import torch
|
| 4 |
+
from torch.utils.data import DataLoader
|
| 5 |
+
from transformers import AutoTokenizer
|
| 6 |
+
from cybersecurity_knowledge_graph.utils import get_idxs_from_text
|
| 7 |
+
import streamlit as st
|
| 8 |
+
from annotated_text import annotated_text
|
| 9 |
+
from cybersecurity_knowledge_graph.nugget_model_utils import CustomRobertaWithPOS
|
| 10 |
+
from cybersecurity_knowledge_graph.event_nugget_predict import get_event_nuggets
|
| 11 |
+
from cybersecurity_knowledge_graph.realis_model_utils import get_entity_for_realis_from_idx, tokenize_and_align_labels_with_pos_ner_realis
|
| 12 |
+
from datasets import load_dataset, Features, ClassLabel, Value, Sequence, Dataset
|
| 13 |
+
|
| 14 |
+
event_nugget_list = ['B-Phishing',
|
| 15 |
+
'I-Phishing',
|
| 16 |
+
'O',
|
| 17 |
+
'B-DiscoverVulnerability',
|
| 18 |
+
'B-Ransom',
|
| 19 |
+
'I-Ransom',
|
| 20 |
+
'B-Databreach',
|
| 21 |
+
'I-DiscoverVulnerability',
|
| 22 |
+
'B-PatchVulnerability',
|
| 23 |
+
'I-PatchVulnerability',
|
| 24 |
+
'I-Databreach']
|
| 25 |
+
|
| 26 |
+
realis_list = ["O", "Generic", "Other", "Actual"]
|
| 27 |
+
|
| 28 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def find_dep_depth(token):
|
| 33 |
+
depth = 0
|
| 34 |
+
current_token = token
|
| 35 |
+
while current_token.head != current_token:
|
| 36 |
+
depth += 1
|
| 37 |
+
current_token = current_token.head
|
| 38 |
+
return min(depth, 16)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
nlp = spacy.load('en_core_web_sm')
|
| 42 |
+
|
| 43 |
+
pos_spacy_tag_list = ["ADJ","ADP","ADV","AUX","CCONJ","DET","INTJ","NOUN","NUM","PART","PRON","PROPN","PUNCT","SCONJ","SYM","VERB","SPACE","X"]
|
| 44 |
+
ner_spacy_tag_list = [bio + entity for entity in list(nlp.get_pipe('ner').labels) for bio in ["B-", "I-"]] + ["O"]
|
| 45 |
+
dep_spacy_tag_list = list(nlp.get_pipe("parser").labels)
|
| 46 |
+
|
| 47 |
+
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
| 48 |
+
|
| 49 |
+
model_checkpoint = "ehsanaghaei/SecureBERT"
|
| 50 |
+
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
|
| 51 |
+
|
| 52 |
+
from cybersecurity_knowledge_graph.realis_model_utils import CustomRobertaWithPOS as RealisModel
|
| 53 |
+
model_realis = RealisModel(num_classes_realis=4)
|
| 54 |
+
model_realis.load_state_dict(torch.load("cybersecurity_knowledge_graph/realis_model_state_dict.pth", map_location=device))
|
| 55 |
+
model_realis.eval()
|
| 56 |
+
|
| 57 |
+
"""
|
| 58 |
+
Function: create_dataloader(text_input)
|
| 59 |
+
Description: This function prepares a DataLoader for processing text input, including tokenization and alignment of labels.
|
| 60 |
+
Inputs:
|
| 61 |
+
- text_input: The input text to be processed.
|
| 62 |
+
Output:
|
| 63 |
+
- dataloader: A DataLoader for the tokenized and batched text data.
|
| 64 |
+
- tokenized_dataset_ner: The tokenized dataset that the DataLoader draws its batches from.
|
| 65 |
+
"""
|
| 66 |
+
def create_dataloader(text_input):
|
| 67 |
+
|
| 68 |
+
event_nuggets = get_event_nuggets(text_input)
|
| 69 |
+
doc = nlp(text_input)
|
| 70 |
+
|
| 71 |
+
content_as_words_emdash = [tok.text for tok in doc]
|
| 72 |
+
content_as_words_emdash = [word.replace("``", '"').replace("''", '"').replace("$", "") for word in content_as_words_emdash]
|
| 73 |
+
content_idx_dict = get_idxs_from_text(text_input, content_as_words_emdash)
|
| 74 |
+
|
| 75 |
+
data = []
|
| 76 |
+
|
| 77 |
+
words = []
|
| 78 |
+
nugget_ner_tags = []
|
| 79 |
+
|
| 80 |
+
pos_spacy = [tok.pos_ for tok in doc]
|
| 81 |
+
ner_spacy = [ent.ent_iob_ + "-" + ent.ent_type_ if ent.ent_iob_ != "O" else ent.ent_iob_ for ent in doc]
|
| 82 |
+
dep_spacy = [tok.dep_ for tok in doc]
|
| 83 |
+
depth_spacy = [find_dep_depth(tok) for tok in doc]
|
| 84 |
+
|
| 85 |
+
for content_dict in content_idx_dict:
|
| 86 |
+
start_idx, end_idx = content_dict["start_idx"], content_dict["end_idx"]
|
| 87 |
+
entity = get_entity_for_realis_from_idx(start_idx, end_idx, event_nuggets)
|
| 88 |
+
words.append(content_dict["word"])
|
| 89 |
+
nugget_ner_tags.append(entity)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
content_token_len = len(tokenizer(words, truncation=False, is_split_into_words=True)["input_ids"])
|
| 93 |
+
if content_token_len > tokenizer.model_max_length:
|
| 94 |
+
no_split = (content_token_len // tokenizer.model_max_length) + 2
|
| 95 |
+
split_len = (len(words) // no_split) + 1
|
| 96 |
+
|
| 97 |
+
last_id = 0
|
| 98 |
+
threshold = split_len
|
| 99 |
+
|
| 100 |
+
for id, token in enumerate(words):
|
| 101 |
+
if token == "." and id > threshold:
|
| 102 |
+
data.append(
|
| 103 |
+
{
|
| 104 |
+
"tokens" : words[last_id : id + 1],
|
| 105 |
+
"ner_tags" : nugget_ner_tags[last_id : id + 1],
|
| 106 |
+
"pos_spacy" : pos_spacy[last_id : id + 1],
|
| 107 |
+
"ner_spacy" : ner_spacy[last_id : id + 1],
|
| 108 |
+
"dep_spacy" : dep_spacy[last_id : id + 1],
|
| 109 |
+
"depth_spacy" : depth_spacy[last_id : id + 1],
|
| 110 |
+
}
|
| 111 |
+
)
|
| 112 |
+
last_id = id + 1
|
| 113 |
+
threshold += split_len
|
| 114 |
+
data.append({"tokens" : words[last_id : ],
|
| 115 |
+
"ner_tags" : nugget_ner_tags[last_id : ],
|
| 116 |
+
"pos_spacy" : pos_spacy[last_id : ],
|
| 117 |
+
"ner_spacy" : ner_spacy[last_id : ],
|
| 118 |
+
"dep_spacy" : dep_spacy[last_id : ],
|
| 119 |
+
"depth_spacy" : depth_spacy[last_id : ]})
|
| 120 |
+
else:
|
| 121 |
+
data.append(
|
| 122 |
+
{
|
| 123 |
+
"tokens" : words,
|
| 124 |
+
"ner_tags" : nugget_ner_tags,
|
| 125 |
+
"pos_spacy" : pos_spacy,
|
| 126 |
+
"ner_spacy" : ner_spacy,
|
| 127 |
+
"dep_spacy" : dep_spacy,
|
| 128 |
+
"depth_spacy" : depth_spacy
|
| 129 |
+
}
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
ner_features = Features({'tokens' : Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
|
| 134 |
+
'ner_tags' : Sequence(feature=ClassLabel(num_classes=len(event_nugget_list), names=event_nugget_list, names_file=None, id=None), length=-1, id=None),
|
| 135 |
+
'pos_spacy' : Sequence(feature=ClassLabel(num_classes=len(pos_spacy_tag_list), names=pos_spacy_tag_list, names_file=None, id=None), length=-1, id=None),
|
| 136 |
+
'ner_spacy' : Sequence(feature=ClassLabel(num_classes=len(ner_spacy_tag_list), names=ner_spacy_tag_list, names_file=None, id=None), length=-1, id=None),
|
| 137 |
+
'dep_spacy' : Sequence(feature=ClassLabel(num_classes=len(dep_spacy_tag_list), names=dep_spacy_tag_list, names_file=None, id=None), length=-1, id=None),
|
| 138 |
+
'depth_spacy' : Sequence(feature=ClassLabel(num_classes=17, names= list(range(17)), names_file=None, id=None), length=-1, id=None)
|
| 139 |
+
})
|
| 140 |
+
|
| 141 |
+
dataset = Dataset.from_list(data, features=ner_features)
|
| 142 |
+
tokenized_dataset_ner = dataset.map(tokenize_and_align_labels_with_pos_ner_realis, fn_kwargs={'tokenizer' : tokenizer, 'ner_names' : event_nugget_list}, batched=True, load_from_cache_file=False)
|
| 143 |
+
tokenized_dataset_ner = tokenized_dataset_ner.with_format("torch")
|
| 144 |
+
|
| 145 |
+
tokenized_dataset_ner = tokenized_dataset_ner.remove_columns("tokens")
|
| 146 |
+
|
| 147 |
+
batch_size = 4 # Number of text chunks per inference batch
|
| 148 |
+
dataloader = DataLoader(tokenized_dataset_ner, batch_size=batch_size)
|
| 149 |
+
return dataloader, tokenized_dataset_ner
|
| 150 |
+
|
| 151 |
+
"""
|
| 152 |
+
Function: predict(dataloader)
|
| 153 |
+
Description: This function performs inference on a given DataLoader using a trained model and returns the predicted labels.
|
| 154 |
+
Inputs:
|
| 155 |
+
- dataloader: A DataLoader containing input data for prediction.
|
| 156 |
+
Output:
|
| 157 |
+
- predicted_label: A tensor containing the predicted labels for the input data.
|
| 158 |
+
"""
|
| 159 |
+
def predict(dataloader):
|
| 160 |
+
predicted_label = []
|
| 161 |
+
for batch in dataloader:
|
| 162 |
+
with torch.no_grad():
|
| 163 |
+
logits = model_realis(**batch)
|
| 164 |
+
|
| 165 |
+
batch_predicted_label = logits.argmax(-1)
|
| 166 |
+
predicted_label.append(batch_predicted_label)
|
| 167 |
+
return torch.cat(predicted_label, dim=-1)
|
| 168 |
+
|
| 169 |
+
"""
|
| 170 |
+
Function: show_annotations(text_input)
|
| 171 |
+
Description: This function displays annotated event realis labels over the provided input text using the Streamlit library.
|
| 172 |
+
Inputs:
|
| 173 |
+
- text_input: The input text whose event realis labels are to be annotated and displayed.
|
| 174 |
+
Output:
|
| 175 |
+
- An interactive display of annotated event realis labels within the input text.
|
| 176 |
+
"""
|
| 177 |
+
def show_annotations(text_input):
|
| 178 |
+
st.title("Event Realis")
|
| 179 |
+
|
| 180 |
+
dataloader, tokenized_dataset_ner = create_dataloader(text_input)
|
| 181 |
+
predicted_label = predict(dataloader)
|
| 182 |
+
|
| 183 |
+
for idx, labels in enumerate(predicted_label):
|
| 184 |
+
token_mask = [token > 2 for token in tokenized_dataset_ner[idx]["input_ids"]]
|
| 185 |
+
|
| 186 |
+
tokens = tokenizer.convert_ids_to_tokens(tokenized_dataset_ner[idx]["input_ids"][token_mask], skip_special_tokens=True)
|
| 187 |
+
tokens = [token.replace("Ġ", "").replace("Ċ", "").replace("âĢĻ", "'") for token in tokens]
|
| 188 |
+
|
| 189 |
+
text = tokenizer.decode(tokenized_dataset_ner[idx]["input_ids"][token_mask])
|
| 190 |
+
idxs = get_idxs_from_text(text, tokens)
|
| 191 |
+
|
| 192 |
+
labels = labels[token_mask]
|
| 193 |
+
|
| 194 |
+
annotated_text_list = []
|
| 195 |
+
last_label = ""
|
| 196 |
+
cumulative_tokens = ""
|
| 197 |
+
last_id = 0
|
| 198 |
+
|
| 199 |
+
for idx, label in zip(idxs, labels):
|
| 200 |
+
to_label = realis_list[label]
|
| 201 |
+
label_short = to_label.split("-")[1] if "-" in to_label else to_label
|
| 202 |
+
if last_label == label_short:
|
| 203 |
+
cumulative_tokens += text[last_id : idx["end_idx"]]
|
| 204 |
+
last_id = idx["end_idx"]
|
| 205 |
+
else:
|
| 206 |
+
if last_label != "":
|
| 207 |
+
if last_label == "O":
|
| 208 |
+
annotated_text_list.append(cumulative_tokens)
|
| 209 |
+
else:
|
| 210 |
+
annotated_text_list.append((cumulative_tokens, last_label))
|
| 211 |
+
last_label = label_short
|
| 212 |
+
cumulative_tokens = idx["word"]
|
| 213 |
+
last_id = idx["end_idx"]
|
| 214 |
+
if last_label == "O":
|
| 215 |
+
annotated_text_list.append(cumulative_tokens)
|
| 216 |
+
else:
|
| 217 |
+
annotated_text_list.append((cumulative_tokens, last_label))
|
| 218 |
+
annotated_text(annotated_text_list)
|
| 219 |
+
|
| 220 |
+
"""
|
| 221 |
+
Function: get_event_realis(text_input)
|
| 222 |
+
Description: This function extracts predicted event realis (event modality) from the provided input text.
|
| 223 |
+
Inputs:
|
| 224 |
+
- text_input: The input text from which event realis labels are extracted.
|
| 225 |
+
Output:
|
| 226 |
+
- predicted_event_realis: A list of dictionaries, each representing an extracted event realis with start and end offsets,
|
| 227 |
+
realis type, and text content.
|
| 228 |
+
"""
|
| 229 |
+
def get_event_realis(text_input):
|
| 230 |
+
dataloader, tokenized_dataset_ner = create_dataloader(text_input)
|
| 231 |
+
predicted_label = predict(dataloader)
|
| 232 |
+
|
| 233 |
+
predicted_event_realis = []
|
| 234 |
+
text_length = 0
|
| 235 |
+
for idx, labels in enumerate(predicted_label):
|
| 236 |
+
token_mask = [token > 2 for token in tokenized_dataset_ner[idx]["input_ids"]]
|
| 237 |
+
|
| 238 |
+
tokens = tokenizer.convert_ids_to_tokens(tokenized_dataset_ner[idx]["input_ids"][token_mask], skip_special_tokens=True)
|
| 239 |
+
tokens = [token.replace("Ġ", "").replace("Ċ", "").replace("âĢĻ", "'") for token in tokens]
|
| 240 |
+
|
| 241 |
+
text = tokenizer.decode(tokenized_dataset_ner[idx]["input_ids"][token_mask])
|
| 242 |
+
idxs = get_idxs_from_text(text_input[text_length : ], tokens)
|
| 243 |
+
|
| 244 |
+
labels = labels[token_mask]
|
| 245 |
+
|
| 246 |
+
start_idx = 0
|
| 247 |
+
end_idx = 0
|
| 248 |
+
last_label = ""
|
| 249 |
+
|
| 250 |
+
for idx, label in zip(idxs, labels):
|
| 251 |
+
to_label = realis_list[label]
|
| 252 |
+
label_split = to_label
|
| 253 |
+
|
| 254 |
+
if label_split == last_label:
|
| 255 |
+
end_idx = idx["end_idx"]
|
| 256 |
+
else:
|
| 257 |
+
if text_input[start_idx : end_idx] != "" and last_label != "O":
|
| 258 |
+
predicted_event_realis.append(
|
| 259 |
+
{
|
| 260 |
+
"startOffset" : text_length + start_idx,
|
| 261 |
+
"endOffset" : text_length + end_idx,
|
| 262 |
+
"realis" : last_label,
|
| 263 |
+
"text" : text_input[text_length + start_idx : text_length + end_idx]
|
| 264 |
+
}
|
| 265 |
+
)
|
| 266 |
+
start_idx = idx["start_idx"]
|
| 267 |
+
end_idx = idx["start_idx"] + len(idx["word"])
|
| 268 |
+
last_label = label_split
|
| 269 |
+
text_length += idx["end_idx"]
|
| 270 |
+
return predicted_event_realis
|
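A similarly hedged sketch for get_event_realis defined above; the sample text is an assumption:

# Hypothetical usage example, not part of the uploaded module.
from cybersecurity_knowledge_graph.event_realis_predict import get_event_realis

sample = "Researchers discovered a vulnerability that was patched last week."
for span in get_event_realis(sample):
    # Each span is a dict with startOffset, endOffset, realis and text.
    print(span["realis"], span["startOffset"], span["endOffset"], repr(span["text"]))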
model_59.pt
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09bc24b422adbe6c4c6ca1333a3a8c33146e6152e00a7ad6376cab616b51e53f
|
| 3 |
+
size 498858353
|
model_64_pos_ner.pt
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76125c5bbce2c32e536fe74d24dc51fb1fce3ba076104b459ee290102ce4bd5d
|
| 3 |
+
size 498746934
|
model_66.pt
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46531e8ccf92661a025b15c829be791f72416d1b458ae1aa82cc66e069193bf5
|
| 3 |
+
size 498751092
|
model_97.pt
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6147a98aa2baaa545903103e9e2f0e55fc249ec638cfe27e273ffdd247479c4
|
| 3 |
+
size 498729523
|
nugget_model_state_dict.pth
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d04c7ccd654b3af96c1c8e0f391a20d79ae1b5970d5419680f379c6a09e78bf
|
| 3 |
+
size 498703483
|
nugget_model_utils.py
ADDED
|
@@ -0,0 +1,151 @@
|
| 1 |
+
import torch
|
| 2 |
+
import spacy
|
| 3 |
+
import en_core_web_sm
|
| 4 |
+
from torch import nn
|
| 5 |
+
import math
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
| 9 |
+
|
| 10 |
+
from transformers import AutoModel, TrainingArguments, Trainer, RobertaTokenizer, RobertaModel
|
| 11 |
+
from transformers import AutoTokenizer
|
| 12 |
+
|
| 13 |
+
model_checkpoint = "ehsanaghaei/SecureBERT"
|
| 14 |
+
|
| 15 |
+
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
|
| 16 |
+
roberta_model = RobertaModel.from_pretrained(model_checkpoint).to(device)
|
| 17 |
+
|
| 18 |
+
nlp = en_core_web_sm.load()
|
| 19 |
+
pos_spacy_tag_list = ["ADJ","ADP","ADV","AUX","CCONJ","DET","INTJ","NOUN","NUM","PART","PRON","PROPN","PUNCT","SCONJ","SYM","VERB","SPACE","X"]
|
| 20 |
+
ner_spacy_tag_list = [bio + entity for entity in list(nlp.get_pipe('ner').labels) for bio in ["B-", "I-"]] + ["O"]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class CustomRobertaWithPOS(nn.Module):
|
| 24 |
+
def __init__(self, num_classes):
|
| 25 |
+
super(CustomRobertaWithPOS, self).__init__()
|
| 26 |
+
self.num_classes = num_classes
|
| 27 |
+
self.pos_embed = nn.Embedding(len(pos_spacy_tag_list), 16)
|
| 28 |
+
self.ner_embed = nn.Embedding(len(ner_spacy_tag_list), 16)
|
| 29 |
+
self.roberta = roberta_model
|
| 30 |
+
self.dropout1 = nn.Dropout(0.2)
|
| 31 |
+
self.fc1 = nn.Linear(self.roberta.config.hidden_size, num_classes)
|
| 32 |
+
|
| 33 |
+
def forward(self, input_ids, attention_mask, pos_spacy, ner_spacy, dep_spacy, depth_spacy):
|
| 34 |
+
outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
|
| 35 |
+
last_hidden_output = outputs.last_hidden_state
|
| 36 |
+
|
| 37 |
+
pos_mask = pos_spacy != -100
|
| 38 |
+
|
| 39 |
+
pos_one_hot = torch.zeros((pos_spacy.shape[0], pos_spacy.shape[1], len(pos_spacy_tag_list)), dtype=torch.long)
|
| 40 |
+
pos_one_hot[pos_mask, pos_spacy[pos_mask]] = 1
|
| 41 |
+
pos_one_hot = pos_one_hot.to(device)
|
| 42 |
+
|
| 43 |
+
ner_mask = ner_spacy != -100
|
| 44 |
+
|
| 45 |
+
ner_one_hot = torch.zeros((ner_spacy.shape[0], ner_spacy.shape[1], len(ner_spacy_tag_list)), dtype=torch.long)
|
| 46 |
+
ner_one_hot[ner_mask, ner_spacy[ner_mask]] = 1
|
| 47 |
+
ner_one_hot = ner_one_hot.to(device)
|
| 48 |
+
|
| 49 |
+
# Only the RoBERTa hidden states feed the classifier; the POS/NER one-hot features built above are not concatenated in this variant of the model.
features_concat = last_hidden_output
|
| 50 |
+
features_concat = self.dropout1(features_concat)
|
| 51 |
+
|
| 52 |
+
logits = self.fc1(features_concat)
|
| 53 |
+
|
| 54 |
+
return logits
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def tokenize_and_align_labels_with_pos_ner_dep(examples, tokenizer, label_all_tokens = True):
|
| 58 |
+
tokenized_inputs = tokenizer(examples["tokens"], padding='max_length', truncation=True, is_split_into_words=True)
|
| 59 |
+
#tokenized_inputs.pop('input_ids')
|
| 60 |
+
ner_spacy = []
|
| 61 |
+
pos_spacy = []
|
| 62 |
+
dep_spacy = []
|
| 63 |
+
depth_spacy = []
|
| 64 |
+
|
| 65 |
+
for i, (pos, ner, dep, depth) in enumerate(zip(examples["pos_spacy"],
|
| 66 |
+
examples["ner_spacy"],
|
| 67 |
+
examples["dep_spacy"],
|
| 68 |
+
examples["depth_spacy"])):
|
| 69 |
+
word_ids = tokenized_inputs.word_ids(batch_index=i)
|
| 70 |
+
previous_word_idx = None
|
| 71 |
+
ner_spacy_ids = []
|
| 72 |
+
pos_spacy_ids = []
|
| 73 |
+
dep_spacy_ids = []
|
| 74 |
+
depth_spacy_ids = []
|
| 75 |
+
|
| 76 |
+
for word_idx in word_ids:
|
| 77 |
+
# Special tokens have a word id that is None. We set the label to -100 so they are automatically
|
| 78 |
+
# ignored in the loss function.
|
| 79 |
+
if word_idx is None:
|
| 80 |
+
ner_spacy_ids.append(-100)
|
| 81 |
+
pos_spacy_ids.append(-100)
|
| 82 |
+
dep_spacy_ids.append(-100)
|
| 83 |
+
depth_spacy_ids.append(-100)
|
| 84 |
+
# We set the label for the first token of each word.
|
| 85 |
+
elif word_idx != previous_word_idx:
|
| 86 |
+
ner_spacy_ids.append(ner[word_idx])
|
| 87 |
+
pos_spacy_ids.append(pos[word_idx])
|
| 88 |
+
dep_spacy_ids.append(dep[word_idx])
|
| 89 |
+
depth_spacy_ids.append(depth[word_idx])
|
| 90 |
+
# For the other tokens in a word, we set the label to either the current label or -100, depending on
|
| 91 |
+
# the label_all_tokens flag.
|
| 92 |
+
else:
|
| 93 |
+
ner_spacy_ids.append(ner[word_idx] if label_all_tokens else -100)
|
| 94 |
+
pos_spacy_ids.append(pos[word_idx] if label_all_tokens else -100)
|
| 95 |
+
dep_spacy_ids.append(dep[word_idx] if label_all_tokens else -100)
|
| 96 |
+
depth_spacy_ids.append(depth[word_idx] if label_all_tokens else -100)
|
| 97 |
+
previous_word_idx = word_idx
|
| 98 |
+
|
| 99 |
+
ner_spacy.append(ner_spacy_ids)
|
| 100 |
+
pos_spacy.append(pos_spacy_ids)
|
| 101 |
+
dep_spacy.append(dep_spacy_ids)
|
| 102 |
+
depth_spacy.append(depth_spacy_ids)
|
| 103 |
+
|
| 104 |
+
tokenized_inputs["pos_spacy"] = pos_spacy
|
| 105 |
+
tokenized_inputs["ner_spacy"] = ner_spacy
|
| 106 |
+
tokenized_inputs["dep_spacy"] = dep_spacy
|
| 107 |
+
tokenized_inputs["depth_spacy"] = depth_spacy
|
| 108 |
+
|
| 109 |
+
return tokenized_inputs
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def find_nearest_nugget_features(doc, start_idx, end_idx, event_nuggets):
|
| 113 |
+
nearest_subtype = None
|
| 114 |
+
nearest_dist = math.inf
|
| 115 |
+
relative_pos = None
|
| 116 |
+
|
| 117 |
+
mid_idx = (end_idx + start_idx) / 2
|
| 118 |
+
for nugget in event_nuggets:
|
| 119 |
+
mid_nugget_idx = (nugget["nugget"]["startOffset"] + nugget["nugget"]["endOffset"]) / 2
|
| 120 |
+
dist = abs(mid_nugget_idx - mid_idx)
|
| 121 |
+
|
| 122 |
+
if dist < nearest_dist:
|
| 123 |
+
nearest_dist = dist
|
| 124 |
+
nearest_subtype = nugget["subtype"]
|
| 125 |
+
for sent in doc.sents:
|
| 126 |
+
if between_idxs(mid_idx, sent.start_char, sent.end_char) and between_idxs(mid_nugget_idx, sent.start_char, sent.end_char):
|
| 127 |
+
if mid_idx < mid_nugget_idx:
|
| 128 |
+
relative_pos = "before-same-sentence"
|
| 129 |
+
else:
|
| 130 |
+
relative_pos = "after-same-sentence"
|
| 131 |
+
break
|
| 132 |
+
elif between_idxs(mid_nugget_idx, sent.start_char, sent.end_char) and mid_idx > mid_nugget_idx:
|
| 133 |
+
relative_pos = "after-differ-sentence"
|
| 134 |
+
break
|
| 135 |
+
elif between_idxs(mid_idx, sent.start_char, sent.end_char) and mid_idx < mid_nugget_idx:
|
| 136 |
+
relative_pos = "before-differ-sentence"
|
| 137 |
+
break
|
| 138 |
+
|
| 139 |
+
nearest_dist = int(min(10, nearest_dist // 20))
|
| 140 |
+
return nearest_subtype, nearest_dist, relative_pos
|
| 141 |
+
|
| 142 |
+
def find_dep_depth(token):
|
| 143 |
+
depth = 0
|
| 144 |
+
current_token = token
|
| 145 |
+
while current_token.head != current_token:
|
| 146 |
+
depth += 1
|
| 147 |
+
current_token = current_token.head
|
| 148 |
+
return min(depth, 16)
|
| 149 |
+
|
| 150 |
+
def between_idxs(idx, start_idx, end_idx):
|
| 151 |
+
return idx >= start_idx and idx <= end_idx
|
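A small smoke test for the CustomRobertaWithPOS head defined above; the sentence is made up, num_classes=11 matches the event_nugget_list used elsewhere in this repo, and the -100 placeholders rely on the fact that this head does not concatenate the extra linguistic features:

# Hypothetical smoke test, not part of the uploaded module.
import torch
from cybersecurity_knowledge_graph.nugget_model_utils import CustomRobertaWithPOS, tokenizer, device

model = CustomRobertaWithPOS(num_classes=11).to(device).eval()
enc = tokenizer(["A ransomware attack was discovered."], padding="max_length", truncation=True, return_tensors="pt")
# POS/NER/dep/depth features are ignored by this head, so -100 placeholders are enough.
dummy = torch.full_like(enc["input_ids"], -100)
with torch.no_grad():
    logits = model(enc["input_ids"].to(device), enc["attention_mask"].to(device), dummy, dummy, dummy, dummy)
print(logits.shape)  # (1, sequence length, 11)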
realis_model_state_dict.pth
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ad63eeee95888dc6f22e94e0a8425a99912f7d727cd255881e8630218a3b7f0
|
| 3 |
+
size 498684837
|
realis_model_utils.py
ADDED
|
@@ -0,0 +1,146 @@
|
| 1 |
+
import torch
|
| 2 |
+
from torch import nn
|
| 3 |
+
import en_core_web_sm
|
| 4 |
+
from transformers import AutoModel, TrainingArguments, Trainer, RobertaTokenizer, RobertaModel
|
| 5 |
+
from transformers import AutoTokenizer
|
| 6 |
+
|
| 7 |
+
model_checkpoint = "ehsanaghaei/SecureBERT"
|
| 8 |
+
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
| 9 |
+
|
| 10 |
+
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
|
| 11 |
+
roberta_model = RobertaModel.from_pretrained(model_checkpoint).to(device)
|
| 12 |
+
|
| 13 |
+
event_nugget_list = ['B-Phishing',
|
| 14 |
+
'I-Phishing',
|
| 15 |
+
'O',
|
| 16 |
+
'B-DiscoverVulnerability',
|
| 17 |
+
'B-Ransom',
|
| 18 |
+
'I-Ransom',
|
| 19 |
+
'B-Databreach',
|
| 20 |
+
'I-DiscoverVulnerability',
|
| 21 |
+
'B-PatchVulnerability',
|
| 22 |
+
'I-PatchVulnerability',
|
| 23 |
+
'I-Databreach']
|
| 24 |
+
|
| 25 |
+
nlp = en_core_web_sm.load()
|
| 26 |
+
pos_spacy_tag_list = ["ADJ","ADP","ADV","AUX","CCONJ","DET","INTJ","NOUN","NUM","PART","PRON","PROPN","PUNCT","SCONJ","SYM","VERB","SPACE","X"]
|
| 27 |
+
ner_spacy_tag_list = [bio + entity for entity in list(nlp.get_pipe('ner').labels) for bio in ["B-", "I-"]] + ["O"]
|
| 28 |
+
dep_spacy_tag_list = list(nlp.get_pipe("parser").labels)
|
| 29 |
+
|
| 30 |
+
class CustomRobertaWithPOS(nn.Module):
|
| 31 |
+
def __init__(self, num_classes_realis):
|
| 32 |
+
super(CustomRobertaWithPOS, self).__init__()
|
| 33 |
+
self.num_classes_realis = num_classes_realis
|
| 34 |
+
self.pos_embed = nn.Embedding(len(pos_spacy_tag_list), 16)
|
| 35 |
+
self.ner_embed = nn.Embedding(len(ner_spacy_tag_list), 8)
|
| 36 |
+
self.dep_embed = nn.Embedding(len(dep_spacy_tag_list), 8)
|
| 37 |
+
self.depth_embed = nn.Embedding(17, 8)
|
| 38 |
+
self.nugget_embed = nn.Embedding(len(event_nugget_list), 8)
|
| 39 |
+
self.roberta = roberta_model
|
| 40 |
+
self.dropout1 = nn.Dropout(0.2)
|
| 41 |
+
# 48 extra dims = 16 (POS) + 8 (NER) + 8 (dependency) + 8 (tree depth) + 8 (nugget tag) embeddings
self.fc1 = nn.Linear(self.roberta.config.hidden_size + 48, self.num_classes_realis)
|
| 42 |
+
|
| 43 |
+
def forward(self, input_ids, attention_mask, pos_spacy, ner_spacy, dep_spacy, depth_spacy, ner_tags):
|
| 44 |
+
outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
|
| 45 |
+
last_hidden_output = outputs.last_hidden_state
|
| 46 |
+
|
| 47 |
+
pos_mask = pos_spacy != -100
|
| 48 |
+
pos_embed_masked = self.pos_embed(pos_spacy[pos_mask])
|
| 49 |
+
pos_embed = torch.zeros((pos_spacy.shape[0], pos_spacy.shape[1], 16), dtype=torch.float).to(device)
|
| 50 |
+
pos_embed[pos_mask] = pos_embed_masked
|
| 51 |
+
|
| 52 |
+
ner_mask = ner_spacy != -100
|
| 53 |
+
ner_embed_masked = self.ner_embed(ner_spacy[ner_mask])
|
| 54 |
+
ner_embed = torch.zeros((ner_spacy.shape[0], ner_spacy.shape[1], 8), dtype=torch.float).to(device)
|
| 55 |
+
ner_embed[ner_mask] = ner_embed_masked
|
| 56 |
+
|
| 57 |
+
dep_mask = dep_spacy != -100
|
| 58 |
+
dep_embed_masked = self.dep_embed(dep_spacy[dep_mask])
|
| 59 |
+
dep_embed = torch.zeros((dep_spacy.shape[0], dep_spacy.shape[1], 8), dtype=torch.float).to(device)
|
| 60 |
+
dep_embed[dep_mask] = dep_embed_masked
|
| 61 |
+
|
| 62 |
+
depth_mask = depth_spacy != -100
|
| 63 |
+
depth_embed_masked = self.depth_embed(depth_spacy[depth_mask])
|
| 64 |
+
depth_embed = torch.zeros((depth_spacy.shape[0], depth_spacy.shape[1], 8), dtype=torch.float).to(device)
|
| 65 |
+
depth_embed[depth_mask] = depth_embed_masked
|
| 66 |
+
|
| 67 |
+
nugget_mask = ner_tags != -100
|
| 68 |
+
nugget_embed_masked = self.nugget_embed(ner_tags[nugget_mask])
|
| 69 |
+
nugget_embed = torch.zeros((ner_tags.shape[0], ner_tags.shape[1], 8), dtype=torch.float).to(device)
|
| 70 |
+
nugget_embed[nugget_mask] = nugget_embed_masked
|
| 71 |
+
|
| 72 |
+
features_concat = torch.cat((last_hidden_output, pos_embed, ner_embed, dep_embed, depth_embed, nugget_embed), 2).to(device)
|
| 73 |
+
features_concat = self.dropout1(features_concat)
|
| 74 |
+
features_concat = self.dropout1(features_concat)
|
| 75 |
+
|
| 76 |
+
logits = self.fc1(features_concat)
|
| 77 |
+
|
| 78 |
+
return logits
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def get_entity_for_realis_from_idx(start_idx, end_idx, event_nuggets):
|
| 82 |
+
event_nuggets_idxs = [(nugget["startOffset"], nugget["endOffset"]) for nugget in event_nuggets]
|
| 83 |
+
for idx, (nugget_start, nugget_end) in enumerate(event_nuggets_idxs):
|
| 84 |
+
if (start_idx == nugget_start and end_idx == nugget_end) or (start_idx == nugget_start and end_idx <= nugget_end) or (start_idx == nugget_start and end_idx > nugget_end) or (end_idx == nugget_end and start_idx < nugget_start) or (start_idx <= nugget_start and end_idx <= nugget_end and end_idx > nugget_start):
|
| 85 |
+
return "B-" + event_nuggets[idx]["subtype"]
|
| 86 |
+
elif (start_idx > nugget_start and end_idx <= nugget_end) or (start_idx > nugget_start and start_idx < nugget_end):
|
| 87 |
+
return "I-" + event_nuggets[idx]["subtype"]
|
| 88 |
+
return "O"
|
| 89 |
+
|
| 90 |
+
def tokenize_and_align_labels_with_pos_ner_realis(examples, tokenizer, ner_names, label_all_tokens = True):
|
| 91 |
+
tokenized_inputs = tokenizer(examples["tokens"], padding='max_length', truncation=True, is_split_into_words=True)
|
| 92 |
+
#tokenized_inputs.pop('input_ids')
|
| 93 |
+
labels = []
|
| 94 |
+
nuggets = []
|
| 95 |
+
ner_spacy = []
|
| 96 |
+
pos_spacy = []
|
| 97 |
+
dep_spacy = []
|
| 98 |
+
depth_spacy = []
|
| 99 |
+
|
| 100 |
+
for i, (nugget, pos, ner, dep, depth) in enumerate(zip(examples["ner_tags"], examples["pos_spacy"], examples["ner_spacy"], examples["dep_spacy"], examples["depth_spacy"])):
|
| 101 |
+
word_ids = tokenized_inputs.word_ids(batch_index=i)
|
| 102 |
+
previous_word_idx = None
|
| 103 |
+
nugget_ids = []
|
| 104 |
+
ner_spacy_ids = []
|
| 105 |
+
pos_spacy_ids = []
|
| 106 |
+
dep_spacy_ids = []
|
| 107 |
+
depth_spacy_ids = []
|
| 108 |
+
|
| 109 |
+
for word_idx in word_ids:
|
| 110 |
+
# Special tokens have a word id that is None. We set the label to -100 so they are automatically
|
| 111 |
+
# ignored in the loss function.
|
| 112 |
+
if word_idx is None:
|
| 113 |
+
nugget_ids.append(-100)
|
| 114 |
+
ner_spacy_ids.append(-100)
|
| 115 |
+
pos_spacy_ids.append(-100)
|
| 116 |
+
dep_spacy_ids.append(-100)
|
| 117 |
+
depth_spacy_ids.append(-100)
|
| 118 |
+
# We set the label for the first token of each word.
|
| 119 |
+
elif word_idx != previous_word_idx:
|
| 120 |
+
nugget_ids.append(nugget[word_idx])
|
| 121 |
+
ner_spacy_ids.append(ner[word_idx])
|
| 122 |
+
pos_spacy_ids.append(pos[word_idx])
|
| 123 |
+
dep_spacy_ids.append(dep[word_idx])
|
| 124 |
+
depth_spacy_ids.append(depth[word_idx])
|
| 125 |
+
# For the other tokens in a word, we set the label to either the current label or -100, depending on
|
| 126 |
+
# the label_all_tokens flag.
|
| 127 |
+
else:
|
| 128 |
+
nugget_ids.append(nugget[word_idx] if label_all_tokens else -100)
|
| 129 |
+
ner_spacy_ids.append(ner[word_idx] if label_all_tokens else -100)
|
| 130 |
+
pos_spacy_ids.append(pos[word_idx] if label_all_tokens else -100)
|
| 131 |
+
dep_spacy_ids.append(dep[word_idx] if label_all_tokens else -100)
|
| 132 |
+
depth_spacy_ids.append(depth[word_idx] if label_all_tokens else -100)
|
| 133 |
+
previous_word_idx = word_idx
|
| 134 |
+
|
| 135 |
+
nuggets.append(nugget_ids)
|
| 136 |
+
ner_spacy.append(ner_spacy_ids)
|
| 137 |
+
pos_spacy.append(pos_spacy_ids)
|
| 138 |
+
dep_spacy.append(dep_spacy_ids)
|
| 139 |
+
depth_spacy.append(depth_spacy_ids)
|
| 140 |
+
|
| 141 |
+
tokenized_inputs["ner_tags"] = nuggets
|
| 142 |
+
tokenized_inputs["pos_spacy"] = pos_spacy
|
| 143 |
+
tokenized_inputs["ner_spacy"] = ner_spacy
|
| 144 |
+
tokenized_inputs["dep_spacy"] = dep_spacy
|
| 145 |
+
tokenized_inputs["depth_spacy"] = depth_spacy
|
| 146 |
+
return tokenized_inputs
|
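A brief illustration of the BIO tagging helper get_entity_for_realis_from_idx defined above; the offsets and the Databreach nugget are made up:

# Hypothetical example, not part of the uploaded module.
from cybersecurity_knowledge_graph.realis_model_utils import get_entity_for_realis_from_idx

nuggets = [{"startOffset": 10, "endOffset": 25, "subtype": "Databreach"}]
print(get_entity_for_realis_from_idx(10, 17, nuggets))  # "B-Databreach": span starts at the nugget start
print(get_entity_for_realis_from_idx(18, 25, nuggets))  # "I-Databreach": span falls inside the nugget
print(get_entity_for_realis_from_idx(30, 35, nuggets))  # "O": span does not overlap any nugget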
utils.py
ADDED
|
@@ -0,0 +1,196 @@
|
| 1 |
+
list_of_pos_tags = [
|
| 2 |
+
"ADJ",
|
| 3 |
+
"ADP",
|
| 4 |
+
"ADV",
|
| 5 |
+
"AUX",
|
| 6 |
+
"CCONJ",
|
| 7 |
+
"DET",
|
| 8 |
+
"INTJ",
|
| 9 |
+
"NOUN",
|
| 10 |
+
"NUM",
|
| 11 |
+
"PART",
|
| 12 |
+
"PRON",
|
| 13 |
+
"PROPN",
|
| 14 |
+
"PUNCT",
|
| 15 |
+
"SCONJ",
|
| 16 |
+
"SYM",
|
| 17 |
+
"VERB",
|
| 18 |
+
"X"
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
realis_list = ["O",
|
| 22 |
+
"Generic",
|
| 23 |
+
"Other",
|
| 24 |
+
"Actual"
|
| 25 |
+
]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
event_args_list = ['O',
|
| 29 |
+
'B-System',
|
| 30 |
+
'I-System',
|
| 31 |
+
'B-Organization',
|
| 32 |
+
'B-Money',
|
| 33 |
+
'I-Money',
|
| 34 |
+
'B-Device',
|
| 35 |
+
'B-Person',
|
| 36 |
+
'I-Person',
|
| 37 |
+
'B-Vulnerability',
|
| 38 |
+
'I-Vulnerability',
|
| 39 |
+
'B-Capabilities',
|
| 40 |
+
'I-Capabilities',
|
| 41 |
+
'I-Organization',
|
| 42 |
+
'B-PaymentMethod',
|
| 43 |
+
'I-PaymentMethod',
|
| 44 |
+
'B-Data',
|
| 45 |
+
'I-Data',
|
| 46 |
+
'B-Number',
|
| 47 |
+
'I-Number',
|
| 48 |
+
'B-Malware',
|
| 49 |
+
'I-Malware',
|
| 50 |
+
'B-PII',
|
| 51 |
+
'I-PII',
|
| 52 |
+
'B-CVE',
|
| 53 |
+
'I-CVE',
|
| 54 |
+
'B-Purpose',
|
| 55 |
+
'I-Purpose',
|
| 56 |
+
'B-File',
|
| 57 |
+
'I-File',
|
| 58 |
+
'I-Device',
|
| 59 |
+
'B-Time',
|
| 60 |
+
'I-Time',
|
| 61 |
+
'B-Software',
|
| 62 |
+
'I-Software',
|
| 63 |
+
'B-Patch',
|
| 64 |
+
'I-Patch',
|
| 65 |
+
'B-Version',
|
| 66 |
+
'I-Version',
|
| 67 |
+
'B-Website',
|
| 68 |
+
'I-Website',
|
| 69 |
+
'B-GPE',
|
| 70 |
+
'I-GPE'
|
| 71 |
+
]
|
| 72 |
+
|
| 73 |
+
event_nugget_list = ['O',
|
| 74 |
+
'B-Ransom',
|
| 75 |
+
'I-Ransom',
|
| 76 |
+
'B-DiscoverVulnerability',
|
| 77 |
+
'I-DiscoverVulnerability',
|
| 78 |
+
'B-PatchVulnerability',
|
| 79 |
+
'I-PatchVulnerability',
|
| 80 |
+
'B-Databreach',
|
| 81 |
+
'I-Databreach',
|
| 82 |
+
'B-Phishing',
|
| 83 |
+
'I-Phishing'
|
| 84 |
+
]
|
| 85 |
+
|
| 86 |
+
arg_2_role = {
|
| 87 |
+
"File" : ['Tool', 'Trusted-Entity'],
|
| 88 |
+
"Person" : ['Victim', 'Attacker', 'Discoverer', 'Releaser', 'Trusted-Entity', 'Vulnerable_System_Owner'],
|
| 89 |
+
"Capabilities" : ['Attack-Pattern', 'Capabilities', 'Issues-Addressed'],
|
| 90 |
+
"Purpose" : ['Purpose'],
|
| 91 |
+
"Time" : ['Time'],
|
| 92 |
+
"PII" : ['Compromised-Data', 'Trusted-Entity'],
|
| 93 |
+
"Data" : ['Compromised-Data', 'Trusted-Entity'],
|
| 94 |
+
"Organization" : ['Victim', 'Releaser', 'Discoverer', 'Attacker', 'Vulnerable_System_Owner', 'Trusted-Entity'],
|
| 95 |
+
"Patch" : ['Patch'],
|
| 96 |
+
"Software" : ['Vulnerable_System', 'Victim', 'Trusted-Entity', 'Supported_Platform'],
|
| 97 |
+
"Vulnerability" : ['Vulnerability'],
|
| 98 |
+
"Version" : ['Patch-Number', 'Vulnerable_System_Version'],
|
| 99 |
+
"Device" : ['Vulnerable_System', 'Victim', 'Supported_Platform'],
|
| 100 |
+
"CVE" : ['CVE'],
|
| 101 |
+
"Number" : ['Number-of-Data', 'Number-of-Victim'],
|
| 102 |
+
"System" : ['Victim', 'Supported_Platform', 'Vulnerable_System', 'Trusted-Entity'],
|
| 103 |
+
"Malware" : ['Tool'],
|
| 104 |
+
"Money" : ['Price', 'Damage-Amount'],
|
| 105 |
+
"PaymentMethod" : ['Payment-Method'],
|
| 106 |
+
"GPE" : ['Place'],
|
| 107 |
+
"Website" : ['Trusted-Entity', 'Tool', 'Vulnerable_System', 'Victim', 'Supported_Platform'],
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
def get_content(data):
|
| 111 |
+
return data["content"]
|
| 112 |
+
|
| 113 |
+
def get_event_nugget(data):
|
| 114 |
+
return [
|
| 115 |
+
{"nugget" : event["nugget"], "type" : event["type"], "subtype" : event["subtype"], "realis" : event["realis"]}
|
| 116 |
+
for hopper in data["cyberevent"]["hopper"] for event in hopper["events"]
|
| 117 |
+
]
|
| 118 |
+
def get_event_args(data):
|
| 119 |
+
events = [event for hopper in data["cyberevent"]["hopper"] for event in hopper["events"]]
|
| 120 |
+
args = []
|
| 121 |
+
for event in events:
|
| 122 |
+
if "argument" in event.keys():
|
| 123 |
+
args.extend(event["argument"])
|
| 124 |
+
return args
|
| 125 |
+
|
| 126 |
+
def get_idxs_from_text(text, text_tokenized):
|
| 127 |
+
rest_text = text
|
| 128 |
+
last_idx = 0
|
| 129 |
+
result_dict = []
|
| 130 |
+
|
| 131 |
+
for substring in text_tokenized:
|
| 132 |
+
index = rest_text.find(substring)
|
| 133 |
+
result_dict.append(
|
| 134 |
+
{
|
| 135 |
+
"word" : substring,
|
| 136 |
+
"start_idx" : last_idx + index,
|
| 137 |
+
"end_idx" : last_idx + index + len(substring)
|
| 138 |
+
}
|
| 139 |
+
)
|
| 140 |
+
rest_text = rest_text[index + len(substring) : ]
|
| 141 |
+
last_idx += index + len(substring)
|
| 142 |
+
return result_dict
|
| 143 |
+
|
| 144 |
+
def get_entity_from_idx(start_idx, end_idx, event_nuggets):
|
| 145 |
+
event_nuggets_idxs = [(nugget["nugget"]["startOffset"], nugget["nugget"]["endOffset"]) for nugget in event_nuggets]
|
| 146 |
+
for idx, (nugget_start, nugget_end) in enumerate(event_nuggets_idxs):
|
| 147 |
+
if (start_idx == nugget_start and end_idx == nugget_end) or (start_idx == nugget_start and end_idx <= nugget_end) or (start_idx == nugget_start and end_idx > nugget_end) or (end_idx == nugget_end and start_idx < nugget_start) or (start_idx <= nugget_start and end_idx <= nugget_end and end_idx > nugget_start):
|
| 148 |
+
return "B-" + event_nuggets[idx]["subtype"]
|
| 149 |
+
elif (start_idx > nugget_start and end_idx <= nugget_end) or (start_idx > nugget_start and start_idx < nugget_end):
|
| 150 |
+
return "I-" + event_nuggets[idx]["subtype"]
|
| 151 |
+
return "O"
|
| 152 |
+
|
| 153 |
+
def get_entity_and_realis_from_idx(start_idx, end_idx, event_nuggets):
|
| 154 |
+
event_nuggets_idxs = [(nugget["nugget"]["startOffset"], nugget["nugget"]["endOffset"]) for nugget in event_nuggets]
|
| 155 |
+
for idx, (nugget_start, nugget_end) in enumerate(event_nuggets_idxs):
|
| 156 |
+
if (start_idx == nugget_start and end_idx == nugget_end) or (start_idx == nugget_start and end_idx <= nugget_end) or (start_idx == nugget_start and end_idx > nugget_end) or (end_idx == nugget_end and start_idx < nugget_start) or (start_idx <= nugget_start and end_idx <= nugget_end and end_idx > nugget_start):
|
| 157 |
+
return "B-" + event_nuggets[idx]["subtype"], "B-" + event_nuggets[idx]["realis"]
|
| 158 |
+
elif (start_idx > nugget_start and end_idx <= nugget_end) or (start_idx > nugget_start and start_idx < nugget_end):
|
| 159 |
+
return "I-" + event_nuggets[idx]["subtype"], "I-" + event_nuggets[idx]["realis"]
|
| 160 |
+
return "O", "O"
|
| 161 |
+
|
| 162 |
+
def get_args_entity_from_idx(start_idx, end_idx, event_args):
|
| 163 |
+
event_nuggets_idxs = [(nugget["startOffset"], nugget["endOffset"]) for nugget in event_args]
|
| 164 |
+
for idx, (nugget_start, nugget_end) in enumerate(event_nuggets_idxs):
|
| 165 |
+
if (start_idx == nugget_start and end_idx == nugget_end) or (start_idx == nugget_start and end_idx <= nugget_end) or (start_idx == nugget_start and end_idx > nugget_end) or (end_idx == nugget_end and start_idx < nugget_start) or (start_idx <= nugget_start and end_idx <= nugget_end and end_idx > nugget_start):
|
| 166 |
+
return "B-" + event_args[idx]["type"]
|
| 167 |
+
elif (start_idx > nugget_start and end_idx <= nugget_end) or (start_idx > nugget_start and start_idx < nugget_end):
|
| 168 |
+
return "I-" + event_args[idx]["type"]
|
| 169 |
+
return "O"
|
| 170 |
+
|
| 171 |
+
def split_with_character(string, char):
|
| 172 |
+
result = []
|
| 173 |
+
start = 0
|
| 174 |
+
for i, c in enumerate(string):
|
| 175 |
+
if c == char:
|
| 176 |
+
result.append(string[start:i])
|
| 177 |
+
result.append(char)
|
| 178 |
+
start = i + 1
|
| 179 |
+
result.append(string[start:])
|
| 180 |
+
return [x for x in result if x != '']
|
| 181 |
+
|
| 182 |
+
def extend_list_with_character(content_list, character):
|
| 183 |
+
content_as_words = []
|
| 184 |
+
for word in content_list:
|
| 185 |
+
if character in word:
|
| 186 |
+
split_list = split_with_character(word, character)
|
| 187 |
+
content_as_words.extend(split_list)
|
| 188 |
+
else:
|
| 189 |
+
content_as_words.append(word)
|
| 190 |
+
return content_as_words
|
| 191 |
+
|
| 192 |
+
def find_dict_by_overlap(list_of_dicts, key_value_pairs):
|
| 193 |
+
for dictionary in list_of_dicts:
|
| 194 |
+
if max(dictionary["start"], dictionary["end"]) >= min(key_value_pairs["start"], key_value_pairs["end"]) and max(key_value_pairs["start"], key_value_pairs["end"]) >= min(dictionary["start"], dictionary["end"]):
|
| 195 |
+
return dictionary
|
| 196 |
+
return None
|
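A short example of the offset-recovery helper get_idxs_from_text defined above; the text and token list are made up:

# Hypothetical example, not part of the uploaded module.
from cybersecurity_knowledge_graph.utils import get_idxs_from_text

text = "Hackers stole data."
tokens = ["Hackers", "stole", "data", "."]
for entry in get_idxs_from_text(text, tokens):
    # Each entry maps a token back to its character span in the original text.
    print(entry["word"], entry["start_idx"], entry["end_idx"])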