Spaces: Runtime error
Fix Complexity function
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
 import re
 import json
 import nltk
-import stanza
 from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
 from sentence_transformers import CrossEncoder
 from autocorrect import Speller
@@ -10,33 +9,7 @@ from transformers import BertTokenizer, BertForSequenceClassification
 import torch
 from torch.nn.utils.rnn import pad_sequence
 import numpy as np
-
-
-# ********************* Setting up Stanford CoreNLP *********************
-
-# Download the Stanford CoreNLP package with Stanza's installation command
-# This'll take several minutes, depending on the network speed
-#corenlp_dir = './corenlp'
-#stanza.install_corenlp(dir=corenlp_dir)
-
-# Set the CORENLP_HOME environment variable to point to the installation location
-#import os
-#os.environ["CORENLP_HOME"] = corenlp_dir
-
-# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
-#client = CoreNLPClient(
-#    annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'parse'],
-#    memory='4G',
-#    endpoint='http://localhost:9001',
-#    be_quiet=True)
-#print(client)
-
-# Start the background server and wait for some time
-# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
-#client.start()
-#import time; time.sleep(10)
-
-# ************************************************************************
+import spacy


 # ***************************** TGRL Parsing *****************************
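Note: the spaCy-based clause splitting added further down loads the en_core_web_sm pipeline at call time, so that model has to be available in the Space's environment. A minimal, hypothetical guard for the load (not part of this commit) could look like:

import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # model not installed yet: download it once, then load it
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")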
@@ -169,115 +142,14 @@ def get_long_elements(elements): # Using RegEx
     return "Long elements:\nNone."
 # #####################################
 
-'''
 # ######### Complex Sentences #########
-# Complex sentences
-
-def get_verb_phrases(t):
-    verb_phrases = []
-    num_children = len(t)
-    num_VP = sum(1 if t[i].label() == "VP" else 0 for i in range(0, num_children))
-
-    if t.label() != "VP":
-        for i in range(0, num_children):
-            if t[i].height() > 2:
-                verb_phrases.extend(get_verb_phrases(t[i]))
-    elif t.label() == "VP" and num_VP > 1:
-        for i in range(0, num_children):
-            if t[i].label() == "VP":
-                if t[i].height() > 2:
-                    verb_phrases.extend(get_verb_phrases(t[i]))
-    else:
-        verb_phrases.append(' '.join(t.leaves()))
-
-    return verb_phrases
-
-def get_pos(t):
-    vp_pos = []
-    sub_conj_pos = []
-    num_children = len(t)
-    children = [t[i].label() for i in range(0,num_children)]
-
-    flag = re.search(r"(S|SBAR|SBARQ|SINV|SQ)", ' '.join(children))
-
-    if "VP" in children and not flag:
-        for i in range(0, num_children):
-            if t[i].label() == "VP":
-                vp_pos.append(t[i].treeposition())
-    elif not "VP" in children and not flag:
-        for i in range(0, num_children):
-            if t[i].height() > 2:
-                temp1,temp2 = get_pos(t[i])
-                vp_pos.extend(temp1)
-                sub_conj_pos.extend(temp2)
-    # comment this "else" part, if want to include subordinating conjunctions
-    else:
-        for i in range(0, num_children):
-            if t[i].label() in ["S","SBAR","SBARQ","SINV","SQ"]:
-                temp1, temp2 = get_pos(t[i])
-                vp_pos.extend(temp1)
-                sub_conj_pos.extend(temp2)
-            else:
-                sub_conj_pos.append(t[i].treeposition())
-
-    return (vp_pos,sub_conj_pos)
-
-# get all clauses
-def get_clause_list(sent):
-
-    parser = client.annotate(sent, properties={"annotators":"parse","outputFormat": "json"})
-    sent_tree = nltk.tree.ParentedTree.fromstring(parser["sentences"][0]["parse"])
-    #print(sent_tree)
-    clause_level_list = ["S","SBAR","SBARQ","SINV","SQ"]
-    clause_list = []
-    sub_trees = []
-    #sent_tree.pretty_print()
-
-    # break the tree into subtrees of clauses using
-    # clause levels "S","SBAR","SBARQ","SINV","SQ"
-    for sub_tree in reversed(list(sent_tree.subtrees())):
-        if sub_tree.label() in clause_level_list:
-            if sub_tree.parent().label() in clause_level_list:
-                continue
-
-            if (len(sub_tree) == 1 and sub_tree.label() == "S" and sub_tree[0].label() == "VP"
-                and not sub_tree.parent().label() in clause_level_list):
-                continue
-
-            sub_trees.append(sub_tree)
-            del sent_tree[sub_tree.treeposition()]
-
-    #print(sub_trees)
-
-    # for each clause level subtree, extract relevant simple sentence
-    for t in sub_trees:
-        # get verb phrases from the new modified tree
-        verb_phrases = get_verb_phrases(t)
-        #print(verb_phrases)
-
-        # get tree without verb phrases (mainly subject)
-        # remove subordinating conjunctions
-        vp_pos,sub_conj_pos = get_pos(t)
-        for i in vp_pos:
-            del t[i]
-        for i in sub_conj_pos:
-            del t[i]
-
-        subject_phrase = ' '.join(t.leaves())
-
-        # update the clause_list
-        for i in verb_phrases:
-            clause_list.append(subject_phrase + " " + i)
-
-    return clause_list
-
 def get_complex_sentences(elements):
 
     complex_sentences = []
 
     for key, value in elements.items():
         for i in range(0, len(elements[key])):
-            if len(get_clause_list(elements[key][i])) > 1:
+            if len(get_clauses_list(elements[key][i])) > 1:
                 complex_sentences.append(elements[key][i])
 
     if complex_sentences:
@@ -285,8 +157,81 @@ def get_complex_sentences(elements):
         return "Complex sentences:\n" + complex_sentences
     else:
         return "Complex sentences:\nNone."
-
-
+
+def find_root_of_sentence(doc):
+    root_token = None
+    for token in doc:
+        if (token.dep_ == "ROOT"):
+            root_token = token
+    return root_token
+
+def find_other_verbs(doc, root_token):
+    other_verbs = []
+    for token in doc:
+        ancestors = list(token.ancestors)
+        if (token.pos_ == "VERB" and len(ancestors) == 1 and ancestors[0] == root_token):
+            other_verbs.append(token)
+    return other_verbs
+
+# find the token spans for each verb
+def get_clause_token_span_for_verb(verb, doc, all_verbs):
+    first_token_index = len(doc)
+    last_token_index = 0
+    this_verb_children = list(verb.children)
+    for child in this_verb_children:
+        if (child not in all_verbs):
+            if (child.i < first_token_index):
+                first_token_index = child.i
+            if (child.i > last_token_index):
+                last_token_index = child.i
+    return(first_token_index, last_token_index)
+
+def get_clauses_list(sent):
+
+    nlp = spacy.load('en_core_web_sm')
+
+    doc = nlp(sent)
+
+    # find part of speech, dependency tag, ancestors, and children of each token
+    for token in doc:
+        ancestors = [t.text for t in token.ancestors]
+        children = [t.text for t in token.children]
+        #print(token.text, "\t", token.i, "\t", token.pos_, "\t", token.dep_, "\t", ancestors, "\t", children)
+
+    # find the root token of the sentence
+    root_token = find_root_of_sentence(doc)
+
+    # find the other verbs
+    other_verbs = find_other_verbs(doc, root_token)
+
+    # put together all the verbs in one array and process each using get_clause_token_span_for_verb function
+    # this will return a tuple of start and end indices for each verb's clause
+    token_spans = []
+    all_verbs = [root_token] + other_verbs
+    for other_verb in all_verbs:
+        (first_token_index, last_token_index) = \
+            get_clause_token_span_for_verb(other_verb,
+                doc, all_verbs)
+        token_spans.append((first_token_index,
+            last_token_index))
+
+    # put together token spans for each clause
+    sentence_clauses = []
+    for token_span in token_spans:
+        start = token_span[0]
+        end = token_span[1]
+        if (start < end):
+            clause = doc[start:end]
+            sentence_clauses.append(clause)
+    sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0])
+
+    # get the final result
+    clauses_text = [clause.text for clause in sentence_clauses]
+    #print(clauses_text)
+    return clauses_text
+
+# #####################################
+
 # ########## Punctuations #########
 def get_punctuations(elements):
 
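As a rough illustration of what the new helpers do (the sentence below is made up, and the exact clause boundaries depend on the en_core_web_sm parse), get_clauses_list returns one text span per verb, and get_complex_sentences flags any element that yields more than one:

# hypothetical usage for local testing, not part of the commit
clauses = get_clauses_list("The system validates the order and the clerk ships the package.")
print(clauses)           # expected: two clause strings, one per verb
print(len(clauses) > 1)  # True would mark this element as a complex sentence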
@@ -574,8 +519,8 @@ def identify_bad_smells(tgrl_file, selected_bad_smells):
     if 'Size' in selected_bad_smells:
         output = output + get_long_elements(elements) + "\n\n"
 
-
-
+    if 'Complexity' in selected_bad_smells:
+        output = output + get_complex_sentences(elements) + "\n\n"
 
     if 'Punctuations' in selected_bad_smells:
         output = output + get_punctuations(elements) + "\n\n"
@@ -610,12 +555,11 @@ def identify_bad_smells(tgrl_file, selected_bad_smells):
 
 interface = gr.Interface(fn = identify_bad_smells,
     inputs = [gr.File(label="TGRL File"),
-        gr.CheckboxGroup(["Size", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
+        gr.CheckboxGroup(["Size", "Complexity", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
         label="Which bad smells you want to detect?")],
     outputs = ["text"],
     title = "TGRL Bad Smells Detection",
     description = "Upload your .xgrl file and we will find the bad smells for you!")
-#"Complexity"
 
-
-
+
+interface.launch(inline = False)
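Because "Complexity" is now one of the CheckboxGroup choices, Gradio passes it to identify_bad_smells inside the selected_bad_smells list, which is exactly what the new 'Complexity' in selected_bad_smells branch checks. A stripped-down sketch of that wiring (toy callback, not the Space's code):

import gradio as gr

def report(file, smells):
    # smells arrives as a list of the selected labels, e.g. ["Size", "Complexity"]
    return "Selected checks: " + ", ".join(smells)

demo = gr.Interface(fn=report,
                    inputs=[gr.File(label="TGRL File"),
                            gr.CheckboxGroup(["Size", "Complexity"], label="Which bad smells you want to detect?")],
                    outputs=["text"])
demo.launch(inline=False)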