import gradio as gr
import re
import json
import numpy as np
import nltk
import stanza
from stanza.models.constituency.parse_tree import Tree
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
from transformers import BertTokenizer, BertForSequenceClassification
from sentence_transformers import CrossEncoder
from autocorrect import Speller
import torch
from torch.nn.utils.rnn import pad_sequence
# ***************************** Load needed models *****************************
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
pos_tokenizer = AutoTokenizer.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
pos_model = AutoModelForTokenClassification.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
sentences_similarity_model = CrossEncoder('cross-encoder/stsb-roberta-base')
nli_model = BertForSequenceClassification.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token="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")
nli_tokenizer = BertTokenizer.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token="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", do_lower_case=True)
# ***************************** TGRL Parsing *****************************
def parse_tgrl(file_obj):
    with open(file_obj.name, 'r') as f:
        tgrl_text = f.read()
    tgrl_text = tgrl_text.replace('\t', '')
    tgrl_text = tgrl_text.replace('\n', '')
    return tgrl_text
def extract_elements(tgrl_text):
    # Extract actors
    actors = re.findall(r"(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract goals
    goals = re.findall(r"(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract softGoals
    softGoals = re.findall(r"(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract tasks
    tasks = re.findall(r"(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract resources
    resources = re.findall(r"(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)

    elements = {
        "actors": actors,
        "goals": goals,
        "softGoals": softGoals,
        "tasks": tasks,
        "resources": resources
    }
    # Group goals, softgoals, and tasks by the actor that declares them
    elements_per_actor = {}
    for element in goals + softGoals + tasks:
        corresponding_actor = tgrl_text.rfind('actor', 0, tgrl_text.index(element))
        corresponding_actor = re.split(' |{', tgrl_text[corresponding_actor:])[1]
        if corresponding_actor not in elements_per_actor:
            elements_per_actor[corresponding_actor] = []
        elements_per_actor[corresponding_actor].append(element)
    # Get decomposed elements
    new_lines = tgrl_text
    decomposed_elements = {}
    main_elements = re.findall(r"\w+(?=\s+decomposedBy)", new_lines)
    for main_element in main_elements:
        sub_elements = []
        sub_element = re.findall(main_element + r"(?: decomposedBy )([A-Za-z\s]*)", new_lines)[0]
        sub_elements.append(sub_element)
        new_lines = new_lines.replace(sub_element + ', ', '')
        temp = main_element + " decomposedBy "
        for idx, sub_element in enumerate(sub_elements):
            if idx + 1 == len(sub_elements):
                temp = temp + sub_element + ";"
            else:
                temp = temp + sub_element + ", "
        # Keep collecting sub-elements until the rebuilt "decomposedBy" statement matches the original text
        while temp not in tgrl_text:
            sub_element = re.findall(main_element + r"(?: decomposedBy )([A-Za-z\s]*)", new_lines)[0]
            sub_elements.append(sub_element)
            new_lines = new_lines.replace(sub_element + ', ', '')
            temp = main_element + " decomposedBy "
            for idx, sub_element in enumerate(sub_elements):
                if idx + 1 == len(sub_elements):
                    temp = temp + sub_element + ";"
                else:
                    temp = temp + sub_element + ", "
        decomposed_elements[main_element] = sub_elements
    # Replace element IDs with their names
    new_decomposed_elements = {}
    for key, _ in decomposed_elements.items():
        new_key = re.findall(r"(?:" + key + r"\s*{\s*name\s=\s\")([A-Za-z\s]*)", tgrl_text)[0]
        new_values = []
        for element in decomposed_elements[key]:
            new_value = re.findall(r"(?:" + element + r"\s*{\s*name\s=\s\")([A-Za-z\s]*)", tgrl_text)[0]
            new_values.append(new_value)
        new_decomposed_elements[new_key] = new_values

    return elements, elements_per_actor, new_decomposed_elements
# ************************************************************************
# ************************* Bad Smells Detection *************************
# ########### Long Elements ###########
def get_long_elements(elements, size_threshold):  # Using RegEx
    long_elements = []
    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            if len(re.findall(r'\w+', elements[key][i])) > size_threshold:
                long_elements.append(elements[key][i])
    if long_elements:
        long_elements = "\n".join(long_elements)
        return "Long elements:\n" + long_elements
    else:
        return "Long elements:\nNone."
# #####################################
# ######### Complex Sentences #########
def is_complex_sentence(sentence):
    # Reuse the globally loaded stanza pipeline instead of rebuilding it on every call
    doc = nlp(sentence)
    # A sentence is complex if any of its constituency trees contains a subordinate clause (SBAR)
    for parsed_sentence in doc.sentences:
        unique_constituent_labels = Tree.get_unique_constituent_labels(parsed_sentence.constituency)
        if 'SBAR' in unique_constituent_labels:
            return True
    return False
def get_complex_sentences(elements):
    complex_sentences = []
    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            if is_complex_sentence(elements[key][i]):
                complex_sentences.append(elements[key][i])
    if complex_sentences:
        complex_sentences = "\n".join(complex_sentences)
        return "Complex sentences:\n" + complex_sentences
    else:
        return "Complex sentences:\nNone."
# #####################################
# ########## Punctuations #########
def get_punctuations(elements):
    punctuations = []
    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            if len(re.findall(r"[^\s\w\d-]", elements[key][i])) > 0:
                punctuations.append(elements[key][i])
    if punctuations:
        punctuations = "\n".join(punctuations)
        return "Punctuations:\n" + punctuations
    else:
        return "Punctuations:\nNone."
# #################################
# ########## Incorrect Actor Syntax ##########
def find_non_NPs(sentences):
    # Keep sentences whose first token is not tagged as a noun
    pipeline = TokenClassificationPipeline(model=pos_model, tokenizer=pos_tokenizer)
    outputs = pipeline(sentences)
    non_NPs = []
    for idx, output in enumerate(outputs):
        if not output[0]['entity'].startswith('N'):
            non_NPs.append(sentences[idx])
    return non_NPs
def check_actor_syntax(actors):
    incorrect_actor_syntax = find_non_NPs(actors)
    if incorrect_actor_syntax:
        incorrect_actor_syntax = "\n".join(incorrect_actor_syntax)
        return "Incorrect Actors Syntax:\n" + incorrect_actor_syntax
    else:
        return "All actors are syntactically correct."
# ############################################
# ########## Incorrect Goal Syntax ###########
def check_goal_syntax(goals):
    incorrect_goal_syntax = find_non_NPs(goals)
    if incorrect_goal_syntax:
        incorrect_goal_syntax = "\n".join(incorrect_goal_syntax)
        return "Incorrect Goals Syntax:\n" + incorrect_goal_syntax
    else:
        return "All goals are syntactically correct."
# ############################################
# ########## Incorrect Softgoal Syntax ###########
def check_softgoal_syntax(softgoals):
    incorrect_softgoal_syntax = find_non_NPs(softgoals)
    if incorrect_softgoal_syntax:
        incorrect_softgoal_syntax = "\n".join(incorrect_softgoal_syntax)
        return "Incorrect Softgoals Syntax:\n" + incorrect_softgoal_syntax
    else:
        return "All softgoals are syntactically correct."
# ############################################
# ########## Incorrect Task Syntax ###########
def find_non_VPs(sentences):
    # Keep sentences whose first token is not tagged as a verb
    pipeline = TokenClassificationPipeline(model=pos_model, tokenizer=pos_tokenizer)
    outputs = pipeline(sentences)
    non_VPs = []
    for idx, output in enumerate(outputs):
        if not output[0]['entity'].startswith('V'):
            non_VPs.append(sentences[idx])
    return non_VPs
def check_task_syntax(tasks):
    incorrect_task_syntax = find_non_VPs(tasks)
    if incorrect_task_syntax:
        incorrect_task_syntax = "\n".join(incorrect_task_syntax)
        return "Incorrect Tasks Syntax:\n" + incorrect_task_syntax
    else:
        return "All tasks are syntactically correct."
# ############################################
# ########## Similarity ###########
def get_similar_elements(elements_per_actor, similarity_threshold):
    # Prepare sentence pairs (all pairwise combinations within each actor)
    sentence_pairs = []
    for key, value in elements_per_actor.items():
        for i in range(len(elements_per_actor[key])):
            for j in range(i + 1, len(elements_per_actor[key])):
                sentence_pairs.append([elements_per_actor[key][i], elements_per_actor[key][j]])
    # Predict semantic similarity
    semantic_similarity_scores = sentences_similarity_model.predict(sentence_pairs, show_progress_bar=True)
    similar_elements = []
    for index, value in enumerate(sentence_pairs):
        if semantic_similarity_scores[index] > similarity_threshold:
            similar_elements.append(value)
    if similar_elements:
        similar_elements = [' and '.join(ele) for ele in similar_elements]
        similar_elements = "\n".join(similar_elements)
        return "The following elements are semantically similar:\n" + similar_elements
    else:
        return "There are no similar elements."
# #################################
# ########## Misspelling ###########
def get_misspelled_words(sentence):
    spell = Speller(only_replacements=True)
    misspelled = []
    for word in sentence.split():
        correct_word = spell(word)
        if word != correct_word:
            misspelled.append([word, correct_word])
    return misspelled

def check_spelling(elements):
    spelling_mistakes = []
    spelling_mistakes_string = ""
    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            # Run the spell checker once per element and reuse the result
            misspelled_words = get_misspelled_words(elements[key][i])
            if misspelled_words:
                spelling_mistakes.append([elements[key][i], misspelled_words])
    for idx, element in enumerate(spelling_mistakes):
        for spelling_mistake in element[1]:
            temp = ' should be written as '.join(spelling_mistake)
            spelling_mistakes_string = spelling_mistakes_string + "\n" + element[0] + ": " + temp
    return spelling_mistakes_string
# ##################################
# ########## NLI ###########
def do_nli(premise, hypothesis):
    # Tokenization
    token_ids = []
    seg_ids = []
    mask_ids = []
    premise_id = nli_tokenizer.encode(premise, add_special_tokens=False)
    hypothesis_id = nli_tokenizer.encode(hypothesis, add_special_tokens=False)
    pair_token_ids = [nli_tokenizer.cls_token_id] + premise_id + [nli_tokenizer.sep_token_id] + hypothesis_id + [nli_tokenizer.sep_token_id]
    premise_len = len(premise_id)
    hypothesis_len = len(hypothesis_id)
    segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
    attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values
    token_ids.append(torch.tensor(pair_token_ids))
    seg_ids.append(segment_ids)
    mask_ids.append(attention_mask_ids)
    # Forward pass
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    with torch.no_grad():
        output = nli_model(token_ids,
                           token_type_ids=seg_ids,
                           attention_mask=mask_ids)
    # Output prediction
    result = ""
    prediction = np.argmax(output.logits.cpu().numpy()).flatten().item()
    if prediction == 0:
        result = "Entailment"
    elif prediction == 1:
        result = "Contradiction"
    elif prediction == 2:
        result = "Neutral"
    return result
# Entailment
def check_entailment(decomposed_elements):
    sentence_pairs = []
    non_matching_elements = []
    for key, value in decomposed_elements.items():
        for i in decomposed_elements[key]:
            sentence_pairs.append([key, i])
    for sentence_pair in sentence_pairs:
        result = do_nli(sentence_pair[0], sentence_pair[1])
        if result != "Entailment":
            non_matching_elements.append(sentence_pair)
    if non_matching_elements:
        non_matching_elements = [' and '.join(ele) for ele in non_matching_elements]
        non_matching_elements = "\n".join(non_matching_elements)
        return "The following elements are mismatched:\n" + non_matching_elements
    else:
        return "There are no mismatched elements."
# Contradiction
def check_contradiction(elements_per_actor):
    sentence_pairs = []
    contradicting_elements = []
    for key, value in elements_per_actor.items():
        for i in range(len(elements_per_actor[key])):
            for j in range(i + 1, len(elements_per_actor[key])):
                sentence_pairs.append([elements_per_actor[key][i], elements_per_actor[key][j]])
    # Check contradiction
    for sentence_pair in sentence_pairs:
        result = do_nli(sentence_pair[0], sentence_pair[1])
        if result == "Contradiction":
            contradicting_elements.append(sentence_pair)
    if contradicting_elements:
        contradicting_elements = [' and '.join(ele) for ele in contradicting_elements]
        contradicting_elements = "\n".join(contradicting_elements)
        return "The following elements are contradicting:\n" + contradicting_elements
    else:
        return "There are no contradicting elements."
# ##########################
# ************************* User Interface *************************
def identify_bad_smells(tgrl_file, selected_bad_smells, size_threshold, similarity_threshold):
    output = ""
    tgrl_text = parse_tgrl(tgrl_file)
    elements, elements_per_actor, decomposed_elements = extract_elements(tgrl_text)
    if 'Size' in selected_bad_smells:
        output = output + get_long_elements(elements, size_threshold) + "\n\n"
    if 'Complexity' in selected_bad_smells:
        output = output + get_complex_sentences(elements) + "\n\n"
    if 'Punctuations' in selected_bad_smells:
        output = output + get_punctuations(elements) + "\n\n"
    if 'Actors Syntax' in selected_bad_smells:
        output = output + check_actor_syntax(elements['actors']) + "\n\n"
    if 'Goals Syntax' in selected_bad_smells:
        output = output + check_goal_syntax(elements['goals']) + "\n\n"
    if 'Softgoals Syntax' in selected_bad_smells:
        output = output + check_softgoal_syntax(elements['softGoals']) + "\n\n"
    if 'Tasks Syntax' in selected_bad_smells:
        output = output + check_task_syntax(elements['tasks']) + "\n\n"
    if 'Similar Elements' in selected_bad_smells:
        output = output + get_similar_elements(elements_per_actor, similarity_threshold) + "\n\n"
    if 'Spelling Mistakes' in selected_bad_smells:
        output = output + check_spelling(elements) + "\n\n"
    if 'Goal-Subgoal Mismatch' in selected_bad_smells:
        output = output + check_entailment(decomposed_elements) + "\n\n"
    if 'Contradicting Elements' in selected_bad_smells:
        output = output + check_contradiction(elements_per_actor) + "\n\n"
    return output
interface = gr.Interface(fn=identify_bad_smells,
                         inputs=[gr.File(label="TGRL File"),
                                 gr.CheckboxGroup(["Size", "Complexity", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
                                                  label="Which bad smells do you want to detect?"),
                                 gr.Slider(label="Size threshold", value=5, minimum=2, maximum=10, step=1),
                                 gr.Slider(label="Similarity threshold", value=0.6, minimum=0, maximum=1, step=0.1)],
                         outputs=[gr.Textbox(label="Detected bad smells:")],
                         title="TGRL Bad Smells Detection",
                         description="Upload your .xgrl file and we will find the bad smells for you!",
                         theme=gr.themes.Soft())

interface.launch(inline=False)