import gradio as gr
import re
import json
import numpy as np
import nltk
import stanza
from stanza.models.constituency.parse_tree import Tree
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
from transformers import BertTokenizer, BertForSequenceClassification
from sentence_transformers import CrossEncoder
from autocorrect import Speller
import torch
from torch.nn.utils.rnn import pad_sequence
# ***************************** Load needed models *****************************
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
pos_tokenizer = AutoTokenizer.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
pos_model = AutoModelForTokenClassification.from_pretrained("QCRI/bert-base-multilingual-cased-pos-english")
sentences_similarity_model = CrossEncoder('cross-encoder/stsb-roberta-base')
nli_model = BertForSequenceClassification.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token="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")
nli_tokenizer = BertTokenizer.from_pretrained("nouf-sst/bert-base-MultiNLI", use_auth_token="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", do_lower_case=True)
# ***************************** TGRL Parsing *****************************
def parse_tgrl(file_obj):
    with open(file_obj.name, 'r') as f:
        tgrl_text = f.read()
    tgrl_text = tgrl_text.replace('\t', '')
    tgrl_text = tgrl_text.replace('\n', '')
    return tgrl_text
def extract_elements(tgrl_text):
    # Extract actors
    actors = re.findall(r"(?:.*?actor\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract goals
    goals = re.findall(r"(?:.*?goal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract softGoals
    softGoals = re.findall(r"(?:.*?softGoal\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract tasks
    tasks = re.findall(r"(?:.*?task\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)
    # Extract resources
    resources = re.findall(r"(?:.*?resource\s\S+\s?{\s?name\s?=\s?\")([A-Za-z\s;.,!?:-]*)(?:\")", tgrl_text)

    elements = {
        "actors": actors,
        "goals": goals,
        "softGoals": softGoals,
        "tasks": tasks,
        "resources": resources
    }
    # Group goals, softgoals, and tasks by the actor that declares them
    elements_per_actor = {}
    for element in goals + softGoals + tasks:
        corresponding_actor = tgrl_text.rfind('actor', 0, tgrl_text.index(element))
        corresponding_actor = re.split(' |{', tgrl_text[corresponding_actor:])[1]
        if corresponding_actor not in elements_per_actor:
            elements_per_actor[corresponding_actor] = []
        elements_per_actor[corresponding_actor].append(element)
    # Get decomposed elements
    new_lines = tgrl_text
    decomposed_elements = {}
    main_elements = re.findall(r"\w+(?=\s+decomposedBy)", new_lines)
    for main_element in main_elements:
        sub_elements = []
        sub_element = re.findall(main_element + r"(?: decomposedBy )([A-Za-z\s]*)", new_lines)[0]
        sub_elements.append(sub_element)
        new_lines = new_lines.replace(sub_element + ', ', '')
        temp = main_element + " decomposedBy "
        for idx, sub_element in enumerate(sub_elements):
            if idx + 1 == len(sub_elements):
                temp = temp + sub_element + ";"
            else:
                temp = temp + sub_element + ", "
        # Keep collecting sub-elements until the rebuilt "decomposedBy" statement matches the original text
        while temp not in tgrl_text:
            sub_element = re.findall(main_element + r"(?: decomposedBy )([A-Za-z\s]*)", new_lines)[0]
            sub_elements.append(sub_element)
            new_lines = new_lines.replace(sub_element + ', ', '')
            temp = main_element + " decomposedBy "
            for idx, sub_element in enumerate(sub_elements):
                if idx + 1 == len(sub_elements):
                    temp = temp + sub_element + ";"
                else:
                    temp = temp + sub_element + ", "
        decomposed_elements[main_element] = sub_elements
    # Replace element IDs with their names
    new_decomposed_elements = {}
    for key, _ in decomposed_elements.items():
        new_key = re.findall(r"(?:" + key + r"\s*{\s*name\s=\s\")([A-Za-z\s]*)", tgrl_text)[0]
        new_values = []
        for element in decomposed_elements[key]:
            new_value = re.findall(r"(?:" + element + r"\s*{\s*name\s=\s\")([A-Za-z\s]*)", tgrl_text)[0]
            new_values.append(new_value)
        new_decomposed_elements[new_key] = new_values

    return elements, elements_per_actor, new_decomposed_elements
# ************************************************************************
# ************************* Bad Smells Detection *************************
# ########### Long Elements ###########
def get_long_elements(elements, size_threshold):  # Using RegEx
    long_elements = []
    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            if len(re.findall(r'\w+', elements[key][i])) > size_threshold:
                long_elements.append(elements[key][i])
    if long_elements:
        long_elements = "\n".join(long_elements)
        return "Long elements:\n" + long_elements
    else:
        return "Long elements:\nNone."
# #####################################
# ######### Complex Sentences #########
def is_complex_sentence(sentence):
    # Reuse the globally loaded stanza pipeline instead of rebuilding it on every call
    doc = nlp(sentence)
    # A sentence is complex if any of its constituency trees contains a subordinate clause (SBAR)
    for parsed_sentence in doc.sentences:
        unique_constituent_labels = Tree.get_unique_constituent_labels(parsed_sentence.constituency)
        if 'SBAR' in unique_constituent_labels:
            return True
    return False
def get_complex_sentences(elements):
    complex_sentences = []
    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            if is_complex_sentence(elements[key][i]):
                complex_sentences.append(elements[key][i])
    if complex_sentences:
        complex_sentences = "\n".join(complex_sentences)
        return "Complex sentences:\n" + complex_sentences
    else:
        return "Complex sentences:\nNone."
# #####################################
# ########## Punctuations #########
def get_punctuations(elements):
    punctuations = []
    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            if len(re.findall(r"[^\s\w\d-]", elements[key][i])) > 0:
                punctuations.append(elements[key][i])
    if punctuations:
        punctuations = "\n".join(punctuations)
        return "Punctuations:\n" + punctuations
    else:
        return "Punctuations:\nNone."
# #################################
# ########## Incorrect Actor Syntax ##########
def find_non_NPs(sentences):
    # Keep sentences whose first token is not tagged as a noun
    pipeline = TokenClassificationPipeline(model=pos_model, tokenizer=pos_tokenizer)
    outputs = pipeline(sentences)
    non_NPs = []
    for idx, output in enumerate(outputs):
        if not output[0]['entity'].startswith('N'):
            non_NPs.append(sentences[idx])
    return non_NPs
def check_actor_syntax(actors):
    incorrect_actor_syntax = find_non_NPs(actors)
    if incorrect_actor_syntax:
        incorrect_actor_syntax = "\n".join(incorrect_actor_syntax)
        return "Incorrect Actors Syntax:\n" + incorrect_actor_syntax
    else:
        return "All actors are syntactically correct."
# ############################################
# ########## Incorrect Goal Syntax ###########
def check_goal_syntax(goals):
    incorrect_goal_syntax = find_non_NPs(goals)
    if incorrect_goal_syntax:
        incorrect_goal_syntax = "\n".join(incorrect_goal_syntax)
        return "Incorrect Goals Syntax:\n" + incorrect_goal_syntax
    else:
        return "All goals are syntactically correct."
# ############################################
# ########## Incorrect Softgoal Syntax ###########
def check_softgoal_syntax(softgoals):
    incorrect_softgoal_syntax = find_non_NPs(softgoals)
    if incorrect_softgoal_syntax:
        incorrect_softgoal_syntax = "\n".join(incorrect_softgoal_syntax)
        return "Incorrect Softgoals Syntax:\n" + incorrect_softgoal_syntax
    else:
        return "All softgoals are syntactically correct."
# ############################################
# ########## Incorrect Task Syntax ###########
def find_non_VPs(sentences):
    # Keep sentences whose first token is not tagged as a verb
    pipeline = TokenClassificationPipeline(model=pos_model, tokenizer=pos_tokenizer)
    outputs = pipeline(sentences)
    non_VPs = []
    for idx, output in enumerate(outputs):
        if not output[0]['entity'].startswith('V'):
            non_VPs.append(sentences[idx])
    return non_VPs
def check_task_syntax(tasks):
    incorrect_task_syntax = find_non_VPs(tasks)
    if incorrect_task_syntax:
        incorrect_task_syntax = "\n".join(incorrect_task_syntax)
        return "Incorrect Tasks Syntax:\n" + incorrect_task_syntax
    else:
        return "All tasks are syntactically correct."
# ############################################
# ########## Similarity ###########
def get_similar_elements(elements_per_actor, similarity_threshold):
    # Prepare sentence pairs (all pairwise combinations within each actor)
    sentence_pairs = []
    for key, value in elements_per_actor.items():
        for i in range(len(elements_per_actor[key])):
            for j in range(i + 1, len(elements_per_actor[key])):
                sentence_pairs.append([elements_per_actor[key][i], elements_per_actor[key][j]])
    # Predict semantic similarity
    semantic_similarity_scores = sentences_similarity_model.predict(sentence_pairs, show_progress_bar=True)
    similar_elements = []
    for index, value in enumerate(sentence_pairs):
        if semantic_similarity_scores[index] > similarity_threshold:
            similar_elements.append(value)
    if similar_elements:
        similar_elements = [' and '.join(ele) for ele in similar_elements]
        similar_elements = "\n".join(similar_elements)
        return "The following elements are semantically similar:\n" + similar_elements
    else:
        return "There are no similar elements."
# #################################
# ########## Misspelling ###########
def get_misspelled_words(sentence):
    spell = Speller(only_replacements=True)
    misspelled = []
    for word in sentence.split():
        correct_word = spell(word)
        if word != correct_word:
            misspelled.append([word, correct_word])
    return misspelled

def check_spelling(elements):
    spelling_mistakes = []
    spelling_mistakes_string = ""
    for key, value in elements.items():
        for i in range(0, len(elements[key])):
            # Run the spell checker once per element and reuse the result
            misspelled_words = get_misspelled_words(elements[key][i])
            if misspelled_words:
                spelling_mistakes.append([elements[key][i], misspelled_words])
    for idx, element in enumerate(spelling_mistakes):
        for spelling_mistake in element[1]:
            temp = ' should be written as '.join(spelling_mistake)
            spelling_mistakes_string = spelling_mistakes_string + "\n" + element[0] + ": " + temp
    return spelling_mistakes_string
# ##################################
# ########## NLI ###########
def do_nli(premise, hypothesis):
    # Tokenization
    token_ids = []
    seg_ids = []
    mask_ids = []
    premise_id = nli_tokenizer.encode(premise, add_special_tokens=False)
    hypothesis_id = nli_tokenizer.encode(hypothesis, add_special_tokens=False)
    pair_token_ids = [nli_tokenizer.cls_token_id] + premise_id + [nli_tokenizer.sep_token_id] + hypothesis_id + [nli_tokenizer.sep_token_id]
    premise_len = len(premise_id)
    hypothesis_len = len(hypothesis_id)
    segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
    attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values
    token_ids.append(torch.tensor(pair_token_ids))
    seg_ids.append(segment_ids)
    mask_ids.append(attention_mask_ids)
    # Forward pass
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    with torch.no_grad():
        output = nli_model(token_ids,
                           token_type_ids=seg_ids,
                           attention_mask=mask_ids)
    # Output prediction
    result = ""
    prediction = np.argmax(output.logits.cpu().numpy()).flatten().item()
    if prediction == 0:
        result = "Entailment"
    elif prediction == 1:
        result = "Contradiction"
    elif prediction == 2:
        result = "Neutral"
    return result
# Entailment
def check_entailment(decomposed_elements):
    sentence_pairs = []
    non_matching_elements = []
    for key, value in decomposed_elements.items():
        for i in decomposed_elements[key]:
            sentence_pairs.append([key, i])
    for sentence_pair in sentence_pairs:
        result = do_nli(sentence_pair[0], sentence_pair[1])
        if result != "Entailment":
            non_matching_elements.append(sentence_pair)
    if non_matching_elements:
        non_matching_elements = [' and '.join(ele) for ele in non_matching_elements]
        non_matching_elements = "\n".join(non_matching_elements)
        return "The following elements are mismatched:\n" + non_matching_elements
    else:
        return "There are no mismatched elements."
# Contradiction
def check_contradiction(elements_per_actor):
    sentence_pairs = []
    contradicting_elements = []
    for key, value in elements_per_actor.items():
        for i in range(len(elements_per_actor[key])):
            for j in range(i + 1, len(elements_per_actor[key])):
                sentence_pairs.append([elements_per_actor[key][i], elements_per_actor[key][j]])
    # Check contradiction
    for sentence_pair in sentence_pairs:
        result = do_nli(sentence_pair[0], sentence_pair[1])
        if result == "Contradiction":
            contradicting_elements.append(sentence_pair)
    if contradicting_elements:
        contradicting_elements = [' and '.join(ele) for ele in contradicting_elements]
        contradicting_elements = "\n".join(contradicting_elements)
        return "The following elements are contradicting:\n" + contradicting_elements
    else:
        return "There are no contradicting elements."
# ##########################
# ************************* User Interface *************************
def identify_bad_smells(tgrl_file, selected_bad_smells, size_threshold, similarity_threshold):
    output = ""
    tgrl_text = parse_tgrl(tgrl_file)
    elements, elements_per_actor, decomposed_elements = extract_elements(tgrl_text)
    if 'Size' in selected_bad_smells:
        output = output + get_long_elements(elements, size_threshold) + "\n\n"
    if 'Complexity' in selected_bad_smells:
        output = output + get_complex_sentences(elements) + "\n\n"
    if 'Punctuations' in selected_bad_smells:
        output = output + get_punctuations(elements) + "\n\n"
    if 'Actors Syntax' in selected_bad_smells:
        output = output + check_actor_syntax(elements['actors']) + "\n\n"
    if 'Goals Syntax' in selected_bad_smells:
        output = output + check_goal_syntax(elements['goals']) + "\n\n"
    if 'Softgoals Syntax' in selected_bad_smells:
        output = output + check_softgoal_syntax(elements['softGoals']) + "\n\n"
    if 'Tasks Syntax' in selected_bad_smells:
        output = output + check_task_syntax(elements['tasks']) + "\n\n"
    if 'Similar Elements' in selected_bad_smells:
        output = output + get_similar_elements(elements_per_actor, similarity_threshold) + "\n\n"
    if 'Spelling Mistakes' in selected_bad_smells:
        output = output + check_spelling(elements) + "\n\n"
    if 'Goal-Subgoal Mismatch' in selected_bad_smells:
        output = output + check_entailment(decomposed_elements) + "\n\n"
    if 'Contradicting Elements' in selected_bad_smells:
        output = output + check_contradiction(elements_per_actor) + "\n\n"
    return output
interface = gr.Interface(fn=identify_bad_smells,
                         inputs=[gr.File(label="TGRL File"),
                                 gr.CheckboxGroup(["Size", "Complexity", "Punctuations", "Actors Syntax", "Goals Syntax", "Softgoals Syntax", "Tasks Syntax", "Similar Elements", "Spelling Mistakes", "Goal-Subgoal Mismatch", "Contradicting Elements"],
                                                  label="Which bad smells do you want to detect?"),
                                 gr.Slider(label="Size threshold", value=5, minimum=2, maximum=10, step=1),
                                 gr.Slider(label="Similarity threshold", value=0.6, minimum=0, maximum=1, step=0.1)],
                         outputs=[gr.Textbox(label="Detected bad smells:")],
                         title="TGRL Bad Smells Detection",
                         description="Upload your .xgrl file and we will find the bad smells for you!",
                         theme=gr.themes.Soft())

interface.launch(inline=False)