Spaces:

Towhidul
/

PECCAVI

Runtime error

App Files Files Community

Towhidul committed on Mar 27, 2024

Commit

1416b93

verified ·

1 Parent(s): 26f648a

Update app.py

Browse files

Files changed (1) hide show

app.py +149 -38

app.py CHANGED Viewed

@@ -27,6 +27,8 @@ import seaborn as sns
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from colorama import Fore, Style
 # import openai
 para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
@@ -62,28 +64,118 @@ def paraphrase(
     return res
-def find_longest_common_sequences(main_sentence, paraphrases):
-    main_tokens = main_sentence.split()
-    common_sequences = set()
-    for paraphrase in paraphrases:
-        paraphrase_tokens = paraphrase.split()
-        for i in range(len(main_tokens)):
-            for j in range(len(paraphrase_tokens)):
-                # Start comparing pairs of words
-                m = i
-                n = j
-                while m < len(main_tokens) and n < len(paraphrase_tokens) and main_tokens[m] == paraphrase_tokens[n]:
-                    m += 1
-                    n += 1
-                # If we found a longer common sequence, update it
-                if m - i > 1:
-                    sequence = ' '.join(main_tokens[i:m])
-                    is_subsequence = any(sequence in existing_seq for existing_seq in common_sequences)
-                    if not is_subsequence:
-                        common_sequences.add(sequence)
-    return sorted(common_sequences, key=len, reverse=True)
@@ -123,26 +215,45 @@ longest_common_sequences = find_longest_common_sequences(main_sentence, paraphra
 color_palette = ["#FF0000", "#008000", "#0000FF", "#FF00FF", "#00FFFF"]
 highlighted_sentences = []
-# Highlighting sequences in main sentence and paraphrases
-for sentence in [main_sentence] + paraphrases:
-    highlighted_sentence = sentence
-    for i, sequence in enumerate(longest_common_sequences):
-        color = color_palette[i % len(color_palette)]
-        highlighted_sentence = highlighted_sentence.replace(sequence, f"<span style='color:{color}'>{sequence}</span>")
-    highlighted_sentences.append(highlighted_sentence)
-# Display paraphrases with numbers
-st.markdown("**Paraphrases**:")
-for i, para in enumerate(paraphrases, 1):
-    st.write(f"Paraphrase {i}:")
-    st.write(para)
-# Displaying the main sentence with highlighted longest common sequences
-st.markdown("**Main sentence with highlighted longest common sequences**:")
-st.markdown(highlighted_sentences[0], unsafe_allow_html=True)
-st.markdown("**Paraphrases with highlighted longest common sequences**:")
-for paraphrase in highlighted_sentences[1:]:
-    st.markdown(paraphrase, unsafe_allow_html=True)

 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from colorama import Fore, Style
 # import openai
+import re
+from termcolor import colored
 para_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
     return res
+def remove_punctuations(text):
+    """Strip punctuation from ``text`` while keeping hyphens that join two word characters."""
+    # First pass: drop every character that is not a word character, whitespace, or a hyphen.
+    without_symbols = re.sub(r'[^\w\s-]', '', text)
+    # Second pass: a hyphen missing a word character on either side becomes a single space.
+    return re.sub(r'(?<!\w)-|-(?!\w)', ' ', without_symbols)
+def tokenize(sentence):
+    """Split ``sentence`` into whitespace-separated words after punctuation removal."""
+    # remove_punctuations is applied first; str.split() then discards any run of whitespace.
+    return remove_punctuations(sentence).split()
+def generate_bigrams(words):
+    """Return every pair of adjacent items of the list ``words`` as a list of 2-tuples."""
+    # Zipping the list with itself shifted by one yields (words[i], words[i+1]) for each i.
+    return list(zip(words, words[1:]))
+def hash_bigram(bigram):
+    """Return the built-in ``hash`` of ``bigram`` after converting it to a tuple."""
+    # Converting to a tuple lets list-shaped bigrams be hashed as well.
+    as_tuple = tuple(bigram)
+    return hash(as_tuple)
+def find_matching_words(sentence1, sentence2):
+    """Return the word bigrams of ``sentence2`` that also occur in ``sentence1``.
+
+    Both sentences are tokenized with ``tokenize`` and turned into adjacent word
+    pairs with ``generate_bigrams``.
+
+    Args:
+        sentence1: Reference sentence whose bigrams form the lookup set.
+        sentence2: Sentence whose bigrams are tested against that set.
+
+    Returns:
+        A list of ``(word, next_word)`` tuples in the order they appear in
+        ``sentence2``; a bigram that repeats in ``sentence2`` is listed each time.
+    """
+    # Compare the tuples themselves: a set of tuples gives the same constant-time
+    # lookup as a set of hash values without treating a hash collision as a match.
+    reference_bigrams = set(generate_bigrams(tokenize(sentence1)))
+    # The matched words are already known from the bigram, so they are returned
+    # directly. Re-locating them in the raw sentence with str.find() returned -1
+    # whenever cleaning had altered a token (e.g. "don't" -> "dont"), and the
+    # resulting slices were empty strings.
+    return [
+        bigram
+        for bigram in generate_bigrams(tokenize(sentence2))
+        if bigram in reference_bigrams
+    ]
+def combine_matching_bigrams(matching_bigrams):
+    """Merge chained bigrams into space-joined phrases.
+
+    A bigram extends the current phrase when its first word equals the second
+    word of the bigram just before it; otherwise it starts a new phrase.
+
+    Args:
+        matching_bigrams: Sequence of ``(word, next_word)`` pairs.
+
+    Returns:
+        A list of phrases, in input order. An empty input yields an empty list
+        rather than a list holding one empty string.
+    """
+    combined_words = []
+    current_words = []
+    previous_bigram = None
+    for bigram in matching_bigrams:
+        if previous_bigram is not None and bigram[0] == previous_bigram[1]:
+            # Chained onto the previous bigram: only its second word is new.
+            current_words.append(bigram[1])
+        else:
+            if current_words:
+                combined_words.append(' '.join(current_words))
+            current_words = list(bigram)
+        previous_bigram = bigram
+    # Flush the phrase that was still being built when the input ran out.
+    if current_words:
+        combined_words.append(' '.join(current_words))
+    return combined_words
+# Defined once above instead of being re-created on every loop iteration.
+matching_bigrams_list = []
+combined_words_list = []
+for paraphrase in paraphrases:
+    # Bigrams shared between the main sentence and this paraphrase
+    matching_words = find_matching_words(main_sentence, paraphrase)
+    matching_bigrams_list.append(matching_words)
+    # Phrases built from those bigrams, one list per paraphrase
+    combined_words = combine_matching_bigrams(matching_words)
+    combined_words_list.append(combined_words)
+def remove_overlapping(input_set):
+    """Return the strings of ``input_set`` that are not substrings of an already-kept string."""
+    kept = set()
+    # Longest first, so every string is checked against the longer strings that could contain it.
+    for candidate in sorted(input_set, key=len, reverse=True):
+        contained = False
+        for existing in kept:
+            if candidate in existing:
+                contained = True
+                break
+        if contained:
+            continue
+        kept.add(candidate)
+    return kept
+def find_longest_match(string1, string2):
+    """Return the longest run of whole words from ``string1`` found in ``string2``.
+
+    Both strings are split on whitespace, and a run only matches when it
+    appears in ``string2`` as consecutive whole words.
+
+    Args:
+        string1: Phrase whose contiguous word runs are the candidates.
+        string2: Phrase that must contain the run.
+
+    Returns:
+        The longest matching run joined by single spaces, measured in
+        characters; on a tie the run starting earliest in ``string1`` wins.
+        Returns ``''`` when no word is shared or either string is empty.
+    """
+    words1 = string1.split()
+    # Padding with spaces makes a plain substring test respect word boundaries.
+    padded_target = ' ' + ' '.join(string2.split()) + ' '
+    longest_match = ''
+    # Candidates are word runs rather than character slices, so a fragment of a
+    # word in string1 (the "cat" inside "xcat") can no longer count as a match.
+    for start in range(len(words1)):
+        for end in range(start + 1, len(words1) + 1):
+            candidate = ' '.join(words1[start:end])
+            if ' ' + candidate + ' ' not in padded_target:
+                # A run that is absent cannot reappear by adding more words.
+                break
+            if len(candidate) > len(longest_match):
+                longest_match = candidate
+    return longest_match
+common_substrings = set()
+highlighted_text = []
+# Narrow the candidate phrases one paraphrase at a time. This gives the same
+# result as chaining find_longest_match over every combination of one phrase
+# per list, but works for any number of lists (five nested loops raised
+# IndexError with fewer than five) and skips the repeated recomputation.
+if combined_words_list:
+    candidates = {phrase for phrase in combined_words_list[0] if phrase}
+    for phrases in combined_words_list[1:]:
+        candidates = {
+            find_longest_match(candidate, phrase)
+            for candidate in candidates
+            for phrase in phrases
+        }
+        # An empty match means "nothing in common" and can never grow again.
+        candidates.discard('')
+    common_substrings.update(candidates)
 color_palette = ["#FF0000", "#008000", "#0000FF", "#FF00FF", "#00FFFF"]
 highlighted_sentences = []
+import html
+# Longest first, ties alphabetical: iterating the set directly made the
+# display order vary between runs.
+ordered_substrings = sorted(remove_overlapping(common_substrings), key=lambda s: (-len(s), s))
+highlighted_text.extend(ordered_substrings)
+# Escape the sentence because it is rendered with unsafe_allow_html=True below.
+highlighted_sentence = html.escape(main_sentence)
+if ordered_substrings:
+    # A single regex pass over the escaped text, so markup inserted for one
+    # substring can never be matched again by a later substring.
+    highlight_pattern = re.compile('|'.join(re.escape(html.escape(s)) for s in ordered_substrings))
+    # termcolor's colored() emits ANSI terminal escapes, which Streamlit markdown
+    # does not render; an HTML span gives the intended white-on-blue highlight.
+    highlighted_sentence = highlight_pattern.sub(
+        lambda match: f"<span style='background-color:#0000FF;color:#FFFFFF'>{match.group(0)}</span>",
+        highlighted_sentence,
+    )
+st.markdown("Common substrings that occur in all five lists:")
+for substring in highlighted_text:
+    st.markdown(substring)
+st.markdown("\nHighlighted Main Sentence:")
+st.markdown(highlighted_sentence, unsafe_allow_html=True)
+# # Highlighting sequences in main sentence and paraphrases
+# for sentence in [main_sentence] + paraphrases:
+#     highlighted_sentence = sentence
+#     for i, sequence in enumerate(longest_common_sequences):
+#         color = color_palette[i % len(color_palette)]
+#         highlighted_sentence = highlighted_sentence.replace(sequence, f"<span style='color:{color}'>{sequence}</span>")
+#     highlighted_sentences.append(highlighted_sentence)
+# # Display paraphrases with numbers
+# st.markdown("**Paraphrases**:")
+# for i, para in enumerate(paraphrases, 1):
+#     st.write(f"Paraphrase {i}:")
+#     st.write(para)
+# # Displaying the main sentence with highlighted longest common sequences
+# st.markdown("**Main sentence with highlighted longest common sequences**:")
+# st.markdown(highlighted_sentences[0], unsafe_allow_html=True)
+# st.markdown("**Paraphrases with highlighted longest common sequences**:")
+# for paraphrase in highlighted_sentences[1:]:
+#     st.markdown(paraphrase, unsafe_allow_html=True)