Spaces:

furkanunluturk
/

atasozu-onerici

Running

furkanunluturk committed on Jan 16

Commit

f9dda4c

1 Parent(s): a2556d4

replace by rapidfuzz

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gradio as gr
 from datasets import load_dataset
 from sentence_transformers import SentenceTransformer, util
 # Load the dataset from Hugging Face
@@ -27,26 +28,27 @@ def embedding_search(input_text, top_x, embeddings, corpus, reference):
         for result in results
     ]
-def substring_overlap_search(input_text, corpus, reference, top_x):
     """
-    Perform word overlap search, accounting for substrings between input_text and corpus.
     """
     input_words = input_text.lower().split()
     scores = []
     for i, text in enumerate(corpus):
         text_words = text.lower().split()
-        # Calculate overlap based on substrings
-        overlap = sum(
-            any(input_word in text_word or text_word in input_word
-                for text_word in text_words)
-            for input_word in input_words
-        )
-        scores.append((i, overlap))
-    # Sort by overlap score (descending) and return top results
     scores.sort(key=lambda x: x[1], reverse=True)
     return [(reference[idx], corpus[idx], score) for idx, score in scores[:top_x]]
@@ -63,10 +65,10 @@ def recommend_proverbs_comparison(input_text, top_x):
     )
     # Word Overlap Similarity
-    semantic_results_overlap = substring_overlap_search(
         input_text, definitions, proverbs, top_x
     )
-    literal_results_overlap = substring_overlap_search(
         input_text, proverbs, definitions, top_x
     )

 import gradio as gr
 from datasets import load_dataset
+from rapidfuzz.fuzz import ratio
 from sentence_transformers import SentenceTransformer, util
 # Load the dataset from Hugging Face
         for result in results
     ]
+def fuzzy_search(input_text, corpus, reference, top_x, threshold=70):
     """
+    Perform fuzzy word overlap search between input_text and corpus using RapidFuzz.
+    Matches words based on a similarity score above the threshold.
     """
     input_words = input_text.lower().split()
     scores = []
     for i, text in enumerate(corpus):
         text_words = text.lower().split()
+        match_score = 0
+        # Compare each word in input_text with each word in the current text
+        for input_word in input_words:
+            for text_word in text_words:
+                if ratio(input_word, text_word) >= threshold:  # Fuzzy match
+                    match_score += 1
+        scores.append((i, match_score))
+    # Sort by match score and return the top results
     scores.sort(key=lambda x: x[1], reverse=True)
     return [(reference[idx], corpus[idx], score) for idx, score in scores[:top_x]]
     )
     # Word Overlap Similarity
+    semantic_results_overlap = fuzzy_search(
         input_text, definitions, proverbs, top_x
     )
+    literal_results_overlap = fuzzy_search(
         input_text, proverbs, definitions, top_x
     )

requirements.txt CHANGED Viewed