furkanunluturk committed on
Commit
f9dda4c
·
1 Parent(s): a2556d4

replace by rapidfuzz

Browse files
Files changed (2) hide show
  1. app.py +14 -12
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  from datasets import load_dataset
 
3
  from sentence_transformers import SentenceTransformer, util
4
 
5
  # Load the dataset from Hugging Face
@@ -27,26 +28,27 @@ def embedding_search(input_text, top_x, embeddings, corpus, reference):
27
  for result in results
28
  ]
29
 
30
- def substring_overlap_search(input_text, corpus, reference, top_x):
31
  """
32
- Perform word overlap search, accounting for substrings between input_text and corpus.
 
33
  """
34
  input_words = input_text.lower().split()
35
  scores = []
36
 
37
  for i, text in enumerate(corpus):
38
  text_words = text.lower().split()
 
39
 
40
- # Calculate overlap based on substrings
41
- overlap = sum(
42
- any(input_word in text_word or text_word in input_word
43
- for text_word in text_words)
44
- for input_word in input_words
45
- )
46
 
47
- scores.append((i, overlap))
48
 
49
- # Sort by overlap score (descending) and return top results
50
  scores.sort(key=lambda x: x[1], reverse=True)
51
  return [(reference[idx], corpus[idx], score) for idx, score in scores[:top_x]]
52
 
@@ -63,10 +65,10 @@ def recommend_proverbs_comparison(input_text, top_x):
63
  )
64
 
65
  # Word Overlap Similarity
66
- semantic_results_overlap = substring_overlap_search(
67
  input_text, definitions, proverbs, top_x
68
  )
69
- literal_results_overlap = substring_overlap_search(
70
  input_text, proverbs, definitions, top_x
71
  )
72
 
 
1
  import gradio as gr
2
  from datasets import load_dataset
3
+ from rapidfuzz.fuzz import ratio
4
  from sentence_transformers import SentenceTransformer, util
5
 
6
  # Load the dataset from Hugging Face
 
28
  for result in results
29
  ]
30
 
31
def fuzzy_search(input_text, corpus, reference, top_x, threshold=70):
    """
    Rank corpus entries by fuzzy word overlap with input_text using RapidFuzz.

    The query and every corpus text are lower-cased and split on whitespace.
    An entry's score is the number of (query word, text word) pairs whose
    ``ratio`` similarity is at least ``threshold``; a query word that matches
    several words of the same text is counted once per matching pair.

    Args:
        input_text: Query string typed by the user.
        corpus: Sequence of texts to search.
        reference: Sequence aligned index-by-index with ``corpus``; the item
            at the same index is returned alongside each matching text.
        top_x: Maximum number of results to return.
        threshold: Minimum ``ratio`` score (RapidFuzz's 0-100 scale) for two
            words to count as a match.

    Returns:
        A list of ``(reference[idx], corpus[idx], score)`` tuples for the
        ``top_x`` best-scoring entries, highest score first. Entries with
        equal scores keep their original corpus order.
    """
    input_words = input_text.lower().split()
    scores = []

    for i, text in enumerate(corpus):
        text_words = text.lower().split()

        # score_cutoff lets RapidFuzz abandon a comparison as soon as the
        # pair cannot reach the threshold (it then returns 0), which avoids
        # computing exact similarities for the many non-matching pairs.
        match_score = sum(
            1
            for input_word in input_words
            for text_word in text_words
            if ratio(input_word, text_word, score_cutoff=threshold) >= threshold
        )

        scores.append((i, match_score))

    # list.sort is stable even with reverse=True, so ties stay in corpus order.
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return [(reference[idx], corpus[idx], score) for idx, score in scores[:top_x]]
54
 
 
65
  )
66
 
67
  # Word Overlap Similarity
68
+ semantic_results_overlap = fuzzy_search(
69
  input_text, definitions, proverbs, top_x
70
  )
71
+ literal_results_overlap = fuzzy_search(
72
  input_text, proverbs, definitions, top_x
73
  )
74
 
requirements.txt CHANGED
@@ -1 +1,2 @@
1
  sentence_transformers
 
 
1
  sentence_transformers
2
+ rapidfuzz