Upload fuzzy_matching.py
feat: a Python script with functions used to process and map users' locations to the most similar matches from a reference dataset of town names
- src/fuzzy_matching.py +258 -0
src/fuzzy_matching.py
ADDED
@@ -0,0 +1,258 @@
"""
Problem:
Nt3awnou's platform collects raw data filled in manually by users (people in need).
Among this data is the user's localisation.
The localisation is a free-text input that is not standardized:
e.g. a user can enter a single location or multiple locations
(douars/provinces/communes/regions, or all combined),
in Arabic or Latin script, with misspellings, etc.
This hinders visualization and statistics,
since the same localisation can appear several times written in different ways.

Examples:
```
دوار تجكَالت
ابرداتن ازكور
خزامة
Tansgharte
دوار امندار
Douar Essour Tidrara Aghwatim Tahnaouet Al Haouz
دوار تكاديرت
Douar Essour tidrara- aghouatine- Tahanaout-El Haouz
```
Solution:
We collected a reference dataset that contains all douar names (Arabic and Latin)
with their corresponding regions, communes and provinces.
We developed methods using fuzzy matching and phonetics
to map the user's localisation to the closest match in the reference dataset.
"""

from typing import Tuple
from pyphonetics import RefinedSoundex, Metaphone
import math
import difflib
import re


EPICENTER_LOCATION = [31.12210171476489, -8.42945837915193]
certainty_threshold = 1


def extract_ngrams(text, n):
    """
    A function that returns a list of n-grams from a text
    (`text` is given as a list of tokens; each n-gram is returned
    as a single space-joined string)
    """
    ngrams = []

    if n < 1 or n > len(text):
        return ngrams  # Return an empty list if n is invalid

    # Iterate through the tokens and extract n-grams
    for i in range(len(text) - n + 1):
        ngram = text[i:i + n]
        ngrams.append(' '.join(ngram))

    return ngrams
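
# Quick illustration (not part of the original script):
#   extract_ngrams(["douar", "essour", "tidrara"], 2)
#   -> ['douar essour', 'essour tidrara']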


def get_phonetics_distance(w1, w2):
    """
    A function that calculates the Levenshtein distance between the phonetic
    representations of two words, plus a small error term added to the score
    """
    rs = RefinedSoundex()
    mt = Metaphone()
    d1 = mt.distance(w1, w2, metric='levenshtein')
    d2 = rs.distance(w1, w2, metric='levenshtein')
    res = (d1 + d2) / 2 + 0.05
    return res
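
# The 0.05 error term keeps phonetic matches distinguishable from the exact
# matches (distance 0) and fuzzy matches (distance 0.01) used elsewhere in
# this module. Two identical words encode identically under both metrics, so:
#   get_phonetics_distance("douar", "douar")  -> 0.05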


def get_top_n_phonetics_matches(
        w: str, ref_words: list, threshold=1, top_n=1) -> list[Tuple]:
    """
    A function that returns the top_n closest words to w from ref_words
    for which distance <= threshold,
    using phonetic representations
    """
    if not w:
        return list()
    distances = {x: get_phonetics_distance(w, x) for x in ref_words}
    selected_words = {x: d for x, d in distances.items() if d <= threshold}
    sorted_d = dict(sorted(selected_words.items(), key=lambda item: item[1]))

    return list(sorted_d.items())[:top_n]
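
# Sketch of the expected behaviour (illustrative reference words): an exact
# spelling present in ref_words sits at the 0.05 floor, so it is returned first:
#   get_top_n_phonetics_matches("tidrara", ["tidrara", "khzama"])
#   -> [('tidrara', 0.05)]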


def get_geometric_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    A function that returns the distance in kilometers between two points
    on earth using the haversine formula
    """
    dlon = math.radians(lon2 - lon1)
    dlat = math.radians(lat2 - lat1)
    a = (math.sin(dlat / 2)) ** 2 \
        + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * (math.sin(dlon / 2)) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = 6371 * c  # mean earth radius in km
    return distance
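
# Rough sanity check (coordinates are approximate): the distance from
# EPICENTER_LOCATION to central Marrakech (~31.63, -7.98) comes out at
# roughly 70 km:
#   get_geometric_distance(*EPICENTER_LOCATION, 31.6295, -7.9811)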


def are_village_names_similar(village_a: str, village_b: str) -> bool:
    """
    A function that returns True if the two village names
    are similar under strict fuzzy matching
    """
    if difflib.SequenceMatcher(None, village_a, village_b).ratio() >= 0.90:
        return True
    return False
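
# Illustration: a dropped trailing letter still clears the 0.90 ratio
# (SequenceMatcher ratio = 2 * 9 matched chars / 19 total chars ~= 0.947):
#   are_village_names_similar("TANSGHARTE", "TANSGHART")  -> True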


def get_uncertainty_range(input_dict: dict, threshold: float) -> dict:
    """
    A function that returns a dict of the closest matches: the overall best
    match plus every following match whose distance is within threshold of it
    """
    if len(input_dict) <= 1:
        return input_dict

    # sort by distance
    sorted_items = sorted(input_dict.items(), key=lambda item: item[1][1])
    data = {key: value for key, value in sorted_items}

    # Iterate through the keys in the dictionary
    keys = list(data.keys())
    min_key = keys[0]
    min_value = data[min_key][1]

    # Initialize the result with the best match
    result = {f"{min_key}": data[min_key]}

    for j in range(1, len(keys)):
        key2 = keys[j]
        value2 = data[key2][1]

        # Calculate the absolute difference between the distances
        difference = abs(min_value - value2)

        # If the difference is within the threshold, keep the match
        if difference <= threshold:
            result[key2] = data[key2]
        else:
            break

    return result
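
# Worked example (hypothetical category keys and distances, threshold=1):
# matches are sorted by distance and kept while within 1.0 of the best one:
#   get_uncertainty_range({"nom_fr": ("TOUKHRIBINE", 0.01),
#                          "commune_fr": ("MZODA", 0.8),
#                          "province_fr": ("CHICHAOUA", 1.5)}, 1)
#   -> {"nom_fr": ("TOUKHRIBINE", 0.01), "commune_fr": ("MZODA", 0.8)}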


def match_word(w, ref_dict, select_one_match=False):
    """
    A function that returns the closest match of w from ref_dict
    using phonetic representations and fuzzy matching
    """
    w = w.strip().upper()

    if len(w) == 0:
        return {}

    else:
        closest_ref_w = dict()
        use_phonetics = True

        for category, names in ref_dict.items():
            # check exact matching
            if w in names:
                use_phonetics = False
                closest_ref_w[category] = (w, 0)
                break

            # check textual similarity (fuzzy matching)
            sim = list(map(lambda x: are_village_names_similar(w, x), names))
            similar_names = [names[i] for i in range(len(names)) if sim[i]]
            if similar_names:
                use_phonetics = False
                closest_ref_w[category] = (similar_names[0], 0.01) if select_one_match else list(map(lambda x: (x, 0.01), similar_names))

            # if no similar name was found, check phonetic similarity
            else:
                res = get_top_n_phonetics_matches(w, names, threshold=2, top_n=1)
                if res:
                    closest_ref_w[category] = res[0]  # get closest match

        if closest_ref_w and use_phonetics:
            if not select_one_match:
                closest_ref_w = get_uncertainty_range(closest_ref_w, certainty_threshold)
            else:
                k, v = min(closest_ref_w.items(), key=lambda x: x[1][1])
                closest_ref_w = {k: v}

        return closest_ref_w
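
# Example with a toy reference dictionary (hypothetical; the real ref_dict is
# built from the reference dataset of douar names): "TOUKHRIBIN" is a close
# fuzzy match for "TOUKHRIBINE" (ratio ~0.95), so it is matched at cost 0.01:
#   match_word("toukhribin", {"nom_fr": ["TOUKHRIBINE", "TLAKEMT"]},
#              select_one_match=True)
#   -> {"nom_fr": ("TOUKHRIBINE", 0.01)}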


def parse_and_map_localisation(text: str, ref_dict: dict, select_one_match: bool = True):
    """
    A function that parses text containing a user's localisation
    and returns the closest matches per category from ref_dict
    Example:
    input = COMMUNE MZODA : DOUARS : TOUKHRIBIN –TLAKEMT - COMMUNE IMINDOUNITE : DOUAR AZARZO
    output = {'commune_fr': ('IMINDOUNIT', 0.01), 'nom_fr': ('TOUKHRIBINE', 0.01)}
    """
    # strip filler words (douar, commune, province, etc. in Arabic and Latin),
    # punctuation and digits before tokenizing
    toxic = r"\bدوار|مصلى|\(|\)|douars?|communes?|cercles?|provinces?|villes?|regions?|caidate?|and|جماعة|\b|:|-|\d"
    text = re.sub(toxic, '', text.lower())
    regex_pattern = r"\|| |\.|,|/|et |و "
    tokens = re.split(regex_pattern, text.replace('-', ' '))
    filtered_tokens = [s for s in tokens if s.strip() != '']

    ngrams_mapping = {}

    for n in range(1, len(filtered_tokens) + 1):

        # generate ngrams
        ngrams = extract_ngrams(filtered_tokens, n)

        # init dict with ngram mapping
        mapping_ngram = {}

        # generate a mapping for the ngram with argmin matches
        for tok in ngrams:
            res = match_word(tok, ref_dict, select_one_match=select_one_match)
            if not res:
                continue

            min_k, min_v = min(res.items(), key=lambda x: x[1][1])

            # if min_k was already matched by a previous token, keep the
            # smaller distance; otherwise add it to the mapping
            if min_k in mapping_ngram:
                saved_match, saved_distance = mapping_ngram[min_k]

                if saved_distance > min_v[1]:
                    mapping_ngram[min_k] = min_v

            else:
                mapping_ngram[min_k] = min_v

        ngrams_mapping[n] = mapping_ngram

    # first squeeze the dict s.t. one match remains per category
    categories = ref_dict.keys()
    result = {}
    for _, inner_dict in ngrams_mapping.items():
        for k in categories:
            # Check if the key exists in the inner dictionary
            if k in inner_dict:
                current_match, current_val = inner_dict[k]
                if k in result:
                    previous_match, previous_val = result[k]
                    if current_val < previous_val:
                        result[k] = (current_match, current_val)
                else:
                    result[k] = (current_match, current_val)

    # guard against no matches at all (min() would fail on an empty dict)
    if not result:
        return result

    # then, discard matches with a high distance from the min (0.5 + min_d as threshold)
    thresh = min(result.values(), key=lambda x: x[1])[1] + 0.5
    output = {k: v_d for k, v_d in result.items() if v_d[1] <= thresh}

    return output
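
# End-to-end sketch (the two-category ref_dict below is a hypothetical
# stand-in for the real reference dataset of douar/commune/province names):
#   ref_dict = {
#       "nom_fr": ["TOUKHRIBINE", "TLAKEMT", "AZARZO"],
#       "commune_fr": ["MZODA", "IMINDOUNIT"],
#   }
#   parse_and_map_localisation("Commune Mzoda, douars: Toukhribin - Tlakemt", ref_dict)
# Filler words ("commune", "douars", punctuation) are stripped, the remaining
# tokens are matched per category, and one best (match, distance) pair per
# category survives the final distance filter.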