Spaces:
Sleeping
Sleeping
| from cnocr import CnOcr | |
def string_similarity(s1, s2):
    """Return a percentage similarity based on the Levenshtein distance.

    Both strings are compared case-insensitively with space characters
    removed. The score is ``(1 - distance / longer_length) * 100`` rounded
    to one decimal place; identical normalized strings (including two
    empty strings) score exactly ``100.0``.

    Args:
        s1: First string to compare.
        s2: Second string to compare.

    Returns:
        A float in the range 0.0 to 100.0.
    """
    left = s1.replace(' ', '').lower()
    right = s2.replace(' ', '').lower()
    if left == right:
        return 100.0

    n_left, n_right = len(left), len(right)

    # previous[j] holds the edit distance between the already processed
    # prefix of `left` and right[:j]; only two rows are needed at a time.
    previous = list(range(n_right + 1))
    for i, ch_left in enumerate(left, start=1):
        current = [i]
        for j, ch_right in enumerate(right, start=1):
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            substitution = previous[j - 1] + (0 if ch_left == ch_right else 1)
            current.append(min(deletion, insertion, substitution))
        previous = current

    distance = previous[n_right]
    # The strings differ here, so at least one of them is non-empty and
    # the denominator cannot be zero.
    similarity = (1 - distance / max(n_left, n_right)) * 100
    return round(similarity, 1)
def is_good_subsequence(s1, s2):
    """Check whether enough of ``s2`` appears, in order, inside ``s1``.

    The comparison is case-insensitive. Characters of ``s2`` are matched
    greedily from left to right against ``s1``; each match must occur after
    the previous one. A character of ``s2`` that cannot be found in the
    remaining part of ``s1`` is skipped without consuming any of ``s1``, so
    a single stray character does not prevent later characters from
    matching.

    Args:
        s1: The text to search in.
        s2: The candidate text whose characters are looked up in ``s1``.

    Returns:
        ``False`` if ``s2`` is more than 10 characters longer than ``s1``.
        Otherwise ``True`` when at least half of the characters of ``s2``
        were matched in order (an empty ``s2`` therefore yields ``True``).
    """
    # Lowercase first so that every length used below refers to the
    # strings that are actually compared.
    s1 = s1.lower()
    s2 = s2.lower()
    len_s2 = len(s2)

    if len_s2 > len(s1) + 10:
        return False

    match_count = 0
    search_from = 0
    for char in s2:
        found_at = s1.find(char, search_from)
        if found_at == -1:
            # Not present in the rest of s1: skip this character only,
            # keeping the search position for the following characters.
            continue
        match_count += 1
        search_from = found_at + 1

    # At least 50% of s2 must have been matched.
    return match_count >= (0.5 * len_s2)
def check_hkid(path):
    """Report whether OCR output for ``path`` contains the HKID card title.

    Runs ``CnOcr`` (model ``en_PP-OCRv3``) on ``path`` and compares the
    ``'text'`` field of each returned entry with the string
    ``'HONGKONGPERMANENTIDENTITYCARD'`` using ``string_similarity``.

    Args:
        path: Value passed straight to ``CnOcr.ocr``.

    Returns:
        ``True`` as soon as one entry scores above 60, otherwise ``False``.
    """
    recognizer = CnOcr(rec_model_name='en_PP-OCRv3')
    # Alternative recognition model that was tried previously:
    # recognizer = CnOcr(rec_model_name='densenet_lite_136-fc')
    expected_title = 'HONGKONGPERMANENTIDENTITYCARD'
    return any(
        string_similarity(expected_title, entry['text']) > 60
        for entry in recognizer.ocr(path)
    )
| # print(check_hkid('image/hkid.jpg')) |