Spaces:
Sleeping
Sleeping
| ################# cnocr ################## | |
| from cnocr import CnOcr | |
def validate(text):
    """Return *text* with every space and comma removed.

    Used to normalise OCR'd amounts such as ``"1, 234.00"`` before they
    are stored.
    """
    # One pass that deletes both unwanted characters.
    return text.translate(str.maketrans('', '', ' ,'))
def check_bank(text):
    """Identify which known bank is mentioned in *text*.

    Spaces are removed from *text* before matching, so ``"bank of china"``
    matches the key ``'bankofchina'``. Matching is case-sensitive and the
    keys are lowercase, so callers should pass lowercased text.

    Returns:
        The first key of the candidate list that occurs as a substring of
        the space-stripped text, or ``False`` when none of them occurs.
    """
    compact = text.replace(' ', '')
    # NOTE(review): 'sc' is a very short key and will also match ordinary
    # words containing "sc" (e.g. "description") -- confirm this is intended.
    bank_list = ('bankofchina', 'hangseng', 'hsbc', 'sc')
    for bank in bank_list:
        if bank in compact:
            return bank
    # Report "no bank" only after *every* candidate has been tested; bailing
    # out inside the loop would mean only 'bankofchina' is ever checked.
    return False
def check_bank_name(img_path):
    """Guess the issuing bank from a statement's file name or path.

    Each bank is recognised either by its short code or by the title its
    statements are exported with, for example:

    * BOCH               - "Consolidated Statement 2023-01-01"
    * HangSeng           - "Statement of Prestige Banking 2023-03-0" or
                           "Statement of Preferred Banking 2023-03-07"
    * HSBC               - "Statement - HSBC One Account 2023-02-10"
    * Standard Chartered - "statementOfAccount 2023-02-01"

    Matching is a case-sensitive substring test on ``str(img_path)``.
    Banks are tried in the order boch, hangseng, hsbc, sc and the first hit
    wins.

    Returns:
        The short bank code, or ``None`` when nothing matches.
    """
    path_text = str(img_path)
    known_titles = (
        ('boch', "Consolidated Statement"),
        ('hangseng', "Statement of"),
        ('hsbc', "Statement - HSBC One Account"),
        ('sc', "statementOfAccount"),
    )
    matches = (
        code
        for code, title in known_titles
        if code in path_text or title in path_text
    )
    return next(matches, None)
def check_mr(text):
    """Drop a leading courtesy title (mr / ms / miss / mrs) from *text*.

    When the first whitespace-separated word, compared case-insensitively,
    is one of the titles, the remaining words are returned lowercased and
    concatenated with no separator. Otherwise *text* is returned unchanged.
    """
    titles = ('mr', 'ms', 'miss', 'mrs')
    tokens = text.lower().split()
    # Guard: nothing to strip when the text is blank or has no title.
    if not tokens or tokens[0] not in titles:
        return text
    return ''.join(tokens[1:])
def _box_contains(positions, left, top, right, bottom):
    """Return whether an OCR box lies inside the given pixel rectangle.

    Only ``positions[0]`` and ``positions[2]`` are inspected; they are
    treated as the (x, y) of the top-left and bottom-right corners.
    """
    return (positions[0][0] >= left and positions[0][1] >= top
            and positions[2][0] <= right and positions[2][1] <= bottom)


def _liability_amount(text):
    """Strip the 'dr' debit marker, spaces and commas from a lowercased amount."""
    return validate(text.replace('dr', ''))


def get_info_from_bank(img_path, file_name):
    """Run OCR on a bank-statement image and collect its key fields.

    Each recognised text fragment is assigned to a field according to the
    fixed pixel rectangle its bounding box falls into. The rectangles are
    hard-coded (presumably calibrated for one statement layout and image
    resolution -- TODO confirm). All text is lowercased before it is stored.

    Args:
        img_path: Image handed to ``CnOcr.ocr``.
        file_name: Name or path handed to ``check_bank_name`` to get the
            initial value of the ``"bank"`` field.

    Returns:
        dict with the keys ``"nameStatement"``, ``"address"``, ``"bank"``,
        ``"date"``, ``"asset"`` (float, sum of the parsed asset amounts) and
        ``"liabilities"`` (string, ``""`` when none was found).

    Raises:
        ValueError: if a fragment in the asset rectangle is not a debit
            entry and cannot be parsed as a number after removing commas.
    """
    # Running the model
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    out = ocr.ocr(img_path)

    # Data
    bank_data = {
        "nameStatement": "",
        "address": "",
        "bank": check_bank_name(file_name),
        "date": "",
        "asset": 0.0,
        "liabilities": "",
    }

    # Vertical window of the asset row currently expected; it is moved down
    # after every accepted amount so the next row can be picked up.
    asset_y = [722, 747]
    # NOTE(review): with `count <= asset_iterations` up to three amounts are
    # accepted (count 0, 1 and 2) -- confirm whether two were intended.
    asset_iterations = 2
    liabilities_y = [747, 800]
    count = 0
    invalid_list = ['', ' ', ',']

    for item in out:
        detected_text = item['text']
        # Test for None *before* lowercasing; afterwards it is too late.
        if detected_text is None:
            continue
        raw_detected_text = detected_text.lower()
        if raw_detected_text in invalid_list:
            continue
        positions = item['position']

        if _box_contains(positions, 147, 265, 400, 295):  # name
            bank_data["nameStatement"] = check_mr(
                bank_data["nameStatement"] + raw_detected_text
            )
        elif _box_contains(positions, 113, 291, 500, 381):  # address
            bank_data["address"] += raw_detected_text + ' '
        elif _box_contains(positions, 996, 289, 1083, 314):  # statement date
            bank_data["date"] += raw_detected_text
        elif _box_contains(positions, 900, asset_y[0], 1120, asset_y[1]):
            # The text is lowercase here, so the debit marker must be matched
            # as 'dr'; an uppercase 'DR' test can never succeed and would let
            # debit entries fall through to float().
            if 'dr' in raw_detected_text:
                # Stored in the same format as the liabilities rectangle.
                bank_data["liabilities"] = _liability_amount(raw_detected_text)
            elif count <= asset_iterations:  # asset
                bank_data["asset"] += float(raw_detected_text.replace(',', ''))
                # Slide the asset window (and stretch the liabilities window)
                # down to the next row.
                asset_y[0] += 21
                asset_y[1] += 27
                liabilities_y[1] += 27
                count += 1
        elif _box_contains(positions, 900, liabilities_y[0],
                           1130, liabilities_y[1]):
            if 'dr' in raw_detected_text:  # liabilities
                bank_data["liabilities"] = _liability_amount(raw_detected_text)
        else:
            # Outside every known rectangle: the fragment may still name the
            # bank, which then overrides the guess made from the file name.
            bank = check_bank(raw_detected_text)
            if bank is not False:
                bank_data["bank"] = bank

    return bank_data
| ########## Posting data through API ############ | |
| import requests | |
| import data_encryption | |
| # POST /api/v1/users HTTP/1.1 | |
def post_data(bank, name, address, asset, liabilities, date, *,
              request_id='request_1234', user_id='user1'):
    """Build and encrypt the ``store_statement_verif`` request payload.

    Args:
        bank, name, address, liabilities, date: Values copied into the
            payload as-is.
        asset: Converted with ``str()`` before being placed in the payload.
        request_id: Value for the ``"requestId"`` field. Defaults to the
            placeholder that used to be hard-coded.
        user_id: Value for the ``"userId"`` field. Defaults to the
            placeholder that used to be hard-coded.

    Returns:
        Whatever ``data_encryption.encrypt`` returns for the payload, so the
        caller can send or inspect it. Previously the result was discarded.
    """
    data = {
        "endpoint": "/SBT",
        "apiType": "store_statement_verif",
        "requestId": request_id,
        "userId": user_id,
        "bank": bank,
        "nameStatement": name,
        "address": address,
        "asset": str(asset),
        "liability": liabilities,
        "statementDate": date,
    }
    encrypted_data = data_encryption.encrypt(data)
    # Sending is intentionally still disabled. The draft call was:
    #   endpoint = 'http://ipygg-api-test-env.ap-east-1.elasticbeanstalk.com/SBT/api/v1/users'
    #   requests.post(url=endpoint, data=encrypted_data)
    return encrypted_data
| # def extract_pdf_data(img_path='hangseng_page-0001.jpg'): | |
| # page_number = 1 | |
| # images = f'hangseng_page-000{page_number}.jpg' | |
| # get_info_from_bank(img_path) | |