Spaces:
Sleeping
Sleeping
| ################# cnocr ################## | |
| from cnocr import CnOcr | |
| from pdfquery import PDFQuery | |
| import openai | |
def validate(text):
    """Return ``text`` with every space and comma removed.

    Args:
        text: The string to clean (for example an amount such as
            ``"7,265.80"``).

    Returns:
        A copy of ``text`` without any ``' '`` or ``','`` characters.
    """
    without_spaces = text.replace(' ', '')
    return without_spaces.replace(',', '')
def check_bank(text):
    """Return the first known bank identifier that occurs in ``text``.

    Spaces are removed from ``text`` before searching, so ``"hang seng"``
    matches the identifier ``"hangseng"``. Matching is a case-sensitive
    substring test against the lowercase identifiers listed below.

    Args:
        text: Text to search.

    Returns:
        The matching identifier (``'bankofchina'``, ``'hangseng'``,
        ``'hsbc'`` or ``'sc'``), or ``False`` when none of them occurs.
    """
    compact = text.replace(' ', '')
    bank_list = ['bankofchina', 'hangseng', 'hsbc', 'sc']
    # Every candidate must be tried before giving up: returning False from
    # inside the loop would stop after testing only the first identifier.
    for bank in bank_list:
        if bank in compact:
            return bank
    return False
def check_bank_name(img_path):
    """Guess the issuing bank from a statement's file name or path.

    Each bank is recognised either by its short key or by the title its
    statements typically carry in the file name, for example:

    * BOCH - "Consolidated Statement 2023-01-01"
    * HangSeng - "Statement of Prestige Banking 2023-03-07" or
      "Statement of Preferred Banking 2023-03-07"
    * HSBC - "Statement - HSBC One Account 2023-02-10"
    * Standard Chartered - "statementOfAccount 2023-02-01"

    Args:
        img_path: Path (any object convertible with ``str``) to inspect.

    Returns:
        The key of the first bank whose key or title is a substring of the
        path (``'boch'``, ``'hangseng'``, ``'hsbc'`` or ``'sc'``), or
        ``None`` when nothing matches. Matching is case-sensitive.
    """
    standard_names = {
        'boch': "Consolidated Statement",
        'hangseng': "Statement of",
        'hsbc': "Statement - HSBC One Account",
        'sc': "statementOfAccount",
    }
    path_text = str(img_path)
    for bank_key, statement_title in standard_names.items():
        if bank_key in path_text or statement_title in path_text:
            return bank_key
    return None
def check_mr(text):
    """Strip a leading courtesy title (mr / ms / miss / mrs) from a name.

    Args:
        text: The name as printed, e.g. ``"MR CHAN TAI MAN"``.

    Returns:
        When the first whitespace-separated word is one of the recognised
        titles (compared case-insensitively), the remaining words
        lower-cased and joined with no separator. Otherwise ``text`` is
        returned unchanged.
    """
    openings = ['mr', 'ms', 'miss', 'mrs']
    tokens = text.lower().split()
    has_title = bool(tokens) and tokens[0] in openings
    if not has_title:
        return text
    # The remaining words are concatenated without spaces.
    return ''.join(tokens[1:])
def get_info_from_bank(img_path, pdf_path):
    """Extract key fields from a bank statement via OCR, PDF text and ChatGPT.

    The statement image is read with CnOcr and the first page of the PDF is
    read with PDFQuery. Both texts are sent to the OpenAI chat completion
    API with a prompt asking for the customer name, Hong Kong address, bank
    name, statement issue date, total asset and total liability.

    The OpenAI API key is taken from the ``OPENAI_API_KEY`` environment
    variable (falling back to an already configured ``openai.api_key``).

    Args:
        img_path: Path to an image of the statement, passed to
            ``CnOcr.ocr``.
        pdf_path: Path to the statement PDF, passed to ``PDFQuery``.

    Returns:
        The model's reply as a string. The prompt requests a dictionary
        format, but the reply is returned as-is and is not parsed or
        validated here.

    Raises:
        RuntimeError: If no OpenAI API key is configured.
    """
    import os  # local import keeps this block self-contained

    # Never hard-code the key: a secret committed to source control must be
    # treated as leaked and rotated. Check this first so a missing key fails
    # fast, before the expensive OCR step runs.
    api_key = os.environ.get("OPENAI_API_KEY") or getattr(openai, "api_key", None)
    if not api_key:
        raise RuntimeError(
            "OpenAI API key not configured; set the OPENAI_API_KEY "
            "environment variable."
        )
    openai.api_key = api_key

    # Run the OCR model on the statement image.
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    out = ocr.ocr(img_path)

    # Keep every recognised text fragment except lone spaces and commas.
    invalid_list = [' ', ',']
    data_set_1 = [
        item['text'] for item in out if item['text'] not in invalid_list
    ]

    # Only page index 0 of the PDF is loaded.
    pdf = PDFQuery(pdf_path)
    pdf.load(0)
    text_elements = pdf.pq('LTTextLineHorizontal').text()
    # Remove the "cid:" markers from the extracted PDF text.
    text_elements = text_elements.replace("cid:", "")

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0.2,
        messages=[
            {"role": "system", "content": "You are an AI assistant for extracting data from bank statements. Uppercase and lowercase letters are the same. List results in a dictionary format."},
            {"role": "user", "content": f"Extract data from the following 2 sets of text: {data_set_1} and {text_elements}. (1.) Data that locate in the front part of the text: customer full name, address in Hong Kong (including flat, floor, court/estate, region in Hong Kong), bank name, bank statement issue date (verly likely to be within 1-2 years), (2.) Data that mainly locate in the other part of the text: total asset (including investments and deposits) and total liability (often contains DR and includes credit card but might be zero) of the current month."},
        ],
    )
    bs_data = completion['choices'][0]['message']['content']
    print(bs_data)
    return bs_data
# Sample invocations for other statements (disabled):
# get_info_from_bank('hangseng_page-0001.jpg','hangseng.pdf')
# get_info_from_bank('hsbc_one_account_page-0001.jpg','hsbc_one_account.pdf')
# get_info_from_bank('boch_consolidated.jpg','boch_consolidated.pdf')

# Runs the extraction on the sample HSBC statement whenever this module is
# executed or imported.
get_info_from_bank(
    img_path='hsbc_one_account_page-10001.jpg',
    pdf_path='hsbc_one_account_page-10001.pdf',
)