Spaces:

OneFi
/

hf-similarity-check

Sleeping

hf-similarity-check / extraction_data.py

Mitul Mohammad Abdullah Al Mukit

first commit

1f72938 over 2 years ago

3.88 kB

	################# cnocr ##################
	from cnocr import CnOcr
	from pdfquery import PDFQuery
	import openai

	def validate(text):
	    """Return ``text`` with every space and comma removed.

	    Example: ``validate("7,265.80")`` gives ``"7265.80"``.

	    Args:
	        text: the string to clean.

	    Returns:
	        A new string containing the characters of ``text`` except
	        ``' '`` and ``','``.
	    """
	    # One translate pass deletes both characters instead of chaining a
	    # separate ``replace`` call per character.
	    return text.translate(str.maketrans('', '', ' ,'))

	def check_bank(text):
	    """Return the first known bank keyword contained in ``text``.

	    Spaces are removed from ``text`` before matching, so ``"hang seng"``
	    matches the keyword ``"hangseng"``. Matching is a case-sensitive
	    substring search and keywords are tried in the order listed below.

	    Args:
	        text: the string to search.

	    Returns:
	        The matching keyword (``'bankofchina'``, ``'hangseng'``,
	        ``'hsbc'`` or ``'sc'``), or ``False`` when none is found.
	    """
	    compact = text.replace(' ', '')
	    bank_list = ['bankofchina', 'hangseng', 'hsbc', 'sc']
	    for bank in bank_list:
	        if bank in compact:
	            return bank
	    # Report failure only after every keyword has been tried; returning
	    # False from inside the loop would stop after the first keyword.
	    return False

	def check_bank_name(img_path):
	    """Guess the issuing bank from a statement file name or path.

	    For each bank, the path matches when it contains either the bank's
	    short key or the bank's standard statement title. Banks are tried in
	    the order ``boch``, ``hangseng``, ``hsbc``, ``sc`` and the first match
	    wins. Matching is a case-sensitive substring search over the whole of
	    ``str(img_path)``.

	    Typical statement file names:
	        BOCH               - "Consolidated Statement YYYY-MM-DD"
	        HangSeng           - "Statement of Prestige Banking YYYY-MM-DD" or
	                             "Statement of Preferred Banking YYYY-MM-DD"
	        HSBC               - "Statement - HSBC One Account YYYY-MM-DD"
	        Standard Chartered - "statementOfAccount YYYY-MM-DD"

	    Args:
	        img_path: file name or path; it is converted with ``str()``.

	    Returns:
	        One of ``'boch'``, ``'hangseng'``, ``'hsbc'``, ``'sc'``, or
	        ``None`` when nothing matches.
	    """
	    standard_names = {
	        'boch': "Consolidated Statement",
	        'hangseng': "Statement of",
	        'hsbc': "Statement - HSBC One Account",
	        'sc': "statementOfAccount",
	    }
	    # Convert once instead of on every comparison.
	    path_text = str(img_path)
	    # NOTE(review): short keys such as 'sc' also match unrelated path parts
	    # (e.g. a directory called "scans") - confirm whether only the file
	    # name should be inspected.
	    for bank_name, title in standard_names.items():
	        if bank_name in path_text or title in path_text:
	            return bank_name
	    return None

	def check_mr(text):
	    """Strip a leading courtesy title (mr / ms / miss / mrs) from a name.

	    The first whitespace-separated word is compared case-insensitively,
	    ignoring any trailing period, against the known titles.

	    Args:
	        text: the name string to inspect.

	    Returns:
	        When a title is found: the remaining words, lower-cased and joined
	        with no separator (``"MR CHAN TAI MAN"`` gives ``"chantaiman"``).
	        Otherwise ``text`` is returned unchanged, case and spacing intact.
	    """
	    openings = ('mr', 'ms', 'miss', 'mrs')
	    words = text.lower().split()
	    # rstrip('.') lets "Mr." / "Mrs." be recognised as well as "Mr" / "Mrs".
	    if words and words[0].rstrip('.') in openings:
	        return ''.join(words[1:])
	    return text

	def get_info_from_bank(img_path, pdf_path):
	    """Extract key fields from a bank statement using OCR, PDF text and ChatGPT.

	    Runs CnOcr over the statement image, reads the horizontal text lines of
	    PDF page index 0, then asks ``gpt-3.5-turbo`` to pick out the customer
	    name, Hong Kong address, bank name, statement issue date, total asset
	    and total liability from the two text sets.

	    The OpenAI API key is taken from the ``OPENAI_API_KEY`` environment
	    variable (falling back to an ``openai.api_key`` that is already set).
	    It must never be hard-coded in this file.

	    Args:
	        img_path: image of the statement page, passed to ``CnOcr.ocr``.
	        pdf_path: PDF of the statement, passed to ``PDFQuery``.

	    Returns:
	        The text content of the model's first reply choice, as an unparsed
	        string; it is also printed. The prompt asks for a dictionary-style
	        listing, e.g. keys such as "Customer Name", "Address", "Bank Name",
	        "Statement Issue Date", "Total Asset" and "Total Liability", but
	        the reply format is not enforced here.

	    Raises:
	        RuntimeError: if no OpenAI API key is configured.
	    """
	    import os

	    # Resolve the credential first so a missing key fails before any
	    # OCR or PDF work is done.
	    api_key = os.environ.get("OPENAI_API_KEY") or getattr(openai, "api_key", None)
	    if not api_key:
	        raise RuntimeError(
	            "OpenAI API key not configured: set the OPENAI_API_KEY "
	            "environment variable."
	        )
	    openai.api_key = api_key

	    # OCR the statement image.
	    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
	    out = ocr.ocr(img_path)

	    # Keep every OCR text line except those that are only a space or a comma.
	    invalid_list = [' ', ',']
	    data_set_1 = [item['text'] for item in out if item['text'] not in invalid_list]

	    # Text layer of the first PDF page; "cid:" markers are stripped from it.
	    pdf = PDFQuery(pdf_path)
	    pdf.load(0)
	    text_elements = pdf.pq('LTTextLineHorizontal').text()
	    text_elements = text_elements.replace("cid:", "")

	    completion = openai.ChatCompletion.create(
	        model="gpt-3.5-turbo",
	        temperature=0.2,
	        messages=[
	            {"role": "system", "content": "You are an AI assistant for extracting data from bank statements. Uppercase and lowercase letters are the same. List results in a dictionary format."},
	            {"role": "user", "content": f"Extract data from the following 2 sets of text: {data_set_1} and {text_elements}. (1.) Data that locate in the front part of the text: customer full name, address in Hong Kong (including flat, floor, court/estate, region in Hong Kong), bank name, bank statement issue date (verly likely to be within 1-2 years), (2.) Data that mainly locate in the other part of the text: total asset (including investments and deposits) and total liability (often contains DR and includes credit card but might be zero) of the current month."},
	        ],
	    )
	    bs_data = completion['choices'][0]['message']['content']
	    print(bs_data)
	    return bs_data

	# Manual smoke test. Guarded so that importing this module does not run
	# OCR and an OpenAI request as a side effect.
	# Other sample inputs used during development:
	#   get_info_from_bank('hangseng_page-0001.jpg', 'hangseng.pdf')
	#   get_info_from_bank('hsbc_one_account_page-0001.jpg', 'hsbc_one_account.pdf')
	#   get_info_from_bank('boch_consolidated.jpg', 'boch_consolidated.pdf')
	if __name__ == "__main__":
	    get_info_from_bank('hsbc_one_account_page-10001.jpg','hsbc_one_account_page-10001.pdf')