Spaces:
Sleeping
Sleeping
| from cnocr import CnOcr | |
| import openai | |
| from dotenv import load_dotenv | |
| import os | |
| import json | |
| import checkTool | |
# OCR fragments that carry no information and are dropped before prompting.
_IGNORED_OCR_TEXTS = (' ', ',')

_SYSTEM_PROMPT = (
    "You are an AI assistant for extracting data from HKID card with following information "
    "(name, date of birth, date of issue, HKID number) from HKID card. "
    "Uppercase and lowercase letters are the same. Store the results in "
    "dictionary format"
)


def _build_user_prompt(texts):
    """Return the user message asking the model to extract the HKID fields from *texts*."""
    return (
        f"Extract data from the following set of text: {texts}. "
        "You have four types of data to extract. "
        "1. id card holder full name (it noramlly is a chinese name, including surname and family "
        "name in English spelling, and it may be separate in different fields in the data set for surname and family name "
        "sometimes) "
        "2. date of birth (should be a date with year, month and day, e.g. 23-02-2003 is the required format, but 26-11 is not "
        "because date of birth should have 10 characters) Only choose valid format!!! "
        "3. date of issue (a string with format xx-xx) "
        "4. HKID number (The standard format of HKID number is @123456(#) e.g. A123456(7) is a valid HKID number. "
        "(a) @ represents any one or two capital letters of the alphabet. "
        "(b) # is the check digit which has 11 possible values from 0 to 9 and A.) "
        "Remember to include the check digit with () "
        "Only reply a dictionary. No need to add other words or explanation. Use double quote for dictionary. "
        # The code below indexes the reply with exactly these keys, so the
        # prompt has to pin them down instead of leaving them to the model.
        'The dictionary keys must be exactly "name", "date of birth", "date of issue" and "HKID number".'
    )


def _extract_json_object(reply):
    """Return the outermost ``{...}`` span of *reply*, or *reply* unchanged if there is none.

    Chat models sometimes wrap the dictionary in a Markdown code fence or add
    a sentence around it; trimming to the braces lets ``json.loads`` cope.
    """
    start = reply.find("{")
    end = reply.rfind("}")
    if start == -1 or end < start:
        return reply
    return reply[start:end + 1]


def model0(path):
    """Extract the holder's details from an HKID card image.

    The image is read with CnOcr, the recognised text fragments are sent to
    the OpenAI chat model ``gpt-3.5-turbo``, and the JSON dictionary in the
    reply is parsed and post-processed with ``checkTool``.

    Args:
        path: Image location, passed unchanged to ``CnOcr.ocr``.

    Returns:
        A list ``[name, valid_hkid, hkid, issuedate, dateofbirth]`` where
        ``name`` is the result of ``checkTool.seperate_name`` and
        ``valid_hkid`` is the string ``'True'`` or ``'False'``.

    Raises:
        json.JSONDecodeError: If the model reply contains no parseable JSON.
        KeyError: If the parsed dictionary lacks one of the expected keys.
    """
    ocr = CnOcr(rec_model_name='en_PP-OCRv3')
    out = ocr.ocr(path)
    print(out)

    # The API key is read from the environment (optionally populated from a
    # .env file) under the variable name "data-extraction-api".
    load_dotenv()
    openai.api_key = os.environ.get("data-extraction-api")

    data_set_1 = [
        item['text'] for item in out if item['text'] not in _IGNORED_OCR_TEXTS
    ]
    print(f'All data here: {data_set_1}')

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": _build_user_prompt(data_set_1)},
        ],
    )
    data = completion['choices'][0]['message']['content']
    print(data)

    id_data = json.loads(_extract_json_object(data))
    name = id_data["name"]
    dateofbirth = id_data["date of birth"]
    issuedate = id_data["date of issue"]
    hkid = id_data["HKID number"]

    # Callers receive the validity flag as a string, not a bool.
    valid_hkid = 'True' if checkTool.validate_hkid(hkid=hkid) else 'False'
    name = checkTool.seperate_name(name)
    print(id_data)
    return [name, valid_hkid, hkid, issuedate, dateofbirth]