Amamrnaf committed on
Commit
df646cb
·
1 Parent(s): 2bba469

`Added new parsers and updated existing ones for Noc timesheet and invoice extraction`

Browse files
Files changed (2) hide show
  1. app.py +89 -18
  2. dataSchema.py +124 -2
app.py CHANGED
@@ -36,21 +36,63 @@ def Noc_timeSheet_pdf_to_img(pdf_path,dpi: int = 300, quality: int = 95):
36
 
37
  # Save the combined image
38
  combined_image.save(output_path, "JPEG", quality=quality)
39
-
40
  def Clauses_in_invoice(pdf_path: str) -> bool:
41
  """
42
  Extract text from the last page of a PDF.
43
  """
44
- pdf_document = pymupdf.open(pdf_path)
45
- total_pages = pdf_document.page_count
46
- last_page = pdf_document.load_page(total_pages - 1)
47
- text = last_page.get_text()
48
- pdf_document.close()
49
- if "clauses" in text.lower():
50
- return True
51
- else:
 
 
 
 
 
 
 
 
 
 
52
  return False
53
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def Noc_invoice_pdf_to_img(pdf_path: str, folder_path: str, dpi: int = 300, quality: int = 95):
55
  pdf_document = pymupdf.open(pdf_path)
56
  folder_path = folder_path.rstrip(os.sep)
@@ -107,11 +149,26 @@ def noc_invoice_extraction(pdf_path: str, folder_path):
107
  new_item = get_image_informations(image_paths[pic + 2], invoice_item_pages_prompt, Noc_PurchaseOrder_items_parser)
108
  for item in new_item["items"]:
109
  data["items"].append(item)
110
- result = get_image_informations(image_paths[-2], invoice_total_page_prompt, Noc_PurchaseOrder_total_parser)
111
  data.update(result)
112
  delete_images(image_paths)
113
  return data
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  def process_file(file, option):
116
  if file is None:
117
  return "Please upload a PDF or image file."
@@ -125,31 +182,45 @@ def process_file(file, option):
125
  print(file_extension)
126
  if file_extension in ['.pdf']:
127
  # Process PDF files
128
- if option == "Noc_timesheet_residential":
129
  print(file_path)
130
  Noc_timeSheet_pdf_to_img(file_path)
131
  print("here 2")
132
  result = get_image_informations("output.jpg", Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
133
  return result
134
- elif option == "Noc_timesheet_rotational":
135
  Noc_timeSheet_pdf_to_img(file_path)
136
  result = get_image_informations("output.jpg", Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser)
137
  return result
138
- elif option == "Noc_invoice":
139
  result = noc_invoice_extraction(file_path, save_dir)
140
  return result
 
 
 
 
 
 
 
 
141
  elif file_extension in ['.jpg', '.jpeg', '.png']:
142
  # Process image files directly
143
- if option == "Noc_timesheet_residential":
144
  result = get_image_informations(file_path, Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
145
  return result
146
- elif option == "Noc_timesheet_rotational":
147
  result = get_image_informations(file_path, Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser)
148
  return result
149
- elif option == "Noc_invoice":
150
  # For invoice images, we assume it's a single page
151
  result = get_image_informations(file_path, invoice_first_page_prompt, Noc_PurchaseOrder_information_parser)
152
  return result
 
 
 
 
 
 
153
  else:
154
  return "Unsupported file type. Please upload a PDF or image file."
155
  except Exception as e:
@@ -160,7 +231,7 @@ demo = gr.Interface(
160
  fn=process_file,
161
  inputs=[
162
  gr.File(label="Upload PDF or Image"), # File upload input
163
- gr.Radio(["Noc_timesheet_residential", "Noc_timesheet_rotational", "Noc_invoice"], label="Choose an option") # Radio buttons for options
164
  ],
165
  outputs="text", # Text output
166
  title="PDF/Image Processor",
 
36
 
37
  # Save the combined image
38
  combined_image.save(output_path, "JPEG", quality=quality)
39
+
40
def Clauses_in_invoice(pdf_path: str) -> bool:
    """
    Report whether the last page of a PDF mentions "clauses".

    The check is a case-insensitive substring match on the text of the
    final page. A PDF with fewer than two pages is reported as having no
    clauses page. Any error raised while opening or reading the file is
    printed and treated as "no clauses", so the caller always gets a bool.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        True if the last page's text contains "clauses", otherwise False.
    """
    # Bound before the try so the cleanup can test the handle directly
    # instead of probing locals().
    pdf_document = None
    try:
        pdf_document = pymupdf.open(pdf_path)
        total_pages = pdf_document.page_count
        if total_pages < 2:
            print("The PDF has fewer than 2 pages.")
            return False

        page_text = pdf_document.load_page(total_pages - 1).get_text()
        return "clauses" in page_text.lower()
    except Exception as e:
        # Deliberate best effort: an unreadable file counts as "no clauses".
        print(f"error :{e}")
        return False
    finally:
        # Ensure the PDF document is closed on every exit path.
        if pdf_document is not None:
            pdf_document.close()
68
+
69
def Clauses_in_invoice_2nd_version(pdf_path: str) -> bool:
    """
    Report whether the second-to-last page of a PDF mentions "clauses".

    The check is a case-insensitive substring match on the text of the
    page before the final one. A PDF with fewer than two pages has no such
    page and yields False. Any error raised while opening or reading the
    file is printed and treated as "no clauses".

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        True if the second-to-last page's text contains "clauses",
        otherwise False.
    """
    # Bound before the try so the cleanup can test the handle directly
    # instead of probing locals().
    pdf_document = None
    try:
        pdf_document = pymupdf.open(pdf_path)
        total_pages = pdf_document.page_count
        if total_pages < 2:
            print("The PDF has fewer than 2 pages.")
            return False

        page_text = pdf_document.load_page(total_pages - 2).get_text()
        return "clauses" in page_text.lower()
    except Exception as e:
        # Deliberate best effort: an unreadable file counts as "no clauses".
        print(f"error :{e}")
        return False
    finally:
        # Ensure the PDF document is closed on every exit path.
        if pdf_document is not None:
            pdf_document.close()
95
+
96
  def Noc_invoice_pdf_to_img(pdf_path: str, folder_path: str, dpi: int = 300, quality: int = 95):
97
  pdf_document = pymupdf.open(pdf_path)
98
  folder_path = folder_path.rstrip(os.sep)
 
149
  new_item = get_image_informations(image_paths[pic + 2], invoice_item_pages_prompt, Noc_PurchaseOrder_items_parser)
150
  for item in new_item["items"]:
151
  data["items"].append(item)
152
+ result = get_image_informations(image_paths[-1], invoice_total_page_prompt, Noc_PurchaseOrder_total_parser)
153
  data.update(result)
154
  delete_images(image_paths)
155
  return data
156
 
157
+
158
def pdf_to_img(pdf_path, dpi: int = 300, quality: int = 95):
    """
    Render the first page of a PDF to a JPEG named ``output.jpg``.

    The image is written to the current working directory, overwriting any
    existing ``output.jpg``.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Resolution passed to the page renderer.
        quality: JPEG quality passed to the image encoder.

    Returns:
        None. The result is the file written to disk.
    """
    output_path = "output.jpg"

    pdf_document = pymupdf.open(pdf_path)
    try:
        page = pdf_document.load_page(0)  # Load the first page

        # Convert the page to a pixmap (image)
        pix = page.get_pixmap(dpi=dpi)

        # Convert the pixmap to a PIL Image while the document is still open
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Release the file handle even if rendering fails.
        pdf_document.close()

    image.save(output_path, "JPEG", quality=quality)
170
+
171
+
172
  def process_file(file, option):
173
  if file is None:
174
  return "Please upload a PDF or image file."
 
182
  print(file_extension)
183
  if file_extension in ['.pdf']:
184
  # Process PDF files
185
+ if option == "Noc_timesheet_residential_old":
186
  print(file_path)
187
  Noc_timeSheet_pdf_to_img(file_path)
188
  print("here 2")
189
  result = get_image_informations("output.jpg", Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
190
  return result
191
+ elif option == "Noc_timesheet_rotational_old":
192
  Noc_timeSheet_pdf_to_img(file_path)
193
  result = get_image_informations("output.jpg", Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser)
194
  return result
195
+ elif option == "Noc_PO":
196
  result = noc_invoice_extraction(file_path, save_dir)
197
  return result
198
+ elif option =="Noc_timesheet_new":
199
+ pdf_to_img(file_path)
200
+ result = get_image_informations("output.jpg", Noc_timesheet_prompt, Noc_timesheet_parser_v1)
201
+ return result
202
+ elif option == "Noc_invoice":
203
+ pdf_to_img(file_path)
204
+ result = get_image_informations("output.jpg", Noc_invoice_prompt, Noc_invoice_parser_v1)
205
+ return result
206
  elif file_extension in ['.jpg', '.jpeg', '.png']:
207
  # Process image files directly
208
+ if option == "Noc_timesheet_residential_old":
209
  result = get_image_informations(file_path, Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
210
  return result
211
+ elif option == "Noc_timesheet_rotational_old":
212
  result = get_image_informations(file_path, Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser)
213
  return result
214
+ elif option == "Noc_PO":
215
  # For invoice images, we assume it's a single page
216
  result = get_image_informations(file_path, invoice_first_page_prompt, Noc_PurchaseOrder_information_parser)
217
  return result
218
+ elif option =="Noc_timesheet_new":
219
+ result = get_image_informations(file_path, Noc_timesheet_prompt, Noc_timesheet_parser_v1)
220
+ return result
221
+ elif option == "Noc_invoice":
222
+ result = get_image_informations(file_path, Noc_invoice_prompt, Noc_invoice_parser_v1)
223
+ return result
224
  else:
225
  return "Unsupported file type. Please upload a PDF or image file."
226
  except Exception as e:
 
231
  fn=process_file,
232
  inputs=[
233
  gr.File(label="Upload PDF or Image"), # File upload input
234
+ gr.Radio(["Noc_timesheet_new","Noc_invoice","Noc_timesheet_residential_old", "Noc_timesheet_rotational_old", "Noc_PO"], label="Choose an option") # Radio buttons for options
235
  ],
236
  outputs="text", # Text output
237
  title="PDF/Image Processor",
dataSchema.py CHANGED
@@ -1,6 +1,6 @@
1
  from pydantic import BaseModel, Field
2
  from typing import Optional,List
3
- from langchain_core.output_parsers import JsonOutputParser
4
 
5
  class Noc_Residential_TimeSheetInformation(BaseModel):
6
  """Details of a timesheet entry."""
@@ -53,6 +53,29 @@ class Noc_Rotational_TimeSheetInformation(BaseModel):
53
  approved_on : str = Field(...,description="DD/MM/YY of the stamp")
54
  approved_by : str = Field(...,description="Name of the person who approved the document")
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  class Noc_Item_Information(BaseModel):
57
  """Details of each item in the document."""
58
  item_number : int = Field(...,description="the number of the item")
@@ -106,7 +129,103 @@ class Noc_PurchaseOrderInformation(BaseModel):
106
  class Noc_Clauses(BaseModel):
107
  Clauses: str = Field(..., description="the contract clauses.")
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  Noc_Res_timesheet_prompt = """
111
  Based on the provided timesheet details, extract the following information:
112
  - Full name of the person
@@ -221,6 +340,7 @@ extract from the document:
221
  invoice_clauses_page_prompt = """
222
  extract from the document the clauses """
223
 
 
224
  # CHOOSING PARSER DEPENDING ON THE TYPE OF DOCUMENT
225
  Noc_Res_timeSheet_parser = JsonOutputParser(pydantic_object=Noc_Residential_TimeSheetInformation)
226
  Noc_Rot_timeSheet_parser = JsonOutputParser(pydantic_object=Noc_Rotational_TimeSheetInformation)
@@ -228,4 +348,6 @@ Noc_PurchaseOrder_information_parser = JsonOutputParser(pydantic_object=Noc_Purc
228
  Noc_PurchaseOrder_item1_parser = JsonOutputParser(pydantic_object=Noc_Document_Information)
229
  Noc_PurchaseOrder_items_parser = JsonOutputParser(pydantic_object=Noc_items)
230
  Noc_PurchaseOrder_total_parser = JsonOutputParser(pydantic_object=Noc_total)
231
- Noc_PurchaseOrder_clauses_parser = JsonOutputParser(pydantic_object=Noc_Clauses)
 
 
 
1
  from pydantic import BaseModel, Field
2
  from typing import Optional,List
3
+ from langchain_core.output_parsers import JsonOutputParser # type: ignore
4
 
5
  class Noc_Residential_TimeSheetInformation(BaseModel):
6
  """Details of a timesheet entry."""
 
53
  approved_on : str = Field(...,description="DD/MM/YY of the stamp")
54
  approved_by : str = Field(...,description="Name of the person who approved the document")
55
 
56
# Schema for the "Noc_timesheet_new" extraction; wrapped by
# Noc_timesheet_parser_v1. The field descriptions are part of the schema
# handed to the parser, so treat them as prompt text, not as comments.
class Noc_TimeSheetInformation(BaseModel):
    """Details of a timesheet entry."""
    # --- Identity of the person and the posting ---
    position_title: str = Field(..., description="Position title of the person.")
    work_location: str = Field(..., description="Work location ")
    agency: str = Field(..., description="the agency")
    noc_id: str = Field(..., description="NOC ID of the person.")
    full_name: str = Field(..., description="Full name of the person.")
    # --- Approval stamp; status falls back to "not approved" when absent ---
    approval_status: str = Field("not approved",description="(e.g., 'approved', 'not approved').")
    approved_on : str = Field(...,description="DD/MM/YY of the stamp")
    approved_by : str = Field(...,description="Name of the person who approved the document")

    # --- Day/hour counters; each defaults to 0 when not present ---
    service_days_onshore: int = Field(0, description="Number of service days onshore.")
    standby_days_onshore: int = Field(0, description="Number of standby days onshore in Doha.")
    service_days_offshore: int = Field(0, description="Number of service days offshore.")
    service_days_weekend_public_holiday: int = Field(0, description="Number of service days during weekends or public holidays.")
    standby_extended_hitch_days_offshore: int = Field(0, description="Number of standby and extended hitch days offshore.")
    extended_hitch_days_onshore: int = Field(0, description="Number of extended hitch days onshore for rotational personnel.")
    # NOTE(review): the names say "Hourly_Rate" but the descriptions ask for a
    # number of overtime hours — confirm which one consumers expect.
    overtime_Hourly_Rate_ONSHORE:int = Field(0,description="number of over time hours onshore (Over 8 hours) ")
    overtime_Hourly_Rate_OFFSHORE:int = Field(0,description="number of over time hours offshore (Over 12 hours) ")
    per_diem_days: int = Field(0, description="Number of Per Diem days for onshore/offshore rotational personnel.")
    training_days: int = Field(0, description="Number of training days.")
    travel_days: int = Field(0, description="Number of travel days.")
78
+
79
  class Noc_Item_Information(BaseModel):
80
  """Details of each item in the document."""
81
  item_number : int = Field(...,description="the number of the item")
 
129
  class Noc_Clauses(BaseModel):
130
  Clauses: str = Field(..., description="the contract clauses.")
131
 
132
# One row of the invoice's services table (used as List[service] in
# Noc_Invoice). Documented with comments rather than a docstring so the
# schema text handed to the parser stays unchanged.
class service(BaseModel):
    service: str = Field(..., description="the service name.")
    # Fields that default to None are declared Optional so the annotation
    # agrees with the default and a missing/null value is valid.
    from_date: Optional[str] = Field(None,description="starting date in DD/MM/YYYY format..")
    to_date: Optional[str] = Field(None,description="ending date in DD/MM/YYYY format.")
    currency : str =Field(...,description="currency of the rate.")
    fx: Optional[str] = Field(None,description="foreign exchange.")
    Number_of_days_hours: int = Field(...,description="number of hours or days for the service.")
    rate: float = Field(..., description="the rate of the service.")
    total: float = Field(...,description="total which is the rate* No of days/hours .")
141
+
142
# Bank account block of an invoice (used as List[bank_details] in
# Noc_Invoice). It must derive from BaseModel: a plain class cannot be used
# as a field type of a pydantic model, and its Field(...) declarations would
# otherwise be inert class attributes.
class bank_details(BaseModel):
    bank_name: str = Field(..., description="Name of the bank.")
    # Optional because the default is None (the code may be absent).
    swift_bic_code: Optional[str] = Field(None, description=" SWIFT/BIC CODE.")
    iban_number: str = Field(...,description="IBAN Number.")
    beneficiary_name:str = Field(...,description="full name")
    account_currency: str = Field(...,description="Account currency.")
    expected_amount: str = Field(...,description="the amount.")
149
+
150
# Top-level schema for the "Noc_invoice" extraction; wrapped by
# Noc_invoice_parser_v1.
class Noc_Invoice(BaseModel):
    """Details of an invoice."""
    # --- Header ---
    invoice_date: str = Field(..., description="Date of the invoice in DD/MM/YYYY format")
    invoice_number: str = Field(..., description="Unique identifier for the invoice.")
    full_name: str = Field(..., description="Full name of the person.")
    invoice_to:str = Field(...,description="email to send the invoice forward to.")
    company_name:str = Field(...,description="company name")
    address: str = Field(..., description="Address of the company.")

    # --- Services table and totals ---
    # Fields that default to None are declared Optional so the annotation
    # agrees with the default.
    services : Optional[List[service]] = Field(None,description="list of services in the table.")
    sub_total: float = Field(...,description="the sub total.")
    vat: Optional[float] = Field(None, description="the vat.")
    total_due:float= Field(...,description="the total due.")

    # --- Payment details ---
    first_bank : List[bank_details]=Field(...,description="first bank informations.")
    second_details : List[bank_details]=Field(...,description="second bank details")
166
+
167
+
168
+
169
# Instruction text sent together with Noc_timesheet_parser_v1 for the
# "Noc_timesheet_new" option. The bullets follow the fields of
# Noc_TimeSheetInformation in order.
Noc_timesheet_prompt = """
Based on the provided timesheet details, extract the following information:
- Position title of the person.
- Work location .
- the agency.
- NOC ID of the person.
- Name of the person.
- approval status
- date of the approval
- approved by
- Number of service days onshore
- Number of standby days onshore in Doha
- Number of service days offshore
- Number of service days during weekends or public holidays
- Number of standby and extended hitch days offshore
- Number of extended hitch days onshore for rotational personnel
- Number of over time hours onshore (Over 8 hours)
- Number of over time hours offshore (Over 12 hours)
- Number of Per Diem days for onshore/offshore rotational personnel
- Number of training days
- Number of travel days

"""
192
 
193
# Instruction text sent together with Noc_invoice_parser_v1 for the
# "Noc_invoice" option. The opening sentence names the document as an
# invoice (it previously said "timesheet", carried over from the timesheet
# prompt). Sub-fields are nested under their parent bullet so the per-service
# and per-bank items read as belonging to that list.
Noc_invoice_prompt ="""
Based on the provided invoice details, extract the following information:
- Invoice date
- Invoice number
- Full name of the person
- Email to send the invoice forward to.
- Company name
- Address of the company
- List of services,for each existing service provide:
    - the service name.
    - starting date in DD/MM/YYYY format.
    - ending date in DD/MM/YYYY format.
    - currency of the rate.
    - foreign exchange.
    - number of hours or days for the service.
    - the rate of the service.
    - total which is the rate* No of days/hours .
- Sub total
- VAT
- Total due
- First bank informations, for each existing bank provide, do not mess up the iban:
    - Name of the bank.
    - SWIFT/BIC CODE.
    - IBAN Number.
    - full name.
    - Account currency.
    - the amount.
- Second bank informations, for each existing bank provide,do not mess up the iban:
    - Name of the bank.
    - SWIFT/BIC CODE.
    - IBAN Number.
    - full name.
    - Account currency.
    - the amount.
"""
228
+
229
  Noc_Res_timesheet_prompt = """
230
  Based on the provided timesheet details, extract the following information:
231
  - Full name of the person
 
340
  invoice_clauses_page_prompt = """
341
  extract from the document the clauses """
342
 
343
+
344
  # CHOOSING PARSER DEPENDING ON THE TYPE OF DOCUMENT
345
  Noc_Res_timeSheet_parser = JsonOutputParser(pydantic_object=Noc_Residential_TimeSheetInformation)
346
  Noc_Rot_timeSheet_parser = JsonOutputParser(pydantic_object=Noc_Rotational_TimeSheetInformation)
 
348
  Noc_PurchaseOrder_item1_parser = JsonOutputParser(pydantic_object=Noc_Document_Information)
349
  Noc_PurchaseOrder_items_parser = JsonOutputParser(pydantic_object=Noc_items)
350
  Noc_PurchaseOrder_total_parser = JsonOutputParser(pydantic_object=Noc_total)
351
+ Noc_PurchaseOrder_clauses_parser = JsonOutputParser(pydantic_object=Noc_Clauses)
352
+ Noc_invoice_parser_v1 = JsonOutputParser(pydantic_object=Noc_Invoice)
353
+ Noc_timesheet_parser_v1 = JsonOutputParser(pydantic_object=Noc_TimeSheetInformation)