Amamrnaf committed on
Commit
2d39f2f
·
1 Parent(s): daaaba0
Files changed (3) hide show
  1. app.py +25 -1
  2. dataSchema.py +1 -1
  3. excel_to_pdf.py +63 -0
app.py CHANGED
@@ -3,6 +3,7 @@ import pymupdf #type: ignore
3
  from PIL import Image
4
  import os
5
  from functions import get_image_informations
 
6
  from dataSchema import *
7
  # import shutil
8
 
@@ -183,7 +184,7 @@ def process_file(file, option):
183
  if file_extension in ['.pdf']:
184
  # Process PDF files
185
  if option == "Noc_timesheet_residential_old":
186
- print(file_path)
187
  Noc_timeSheet_pdf_to_img(file_path)
188
  print("here 2")
189
  result = get_image_informations("output.jpg", Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
@@ -221,6 +222,29 @@ def process_file(file, option):
221
  elif option == "Noc_invoice":
222
  result = get_image_informations(file_path, Noc_invoice_prompt, Noc_invoice_parser_v1)
223
  return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  else:
225
  return "Unsupported file type. Please upload a PDF or image file."
226
  except Exception as e:
 
3
  from PIL import Image
4
  import os
5
  from functions import get_image_informations
6
+ from excel_to_pdf import excel_to_pdf
7
  from dataSchema import *
8
  # import shutil
9
 
 
184
  if file_extension in ['.pdf']:
185
  # Process PDF files
186
  if option == "Noc_timesheet_residential_old":
187
+
188
  Noc_timeSheet_pdf_to_img(file_path)
189
  print("here 2")
190
  result = get_image_informations("output.jpg", Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
 
222
  elif option == "Noc_invoice":
223
  result = get_image_informations(file_path, Noc_invoice_prompt, Noc_invoice_parser_v1)
224
  return result
225
+
226
+ elif file_extension in ['.xls','.xlsx']:
227
+
228
+ if option == "Noc_timesheet_residential_old":
229
+ Noc_timeSheet_pdf_to_img(file_path)
230
+ print("here 2")
231
+ result = get_image_informations("output.jpg", Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
232
+ return result
233
+ elif option == "Noc_timesheet_rotational_old":
234
+ Noc_timeSheet_pdf_to_img(file_path)
235
+ result = get_image_informations("output.jpg", Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser)
236
+ return result
237
+ elif option == "Noc_PO":
238
+ result = noc_invoice_extraction(file_path, save_dir)
239
+ return result
240
+ elif option =="Noc_timesheet_new":
241
+ pdf_to_img(file_path)
242
+ result = get_image_informations("output.jpg", Noc_timesheet_prompt, Noc_timesheet_parser_v1)
243
+ return result
244
+ elif option == "Noc_invoice":
245
+ pdf_to_img(file_path)
246
+ result = get_image_informations("output.jpg", Noc_invoice_prompt, Noc_invoice_parser_v1)
247
+ return result
248
  else:
249
  return "Unsupported file type. Please upload a PDF or image file."
250
  except Exception as e:
dataSchema.py CHANGED
@@ -184,7 +184,7 @@ Based on the provided timesheet details, extract the following information:
184
  - Number of extended hitch days onshore for rotational personnel
185
  - Number of over time hours onshore (Over 8 hours)
186
  - Number of over time hours offshore (Over 12 hours)
187
- - Number of Per Diem days for onshore/offshore rotational personnel
188
  - Number of training days
189
  - Number of travel days
190
 
 
184
  - Number of extended hitch days onshore for rotational personnel
185
  - Number of over time hours onshore (Over 8 hours)
186
  - Number of over time hours offshore (Over 12 hours)
187
+ - Number of Per Diem
188
  - Number of training days
189
  - Number of travel days
190
 
excel_to_pdf.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import xlrd # Reads old .xls files
4
+ from openpyxl import Workbook
5
+ from openpyxl import load_workbook
6
+ from reportlab.pdfgen import canvas
7
+ from reportlab.lib.pagesizes import landscape, portrait, A0, A1, A2, A3, A4
8
+ from reportlab.lib.colors import Color, black
9
+
10
+
11
def convert_xls_to_xlsx(xls_path):
    """Convert a legacy ``.xls`` workbook to ``.xlsx`` and return the new path.

    Every sheet of the source workbook is recreated under the same name and
    each cell's raw value is copied across. Only values are copied: styles,
    formulas, merged ranges and number formats are not carried over.

    Args:
        xls_path: Path to the input file. A path that does not end in
            ``.xls`` (compared case-insensitively) is returned unchanged
            and nothing is read or written.

    Returns:
        The path of the written ``.xlsx`` file, or ``xls_path`` itself when
        no conversion was needed.
    """
    legacy_suffix = ".xls"

    # Compare case-insensitively so "REPORT.XLS" is converted as well
    # instead of being handed to openpyxl, which cannot read .xls files.
    if not xls_path.lower().endswith(legacy_suffix):
        return xls_path

    # Swap only the trailing extension. str.replace() would rewrite every
    # ".xls" occurrence in the path (e.g. a directory called "old.xls_files").
    xlsx_path = xls_path[: -len(legacy_suffix)] + ".xlsx"

    # Open .xls file using xlrd
    book = xlrd.open_workbook(xls_path)
    new_book = Workbook()
    # Drop the default sheet openpyxl creates so only the source sheets remain.
    new_book.remove(new_book.active)

    for sheet_index in range(book.nsheets):
        sheet = book.sheet_by_index(sheet_index)
        new_sheet = new_book.create_sheet(title=sheet.name)

        for row_idx in range(sheet.nrows):
            for col_idx in range(sheet.ncols):
                cell_value = sheet.cell(row_idx, col_idx).value
                # xlrd indexes from 0, openpyxl from 1.
                new_sheet.cell(row=row_idx + 1, column=col_idx + 1, value=cell_value)

    new_book.save(xlsx_path)
    print(f"Converted {xls_path} to {xlsx_path}")

    return xlsx_path
36
+
37
def excel_to_pdf(excel_file, pdf_file="output.pdf"):
    """Render the cell values of an Excel workbook as plain text in a PDF.

    A ``.xls`` input is first converted with ``convert_xls_to_xlsx``. Each
    worksheet starts on a new A2 page; cells are drawn on a fixed grid
    (150 pt per column, 20 pt per row) from the top-left corner. When a
    sheet has more rows than fit on one page, drawing continues on a new
    page instead of running off the bottom.

    Columns that extend past the right edge of the page are still drawn at
    their grid position and are therefore clipped by the page boundary.

    Args:
        excel_file: Path to the ``.xls`` or ``.xlsx`` workbook.
        pdf_file: Path of the PDF to write. Defaults to ``"output.pdf"``.
    """
    excel_file = convert_xls_to_xlsx(excel_file)  # Convert if .xls
    # data_only=True yields cached formula results rather than formula text.
    workbook = load_workbook(excel_file, data_only=True)
    c = canvas.Canvas(pdf_file)

    page_size = A2  # page size, customize as needed
    _, page_height = page_size

    left_margin = 10
    column_width = 150
    row_height = 20
    top_y = page_height - 20  # Start from top
    bottom_y = 20  # Lowest baseline still drawn on the current page

    for sheet_index, sheet in enumerate(workbook.worksheets):
        # Close the previous sheet's page before starting this one.
        if sheet_index > 0:
            c.showPage()
        c.setPageSize(page_size)

        y = top_y
        for row in sheet.iter_rows():
            # Continue on a fresh page rather than drawing below the page edge.
            if y < bottom_y:
                c.showPage()
                c.setPageSize(page_size)
                y = top_y

            x = left_margin
            for cell in row:
                value = cell.value
                # Only None means "empty"; 0, 0.0 and False must still be printed.
                c.drawString(x, y, "" if value is None else str(value))
                x += column_width
            y -= row_height

    c.save()