Daemontatox committed on
Commit
3e972e7
·
verified ·
1 Parent(s): 1033f29

Update app.py

Browse files

Try this out and let me know what you think. Maybe try it locally first; if it's not working, we can fix it up.
Key changes:
**File Type Handling**: The process_file function now checks the file extension to determine if the uploaded file is a PDF or an image.
**Image Processing**: If the file is an image, it processes it directly using the get_image_informations function.
**Error Handling**: Added a check for unsupported file types and provided appropriate feedback.

Files changed (1) hide show
  1. app.py +58 -79
app.py CHANGED
@@ -6,9 +6,7 @@ from functions import get_image_informations
6
  from dataSchema import *
7
  # import shutil
8
 
9
-
10
-
11
- def Noc_timeSheet_pdf_to_img(pdf_path,output_path,dpi: int = 300, quality: int = 95):
12
  pdf_document = pymupdf.open(pdf_path)
13
 
14
  # Get the first page of the PDF
@@ -17,16 +15,15 @@ def Noc_timeSheet_pdf_to_img(pdf_path,output_path,dpi: int = 300, quality: int =
17
  # Convert the page to a pixmap (image)
18
  pix = page.get_pixmap(dpi=dpi)
19
 
20
-
21
  # Convert the pixmap to a PIL Image and save as JPG
22
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
23
 
24
  width, height = image.size
25
- start_y_total_table = int(height* 0.42)
26
- end_y_first_table = int(height*0.30)
27
 
28
- croped1 = image.crop((0, 0, width//2, end_y_first_table))
29
- croped2 = image.crop((0, start_y_total_table, width//2, height))
30
  upper_width, upper_height = croped1.size
31
  lower_width, lower_height = croped2.size
32
  combined_image = Image.new('RGB', (upper_width, upper_height + lower_height))
@@ -38,20 +35,8 @@ def Noc_timeSheet_pdf_to_img(pdf_path,output_path,dpi: int = 300, quality: int =
38
  combined_image.paste(croped2, (0, upper_height))
39
 
40
  # Save the combined image
41
- combined_image.save(output_path, "JPEG",quality=quality)
42
-
43
- #-----------S3------------ need S3_BUCKET,S3_REGION,S3_URL
44
- # import boto3
45
-
46
- # s3_client = boto3.client('s3', region_name=S3_REGION)
47
- # s3_client.upload_file(output_path, S3_BUCKET, key)
48
-
49
- # file_url = f"{S3_URL}/{key}"
50
-
51
- # return file_url
52
-
53
- # return output_path
54
-
55
  def Clauses_in_invoice(pdf_path: str) -> bool:
56
  """
57
  Extract text from the last page of a PDF.
@@ -65,16 +50,15 @@ def Clauses_in_invoice(pdf_path: str) -> bool:
65
  return True
66
  else:
67
  return False
68
-
69
  def Noc_invoice_pdf_to_img(pdf_path: str, folder_path: str, dpi: int = 300, quality: int = 95):
70
-
71
  pdf_document = pymupdf.open(pdf_path)
72
  folder_path = folder_path.rstrip(os.sep)
73
  os.makedirs(folder_path, exist_ok=True)
74
 
75
  pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
76
  total_pages = pdf_document.page_count
77
- image_paths=[]
78
  for page_num in range(total_pages):
79
  page = pdf_document.load_page(page_num)
80
  pix = page.get_pixmap(dpi=dpi)
@@ -82,19 +66,6 @@ def Noc_invoice_pdf_to_img(pdf_path: str, folder_path: str, dpi: int = 300, qual
82
 
83
  output_path = os.path.join(folder_path, f"{pdf_name}_page_{page_num + 1}.jpg")
84
  image.save(output_path, "JPEG", quality=quality)
85
-
86
- #-----------S3------------ need S3_BUCKET,S3_REGION,S3_URL
87
- # import boto3
88
-
89
- # s3_client = boto3.client('s3', region_name=S3_REGION)
90
- # s3_client.upload_file(output_path, S3_BUCKET, key)
91
-
92
- # file_url = f"{S3_URL}/{key}"
93
-
94
- # append the s3 links
95
- # image_paths.append(file_url)
96
-
97
-
98
  image_paths.append(output_path)
99
 
100
  pdf_document.close()
@@ -113,84 +84,92 @@ def delete_images(image_paths):
113
  except Exception as e:
114
  print(f"Error deleting {image_path}: {e}")
115
 
116
- def noc_invoice_extraction(pdf_path: str,folder_path):
117
-
118
- image_paths=Noc_invoice_pdf_to_img(pdf_path,folder_path)
119
  data = {}
120
- result = get_image_informations(image_paths[0],invoice_first_page_prompt,Noc_PurchaseOrder_information_parser)
121
  data.update(result)
122
- result = get_image_informations(image_paths[1],invoice_item_page1_prompt,Noc_PurchaseOrder_item1_parser)
123
  data.update(result)
124
  if Clauses_in_invoice(pdf_path):
125
- for pic in range(len(image_paths)-4):
126
- new_item = get_image_informations(image_paths[pic+2],invoice_item_pages_prompt,Noc_PurchaseOrder_items_parser)
127
  for item in new_item["items"]:
128
  data["items"].append(item)
129
- result = get_image_informations(image_paths[-2],invoice_total_page_prompt,Noc_PurchaseOrder_total_parser)
130
  data.update(result)
131
- result = get_image_informations(image_paths[-1],invoice_clauses_page_prompt,Noc_PurchaseOrder_clauses_parser)
132
  data.update(result)
133
  delete_images(image_paths)
134
  return data
135
  else:
136
- for pic in range(len(image_paths)-3):
137
- new_item = get_image_informations(image_paths[pic+2],invoice_item_pages_prompt,Noc_PurchaseOrder_items_parser)
138
  for item in new_item["items"]:
139
  data["items"].append(item)
140
- result = get_image_informations(image_paths[-2],invoice_total_page_prompt,Noc_PurchaseOrder_total_parser)
141
  data.update(result)
142
  delete_images(image_paths)
143
  return data
144
-
145
 
146
- def process_pdf(file, option):
147
  if file is None:
148
- return "Please upload a PDF file."
149
 
150
  try:
151
-
152
  save_dir = "uploaded_files"
153
  os.makedirs(save_dir, exist_ok=True) # Create the directory if it doesn't exist
154
 
155
-
156
- # Save the uploaded file to the new location
157
  file_path = file.name
158
-
159
- # Process based on the selected option
160
- if option == "Noc_timesheet_residential":
161
- Noc_timeSheet_pdf_to_img(file_path,"output.jpg")
162
- result = get_image_informations("output.jpg",Noc_Res_timesheet_prompt,Noc_Res_timeSheet_parser)
163
- return result
164
- elif option == "Noc_timesheet_rotational":
165
- Noc_timeSheet_pdf_to_img(file_path,"output.jpg")
166
- result = get_image_informations("output.jpg",Noc_Rot_timesheet_prompt,Noc_Rot_timeSheet_parser)
167
- return result
168
- elif option=="Noc_invoice":
169
- result = noc_invoice_extraction(file_path,save_dir)
170
- return result
171
-
172
- # else:
173
- # return "Invalid option selected."
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  except Exception as e:
175
  return f"An error occurred: {e}"
176
 
177
  # Define the Gradio interface
178
  demo = gr.Interface(
179
- fn=process_pdf,
180
  inputs=[
181
- gr.File(label="Upload PDF"), # File upload input
182
- gr.Radio(["Noc_timesheet_residential","Noc_timesheet_rotational", "Noc_invoice"], label="Choose an option") # Radio buttons for options
183
  ],
184
  outputs="text", # Text output
185
- title="PDF Processor",
186
- description="Upload a PDF and choose an option to process the text."
187
  )
188
 
189
  with gr.Blocks() as app:
190
  demo.render()
191
- gr.Markdown("### pdf examples") # Section title
192
  with gr.Row():
193
  gr.Image("TS.png", label="NOC timesheet example")
194
  gr.Image("invoice.png", label="NOC invoice example")
195
 
196
- app.launch()
 
6
  from dataSchema import *
7
  # import shutil
8
 
9
+ def Noc_timeSheet_pdf_to_img(pdf_path, output_path, dpi: int = 300, quality: int = 95):
 
 
10
  pdf_document = pymupdf.open(pdf_path)
11
 
12
  # Get the first page of the PDF
 
15
  # Convert the page to a pixmap (image)
16
  pix = page.get_pixmap(dpi=dpi)
17
 
 
18
  # Convert the pixmap to a PIL Image and save as JPG
19
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
20
 
21
  width, height = image.size
22
+ start_y_total_table = int(height * 0.42)
23
+ end_y_first_table = int(height * 0.30)
24
 
25
+ croped1 = image.crop((0, 0, width // 2, end_y_first_table))
26
+ croped2 = image.crop((0, start_y_total_table, width // 2, height))
27
  upper_width, upper_height = croped1.size
28
  lower_width, lower_height = croped2.size
29
  combined_image = Image.new('RGB', (upper_width, upper_height + lower_height))
 
35
  combined_image.paste(croped2, (0, upper_height))
36
 
37
  # Save the combined image
38
+ combined_image.save(output_path, "JPEG", quality=quality)
39
+
 
 
 
 
 
 
 
 
 
 
 
 
40
  def Clauses_in_invoice(pdf_path: str) -> bool:
41
  """
42
  Extract text from the last page of a PDF.
 
50
  return True
51
  else:
52
  return False
53
+
54
  def Noc_invoice_pdf_to_img(pdf_path: str, folder_path: str, dpi: int = 300, quality: int = 95):
 
55
  pdf_document = pymupdf.open(pdf_path)
56
  folder_path = folder_path.rstrip(os.sep)
57
  os.makedirs(folder_path, exist_ok=True)
58
 
59
  pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
60
  total_pages = pdf_document.page_count
61
+ image_paths = []
62
  for page_num in range(total_pages):
63
  page = pdf_document.load_page(page_num)
64
  pix = page.get_pixmap(dpi=dpi)
 
66
 
67
  output_path = os.path.join(folder_path, f"{pdf_name}_page_{page_num + 1}.jpg")
68
  image.save(output_path, "JPEG", quality=quality)
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  image_paths.append(output_path)
70
 
71
  pdf_document.close()
 
84
  except Exception as e:
85
  print(f"Error deleting {image_path}: {e}")
86
 
87
+ def noc_invoice_extraction(pdf_path: str, folder_path):
88
+ image_paths = Noc_invoice_pdf_to_img(pdf_path, folder_path)
 
89
  data = {}
90
+ result = get_image_informations(image_paths[0], invoice_first_page_prompt, Noc_PurchaseOrder_information_parser)
91
  data.update(result)
92
+ result = get_image_informations(image_paths[1], invoice_item_page1_prompt, Noc_PurchaseOrder_item1_parser)
93
  data.update(result)
94
  if Clauses_in_invoice(pdf_path):
95
+ for pic in range(len(image_paths) - 4):
96
+ new_item = get_image_informations(image_paths[pic + 2], invoice_item_pages_prompt, Noc_PurchaseOrder_items_parser)
97
  for item in new_item["items"]:
98
  data["items"].append(item)
99
+ result = get_image_informations(image_paths[-2], invoice_total_page_prompt, Noc_PurchaseOrder_total_parser)
100
  data.update(result)
101
+ result = get_image_informations(image_paths[-1], invoice_clauses_page_prompt, Noc_PurchaseOrder_clauses_parser)
102
  data.update(result)
103
  delete_images(image_paths)
104
  return data
105
  else:
106
+ for pic in range(len(image_paths) - 3):
107
+ new_item = get_image_informations(image_paths[pic + 2], invoice_item_pages_prompt, Noc_PurchaseOrder_items_parser)
108
  for item in new_item["items"]:
109
  data["items"].append(item)
110
+ result = get_image_informations(image_paths[-2], invoice_total_page_prompt, Noc_PurchaseOrder_total_parser)
111
  data.update(result)
112
  delete_images(image_paths)
113
  return data
 
114
 
115
+ def process_file(file, option):
116
  if file is None:
117
+ return "Please upload a PDF or image file."
118
 
119
  try:
 
120
  save_dir = "uploaded_files"
121
  os.makedirs(save_dir, exist_ok=True) # Create the directory if it doesn't exist
122
 
 
 
123
  file_path = file.name
124
+ file_extension = os.path.splitext(file_path)[1].lower()
125
+
126
+ if file_extension in ['.pdf']:
127
+ # Process PDF files
128
+ if option == "Noc_timesheet_residential":
129
+ Noc_timeSheet_pdf_to_img(file_path, "output.jpg")
130
+ result = get_image_informations("output.jpg", Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
131
+ return result
132
+ elif option == "Noc_timesheet_rotational":
133
+ Noc_timeSheet_pdf_to_img(file_path, "output.jpg")
134
+ result = get_image_informations("output.jpg", Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser)
135
+ return result
136
+ elif option == "Noc_invoice":
137
+ result = noc_invoice_extraction(file_path, save_dir)
138
+ return result
139
+ elif file_extension in ['.jpg', '.jpeg', '.png']:
140
+ # Process image files directly
141
+ if option == "Noc_timesheet_residential":
142
+ result = get_image_informations(file_path, Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
143
+ return result
144
+ elif option == "Noc_timesheet_rotational":
145
+ result = get_image_informations(file_path, Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser)
146
+ return result
147
+ elif option == "Noc_invoice":
148
+ # For invoice images, we assume it's a single page
149
+ result = get_image_informations(file_path, invoice_first_page_prompt, Noc_PurchaseOrder_information_parser)
150
+ return result
151
+ else:
152
+ return "Unsupported file type. Please upload a PDF or image file."
153
  except Exception as e:
154
  return f"An error occurred: {e}"
155
 
156
  # Define the Gradio interface
157
  demo = gr.Interface(
158
+ fn=process_file,
159
  inputs=[
160
+ gr.File(label="Upload PDF or Image"), # File upload input
161
+ gr.Radio(["Noc_timesheet_residential", "Noc_timesheet_rotational", "Noc_invoice"], label="Choose an option") # Radio buttons for options
162
  ],
163
  outputs="text", # Text output
164
+ title="PDF/Image Processor",
165
+ description="Upload a PDF or image and choose an option to process the content."
166
  )
167
 
168
  with gr.Blocks() as app:
169
  demo.render()
170
+ gr.Markdown("### PDF/Image examples") # Section title
171
  with gr.Row():
172
  gr.Image("TS.png", label="NOC timesheet example")
173
  gr.Image("invoice.png", label="NOC invoice example")
174
 
175
+ app.launch()