Amamrnaf committed on
Commit
6e805b9
·
1 Parent(s): 2cf3347

app done ?

Browse files
Files changed (3) hide show
  1. app.py +158 -13
  2. dataSchema.py +12 -0
  3. functions.py +48 -0
app.py CHANGED
@@ -1,25 +1,170 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF for handling PDF files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def process_pdf(file, option):
5
  if file is None:
6
  return "Please upload a PDF file."
7
 
8
  try:
9
- # Open the PDF file
10
- doc = fitz.open(file.name)
11
- text = ""
12
- for page in doc:
13
- text += page.get_text()
14
- doc.close()
 
 
15
 
16
  # Process based on the selected option
17
- if option == "Option 1":
18
- return f"Option 1 selected. Extracted text:\n{text[:500]}..." # Truncated for brevity
19
- elif option == "Option 2":
20
- return f"Option 2 selected. Extracted text:\n{text[:500]}..." # Truncated for brevity
21
- else:
22
- return "Invalid option selected."
 
 
23
  except Exception as e:
24
  return f"An error occurred: {e}"
25
 
 
1
  import gradio as gr
2
+ import pymupdf # PyMuPDF for handling PDF files
3
+ from PIL import Image
4
+ import os
5
+ from functions import get_image_informations
6
+ from dataSchema import *
7
+
8
+
9
+
10
def Noc_timeSheet_pdf_to_img(pdf_path, output_path, dpi: int = 300, quality: int = 95):
    """Render the first page of a PDF and save a cropped, stitched JPEG.

    Only the left half of the page is kept. Two horizontal bands are cut
    from it -- the top 30% of the page height and everything from 42% of
    the height down to the bottom -- and stacked vertically into a single
    image, which drops the band in between.

    Args:
        pdf_path: Path of the PDF to render.
        output_path: Where the resulting JPEG is written.
        dpi: Resolution used when rasterising the page.
        quality: JPEG quality passed to ``Image.save``.

    Returns:
        ``output_path``, so callers can chain on the written file.
    """
    pdf_document = pymupdf.open(pdf_path)
    try:
        first_page = pdf_document.load_page(0)  # 0 is the first page
        pix = first_page.get_pixmap(dpi=dpi)
        # frombytes copies the pixel data, so the image stays valid after
        # the document is closed below.
        image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Always release the document handle, even when rendering fails.
        pdf_document.close()

    width, height = image.size
    start_y_total_table = int(height * 0.42)
    end_y_first_table = int(height * 0.30)
    half_width = width // 2

    upper = image.crop((0, 0, half_width, end_y_first_table))
    lower = image.crop((0, start_y_total_table, half_width, height))
    upper_width, upper_height = upper.size
    _, lower_height = lower.size

    # Stack the two bands: upper band on top, lower band directly beneath.
    combined_image = Image.new("RGB", (upper_width, upper_height + lower_height))
    combined_image.paste(upper, (0, 0))
    combined_image.paste(lower, (0, upper_height))
    combined_image.save(output_path, "JPEG", quality=quality)

    # TODO: optional S3 upload (needs S3_BUCKET, S3_REGION, S3_URL); when
    # added, upload output_path with boto3 and return the resulting URL.
    return output_path
53
+
54
def Clauses_in_invoice(pdf_path: str) -> bool:
    """Report whether the last page of a PDF mentions "clauses".

    The text of the final page is extracted and searched
    case-insensitively for the substring ``"clauses"``.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        True if the last page's text contains "clauses", otherwise False.
    """
    pdf_document = pymupdf.open(pdf_path)
    try:
        last_page = pdf_document.load_page(pdf_document.page_count - 1)
        text = last_page.get_text()
    finally:
        # Close the document even if page loading or text extraction fails.
        pdf_document.close()
    return "clauses" in text.lower()
67
+
68
def Noc_invoice_pdf_to_img(pdf_path: str, folder_path: str, dpi: int = 300, quality: int = 95):
    """Render every page of a PDF to its own JPEG inside ``folder_path``.

    Files are named ``<pdf basename>_page_<n>.jpg`` with ``n`` starting at 1.
    The folder is created if it does not exist.

    Args:
        pdf_path: Path of the PDF to render.
        folder_path: Directory that receives the page images.
        dpi: Resolution used when rasterising each page.
        quality: JPEG quality passed to ``Image.save``.

    Returns:
        List of written image paths, in page order.
    """
    image_paths = []
    pdf_document = pymupdf.open(pdf_path)
    try:
        folder_path = folder_path.rstrip(os.sep)
        os.makedirs(folder_path, exist_ok=True)
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

        for page_num in range(pdf_document.page_count):
            pix = pdf_document.load_page(page_num).get_pixmap(dpi=dpi)
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            output_path = os.path.join(folder_path, f"{pdf_name}_page_{page_num + 1}.jpg")
            image.save(output_path, "JPEG", quality=quality)

            # TODO: optional S3 upload (needs S3_BUCKET, S3_REGION, S3_URL);
            # when added, append the uploaded file URL instead of the path.
            image_paths.append(output_path)
    finally:
        # Release the document handle even if a page fails to render/save.
        pdf_document.close()
    return image_paths
101
+
102
def delete_images(image_paths):
    """Best-effort removal of the given files.

    Each path is removed independently; a failure on one path is reported
    on stdout and never stops the remaining deletions or raises to the
    caller.

    Args:
        image_paths: Iterable of file paths to delete.
    """
    for image_path in image_paths:
        try:
            # Attempt the removal directly instead of checking existence
            # first: the file may vanish between the check and the remove.
            os.remove(image_path)
        except FileNotFoundError:
            print(f"File not found: {image_path}")
        except (OSError, TypeError, ValueError) as e:
            # OSError: permissions / path is a directory; TypeError and
            # ValueError: malformed path values. Cleanup must not crash.
            print(f"Error deleting {image_path}: {e}")
        else:
            print(f"Deleted: {image_path}")
114
+
115
def noc_invoice_extraction(pdf_path: str, folder_path):
    """Extract purchase-order data from a PDF, page by page.

    The PDF is rendered to one image per page, and each image is sent
    through ``get_image_informations`` with the prompt/parser matching its
    position in the document:

    * page 1: general order information
    * page 2: first item page
    * middle pages: further item pages, appended to ``data["items"]``
    * total page, followed by a clauses page when ``Clauses_in_invoice``
      reports one

    NOTE(review): without a clauses page the total is now read from the
    LAST page. The previous code read it from the second-to-last page,
    which the item loop had already consumed, and never looked at the last
    page. Confirm this against a real document without clauses.

    Args:
        pdf_path: Path of the purchase-order PDF.
        folder_path: Directory used for the temporary page images.

    Returns:
        Dict merging the parsed results of all pages.
    """
    image_paths = Noc_invoice_pdf_to_img(pdf_path, folder_path)
    try:
        data = {}
        data.update(get_image_informations(
            image_paths[0], invoice_first_page_prompt, Noc_PurchaseOrder_information_parser))
        data.update(get_image_informations(
            image_paths[1], invoice_item_page1_prompt, Noc_PurchaseOrder_item1_parser))

        has_clauses = Clauses_in_invoice(pdf_path)
        # Trailing pages that are not item pages: total (+ clauses).
        trailing_pages = 2 if has_clauses else 1
        total_index = -2 if has_clauses else -1

        # The first item page may not have produced an "items" list.
        items = data.setdefault("items", [])
        for image_path in image_paths[2:len(image_paths) - trailing_pages]:
            page_result = get_image_informations(
                image_path, invoice_item_pages_prompt, Noc_PurchaseOrder_items_parser)
            items.extend(page_result.get("items") or [])

        data.update(get_image_informations(
            image_paths[total_index], invoice_total_page_prompt, Noc_PurchaseOrder_total_parser))
        if has_clauses:
            data.update(get_image_informations(
                image_paths[-1], invoice_clauses_page_prompt, Noc_PurchaseOrder_clauses_parser))
        return data
    finally:
        # Remove the temporary page images on success and on failure alike.
        delete_images(image_paths)
143
+
144
 
def _save_upload(file, save_dir):
    """Copy an uploaded file into ``save_dir`` and return the saved path.

    ``file`` may be a filesystem path or an object exposing ``.name`` (and
    optionally ``.read()``). Only the base name of the upload is used for
    the destination, so an absolute temp-file path can never make the
    destination resolve back onto the upload itself.
    """
    import shutil  # local import: keeps this block self-contained

    source_path = file if isinstance(file, (str, os.PathLike)) else file.name
    os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist
    file_path = os.path.join(save_dir, os.path.basename(source_path))

    if os.path.isfile(source_path):
        # Skip the copy when the upload already lives at the destination.
        if os.path.abspath(source_path) != os.path.abspath(file_path):
            shutil.copyfile(source_path, file_path)
    else:
        # No file on disk under that name: fall back to the stream contents.
        with open(file_path, "wb") as f:
            f.write(file.read())
    return file_path


def process_pdf(file, option):
    """Handle an uploaded PDF according to the selected document type.

    Args:
        file: The uploaded file (path or file-like object), or None.
        option: Name of the selected document type.

    Returns:
        The parsed extraction result for a supported option, or a
        human-readable message when no file was given, the option is not
        supported, or processing failed.
    """
    if file is None:
        return "Please upload a PDF file."

    # Only one document type is wired up so far; say so instead of
    # silently returning None for anything else.
    if option != "Noc_timesheet_resdiential":
        return "Invalid option selected."

    try:
        file_path = _save_upload(file, "uploaded_files")

        # NOTE(review): "output.jpg" is a fixed path shared by every
        # request; concurrent uploads would overwrite each other's image.
        Noc_timeSheet_pdf_to_img(file_path, "output.jpg")
        return get_image_informations("output.jpg", Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser)
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        return f"An error occurred: {e}"
170
 
dataSchema.py CHANGED
@@ -1,5 +1,6 @@
1
  from pydantic import BaseModel, Field
2
  from typing import Optional,List
 
3
 
4
  class Noc_Residential_TimeSheetInformation(BaseModel):
5
  """Details of a timesheet entry."""
@@ -88,6 +89,7 @@ class Noc_PurchaseOrderInformation(BaseModel):
88
  your_reference: Optional[str] = Field(None, description="under Your reference title.")
89
  incoterms: Optional[str] = Field(None, description="Incoterms applicable to the order.")
90
  total_value_of_order: str = Field(..., description="Total value of the purchase order.")
 
91
  signature_released_by: str = Field(None, description="Name of the person who released the purchase order.")
92
  signature_date: Optional[str] = Field(None, description="Date the order was signed.")
93
 
@@ -155,6 +157,7 @@ Extract the following details from the provided purchase order document:
155
  - Your Reference: Reference specified under the "Your Reference" section (if present).
156
  - Incoterms: Any applicable incoterms mentioned in the document (e.g., FOB, CIF).
157
  - Total Value of the Order: The total monetary value of the purchase order (include currency).
 
158
  - Signature Released By: The name of the person who authorized or released the purchase order.
159
  - Signature Date: The date when the order was signed (format: DD/MM/YYYY).
160
  """
@@ -199,3 +202,12 @@ extract from the document:
199
 
200
  invoice_clauses_page_prompt = """
201
  extract from the document the clauses """
 
 
 
 
 
 
 
 
 
 
1
  from pydantic import BaseModel, Field
2
  from typing import Optional,List
3
+ from langchain_core.output_parsers import JsonOutputParser
4
 
5
  class Noc_Residential_TimeSheetInformation(BaseModel):
6
  """Details of a timesheet entry."""
 
89
  your_reference: Optional[str] = Field(None, description="under Your reference title.")
90
  incoterms: Optional[str] = Field(None, description="Incoterms applicable to the order.")
91
  total_value_of_order: str = Field(..., description="Total value of the purchase order.")
92
+ signed: bool = Field(..., description="Whether the document has been signed or not.")
93
  signature_released_by: str = Field(None, description="Name of the person who released the purchase order.")
94
  signature_date: Optional[str] = Field(None, description="Date the order was signed.")
95
 
 
157
  - Your Reference: Reference specified under the "Your Reference" section (if present).
158
  - Incoterms: Any applicable incoterms mentioned in the document (e.g., FOB, CIF).
159
  - Total Value of the Order: The total monetary value of the purchase order (include currency).
160
+ - signed: Whether the document has been signed or not.
161
  - Signature Released By: The name of the person who authorized or released the purchase order.
162
  - Signature Date: The date when the order was signed (format: DD/MM/YYYY).
163
  """
 
202
 
203
  invoice_clauses_page_prompt = """
204
  extract from the document the clauses """
205
+
206
# Output parsers, one per document / page type.
# Each JsonOutputParser is bound to the pydantic model describing the fields
# expected for that kind of page. Callers pick the parser matching the page
# they send to the model; the parser's format instructions are added to the
# request and the same parser decodes the model's reply.
# Time sheets (residential and rotational variants):
Noc_Res_timeSheet_parser = JsonOutputParser(pydantic_object=Noc_Residential_TimeSheetInformation)
Noc_Rot_timeSheet_parser = JsonOutputParser(pydantic_object=Noc_Rotational_TimeSheetInformation)
# Purchase orders: general information page, first item page, further item
# pages, total page and clauses page.
Noc_PurchaseOrder_information_parser = JsonOutputParser(pydantic_object=Noc_PurchaseOrderInformation)
Noc_PurchaseOrder_item1_parser = JsonOutputParser(pydantic_object=Noc_Document_Information)
Noc_PurchaseOrder_items_parser = JsonOutputParser(pydantic_object=Noc_items)
Noc_PurchaseOrder_total_parser = JsonOutputParser(pydantic_object=Noc_total)
Noc_PurchaseOrder_clauses_parser = JsonOutputParser(pydantic_object=Noc_Clauses)
functions.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains import TransformChain
2
+ from langchain_core.messages import HumanMessage
3
+ from langchain_openai import ChatOpenAI
4
+ from langchain import globals
5
+ from langchain_core.runnables import chain
6
+ import base64
7
+ from typing import Dict,List,Union
8
+
9
+
10
+
11
def load_image(inputs: dict) -> dict:
    """Read the file at ``inputs["image_path"]`` and return it base64-encoded.

    Returns:
        ``{"image": <base64 text of the file's bytes>}``.
    """
    with open(inputs["image_path"], "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    return {"image": encoded}
20
+
21
# Chain step wrapping load_image: takes the "image_path" input and produces
# an "image" output holding the base64-encoded file contents.
load_image_chain = TransformChain(
    input_variables=["image_path"],
    output_variables=["image"],
    transform=load_image
)
26
+
27
+
28
@chain
def image_model(inputs: dict) -> Union[str, List[str], dict]:
    """Send the prompt, the parser's format instructions and the image to the model.

    Reads ``inputs["prompt"]``, ``inputs["parser"]`` and ``inputs["image"]``
    (base64 text embedded as a JPEG data URL) and returns the content of
    the model's reply.
    """
    llm = ChatOpenAI(temperature=0.1, model="gpt-4o", max_tokens=1024)
    output_parser = inputs["parser"]

    # One human message with three parts: task prompt, expected output
    # format, and the image itself.
    message_parts = [
        {"type": "text", "text": inputs["prompt"]},
        {"type": "text", "text": output_parser.get_format_instructions()},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"}},
    ]
    reply = llm.invoke([HumanMessage(content=message_parts)])
    return reply.content
42
+
43
def get_image_informations(image_path: str, prompt, parser) -> dict:
    """Run the image at ``image_path`` through the vision pipeline.

    The pipeline loads and encodes the image, queries the model with
    ``prompt`` plus the parser's format instructions, and decodes the reply
    with ``parser``.

    Returns:
        Whatever ``parser`` produces from the model's reply.
    """
    pipeline = load_image_chain | image_model | parser
    request = {
        'image_path': f'{image_path}',
        'prompt': prompt,
        'parser': parser,
    }
    return pipeline.invoke(request)