Update pdf_processing.py (#2)
Browse files- Update pdf_processing.py (ce5b56b59e072225616a5b6064c35896ff4cb0a9)
- pdf_processing.py +103 -25
pdf_processing.py
CHANGED
@@ -3,7 +3,7 @@ import fitz # PyMuPDF
|
|
3 |
import pymupdf4llm
|
4 |
import os
|
5 |
import traceback
|
6 |
-
from typing import Any, Dict, List # Use standard List, Dict
|
7 |
from collections import Counter
|
8 |
|
9 |
def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
|
@@ -16,18 +16,112 @@ def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
|
|
16 |
"width": rect.width, "height": rect.height
|
17 |
}
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
def try_map_issues_to_page_rects(
|
20 |
issues_to_map_for_context: List[Dict[str, Any]],
|
21 |
-
|
22 |
-
page_number_for_mapping: int
|
|
|
23 |
) -> int:
|
24 |
mapped_count = 0
|
25 |
-
|
|
|
|
|
|
|
|
|
26 |
for i in range(limit):
|
27 |
issue_to_update = issues_to_map_for_context[i]
|
28 |
-
if issue_to_update['is_mapped_to_pdf']:
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
if coord_dict:
|
32 |
issue_to_update['pdf_coordinates_list'] = [coord_dict]
|
33 |
issue_to_update['is_mapped_to_pdf'] = True
|
@@ -36,23 +130,7 @@ def try_map_issues_to_page_rects(
|
|
36 |
return mapped_count
|
37 |
|
38 |
|
39 |
-
|
40 |
-
import os
|
41 |
-
import traceback
|
42 |
-
from typing import Any, Dict, List
|
43 |
-
from collections import Counter
|
44 |
-
|
45 |
-
|
46 |
-
# Assuming your helper functions (convert_rect_to_dict, etc.) are present if needed elsewhere.
|
47 |
-
|
48 |
-
import fitz # PyMuPDF
|
49 |
-
import os
|
50 |
-
import traceback
|
51 |
-
from typing import Any, Dict, List # Use standard List, Dict
|
52 |
-
from collections import Counter
|
53 |
-
|
54 |
-
|
55 |
-
# Assuming your other helper functions (convert_rect_to_dict, etc.) are in the same scope if needed by other parts of your code.
|
56 |
|
57 |
def extract_majority_font_text_directly(pdf_path: str) -> str:
|
58 |
"""
|
@@ -171,7 +249,7 @@ def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
|
|
171 |
try:
|
172 |
doc_orig_text = fitz.open(pdf_path)
|
173 |
full_text_parts = [page.get_text("text") for page in doc_orig_text]
|
174 |
-
print(full_text_parts)
|
175 |
return "".join(full_text_parts)
|
176 |
except Exception as e:
|
177 |
print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
|
|
|
3 |
import pymupdf4llm
|
4 |
import os
|
5 |
import traceback
|
6 |
+
from typing import Any, Dict, List, Optional # Use standard List, Dict, Optional
|
7 |
from collections import Counter
|
8 |
|
9 |
def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
|
|
|
16 |
"width": rect.width, "height": rect.height
|
17 |
}
|
18 |
|
19 |
+
def _get_specific_error_rect_in_context(
    page: fitz.Page,
    context_rect: fitz.Rect,
    error_text_verbatim: str
) -> Optional[fitz.Rect]:
    """
    Tries to find the precise bounding box of error_text_verbatim within
    the larger context_rect on the given page.

    The error text is split on whitespace and compared, case-insensitively,
    against consecutive words extracted from the clipped page area. An exact
    token-for-token match is preferred; if none exists, a second pass ignores
    punctuation attached to the edges of each word, so that e.g. an error
    text of "teh" can still be located in a PDF word extracted as "teh,".

    Args:
        page: Page object whose words are searched.
        context_rect: Area of the page to which the word extraction is clipped.
        error_text_verbatim: The exact text of the error to locate.

    Returns:
        The union of the rectangles of the first matching run of words, or
        None when the text is empty/whitespace, no run matches, or the
        combined rectangle is empty.
    """
    import string  # Local import: only this helper needs it.

    if not error_text_verbatim or error_text_verbatim.isspace():
        print("Debug: _get_specific_error_rect_in_context: error_text_verbatim is empty or whitespace.")
        return None

    # The guard above guarantees at least one token here.
    error_tokens = error_text_verbatim.split()
    token_count = len(error_tokens)

    # Extract words sorted by position within the given context_rect
    # Each word_data is (x0, y0, x1, y1, "text", block_no, line_no, word_no)
    words_in_clip = page.get_text("words", clip=context_rect, sort=True)

    # Characters ignored at word edges during the tolerant pass: ASCII
    # punctuation plus common typographic quotes, ellipsis and dashes.
    edge_chars = string.punctuation + "\u2018\u2019\u201c\u201d\u2026\u2013\u2014"

    def normalize_exact(text: str) -> str:
        return text.strip().lower()

    def normalize_loose(text: str) -> str:
        exact = normalize_exact(text)
        # A token made only of punctuation keeps its exact form so that it
        # cannot collapse to "" and match arbitrary words.
        return exact.strip(edge_chars) or exact

    def find_first_run(normalize) -> Optional[int]:
        """Return the index of the first run of words equal to the tokens."""
        wanted = [normalize(token) for token in error_tokens]
        candidates = [normalize(word[4]) for word in words_in_clip]
        # range() is empty when there are fewer words than tokens.
        for start in range(len(candidates) - token_count + 1):
            if candidates[start:start + token_count] == wanted:
                return start
        return None

    # Exact matching runs first so every case that matched before still
    # resolves to the same run; the tolerant pass only adds new matches.
    start_index = find_first_run(normalize_exact)
    if start_index is None:
        start_index = find_first_run(normalize_loose)
    if start_index is None:
        return None

    final_error_bbox = fitz.Rect()  # Start with an empty rect
    for word in words_in_clip[start_index:start_index + token_count]:
        final_error_bbox.include_rect(fitz.Rect(word[:4]))  # Expand to include this word

    if final_error_bbox.is_empty:
        return None
    return final_error_bbox
|
85 |
+
|
86 |
+
|
87 |
def try_map_issues_to_page_rects(
|
88 |
issues_to_map_for_context: List[Dict[str, Any]],
|
89 |
+
pdf_rects_from_search: List[fitz.Rect], # Rects for occurrences of the wider context string
|
90 |
+
page_number_for_mapping: int,
|
91 |
+
page: fitz.Page # The current PyMuPDF page object
|
92 |
) -> int:
|
93 |
mapped_count = 0
|
94 |
+
# We assume that the number of issues for a given context string on a page
|
95 |
+
# should not exceed the number of times that context string appears.
|
96 |
+
# If it does, we only map up to the number of found context occurrences.
|
97 |
+
limit = min(len(issues_to_map_for_context), len(pdf_rects_from_search))
|
98 |
+
|
99 |
for i in range(limit):
|
100 |
issue_to_update = issues_to_map_for_context[i]
|
101 |
+
if issue_to_update['is_mapped_to_pdf']:
|
102 |
+
continue
|
103 |
+
|
104 |
+
# This is the rectangle for the i-th occurrence of the wider context string
|
105 |
+
context_occurrence_rect = pdf_rects_from_search[i]
|
106 |
+
|
107 |
+
final_rect_for_issue = context_occurrence_rect # Default to the whole context rect
|
108 |
+
|
109 |
+
# For LanguageTool issues, try to refine the rect to the specific error text
|
110 |
+
if issue_to_update.get('source_check_type') == 'LanguageTool':
|
111 |
+
error_text_verbatim = issue_to_update.get('error_text_verbatim')
|
112 |
+
if error_text_verbatim:
|
113 |
+
# print(f"Debug: Refining LT issue: '{error_text_verbatim}' within context rect {context_occurrence_rect}")
|
114 |
+
specific_error_rect = _get_specific_error_rect_in_context(
|
115 |
+
page, context_occurrence_rect, error_text_verbatim
|
116 |
+
)
|
117 |
+
if specific_error_rect:
|
118 |
+
final_rect_for_issue = specific_error_rect
|
119 |
+
# print(f"Debug: Refined rect to: {final_rect_for_issue}")
|
120 |
+
else:
|
121 |
+
# print(f"Debug: Could not refine rect, using context rect: {context_occurrence_rect}")
|
122 |
+
pass # Stick with the wider context_occurrence_rect if specific not found
|
123 |
+
|
124 |
+
coord_dict = convert_rect_to_dict(final_rect_for_issue)
|
125 |
if coord_dict:
|
126 |
issue_to_update['pdf_coordinates_list'] = [coord_dict]
|
127 |
issue_to_update['is_mapped_to_pdf'] = True
|
|
|
130 |
return mapped_count
|
131 |
|
132 |
|
133 |
+
# ... (rest of pdf_processing.py, including extract_majority_font_text_directly and extract_plain_text_from_original_pdf) ...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
def extract_majority_font_text_directly(pdf_path: str) -> str:
|
136 |
"""
|
|
|
249 |
try:
|
250 |
doc_orig_text = fitz.open(pdf_path)
|
251 |
full_text_parts = [page.get_text("text") for page in doc_orig_text]
|
252 |
+
# print(full_text_parts) # This was the user's debug print, can be noisy
|
253 |
return "".join(full_text_parts)
|
254 |
except Exception as e:
|
255 |
print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
|