Update pdf_processing.py
Browse files- pdf_processing.py +103 -25
pdf_processing.py
CHANGED
|
@@ -3,7 +3,7 @@ import fitz # PyMuPDF
|
|
| 3 |
import pymupdf4llm
|
| 4 |
import os
|
| 5 |
import traceback
|
| 6 |
-
from typing import Any, Dict, List # Use standard List, Dict
|
| 7 |
from collections import Counter
|
| 8 |
|
| 9 |
def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
|
|
@@ -16,18 +16,112 @@ def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
|
|
| 16 |
"width": rect.width, "height": rect.height
|
| 17 |
}
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
def try_map_issues_to_page_rects(
|
| 20 |
issues_to_map_for_context: List[Dict[str, Any]],
|
| 21 |
-
|
| 22 |
-
page_number_for_mapping: int
|
|
|
|
| 23 |
) -> int:
|
| 24 |
mapped_count = 0
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
for i in range(limit):
|
| 27 |
issue_to_update = issues_to_map_for_context[i]
|
| 28 |
-
if issue_to_update['is_mapped_to_pdf']:
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
if coord_dict:
|
| 32 |
issue_to_update['pdf_coordinates_list'] = [coord_dict]
|
| 33 |
issue_to_update['is_mapped_to_pdf'] = True
|
|
@@ -36,23 +130,7 @@ def try_map_issues_to_page_rects(
|
|
| 36 |
return mapped_count
|
| 37 |
|
| 38 |
|
| 39 |
-
|
| 40 |
-
import os
|
| 41 |
-
import traceback
|
| 42 |
-
from typing import Any, Dict, List
|
| 43 |
-
from collections import Counter
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
# Assuming your helper functions (convert_rect_to_dict, etc.) are present if needed elsewhere.
|
| 47 |
-
|
| 48 |
-
import fitz # PyMuPDF
|
| 49 |
-
import os
|
| 50 |
-
import traceback
|
| 51 |
-
from typing import Any, Dict, List # Use standard List, Dict
|
| 52 |
-
from collections import Counter
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
# Assuming your other helper functions (convert_rect_to_dict, etc.) are in the same scope if needed by other parts of your code.
|
| 56 |
|
| 57 |
def extract_majority_font_text_directly(pdf_path: str) -> str:
|
| 58 |
"""
|
|
@@ -171,7 +249,7 @@ def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
|
|
| 171 |
try:
|
| 172 |
doc_orig_text = fitz.open(pdf_path)
|
| 173 |
full_text_parts = [page.get_text("text") for page in doc_orig_text]
|
| 174 |
-
print(full_text_parts)
|
| 175 |
return "".join(full_text_parts)
|
| 176 |
except Exception as e:
|
| 177 |
print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
|
|
|
|
| 3 |
import pymupdf4llm
|
| 4 |
import os
|
| 5 |
import traceback
|
| 6 |
+
from typing import Any, Dict, List, Optional # Use standard List, Dict, Optional
|
| 7 |
from collections import Counter
|
| 8 |
|
| 9 |
def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
|
|
|
|
| 16 |
"width": rect.width, "height": rect.height
|
| 17 |
}
|
| 18 |
|
| 19 |
+
def _strip_edge_punctuation(token: str) -> str:
    """Return ``token`` without leading/trailing non-alphanumeric characters.

    A token made up only of such characters is returned unchanged, so that
    pure punctuation never collapses to an empty string that would compare
    equal to every other pure-punctuation token.
    """
    start, end = 0, len(token)
    while start < end and not token[start].isalnum():
        start += 1
    while end > start and not token[end - 1].isalnum():
        end -= 1
    return token[start:end] or token


def _find_word_sequence(words: List[Any], tokens: List[str], normalize) -> Optional[List[Any]]:
    """Return the first run of consecutive ``words`` whose text matches ``tokens``.

    ``words`` holds PyMuPDF word tuples (text at index 4). Both the word text
    and every token are passed through ``normalize`` before being compared.
    Returns ``None`` when no run matches.
    """
    wanted = [normalize(tok) for tok in tokens]
    span = len(wanted)
    # An empty range (fewer words than tokens) simply yields no match.
    for start in range(len(words) - span + 1):
        window = words[start:start + span]
        if all(normalize(word[4]) == tok for word, tok in zip(window, wanted)):
            return window
    return None


def _get_specific_error_rect_in_context(
    page: fitz.Page,
    context_rect: fitz.Rect,
    error_text_verbatim: str
) -> Optional[fitz.Rect]:
    """
    Tries to find the precise bounding box of error_text_verbatim within
    the larger context_rect on the given page.

    The error text is split on whitespace and matched, token by token,
    against consecutive words extracted from ``context_rect``. Matching is
    case-insensitive. An exact pass runs first; only if it finds nothing is a
    second pass tried that additionally ignores punctuation attached to the
    edges of a word (e.g. a trailing comma or surrounding quotes).

    Args:
        page: Page whose words are searched.
        context_rect: Clip rectangle restricting the word extraction.
        error_text_verbatim: Text whose bounding box is wanted.

    Returns:
        The union of the rectangles of the first matching word sequence, or
        ``None`` when the text is empty/whitespace, no sequence matches, or
        the resulting rectangle is empty.
    """
    # str.split() with no arguments drops all whitespace, so an empty token
    # list covers None, "" and whitespace-only input in a single check.
    error_tokens = error_text_verbatim.split() if error_text_verbatim else []
    if not error_tokens:
        print("Debug: _get_specific_error_rect_in_context: error_text_verbatim is empty or whitespace.")
        return None

    # Each entry is (x0, y0, x1, y1, "text", block_no, line_no, word_no).
    words_on_page_in_clip = page.get_text("words", clip=context_rect, sort=True)

    def exact(token: str) -> str:
        return token.strip().lower()

    def relaxed(token: str) -> str:
        return _strip_edge_punctuation(exact(token))

    matched_words = _find_word_sequence(words_on_page_in_clip, error_tokens, exact)
    if matched_words is None:
        # Fallback: the error text and the extracted PDF words may differ
        # only in attached punctuation; tolerate that rather than giving up.
        matched_words = _find_word_sequence(words_on_page_in_clip, error_tokens, relaxed)
    if matched_words is None:
        return None

    # Grow an initially empty rectangle over every matched word.
    final_error_bbox = fitz.Rect()
    for word in matched_words:
        final_error_bbox.include_rect(fitz.Rect(word[:4]))

    return None if final_error_bbox.is_empty else final_error_bbox
|
| 85 |
+
|
| 86 |
+
|
| 87 |
def try_map_issues_to_page_rects(
|
| 88 |
issues_to_map_for_context: List[Dict[str, Any]],
|
| 89 |
+
pdf_rects_from_search: List[fitz.Rect], # Rects for occurrences of the wider context string
|
| 90 |
+
page_number_for_mapping: int,
|
| 91 |
+
page: fitz.Page # The current PyMuPDF page object
|
| 92 |
) -> int:
|
| 93 |
mapped_count = 0
|
| 94 |
+
# We assume that the number of issues for a given context string on a page
|
| 95 |
+
# should not exceed the number of times that context string appears.
|
| 96 |
+
# If it does, we only map up to the number of found context occurrences.
|
| 97 |
+
limit = min(len(issues_to_map_for_context), len(pdf_rects_from_search))
|
| 98 |
+
|
| 99 |
for i in range(limit):
|
| 100 |
issue_to_update = issues_to_map_for_context[i]
|
| 101 |
+
if issue_to_update['is_mapped_to_pdf']:
|
| 102 |
+
continue
|
| 103 |
+
|
| 104 |
+
# This is the rectangle for the i-th occurrence of the wider context string
|
| 105 |
+
context_occurrence_rect = pdf_rects_from_search[i]
|
| 106 |
+
|
| 107 |
+
final_rect_for_issue = context_occurrence_rect # Default to the whole context rect
|
| 108 |
+
|
| 109 |
+
# For LanguageTool issues, try to refine the rect to the specific error text
|
| 110 |
+
if issue_to_update.get('source_check_type') == 'LanguageTool':
|
| 111 |
+
error_text_verbatim = issue_to_update.get('error_text_verbatim')
|
| 112 |
+
if error_text_verbatim:
|
| 113 |
+
# print(f"Debug: Refining LT issue: '{error_text_verbatim}' within context rect {context_occurrence_rect}")
|
| 114 |
+
specific_error_rect = _get_specific_error_rect_in_context(
|
| 115 |
+
page, context_occurrence_rect, error_text_verbatim
|
| 116 |
+
)
|
| 117 |
+
if specific_error_rect:
|
| 118 |
+
final_rect_for_issue = specific_error_rect
|
| 119 |
+
# print(f"Debug: Refined rect to: {final_rect_for_issue}")
|
| 120 |
+
else:
|
| 121 |
+
# print(f"Debug: Could not refine rect, using context rect: {context_occurrence_rect}")
|
| 122 |
+
pass # Stick with the wider context_occurrence_rect if specific not found
|
| 123 |
+
|
| 124 |
+
coord_dict = convert_rect_to_dict(final_rect_for_issue)
|
| 125 |
if coord_dict:
|
| 126 |
issue_to_update['pdf_coordinates_list'] = [coord_dict]
|
| 127 |
issue_to_update['is_mapped_to_pdf'] = True
|
|
|
|
| 130 |
return mapped_count
|
| 131 |
|
| 132 |
|
| 133 |
+
# ... (rest of pdf_processing.py, including extract_majority_font_text_directly and extract_plain_text_from_original_pdf) ...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
def extract_majority_font_text_directly(pdf_path: str) -> str:
|
| 136 |
"""
|
|
|
|
| 249 |
try:
|
| 250 |
doc_orig_text = fitz.open(pdf_path)
|
| 251 |
full_text_parts = [page.get_text("text") for page in doc_orig_text]
|
| 252 |
+
# print(full_text_parts) # This was the user's debug print, can be noisy
|
| 253 |
return "".join(full_text_parts)
|
| 254 |
except Exception as e:
|
| 255 |
print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
|