samyak152002 committed on
Commit
fee8cba
·
verified ·
1 Parent(s): 3770ab0

Update pdf_processing.py (#2)

Browse files

- Update pdf_processing.py (ce5b56b59e072225616a5b6064c35896ff4cb0a9)

Files changed (1) hide show
  1. pdf_processing.py +103 -25
pdf_processing.py CHANGED
@@ -3,7 +3,7 @@ import fitz # PyMuPDF
3
  import pymupdf4llm
4
  import os
5
  import traceback
6
- from typing import Any, Dict, List # Use standard List, Dict
7
  from collections import Counter
8
 
9
  def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
@@ -16,18 +16,112 @@ def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
16
  "width": rect.width, "height": rect.height
17
  }
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def try_map_issues_to_page_rects(
20
  issues_to_map_for_context: List[Dict[str, Any]],
21
- pdf_rects: List[fitz.Rect],
22
- page_number_for_mapping: int
 
23
  ) -> int:
24
  mapped_count = 0
25
- limit = min(len(issues_to_map_for_context), len(pdf_rects))
 
 
 
 
26
  for i in range(limit):
27
  issue_to_update = issues_to_map_for_context[i]
28
- if issue_to_update['is_mapped_to_pdf']: continue
29
- pdf_rect = pdf_rects[i]
30
- coord_dict = convert_rect_to_dict(pdf_rect)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  if coord_dict:
32
  issue_to_update['pdf_coordinates_list'] = [coord_dict]
33
  issue_to_update['is_mapped_to_pdf'] = True
@@ -36,23 +130,7 @@ def try_map_issues_to_page_rects(
36
  return mapped_count
37
 
38
 
39
- import fitz # PyMuPDF
40
- import os
41
- import traceback
42
- from typing import Any, Dict, List
43
- from collections import Counter
44
-
45
-
46
- # Assuming your helper functions (convert_rect_to_dict, etc.) are present if needed elsewhere.
47
-
48
- import fitz # PyMuPDF
49
- import os
50
- import traceback
51
- from typing import Any, Dict, List # Use standard List, Dict
52
- from collections import Counter
53
-
54
-
55
- # Assuming your other helper functions (convert_rect_to_dict, etc.) are in the same scope if needed by other parts of your code.
56
 
57
  def extract_majority_font_text_directly(pdf_path: str) -> str:
58
  """
@@ -171,7 +249,7 @@ def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
171
  try:
172
  doc_orig_text = fitz.open(pdf_path)
173
  full_text_parts = [page.get_text("text") for page in doc_orig_text]
174
- print(full_text_parts)
175
  return "".join(full_text_parts)
176
  except Exception as e:
177
  print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
 
3
  import pymupdf4llm
4
  import os
5
  import traceback
6
+ from typing import Any, Dict, List, Optional # Use standard List, Dict, Optional
7
  from collections import Counter
8
 
9
  def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
 
16
  "width": rect.width, "height": rect.height
17
  }
18
 
19
def _get_specific_error_rect_in_context(
    page: fitz.Page,
    context_rect: fitz.Rect,
    error_text_verbatim: str
) -> Optional[fitz.Rect]:
    """
    Locate the bounding box of ``error_text_verbatim`` inside ``context_rect``.

    The words PyMuPDF reports within ``context_rect`` (in sorted reading
    order) are scanned for the first run of consecutive words that equals the
    whitespace-separated tokens of ``error_text_verbatim``. Comparison is done
    on stripped, lower-cased text; punctuation attached to a word is NOT
    removed, so ``"word,"`` does not match ``"word"``.

    Args:
        page: The page whose words are searched.
        context_rect: Clip rectangle restricting which words are considered.
        error_text_verbatim: The exact error text to locate.

    Returns:
        The union of the matched words' rectangles, or ``None`` when the text
        is empty/whitespace, no run of words matches, or the resulting union
        rectangle is empty.
    """
    if not error_text_verbatim or error_text_verbatim.isspace():
        print("Debug: _get_specific_error_rect_in_context: error_text_verbatim is empty or whitespace.")
        return None

    # Each entry is (x0, y0, x1, y1, "text", block_no, line_no, word_no).
    clipped_words = page.get_text("words", clip=context_rect, sort=True)

    # Normalize the search tokens once instead of on every comparison.
    wanted_tokens = [
        token.strip().lower()
        for token in error_text_verbatim.strip().split()
    ]
    if not wanted_tokens:
        print(f"Debug: _get_specific_error_rect_in_context: No tokens from error_text_verbatim '{error_text_verbatim}'.")
        return None

    span = len(wanted_tokens)
    # A negative upper bound (fewer words than tokens) yields an empty range.
    for start in range(len(clipped_words) - span + 1):
        window = clipped_words[start:start + span]
        mismatch = any(
            word[4].strip().lower() != token
            for word, token in zip(window, wanted_tokens)
        )
        if mismatch:
            continue

        # Only the first full match is used: merge its word boxes into one.
        merged_bbox = fitz.Rect()  # starts empty; include_rect grows it
        for word in window:
            merged_bbox.include_rect(fitz.Rect(word[:4]))
        return None if merged_bbox.is_empty else merged_bbox

    return None
85
+
86
+
87
  def try_map_issues_to_page_rects(
88
  issues_to_map_for_context: List[Dict[str, Any]],
89
+ pdf_rects_from_search: List[fitz.Rect], # Rects for occurrences of the wider context string
90
+ page_number_for_mapping: int,
91
+ page: fitz.Page # The current PyMuPDF page object
92
  ) -> int:
93
  mapped_count = 0
94
+ # We assume that the number of issues for a given context string on a page
95
+ # should not exceed the number of times that context string appears.
96
+ # If it does, we only map up to the number of found context occurrences.
97
+ limit = min(len(issues_to_map_for_context), len(pdf_rects_from_search))
98
+
99
  for i in range(limit):
100
  issue_to_update = issues_to_map_for_context[i]
101
+ if issue_to_update['is_mapped_to_pdf']:
102
+ continue
103
+
104
+ # This is the rectangle for the i-th occurrence of the wider context string
105
+ context_occurrence_rect = pdf_rects_from_search[i]
106
+
107
+ final_rect_for_issue = context_occurrence_rect # Default to the whole context rect
108
+
109
+ # For LanguageTool issues, try to refine the rect to the specific error text
110
+ if issue_to_update.get('source_check_type') == 'LanguageTool':
111
+ error_text_verbatim = issue_to_update.get('error_text_verbatim')
112
+ if error_text_verbatim:
113
+ # print(f"Debug: Refining LT issue: '{error_text_verbatim}' within context rect {context_occurrence_rect}")
114
+ specific_error_rect = _get_specific_error_rect_in_context(
115
+ page, context_occurrence_rect, error_text_verbatim
116
+ )
117
+ if specific_error_rect:
118
+ final_rect_for_issue = specific_error_rect
119
+ # print(f"Debug: Refined rect to: {final_rect_for_issue}")
120
+ else:
121
+ # print(f"Debug: Could not refine rect, using context rect: {context_occurrence_rect}")
122
+ pass # Stick with the wider context_occurrence_rect if specific not found
123
+
124
+ coord_dict = convert_rect_to_dict(final_rect_for_issue)
125
  if coord_dict:
126
  issue_to_update['pdf_coordinates_list'] = [coord_dict]
127
  issue_to_update['is_mapped_to_pdf'] = True
 
130
  return mapped_count
131
 
132
 
133
+ # ... (rest of pdf_processing.py, including extract_majority_font_text_directly and extract_plain_text_from_original_pdf) ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  def extract_majority_font_text_directly(pdf_path: str) -> str:
136
  """
 
249
  try:
250
  doc_orig_text = fitz.open(pdf_path)
251
  full_text_parts = [page.get_text("text") for page in doc_orig_text]
252
+ # print(full_text_parts) # This was the user's debug print, can be noisy
253
  return "".join(full_text_parts)
254
  except Exception as e:
255
  print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")