Update main_analyzer.py (#1)
Browse files
- Update main_analyzer.py (21e59ca78f93aa1af8acd53cd3e78c131cc7bb50)
- main_analyzer.py +10 -16
main_analyzer.py
CHANGED
@@ -90,7 +90,7 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
|
90 |
print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
|
91 |
# ... (rest of mapping logic as before) ...
|
92 |
for page_idx in range(doc_for_mapping.page_count):
|
93 |
-
page = doc_for_mapping[page_idx]
|
94 |
current_page_num_1_based = page_idx + 1
|
95 |
unmapped_issues_on_this_page_by_context = defaultdict(list)
|
96 |
for issue_dict in detailed_issues_for_mapping:
|
@@ -104,9 +104,14 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
|
104 |
for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
|
105 |
if not ctx_str or not ctx_str.strip(): continue
|
106 |
try:
|
107 |
-
|
108 |
-
if
|
109 |
-
try_map_issues_to_page_rects(
|
|
|
|
|
|
|
|
|
|
|
110 |
except Exception as search_exc:
|
111 |
print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
|
112 |
total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
|
@@ -153,15 +158,4 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
|
153 |
# itself if it received a stream; this isn't happening in the Gradio flow.
|
154 |
if doc_for_mapping: # Ensure the fitz document for mapping is closed
|
155 |
doc_for_mapping.close()
|
156 |
-
print(f"Analyzer: Closed fitz document used for mapping.")
|
157 |
-
|
158 |
-
# The original finally block for temp_file_for_stream_path:
|
159 |
-
# if temp_file_for_stream_path and os.path.exists(temp_file_for_stream_path):
|
160 |
-
# try:
|
161 |
-
# os.remove(temp_file_for_stream_path)
|
162 |
-
# print(f"Analyzer: Cleaned up main temporary PDF file: {temp_file_for_stream_path}")
|
163 |
-
# except Exception as e_clean:
|
164 |
-
# print(f"Analyzer: Error cleaning up main temporary PDF file {temp_file_for_stream_path}: {e_clean}")
|
165 |
-
# This part is removed because temp_file_for_stream_path is never assigned a value
|
166 |
-
# in the current structure of analyze_pdf. If analyze_pdf were to handle streams
|
167 |
-
# by creating its own temp file, then this cleanup would be relevant for that temp file.
|
|
|
90 |
print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
|
91 |
# ... (rest of mapping logic as before) ...
|
92 |
for page_idx in range(doc_for_mapping.page_count):
|
93 |
+
page = doc_for_mapping[page_idx] # Current PyMuPDF page object
|
94 |
current_page_num_1_based = page_idx + 1
|
95 |
unmapped_issues_on_this_page_by_context = defaultdict(list)
|
96 |
for issue_dict in detailed_issues_for_mapping:
|
|
|
104 |
for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
|
105 |
if not ctx_str or not ctx_str.strip(): continue
|
106 |
try:
|
107 |
+
pdf_rects_for_context_occurrences = page.search_for(ctx_str, flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
|
108 |
+
if pdf_rects_for_context_occurrences:
|
109 |
+
try_map_issues_to_page_rects(
|
110 |
+
issues_for_ctx,
|
111 |
+
pdf_rects_for_context_occurrences,
|
112 |
+
current_page_num_1_based,
|
113 |
+
page # Pass the current page object
|
114 |
+
)
|
115 |
except Exception as search_exc:
|
116 |
print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
|
117 |
total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
|
|
|
158 |
# itself if it received a stream; this isn't happening in the Gradio flow.
|
159 |
if doc_for_mapping: # Ensure the fitz document for mapping is closed
|
160 |
doc_for_mapping.close()
|
161 |
+
print(f"Analyzer: Closed fitz document used for mapping.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|