samyak152002 committed on
Commit
f9e77fb
·
verified ·
1 Parent(s): fee8cba

Update main_analyzer.py (#1)

Browse files

- Update main_analyzer.py (21e59ca78f93aa1af8acd53cd3e78c131cc7bb50)

Files changed (1) hide show
  1. main_analyzer.py +10 -16
main_analyzer.py CHANGED
@@ -90,7 +90,7 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
90
  print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
91
  # ... (rest of mapping logic as before) ...
92
  for page_idx in range(doc_for_mapping.page_count):
93
- page = doc_for_mapping[page_idx]
94
  current_page_num_1_based = page_idx + 1
95
  unmapped_issues_on_this_page_by_context = defaultdict(list)
96
  for issue_dict in detailed_issues_for_mapping:
@@ -104,9 +104,14 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
104
  for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
105
  if not ctx_str or not ctx_str.strip(): continue
106
  try:
107
- pdf_rects = page.search_for(ctx_str, flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
108
- if pdf_rects:
109
- try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, current_page_num_1_based)
 
 
 
 
 
110
  except Exception as search_exc:
111
  print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
112
  total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
@@ -153,15 +158,4 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
153
  # itself if it received a stream; this isn't happening in the Gradio flow.
154
  if doc_for_mapping: # Ensure the fitz document for mapping is closed
155
  doc_for_mapping.close()
156
- print(f"Analyzer: Closed fitz document used for mapping.")
157
-
158
- # The original finally block for temp_file_for_stream_path:
159
- # if temp_file_for_stream_path and os.path.exists(temp_file_for_stream_path):
160
- # try:
161
- # os.remove(temp_file_for_stream_path)
162
- # print(f"Analyzer: Cleaned up main temporary PDF file: {temp_file_for_stream_path}")
163
- # except Exception as e_clean:
164
- # print(f"Analyzer: Error cleaning up main temporary PDF file {temp_file_for_stream_path}: {e_clean}")
165
- # This part is removed because temp_file_for_stream_path is never assigned a value
166
- # in the current structure of analyze_pdf. If analyze_pdf were to handle streams
167
- # by creating its own temp file, then this cleanup would be relevant for that temp file.
 
90
  print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
91
  # ... (rest of mapping logic as before) ...
92
  for page_idx in range(doc_for_mapping.page_count):
93
+ page = doc_for_mapping[page_idx] # Current PyMuPDF page object
94
  current_page_num_1_based = page_idx + 1
95
  unmapped_issues_on_this_page_by_context = defaultdict(list)
96
  for issue_dict in detailed_issues_for_mapping:
 
104
  for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
105
  if not ctx_str or not ctx_str.strip(): continue
106
  try:
107
+ pdf_rects_for_context_occurrences = page.search_for(ctx_str, flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
108
+ if pdf_rects_for_context_occurrences:
109
+ try_map_issues_to_page_rects(
110
+ issues_for_ctx,
111
+ pdf_rects_for_context_occurrences,
112
+ current_page_num_1_based,
113
+ page # Pass the current page object
114
+ )
115
  except Exception as search_exc:
116
  print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
117
  total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
 
158
  # itself if it received a stream; this isn't happening in the Gradio flow.
159
  if doc_for_mapping: # Ensure the fitz document for mapping is closed
160
  doc_for_mapping.close()
161
+ print(f"Analyzer: Closed fitz document used for mapping.")