texmetrics-regex-checks-gradio-1-devtesting

Sleeping

App Files Files Community

samyak152002 committed 11 days ago

Commit

fee8cba

verified ·

1 Parent(s): 3770ab0

Update pdf_processing.py (#2)

Browse files

- Update pdf_processing.py (ce5b56b59e072225616a5b6064c35896ff4cb0a9)

Files changed (1) hide show

pdf_processing.py +103 -25

pdf_processing.py CHANGED Viewed

@@ -3,7 +3,7 @@ import fitz  # PyMuPDF
 import pymupdf4llm
 import os
 import traceback
-from typing import Any, Dict, List # Use standard List, Dict
 from collections import Counter
 def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
@@ -16,18 +16,112 @@ def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
         "width": rect.width, "height": rect.height
     }
 def try_map_issues_to_page_rects(
     issues_to_map_for_context: List[Dict[str, Any]],
-    pdf_rects: List[fitz.Rect],
-    page_number_for_mapping: int
 ) -> int:
     mapped_count = 0
-    limit = min(len(issues_to_map_for_context), len(pdf_rects))
     for i in range(limit):
         issue_to_update = issues_to_map_for_context[i]
-        if issue_to_update['is_mapped_to_pdf']: continue
-        pdf_rect = pdf_rects[i]
-        coord_dict = convert_rect_to_dict(pdf_rect)
         if coord_dict:
             issue_to_update['pdf_coordinates_list'] = [coord_dict]
             issue_to_update['is_mapped_to_pdf'] = True
@@ -36,23 +130,7 @@ def try_map_issues_to_page_rects(
     return mapped_count
-import fitz  # PyMuPDF
-import os
-import traceback
-from typing import Any, Dict, List
-from collections import Counter
-# Assuming your helper functions (convert_rect_to_dict, etc.) are present if needed elsewhere.
-import fitz  # PyMuPDF
-import os
-import traceback
-from typing import Any, Dict, List  # Use standard List, Dict
-from collections import Counter
-# Assuming your other helper functions (convert_rect_to_dict, etc.) are in the same scope if needed by other parts of your code.
 def extract_majority_font_text_directly(pdf_path: str) -> str:
     """
@@ -171,7 +249,7 @@ def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
     try:
         doc_orig_text = fitz.open(pdf_path)
         full_text_parts = [page.get_text("text") for page in doc_orig_text]
-        print(full_text_parts)
         return "".join(full_text_parts)
     except Exception as e:
         print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")

 import pymupdf4llm
 import os
 import traceback
+from typing import Any, Dict, List, Optional # Use standard List, Dict, Optional
 from collections import Counter
 def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
         "width": rect.width, "height": rect.height
     }
+def _get_specific_error_rect_in_context(
+    page: fitz.Page,
+    context_rect: fitz.Rect,
+    error_text_verbatim: str
+) -> Optional[fitz.Rect]:
+    """
+    Tries to find the precise bounding box of error_text_verbatim within
+    the larger context_rect on the given page.
+    """
+    if not error_text_verbatim or error_text_verbatim.isspace():  # nothing searchable: bail out before querying the page
+        print(f"Debug: _get_specific_error_rect_in_context: error_text_verbatim is empty or whitespace.")
+        return None
+    # Fetch only the words that fall inside context_rect (clip=), sorted by position (sort=True).
+    # Each word_data is (x0, y0, x1, y1, "text", block_no, line_no, word_no)
+    words_on_page_in_clip = page.get_text("words", clip=context_rect, sort=True)
+    # print(f"Debug: _get_specific_error_rect_in_context: Searching for '{error_text_verbatim}' in {len(words_on_page_in_clip)} words within clip {context_rect}")
+    error_tokens = error_text_verbatim.strip().split()  # whitespace-delimited tokens, matched one-to-one against PDF words
+    if not error_tokens:
+        print(f"Debug: _get_specific_error_rect_in_context: No tokens from error_text_verbatim '{error_text_verbatim}'.")
+        return None
+    found_rects_for_error_sequence = []  # stays empty unless a full token-sequence match is found
+    for i in range(len(words_on_page_in_clip) - len(error_tokens) + 1):  # sliding window; the range is empty when there are fewer words than tokens
+        match = True
+        current_sequence_rects = []  # word rects collected for the window starting at index i
+        # print(f"Debug: _get_specific_error_rect_in_context: Trying match starting at PDF word '{words_on_page_in_clip[i][4]}'")
+        for j in range(len(error_tokens)):
+            pdf_word_text = words_on_page_in_clip[i+j][4]  # index 4 of a word tuple is its text
+            error_token_to_match = error_tokens[j]
+            # Basic normalization for comparison: trim whitespace and lowercase both sides,
+            # so the match is case-insensitive but otherwise exact (punctuation included).
+            pdf_word_normalized = pdf_word_text.strip().lower()
+            error_token_normalized = error_token_to_match.strip().lower()
+            # A more robust comparison might involve removing common punctuation
+            # or handling hyphenation if LanguageTool splits differently than PyMuPDF.
+            if error_token_normalized != pdf_word_normalized:
+                # print(f"Debug: _get_specific_error_rect_in_context: Mismatch: '{error_token_normalized}' (expected) vs '{pdf_word_normalized}' (pdf word)")
+                match = False
+                break  # abandon this window at the first mismatching token
+            current_sequence_rects.append(fitz.Rect(words_on_page_in_clip[i+j][:4]))  # first four fields are x0, y0, x1, y1
+        if match:
+            # print(f"Debug: _get_specific_error_rect_in_context: Found match for '{error_text_verbatim}'")
+            found_rects_for_error_sequence = current_sequence_rects
+            break # Only the first full match of error_text_verbatim is used; later occurrences in the clip are ignored
+    if found_rects_for_error_sequence:
+        final_error_bbox = fitz.Rect() # Start with an empty rect
+        for r_part in found_rects_for_error_sequence:
+            final_error_bbox.include_rect(r_part) # Grow the box so it also covers this word's rect
+        if not final_error_bbox.is_empty:
+            # print(f"Debug: _get_specific_error_rect_in_context: Combined bbox: {final_error_bbox}")
+            return final_error_bbox
+        else:
+            # print(f"Debug: _get_specific_error_rect_in_context: Combined bbox was empty.")
+            pass
+    else:
+        # print(f"Debug: _get_specific_error_rect_in_context: No match found for '{error_text_verbatim}'.")
+        pass
+    return None  # reached when no window matched or the combined box was empty
 def try_map_issues_to_page_rects(
     issues_to_map_for_context: List[Dict[str, Any]],
+    pdf_rects_from_search: List[fitz.Rect], # Rects for occurrences of the wider context string
+    page_number_for_mapping: int,
+    page: fitz.Page # The current PyMuPDF page object
 ) -> int:
     mapped_count = 0
+    # We assume that the number of issues for a given context string on a page
+    # should not exceed the number of times that context string appears.
+    # If it does, we only map up to the number of found context occurrences.
+    limit = min(len(issues_to_map_for_context), len(pdf_rects_from_search))
     for i in range(limit):
         issue_to_update = issues_to_map_for_context[i]
+        if issue_to_update['is_mapped_to_pdf']:
+            continue
+        # This is the rectangle for the i-th occurrence of the wider context string
+        context_occurrence_rect = pdf_rects_from_search[i]
+        final_rect_for_issue = context_occurrence_rect # Default to the whole context rect
+        # For LanguageTool issues, try to refine the rect to the specific error text
+        if issue_to_update.get('source_check_type') == 'LanguageTool':
+            error_text_verbatim = issue_to_update.get('error_text_verbatim')
+            if error_text_verbatim:
+                # print(f"Debug: Refining LT issue: '{error_text_verbatim}' within context rect {context_occurrence_rect}")
+                specific_error_rect = _get_specific_error_rect_in_context(
+                    page, context_occurrence_rect, error_text_verbatim
+                )
+                if specific_error_rect:
+                    final_rect_for_issue = specific_error_rect
+                    # print(f"Debug: Refined rect to: {final_rect_for_issue}")
+                else:
+                    # print(f"Debug: Could not refine rect, using context rect: {context_occurrence_rect}")
+                    pass # Stick with the wider context_occurrence_rect if specific not found
+        coord_dict = convert_rect_to_dict(final_rect_for_issue)
         if coord_dict:
             issue_to_update['pdf_coordinates_list'] = [coord_dict]
             issue_to_update['is_mapped_to_pdf'] = True
     return mapped_count
+# ... (rest of pdf_processing.py, including extract_majority_font_text_directly and extract_plain_text_from_original_pdf) ...
 def extract_majority_font_text_directly(pdf_path: str) -> str:
     """
     try:
         doc_orig_text = fitz.open(pdf_path)
         full_text_parts = [page.get_text("text") for page in doc_orig_text]
+        # print(full_text_parts) # This was the user's debug print, can be noisy
         return "".join(full_text_parts)
     except Exception as e:
         print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")