texmetrics-regex-checks-gradio-1-devtesting

Sleeping

App Files Files Community

samyak152002 committed on May 17

Commit

0918bea

verified ·

1 Parent(s): 808a964

Update pdf_processing.py

Browse files

Files changed (1) hide show

pdf_processing.py +54 -99

pdf_processing.py CHANGED Viewed

@@ -2,182 +2,137 @@
 import fitz  # PyMuPDF
 import pymupdf4llm
 import os
-import tempfile
 import traceback
-from typing import Tuple, Optional, List, Dict, Any
 from collections import Counter
-def convert_rect_to_dict(rect: fitz.Rect) -> Optional[Dict[str, float]]:
     """Converts a fitz.Rect object to a dictionary."""
     if not rect or not isinstance(rect, fitz.Rect):
-        print(f"Warning: Invalid rect object received: {rect}")
         return None
     return {
-        "x0": rect.x0,
-        "y0": rect.y0,
-        "x1": rect.x1,
-        "y1": rect.y1,
-        "width": rect.width,
-        "height": rect.height
     }
 def try_map_issues_to_page_rects(
     issues_to_map_for_context: List[Dict[str, Any]],
     pdf_rects: List[fitz.Rect],
-    page_number_for_mapping: int # 1-based page number
 ) -> int:
-    """Helper function for mapping LT issues to PDF rectangles."""
     mapped_count = 0
-    num_issues_to_try = len(issues_to_map_for_context)
-    num_available_rects = len(pdf_rects)
-    limit = min(num_issues_to_try, num_available_rects)
     for i in range(limit):
         issue_to_update = issues_to_map_for_context[i]
-        if issue_to_update['is_mapped_to_pdf']: # Check the correct flag name
-            continue
         pdf_rect = pdf_rects[i]
         coord_dict = convert_rect_to_dict(pdf_rect)
         if coord_dict:
-            issue_to_update['pdf_coordinates_list'] = [coord_dict] # Store as list of dicts
             issue_to_update['is_mapped_to_pdf'] = True
             issue_to_update['mapped_page_number'] = page_number_for_mapping
             mapped_count += 1
-        else:
-            print(f"      Warning: Could not convert rect for context '{issue_to_update['context_text'][:30]}...' on page {page_number_for_mapping}")
     return mapped_count
-# The function is modified as requested.
-def extract_pdf_text(file_input: Any) -> str:
     """
-    Extracts text from a PDF, filters it to include only the majority font,
-    and then converts this filtered text to Markdown using PyMuPDF4LLM.
-    The "majority font" is defined by the combination of font name and
-    (rounded) font size that accounts for the most characters in the document.
     """
-    input_temp_file_path = None  # For when file_input is a stream
-    actual_path_to_process = None
     original_doc = None
-    new_doc = None # The new document we will build
     try:
-        # 1. Handle Input to get actual_path_to_process
-        if isinstance(file_input, str):
-            actual_path_to_process = file_input
-        elif hasattr(file_input, 'read') and callable(file_input.read):
-            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
-                input_temp_file_path = temp_file_obj.name
-                file_input.seek(0) # Ensure reading from the beginning of the stream
-                temp_file_obj.write(file_input.read())
-            actual_path_to_process = input_temp_file_path
-        else:
-            raise ValueError("Input 'file_input' must be a file path (str) or a file-like object.")
-        original_doc = fitz.open(actual_path_to_process)
         if not original_doc.page_count:
-            print("PDF has no pages.")
-            if input_temp_file_path and os.path.exists(input_temp_file_path):
-                 os.remove(input_temp_file_path) # clean up if we exit early
             return ""
-        # 2. Collect Font Data & Text from all Spans
-        all_spans_details: List[Dict[str, Any]] = [] # Explicitly type for clarity
         font_char_counts: Counter = Counter()
-        print(f"Original PDF ('{os.path.basename(actual_path_to_process if isinstance(actual_path_to_process, str) else 'stream')}') has {original_doc.page_count} pages. Analyzing fonts...")
         for page_num in range(original_doc.page_count):
             page = original_doc[page_num]
             text_dict = page.get_text("dict")
             for block in text_dict.get("blocks", []):
-                if block.get("type") == 0: # Process only text blocks (type 0)
                     for line in block.get("lines", []):
                         for span in line.get("spans", []):
                             font_name = span["font"]
                             font_size_rounded = int(round(span["size"]))
                             text = span["text"]
                             span_detail = {
-                                "text": text,
-                                "font_name": font_name,
                                 "font_size_rounded": font_size_rounded,
                                 "original_font_size": span["size"],
-                                "bbox": span["bbox"],
-                                "page_num": page_num
                             }
                             all_spans_details.append(span_detail)
                             font_char_counts[(font_name, font_size_rounded)] += len(text)
         if not font_char_counts:
-            print("No text with font information found in PDF.")
-            # Cleanup and return if no text info
-            if original_doc: original_doc.close()
-            if input_temp_file_path and os.path.exists(input_temp_file_path):
-                os.remove(input_temp_file_path)
             return ""
-        # 3. Determine Majority Font
         majority_font_tuple_info = font_char_counts.most_common(1)[0]
         (majority_font_name, majority_font_size_rounded) = majority_font_tuple_info[0]
         char_count = majority_font_tuple_info[1]
-        print(f"Majority font combination: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt (with {char_count} characters).")
-        # 4. Create a New PDF Document with Only the Majority Font Text
         new_doc = fitz.Document()
-        print("Constructing new PDF with text from majority font only...")
         for p_num in range(original_doc.page_count):
             original_page_for_dim = original_doc[p_num]
             new_pdf_page = new_doc.new_page(width=original_page_for_dim.rect.width,
                                             height=original_page_for_dim.rect.height)
             spans_to_write = [
                 s_detail for s_detail in all_spans_details
                 if s_detail["page_num"] == p_num and \
                    s_detail["font_name"] == majority_font_name and \
                    s_detail["font_size_rounded"] == majority_font_size_rounded
             ]
             for span_data in spans_to_write:
                 text_to_insert = span_data["text"]
                 original_bbox = fitz.Rect(span_data["bbox"])
                 font_size_for_render = span_data["original_font_size"]
-                insertion_result = new_pdf_page.insert_textbox(
-                    original_bbox,
-                    text_to_insert,
-                    fontsize=font_size_for_render,
-                    fontname="helv", # Using Helvetica for simplicity
-                    align=0
-                )
-                if insertion_result < 0:
-                    print(f"Warning: Textbox insertion for '{text_to_insert[:30].replace(chr(10), ' ')}...' in rect {original_bbox} on new page {p_num} might have issues (code: {insertion_result}).")
-        print(f"New PDF constructed with {new_doc.page_count} pages.")
-        # 5. Convert the In-Memory Filtered PDF Document to Markdown
         if new_doc.page_count > 0:
-            print(f"Converting filtered PDF Document object to Markdown using pymupdf4llm...")
             markdown_text = pymupdf4llm.to_markdown(new_doc)
         else:
-            print("The new PDF document (filtered) is empty. No markdown will be generated.")
-            markdown_text = ""
-        print(f"Total Markdown text length from filtered PDF: {len(markdown_text)} characters.")
         return markdown_text
     except Exception as e:
-        print(f"Error in extract_pdf_text: {str(e)}")
-        traceback.print_exc()
         return ""
     finally:
-        if original_doc:
-            original_doc.close()
-        if new_doc:
-            new_doc.close()
-        if input_temp_file_path and os.path.exists(input_temp_file_path):
-            try:
-                os.remove(input_temp_file_path)
-                print(f"Cleaned up temporary input file: {input_temp_file_path}")
-            except Exception as e_clean:
-                print(f"Error cleaning up temporary input file {input_temp_file_path}: {e_clean}")

 import fitz  # PyMuPDF
 import pymupdf4llm
 import os
 import traceback
+from typing import Any, Dict, List # Use standard List, Dict
 from collections import Counter
def convert_rect_to_dict(rect: fitz.Rect) -> "Dict[str, float] | None":
    """Converts a fitz.Rect object to a dictionary.

    Returns a dict with the keys "x0", "y0", "x1", "y1", "width" and
    "height" read from the rectangle, or None when `rect` is falsy or is
    not a fitz.Rect instance.
    """
    # The return annotation is quoted on purpose: an unquoted
    # `Dict[str, float] | None` is evaluated when the function is defined
    # and raises TypeError on Python versions older than 3.10.
    if not rect or not isinstance(rect, fitz.Rect):
        return None
    return {
        "x0": rect.x0,
        "y0": rect.y0,
        "x1": rect.x1,
        "y1": rect.y1,
        "width": rect.width,
        "height": rect.height,
    }
 def try_map_issues_to_page_rects(
     issues_to_map_for_context: List[Dict[str, Any]],
     pdf_rects: List[fitz.Rect],
+    page_number_for_mapping: int
 ) -> int:
     mapped_count = 0
+    limit = min(len(issues_to_map_for_context), len(pdf_rects))
     for i in range(limit):
         issue_to_update = issues_to_map_for_context[i]
+        if issue_to_update['is_mapped_to_pdf']: continue
         pdf_rect = pdf_rects[i]
         coord_dict = convert_rect_to_dict(pdf_rect)
         if coord_dict:
+            issue_to_update['pdf_coordinates_list'] = [coord_dict]
             issue_to_update['is_mapped_to_pdf'] = True
             issue_to_update['mapped_page_number'] = page_number_for_mapping
             mapped_count += 1
     return mapped_count
def extract_font_filtered_markdown(pdf_path: str) -> str:
    """
    Extracts text from PDF at pdf_path, filters by majority font,
    builds a new PDF in memory, and converts it to Markdown using PyMuPDF4LLM.

    The "majority font" is the (font name, rounded font size) combination
    that accounts for the most characters across all text spans. Every span
    with that combination is re-inserted, in Helvetica at its original size,
    on a blank page of the same dimensions as its source page; all other
    spans are dropped.

    Expects pdf_path to be a valid path to a PDF file.

    Returns:
        The Markdown produced from the filtered document, or "" when the PDF
        has no pages, no text spans are found, or any exception occurs (the
        exception and its traceback are printed, not re-raised).
    """
    original_doc = None
    new_doc = None
    try:
        original_doc = fitz.open(pdf_path)
        if not original_doc.page_count:
            print("FontFilter: PDF has no pages.")
            return ""

        # Spans are grouped by (font name, rounded size) and then by page
        # index while they are collected, so the majority font's spans can be
        # fetched per page later without rescanning every span for every page.
        spans_by_font: Dict[Any, Dict[int, List[Dict[str, Any]]]] = {}
        font_char_counts: Counter = Counter()
        pdf_basename = os.path.basename(pdf_path)
        print(f"FontFilter: Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")

        for page_num in range(original_doc.page_count):
            text_dict = original_doc[page_num].get_text("dict")
            for block in text_dict.get("blocks", []):
                if block.get("type") != 0:  # only text blocks carry spans
                    continue
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        text = span["text"]
                        font_key = (span["font"], int(round(span["size"])))
                        font_char_counts[font_key] += len(text)
                        spans_by_font.setdefault(font_key, {}).setdefault(page_num, []).append({
                            "text": text,
                            "original_font_size": span["size"],
                            "bbox": span["bbox"],
                            # Baseline start of the span; used only as a
                            # fallback insertion point below.
                            "origin": span.get("origin"),
                        })

        if not font_char_counts:
            print("FontFilter: No text with font information found in PDF.")
            return ""

        (majority_font_name, majority_font_size_rounded), char_count = font_char_counts.most_common(1)[0]
        print(f"FontFilter: Majority font: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count} chars).")

        majority_spans_by_page = spans_by_font[(majority_font_name, majority_font_size_rounded)]
        new_doc = fitz.Document()
        fallback_count = 0
        for p_num in range(original_doc.page_count):
            page_rect = original_doc[p_num].rect
            new_pdf_page = new_doc.new_page(width=page_rect.width, height=page_rect.height)
            for span_data in majority_spans_by_page.get(p_num, []):
                text_to_insert = span_data["text"]
                original_bbox = fitz.Rect(span_data["bbox"])
                font_size_for_render = span_data["original_font_size"]
                insertion_result = new_pdf_page.insert_textbox(
                    original_bbox, text_to_insert, fontsize=font_size_for_render,
                    fontname="helv", align=0
                )
                if insertion_result < 0:
                    # A negative result means the text did not fit the box and
                    # nothing was written. Helvetica's metrics differ from the
                    # source font's, so this can happen for ordinary spans.
                    # Place the text at its baseline origin instead of
                    # silently losing it from the Markdown output.
                    insertion_point = span_data["origin"] or (original_bbox.x0, original_bbox.y1)
                    new_pdf_page.insert_text(
                        insertion_point, text_to_insert,
                        fontsize=font_size_for_render, fontname="helv"
                    )
                    fallback_count += 1

        if fallback_count:
            print(f"FontFilter: {fallback_count} span(s) did not fit their original box and were placed at their baseline origin instead.")

        markdown_text = ""
        if new_doc.page_count > 0:
            markdown_text = pymupdf4llm.to_markdown(new_doc)
        else:
            print("FontFilter: The new PDF (filtered) is empty. No markdown generated.")
        return markdown_text
    except Exception as e:
        # Best-effort contract: callers get "" on any failure, with the cause
        # printed for diagnosis.
        print(f"Error in extract_font_filtered_markdown for '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        if original_doc:
            original_doc.close()
        if new_doc:
            new_doc.close()
def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
    """
    Extracts raw plain text from the PDF at pdf_path without any filtering.

    The text of each page is read with get_text("text") and the pages are
    concatenated in document order with no separator added between them.

    Expects pdf_path to be a valid path to a PDF file.

    Returns:
        The concatenated page text, or "" if any exception occurs (the
        exception and its traceback are printed, not re-raised).
    """
    try:
        # The context manager closes the document on both the success and
        # the error path.
        with fitz.open(pdf_path) as doc_orig_text:
            return "".join(page.get_text("text") for page in doc_orig_text)
    except Exception as e:
        print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""