texmetrics-regex-checks-gradio-1-devtesting

Sleeping

App Files Files Community

samyak152002 committed on May 17

Commit

070b77e

verified ·

1 Parent(s): fab5be2

Update main_analyzer.py

Browse files

Files changed (1) hide show

main_analyzer.py +120 -92

main_analyzer.py CHANGED Viewed

@@ -4,127 +4,155 @@ import os
 import tempfile
 import re
 import traceback
-from typing import Tuple, Dict, Any, List, Optional
 from collections import defaultdict
-# Import functions from our refactored modules
-from pdf_processing import extract_pdf_text, try_map_issues_to_page_rects # convert_rect_to_dict is used by try_map_issues
-from text_utils import convert_markdown_to_plain_text
 from content_analysis import (
     check_metadata, check_disclosures, check_figures_and_tables,
-    check_references_summary, check_structure, check_language_issues_and_regex,
     check_figure_order, check_reference_order
 )
 def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
     doc_for_mapping = None
-    temp_fitz_file_path = None
     try:
-        markdown_text = extract_pdf_text(filepath_or_stream)
-        if not markdown_text:
-            return {"error": "Failed to extract text (Markdown) from PDF."}, None
-        plain_text_for_general_checks = convert_markdown_to_plain_text(markdown_text)
-        cleaned_plain_text_for_regex = re.sub(r'\s+', ' ', plain_text_for_general_checks.replace('\n', ' ')).strip()
-        language_and_regex_issue_report = check_language_issues_and_regex(markdown_text)
-        if "error" in language_and_regex_issue_report:
-            return {"error": f"Language/Regex check error: {language_and_regex_issue_report['error']}"}, None
-        detailed_issues_for_mapping = language_and_regex_issue_report.get("issues_list", [])
-        if detailed_issues_for_mapping:
-            if isinstance(filepath_or_stream, str):
-                pdf_path_for_fitz = filepath_or_stream
-            elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
-                filepath_or_stream.seek(0)
-                temp_fitz_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
-                temp_fitz_file_path = temp_fitz_file.name
-                temp_fitz_file.write(filepath_or_stream.read())
-                temp_fitz_file.close()
-                pdf_path_for_fitz = temp_fitz_file_path
-            else:
-                return {"error": "Invalid PDF input for coordinate mapping."}, None
             try:
-                doc_for_mapping = fitz.open(pdf_path_for_fitz)
                 if doc_for_mapping.page_count > 0:
-                    print(f"\n--- Mapping {len(detailed_issues_for_mapping)} Issues (filtered) to PDF Coordinates ---")
-                    if detailed_issues_for_mapping:
-                        for page_idx in range(doc_for_mapping.page_count):
-                            page = doc_for_mapping[page_idx]
-                            current_page_num_1_based = page_idx + 1
-                            unmapped_issues_on_this_page_by_context = defaultdict(list)
-                            for issue_dict in detailed_issues_for_mapping:
-                                if not issue_dict['is_mapped_to_pdf']:
-                                    unmapped_issues_on_this_page_by_context[issue_dict['context_text']].append(issue_dict)
-                            if not unmapped_issues_on_this_page_by_context:
-                                if all(iss['is_mapped_to_pdf'] for iss in detailed_issues_for_mapping): break
-                                continue
-                            for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
-                                if not ctx_str.strip(): continue
-                                try:
-                                    pdf_rects = page.search_for(ctx_str, flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
-                                    if pdf_rects:
-                                        try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, current_page_num_1_based)
-                                except Exception as search_exc:
-                                    print(f"Warning: Error searching for context '{ctx_str[:30]}' on page {current_page_num_1_based}: {search_exc}")
-                        total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
-                        print(f"Finished coordinate mapping. Mapped issues: {total_mapped}/{len(detailed_issues_for_mapping)}.")
-                    else:
-                        print("No language/regex issues found within the defined content boundaries to map.")
             except Exception as e_map:
-                print(f"Error during PDF coordinate mapping: {e_map}")
-                traceback.print_exc()
             finally:
                 if doc_for_mapping: doc_for_mapping.close()
-                if temp_fitz_file_path and os.path.exists(temp_fitz_file_path):
-                    os.unlink(temp_fitz_file_path)
         final_formatted_issues_list = []
         for issue_data in detailed_issues_for_mapping:
-            page_num_for_json = 0
-            coords_for_json = []
-            if issue_data['is_mapped_to_pdf'] and issue_data['pdf_coordinates_list']:
-                coord_dict = issue_data['pdf_coordinates_list'][0]
-                coords_for_json = [coord_dict['x0'], coord_dict['y0'], coord_dict['x1'], coord_dict['y1']]
-                page_num_for_json = issue_data['mapped_page_number']
             final_formatted_issues_list.append({
-                "message": issue_data['message'], "context": issue_data['context_text'],
-                "suggestions": issue_data['replacements_suggestion'], "category": issue_data['category_name'],
-                "rule_id": issue_data['ruleId'], "offset": issue_data['offset_in_text'],
-                "length": issue_data['error_length'], "coordinates": coords_for_json,
-                "page": page_num_for_json
             })
         results = {
             "issues": final_formatted_issues_list,
-            "document_checks": {
-                "metadata": check_metadata(cleaned_plain_text_for_regex),
-                "disclosures": check_disclosures(cleaned_plain_text_for_regex),
-                "figures_and_tables": check_figures_and_tables(cleaned_plain_text_for_regex),
-                "references_summary": check_references_summary(cleaned_plain_text_for_regex),
-                "structure": check_structure(cleaned_plain_text_for_regex),
-                "figure_order_analysis": check_figure_order(cleaned_plain_text_for_regex),
-                "reference_order_analysis": check_reference_order(cleaned_plain_text_for_regex),
-                "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_plain_text_for_regex, re.IGNORECASE)),
-                "readability_issues_detected": False,
-            }
         }
         return results, None
     except Exception as e:
-        print(f"Overall analysis error in analyze_pdf: {e}")
-        traceback.print_exc()
-        if doc_for_mapping: doc_for_mapping.close()
-        if temp_fitz_file_path and os.path.exists(temp_fitz_file_path):
-            os.unlink(temp_fitz_file_path)
-        return {"error": str(e)}, None

 import tempfile
 import re
 import traceback
+from typing import Tuple, Dict, Any, List
 from collections import defaultdict
+from pdf_processing import (
+    extract_font_filtered_markdown,
+    extract_plain_text_from_original_pdf,
+    try_map_issues_to_page_rects
+)
 from content_analysis import (
     check_metadata, check_disclosures, check_figures_and_tables,
+    check_references_summary, check_structure,
     check_figure_order, check_reference_order
 )
+from language_checker import perform_language_checks
+from regex_checker import perform_regex_checks
+# text_utils.convert_markdown_to_plain_text is used by language_checker
+# config.py is imported in app.py
 def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
+    original_pdf_access_path = None
+    temp_file_for_stream_path = None
     doc_for_mapping = None
     try:
+        if isinstance(filepath_or_stream, str):
+            original_pdf_access_path = filepath_or_stream
+        elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
+            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
+                temp_file_for_stream_path = temp_file_obj.name
+                filepath_or_stream.seek(0)
+                temp_file_obj.write(filepath_or_stream.read())
+            original_pdf_access_path = temp_file_for_stream_path
+            print(f"Analyzer: Original PDF stream saved to temp file: {original_pdf_access_path}")
+        else:
+            return {"error": "Invalid PDF input type. Must be path or file-like object."}, None
+        if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
+             return {"error": f"PDF path '{original_pdf_access_path}' does not exist or is invalid."}, None
+        # 1. Unfiltered Plain Text (for general and regex checks)
+        print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
+        raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
+        pdf_size = os.path.getsize(original_pdf_access_path)
+        if not raw_unfiltered_plain_text and pdf_size > 0 :
+             print("Analyzer: Warning: Raw unfiltered plain text extraction yielded empty result. PDF might be image-based or have extraction issues.")
+        cleaned_unfiltered_plain_text = re.sub(r'\s+', ' ', raw_unfiltered_plain_text.replace('\n', ' ')).strip()
+        # 2. Font-Filtered Markdown (for LanguageTool checks)
+        print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
+        markdown_text_from_filtered_pdf = extract_font_filtered_markdown(original_pdf_access_path)
+        if not markdown_text_from_filtered_pdf and pdf_size > 0 :
+            print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")
+        # 3. Perform all checks
+        document_check_results = {
+            "metadata": check_metadata(cleaned_unfiltered_plain_text),
+            "disclosures": check_disclosures(cleaned_unfiltered_plain_text),
+            "figures_and_tables": check_figures_and_tables(cleaned_unfiltered_plain_text),
+            "references_summary": check_references_summary(cleaned_unfiltered_plain_text),
+            "structure": check_structure(cleaned_unfiltered_plain_text),
+            "figure_order_analysis": check_figure_order(cleaned_unfiltered_plain_text),
+            "reference_order_analysis": check_reference_order(cleaned_unfiltered_plain_text),
+            "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_unfiltered_plain_text, re.IGNORECASE)),
+            "readability_issues_detected": False,
+        }
+        print("Analyzer: Performing regex checks...")
+        regex_report = perform_regex_checks(cleaned_unfiltered_plain_text)
+        if "error" in regex_report: print(f"Analyzer: Error in regex checks: {regex_report['error']}")
+        regex_issues = regex_report.get("issues_list", [])
+        print("Analyzer: Performing language checks...")
+        lt_report = perform_language_checks(markdown_text_from_filtered_pdf)
+        if "error" in lt_report: print(f"Analyzer: Error in LanguageTool checks: {lt_report['error']}")
+        lt_issues = lt_report.get("issues_list", [])
+        detailed_issues_for_mapping = regex_issues + lt_issues
+        # 4. Coordinate Mapping (against the original PDF)
+        if detailed_issues_for_mapping:
             try:
+                doc_for_mapping = fitz.open(original_pdf_access_path)
                 if doc_for_mapping.page_count > 0:
+                    print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
+                    for page_idx in range(doc_for_mapping.page_count):
+                        page = doc_for_mapping[page_idx]
+                        current_page_num_1_based = page_idx + 1
+                        unmapped_issues_on_this_page_by_context = defaultdict(list)
+                        for issue_dict in detailed_issues_for_mapping:
+                            if not issue_dict['is_mapped_to_pdf']:
+                                unmapped_issues_on_this_page_by_context[issue_dict['context_text']].append(issue_dict)
+                        if not unmapped_issues_on_this_page_by_context:
+                            if all(iss['is_mapped_to_pdf'] for iss in detailed_issues_for_mapping): break
+                            continue
+                        for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
+                            if not ctx_str or not ctx_str.strip(): continue
+                            try:
+                                pdf_rects = page.search_for(ctx_str, flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
+                                if pdf_rects:
+                                    try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, current_page_num_1_based)
+                            except Exception as search_exc:
+                                print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
+                    total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
+                    print(f"Analyzer: Finished coordinate mapping. Mapped issues: {total_mapped}/{len(detailed_issues_for_mapping)}.")
             except Exception as e_map:
+                print(f"Analyzer: Error during PDF coordinate mapping: {e_map}\n{traceback.format_exc()}")
             finally:
                 if doc_for_mapping: doc_for_mapping.close()
+        else:
+            print("Analyzer: No detailed issues from regex or language checks to map.")
+        # 5. Format final list of issues
         final_formatted_issues_list = []
         for issue_data in detailed_issues_for_mapping:
+            coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
+            coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
+            # Filter out None coordinates that might arise from empty coords dict
+            coords_for_json = [c for c in coords_for_json if c is not None]
             final_formatted_issues_list.append({
+                "message": issue_data.get('message', 'N/A'),
+                "context": issue_data.get('context_text', 'N/A'),
+                "suggestions": issue_data.get('replacements_suggestion', []),
+                "category": issue_data.get('category_name', 'Unknown'),
+                "rule_id": issue_data.get('ruleId', 'N/A'),
+                "offset": issue_data.get('offset_in_text', -1),
+                "length": issue_data.get('error_length', 0),
+                "coordinates": coords_for_json if len(coords_for_json) == 4 else [], # Ensure 4 coords or empty
+                "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
+                "source_check_type": issue_data.get('source_check_type', 'N/A')
             })
         results = {
             "issues": final_formatted_issues_list,
+            "document_checks": document_check_results
         }
         return results, None
     except Exception as e:
+        print(f"Overall analysis error in analyze_pdf: {e}\n{traceback.format_exc()}")
+        return {"error": f"Overall analysis error: {str(e)}"}, None
+    finally:
+        if temp_file_for_stream_path and os.path.exists(temp_file_for_stream_path):
+            try:
+                os.remove(temp_file_for_stream_path)
+                print(f"Analyzer: Cleaned up main temporary PDF file: {temp_file_for_stream_path}")
+            except Exception as e_clean:
+                print(f"Analyzer: Error cleaning up main temporary PDF file {temp_file_for_stream_path}: {e_clean}")