texmetrics-regex-checks-gradio-1-devtesting

Sleeping

App Files Files Community

samyak152002 committed on 11 days ago

Commit

36623de

verified ·

1 Parent(s): f9e77fb

Update language_checker.py

Browse files

Files changed (1) hide show

language_checker.py +99 -20

language_checker.py CHANGED Viewed

@@ -3,15 +3,78 @@ import re
 import traceback
 from typing import List, Dict, Any
 import language_tool_python
 from text_utils import convert_markdown_to_plain_text
 # config.py (setting JAVA_HOME) should be imported early in app.py
 def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
     """
     Performs LanguageTool checks on plain text derived from font-filtered Markdown.
     Filters issues to only include those between "abstract" and "references/bibliography"
     found within this specific text.
     """
     if not markdown_text_from_filtered_pdf or not markdown_text_from_filtered_pdf.strip():
         print("LT_Checker: Input Markdown text is empty.")
@@ -25,16 +88,33 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
         print("LT_Checker: Plain text derived from Markdown is empty after cleaning.")
         return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}
     text_for_lt_analysis_lower = text_for_lt_analysis.lower()
     abstract_match = re.search(r'\babstract\b', text_for_lt_analysis_lower)
     content_start_index = abstract_match.start() if abstract_match else 0
     if abstract_match:
         print(f"LT_Checker: Found 'abstract' at index {content_start_index} in its text.")
     else:
         print(f"LT_Checker: Did not find 'abstract', LT analysis from index 0 of its text.")
-    # Determine end boundary (references or bibliography)
     references_match = re.search(r'\breferences\b', text_for_lt_analysis_lower)
     bibliography_match = re.search(r'\bbibliography\b', text_for_lt_analysis_lower)
     content_end_index = len(text_for_lt_analysis)
@@ -61,35 +141,30 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
         raw_lt_matches = tool.check(text_for_lt_analysis)
         lt_issues_in_range = 0
         for idx, match in enumerate(raw_lt_matches):
             if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue  # Common rule to ignore
             if not (content_start_index <= match.offset < content_end_index):
                 continue
             lt_issues_in_range += 1
-            # Text of the error itself
-            error_text_verbatim = match.matchedText # The actual text that LanguageTool flagged
-            # New context extraction for ~10 words:
-            words_around = 1  # Number of words to try and get on each side
-            # Text before the error
             pre_error_text = text_for_lt_analysis[:match.offset]
             words_before = pre_error_text.split()[-words_around:]
-            # Text after the error
             post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
             words_after = post_error_text.split()[:words_around]
-            # Combine to form the new wider context
             context_parts = []
-            if words_before:
-                context_parts.append(" ".join(words_before))
-            context_parts.append(error_text_verbatim)  # The actual error phrase
-            if words_after:
-                context_parts.append(" ".join(words_after))
             wider_context_str = " ".join(context_parts)
             processed_lt_issues.append({
@@ -97,7 +172,7 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
                 'ruleId': match.ruleId,
                 'message': match.message,
                 'context_text': wider_context_str,
-                'error_text_verbatim': error_text_verbatim, # Store the verbatim error text
                 'offset_in_text': match.offset,
                 'error_length': match.errorLength,
                 'replacements_suggestion': match.replacements[:3] if match.replacements else [],
@@ -107,7 +182,11 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
                 'pdf_coordinates_list': [],
                 'mapped_page_number': -1
             })
-        print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range of its text.")
         return {
             "total_issues": len(processed_lt_issues),

 import traceback
 from typing import List, Dict, Any
 import language_tool_python
+import logging # For more persistent error messages
 from text_utils import convert_markdown_to_plain_text
 # config.py (setting JAVA_HOME) should be imported early in app.py
+# Import SpanMarkerModel
+try:
+    from span_marker import SpanMarkerModel
+    SPAN_MARKER_AVAILABLE = True
+except ImportError:
+    SPAN_MARKER_AVAILABLE = False
+    SpanMarkerModel = None # Placeholder if not available
+    print("LT_Checker: Warning: span_marker library not found. Acronym filtering will be disabled.")
+    print("LT_Checker: Please install it via 'pip install span_marker'")
+# --- Global SpanMarker Model for Acronyms ---
+_span_marker_model_acronyms = None
+_span_marker_model_loaded_successfully = False
+_span_marker_model_load_attempted = False
+SPAN_MARKER_ACRONYM_MODEL_NAME = "tomaarsen/span-marker-bert-base-uncased-acronyms"
def _load_span_marker_model_if_needed():
    """
    Load the SpanMarker acronym model into module-level state, at most once.

    Does nothing when the span_marker import was unavailable or when a load
    has already been attempted. The "attempted" flag is set before the load
    starts, so a failed load is not retried by later calls.

    On success the model is stored in ``_span_marker_model_acronyms`` and
    ``_span_marker_model_loaded_successfully`` becomes True. On any exception
    the success flag is set to False, the problem is printed and logged with
    its traceback, and the function returns normally.
    """
    global _span_marker_model_acronyms, _span_marker_model_loaded_successfully, _span_marker_model_load_attempted

    # Guard clauses: no library, or a previous attempt already happened.
    if not SPAN_MARKER_AVAILABLE:
        return
    if _span_marker_model_load_attempted:
        return

    # Record the attempt up front so failures are not retried.
    _span_marker_model_load_attempted = True

    try:
        print(f"LT_Checker: Attempting to load SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}' for acronym detection...")
        # from_pretrained needs a working backend for SpanMarkerModel (e.g. torch).
        freshly_loaded_model = SpanMarkerModel.from_pretrained(SPAN_MARKER_ACRONYM_MODEL_NAME)
        _span_marker_model_acronyms = freshly_loaded_model
        _span_marker_model_loaded_successfully = True
        print(f"LT_Checker: SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}' loaded successfully.")
    except Exception as e:
        # Best-effort feature: report the failure and carry on without the model.
        _span_marker_model_loaded_successfully = False
        print(f"LT_Checker: CRITICAL ERROR loading SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}': {e}")
        print(f"LT_Checker: Acronym filtering will be disabled. Please check your installation and model availability.")
        logging.error(f"Failed to load SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}': {e}", exc_info=True)
+# Attempt to load the model when the module is first imported.
+# This might slightly delay the initial import if the model is large.
+_load_span_marker_model_if_needed()
def _is_text_acronym_related(text_to_check: str, acronym_entities: List[Dict[str, Any]]) -> bool:
    """
    Report whether text flagged by LanguageTool mentions a detected acronym.

    Matching is case-insensitive and requires the acronym to appear as a
    whole token: it must not be directly preceded or followed by a word
    character. A plain substring test would let a short acronym such as
    "US" match inside "because" and wrongly suppress an unrelated issue.

    Args:
        text_to_check: The text LanguageTool flagged.
        acronym_entities: Entity dicts; the acronym text (short or long form)
            is read from each dict's 'span' key. Entities whose span is
            missing, blank, or not a string are ignored.

    Returns:
        True if any usable span occurs in ``text_to_check`` as a whole token,
        otherwise False (including when either argument is empty).
    """
    import re  # local import keeps this helper self-contained

    if not acronym_entities or not text_to_check:
        return False

    text_to_check_lower = text_to_check.lower()
    for entity in acronym_entities:
        acronym_span = entity.get('span', '')
        if not isinstance(acronym_span, str):
            continue
        needle = acronym_span.strip().lower()
        if not needle:
            # A blank span would otherwise match any text containing a space.
            continue
        # Lookarounds instead of \b so spans ending in punctuation
        # (e.g. "C++") are still anchored correctly.
        pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
        if re.search(pattern, text_to_check_lower):
            return True
    return False
 def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
     """
     Performs LanguageTool checks on plain text derived from font-filtered Markdown.
     Filters issues to only include those between "abstract" and "references/bibliography"
     found within this specific text.
+    Also filters out issues related to acronyms identified by SpanMarker.
     """
     if not markdown_text_from_filtered_pdf or not markdown_text_from_filtered_pdf.strip():
         print("LT_Checker: Input Markdown text is empty.")
         print("LT_Checker: Plain text derived from Markdown is empty after cleaning.")
         return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}
+    # --- Acronym Detection using SpanMarker ---
+    acronym_entities = []
+    if _span_marker_model_loaded_successfully and _span_marker_model_acronyms:
+        try:
+            # print(f"LT_Checker: Running SpanMarker on text of length {len(text_for_lt_analysis)} for acronyms.")
+            acronym_entities = _span_marker_model_acronyms.predict(text_for_lt_analysis)
+            # if acronym_entities:
+            #     print(f"LT_Checker: SpanMarker found {len(acronym_entities)} acronym entities. Examples: {[e['span'] for e in acronym_entities[:3]]}")
+        except Exception as sm_e:
+            print(f"LT_Checker: Error during SpanMarker prediction: {sm_e}")
+            logging.warning(f"SpanMarker prediction failed: {sm_e}", exc_info=True)
+            # Proceed without acronym filtering if prediction fails
+            acronym_entities = []
+    elif SPAN_MARKER_AVAILABLE and not _span_marker_model_loaded_successfully:
+        print("LT_Checker: SpanMarker model was available but not loaded successfully. Acronym filtering disabled for this run.")
     text_for_lt_analysis_lower = text_for_lt_analysis.lower()
     abstract_match = re.search(r'\babstract\b', text_for_lt_analysis_lower)
     content_start_index = abstract_match.start() if abstract_match else 0
+    # ... (rest of abstract/references boundary logic as before) ...
     if abstract_match:
         print(f"LT_Checker: Found 'abstract' at index {content_start_index} in its text.")
     else:
         print(f"LT_Checker: Did not find 'abstract', LT analysis from index 0 of its text.")
     references_match = re.search(r'\breferences\b', text_for_lt_analysis_lower)
     bibliography_match = re.search(r'\bbibliography\b', text_for_lt_analysis_lower)
     content_end_index = len(text_for_lt_analysis)
         raw_lt_matches = tool.check(text_for_lt_analysis)
         lt_issues_in_range = 0
+        filtered_acronym_issues = 0
         for idx, match in enumerate(raw_lt_matches):
             if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue  # Common rule to ignore
+            # --- Acronym Filtering Step ---
+            if acronym_entities and _is_text_acronym_related(match.matchedText, acronym_entities):
+                filtered_acronym_issues += 1
+                continue # Skip this LanguageTool match as it's related to a detected acronym
             if not (content_start_index <= match.offset < content_end_index):
                 continue
             lt_issues_in_range += 1
+            error_text_verbatim = match.matchedText
+            words_around = 1
             pre_error_text = text_for_lt_analysis[:match.offset]
             words_before = pre_error_text.split()[-words_around:]
             post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
             words_after = post_error_text.split()[:words_around]
             context_parts = []
+            if words_before: context_parts.append(" ".join(words_before))
+            context_parts.append(error_text_verbatim)
+            if words_after: context_parts.append(" ".join(words_after))
             wider_context_str = " ".join(context_parts)
             processed_lt_issues.append({
                 'ruleId': match.ruleId,
                 'message': match.message,
                 'context_text': wider_context_str,
+                'error_text_verbatim': error_text_verbatim,
                 'offset_in_text': match.offset,
                 'error_length': match.errorLength,
                 'replacements_suggestion': match.replacements[:3] if match.replacements else [],
                 'pdf_coordinates_list': [],
                 'mapped_page_number': -1
             })
+        print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues.")
+        if acronym_entities:
+            print(f"LT_Checker: Filtered out {filtered_acronym_issues} LT issues due to acronym detection.")
+        print(f"LT_Checker: {lt_issues_in_range} LT issues within defined content range (after acronym filtering).")
         return {
             "total_issues": len(processed_lt_issues),