Update language_checker.py
language_checker.py  CHANGED  (+5 -13)
@@ -68,8 +68,8 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
                 continue
             lt_issues_in_range += 1

-            #
-
+            # Text of the error itself
+            error_text_verbatim = match.matchedText  # The actual text that LanguageTool flagged

             # New context extraction for ~10 words:
             words_around = 1  # Number of words to try and get on each side
@@ -78,9 +78,6 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
             pre_error_text = text_for_lt_analysis[:match.offset]
             words_before = pre_error_text.split()[-words_around:]

-            # Text of the error itself
-            error_text = text_for_lt_analysis[match.offset: match.offset + match.errorLength]
-
             # Text after the error
             post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
             words_after = post_error_text.split()[:words_around]
@@ -89,23 +86,18 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
             context_parts = []
             if words_before:
                 context_parts.append(" ".join(words_before))
-            context_parts.append(error_text)
+            context_parts.append(error_text_verbatim)  # The actual error phrase
             if words_after:
                 context_parts.append(" ".join(words_after))

             wider_context_str = " ".join(context_parts)
-            # Ensure there's a small buffer around the error to make it ~10 words total if error is short
-            # This can be refined further based on average word length or by counting words more precisely.
-            # A simpler approach using character offsets could also be used, e.g.:
-            # context_start_char = max(0, match.offset - 50)  # Approx 50 chars before
-            # context_end_char = min(len(text_for_lt_analysis), match.offset + match.errorLength + 50)  # Approx 50 chars after
-            # wider_context_str = text_for_lt_analysis[context_start_char:context_end_char]

             processed_lt_issues.append({
                 '_internal_id': f"lt_{idx}",
                 'ruleId': match.ruleId,
                 'message': match.message,
-                'context_text': wider_context_str,
+                'context_text': wider_context_str,
+                'error_text_verbatim': error_text_verbatim,  # Store the verbatim error text
                 'offset_in_text': match.offset,
                 'error_length': match.errorLength,
                 'replacements_suggestion': match.replacements[:3] if match.replacements else [],
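Taken together, the hunks replace the old offset-sliced error_text with LanguageTool's verbatim matchedText and thread it through both the context string and the stored issue record. Below is a minimal, self-contained sketch of the resulting extraction logic; the Match stand-in dataclass and the extract_issue helper are illustrative names invented here, not code from this repository, and they model only the attributes the diff actually reads.

```python
from dataclasses import dataclass, field
from typing import Any, Dict, List

@dataclass
class Match:
    """Stand-in for a LanguageTool match object (assumption: only the
    attributes the diff reads are modeled here)."""
    ruleId: str
    message: str
    matchedText: str
    offset: int
    errorLength: int
    replacements: List[str] = field(default_factory=list)

def extract_issue(text: str, match: Match, idx: int, words_around: int = 1) -> Dict[str, Any]:
    # Verbatim error text straight from the match, instead of re-slicing
    # `text` with offset arithmetic -- the key change in this commit.
    error_text_verbatim = match.matchedText

    # Up to `words_around` whole words on each side of the error.
    words_before = text[:match.offset].split()[-words_around:]
    words_after = text[match.offset + match.errorLength:].split()[:words_around]

    context_parts = []
    if words_before:
        context_parts.append(" ".join(words_before))
    context_parts.append(error_text_verbatim)  # the actual error phrase
    if words_after:
        context_parts.append(" ".join(words_after))

    return {
        '_internal_id': f"lt_{idx}",
        'ruleId': match.ruleId,
        'message': match.message,
        'context_text': " ".join(context_parts),
        'error_text_verbatim': error_text_verbatim,
        'offset_in_text': match.offset,
        'error_length': match.errorLength,
        'replacements_suggestion': match.replacements[:3] if match.replacements else [],
    }

if __name__ == "__main__":
    text = "This are a short example sentence."
    m = Match(ruleId="THIS_ARE", message="Possible agreement error.",
              matchedText="This are", offset=0, errorLength=8,
              replacements=["This is"])
    print(extract_issue(text, m, idx=0)['context_text'])  # -> "This are a"
```

The design point is that the verbatim error text now comes from the match object itself rather than from re-slicing the analyzed string, so the stored value cannot drift from what LanguageTool actually flagged.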
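For completeness, a sketch of where such match objects would come from. That the project uses the language_tool_python package is an assumption here, inferred from the attribute names in the diff (ruleId, errorLength, replacements); whether its Match exposes matchedText depends on the installed version, so treat this as orientation rather than a drop-in snippet.

```python
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')  # starts/attaches to a local LT server
matches = tool.check("This are a short example sentence.")
for idx, match in enumerate(matches):
    # The same fields the diff reads from each match
    print(idx, match.ruleId, match.offset, match.errorLength, match.replacements[:3])
tool.close()
```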