Update language_checker.py
language_checker.py  CHANGED  (+5 -13)
@@ -68,8 +68,8 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
                 continue
             lt_issues_in_range += 1

-            #
-
+            # Text of the error itself
+            error_text_verbatim = match.matchedText  # The actual text that LanguageTool flagged

             # New context extraction for ~10 words:
             words_around = 1  # Number of words to try and get on each side
@@ -78,9 +78,6 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
             pre_error_text = text_for_lt_analysis[:match.offset]
             words_before = pre_error_text.split()[-words_around:]

-            # Text of the error itself
-            error_text = text_for_lt_analysis[match.offset: match.offset + match.errorLength]
-
             # Text after the error
             post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
             words_after = post_error_text.split()[:words_around]
@@ -89,23 +86,18 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
             context_parts = []
             if words_before:
                 context_parts.append(" ".join(words_before))
-            context_parts.append(error_text)
+            context_parts.append(error_text_verbatim)  # The actual error phrase
             if words_after:
                 context_parts.append(" ".join(words_after))

             wider_context_str = " ".join(context_parts)
-            # Ensure there's a small buffer around the error to make it ~10 words total if error is short
-            # This can be refined further based on average word length or by counting words more precisely.
-            # A simpler approach using character offsets could also be used, e.g.:
-            # context_start_char = max(0, match.offset - 50)  # Approx 50 chars before
-            # context_end_char = min(len(text_for_lt_analysis), match.offset + match.errorLength + 50)  # Approx 50 chars after
-            # wider_context_str = text_for_lt_analysis[context_start_char:context_end_char]

             processed_lt_issues.append({
                 '_internal_id': f"lt_{idx}",
                 'ruleId': match.ruleId,
                 'message': match.message,
-                'context_text': wider_context_str,
+                'context_text': wider_context_str,
+                'error_text_verbatim': error_text_verbatim,  # Store the verbatim error text
                 'offset_in_text': match.offset,
                 'error_length': match.errorLength,
                 'replacements_suggestion': match.replacements[:3] if match.replacements else [],
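Taken together, the hunks replace the old offset-sliced error_text with LanguageTool's verbatim matchedText and thread it through both the context string and the stored issue record. Below is a minimal, self-contained sketch of the resulting extraction logic; the Match stand-in dataclass and the extract_issue helper are illustrative names invented here, not code from this repository, and they model only the attributes the diff actually reads.

```python
from dataclasses import dataclass, field
from typing import Any, Dict, List

@dataclass
class Match:
    """Stand-in for a LanguageTool match object (assumption: only the
    attributes the diff reads are modeled here)."""
    ruleId: str
    message: str
    matchedText: str
    offset: int
    errorLength: int
    replacements: List[str] = field(default_factory=list)

def extract_issue(text: str, match: Match, idx: int, words_around: int = 1) -> Dict[str, Any]:
    # Verbatim error text straight from the match, instead of re-slicing
    # `text` with offset arithmetic -- the key change in this commit.
    error_text_verbatim = match.matchedText

    # Up to `words_around` whole words on each side of the error.
    words_before = text[:match.offset].split()[-words_around:]
    words_after = text[match.offset + match.errorLength:].split()[:words_around]

    context_parts = []
    if words_before:
        context_parts.append(" ".join(words_before))
    context_parts.append(error_text_verbatim)  # the actual error phrase
    if words_after:
        context_parts.append(" ".join(words_after))

    return {
        '_internal_id': f"lt_{idx}",
        'ruleId': match.ruleId,
        'message': match.message,
        'context_text': " ".join(context_parts),
        'error_text_verbatim': error_text_verbatim,
        'offset_in_text': match.offset,
        'error_length': match.errorLength,
        'replacements_suggestion': match.replacements[:3] if match.replacements else [],
    }

if __name__ == "__main__":
    text = "This are a short example sentence."
    m = Match(ruleId="THIS_ARE", message="Possible agreement error.",
              matchedText="This are", offset=0, errorLength=8,
              replacements=["This is"])
    print(extract_issue(text, m, idx=0)['context_text'])  # -> "This are a"
```

The design point is that the verbatim error text now comes from the match object itself rather than from re-slicing the analyzed string, so the stored value cannot drift from what LanguageTool actually flagged.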
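For completeness, a sketch of where such match objects would come from. That the project uses the language_tool_python package is an assumption here, inferred from the attribute names in the diff (ruleId, errorLength, replacements); whether its Match exposes matchedText depends on the installed version, so treat this as orientation rather than a drop-in snippet.

```python
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')  # starts/attaches to a local LT server
matches = tool.check("This are a short example sentence.")
for idx, match in enumerate(matches):
    # The same fields the diff reads from each match
    print(idx, match.ruleId, match.offset, match.errorLength, match.replacements[:3])
tool.close()
```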