samyak152002 committed · Commit 36623de · verified · 1 Parent(s): f9e77fb

Update language_checker.py

Files changed (1): language_checker.py (+99 -20)
language_checker.py CHANGED
@@ -3,15 +3,78 @@ import re
 import traceback
 from typing import List, Dict, Any
 import language_tool_python
+import logging # For more persistent error messages
 
 from text_utils import convert_markdown_to_plain_text
 # config.py (setting JAVA_HOME) should be imported early in app.py
 
+# Import SpanMarkerModel
+try:
+    from span_marker import SpanMarkerModel
+    SPAN_MARKER_AVAILABLE = True
+except ImportError:
+    SPAN_MARKER_AVAILABLE = False
+    SpanMarkerModel = None # Placeholder if not available
+    print("LT_Checker: Warning: span_marker library not found. Acronym filtering will be disabled.")
+    print("LT_Checker: Please install it via 'pip install span_marker'")
+
+
+# --- Global SpanMarker Model for Acronyms ---
+_span_marker_model_acronyms = None
+_span_marker_model_loaded_successfully = False
+_span_marker_model_load_attempted = False
+
+SPAN_MARKER_ACRONYM_MODEL_NAME = "tomaarsen/span-marker-bert-base-uncased-acronyms"
+
+def _load_span_marker_model_if_needed():
+    global _span_marker_model_acronyms, _span_marker_model_loaded_successfully, _span_marker_model_load_attempted
+
+    if not SPAN_MARKER_AVAILABLE or _span_marker_model_load_attempted:
+        return
+
+    _span_marker_model_load_attempted = True
+    try:
+        print(f"LT_Checker: Attempting to load SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}' for acronym detection...")
+        # Ensure you have torch installed, or the appropriate backend for SpanMarkerModel
+        _span_marker_model_acronyms = SpanMarkerModel.from_pretrained(SPAN_MARKER_ACRONYM_MODEL_NAME)
+        _span_marker_model_loaded_successfully = True
+        print(f"LT_Checker: SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}' loaded successfully.")
+    except Exception as e:
+        _span_marker_model_loaded_successfully = False
+        print(f"LT_Checker: CRITICAL ERROR loading SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}': {e}")
+        print(f"LT_Checker: Acronym filtering will be disabled. Please check your installation and model availability.")
+        logging.error(f"Failed to load SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}': {e}", exc_info=True)
+
+# Attempt to load the model when the module is first imported.
+# This might slightly delay the initial import if the model is large.
+_load_span_marker_model_if_needed()
+
+
+def _is_text_acronym_related(text_to_check: str, acronym_entities: List[Dict[str, Any]]) -> bool:
+    """
+    Checks if the text_to_check contains any of the acronyms (long or short form)
+    identified by the SpanMarker model.
+    """
+    if not acronym_entities or not text_to_check:
+        return False
+
+    text_to_check_lower = text_to_check.lower()
+    for entity in acronym_entities:
+        acronym_span = entity.get('span', '')
+        if acronym_span: # Ensure span is not empty
+            # Check if the identified acronym span is present in the text flagged by LanguageTool
+            if acronym_span.lower() in text_to_check_lower:
+                # print(f"Debug AcronymFilter: Text '{text_to_check}' (from LT) contains detected acronym '{acronym_span}'. Filtering.")
+                return True
+    return False
+
+
 def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
     """
     Performs LanguageTool checks on plain text derived from font-filtered Markdown.
     Filters issues to only include those between "abstract" and "references/bibliography"
     found within this specific text.
+    Also filters out issues related to acronyms identified by SpanMarker.
     """
     if not markdown_text_from_filtered_pdf or not markdown_text_from_filtered_pdf.strip():
         print("LT_Checker: Input Markdown text is empty.")
@@ -25,16 +88,33 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
         print("LT_Checker: Plain text derived from Markdown is empty after cleaning.")
         return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}
 
+    # --- Acronym Detection using SpanMarker ---
+    acronym_entities = []
+    if _span_marker_model_loaded_successfully and _span_marker_model_acronyms:
+        try:
+            # print(f"LT_Checker: Running SpanMarker on text of length {len(text_for_lt_analysis)} for acronyms.")
+            acronym_entities = _span_marker_model_acronyms.predict(text_for_lt_analysis)
+            # if acronym_entities:
+            #     print(f"LT_Checker: SpanMarker found {len(acronym_entities)} acronym entities. Examples: {[e['span'] for e in acronym_entities[:3]]}")
+        except Exception as sm_e:
+            print(f"LT_Checker: Error during SpanMarker prediction: {sm_e}")
+            logging.warning(f"SpanMarker prediction failed: {sm_e}", exc_info=True)
+            # Proceed without acronym filtering if prediction fails
+            acronym_entities = []
+    elif SPAN_MARKER_AVAILABLE and not _span_marker_model_loaded_successfully:
+        print("LT_Checker: SpanMarker model was available but not loaded successfully. Acronym filtering disabled for this run.")
+
+
     text_for_lt_analysis_lower = text_for_lt_analysis.lower()
 
     abstract_match = re.search(r'\babstract\b', text_for_lt_analysis_lower)
     content_start_index = abstract_match.start() if abstract_match else 0
+    # ... (rest of abstract/references boundary logic as before) ...
     if abstract_match:
         print(f"LT_Checker: Found 'abstract' at index {content_start_index} in its text.")
     else:
         print(f"LT_Checker: Did not find 'abstract', LT analysis from index 0 of its text.")
 
-    # Determine end boundary (references or bibliography)
     references_match = re.search(r'\breferences\b', text_for_lt_analysis_lower)
     bibliography_match = re.search(r'\bbibliography\b', text_for_lt_analysis_lower)
     content_end_index = len(text_for_lt_analysis)
@@ -61,35 +141,30 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
     raw_lt_matches = tool.check(text_for_lt_analysis)
 
     lt_issues_in_range = 0
+    filtered_acronym_issues = 0
+
     for idx, match in enumerate(raw_lt_matches):
         if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue # Common rule to ignore
 
+        # --- Acronym Filtering Step ---
+        if acronym_entities and _is_text_acronym_related(match.matchedText, acronym_entities):
+            filtered_acronym_issues += 1
+            continue # Skip this LanguageTool match as it's related to a detected acronym
+
         if not (content_start_index <= match.offset < content_end_index):
             continue
        lt_issues_in_range += 1
 
-        # Text of the error itself
-        error_text_verbatim = match.matchedText # The actual text that LanguageTool flagged
-
-        # New context extraction for ~10 words:
-        words_around = 1 # Number of words to try and get on each side
-
-        # Text before the error
+        error_text_verbatim = match.matchedText
+        words_around = 1
         pre_error_text = text_for_lt_analysis[:match.offset]
         words_before = pre_error_text.split()[-words_around:]
-
-        # Text after the error
         post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
         words_after = post_error_text.split()[:words_around]
-
-        # Combine to form the new wider context
         context_parts = []
-        if words_before:
-            context_parts.append(" ".join(words_before))
-        context_parts.append(error_text_verbatim) # The actual error phrase
-        if words_after:
-            context_parts.append(" ".join(words_after))
-
+        if words_before: context_parts.append(" ".join(words_before))
+        context_parts.append(error_text_verbatim)
+        if words_after: context_parts.append(" ".join(words_after))
         wider_context_str = " ".join(context_parts)
 
         processed_lt_issues.append({
@@ -97,7 +172,7 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
             'ruleId': match.ruleId,
             'message': match.message,
             'context_text': wider_context_str,
-            'error_text_verbatim': error_text_verbatim, # Store the verbatim error text
+            'error_text_verbatim': error_text_verbatim,
             'offset_in_text': match.offset,
             'error_length': match.errorLength,
             'replacements_suggestion': match.replacements[:3] if match.replacements else [],
@@ -107,7 +182,11 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
             'pdf_coordinates_list': [],
             'mapped_page_number': -1
         })
-    print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range of its text.")
+
+    print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues.")
+    if acronym_entities:
+        print(f"LT_Checker: Filtered out {filtered_acronym_issues} LT issues due to acronym detection.")
+    print(f"LT_Checker: {lt_issues_in_range} LT issues within defined content range (after acronym filtering).")
 
     return {
         "total_issues": len(processed_lt_issues),
 
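For reviewers, a minimal sketch (not part of the commit) of how the new acronym filter behaves. The entity dicts below are hand-built stand-ins mirroring the shape the module reads via entity.get('span', ''); in the real flow they come from SpanMarkerModel.predict():

from language_checker import _is_text_acronym_related

# Hand-built stand-ins for SpanMarker output; only the 'span' key is consulted.
entities = [{"span": "CNN"}, {"span": "convolutional neural network"}]

# A LanguageTool match whose flagged text contains a detected acronym span is skipped.
print(_is_text_acronym_related("the CNN model", entities))   # True  -> filtered out
print(_is_text_acronym_related("a simple typo", entities))   # False -> kept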