Update app.py
Browse files
app.py
CHANGED
|
@@ -19,6 +19,10 @@ import gradio as gr
|
|
| 19 |
# Set JAVA_HOME environment variable (from target script)
|
| 20 |
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# --- Functions for PDF to Markdown to Plain Text ---
|
| 24 |
def convert_markdown_to_plain_text(markdown_text: str) -> str:
|
|
@@ -175,6 +179,7 @@ def check_structure(plain_text: str) -> Dict[str, bool]:
|
|
| 175 |
"abstract_structure": "structured abstract" in text_lower
|
| 176 |
}
|
| 177 |
|
|
|
|
| 178 |
def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
|
| 179 |
"""
|
| 180 |
Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
|
|
@@ -230,7 +235,9 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
|
|
| 230 |
processed_issues: List[Dict[str, Any]] = []
|
| 231 |
|
| 232 |
try:
|
|
|
|
| 233 |
tool = language_tool_python.LanguageTool('en-US')
|
|
|
|
| 234 |
raw_lt_matches = tool.check(text_for_analysis)
|
| 235 |
|
| 236 |
# Define a set of rule IDs to ignore
|
|
@@ -250,7 +257,7 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
|
|
| 250 |
continue
|
| 251 |
lt_issues_in_range +=1
|
| 252 |
|
| 253 |
-
context_str = text_for_analysis[match.offset : match.offset + match.errorLength]
|
| 254 |
processed_issues.append({
|
| 255 |
'_internal_id': f"lt_{idx}",
|
| 256 |
'ruleId': match.ruleId,
|
|
|
|
| 19 |
# Set JAVA_HOME environment variable (from target script)
|
| 20 |
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
|
| 21 |
|
| 22 |
+
# Module-level tunables shared by the checks below.
# CONTEXT_LENGTH: number of characters taken before the start and after the end
# of a LanguageTool match when slicing text_for_analysis to build context_str.
global_constants = {
|
| 23 |
+
"CONTEXT_LENGTH" : 3
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
|
| 27 |
# --- Functions for PDF to Markdown to Plain Text ---
|
| 28 |
def convert_markdown_to_plain_text(markdown_text: str) -> str:
|
|
|
|
| 179 |
"abstract_structure": "structured abstract" in text_lower
|
| 180 |
}
|
| 181 |
|
| 182 |
+
|
| 183 |
def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
|
| 184 |
"""
|
| 185 |
Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
|
|
|
|
| 235 |
processed_issues: List[Dict[str, Any]] = []
|
| 236 |
|
| 237 |
try:
|
| 238 |
+
|
| 239 |
tool = language_tool_python.LanguageTool('en-US')
|
| 240 |
+
# NOTE(review): writes the full text_for_analysis to stdout just before it is
# handed to tool.check(); this looks like leftover debugging output — confirm
# whether it should be removed or routed through logging.
print(text_for_analysis)
|
| 241 |
raw_lt_matches = tool.check(text_for_analysis)
|
| 242 |
|
| 243 |
# Define a set of rule IDs to ignore
|
|
|
|
| 257 |
continue
|
| 258 |
lt_issues_in_range +=1
|
| 259 |
|
| 260 |
+
# Build the context window: the matched span plus CONTEXT_LENGTH characters on
# each side. The start is clamped at 0 because a negative start would be read
# by Python as an index counted from the END of the string, producing an empty
# or unrelated context for matches near the beginning of the text. The end
# needs no clamp: slicing past len(text) simply stops at the end.
ctx_start = max(0, match.offset - global_constants["CONTEXT_LENGTH"])
ctx_end = match.offset + match.errorLength + global_constants["CONTEXT_LENGTH"]
context_str = text_for_analysis[ctx_start:ctx_end]
|
| 261 |
processed_issues.append({
|
| 262 |
'_internal_id': f"lt_{idx}",
|
| 263 |
'ruleId': match.ruleId,
|