Spaces:
Sleeping
Sleeping
| import os | |
| # Set cache dirs (must match Dockerfile env vars) | |
| os.environ['HOME'] = '/app' | |
| os.environ['HF_HOME'] = '/app/.hf_cache' | |
| os.environ['LANGTOOL_HOME'] = '/app/.ltool_cache' | |
| os.environ['XDG_CACHE_HOME'] = '/app/.cache' | |
| import language_tool_python | |
| from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM | |
| import torch | |
class ParagraphCorrector:
    """Two-stage English text corrector.

    Each sentence is first passed through LanguageTool (rule-based
    grammar/spelling fixes) and then through the
    ``vennify/t5-base-grammar-correction`` text2text pipeline.  Text is
    processed sentence by sentence and the whitespace around every
    sentence is carried over unchanged.
    """

    # Characters that may end a sentence.
    _TERMINATORS = frozenset('.!?')
    # Task prefix used in the model card's usage example for
    # vennify/t5-base-grammar-correction.
    _MODEL_PREFIX = "grammar: "

    def __init__(self):
        """Initialize correction models with conservative settings"""
        # Grammar tool; raise the maximum text length the LanguageTool
        # server accepts (this is a length limit, not a timeout).
        self.grammar_tool = language_tool_python.LanguageTool(
            'en-US',
            config={'maxTextLength': 100000}
        )
        # Conservative grammar correction model (GPU 0 when available,
        # otherwise CPU).
        self.grammar_model = pipeline(
            "text2text-generation",
            model="vennify/t5-base-grammar-correction",
            device=0 if torch.cuda.is_available() else -1
        )

    def correct_sentence(self, sentence: str) -> str:
        """Correct a single sentence conservatively.

        Args:
            sentence: The sentence to correct.

        Returns:
            The model's corrected text, or ``sentence`` unchanged when it
            is empty or whitespace-only.
        """
        if not sentence.strip():
            # Nothing to correct; avoid asking the model to generate
            # text from an empty prompt.
            return sentence

        # Basic grammar/spelling correction
        matches = self.grammar_tool.check(sentence)
        corrected = language_tool_python.utils.correct(sentence, matches)

        # Light neural correction.  Decoding is deterministic beam
        # search; no `temperature` is passed because it only takes
        # effect when sampling is enabled (do_sample=True).
        result = self.grammar_model(
            self._MODEL_PREFIX + corrected,
            max_length=256,
            num_beams=3,
            early_stopping=True
        )
        return result[0]['generated_text']

    @staticmethod
    def _split_sentences(text: str) -> list:
        """Split *text* into sentence segments without dropping characters.

        A segment ends at '.', '!' or '?' only when that character is the
        last one in the text or is followed by whitespace, so "3.14",
        "..." and "!?" are not cut apart.  Joining the returned segments
        reproduces *text* exactly.  Abbreviations followed by a space
        (e.g. "Dr. Smith") are still split; this is a known limitation.
        """
        segments = []
        start = 0
        length = len(text)
        for index, char in enumerate(text):
            if char not in ParagraphCorrector._TERMINATORS:
                continue
            if index + 1 < length and not text[index + 1].isspace():
                # Terminator inside a token or a run of punctuation.
                continue
            segments.append(text[start:index + 1])
            start = index + 1
        if start < length:
            # Trailing text without a closing terminator.
            segments.append(text[start:])
        return segments

    @staticmethod
    def _split_whitespace(segment: str) -> tuple:
        """Return ``(leading_ws, core, trailing_ws)`` for *segment*.

        For a whitespace-only segment the whole segment is returned as
        the leading part and the other two parts are empty.
        """
        core = segment.strip()
        if not core:
            return segment, "", ""
        lead_len = len(segment) - len(segment.lstrip())
        trail_len = len(segment) - len(segment.rstrip())
        return (
            segment[:lead_len],
            core,
            segment[len(segment) - trail_len:],
        )

    def conservative_correction(self, text: str) -> str:
        """Process text while preserving original structure.

        The text is split into sentences, each sentence is corrected on
        its own, and the original whitespace (spaces, newlines) around
        every sentence is put back around the corrected sentence.

        Args:
            text: The paragraph(s) to correct.

        Returns:
            The corrected text; empty or whitespace-only input is
            returned unchanged.
        """
        if not text.strip():
            return text

        pieces = []
        for segment in self._split_sentences(text):
            lead, core, trail = self._split_whitespace(segment)
            if not core:
                pieces.append(segment)
                continue
            # Correct only the sentence itself; the surrounding
            # whitespace is reattached verbatim so sentence spacing and
            # line breaks survive the round trip through the model.
            pieces.append(lead + self.correct_sentence(core).strip() + trail)
        return ''.join(pieces)