import os

# Set cache dirs (must match Dockerfile env vars)
os.environ['HOME'] = '/app'
os.environ['HF_HOME'] = '/app/.hf_cache'
os.environ['LANGTOOL_HOME'] = '/app/.ltool_cache'
os.environ['XDG_CACHE_HOME'] = '/app/.cache'

import language_tool_python
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch


class ParagraphCorrector:
    def __init__(self):
        """Initialize correction models with conservative settings"""
        # Grammar tool with a larger maximum text length
        self.grammar_tool = language_tool_python.LanguageTool(
            'en-US',
            config={'maxTextLength': 100000}
        )
        # Conservative grammar correction model
        self.grammar_model = pipeline(
            "text2text-generation",
            model="vennify/t5-base-grammar-correction",
            device=0 if torch.cuda.is_available() else -1
        )

    def correct_sentence(self, sentence: str) -> str:
        """Correct a single sentence conservatively"""
        # Rule-based grammar/spelling correction via LanguageTool
        matches = self.grammar_tool.check(sentence)
        corrected = language_tool_python.utils.correct(sentence, matches)
        # Light neural correction; the model card recommends the "grammar: " task prefix
        result = self.grammar_model(
            "grammar: " + corrected,
            max_length=256,
            num_beams=3,  # deterministic beam search (no sampling) for minimal changes
            early_stopping=True
        )
        return result[0]['generated_text']

    def conservative_correction(self, text: str) -> str:
        """Process text while preserving original structure"""
        if not text.strip():
            return text
        # Split into sentences while preserving delimiters
        sentences = []
        current = ""
        for char in text:
            current += char
            if char in {'.', '!', '?'}:
                sentences.append(current)
                current = ""
        if current:
            sentences.append(current)
        # Correct each sentence individually
        corrected_sentences = []
        for sentence in sentences:
            if sentence.strip():
                corrected = self.correct_sentence(sentence.strip())
                # Re-attach the original surrounding whitespace so spacing
                # between sentences survives the model round-trip
                leading = sentence[:len(sentence) - len(sentence.lstrip())]
                trailing = sentence[len(sentence.rstrip()):]
                corrected_sentences.append(leading + corrected + trailing)
            else:
                corrected_sentences.append(sentence)
        return ''.join(corrected_sentences)
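

# Minimal usage sketch (illustrative only): instantiates the corrector and
# runs it on a hypothetical sample paragraph when the module is executed
# directly. The sample text below is an assumption, not part of the app.
if __name__ == "__main__":
    corrector = ParagraphCorrector()
    sample = "She dont like going too the store. their are many reason why!"
    print(corrector.conservative_correction(sample))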