Spaces:
Sleeping
Sleeping
File size: 2,352 Bytes
34b6727 e8cccdd b92070e e8cccdd b92070e 47d18ca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import os
# Point HOME and the library cache directories at /app. The values have to
# stay in sync with the env vars declared in the Dockerfile.
os.environ.update({
    'HOME': '/app',
    'HF_HOME': '/app/.hf_cache',
    'LANGTOOL_HOME': '/app/.ltool_cache',
    'XDG_CACHE_HOME': '/app/.cache',
})
import language_tool_python
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
class ParagraphCorrector:
    """Two-stage English text corrector.

    Every sentence is first run through LanguageTool (rule-based grammar and
    spelling fixes) and then through a T5 ``text2text-generation`` pipeline.
    Text is handled one sentence at a time, and the whitespace found between
    sentences is carried over to the output unchanged.
    """

    # Characters that can end a sentence.
    _TERMINATORS = frozenset('.!?')

    # Task prefix for the vennify/t5-base-grammar-correction checkpoint; its
    # model card prompts the model with "grammar: <text>".
    _TASK_PREFIX = "grammar: "

    def __init__(self):
        """Initialize correction models with conservative settings."""
        # Rule-based checker. The config entry raises the maximum text length
        # the LanguageTool server accepts (it does not change any timeout).
        self.grammar_tool = language_tool_python.LanguageTool(
            'en-US',
            config={'maxTextLength': 100000}
        )
        # Neural grammar correction model; GPU 0 when CUDA is available,
        # otherwise CPU (-1).
        self.grammar_model = pipeline(
            "text2text-generation",
            model="vennify/t5-base-grammar-correction",
            device=0 if torch.cuda.is_available() else -1
        )

    def correct_sentence(self, sentence: str) -> str:
        """Correct a single sentence conservatively.

        Args:
            sentence: The sentence to correct.

        Returns:
            The text generated by the neural model for the LanguageTool-fixed
            sentence. Blank input is returned unchanged without touching
            either model.
        """
        if not sentence.strip():
            return sentence

        # Stage 1: rule-based grammar/spelling correction.
        matches = self.grammar_tool.check(sentence)
        corrected = language_tool_python.utils.correct(sentence, matches)

        # Stage 2: light neural correction. Decoding is plain beam search, so
        # sampling-only options such as temperature are not passed: they have
        # no effect unless sampling is enabled.
        result = self.grammar_model(
            self._TASK_PREFIX + corrected,
            max_length=256,
            num_beams=3,
            early_stopping=True
        )
        return result[0]['generated_text']

    @classmethod
    def _split_sentences(cls, text: str) -> list:
        """Split *text* into consecutive pieces that concatenate back to it.

        A piece ends at a terminator ('.', '!' or '?') only when that
        terminator is followed by whitespace or is the last character, so
        decimals ("3.14"), ellipses ("...") and runs such as "?!" stay inside
        one piece. Whitespace after a terminator belongs to the next piece.
        """
        pieces = []
        start = 0
        last_index = len(text) - 1
        for index, char in enumerate(text):
            if char not in cls._TERMINATORS:
                continue
            if index == last_index or text[index + 1].isspace():
                pieces.append(text[start:index + 1])
                start = index + 1
        if start < len(text):
            # Trailing text without a closing terminator (or pure whitespace).
            pieces.append(text[start:])
        return pieces

    def conservative_correction(self, text: str) -> str:
        """Process text while preserving original structure.

        Args:
            text: Arbitrary text, possibly spanning several sentences.

        Returns:
            The text with each sentence replaced by its correction. Leading
            and trailing whitespace of every sentence (spaces, newlines) is
            kept exactly as in the input; blank input is returned unchanged.
        """
        if not text.strip():
            return text

        corrected_pieces = []
        for piece in self._split_sentences(text):
            body = piece.lstrip()
            if not body:
                # Whitespace-only piece: nothing to correct.
                corrected_pieces.append(piece)
                continue
            lead = piece[:len(piece) - len(body)]
            core = body.rstrip()
            trail = body[len(core):]
            # Only the sentence itself goes to the models. The surrounding
            # whitespace is re-attached afterwards because the generated text
            # does not carry it, and joining bare results would glue
            # sentences together.
            fixed = self.correct_sentence(core).strip()
            corrected_pieces.append(lead + fixed + trail)
        return ''.join(corrected_pieces)