File size: 2,352 Bytes
34b6727
e8cccdd
 
b92070e
e8cccdd
 
 
 
b92070e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47d18ca
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os

# Set cache dirs (must match Dockerfile env vars)
os.environ['HOME'] = '/app'
os.environ['HF_HOME'] = '/app/.hf_cache'
os.environ['LANGTOOL_HOME'] = '/app/.ltool_cache'
os.environ['XDG_CACHE_HOME'] = '/app/.cache'


import language_tool_python
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

class ParagraphCorrector:
    def __init__(self):
        """Initialize correction models with conservative settings"""
        # Grammar tool with increased timeout
        self.grammar_tool = language_tool_python.LanguageTool(
            'en-US',
            config={'maxTextLength': 100000}
        )

        # Conservative grammar correction model
        self.grammar_model = pipeline(
            "text2text-generation",
            model="vennify/t5-base-grammar-correction",
            device=0 if torch.cuda.is_available() else -1
        )

    def correct_sentence(self, sentence: str) -> str:
        """Correct a single sentence conservatively"""
        # Basic grammar/spelling correction
        matches = self.grammar_tool.check(sentence)
        corrected = language_tool_python.utils.correct(sentence, matches)

        # Light neural correction
        result = self.grammar_model(
            corrected,
            max_length=256,
            num_beams=3,
            temperature=0.3,  # Low temperature for minimal changes
            early_stopping=True
        )
        return result[0]['generated_text']

    def conservative_correction(self, text: str) -> str:
        """Process text while preserving original structure"""
        if not text.strip():
            return text

        # Split into sentences while preserving delimiters
        sentences = []
        current = ""
        for char in text:
            current += char
            if char in {'.', '!', '?'}:
                sentences.append(current)
                current = ""
        if current:
            sentences.append(current)

        # Correct each sentence individually
        corrected_sentences = []
        for sentence in sentences:
            if sentence.strip():
                corrected = self.correct_sentence(sentence)
                corrected_sentences.append(corrected)
            else:
                corrected_sentences.append(sentence)

        return ''.join(corrected_sentences)