"""

G艂贸wny modu艂 biblioteki zawieraj膮cy klas臋 TextAnalyzer.

"""
import spacy
import textstat
import re
from typing import Dict, List, Tuple, Iterable

from . import constants
from .features import base_features, linguistic_features, regex_features, spacy_features, structural_features

class TextAnalyzer:
    """

    G艂贸wna klasa do kompleksowej analizy tekstu w j臋zyku polskim.

    """
    def __init__(self):
        """

        Inicjalizuje analizator.

        """
        try:
            self.nlp = spacy.load(constants.SPACY_MODEL_PL)
            self.nlp.max_length = constants.NLP_MAX_LENGTH
        except OSError:
            print(f"B艂膮d: Nie znaleziono modelu spaCy '{constants.SPACY_MODEL_PL}'.")
            print(f"python -m spacy download {constants.SPACY_MODEL_PL}")
            raise
        textstat.set_lang('pl_PL')

    def _preprocess(self, text: str) -> Tuple[str, List[str], List[str], List[str], List[str]]:
        """Computes the shared intermediate representations used by the feature extractors."""
        text_lower = text.lower()
        words = text.split()
        words_lower = text_lower.split()
        lines = text.splitlines()
        # Naive sentence split: a trailing fragment without '.', '!' or '?' is dropped.
        sentences = re.findall(r'[^.!?]+[.!?]', text)
        return text_lower, words, words_lower, lines, sentences

    def analyze(self, text: str) -> Dict[str, float]:
        """Analyzes a single text and returns its feature dictionary."""
        # Guard before running the pipeline: self.nlp raises on non-string input.
        if not isinstance(text, str) or not text.strip():
            return {feature_name: 0.0 for feature_name in constants.COLUMN_ORDER}
        doc = self.nlp(text)
        return self._analyze_single_doc(text, doc)

    def _analyze_single_doc(self, text: str, doc: spacy.tokens.Doc) -> Dict[str, float]:
        """Wewn臋trzna logika analizy dla pojedynczego tekstu i obiektu doc."""
        if not isinstance(text, str) or not text.strip():
            return {feature_name: 0.0 for feature_name in constants.COLUMN_ORDER}

        text_lower, words, words_lower, lines, sentences = self._preprocess(text)
        
        all_features = {}
        all_features.update(base_features.calculate_all_base_features(text, text_lower, words, words_lower, lines))
        all_features.update(linguistic_features.calculate_all_linguistic_features(text, text_lower, words, words_lower, sentences))
        all_features.update(structural_features.calculate_all_structural_features(text, lines, sentences))
        all_features.update(regex_features.calculate_all_regex_features(text))
        all_features.update(spacy_features.calculate_all_spacy_features(doc, text, sentences))
        
        return all_features

    def analyze_batch(self, texts: Iterable[str], batch_size: int = 100) -> Iterable[Dict[str, float]]:
        """
        Analyzes a collection of texts in batches, using nlp.pipe for maximum throughput.

        Args:
            texts (Iterable[str]): Collection (e.g. a list) of texts to analyze.
            batch_size (int): Batch size passed to spaCy.

        Yields:
            Dict[str, float]: A feature dictionary for each input text.
        """
        # nlp.pipe is a generator that processes the texts in batches.
        docs = self.nlp.pipe(texts, batch_size=batch_size)

        # doc.text preserves the original input exactly, so this also works when
        # `texts` is a one-shot iterator that cannot be indexed (the original
        # texts[i] lookup failed for generators).
        for doc in docs:
            yield self._analyze_single_doc(doc.text, doc)
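

if __name__ == "__main__":
    # Minimal usage sketch (an assumed example, not part of the original module).
    # It presumes the spaCy model named by constants.SPACY_MODEL_PL is installed,
    # and, because of the relative imports above, it must be run as a module,
    # e.g. `python -m <package>.<this_module>`.
    analyzer = TextAnalyzer()

    # Single-text path: returns one feature dictionary.
    features = analyzer.analyze("To jest przykładowe zdanie. A to drugie!")
    print(f"{len(features)} features extracted")

    # Batch path: analyze_batch is a generator, so results stream lazily.
    sample_texts = ["Pierwszy tekst.", "Drugi, nieco dłuższy tekst?"]
    for result in analyzer.analyze_batch(sample_texts, batch_size=2):
        print(sorted(result)[:3], "...")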