"""
Main library module containing the TextAnalyzer class.
"""

import re
from typing import Dict, Iterable, List, Tuple

import spacy
import textstat

from . import constants
from .features import base_features, linguistic_features, regex_features, spacy_features, structural_features


class TextAnalyzer:
    """
    Main class for comprehensive analysis of Polish-language text.
    """

    def __init__(self):
        """
        Initializes the analyzer.
        """
        try:
            self.nlp = spacy.load(constants.SPACY_MODEL_PL)
            self.nlp.max_length = constants.NLP_MAX_LENGTH
        except OSError:
            print(f"Error: spaCy model '{constants.SPACY_MODEL_PL}' not found.")
            print(f"Install it with: python -m spacy download {constants.SPACY_MODEL_PL}")
            raise
        # Configure textstat for Polish readability metrics.
        textstat.set_lang('pl_PL')

    def _preprocess(self, text: str) -> Tuple[str, List[str], List[str], List[str], List[str]]:
        """Splits the text into the lowercased form, word lists, lines and sentences used by the feature extractors."""
        text_lower = text.lower()
        words = text.split()
        words_lower = text_lower.split()
        lines = text.splitlines()
        # Naive sentence split: sequences terminated by '.', '!' or '?'.
        sentences = re.findall(r'[^.!?]+[.!?]', text)
        return text_lower, words, words_lower, lines, sentences
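
    # Illustration (a sketch, not part of the API): the naive regex above
    # splits on terminal punctuation, e.g.
    #   re.findall(r'[^.!?]+[.!?]', "Ala ma kota. Na pewno?")
    #   -> ['Ala ma kota.', ' Na pewno?']
    # Leading whitespace is kept, and a trailing fragment without
    # '.', '!' or '?' is silently dropped.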

    def analyze(self, text: str) -> Dict[str, float]:
        """Analyzes a single text."""
        doc = self.nlp(text)
        return self._analyze_single_doc(text, doc)
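
    # Example (a hedged sketch): for non-empty input, `analyze` returns a flat
    # dict of numeric features merged from the `.features` modules, e.g.
    #   analyzer = TextAnalyzer()
    #   features = analyzer.analyze("Ala ma kota.")
    # The exact keys depend on what those feature modules compute.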

    def _analyze_single_doc(self, text: str, doc: spacy.tokens.Doc) -> Dict[str, float]:
        """Internal analysis logic for a single text and its doc object."""
        if not isinstance(text, str) or not text.strip():
            # Empty or non-string input: return a zeroed feature vector.
            return {feature_name: 0.0 for feature_name in constants.COLUMN_ORDER}

        text_lower, words, words_lower, lines, sentences = self._preprocess(text)

        all_features = {}
        all_features.update(base_features.calculate_all_base_features(text, text_lower, words, words_lower, lines))
        all_features.update(linguistic_features.calculate_all_linguistic_features(text, text_lower, words, words_lower, sentences))
        all_features.update(structural_features.calculate_all_structural_features(text, lines, sentences))
        all_features.update(regex_features.calculate_all_regex_features(text))
        all_features.update(spacy_features.calculate_all_spacy_features(doc, text, sentences))

        return all_features

    def analyze_batch(self, texts: Iterable[str], batch_size: int = 100) -> Iterable[Dict[str, float]]:
        """
        Analyzes a collection of texts in batches using nlp.pipe for maximum throughput.

        Args:
            texts (Iterable[str]): Collection (e.g. a list) of texts to analyze.
            batch_size (int): Batch size passed to spaCy.

        Yields:
            Dict[str, float]: A feature dictionary for each text.
        """
        docs = self.nlp.pipe(texts, batch_size=batch_size)

        for doc in docs:
            # doc.text preserves the original input, so no indexing into
            # `texts` is needed; this keeps the method correct for any
            # iterable, including generators, which are not subscriptable.
            yield self._analyze_single_doc(doc.text, doc)
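

# A minimal usage sketch, assuming the spaCy model named in
# constants.SPACY_MODEL_PL is installed locally and the module is run as part
# of its package (e.g. `python -m <package>.<module>`, since it uses relative
# imports). The sample texts are illustrative only.
if __name__ == "__main__":
    analyzer = TextAnalyzer()

    # Single text.
    single = analyzer.analyze("Ala ma kota. Czy to prawda?")
    print(len(single), "features extracted")

    # Batch of texts: analyze_batch is a generator, so results stream in
    # as spaCy finishes each internal batch.
    corpus = ["Pierwszy tekst.", "Drugi, nieco dluzszy tekst!"]
    for features in analyzer.analyze_batch(corpus, batch_size=50):
        print(sorted(features)[:5])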