adgw
/

Joblib
adgw's picture
fix
e55029f verified
"""
G艂贸wny modu艂 biblioteki zawieraj膮cy klas臋 TextAnalyzer.
"""
import spacy
import textstat
import re
from typing import Dict, List, Tuple, Iterable
from . import constants
from .features import base_features, linguistic_features, regex_features, spacy_features, structural_features
class TextAnalyzer:
    """
    Main class for comprehensive analysis of Polish-language text.

    Loads a spaCy pipeline once at construction and exposes single-text
    (`analyze`) and batched (`analyze_batch`) entry points that both return
    a flat feature dictionary assembled from the feature sub-modules.
    """

    def __init__(self):
        """
        Initialize the analyzer: load the Polish spaCy model and switch
        textstat to Polish.

        Raises:
            OSError: if the spaCy model is not installed locally.
        """
        try:
            self.nlp = spacy.load(constants.SPACY_MODEL_PL)
            self.nlp.max_length = constants.NLP_MAX_LENGTH
        except OSError:
            # Model is missing — tell the user how to install it, then re-raise.
            print(f"Błąd: Nie znaleziono modelu spaCy '{constants.SPACY_MODEL_PL}'.")
            print(f"python -m spacy download {constants.SPACY_MODEL_PL}")
            raise
        textstat.set_lang('pl_PL')

    def _preprocess(self, text: str) -> Tuple:
        """
        Split *text* into the views the feature extractors consume.

        Returns:
            Tuple of (lowercased text, whitespace-split words,
            lowercased words, lines, sentences). Sentences are naive
            regex spans ending in '.', '!' or '?'; trailing text without
            terminal punctuation is not captured.
        """
        text_lower = text.lower()
        words = text.split()
        words_lower = text_lower.split()
        lines = text.splitlines()
        sentences = re.findall(r'[^.!?]+[.!?]', text)
        return text_lower, words, words_lower, lines, sentences

    def analyze(self, text: str) -> Dict[str, float]:
        """Analyze a single text and return its feature dictionary."""
        doc = self.nlp(text)
        return self._analyze_single_doc(text, doc)

    def _analyze_single_doc(self, text: str, doc: "spacy.tokens.Doc") -> Dict[str, float]:
        """Internal analysis logic for a single text and its spaCy doc."""
        # Empty / non-string input: return an all-zero feature vector so the
        # output schema stays constant for downstream consumers.
        if not isinstance(text, str) or not text.strip():
            return {feature_name: 0.0 for feature_name in constants.COLUMN_ORDER}
        text_lower, words, words_lower, lines, sentences = self._preprocess(text)
        all_features = {}
        all_features.update(base_features.calculate_all_base_features(text, text_lower, words, words_lower, lines))
        all_features.update(linguistic_features.calculate_all_linguistic_features(text, text_lower, words, words_lower, sentences))
        all_features.update(structural_features.calculate_all_structural_features(text, lines, sentences))
        all_features.update(regex_features.calculate_all_regex_features(text))
        all_features.update(spacy_features.calculate_all_spacy_features(doc, text, sentences))
        return all_features

    def analyze_batch(self, texts: Iterable[str], batch_size: int = 100) -> Iterable[Dict[str, float]]:
        """
        Analyze a collection of texts in batches via nlp.pipe for throughput.

        Args:
            texts (Iterable[str]): Collection of texts to analyze.
            batch_size (int): Batch size passed to spaCy.

        Yields:
            Dict[str, float]: Feature dictionary for each input text,
            in input order.
        """
        # Materialize once: the parameter is an arbitrary Iterable, which may
        # not support indexing, and a single-pass generator would be consumed
        # by nlp.pipe. (Original code did `texts[i]`, which crashes for any
        # non-sequence input.)
        texts = list(texts)
        docs = self.nlp.pipe(texts, batch_size=batch_size)
        for original_text, doc in zip(texts, docs):
            yield self._analyze_single_doc(original_text, doc)