# NOTE: the following header lines are residue from the Hugging Face file page
# (uploader, commit hash, "raw/history/blame", size) and are not part of the
# module; kept here as a comment so the file remains valid Python.
# kltn20133118 — "Upload 337 files" — commit dbaa71b (verified) — 7.69 kB
import logging
from typing import Any, Dict, List, Optional
from presidio_analyzer import AnalyzerEngine, EntityRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer.entities.engine import OperatorConfig
from pydantic import BaseModel, Field, PrivateAttr
from obsei.analyzer.base_analyzer import (
BaseAnalyzer,
BaseAnalyzerConfig,
)
from obsei.payload import TextPayload
logger = logging.getLogger(__name__)
class PresidioModelConfig(BaseModel):
    """A (language code, spaCy model name) pair consumed by the Presidio NLP engine."""

    # ISO language code the model handles; defaults to English.
    lang_code: Optional[str] = "en"
    # Name of the spaCy pipeline package to load for that language.
    model_name: Optional[str] = "en_core_web_lg"
class PresidioEngineConfig(BaseModel):
    """NLP engine selection (backend name + per-language model configs) for Presidio."""

    # Backend identifier understood by NlpEngineProvider (e.g. "spacy").
    nlp_engine_name: Optional[str] = Field("spacy")
    models: Optional[List[PresidioModelConfig]] = None

    def __init__(self, **data: Any):
        super().__init__(**data)
        # Guarantee at least one model config; fall back to the defaults
        # (English, en_core_web_lg) when none were supplied.
        if not self.models:
            self.models = [PresidioModelConfig()]
class PresidioAnonymizerConfig(OperatorConfig, BaseModel):  # type: ignore
    # Bridges Presidio's OperatorConfig into a pydantic-friendly config object so it
    # can be used as a field type on PresidioPIIAnalyzerConfig.
    def __init__(self, anonymizer_name: str, params: Optional[Dict[str, Any]] = None):
        # NOTE(review): current presidio-anonymizer releases spell this parameter
        # `operator_name`; `anonymizer_name` presumably matches the presidio version
        # this project pins — confirm against the installed package.
        super().__init__(anonymizer_name=anonymizer_name, params=params)
    class Config:
        # OperatorConfig is not a pydantic model, so pydantic must be told to
        # accept it as an arbitrary type.
        arbitrary_types_allowed = True
class PresidioPIIAnalyzerConfig(BaseAnalyzerConfig):
    """Per-call configuration for PresidioPIIAnalyzer.analyze_input."""

    TYPE: str = "PresidioPII"
    # Mapping of entity type -> anonymizer operator config.
    # To find more details refer https://microsoft.github.io/presidio/anonymizer/
    anonymizers_config: Optional[Dict[str, PresidioAnonymizerConfig]] = None
    # To see list of supported entities refer https://microsoft.github.io/presidio/supported_entities/
    # By default it will search for all the supported entities
    entities: Optional[List[str]] = None
    # When True, only detect PII; skip the anonymization pass entirely.
    analyze_only: Optional[bool] = False
    # When True, the returned payload's processed_text is the anonymized text
    # (if anonymization produced one); otherwise the original text is kept.
    replace_original_text: Optional[bool] = True
    # Whether the analysis decision process steps returned in the response
    return_decision_process: Optional[bool] = False
class PresidioPIIAnalyzer(BaseAnalyzer):
    """Detect (and optionally anonymize) PII entities in text payloads.

    Wraps Presidio's ``AnalyzerEngine`` for detection and ``AnonymizerEngine``
    for redaction. The NLP backend (spaCy by default) is built from
    ``engine_config``; missing spaCy models are downloaded on first use.
    """

    _analyzer: AnalyzerEngine = PrivateAttr()
    _anonymizer: AnonymizerEngine = PrivateAttr()

    TYPE: str = "PresidioPII"
    # NLP engine configuration; defaults to spaCy + en_core_web_lg when omitted.
    engine_config: Optional[PresidioEngineConfig] = None
    # To see list of supported entities refer https://microsoft.github.io/presidio/supported_entities/
    # To add customer recognizers refer https://microsoft.github.io/presidio/analyzer/adding_recognizers/
    entity_recognizers: Optional[List[EntityRecognizer]] = None
    # Default anonymizer operators; a per-call config takes precedence.
    # To find more details refer https://microsoft.github.io/presidio/anonymizer/
    anonymizers_config: Optional[Dict[str, OperatorConfig]] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        if not self.engine_config:
            self.engine_config = PresidioEngineConfig()
        # PresidioEngineConfig.__init__ already guarantees a model, but guard
        # against callers who cleared the list after construction.
        if not self.engine_config.models:
            self.engine_config.models = [PresidioModelConfig()]

        # Collect supported languages and make sure spaCy models are present.
        languages = []
        for model_config in self.engine_config.models:
            languages.append(model_config.lang_code)
            # Check SpacyNlpEngine.engine_name
            if (
                self.engine_config.nlp_engine_name == "spacy"
                and model_config.model_name is not None
            ):
                self._ensure_spacy_model(model_config.model_name)

        # Create NLP engine based on configuration
        provider = NlpEngineProvider(nlp_configuration=self.engine_config.dict())
        nlp_engine = provider.create_engine()

        # Pass the created NLP engine and supported_languages to the AnalyzerEngine
        self._analyzer = AnalyzerEngine(
            nlp_engine=nlp_engine, supported_languages=languages
        )

        # Register any custom recognizers on top of the predefined ones.
        if self.entity_recognizers:
            for entity_recognizer in self.entity_recognizers:
                self._analyzer.registry.add_recognizer(entity_recognizer)

        # Initialize the anonymizer with logger
        self._anonymizer = AnonymizerEngine()

    @staticmethod
    def _ensure_spacy_model(model_name: str) -> None:
        """Import the spaCy model, downloading it first if it is not installed."""
        try:
            spacy_model = __import__(model_name)
            spacy_model.load()
            logger.info(f"Spacy model {model_name} is already downloaded")
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit. A missing model raises ImportError (ModuleNotFoundError);
        # a corrupt install surfaces as OSError from spacy's loader.
        except (ImportError, OSError):
            logger.warning(f"Spacy model {model_name} is not downloaded")
            logger.warning(
                f"Downloading spacy model {model_name}, it might take some time"
            )
            from spacy.cli import download  # type: ignore

            download(model_name)

    def analyze_input(  # type: ignore[override]
        self,
        source_response_list: List[TextPayload],
        analyzer_config: Optional[PresidioPIIAnalyzerConfig] = None,
        language: Optional[str] = "en",
        **kwargs: Any,
    ) -> List[TextPayload]:
        """Run PII detection (and, unless ``analyze_only``, anonymization).

        :param source_response_list: payloads whose ``processed_text`` is scanned
        :param analyzer_config: required per-call options (entities, operators, flags)
        :param language: language code passed to Presidio's analyzer
        :return: one TextPayload per input, with PII findings under
            ``segmented_data["pii_data"]``
        :raises ValueError: if ``analyzer_config`` is None
        """
        if analyzer_config is None:
            raise ValueError("analyzer_config can't be None")

        analyzer_output: List[TextPayload] = []
        for batch_responses in self.batchify(source_response_list, self.batch_size):
            for source_response in batch_responses:
                analyzer_result = self._analyzer.analyze(
                    text=source_response.processed_text,
                    entities=analyzer_config.entities,
                    return_decision_process=analyzer_config.return_decision_process,
                    language=language,
                )

                anonymized_result = None
                if not analyzer_config.analyze_only:
                    # Per-call operators win over the analyzer-level defaults.
                    anonymizers_config = (
                        analyzer_config.anonymizers_config or self.anonymizers_config
                    )
                    if (
                        source_response.processed_text is not None
                        and len(source_response.processed_text) > 0
                    ):
                        anonymized_result = self._anonymizer.anonymize(
                            text=source_response.processed_text,
                            operators=anonymizers_config,
                            analyzer_results=analyzer_result,
                        )

                # Optionally replace the payload text with the redacted version.
                if (
                    analyzer_config.replace_original_text
                    and anonymized_result is not None
                ):
                    text = anonymized_result.text
                else:
                    text = source_response.processed_text

                segmented_data = {
                    "pii_data": {
                        "analyzer_result": [vars(result) for result in analyzer_result],
                        "anonymized_result": None
                        if not anonymized_result
                        else [vars(item) for item in anonymized_result.items],
                        "anonymized_text": None
                        if not anonymized_result
                        else anonymized_result.text,
                    }
                }
                # Preserve any segmented data already attached to the payload
                # (existing keys take precedence over pii_data on collision).
                if source_response.segmented_data:
                    segmented_data = {
                        **segmented_data,
                        **source_response.segmented_data,
                    }

                analyzer_output.append(
                    TextPayload(
                        processed_text=text,
                        meta=source_response.meta,
                        segmented_data=segmented_data,
                        source_name=source_response.source_name,
                    )
                )

        return analyzer_output