thlinhares committed · Commit 8864085 · verified · 1 Parent(s): 4d79540

Update analyzers/ner_analyzer.py

Files changed (1)
  1. analyzers/ner_analyzer.py +65 -24
analyzers/ner_analyzer.py CHANGED
@@ -4,65 +4,106 @@ import torch
 from typing import List, Tuple
 import logging
 from .base_analyzer import BaseAnalyzer
-from huggingface_hub import hf_api
 
 logger = logging.getLogger(__name__)
 
 class NERAnalyzer(BaseAnalyzer):
     def __init__(self):
-        self.model_name = "pierreguillou/ner-bert-base-pt" # Modelo NER para português
+        self.model_name = "jpbahiaz/bert-base-portuguese-ner" # Modelo NER mais leve para português
         logger.info(f"Carregando o modelo NER: {self.model_name}")
 
-        # Passando o token de autenticação ao carregar o modelo
-        self.token = os.getenv("token_huggingface")
-        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name, use_auth_token=self.token)
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_auth_token=self.token)
+        # Carregando o modelo e tokenizer sem necessidade de token de autenticação
+        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        # Definindo as labels que queremos extrair (pessoas e organizações)
+        self.target_labels = ['B-PESSOA', 'I-PESSOA', 'B-ORGANIZACAO', 'I-ORGANIZACAO']
+
         logger.info("Modelo NER e tokenizador carregados com sucesso")
-
+
     def extract_entities(self, text: str) -> List[Tuple[str, str]]:
         logger.debug("Iniciando extração de entidades com NER")
-        inputs = self.tokenizer(text, max_length=512, truncation=True, return_tensors="pt")
-        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) # Converter ids de volta para tokens
+
+        # Pré-processamento do texto
+        inputs = self.tokenizer(
+            text,
+            max_length=512,
+            truncation=True,
+            return_tensors="pt",
+            padding=True
+        )
+
+        # Obtendo tokens
+        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+
+        # Fazendo a predição
         with torch.no_grad():
-            outputs = self.model(**inputs).logits
-            predictions = torch.argmax(outputs, dim=2)
-
+            outputs = self.model(**inputs)
+            predictions = torch.argmax(outputs.logits, dim=2)
+
         entities = []
         for token, prediction in zip(tokens, predictions[0].numpy()):
             entity_label = self.model.config.id2label[prediction]
-            if entity_label != "O": # Ignorar tokens não relacionados a entidades
-                entities.append((token, entity_label))
-
+
+            # Filtrando apenas pessoas e organizações
+            if entity_label in self.target_labels:
+                # Removendo prefixos especiais do tokenizer
+                if token.startswith("##"):
+                    token = token[2:]
+                # Ignorando tokens especiais
+                if token not in ["[CLS]", "[SEP]", "[PAD]"]:
+                    entities.append((token, entity_label))
+
         logger.info(f"Entidades extraídas: {entities}")
         return entities
 
     def extract_representatives(self, entities: List[Tuple[str, str]]) -> List[str]:
+        if not entities:
+            return []
+
         representatives = []
         current_entity = []
         current_label = None
-
+
         for token, label in entities:
-            if label == current_label:
+            # Verificando se é continuação da mesma entidade
+            is_same_entity = (
+                (label.startswith('B-') and current_label and current_label.endswith(label[2:])) or
+                (label.startswith('I-') and current_label and current_label.endswith(label[2:]))
+            )
+
+            if is_same_entity:
                 current_entity.append(token)
             else:
                 if current_entity:
-                    representatives.append(" ".join(current_entity))
+                    representatives.append("".join(current_entity).replace(" ##", ""))
                 current_entity = [token]
                 current_label = label
-
+
+        # Adicionando a última entidade
         if current_entity:
-            representatives.append(" ".join(current_entity))
-
+            representatives.append("".join(current_entity).replace(" ##", ""))
+
+        # Removendo duplicatas e limpando
+        representatives = list(set(representatives))
+        representatives = [rep.strip() for rep in representatives if len(rep.strip()) > 1]
+
         logger.info(f"Representantes extraídos: {representatives}")
         return representatives
-
+
     def analyze(self, text: str) -> List[str]:
         entities = self.extract_entities(text)
         return self.extract_representatives(entities)
 
     def format_output(self, representatives: List[str]) -> str:
         output = "ANÁLISE DO CONTRATO SOCIAL (NER)\n\n"
-        output += "REPRESENTANTES IDENTIFICADOS:\n"
+
+        if not representatives:
+            output += "Nenhum representante ou empresa identificado.\n"
+            return output
+
+        output += "REPRESENTANTES E EMPRESAS IDENTIFICADOS:\n"
         for rep in representatives:
             output += f"- {rep}\n"
-        return output
+
+        return output
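
For context on how the changed methods fit together, here is a minimal driver sketch (not part of the commit). It assumes the repository's analyzers/ directory is an importable package exposing NERAnalyzer as defined above, that the file's top-level imports of torch and the transformers Auto* classes (above this hunk) are unchanged, and the sample contract text is invented for illustration.

# Hypothetical usage sketch, not from the repository: exercises the updated
# NERAnalyzer end to end, assuming the analyzers package is on the import path.
from analyzers.ner_analyzer import NERAnalyzer

if __name__ == "__main__":
    # __init__ downloads jpbahiaz/bert-base-portuguese-ner and its tokenizer on first use.
    analyzer = NERAnalyzer()

    # Illustrative excerpt of a contrato social (invented text).
    texto = (
        "A sociedade será administrada por João da Silva, "
        "sócio da Empresa Exemplo Ltda."
    )

    # analyze() chains extract_entities() and extract_representatives();
    # format_output() renders the text report built in the diff above.
    representantes = analyzer.analyze(texto)
    print(analyzer.format_output(representantes))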