Spaces:

thlinhares
/

docling

Sleeping

App Files Files Community

thlinhares committed on Feb 23

Commit

4cc6b62

verified ·

1 Parent(s): 647a6cb

Delete app_ner.py

Browse files

Files changed (1) hide show

app_ner.py +0 -99

app_ner.py DELETED Viewed

@@ -1,99 +0,0 @@
-import gradio as gr
-from docling.document_converter import DocumentConverter
-from transformers import AutoModelForTokenClassification, AutoTokenizer
-import torch
-import spacy
-import logging
-# Logger configuration: the root logger is set to DEBUG, so the debug/info
-# messages emitted by the functions below are shown.
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger(__name__)
-# Load the token-classification (NER) model and its tokenizer once, at import
-# time; extract_entities() reuses these module-level instances on every call.
-model_name = "dominguesm/ner-legal-bert-base-cased-ptbr"
-logger.info(f"Carregando o modelo: {model_name}")
-model = AutoModelForTokenClassification.from_pretrained(model_name)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-logger.info("Modelo e tokenizador carregados com sucesso.")
-# Loading of the pt-BR spaCy NLP pipeline is currently disabled:
-#nlp = spacy.load("pt_core_news_sm")
-# Run NER inference on a text and collect the tokens tagged as entities.
-def extract_entities(text):
-    """Return the sub-word tokens of *text* that the NER model tags as entities.
-
-    The text is tokenized with the module-level ``tokenizer`` and classified
-    with the module-level ``model``. Only the first 512 tokens are analysed
-    (``truncation=True``); anything beyond that is silently ignored.
-
-    Args:
-        text: The plain text to analyse.
-
-    Returns:
-        list[tuple[str, str]]: ``(token, label)`` pairs, in reading order, for
-        every token whose predicted label is not ``"O"``. Tokens are returned
-        exactly as produced by the tokenizer (continuation pieces keep their
-        ``##`` prefix).
-    """
-    logger.debug(f"Iniciando extração de entidades para o texto: {text[:50]}...")
-    # Tokenize the text.
-    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")
-    tokens = inputs.tokens()
-    # word_ids() is None for special tokens inserted by the tokenizer
-    # ([CLS], [SEP], ...), which lets us tell them apart from real text.
-    word_ids = inputs.word_ids()
-    logger.debug("Obtendo previsões do modelo.")
-    # Get the model predictions; no gradients are needed for inference.
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    # Single sequence in the batch -> take row 0; tolist() yields plain ints
-    # that match the keys of id2label.
-    label_ids = torch.argmax(logits, dim=2)[0].tolist()
-    # Collect the recognised entities.
-    entities = []
-    logger.debug("Extraindo entidades.")
-    for token, word_id, label_id in zip(tokens, word_ids, label_ids):
-        if word_id is None:
-            # Special token: whatever label the model predicts here is
-            # meaningless, so it must never be reported as an entity.
-            continue
-        entity_label = model.config.id2label[label_id]
-        if entity_label != "O":  # "O" means the token is not part of an entity
-            entities.append((token, entity_label))
-    logger.info(f"Entidades extraídas: {entities}")
-    return entities
-# Rebuild person and organization names from the tagged tokens.
-def extract_representatives(entities):
-    """Group tagged tokens into person and organization names.
-
-    Args:
-        entities: Iterable of ``(token, label)`` pairs as returned by
-            ``extract_entities``. Labels are expected in ``B-<TYPE>`` /
-            ``I-<TYPE>`` form; only the ``PESSOA`` and ``ORGANIZACAO`` types
-            are kept, every other label just ends the name being built.
-
-    Returns:
-        list[str]: The reconstructed names, in order of appearance.
-
-    Reconstruction rules:
-        * a token starting with ``##`` continues the previous word and is
-          glued to it without a space;
-        * a ``B-`` tag, or a change of entity type, starts a new name;
-        * any other word-start token is appended after a single space, except
-          punctuation-only tokens, which are attached directly (so
-          ``S . A .`` becomes ``S. A.``). Original spacing around punctuation
-          cannot be recovered exactly from tokens alone.
-    """
-    logger.debug("Iniciando extração de representantes legais.")
-    wanted_types = ("PESSOA", "ORGANIZACAO")
-    representatives = []
-    current_type = None
-    current_name = ""
-    for token, label in entities:
-        tag, _, entity_type = label.partition("-")
-        if entity_type not in wanted_types:
-            # Some other entity type: it terminates whatever name is open.
-            if current_name:
-                representatives.append(current_name)
-            current_type, current_name = None, ""
-            continue
-        is_continuation = token.startswith("##")
-        piece = token[2:] if is_continuation else token
-        if is_continuation and current_name:
-            # A word piece always belongs to the word already being built,
-            # whatever tag the model assigned to this individual piece.
-            current_name += piece
-            continue
-        same_entity = bool(current_name) and tag != "B" and entity_type == current_type
-        if not same_entity:
-            # "B-" tag or type change: close the open name and start a new one.
-            if current_name:
-                representatives.append(current_name)
-            current_type, current_name = entity_type, piece
-        elif any(ch.isalnum() for ch in piece):
-            # New word inside the same entity: separate it with a space.
-            current_name += " " + piece
-        else:
-            # Punctuation-only token: attach it to the preceding word.
-            current_name += piece
-    # Flush the last name when the input ends while one is still open.
-    if current_name:
-        representatives.append(current_name)
-    logger.info(f"Representantes extraídos: {representatives}")
-    return representatives
-# Convert the uploaded document and extract the legal entities found in it.
-def convert_document(file):
-    """Convert an uploaded document to text and extract the names found in it.
-
-    Args:
-        file: The upload handed over by Gradio. Either an object exposing the
-            file path in its ``name`` attribute, a plain path string, or
-            ``None`` when the form was submitted without a file.
-
-    Returns:
-        tuple: ``(document_text, legal_representatives)`` where
-        ``document_text`` is the text exported by Docling and
-        ``legal_representatives`` is the list returned by
-        ``extract_representatives``. ``("", [])`` when no file was provided.
-    """
-    logger.debug("Iniciando conversão do documento.")
-    if file is None:
-        # Submitting the form without an upload used to fail with an
-        # AttributeError on ``file.name``; return empty outputs instead.
-        logger.warning("Nenhum arquivo enviado; nada a converter.")
-        return "", []
-    # Accept both file-like objects (path in ``.name``) and plain path strings.
-    path = getattr(file, "name", file)
-    converter = DocumentConverter()
-    logger.info(f"Convertendo o documento: {path}")
-    result = converter.convert(path)
-    document_text = result.document.export_to_text()
-    # Extract the tagged entities from the converted text.
-    entities = extract_entities(document_text)
-    # Rebuild the person/organization names from those entities.
-    legal_representatives = extract_representatives(entities)
-    return document_text, legal_representatives
-# Gradio interface: takes one uploaded file and shows the two values returned
-# by convert_document as a text output and a JSON output.
-demo = gr.Interface(fn=convert_document, inputs="file", outputs=["text", "json"])
-logger.info("Iniciando a interface Gradio.")
-# Launched unconditionally, i.e. as soon as this module is executed.
-demo.launch()