Spaces:

thlinhares
/

docling

Running

App Files Files Community

thlinhares committed on Feb 23

Commit

c491a68

verified ·

1 Parent(s): 7070f0c

Create new_app.py

Browse files

Files changed (1) hide show

new_app.py +200 -0

new_app.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import gradio as gr
+from docling.document_converter import DocumentConverter
+import google.generativeai as genai
+import re
+import os
+import logging
+import json
+from typing import Dict, List, Tuple
+from datetime import datetime
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+import torch
+import spacy
# Logging configuration: every record goes both to a per-day log file and to
# the console stream.
_log_file_handler = logging.FileHandler(
    f'contract_analyzer_{datetime.now().strftime("%Y%m%d")}.log'
)
_console_handler = logging.StreamHandler()
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[_log_file_handler, _console_handler],
)
logger = logging.getLogger(__name__)

# Gemini API configuration: the key is read from the environment and the
# module refuses to load without it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    logger.error("GOOGLE_API_KEY não encontrada nas variáveis de ambiente")
    raise ValueError("GOOGLE_API_KEY não configurada")
genai.configure(api_key=GOOGLE_API_KEY)
logger.info("API Gemini configurada com sucesso")

# Load the NER model and its tokenizer once, at import time, so that every
# request reuses the same instances.
model_name = "dominguesm/ner-legal-bert-base-cased-ptbr"
logger.info(f"Carregando o modelo NER: {model_name}")
ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
logger.info("Modelo NER e tokenizador carregados com sucesso")
# App 1 functions (Gemini)
def extract_json_from_response(response_text: str) -> str:
    """Return the JSON payload of a model reply with Markdown fences removed.

    Handles the reply shapes an LLM commonly produces:

    * bare JSON (returned stripped, otherwise unchanged);
    * a fenced block, with or without a language tag on the opening fence;
    * a fenced block surrounded by explanatory prose;
    * a single-line fenced reply such as ```` ```{"a": 1}``` ````.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The text expected to contain JSON, without surrounding fences or
        whitespace. No JSON validation is performed here.
    """
    text = response_text.strip()

    if not text.startswith("```"):
        # The fenced block may be embedded in prose; take the first one.
        fenced = re.search(r"```[^\n]*\n(.*?)```", text, re.DOTALL)
        if fenced:
            return fenced.group(1).strip()

    if text.startswith("```"):
        # Drop the whole opening fence line (this also removes a language
        # tag). Without a newline only the three backticks are removed, which
        # avoids the IndexError a blind split('\n')[1] would raise.
        _, newline, remainder = text.partition("\n")
        text = remainder if newline else text[3:]

    if text.endswith("```"):
        # Remove only the fence itself so JSON sharing its line is preserved.
        text = text[:-3]

    return text.strip()
def extract_legal_representatives_gemini(contract_text: str) -> Dict:
    """Ask Gemini for the partners and administrators named in a contract.

    The contract text is embedded in a prompt that requests a JSON object,
    the reply is stripped of Markdown fences and parsed.

    Args:
        contract_text: Plain text of the articles of association.

    Returns:
        A dict that always contains the list-valued keys ``"socios"`` and
        ``"administradores"``. When anything fails (API error, invalid JSON,
        reply that is not a JSON object) both lists are empty and an
        ``"erro"`` key carries the error message; no exception propagates.
    """
    logger.info("Iniciando extração de representantes legais com Gemini")
    try:
        model = genai.GenerativeModel('gemini-pro')
        prompt = """
        Analise o seguinte contrato social e extraia:
        1. Todos os sócios e seus percentuais de participação
        2. Todos os administradores mencionados
        Formate a resposta como um dicionário JSON com as seguintes chaves:
        - "socios": lista de dicionários com "nome" e "participacao"
        - "administradores": lista de nomes
        Contrato Social:
        {contract_text}
        """
        response = model.generate_content(prompt.format(contract_text=contract_text))
        json_content = extract_json_from_response(response.text)
        result = json.loads(json_content)

        # The model is free to answer with any JSON value; downstream code
        # calls .get() on the result, so anything but an object is an error.
        if not isinstance(result, dict):
            raise ValueError("Resposta do Gemini não é um objeto JSON")

        # Guarantee the documented shape even if a key is absent or null.
        for key in ("socios", "administradores"):
            if not isinstance(result.get(key), list):
                result[key] = []
        return result
    except Exception as e:
        # Deliberate best-effort boundary: report the failure in-band so the
        # UI can display it instead of crashing the request.
        logger.error(f"Erro na análise Gemini: {str(e)}")
        return {
            "socios": [],
            "administradores": [],
            "erro": str(e)
        }
# App 2 functions (NER)
def extract_entities(text: str) -> List[Tuple[str, str]]:
    """Tag ``text`` with the NER model and return the non-"O" tokens.

    The text is split into consecutive windows of at most 512 tokens so that
    long documents are analysed in full rather than being cut off after the
    first window. Each window is run through the model separately to keep
    peak memory bounded.

    Args:
        text: Document text to analyse.

    Returns:
        ``(token, label)`` pairs in document order for every word-piece whose
        predicted label is not ``"O"``. Special tokens (including padding)
        are never returned.
    """
    logger.debug("Iniciando extração de entidades com NER")
    encoded = tokenizer(
        text,
        max_length=512,
        truncation=True,
        return_overflowing_tokens=True,  # keep the text beyond the first window
        padding=True,  # windows must share a length to be stacked into tensors
        return_tensors="pt",
    )
    # Bookkeeping emitted together with overflowing windows; the model's
    # forward() does not accept it.
    encoded.pop("overflow_to_sample_mapping", None)

    special_tokens = set(tokenizer.all_special_tokens)
    id2label = ner_model.config.id2label
    window_count = encoded["input_ids"].shape[0]

    entities = []
    with torch.no_grad():
        for window in range(window_count):
            window_inputs = {
                name: tensor[window:window + 1] for name, tensor in encoded.items()
            }
            logits = ner_model(**window_inputs).logits
            # .tolist() yields plain ints, matching the id2label keys.
            label_ids = torch.argmax(logits, dim=2)[0].tolist()
            for token, label_id in zip(encoded.tokens(window), label_ids):
                if token in special_tokens:
                    continue
                entity_label = id2label[label_id]
                if entity_label != "O":
                    entities.append((token, entity_label))
    return entities
def extract_representatives_ner(entities: List[Tuple[str, str]]) -> List[str]:
    """Assemble person and organization names from tagged word-pieces.

    Consecutive tokens labelled ``PESSOA`` (or ``ORGANIZACAO``) are merged
    into one name. Word-piece continuations (``##`` prefix) are glued to the
    previous piece, separate words are joined with a single space, and
    punctuation-only tokens are attached without a leading space. A ``B-``
    tag on a non-continuation token, a change of entity type, or any other
    label closes the name being built.

    Args:
        entities: ``(token, label)`` pairs with BIO-style labels such as
            ``"B-PESSOA"`` or ``"I-ORGANIZACAO"``.

    Returns:
        The assembled names, in the order they were completed.
    """
    tracked_kinds = ("PESSOA", "ORGANIZACAO")
    representatives = []
    current_kind = None
    current_name = ""

    for token, label in entities:
        prefix, _, kind = label.partition("-")

        if kind not in tracked_kinds:
            # Any other entity type terminates the name in progress.
            if current_name:
                representatives.append(current_name)
            current_kind, current_name = None, ""
            continue

        is_subword = token.startswith("##")
        piece = token.replace("##", "")
        # A continuation piece can never open a new entity, whatever its tag.
        starts_new = kind != current_kind or (prefix == "B" and not is_subword)

        if starts_new:
            if current_name:
                representatives.append(current_name)
            current_kind, current_name = kind, piece
        elif is_subword or not any(ch.isalnum() for ch in piece):
            # Word-piece continuation or punctuation: no separating space.
            current_name += piece
        else:
            current_name += " " + piece

    if current_name:
        representatives.append(current_name)
    return representatives
def format_output_gemini(analysis_result: Dict) -> str:
    """Render the Gemini analysis as a plain-text report.

    Args:
        analysis_result: Dict with optional keys ``"socios"`` (list of dicts
            with ``"nome"`` and ``"participacao"``), ``"administradores"``
            (list of names) and ``"erro"`` (error message).

    Returns:
        The report text. A partner without a usable share is shown as
        "Participação não especificada"; a share that already ends with
        ``%`` is not given a second percent sign; absent or null lists are
        treated as empty.
    """
    lines = ["ANÁLISE DO CONTRATO SOCIAL (Gemini)\n\n", "SÓCIOS:\n"]

    for socio in analysis_result.get("socios") or []:
        if isinstance(socio, dict):
            nome = socio.get("nome") or "Nome não especificado"
            participacao = socio.get("participacao")
        else:
            # The model sometimes returns bare names instead of objects.
            nome, participacao = socio, None

        participacao_str = "" if participacao is None else str(participacao).strip()
        if not participacao_str:
            participacao_str = "Participação não especificada"
        elif not participacao_str.endswith("%"):
            participacao_str += "%"
        lines.append(f"- {nome}: {participacao_str}\n")

    lines.append("\nADMINISTRADORES:\n")
    lines.extend(
        f"- {admin}\n" for admin in analysis_result.get("administradores") or []
    )

    if "erro" in analysis_result:
        lines.append(f"\nERRO: {analysis_result['erro']}")
    return "".join(lines)
def format_output_ner(representatives: List[str]) -> str:
    """Render the names found by the NER pipeline as a plain-text report.

    Args:
        representatives: Names to list, one bullet per entry, in order.

    Returns:
        The report: a title, a section heading, then one "- name" line per
        representative.
    """
    report_parts = ["ANÁLISE DO CONTRATO SOCIAL (NER)\n\n", "REPRESENTANTES IDENTIFICADOS:\n"]
    report_parts.extend(f"- {rep}\n" for rep in representatives)
    return "".join(report_parts)
# Main entry point that processes the uploaded document
def analyze_contract(file, analysis_type: str):
    """Convert an uploaded document to text and analyse it.

    Args:
        file: The upload handed over by Gradio. Either an object exposing the
            file path as ``.name`` or the path itself as a string; ``None``
            when nothing was uploaded.
        analysis_type: ``"Gemini"`` selects the Gemini analysis; any other
            value selects the NER analysis.

    Returns:
        A ``(document_text, report)`` tuple. On failure the text is empty and
        the report holds the error message; no exception propagates.
    """
    if file is None:
        # Submitting the form without a file must not crash the request.
        logger.warning("Nenhum arquivo enviado para análise")
        return "", "Erro ao processar o arquivo: nenhum arquivo enviado"

    # Accept both a file-like object with .name and a plain path string.
    file_path = getattr(file, "name", file)
    logger.info(f"Iniciando análise do arquivo usando {analysis_type}: {file_path}")
    try:
        converter = DocumentConverter()
        result = converter.convert(file_path)
        document_text = result.document.export_to_text()
        if analysis_type == "Gemini":
            analysis_result = extract_legal_representatives_gemini(document_text)
            output = format_output_gemini(analysis_result)
        else:  # NER
            entities = extract_entities(document_text)
            representatives = extract_representatives_ner(entities)
            output = format_output_ner(representatives)
        return document_text, output
    except Exception as e:
        # UI boundary: surface the failure in the result box.
        logger.error(f"Erro durante análise do contrato: {str(e)}")
        return "", f"Erro ao processar o arquivo: {str(e)}"
# Build the Gradio interface: a file upload plus an analysis-type selector in,
# the extracted contract text and the analysis report out.
try:
    logger.info("Iniciando configuração da interface Gradio")
    _analysis_selector = gr.Radio(
        choices=["Gemini", "NER"],
        label="Tipo de Análise",
        value="Gemini"
    )
    _result_boxes = [
        gr.Textbox(label="Texto do Contrato"),
        gr.Textbox(label="Resultado da Análise")
    ]
    iface = gr.Interface(
        fn=analyze_contract,
        inputs=["file", _analysis_selector],
        outputs=_result_boxes,
        title="Analisador de Contratos Sociais",
        description="Este aplicativo analisa contratos sociais usando Gemini ou NER para identificar representantes legais.",
    )
    logger.info("Interface Gradio configurada com sucesso")
except Exception as e:
    # Log for diagnosis, then let the failure abort start-up.
    logger.error(f"Erro ao configurar interface Gradio: {str(e)}")
    raise

# Start the web server only when executed as a script.
if __name__ == "__main__":
    logger.info("Iniciando aplicação")
    iface.launch()