luanpoppe committed
Commit 78209bc · 1 Parent(s): 234f840

fix: working version after fixes applied across the whole flow

_antigos/resumos/serializer.py CHANGED
@@ -25,5 +25,5 @@ class ResumoCursorSerializer(serializers.Serializer):
     user_message = serializers.CharField(required=False, default="")
     model = serializers.CharField(required=False, default=default_model)
    hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
-    chunk_size = serializers.IntegerField(required=False, default=5000)
-    chunk_overlap = serializers.IntegerField(required=False, default=1600)
+    chunk_size = serializers.IntegerField(required=False, default=3500)
+    chunk_overlap = serializers.IntegerField(required=False, default=800)
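Both fields are optional with defaults, so a request that omits them now validates to the smaller values. A minimal sketch of how DRF resolves these defaults (runs inside a configured Django/DRF project; `default_model` is a stand-in for the project's real value):

```python
from rest_framework import serializers

default_model = "gpt-4o-mini"  # hypothetical; the real default comes from project config

class ResumoCursorSerializer(serializers.Serializer):
    user_message = serializers.CharField(required=False, default="")
    model = serializers.CharField(required=False, default=default_model)
    hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
    chunk_size = serializers.IntegerField(required=False, default=3500)
    chunk_overlap = serializers.IntegerField(required=False, default=800)

s = ResumoCursorSerializer(data={})  # no fields supplied
s.is_valid(raise_exception=True)
print(s.validated_data["chunk_size"], s.validated_data["chunk_overlap"])  # 3500 800
```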
_utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py CHANGED
@@ -1,5 +1,5 @@
 import os
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, cast

 from pydantic import SecretStr
 from _utils.vector_stores.Vector_store_class import VectorStore
@@ -222,19 +222,22 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
             prompt_auxiliar.format(context="\n\n".join(contexts))
         )

-        self.resumo_gerado = resumo_auxiliar_do_documento.content
+        self.resumo_gerado = cast(str, resumo_auxiliar_do_documento.content)

         prompt_gerar_documento = PromptTemplate(
             template=self.prompt_gerar_documento,
             input_variables=["context"],
         )

-        documento_gerado = llm.invoke(
-            prompt_gerar_documento.format(
-                context=self.resumo_gerado,
-                # modelo_usuario=serializer.data["modelo"],
-            )
-        ).content
+        documento_gerado = cast(
+            str,
+            llm.invoke(
+                prompt_gerar_documento.format(
+                    context=self.resumo_gerado,
+                    # modelo_usuario=serializer.data["modelo"],
+                )
+            ).content,
+        )

         # Split the response into paragraphs
         summaries = [p.strip() for p in documento_gerado.split("\n\n") if p.strip()]
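The new `cast(str, ...)` wrappers only inform the type checker; at runtime the value passes through unchanged. LangChain types message `content` as a string or a list of content parts, so a defensive runtime coercion (illustrative, not the committed code) would look like:

```python
from typing import Union

def content_as_str(content: Union[str, list]) -> str:
    """Coerce LangChain-style message content to a plain string."""
    if isinstance(content, str):
        return content
    parts = []
    for part in content:
        # Content parts may be plain strings or dicts like {"type": "text", "text": ...}
        parts.append(part if isinstance(part, str) else str(part.get("text", "")))
    return "".join(parts)

print(content_as_str("hello"))                           # hello
print(content_as_str([{"type": "text", "text": "hi"}]))  # hi
```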
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py CHANGED
@@ -7,6 +7,9 @@ from _utils.gerar_relatorio_modelo_usuario.prompts import (
 )
 from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
 from _utils.chains.Chain_class import Chain
+from _utils.gerar_relatorio_modelo_usuario.utils import (
+    validate_many_chunks_in_one_request,
+)
 from _utils.handle_files import return_document_list_with_llama_parser
 from _utils.prompts.Prompt_class import Prompt
 from _utils.splitters.Splitter_class import Splitter
@@ -50,13 +53,13 @@ class ContextualRetriever:
         self.claude_context_model = claude_context_model

     async def contextualize_all_chunks(
-        self, full_text_as_array: List[Document], chunks: List[DocumentChunk]
+        self, full_text_as_array: List[str], chunks: List[DocumentChunk]
     ) -> List[ContextualizedChunk]:
         """Add context to all chunks"""
         contextualized_chunks = []
         full_text = ""
         for x in full_text_as_array:
-            full_text += x.page_content
+            full_text += x

         prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)

@@ -100,7 +103,6 @@ class ContextualRetriever:

             contextualized_chunks = contextualized_chunks + task.result()

-        print("\n\ncontextualized_chunks", contextualized_chunks)
         return contextualized_chunks

     # ORIGINAL
@@ -131,54 +133,29 @@ class ContextualRetriever:

         lista_contador.append(0)
         print("contador: ", len(lista_contador))
-        all_pages_contents = ""
-        contador = 1
-        for chunk in chunks:
-            page_number = chunk.page_number - 1
-            page_content = single_page_text[page_number].page_content
-
-            all_pages_contents += page_content
-            contador += 1
-
-        context = await self.llm_generate_context(
-            page_content, chunks, response_auxiliar_summary
-        )
-
-        context = (
-            context.replace("document_id: ", "")
-            .replace("document_id:", "")
-            .replace("DOCUMENT_ID: ", "")
-            .replace("DOCUMENT_ID: ", "")
-        )
-
-        # print("context: ", context)
-        import re
-
-        pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"  # Works when the LLM response does not spell out "document_id"
-        # pattern = r"\[\s*(?:document_id:\s*)?(\d+)\s*\] --- \[document_title:\s*(.+?)\s*\] --- \[(.+?)\]"
-        matches = re.findall(pattern, context, re.DOTALL)
-
-        # Convert matches to the desired format
-        result = [
-            [int(doc_id), title.strip(), content.strip()]
-            for doc_id, title, content in matches
-        ]
-        # print("\n\nresult", result)
-
-        if result == "" or result == [""]:
-            print("\n\ncontext", context)
+        # all_pages_contents = ""
+        # contador = 1
+        # for chunk in chunks:
+        #     page_number = chunk.page_number - 1
+        #     page_content = single_page_text[page_number].page_content
+
+        #     all_pages_contents += page_content
+        #     contador += 1
+
+        result = await self.llm_generate_context(chunks, response_auxiliar_summary)

         lista_chunks = []
         for index, chunk in enumerate(chunks):
             lista_chunks.append(
                 ContextualizedChunk(
+                    contextual_summary=result[index][2],
                     content=chunk.content,
                     page_number=chunk.page_number,
-                    id_do_processo=result[index][0],
+                    id_do_processo=int(result[index][0]),
                     chunk_id=chunk.chunk_id,
                     start_char=chunk.start_char,
                     end_char=chunk.end_char,
-                    context=" ".join(result[index][1:2]),
+                    context=result[index][1],
                 )
             )

@@ -207,7 +184,7 @@ class ContextualRetriever:
         # return ""

     async def llm_generate_context(
-        self, page_text: str, chunks: List[DocumentChunk], resumo_auxiliar
+        self, chunks: List[DocumentChunk], resumo_auxiliar  # , page_text: str
     ) -> str:
         """Generate contextual description using ChatOpenAI"""
         contador = 1
@@ -220,16 +197,29 @@ class ContextualRetriever:

         try:
             print("COMEÇOU A REQUISIÇÃO")
-            prompt = contextual_prompt(page_text, resumo_auxiliar, all_chunks_contents)
+            prompt = contextual_prompt(resumo_auxiliar, all_chunks_contents)
             # response = await aclaude_answer(
             #     self.claude_client, self.claude_context_model, prompt
             # )

-            response = await agpt_answer(prompt)
-            # llms = LLM()
-            # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
-            # return cast(str, response.content)
-            return cast(str, response)
+            for attempt in range(4):
+                print(f"\n\nTENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt}")
+                raw_response = await agpt_answer(prompt)
+                response = cast(str, raw_response)
+                # llms = LLM()
+                # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+                # return cast(str, response.content)
+
+                matches = validate_many_chunks_in_one_request(response)
+                # Convert matches to the desired format
+
+                if matches:
+                    result = [
+                        [int(doc_id), title.strip(), content.strip()]
+                        for doc_id, title, content in matches
+                    ]
+                    return cast(str, result)
+            raise ValueError(f"FORMATAÇÃO DOS CHUNKS FOI INVÁLIDA: {response}")
         except Exception as e:
             self.logger.error(f"Context generation failed for chunks .... : {str(e)}")
             return ""
@@ -267,20 +257,20 @@ async def get_full_text_and_all_PDFs_chunks(
 ):
     all_PDFs_chunks = []

-    pages: List[Document] = []
+    pages: List[str] = []

     # Load and process document
     for pdf_path in listaPDFs:
-        if isBubble:
-            pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
-        else:
-            if should_use_llama_parse:
-                pages = pages + await return_document_list_with_llama_parser(pdf_path)
-            else:
-                pages = pages + PyPDFLoader(pdf_path).load()
-
-        chunks = splitterObject.load_and_split_document(
-            pdf_path, pages, should_use_llama_parse
+        # if isBubble:
+        #     pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
+        # else:
+        #     if should_use_llama_parse:
+        #         pages = pages + await return_document_list_with_llama_parser(pdf_path)
+        #     else:
+        #         pages = pages + PyPDFLoader(pdf_path).load()
+
+        chunks, pages = await splitterObject.load_and_split_document(
+            pdf_path, should_use_llama_parse, isBubble
         )
     all_PDFs_chunks = all_PDFs_chunks + chunks
     # Get full text for contextualization
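The rewritten `llm_generate_context` retries up to four times and accepts the first response that parses into the `[id] --- [title] --- [context]` layout. A self-contained sketch of that validate-then-retry pattern, where `ask` stands for any async LLM call (the regex mirrors `validate_many_chunks_in_one_request`):

```python
import re
from typing import Awaitable, Callable, Optional

# Mirrors the pattern used by validate_many_chunks_in_one_request.
CHUNK_PATTERN = re.compile(
    r"\[([\d.\-]+)\]\s*---\s*\[([^]]+)\]\s*---\s*\[([^]]+)\]\s*</chunk_context>",
    re.DOTALL,
)

async def ask_until_valid(
    ask: Callable[[str], Awaitable[str]], prompt: str, attempts: int = 4
) -> Optional[list]:
    """Retry an LLM call until the answer parses into (id, title, context) triples."""
    for _ in range(attempts):
        response = await ask(prompt)
        matches = CHUNK_PATTERN.findall(response)
        if matches:
            return [
                [doc_id, title.strip(), ctx.strip()] for doc_id, title, ctx in matches
            ]
    return None  # the caller decides whether to raise, as the commit does
```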
_utils/gerar_relatorio_modelo_usuario/prompts.py CHANGED
@@ -154,25 +154,33 @@ Formate sua resposta da seguinte maneira:
 # </chunk_context>"""


-def contextual_prompt(all_pages_contents, summary_text, chunk_content):
+# Removed from the prompt below after changing each chunk to 5000 characters:
+# Here are the pages where the chunks are situated:
+# <page>
+# {all_pages_contents}
+# </page>
+
+
+# 1. Identify the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+
+
+def contextual_prompt(summary_text, chunk_content):  # , all_pages_contents
     return f"""
 You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze multiple chunks of text from a larger document and provide brief contexts for each of them.
 Here's the summary of the full text of the document:
 <summary_text>
 {summary_text}
 </summary_text>
-Here are the pages where the chunks are situated:
-<page>
-{all_pages_contents}
-</page>
 You will be given 20 specific chunks to contextualize. For each chunk, follow these steps:
-1. Identify the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+1. If there is a number between "NUM." and "- Pág", identify that number as the [document_id]. Furthermore, identify the document name (from the header).
 2. Summarize the main topics or themes of the single page and how they relate to the summary of the full text.
 3. Identify where the specific chunk fits within these themes.
 4. Create a concise context that situates the chunk within the document.
 Your final output should be a numbered list of 20 chunk contexts, each containing a single, concise paragraph that includes:
 <final_output>
+<chunk_context>
 [document_id] --- [document_name] --- [brief_context_for_the_chunk]
+</chunk_context>
 </final_output>
 Here are the 20 chunks to analyze:
 <user_input>
@@ -187,6 +195,9 @@ Example output structure (do not copy the content, only the format):
 </chunk_context>
 [Continue for all 20 chunks]
 Please provide context for all 20 chunks, following this structure. It's OK for this section to be quite long.
+
+**Reminder**
+- The final answer must be in PORTUGUESE.
 """
@@ -282,33 +293,65 @@ After composing the sentence, but before presenting it as the final answer, refl
 - Do not show the chain of thought or the reflection step. Only the final formatted sentence should be visible to the user.
 """

-prompt_auxiliar_SEM_CONTEXT = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
-**Instructions**:
-1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
-2. **Reading the Context**: Extract the following information from `context`:
-- The name of the defendant (réu).
-- The crime they have been accused of (nome_do_crime).
-- The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
-- The date the accusation was accepted (data_do_recebimento).
-- The ID of the decision document (id_do_documento).
-3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
-4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
-```
-<formato>
-Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
-A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
-Não outras causas interruptivas ou suspensivas da prescrição.
-</formato>
-```
-5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
-**Reminder**:
-- Do not include your chain of thought in the final output.
-- Do not add extra information or commentary beyond the specified format.
-- The final answer must be in Portuguese.
-```
-<formato>
-Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
-A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
-Não outras causas interruptivas ou suspensivas da prescrição.
-</formato>
-"""
+
+# OLD VALUE OF THE PROMPT USED AS THE SIMILARITY-SEARCH QUERY IN VECTOR_SEARCH
+# prompt_auxiliar_SEM_CONTEXT = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
+# **Instructions**:
+# 1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
+# 2. **Reading the Context**: Extract the following information from `context`:
+# - The name of the defendant (réu).
+# - The crime they have been accused of (nome_do_crime).
+# - The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
+# - The date the accusation was accepted (data_do_recebimento).
+# - The ID of the decision document (id_do_documento).
+# 3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
+# 4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
+# ```
+# <formato>
+# Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
+# A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
+# Não há outras causas interruptivas ou suspensivas da prescrição.
+# </formato>
+# ```
+# 5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
+# **Reminder**:
+# - Do not include your chain of thought in the final output.
+# - Do not add extra information or commentary beyond the specified format.
+# - The final answer must be in Portuguese.
+# ```
+# <formato>
+# Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
+# A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
+# Não há outras causas interruptivas ou suspensivas da prescrição.
+# </formato>
+# """
+
+
+prompt_auxiliar_SEM_CONTEXT = """Busque e analise os trechos mais relevantes deste processo legal, priorizando os seguintes elementos:
+Identificação do Caso:
+Nome das partes envolvidas
+Jurisdição e instância processual
+Disputa Central:
+Qual é a principal controvérsia do caso?
+Quais são os argumentos centrais apresentados por cada parte?
+Peças Processuais Essenciais:
+Petição Inicial: Identifique os pedidos, fundamentos jurídicos e fatos alegados.
+Contestação: Extraia os argumentos de defesa e eventuais preliminares processuais.
+Réplica (se houver): Destaque contrargumentos apresentados pelo autor.
+Pedido e Pedido Contraposto (se aplicável): Identifique os requerimentos de ambas as partes.
+Provas Produzidas:
+Documentos apresentados pelo autor e sua relevância.
+Documentos apresentados pelo réu e sua relevância.
+Audiências Realizadas:
+Conciliação: Houve acordo ou resistência de alguma parte?
+Instrução e Julgamento: Quais testemunhas foram ouvidas? Algum elemento probatório relevante foi destacado pelo juiz?
+Trechos Relevantes do Caso:
+Extraia e organize os principais excertos do processo que sustentam a decisão.
+Identifique precedentes ou fundamentos jurídicos citados.
+Caso haja decisão judicial, sintetize o raciocínio adotado pelo magistrado.
+
+Diretrizes de Análise:
+Priorize passagens de maior impacto jurídico, como fundamentos da decisão e discussões centrais do caso.
+Evite redundâncias: Se um mesmo argumento aparece repetidamente, sintetize-o.
+Mantenha a hierarquia lógica da decisão: Se houver votos divergentes ou decisões parciais, destaque essas diferenças.
+Caso haja lacunas na documentação, identifique e sinalize a ausência de informações relevantes."""
_utils/gerar_relatorio_modelo_usuario/utils.py CHANGED
@@ -1,3 +1,7 @@
+from typing import List, Tuple
+from langchain_core.documents import Document
+
+
 def gerar_resposta_compilada(serializer):
     return {
         "num_chunks_retrieval": serializer["num_chunks_retrieval"],
@@ -20,3 +24,48 @@ def gerar_resposta_compilada(serializer):
         "prompt_auxiliar": serializer["prompt_auxiliar"],
         "prompt_gerar_documento": serializer["prompt_gerar_documento"],
     }
+
+
+def combine_documents_without_losing_pagination(documents: list[Document]):
+    combined_text = ""
+    page_boundaries: List[Tuple[int, int, int]] = (
+        []
+    )  # (start_idx, end_idx, page_number)
+    current_position = 0
+    for document in documents:
+        start = current_position
+        combined_text += document.page_content
+        end = current_position + len(document.page_content)
+        page_number = document.metadata.get("page", len(page_boundaries) + 1)
+        page_boundaries.append((start, end, page_number))
+
+        current_position = end
+    return page_boundaries, combined_text
+
+
+def validate_many_chunks_in_one_request(response: str):
+    context = (
+        response.replace("document_id: ", "")
+        .replace("document_id:", "")
+        .replace("DOCUMENT_ID: ", "")
+        .replace("DOCUMENT_ID: ", "")
+    )
+
+    # print("context: ", context)
+    import re
+
+    pattern = (
+        r"\[([\d.\-]+)\]\s*---\s*\[([^]]+)\]\s*---\s*\[([^]]+)\]\s*</chunk_context>"
+    )
+    # pattern = r"\[(\d+|[-.]+)\] --- (.+?) --- (.+?)</chunk_context>"  # Works when the LLM response does not spell out "document_id"
+    matches = re.findall(pattern, context, re.DOTALL)
+
+    matches_as_list = []
+
+    for match in list(matches):
+        resultado = match[0].replace(".", "").replace("-", "")
+        matches_as_list.append((resultado, match[1], match[2]))
+
+    if len(matches) == 0:
+        return False
+    return matches_as_list
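For reference, a quick illustration of what `validate_many_chunks_in_one_request` accepts and returns; the sample response below is made up:

```python
from _utils.gerar_relatorio_modelo_usuario.utils import validate_many_chunks_in_one_request

# Hypothetical LLM response in the expected layout.
sample = (
    "<chunk_context>\n"
    "[1234.567-8] --- [Petição Inicial] --- [Trecho com os pedidos do autor.]\n"
    "</chunk_context>"
)

print(validate_many_chunks_in_one_request(sample))
# [('12345678', 'Petição Inicial', 'Trecho com os pedidos do autor.')]
# Dots and dashes are stripped from the id; False is returned when nothing matches.
```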
_utils/handle_files.py CHANGED
@@ -28,34 +28,44 @@ def remove_pdf_temp_files(listaPDFs):


 async def return_document_list_with_llama_parser(file: str):
-    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY_POPS")
-    documents: List[LangchainDocument] = []
-    if llama_parser_api:
-        parser = LlamaParse(
-            api_key=llama_parser_api,
-            result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
-            language="pt",
-            verbose=True,
-        )
-
-        try:
-            parsed_document = await parser.aget_json(file)
-        except:
-            raise ValueError(f"ALGO DEU ERRADO NO PARSER DO LLAMA PARSE:")
-        print("parsed_document: ", parsed_document)
-        for doc in parsed_document[0].get("pages"):  # type: ignore
-            # documents.append(doc.to_langchain_format())
-
-            langchain_document = LangchainDocument(
-                page_content=doc.get("md"),  # type: ignore
-                metadata={
-                    "page": doc.get("page"),  # type: ignore
-                    # **doc.get("metadata", {}),  # type: ignore
-                },  # Include page number in metadata
-            )
-
-            documents.append(langchain_document)
-
-        return documents
-    else:
-        raise ValueError("Não foi possível obter a API_KEY do llama parser")
+    llama_parser_keys = [
+        os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
+        os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
+    ]
+
+    for key in llama_parser_keys:
+        documents: List[LangchainDocument] = []
+        if key:
+            parser = LlamaParse(
+                api_key=key,
+                result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
+                language="pt",
+                verbose=True,
+            )
+
+            try:
+                parsed_document = await parser.aget_json(file)
+            except:
+                print(f"Error with llama parser key ending with {key[-4:]}")
+                continue  # Move on to the next key
+            print("parsed_document: ", parsed_document)
+            if len(parsed_document) == 0:
+                continue
+
+            for doc in parsed_document[0].get("pages"):  # type: ignore
+                # documents.append(doc.to_langchain_format())
+
+                langchain_document = LangchainDocument(
+                    page_content=doc.get("md"),  # type: ignore
+                    metadata={
+                        "page": doc.get("page"),  # type: ignore
+                        # **doc.get("metadata", {}),  # type: ignore
+                    },  # Include page number in metadata
+                )
+
+                documents.append(langchain_document)
+
+            return documents
+
+    # Only executed if the loop above finishes without ever returning a value
+    raise ValueError(f"ALGO DEU ERRADO NO PARSER DO LLAMA PARSE:")
 
_utils/models/gerar_relatorio.py CHANGED
@@ -11,6 +11,7 @@ class DocumentChunk:
     start_char: int
     end_char: int
     id_do_processo: int = 0
+    contextual_summary: str = ""


 @dataclass
_utils/resumo_completo_cursor.py CHANGED
@@ -39,7 +39,7 @@ os.environ["LANGCHAIN_PROJECT"] = "VELLA"


 async def get_llm_summary_answer_by_cursor_complete(
-    serializer, listaPDFs=None, isBubble=False
+    serializer, listaPDFs, isBubble=False
 ):
     """The "contexto" parameter should only be passed when you want to run the ragas test and thus not pass PDFs"""
     # Configuration
_utils/splitters/Splitter_class.py CHANGED
@@ -1,6 +1,10 @@
 from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
+from _utils.gerar_relatorio_modelo_usuario.utils import (
+    combine_documents_without_losing_pagination,
+)
+from _utils.handle_files import return_document_list_with_llama_parser
 from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
-from typing import List, Dict, Tuple, Optional, cast
+from typing import Any, List, Dict, Tuple, Optional, cast
 from _utils.models.gerar_relatorio import (
     DocumentChunk,
 )
@@ -18,55 +22,91 @@ class Splitter:
         )
         self.chunk_metadata = {}  # Store chunk metadata for tracing

-    def load_and_split_document(
-        self, pdf_path: str, pages: List[Document] | None, should_use_llama_parse: bool
-    ) -> List[DocumentChunk]:
+    async def load_and_split_document(
+        self, pdf_path: str, should_use_llama_parse: bool, isBubble: bool
+    ):
         """Load PDF and split into chunks with metadata"""
         # loader = PyPDFLoader(pdf_path)
-        if not pages:
-            pages = get_pdf_from_bubble(
-                pdf_path
-            )  # Generates a list of Document objects, each item covering ONE full page of the PDF.
-        chunks = []
-        char_count = 0
+        # if not pages:
+        #     pages = get_pdf_from_bubble(
+        #         pdf_path
+        #     )  # Generates a list of Document objects, each item covering ONE full page of the PDF.

-        for page in pages:
-            text = page.page_content
-            page_chunks = self.text_splitter.split_text(
-                text
-            )  # Breaks the full-page Document into a list where each item is a chunk, a piece smaller than a page.
-
-            for chunk in page_chunks:
-                chunk_id = str(uuid.uuid4())
-                start_char = text.find(
-                    chunk
-                )  # Position of the chunk within the full page
-                end_char = start_char + len(chunk)
-
-                if should_use_llama_parse:
-                    somar_pages = 0
-                else:
-                    somar_pages = 1
-                doc_chunk = DocumentChunk(  # Builds the chunk object with extra info, such as the chunk's position and id
-                    content=chunk,
-                    page_number=cast(int, page.metadata.get("page"))
-                    + somar_pages,  # 1-based page numbering
-                    chunk_id=chunk_id,
-                    start_char=char_count + start_char,
-                    end_char=char_count + end_char,
-                )
-                chunks.append(doc_chunk)
-
-                # Store metadata for later retrieval
-                self.chunk_metadata[chunk_id] = {
-                    "page": doc_chunk.page_number,
-                    "start_char": doc_chunk.start_char,
-                    "end_char": doc_chunk.end_char,
-                }
-
-                char_count += len(text)
-
-        return chunks
+        initial_chunks: List[str] = []
+
+        if isBubble:
+            pages = await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
+            page_boundaries, combined_text = (
+                combine_documents_without_losing_pagination(pages)
+            )
+            initial_chunks = initial_chunks + self.text_splitter.split_text(
+                combined_text
+            )
+        else:
+            if should_use_llama_parse:
+                pages = await return_document_list_with_llama_parser(pdf_path)
+                page_boundaries, combined_text = (
+                    combine_documents_without_losing_pagination(pages)
+                )
+                initial_chunks = initial_chunks + self.text_splitter.split_text(
+                    combined_text
+                )
+            else:
+                pages = PyPDFLoader(pdf_path).load()
+                page_boundaries, combined_text = (
+                    combine_documents_without_losing_pagination(pages)
+                )
+                initial_chunks = initial_chunks + self.text_splitter.split_text(
+                    combined_text
+                )
+
+        chunks: List[DocumentChunk] = []
+        char_count = 0
+
+        # for page in pages:
+        #     text = page.page_content
+        #     page_chunks = self.text_splitter.split_text(
+        #         text
+        #     )  # Breaks the full-page Document into a list where each item is a chunk, a piece smaller than a page.
+        text_char = 0
+        for chunk in initial_chunks:
+            chunk_id = str(uuid.uuid4())
+            start_char = text_char + 1
+            end_char = start_char + len(chunk)
+            text_char = end_char
+
+            if should_use_llama_parse:
+                somar_pages = 0
+            else:
+                somar_pages = 1
+
+            page_number = 0
+            for start, end, page_number in page_boundaries:
+                if start <= start_char < end:
+                    page_number = page_number
+                    break
+
+            doc_chunk = DocumentChunk(  # Builds the chunk object with extra info, such as the chunk's position and id
+                content=chunk,
+                contextual_summary="",
+                page_number=page_number + somar_pages,  # 1-based page numbering
+                chunk_id=chunk_id,
+                start_char=char_count + start_char,
+                end_char=char_count + end_char,
+            )
+            chunks.append(doc_chunk)
+
+            # Store metadata for later retrieval
+            self.chunk_metadata[chunk_id] = {
+                "page": doc_chunk.page_number,
+                "start_char": doc_chunk.start_char,
+                "end_char": doc_chunk.end_char,
+            }
+
+        # char_count += len(text)
+
+        return chunks, initial_chunks

     def load_and_split_text(self, text: str) -> List[DocumentChunk]:
         """Load Text and split into chunks with metadata - I created this function only for ragas"""
_utils/vector_stores/Vector_store_class.py CHANGED
@@ -21,7 +21,7 @@ class VectorStore:
         # Prepare texts with context
         if is_contextualized_chunk:
             texts = [
-                f"Document_id: {chunk.id_do_processo}\nDocument_context: {chunk.context}\nDocument_content: {chunk.content}"
+                f"Document_id: {chunk.id_do_processo}\nDocument_context: {chunk.context}\n{chunk.contextual_summary}\nDocument_content: {chunk.content}"
                 for chunk in chunks
             ]
         else:
@@ -30,30 +30,19 @@ class VectorStore:
         # Create vector store
         metadatas = []
         for index, chunk in enumerate(chunks):
-            if is_contextualized_chunk:
-                context = texts[index]
-                metadatas.append(
-                    {
-                        "chunk_id": chunk.chunk_id,
-                        "id_do_processo": chunk.id_do_processo,
-                        "page": chunk.page_number,
-                        "start_char": chunk.start_char,
-                        "end_char": chunk.end_char,
-                        "context": context,
-                    }
-                )
-            else:
-                context = texts[index]
-                metadatas.append(
-                    {
-                        "chunk_id": chunk.chunk_id,
-                        "id_do_processo": chunk.id_do_processo,
-                        "page": chunk.page_number,
-                        "start_char": chunk.start_char,
-                        "end_char": chunk.end_char,
-                        "context": context,
-                    }
-                )
+            context = texts[index]
+            metadatas.append(
+                {
+                    "chunk_id": chunk.chunk_id,
+                    "id_do_processo": str(
+                        chunk.id_do_processo
+                    ),  # Passing the id as a number breaks the code: the integer is larger than Chroma can handle
+                    "page": chunk.page_number,
+                    "start_char": chunk.start_char,
+                    "end_char": chunk.end_char,
+                    "context": context,
+                }
+            )

         vector_store = Chroma.from_texts(
             texts=texts, metadatas=metadatas, embedding=self.embeddings
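Chroma stores metadata as scalars, and the comment above says `id_do_processo` can exceed what its integer column holds, hence the `str(...)`. A small normalization sketch under the assumption that the limit is a signed 64-bit integer:

```python
def safe_metadata(metadata: dict) -> dict:
    """Keep metadata within scalar types, stringifying ints too large for 64 bits."""
    out = {}
    for key, value in metadata.items():
        if isinstance(value, int) and not isinstance(value, bool) and abs(value) >= 2**63:
            out[key] = str(value)  # too big for a 64-bit integer column
        elif isinstance(value, (str, int, float, bool)):
            out[key] = value
        else:
            out[key] = str(value)  # fall back to strings for non-scalar values
    return out

print(safe_metadata({"id_do_processo": 12345678901234567890, "page": 3}))
# {'id_do_processo': '12345678901234567890', 'page': 3}
```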
gerar_documento/serializer.py CHANGED
@@ -33,7 +33,7 @@ class GerarDocumentoSerializer(ResumoCursorSerializer):
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
     context_window = serializers.IntegerField(default=3)
-    chunk_overlap = serializers.IntegerField(default=1600)
+    chunk_overlap = serializers.IntegerField(default=800)
     num_k_rerank = serializers.IntegerField(default=20)
     model_cohere_rerank = serializers.CharField(
         required=False, default="rerank-english-v2.0"
@@ -61,7 +61,7 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
     context_window = serializers.IntegerField(default=3)
-    chunk_overlap = serializers.IntegerField(default=1600)
+    chunk_overlap = serializers.IntegerField(default=800)
     num_k_rerank = serializers.IntegerField(default=20)
     model_cohere_rerank = serializers.CharField(
         required=False, default="rerank-english-v2.0"