luanpoppe committed
Commit 78209bc · 1 Parent(s): 234f840

fix: working version after fixes applied across the whole flow

_antigos/resumos/serializer.py CHANGED
@@ -25,5 +25,5 @@ class ResumoCursorSerializer(serializers.Serializer):
     user_message = serializers.CharField(required=False, default="")
     model = serializers.CharField(required=False, default=default_model)
    hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
-    chunk_size = serializers.IntegerField(required=False, default=5000)
-    chunk_overlap = serializers.IntegerField(required=False, default=1600)
+    chunk_size = serializers.IntegerField(required=False, default=3500)
+    chunk_overlap = serializers.IntegerField(required=False, default=800)
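Both fields are optional with defaults, so a request that omits them now validates to the smaller values. A minimal sketch of how DRF resolves these defaults (runs inside a configured Django/DRF project; `default_model` is a stand-in for the project's real value):

```python
from rest_framework import serializers

default_model = "gpt-4o-mini"  # hypothetical; the real default comes from project config

class ResumoCursorSerializer(serializers.Serializer):
    user_message = serializers.CharField(required=False, default="")
    model = serializers.CharField(required=False, default=default_model)
    hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
    chunk_size = serializers.IntegerField(required=False, default=3500)
    chunk_overlap = serializers.IntegerField(required=False, default=800)

s = ResumoCursorSerializer(data={})  # no fields supplied
s.is_valid(raise_exception=True)
print(s.validated_data["chunk_size"], s.validated_data["chunk_overlap"])  # 3500 800
```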
_utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py CHANGED
@@ -1,5 +1,5 @@
 import os
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, cast

 from pydantic import SecretStr
 from _utils.vector_stores.Vector_store_class import VectorStore
@@ -222,19 +222,22 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
             prompt_auxiliar.format(context="\n\n".join(contexts))
         )

-        self.resumo_gerado = resumo_auxiliar_do_documento.content
+        self.resumo_gerado = cast(str, resumo_auxiliar_do_documento.content)

         prompt_gerar_documento = PromptTemplate(
             template=self.prompt_gerar_documento,
             input_variables=["context"],
         )

-        documento_gerado = llm.invoke(
-            prompt_gerar_documento.format(
-                context=self.resumo_gerado,
-                # modelo_usuario=serializer.data["modelo"],
-            )
-        ).content
+        documento_gerado = cast(
+            str,
+            llm.invoke(
+                prompt_gerar_documento.format(
+                    context=self.resumo_gerado,
+                    # modelo_usuario=serializer.data["modelo"],
+                )
+            ).content,
+        )

         # Split the response into paragraphs
         summaries = [p.strip() for p in documento_gerado.split("\n\n") if p.strip()]
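The new `cast(str, ...)` wrappers only inform the type checker; at runtime the value passes through unchanged. LangChain types message `content` as a string or a list of content parts, so a defensive runtime coercion (illustrative, not the committed code) would look like:

```python
from typing import Union

def content_as_str(content: Union[str, list]) -> str:
    """Coerce LangChain-style message content to a plain string."""
    if isinstance(content, str):
        return content
    parts = []
    for part in content:
        # Content parts may be plain strings or dicts like {"type": "text", "text": ...}
        parts.append(part if isinstance(part, str) else str(part.get("text", "")))
    return "".join(parts)

print(content_as_str("hello"))                           # hello
print(content_as_str([{"type": "text", "text": "hi"}]))  # hi
```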
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py CHANGED
@@ -7,6 +7,9 @@ from _utils.gerar_relatorio_modelo_usuario.prompts import (
 )
 from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
 from _utils.chains.Chain_class import Chain
+from _utils.gerar_relatorio_modelo_usuario.utils import (
+    validate_many_chunks_in_one_request,
+)
 from _utils.handle_files import return_document_list_with_llama_parser
 from _utils.prompts.Prompt_class import Prompt
 from _utils.splitters.Splitter_class import Splitter
@@ -50,13 +53,13 @@ class ContextualRetriever:
         self.claude_context_model = claude_context_model

     async def contextualize_all_chunks(
-        self, full_text_as_array: List[Document], chunks: List[DocumentChunk]
+        self, full_text_as_array: List[str], chunks: List[DocumentChunk]
     ) -> List[ContextualizedChunk]:
         """Add context to all chunks"""
         contextualized_chunks = []
         full_text = ""
         for x in full_text_as_array:
-            full_text += x.page_content
+            full_text += x

         prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)

@@ -100,7 +103,6 @@ class ContextualRetriever:

             contextualized_chunks = contextualized_chunks + task.result()

-        print("\n\ncontextualized_chunks", contextualized_chunks)
         return contextualized_chunks

     # ORIGINAL
@@ -131,54 +133,29 @@ class ContextualRetriever:

         lista_contador.append(0)
         print("contador: ", len(lista_contador))
-        all_pages_contents = ""
-        contador = 1
-        for chunk in chunks:
-            page_number = chunk.page_number - 1
-            page_content = single_page_text[page_number].page_content
-
-            all_pages_contents += page_content
-            contador += 1
-
-        context = await self.llm_generate_context(
-            page_content, chunks, response_auxiliar_summary
-        )
-
-        context = (
-            context.replace("document_id: ", "")
-            .replace("document_id:", "")
-            .replace("DOCUMENT_ID: ", "")
-            .replace("DOCUMENT_ID: ", "")
-        )
-
-        # print("context: ", context)
-        import re
-
-        pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"  # Works when the LLM response does not spell out "document_id"
-        # pattern = r"\[\s*(?:document_id:\s*)?(\d+)\s*\] --- \[document_title:\s*(.+?)\s*\] --- \[(.+?)\]"
-        matches = re.findall(pattern, context, re.DOTALL)
-
-        # Convert matches to the desired format
-        result = [
-            [int(doc_id), title.strip(), content.strip()]
-            for doc_id, title, content in matches
-        ]
-        # print("\n\nresult", result)
-
-        if result == "" or result == [""]:
-            print("\n\ncontext", context)
+        # all_pages_contents = ""
+        # contador = 1
+        # for chunk in chunks:
+        #     page_number = chunk.page_number - 1
+        #     page_content = single_page_text[page_number].page_content
+
+        #     all_pages_contents += page_content
+        #     contador += 1
+
+        result = await self.llm_generate_context(chunks, response_auxiliar_summary)

         lista_chunks = []
         for index, chunk in enumerate(chunks):
             lista_chunks.append(
                 ContextualizedChunk(
+                    contextual_summary=result[index][2],
                     content=chunk.content,
                     page_number=chunk.page_number,
-                    id_do_processo=result[index][0],
+                    id_do_processo=int(result[index][0]),
                     chunk_id=chunk.chunk_id,
                     start_char=chunk.start_char,
                     end_char=chunk.end_char,
-                    context=" ".join(result[index][1:2]),
+                    context=result[index][1],
                 )
             )

@@ -207,7 +184,7 @@ class ContextualRetriever:
         # return ""

     async def llm_generate_context(
-        self, page_text: str, chunks: List[DocumentChunk], resumo_auxiliar
+        self, chunks: List[DocumentChunk], resumo_auxiliar  # , page_text: str
     ) -> str:
         """Generate contextual description using ChatOpenAI"""
         contador = 1
@@ -220,16 +197,29 @@ class ContextualRetriever:

         try:
             print("COMEÇOU A REQUISIÇÃO")
-            prompt = contextual_prompt(page_text, resumo_auxiliar, all_chunks_contents)
+            prompt = contextual_prompt(resumo_auxiliar, all_chunks_contents)
             # response = await aclaude_answer(
             #     self.claude_client, self.claude_context_model, prompt
             # )

-            response = await agpt_answer(prompt)
-            # llms = LLM()
-            # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
-            # return cast(str, response.content)
-            return cast(str, response)
+            for attempt in range(4):
+                print(f"\n\nTENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt}")
+                raw_response = await agpt_answer(prompt)
+                response = cast(str, raw_response)
+                # llms = LLM()
+                # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+                # return cast(str, response.content)
+
+                matches = validate_many_chunks_in_one_request(response)
+                # Convert matches to the desired format
+
+                if matches:
+                    result = [
+                        [int(doc_id), title.strip(), content.strip()]
+                        for doc_id, title, content in matches
+                    ]
+                    return cast(str, result)
+            raise ValueError(f"FORMATAÇÃO DOS CHUNKS FOI INVÁLIDA: {response}")
         except Exception as e:
             self.logger.error(f"Context generation failed for chunks .... : {str(e)}")
             return ""
@@ -267,20 +257,20 @@ async def get_full_text_and_all_PDFs_chunks(
 ):
     all_PDFs_chunks = []

-    pages: List[Document] = []
+    pages: List[str] = []

     # Load and process document
     for pdf_path in listaPDFs:
-        if isBubble:
-            pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
-        else:
-            if should_use_llama_parse:
-                pages = pages + await return_document_list_with_llama_parser(pdf_path)
-            else:
-                pages = pages + PyPDFLoader(pdf_path).load()
-
-        chunks = splitterObject.load_and_split_document(
-            pdf_path, pages, should_use_llama_parse
+        # if isBubble:
+        #     pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
+        # else:
+        #     if should_use_llama_parse:
+        #         pages = pages + await return_document_list_with_llama_parser(pdf_path)
+        #     else:
+        #         pages = pages + PyPDFLoader(pdf_path).load()
+
+        chunks, pages = await splitterObject.load_and_split_document(
+            pdf_path, should_use_llama_parse, isBubble
         )
     all_PDFs_chunks = all_PDFs_chunks + chunks
     # Get full text for contextualization
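The rewritten `llm_generate_context` retries up to four times and accepts the first response that parses into the `[id] --- [title] --- [context]` layout. A self-contained sketch of that validate-then-retry pattern, where `ask` stands for any async LLM call (the regex mirrors `validate_many_chunks_in_one_request`):

```python
import re
from typing import Awaitable, Callable, Optional

# Mirrors the pattern used by validate_many_chunks_in_one_request.
CHUNK_PATTERN = re.compile(
    r"\[([\d.\-]+)\]\s*---\s*\[([^]]+)\]\s*---\s*\[([^]]+)\]\s*</chunk_context>",
    re.DOTALL,
)

async def ask_until_valid(
    ask: Callable[[str], Awaitable[str]], prompt: str, attempts: int = 4
) -> Optional[list]:
    """Retry an LLM call until the answer parses into (id, title, context) triples."""
    for _ in range(attempts):
        response = await ask(prompt)
        matches = CHUNK_PATTERN.findall(response)
        if matches:
            return [
                [doc_id, title.strip(), ctx.strip()] for doc_id, title, ctx in matches
            ]
    return None  # the caller decides whether to raise, as the commit does
```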
_utils/gerar_relatorio_modelo_usuario/prompts.py CHANGED
@@ -154,25 +154,33 @@ Formate sua resposta da seguinte maneira:
 # </chunk_context>"""


-def contextual_prompt(all_pages_contents, summary_text, chunk_content):
+# Removed from the prompt below after changing each chunk to 5000 characters:
+# Here are the pages where the chunks are situated:
+# <page>
+# {all_pages_contents}
+# </page>
+
+
+# 1. Identify the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+
+
+def contextual_prompt(summary_text, chunk_content):  # , all_pages_contents
     return f"""
 You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze multiple chunks of text from a larger document and provide brief contexts for each of them.
 Here's the summary of the full text of the document:
 <summary_text>
 {summary_text}
 </summary_text>
-Here are the pages where the chunks are situated:
-<page>
-{all_pages_contents}
-</page>
 You will be given 20 specific chunks to contextualize. For each chunk, follow these steps:
-1. Identify the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+1. If there is a number between "NUM." and "- Pág", identify that number as the [document_id]. Furthermore, identify the document name (from the header).
 2. Summarize the main topics or themes of the single page and how they relate to the summary of the full text.
 3. Identify where the specific chunk fits within these themes.
 4. Create a concise context that situates the chunk within the document.
 Your final output should be a numbered list of 20 chunk contexts, each containing a single, concise paragraph that includes:
 <final_output>
+<chunk_context>
 [document_id] --- [document_name] --- [brief_context_for_the_chunk]
+</chunk_context>
 </final_output>
 Here are the 20 chunks to analyze:
 <user_input>
@@ -187,6 +195,9 @@ Example output structure (do not copy the content, only the format):
 </chunk_context>
 [Continue for all 20 chunks]
 Please provide context for all 20 chunks, following this structure. It's OK for this section to be quite long.
+
+**Reminder**
+- The final answer must be in PORTUGUESE.
 """
@@ -282,33 +293,65 @@ After composing the sentence, but before presenting it as the final answer, refl
 - Do not show the chain of thought or the reflection step. Only the final formatted sentence should be visible to the user.
 """

-prompt_auxiliar_SEM_CONTEXT = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
-**Instructions**:
-1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
-2. **Reading the Context**: Extract the following information from `context`:
-- The name of the defendant (réu).
-- The crime they have been accused of (nome_do_crime).
-- The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
-- The date the accusation was accepted (data_do_recebimento).
-- The ID of the decision document (id_do_documento).
-3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
-4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
-```
-<formato>
-Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
-A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
-Não outras causas interruptivas ou suspensivas da prescrição.
-</formato>
-```
-5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
-**Reminder**:
-- Do not include your chain of thought in the final output.
-- Do not add extra information or commentary beyond the specified format.
-- The final answer must be in Portuguese.
-```
-<formato>
-Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
-A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
-Não outras causas interruptivas ou suspensivas da prescrição.
-</formato>
-"""
+
+# OLD VALUE OF THE PROMPT USED AS THE SIMILARITY-SEARCH QUERY IN VECTOR_SEARCH
+# prompt_auxiliar_SEM_CONTEXT = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
+# **Instructions**:
+# 1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
+# 2. **Reading the Context**: Extract the following information from `context`:
+# - The name of the defendant (réu).
+# - The crime they have been accused of (nome_do_crime).
+# - The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
+# - The date the accusation was accepted (data_do_recebimento).
+# - The ID of the decision document (id_do_documento).
+# 3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
+# 4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
+# ```
+# <formato>
+# Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
+# A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
+# Não há outras causas interruptivas ou suspensivas da prescrição.
+# </formato>
+# ```
+# 5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
+# **Reminder**:
+# - Do not include your chain of thought in the final output.
+# - Do not add extra information or commentary beyond the specified format.
+# - The final answer must be in Portuguese.
+# ```
+# <formato>
+# Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
+# A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
+# Não há outras causas interruptivas ou suspensivas da prescrição.
+# </formato>
+# """
+
+
+prompt_auxiliar_SEM_CONTEXT = """Busque e analise os trechos mais relevantes deste processo legal, priorizando os seguintes elementos:
+Identificação do Caso:
+Nome das partes envolvidas
+Jurisdição e instância processual
+Disputa Central:
+Qual é a principal controvérsia do caso?
+Quais são os argumentos centrais apresentados por cada parte?
+Peças Processuais Essenciais:
+Petição Inicial: Identifique os pedidos, fundamentos jurídicos e fatos alegados.
+Contestação: Extraia os argumentos de defesa e eventuais preliminares processuais.
+Réplica (se houver): Destaque contrargumentos apresentados pelo autor.
+Pedido e Pedido Contraposto (se aplicável): Identifique os requerimentos de ambas as partes.
+Provas Produzidas:
+Documentos apresentados pelo autor e sua relevância.
+Documentos apresentados pelo réu e sua relevância.
+Audiências Realizadas:
+Conciliação: Houve acordo ou resistência de alguma parte?
+Instrução e Julgamento: Quais testemunhas foram ouvidas? Algum elemento probatório relevante foi destacado pelo juiz?
+Trechos Relevantes do Caso:
+Extraia e organize os principais excertos do processo que sustentam a decisão.
+Identifique precedentes ou fundamentos jurídicos citados.
+Caso haja decisão judicial, sintetize o raciocínio adotado pelo magistrado.
+
+Diretrizes de Análise:
+Priorize passagens de maior impacto jurídico, como fundamentos da decisão e discussões centrais do caso.
+Evite redundâncias: Se um mesmo argumento aparece repetidamente, sintetize-o.
+Mantenha a hierarquia lógica da decisão: Se houver votos divergentes ou decisões parciais, destaque essas diferenças.
+Caso haja lacunas na documentação, identifique e sinalize a ausência de informações relevantes."""
_utils/gerar_relatorio_modelo_usuario/utils.py CHANGED
@@ -1,3 +1,7 @@
+from typing import List, Tuple
+from langchain_core.documents import Document
+
+
 def gerar_resposta_compilada(serializer):
     return {
         "num_chunks_retrieval": serializer["num_chunks_retrieval"],
@@ -20,3 +24,48 @@ def gerar_resposta_compilada(serializer):
         "prompt_auxiliar": serializer["prompt_auxiliar"],
         "prompt_gerar_documento": serializer["prompt_gerar_documento"],
     }
+
+
+def combine_documents_without_losing_pagination(documents: list[Document]):
+    combined_text = ""
+    page_boundaries: List[Tuple[int, int, int]] = (
+        []
+    )  # (start_idx, end_idx, page_number)
+    current_position = 0
+    for document in documents:
+        start = current_position
+        combined_text += document.page_content
+        end = current_position + len(document.page_content)
+        page_number = document.metadata.get("page", len(page_boundaries) + 1)
+        page_boundaries.append((start, end, page_number))
+
+        current_position = end
+    return page_boundaries, combined_text
+
+
+def validate_many_chunks_in_one_request(response: str):
+    context = (
+        response.replace("document_id: ", "")
+        .replace("document_id:", "")
+        .replace("DOCUMENT_ID: ", "")
+        .replace("DOCUMENT_ID: ", "")
+    )
+
+    # print("context: ", context)
+    import re
+
+    pattern = (
+        r"\[([\d.\-]+)\]\s*---\s*\[([^]]+)\]\s*---\s*\[([^]]+)\]\s*</chunk_context>"
+    )
+    # pattern = r"\[(\d+|[-.]+)\] --- (.+?) --- (.+?)</chunk_context>"  # Works when the LLM response does not spell out "document_id"
+    matches = re.findall(pattern, context, re.DOTALL)
+
+    matches_as_list = []
+
+    for match in list(matches):
+        resultado = match[0].replace(".", "").replace("-", "")
+        matches_as_list.append((resultado, match[1], match[2]))
+
+    if len(matches) == 0:
+        return False
+    return matches_as_list
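For reference, a quick illustration of what `validate_many_chunks_in_one_request` accepts and returns; the sample response below is made up:

```python
from _utils.gerar_relatorio_modelo_usuario.utils import validate_many_chunks_in_one_request

# Hypothetical LLM response in the expected layout.
sample = (
    "<chunk_context>\n"
    "[1234.567-8] --- [Petição Inicial] --- [Trecho com os pedidos do autor.]\n"
    "</chunk_context>"
)

print(validate_many_chunks_in_one_request(sample))
# [('12345678', 'Petição Inicial', 'Trecho com os pedidos do autor.')]
# Dots and dashes are stripped from the id; False is returned when nothing matches.
```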
_utils/handle_files.py CHANGED
@@ -28,34 +28,44 @@ def remove_pdf_temp_files(listaPDFs):


 async def return_document_list_with_llama_parser(file: str):
-    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY_POPS")
-    documents: List[LangchainDocument] = []
-    if llama_parser_api:
-        parser = LlamaParse(
-            api_key=llama_parser_api,
-            result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
-            language="pt",
-            verbose=True,
-        )
-
-        try:
-            parsed_document = await parser.aget_json(file)
-        except:
-            raise ValueError(f"ALGO DEU ERRADO NO PARSER DO LLAMA PARSE:")
-        print("parsed_document: ", parsed_document)
-        for doc in parsed_document[0].get("pages"):  # type: ignore
-            # documents.append(doc.to_langchain_format())
-
-            langchain_document = LangchainDocument(
-                page_content=doc.get("md"),  # type: ignore
-                metadata={
-                    "page": doc.get("page"),  # type: ignore
-                    # **doc.get("metadata", {}),  # type: ignore
-                },  # Include page number in metadata
-            )
-
-            documents.append(langchain_document)
-
-        return documents
-    else:
-        raise ValueError("Não foi possível obter a API_KEY do llama parser")
+    llama_parser_keys = [
+        os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
+        os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
+    ]
+
+    for key in llama_parser_keys:
+        documents: List[LangchainDocument] = []
+        if key:
+            parser = LlamaParse(
+                api_key=key,
+                result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
+                language="pt",
+                verbose=True,
+            )
+
+            try:
+                parsed_document = await parser.aget_json(file)
+            except:
+                print(f"Error with llama parser key ending with {key[-4:]}")
+                continue  # Move on to the next key
+            print("parsed_document: ", parsed_document)
+            if len(parsed_document) == 0:
+                continue
+
+            for doc in parsed_document[0].get("pages"):  # type: ignore
+                # documents.append(doc.to_langchain_format())
+
+                langchain_document = LangchainDocument(
+                    page_content=doc.get("md"),  # type: ignore
+                    metadata={
+                        "page": doc.get("page"),  # type: ignore
+                        # **doc.get("metadata", {}),  # type: ignore
+                    },  # Include page number in metadata
+                )
+
+                documents.append(langchain_document)
+
+            return documents
+
+    # Only executed if the loop above finishes without ever returning a value
+    raise ValueError(f"ALGO DEU ERRADO NO PARSER DO LLAMA PARSE:")
 
_utils/models/gerar_relatorio.py CHANGED
@@ -11,6 +11,7 @@ class DocumentChunk:
     start_char: int
     end_char: int
     id_do_processo: int = 0
+    contextual_summary: str = ""


 @dataclass
_utils/resumo_completo_cursor.py CHANGED
@@ -39,7 +39,7 @@ os.environ["LANGCHAIN_PROJECT"] = "VELLA"


 async def get_llm_summary_answer_by_cursor_complete(
-    serializer, listaPDFs=None, isBubble=False
+    serializer, listaPDFs, isBubble=False
 ):
     """The "contexto" parameter should only be passed when you want to run the ragas test and thus not pass PDFs"""
     # Configuration
_utils/splitters/Splitter_class.py CHANGED
@@ -1,6 +1,10 @@
 from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
+from _utils.gerar_relatorio_modelo_usuario.utils import (
+    combine_documents_without_losing_pagination,
+)
+from _utils.handle_files import return_document_list_with_llama_parser
 from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
-from typing import List, Dict, Tuple, Optional, cast
+from typing import Any, List, Dict, Tuple, Optional, cast
 from _utils.models.gerar_relatorio import (
     DocumentChunk,
 )
@@ -18,55 +22,91 @@ class Splitter:
         )
         self.chunk_metadata = {}  # Store chunk metadata for tracing

-    def load_and_split_document(
-        self, pdf_path: str, pages: List[Document] | None, should_use_llama_parse: bool
-    ) -> List[DocumentChunk]:
+    async def load_and_split_document(
+        self, pdf_path: str, should_use_llama_parse: bool, isBubble: bool
+    ):
         """Load PDF and split into chunks with metadata"""
         # loader = PyPDFLoader(pdf_path)
-        if not pages:
-            pages = get_pdf_from_bubble(
-                pdf_path
-            )  # Generates a list of Document objects, each item covering ONE full page of the PDF.
-        chunks = []
-        char_count = 0
+        # if not pages:
+        #     pages = get_pdf_from_bubble(
+        #         pdf_path
+        #     )  # Generates a list of Document objects, each item covering ONE full page of the PDF.

-        for page in pages:
-            text = page.page_content
-            page_chunks = self.text_splitter.split_text(
-                text
-            )  # Breaks the full-page Document into a list where each item is a chunk, a piece smaller than a page.
-
-            for chunk in page_chunks:
-                chunk_id = str(uuid.uuid4())
-                start_char = text.find(
-                    chunk
-                )  # Position of the chunk within the full page
-                end_char = start_char + len(chunk)
-
-                if should_use_llama_parse:
-                    somar_pages = 0
-                else:
-                    somar_pages = 1
-                doc_chunk = DocumentChunk(  # Builds the chunk object with extra info, such as the chunk's position and id
-                    content=chunk,
-                    page_number=cast(int, page.metadata.get("page"))
-                    + somar_pages,  # 1-based page numbering
-                    chunk_id=chunk_id,
-                    start_char=char_count + start_char,
-                    end_char=char_count + end_char,
-                )
-                chunks.append(doc_chunk)
-
-                # Store metadata for later retrieval
-                self.chunk_metadata[chunk_id] = {
-                    "page": doc_chunk.page_number,
-                    "start_char": doc_chunk.start_char,
-                    "end_char": doc_chunk.end_char,
-                }
-
-                char_count += len(text)
-
-        return chunks
+        initial_chunks: List[str] = []
+
+        if isBubble:
+            pages = await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
+            page_boundaries, combined_text = (
+                combine_documents_without_losing_pagination(pages)
+            )
+            initial_chunks = initial_chunks + self.text_splitter.split_text(
+                combined_text
+            )
+        else:
+            if should_use_llama_parse:
+                pages = await return_document_list_with_llama_parser(pdf_path)
+                page_boundaries, combined_text = (
+                    combine_documents_without_losing_pagination(pages)
+                )
+                initial_chunks = initial_chunks + self.text_splitter.split_text(
+                    combined_text
+                )
+            else:
+                pages = PyPDFLoader(pdf_path).load()
+                page_boundaries, combined_text = (
+                    combine_documents_without_losing_pagination(pages)
+                )
+                initial_chunks = initial_chunks + self.text_splitter.split_text(
+                    combined_text
+                )
+
+        chunks: List[DocumentChunk] = []
+        char_count = 0
+
+        # for page in pages:
+        #     text = page.page_content
+        #     page_chunks = self.text_splitter.split_text(
+        #         text
+        #     )  # Breaks the full-page Document into a list where each item is a chunk, a piece smaller than a page.
+        text_char = 0
+        for chunk in initial_chunks:
+            chunk_id = str(uuid.uuid4())
+            start_char = text_char + 1
+            end_char = start_char + len(chunk)
+            text_char = end_char
+
+            if should_use_llama_parse:
+                somar_pages = 0
+            else:
+                somar_pages = 1
+
+            page_number = 0
+            for start, end, page_number in page_boundaries:
+                if start <= start_char < end:
+                    page_number = page_number
+                    break
+
+            doc_chunk = DocumentChunk(  # Builds the chunk object with extra info, such as the chunk's position and id
+                content=chunk,
+                contextual_summary="",
+                page_number=page_number + somar_pages,  # 1-based page numbering
+                chunk_id=chunk_id,
+                start_char=char_count + start_char,
+                end_char=char_count + end_char,
+            )
+            chunks.append(doc_chunk)
+
+            # Store metadata for later retrieval
+            self.chunk_metadata[chunk_id] = {
+                "page": doc_chunk.page_number,
+                "start_char": doc_chunk.start_char,
+                "end_char": doc_chunk.end_char,
+            }
+
+        # char_count += len(text)
+
+        return chunks, initial_chunks

     def load_and_split_text(self, text: str) -> List[DocumentChunk]:
         """Load Text and split into chunks with metadata - I created this function only for ragas"""
_utils/vector_stores/Vector_store_class.py CHANGED
@@ -21,7 +21,7 @@ class VectorStore:
         # Prepare texts with context
         if is_contextualized_chunk:
             texts = [
-                f"Document_id: {chunk.id_do_processo}\nDocument_context: {chunk.context}\nDocument_content: {chunk.content}"
+                f"Document_id: {chunk.id_do_processo}\nDocument_context: {chunk.context}\n{chunk.contextual_summary}\nDocument_content: {chunk.content}"
                 for chunk in chunks
             ]
         else:
@@ -30,30 +30,19 @@ class VectorStore:
         # Create vector store
         metadatas = []
         for index, chunk in enumerate(chunks):
-            if is_contextualized_chunk:
-                context = texts[index]
-                metadatas.append(
-                    {
-                        "chunk_id": chunk.chunk_id,
-                        "id_do_processo": chunk.id_do_processo,
-                        "page": chunk.page_number,
-                        "start_char": chunk.start_char,
-                        "end_char": chunk.end_char,
-                        "context": context,
-                    }
-                )
-            else:
-                context = texts[index]
-                metadatas.append(
-                    {
-                        "chunk_id": chunk.chunk_id,
-                        "id_do_processo": chunk.id_do_processo,
-                        "page": chunk.page_number,
-                        "start_char": chunk.start_char,
-                        "end_char": chunk.end_char,
-                        "context": context,
-                    }
-                )
+            context = texts[index]
+            metadatas.append(
+                {
+                    "chunk_id": chunk.chunk_id,
+                    "id_do_processo": str(
+                        chunk.id_do_processo
+                    ),  # Passing the id as a number breaks the code: the integer is larger than Chroma can handle
+                    "page": chunk.page_number,
+                    "start_char": chunk.start_char,
+                    "end_char": chunk.end_char,
+                    "context": context,
+                }
+            )

         vector_store = Chroma.from_texts(
             texts=texts, metadatas=metadatas, embedding=self.embeddings
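Chroma stores metadata as scalars, and the comment above says `id_do_processo` can exceed what its integer column holds, hence the `str(...)`. A small normalization sketch under the assumption that the limit is a signed 64-bit integer:

```python
def safe_metadata(metadata: dict) -> dict:
    """Keep metadata within scalar types, stringifying ints too large for 64 bits."""
    out = {}
    for key, value in metadata.items():
        if isinstance(value, int) and not isinstance(value, bool) and abs(value) >= 2**63:
            out[key] = str(value)  # too big for a 64-bit integer column
        elif isinstance(value, (str, int, float, bool)):
            out[key] = value
        else:
            out[key] = str(value)  # fall back to strings for non-scalar values
    return out

print(safe_metadata({"id_do_processo": 12345678901234567890, "page": 3}))
# {'id_do_processo': '12345678901234567890', 'page': 3}
```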
gerar_documento/serializer.py CHANGED
@@ -33,7 +33,7 @@ class GerarDocumentoSerializer(ResumoCursorSerializer):
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
     context_window = serializers.IntegerField(default=3)
-    chunk_overlap = serializers.IntegerField(default=1600)
+    chunk_overlap = serializers.IntegerField(default=800)
     num_k_rerank = serializers.IntegerField(default=20)
     model_cohere_rerank = serializers.CharField(
         required=False, default="rerank-english-v2.0"
@@ -61,7 +61,7 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
     context_window = serializers.IntegerField(default=3)
-    chunk_overlap = serializers.IntegerField(default=1600)
+    chunk_overlap = serializers.IntegerField(default=800)
     num_k_rerank = serializers.IntegerField(default=20)
     model_cohere_rerank = serializers.CharField(
         required=False, default="rerank-english-v2.0"