luanpoppe committed · 78209bc
1 Parent(s): 234f840
fix: working version after fixes made across the whole flow
- _antigos/resumos/serializer.py +2 -2
- _utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py +11 -8
- _utils/gerar_relatorio_modelo_usuario/contextual_retriever.py +47 -57
- _utils/gerar_relatorio_modelo_usuario/prompts.py +79 -36
- _utils/gerar_relatorio_modelo_usuario/utils.py +49 -0
- _utils/handle_files.py +38 -28
- _utils/models/gerar_relatorio.py +1 -0
- _utils/resumo_completo_cursor.py +1 -1
- _utils/splitters/Splitter_class.py +83 -43
- _utils/vector_stores/Vector_store_class.py +14 -25
- gerar_documento/serializer.py +2 -2
_antigos/resumos/serializer.py
CHANGED
@@ -25,5 +25,5 @@ class ResumoCursorSerializer(serializers.Serializer):
     user_message = serializers.CharField(required=False, default="")
     model = serializers.CharField(required=False, default=default_model)
     hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
-    chunk_size = serializers.IntegerField(required=False, default=
-    chunk_overlap = serializers.IntegerField(required=False, default=
+    chunk_size = serializers.IntegerField(required=False, default=3500)
+    chunk_overlap = serializers.IntegerField(required=False, default=800)
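These serializer fields feed the text splitter downstream, so the new defaults mean roughly 3500-character chunks with an 800-character overlap. A minimal sketch of that relationship, assuming LangChain's RecursiveCharacterTextSplitter (the splitter class this repo imports in Splitter_class.py):

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Mirror the new serializer defaults.
splitter = RecursiveCharacterTextSplitter(chunk_size=3500, chunk_overlap=800)

parts = splitter.split_text("lorem ipsum dolor sit amet " * 1000)
# Each part is at most 3500 characters, and consecutive parts share up to
# 800 characters, so context that spans a boundary is not lost.
print(len(parts), max(len(p) for p in parts))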
_utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py
CHANGED
@@ -1,5 +1,5 @@
 import os
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, cast
 
 from pydantic import SecretStr
 from _utils.vector_stores.Vector_store_class import VectorStore
@@ -222,19 +222,22 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
             prompt_auxiliar.format(context="\n\n".join(contexts))
         )
 
-        self.resumo_gerado = resumo_auxiliar_do_documento.content
+        self.resumo_gerado = cast(str, resumo_auxiliar_do_documento.content)
 
         prompt_gerar_documento = PromptTemplate(
             template=self.prompt_gerar_documento,
             input_variables=["context"],
         )
 
-        documento_gerado = 
+        documento_gerado = cast(
+            str,
+            llm.invoke(
+                prompt_gerar_documento.format(
+                    context=self.resumo_gerado,
+                    # modelo_usuario=serializer.data["modelo"],
+                )
+            ).content,
+        )
 
         # Split the response into paragraphs
         summaries = [p.strip() for p in documento_gerado.split("\n\n") if p.strip()]
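The new cast(str, ...) wrappers exist because LangChain message objects type .content as Union[str, list]; cast() performs no runtime conversion, it only narrows the static type so that str methods like .split() type-check. A self-contained sketch of the idea (FakeMessage is an illustrative stand-in, not the real class):

from typing import Union, cast

class FakeMessage:
    # LangChain's AIMessage types `content` as a str or a list of content blocks.
    content: Union[str, list] = "parágrafo um\n\nparágrafo dois"

documento_gerado = cast(str, FakeMessage().content)  # no-op at runtime
summaries = [p.strip() for p in documento_gerado.split("\n\n") if p.strip()]
print(summaries)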
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py
CHANGED
@@ -7,6 +7,9 @@ from _utils.gerar_relatorio_modelo_usuario.prompts import (
 )
 from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
 from _utils.chains.Chain_class import Chain
+from _utils.gerar_relatorio_modelo_usuario.utils import (
+    validate_many_chunks_in_one_request,
+)
 from _utils.handle_files import return_document_list_with_llama_parser
 from _utils.prompts.Prompt_class import Prompt
 from _utils.splitters.Splitter_class import Splitter
@@ -50,13 +53,13 @@ class ContextualRetriever:
         self.claude_context_model = claude_context_model
 
     async def contextualize_all_chunks(
-        self, full_text_as_array: List[
+        self, full_text_as_array: List[str], chunks: List[DocumentChunk]
     ) -> List[ContextualizedChunk]:
         """Add context to all chunks"""
         contextualized_chunks = []
         full_text = ""
         for x in full_text_as_array:
             full_text += x
 
         prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
 
@@ -100,7 +103,6 @@ class ContextualRetriever:
 
             contextualized_chunks = contextualized_chunks + task.result()
 
-        print("\n\ncontextualized_chunks", contextualized_chunks)
         return contextualized_chunks
 
     # ORIGINAL
@@ -131,54 +133,29 @@ class ContextualRetriever:
 
         lista_contador.append(0)
         print("contador: ", len(lista_contador))
-        all_pages_contents = ""
-        contador = 1
-        for chunk in chunks:
-            page_number = chunk.page_number - 1
-            page_content = single_page_text[page_number].page_content
-
-            all_pages_contents += page_content
-            contador += 1
-
-        context = await self.llm_generate_context(
-            page_content, chunks, response_auxiliar_summary
-        )
-
-        context = (
-            context.replace("document_id: ", "")
-            .replace("document_id:", "")
-            .replace("DOCUMENT_ID: ", "")
-            .replace("DOCUMENT_ID: ", "")
-        )
-
-        pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"  # Funciona para quando a resposta do LLM não vem com "document_id" escrito
-        # pattern = r"\[\s*(?:document_id:\s*)?(\d+)\s*\] --- \[document_title:\s*(.+?)\s*\] --- \[(.+?)\]"
-        matches = re.findall(pattern, context, re.DOTALL)
-
-        # Convert matches to the desired format
-        result = [
-            [int(doc_id), title.strip(), content.strip()]
-            for doc_id, title, content in matches
-        ]
-        # print("\n\nresult", result)
-
-        print("\n\ncontext", context)
+        # all_pages_contents = ""
+        # contador = 1
+        # for chunk in chunks:
+        #     page_number = chunk.page_number - 1
+        #     page_content = single_page_text[page_number].page_content
+
+        #     all_pages_contents += page_content
+        #     contador += 1
+
+        result = await self.llm_generate_context(chunks, response_auxiliar_summary)
 
         lista_chunks = []
         for index, chunk in enumerate(chunks):
             lista_chunks.append(
                 ContextualizedChunk(
+                    contextual_summary=result[index][2],
                     content=chunk.content,
                     page_number=chunk.page_number,
-                    id_do_processo=result[index][0],
+                    id_do_processo=int(result[index][0]),
                     chunk_id=chunk.chunk_id,
                     start_char=chunk.start_char,
                     end_char=chunk.end_char,
-                    context=
+                    context=result[index][1],
                 )
             )
 
@@ -207,7 +184,7 @@ class ContextualRetriever:
         # return ""
 
     async def llm_generate_context(
-        self,
+        self, chunks: List[DocumentChunk], resumo_auxiliar  # , page_text: str
     ) -> str:
         """Generate contextual description using ChatOpenAI"""
         contador = 1
@@ -220,16 +197,29 @@ class ContextualRetriever:
 
         try:
            print("COMEÇOU A REQUISIÇÃO")
-            prompt = contextual_prompt(
+            prompt = contextual_prompt(resumo_auxiliar, all_chunks_contents)
             # response = await aclaude_answer(
             #     self.claude_client, self.claude_context_model, prompt
             # )
 
+            for attempt in range(4):
+                print(f"\n\nTENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt}")
+                raw_response = await agpt_answer(prompt)
+                response = cast(str, raw_response)
+                # llms = LLM()
+                # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+                # return cast(str, response.content)
+
+                matches = validate_many_chunks_in_one_request(response)
+                # Convert matches to the desired format
+
+                if matches:
+                    result = [
+                        [int(doc_id), title.strip(), content.strip()]
+                        for doc_id, title, content in matches
+                    ]
+                    return cast(str, result)
+            raise ValueError(f"FORMATAÇÃO DOS CHUNKS FOI INVÁLIDA: {response}")
         except Exception as e:
             self.logger.error(f"Context generation failed for chunks .... : {str(e)}")
             return ""
@@ -267,20 +257,20 @@ async def get_full_text_and_all_PDFs_chunks(
 ):
     all_PDFs_chunks = []
 
-    pages: List[
+    pages: List[str] = []
 
     # Load and process document
     for pdf_path in listaPDFs:
-        if isBubble:
-            pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
-        else:
-            if should_use_llama_parse:
-                pages = pages + await return_document_list_with_llama_parser(pdf_path)
-            else:
-                pages = pages + PyPDFLoader(pdf_path).load()
-        chunks = splitterObject.load_and_split_document(
-            pdf_path,
+        # if isBubble:
+        #     pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
+        # else:
+        #     if should_use_llama_parse:
+        #         pages = pages + await return_document_list_with_llama_parser(pdf_path)
+        #     else:
+        #         pages = pages + PyPDFLoader(pdf_path).load()
+
+        chunks, pages = await splitterObject.load_and_split_document(
+            pdf_path, should_use_llama_parse, isBubble
        )
        all_PDFs_chunks = all_PDFs_chunks + chunks
    # Get full text for contextualization
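The substantive change above is the validate-then-retry loop: the LLM answer is checked with validate_many_chunks_in_one_request and regenerated up to four times before the method gives up. A stripped-down sketch of that pattern, with a stub standing in for agpt_answer (the stub and the sample response are illustrative):

import asyncio
import re

async def fake_llm(prompt: str) -> str:
    # Hypothetical stand-in for agpt_answer(prompt).
    return "<chunk_context>\n[123] --- [Decisão] --- [Contexto do chunk]\n</chunk_context>"

def validate(response: str):
    # Same contract as validate_many_chunks_in_one_request: parsed tuples on
    # success, a falsy value when the answer is malformed.
    pattern = r"\[([\d.\-]+)\]\s*---\s*\[([^]]+)\]\s*---\s*\[([^]]+)\]\s*</chunk_context>"
    return re.findall(pattern, response, re.DOTALL) or False

async def generate_with_retries(prompt: str, attempts: int = 4):
    for _ in range(attempts):
        response = await fake_llm(prompt)
        matches = validate(response)
        if matches:
            return [[int(d), t.strip(), c.strip()] for d, t, c in matches]
    raise ValueError("FORMATAÇÃO DOS CHUNKS FOI INVÁLIDA")

print(asyncio.run(generate_with_retries("prompt")))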
_utils/gerar_relatorio_modelo_usuario/prompts.py
CHANGED
@@ -154,25 +154,33 @@ Formate sua resposta da seguinte maneira:
 # </chunk_context>"""
 
 
-def contextual_prompt(summary_text, chunk_content, all_pages_contents):
+# Removido do prompt abaixo após mudar para cada chunk ter 5000 caracteres:
+# Here are the pages where the chunks are situated:
+# <page>
+# {all_pages_contents}
+# </page>
+
+
+# 1. Identify the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+
+
+def contextual_prompt(summary_text, chunk_content):  # , all_pages_contents
     return f"""
 You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze multiple chunks of text from a larger document and provide brief contexts for each of them.
 Here's the summary of the full text of the document:
 <summary_text>
 {summary_text}
 </summary_text>
-Here are the pages where the chunks are situated:
-<page>
-{all_pages_contents}
-</page>
 You will be given 20 specific chunks to contextualize. For each chunk, follow these steps:
-1. Identify the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+1. If there is a number between "NUM." and "- Pág", identify that number as the [document_id]. Furthermore, identify the document name (from the header).
 2. Summarize the main topics or themes of the single page and how they relate to the summary of the full text.
 3. Identify where the specific chunk fits within these themes.
 4. Create a concise context that situates the chunk within the document.
 Your final output should be a numbered list of 20 chunk contexts, each containing a single, concise paragraph that includes:
 <final_output>
+<chunk_context>
 [document_id] --- [document_name] --- [brief_context_for_the_chunk]
+</chunk_context>
 </final_output>
 Here are the 20 chunks to analyze:
 <user_input>
@@ -187,6 +195,9 @@ Example output structure (do not copy the content, only the format):
 </chunk_context>
 [Continue for all 20 chunks]
 Please provide context for all 20 chunks, following this structure. It's OK for this section to be quite long.
+
+**Reminder**
+- The final answer must be in PORTUGUESE.
 """
 
 
@@ -282,33 +293,65 @@ After composing the sentence, but before presenting it as the final answer, refl
 - Do not show the chain of thought or the reflection step. Only the final formatted sentence should be visible to the user.
 """
 
-prompt_auxiliar_SEM_CONTEXT = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
-**Instructions**:
-1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
-2. **Reading the Context**: Extract the following information from `context`:
-- The name of the defendant (réu).
-- The crime they have been accused of (nome_do_crime).
-- The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
-- The date the accusation was accepted (data_do_recebimento).
-- The ID of the decision document (id_do_documento).
-3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
-4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
-```
-<formato>
-Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
-A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
-Não há outras causas interruptivas ou suspensivas da prescrição.
-</formato>
-```
-5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
-**Reminder**:
-- Do not include your chain of thought in the final output.
-- Do not add extra information or commentary beyond the specified format.
-- The final answer must be in Portuguese.
-```
-<formato>
-Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
-A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
-Não há outras causas interruptivas ou suspensivas da prescrição.
-</formato>
-```
-"""
+
+# VALOR ANTIGO DE PROMPT UTILIZADO NO QUERY DA PESQUISA POR SIMILARIDADE DO VECTOR_SEARCH
+# prompt_auxiliar_SEM_CONTEXT = """You are a language model specialized in producing concise and well-structured legal case summaries in Portuguese. You will receive a variable `context`, which contains information about a legal case. Your task is to read the `context` carefully and produce a summary report in Portuguese, following the specific format provided below. Do not include any additional comments or reasoning steps in your final answer.
+# **Instructions**:
+# 1. **Chain of Thought**: Before producing your final answer, you must think through and plan your summary silently, without showing this reasoning in the final output. The final answer must only contain the required formatted report and nothing else.
+# 2. **Reading the Context**: Extract the following information from `context`:
+# - The name of the defendant (réu).
+# - The crime they have been accused of (nome_do_crime).
+# - The applicable article and subsection of the Penal Code (artigo_e_inciso_do_crime).
+# - The date the accusation was accepted (data_do_recebimento).
+# - The ID of the decision document (id_do_documento).
+# 3. **Prescriptive Details**: If no other interruptive or suspensive causes of prescription are mentioned, confirm that there are none.
+# 4. **Formatting**: Your final answer must strictly follow the format below, in Portuguese, and replace the placeholders with the appropriate information:
+# ```
+# <formato>
+# Trata-se de Ação Penal em que o Ministério Público denunciou [nome_do_reu], pela prática do [nome_do_crime] [artigo_e_inciso_do_crime], do Código Penal.
+# A denúncia foi recebida em [data_do_recebimento], conforme Decisão [id_do_documento].
+# Não há outras causas interruptivas ou suspensivas da prescrição.
+# </formato>
+# ```
+# 5. **Completeness**: If any piece of required information is missing in the `context`, note that explicitly in the final answer within the format.
+# **Reminder**:
+# - Do not include your chain of thought in the final output.
+# - Do not add extra information or commentary beyond the specified format.
+# - The final answer must be in Portuguese.
+# ```
+# <formato>
+# Trata-se de Ação Penal em que o Ministério Público denunciou João da Silva, pela prática do furto qualificado (art. 155, §4º, inciso II do Código Penal).
+# A denúncia foi recebida em 12/03/2021, conforme Decisão 20210312-01.
+# Não há outras causas interruptivas ou suspensivas da prescrição.
+# </formato>
+# """
+
+
+prompt_auxiliar_SEM_CONTEXT = """Busque e analise os trechos mais relevantes deste processo legal, priorizando os seguintes elementos:
+Identificação do Caso:
+Nome das partes envolvidas
+Jurisdição e instância processual
+Disputa Central:
+Qual é a principal controvérsia do caso?
+Quais são os argumentos centrais apresentados por cada parte?
+Peças Processuais Essenciais:
+Petição Inicial: Identifique os pedidos, fundamentos jurídicos e fatos alegados.
+Contestação: Extraia os argumentos de defesa e eventuais preliminares processuais.
+Réplica (se houver): Destaque contrargumentos apresentados pelo autor.
+Pedido e Pedido Contraposto (se aplicável): Identifique os requerimentos de ambas as partes.
+Provas Produzidas:
+Documentos apresentados pelo autor e sua relevância.
+Documentos apresentados pelo réu e sua relevância.
+Audiências Realizadas:
+Conciliação: Houve acordo ou resistência de alguma parte?
+Instrução e Julgamento: Quais testemunhas foram ouvidas? Algum elemento probatório relevante foi destacado pelo juiz?
+Trechos Relevantes do Caso:
+Extraia e organize os principais excertos do processo que sustentam a decisão.
+Identifique precedentes ou fundamentos jurídicos citados.
+Caso haja decisão judicial, sintetize o raciocínio adotado pelo magistrado.
+
+Diretrizes de Análise:
+Priorize passagens de maior impacto jurídico, como fundamentos da decisão e discussões centrais do caso.
+Evite redundâncias: Se um mesmo argumento aparece repetidamente, sintetize-o.
+Mantenha a hierarquia lógica da decisão: Se houver votos divergentes ou decisões parciais, destaque essas diferenças.
+Caso haja lacunas na documentação, identifique e sinalize a ausência de informações relevantes."""
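For reference, a response that obeys the <final_output> structure above is exactly what the new parser in utils.py accepts. A quick check with a made-up document id, title, and context:

import re

resposta = (
    "<chunk_context>\n"
    "[12345] --- [Petição Inicial] --- [O trecho apresenta os pedidos do autor.]\n"
    "</chunk_context>"
)
pattern = r"\[([\d.\-]+)\]\s*---\s*\[([^]]+)\]\s*---\s*\[([^]]+)\]\s*</chunk_context>"
print(re.findall(pattern, resposta, re.DOTALL))
# [('12345', 'Petição Inicial', 'O trecho apresenta os pedidos do autor.')]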
_utils/gerar_relatorio_modelo_usuario/utils.py
CHANGED
@@ -1,3 +1,7 @@
+from typing import List, Tuple
+from langchain_core.documents import Document
+
+
 def gerar_resposta_compilada(serializer):
     return {
         "num_chunks_retrieval": serializer["num_chunks_retrieval"],
@@ -20,3 +24,48 @@ def gerar_resposta_compilada(serializer):
         "prompt_auxiliar": serializer["prompt_auxiliar"],
         "prompt_gerar_documento": serializer["prompt_gerar_documento"],
     }
+
+
+def combine_documents_without_losing_pagination(documents: list[Document]):
+    combined_text = ""
+    page_boundaries: List[Tuple[int, int, int]] = (
+        []
+    )  # (start_idx, end_idx, page_number)
+    current_position = 0
+    for document in documents:
+        start = current_position
+        combined_text += document.page_content
+        end = current_position + len(document.page_content)
+        page_number = document.metadata.get("page", len(page_boundaries) + 1)
+        page_boundaries.append((start, end, page_number))
+
+        current_position = end
+    return page_boundaries, combined_text
+
+
+def validate_many_chunks_in_one_request(response: str):
+    context = (
+        response.replace("document_id: ", "")
+        .replace("document_id:", "")
+        .replace("DOCUMENT_ID: ", "")
+        .replace("DOCUMENT_ID: ", "")
+    )
+
+    # print("context: ", context)
+    import re
+
+    pattern = (
+        r"\[([\d.\-]+)\]\s*---\s*\[([^]]+)\]\s*---\s*\[([^]]+)\]\s*</chunk_context>"
+    )
+    # pattern = r"\[(\d+|[-.]+)\] --- (.+?) --- (.+?)</chunk_context>"  # Funciona para quando a resposta do LLM não vem com "document_id" escrito
+    matches = re.findall(pattern, context, re.DOTALL)
+
+    matches_as_list = []
+
+    for match in list(matches):
+        resultado = match[0].replace(".", "").replace("-", "")
+        matches_as_list.append((resultado, match[1], match[2]))
+
+    if len(matches) == 0:
+        return False
+    return matches_as_list
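A quick usage sketch of the new pagination helper: it concatenates page Documents into one string while recording (start, end, page_number) spans, so a chunk's character offset can later be mapped back to its page (the sample Documents are made up):

from langchain_core.documents import Document
from _utils.gerar_relatorio_modelo_usuario.utils import (
    combine_documents_without_losing_pagination,
)

docs = [
    Document(page_content="texto da página um. ", metadata={"page": 1}),
    Document(page_content="texto da página dois.", metadata={"page": 2}),
]
boundaries, combined = combine_documents_without_losing_pagination(docs)
print(boundaries)  # [(0, 20, 1), (20, 41, 2)]

# Map an arbitrary character offset back to its page:
offset = 25
print(next(page for start, end, page in boundaries if start <= offset < end))  # 2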
_utils/handle_files.py
CHANGED
@@ -28,34 +28,44 @@ def remove_pdf_temp_files(listaPDFs):
 
 
 async def return_document_list_with_llama_parser(file: str):
-    raise ValueError(f"ALGO DEU ERRADO NO PARSER DO LLAMA PARSE:")
-    print("parsed_document: ", parsed_document)
-    for doc in parsed_document[0].get("pages"):  # type: ignore
-        # documents.append(doc.to_langchain_format())
-
-        langchain_document = LangchainDocument(
-            page_content=doc.get("md"),  # type: ignore
-            metadata={
-                "page": doc.get("page"),  # type: ignore
-                # **doc.get("metadata", {}),  # type: ignore
-            },  # Include page number in metadata
-        )
-
-    raise ValueError("Não foi possível obter a API_KEY do llama parser")
+    llama_parser_keys = [
+        os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
+        os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
+    ]
+
+    for key in llama_parser_keys:
+        documents: List[LangchainDocument] = []
+        if key:
+            parser = LlamaParse(
+                api_key=key,
+                result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
+                language="pt",
+                verbose=True,
+            )
+
+            try:
+                parsed_document = await parser.aget_json(file)
+            except:
+                print(f"Error with llama parser key ending with {key[-4:]}")
+                continue  # Faz com que comece o próximo loop
+            print("parsed_document: ", parsed_document)
+            if len(parsed_document) == 0:
+                continue
+
+            for doc in parsed_document[0].get("pages"):  # type: ignore
+                # documents.append(doc.to_langchain_format())
+
+                langchain_document = LangchainDocument(
+                    page_content=doc.get("md"),  # type: ignore
+                    metadata={
+                        "page": doc.get("page"),  # type: ignore
+                        # **doc.get("metadata", {}),  # type: ignore
+                    },  # Include page number in metadata
+                )
+
+                documents.append(langchain_document)
+
+            return documents
+
+    # Código abaixo só é executado se o loop acima acabar e não tiver retornado um valor nenhuma vez
+    raise ValueError(f"ALGO DEU ERRADO NO PARSER DO LLAMA PARSE:")
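The rewrite turns the parser helper into an API-key failover loop: try each LlamaParse key in order, move on when one fails or returns nothing, and raise only after all keys are exhausted. The generic shape of that pattern, reduced to a sketch (first_working and the parse callable are illustrative, not part of the commit):

import os

def first_working(keys, attempt):
    """Return the first successful attempt(key); raise if every key fails."""
    for key in keys:
        if not key:
            continue  # environment variable not set
        try:
            return attempt(key)
        except Exception:
            print(f"Error with key ending with {key[-4:]}")
    raise ValueError("no API key produced a result")

keys = [os.getenv("LLAMA_CLOUD_API_KEY_POPS"), os.getenv("LLAMA_CLOUD_API_KEY_PEIXE")]
# docs = first_working(keys, lambda k: parse_pdf_with(k))  # hypothetical callee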
_utils/models/gerar_relatorio.py
CHANGED
@@ -11,6 +11,7 @@ class DocumentChunk:
     start_char: int
     end_char: int
     id_do_processo: int = 0
+    contextual_summary: str = ""
 
 
 @dataclass
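With the new field, a chunk carries its LLM-generated summary alongside its positional metadata. Constructing one looks like this (values are illustrative; the field names come from the diffs in this commit):

from _utils.models.gerar_relatorio import DocumentChunk

chunk = DocumentChunk(
    content="Trecho do documento...",
    page_number=3,
    chunk_id="b7e2c1d0",  # normally str(uuid.uuid4())
    start_char=7001,
    end_char=10500,
    id_do_processo=12345,
    contextual_summary="Resumo contextual gerado pelo LLM.",  # new field
)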
_utils/resumo_completo_cursor.py
CHANGED
@@ -39,7 +39,7 @@ os.environ["LANGCHAIN_PROJECT"] = "VELLA"
 
 
 async def get_llm_summary_answer_by_cursor_complete(
-    serializer, listaPDFs
+    serializer, listaPDFs, isBubble=False
 ):
     """Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
     # Configuration
_utils/splitters/Splitter_class.py
CHANGED
@@ -1,6 +1,10 @@
 from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
+from _utils.gerar_relatorio_modelo_usuario.utils import (
+    combine_documents_without_losing_pagination,
+)
+from _utils.handle_files import return_document_list_with_llama_parser
 from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
-from typing import List, Dict, Tuple, Optional, cast
+from typing import Any, List, Dict, Tuple, Optional, cast
 from _utils.models.gerar_relatorio import (
     DocumentChunk,
 )
@@ -18,55 +22,91 @@ class Splitter:
         )
         self.chunk_metadata = {}  # Store chunk metadata for tracing
 
-    def load_and_split_document(
-        self, pdf_path: str,
-    ):
+    async def load_and_split_document(
+        self, pdf_path: str, should_use_llama_parse: bool, isBubble: bool
+    ):
         """Load PDF and split into chunks with metadata"""
         # loader = PyPDFLoader(pdf_path)
-        if not pages:
-            pages = get_pdf_from_bubble(
-                pdf_path
-            )
-
-                    chunk_id=chunk_id,
-                    start_char=char_count + start_char,
-                    end_char=char_count + end_char,
-                )
-                chunks.append(doc_chunk)
-
-                    "start_char": doc_chunk.start_char,
-                    "end_char": doc_chunk.end_char,
-                }
-
+        # if not pages:
+        #     pages = get_pdf_from_bubble(
+        #         pdf_path
+        #     )  # Gera uma lista de objetos Document, sendo cada item da lista referente a UMA PÁGINA inteira do PDF.
+
+        initial_chunks: List[str] = []
+
+        if isBubble:
+            pages = await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
+            page_boundaries, combined_text = (
+                combine_documents_without_losing_pagination(pages)
+            )
+            initial_chunks = initial_chunks + self.text_splitter.split_text(
+                combined_text
+            )
+        else:
+            if should_use_llama_parse:
+                pages = await return_document_list_with_llama_parser(pdf_path)
+                page_boundaries, combined_text = (
+                    combine_documents_without_losing_pagination(pages)
+                )
+                initial_chunks = initial_chunks + self.text_splitter.split_text(
+                    combined_text
+                )
+            else:
+                pages = PyPDFLoader(pdf_path).load()
+                page_boundaries, combined_text = (
+                    combine_documents_without_losing_pagination(pages)
+                )
+                initial_chunks = initial_chunks + self.text_splitter.split_text(
+                    combined_text
+                )
+
+        chunks: List[DocumentChunk] = []
+        char_count = 0
+
+        # for page in pages:
+        #     text = page.page_content
+        #     page_chunks = self.text_splitter.split_text(
+        #         text
+        #     )  # Quebra o item que é um Document de UMA PÁGINA inteira em um lista onde cada item é referente a um chunk, que são pedaços menores do que uma página.
+        text_char = 0
+        for chunk in initial_chunks:
+            chunk_id = str(uuid.uuid4())
+            start_char = text_char + 1
+            end_char = start_char + len(chunk)
+            text_char = end_char
+
+            if should_use_llama_parse:
+                somar_pages = 0
+            else:
+                somar_pages = 1
+
+            page_number = 0
+            for start, end, page_number in page_boundaries:
+                if start <= start_char < end:
+                    page_number = page_number
+                    break
+
+            doc_chunk = DocumentChunk(  # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
+                content=chunk,
+                contextual_summary="",
+                page_number=page_number + somar_pages,  # 1-based page numbering
+                chunk_id=chunk_id,
+                start_char=char_count + start_char,
+                end_char=char_count + end_char,
+            )
+            chunks.append(doc_chunk)
+
+            # Store metadata for later retrieval
+            self.chunk_metadata[chunk_id] = {
+                "page": doc_chunk.page_number,
+                "start_char": doc_chunk.start_char,
+                "end_char": doc_chunk.end_char,
+            }
+
+        # char_count += len(text)
+
+        return chunks, initial_chunks
 
     def load_and_split_text(self, text: str) -> List[DocumentChunk]:
         """Load Text and split into chunks with metadata - Criei essa função apenas para o ragas"""
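The page assignment in the new load_and_split_document reduces to a linear scan over the recorded boundaries: find the (start, end) span that contains the chunk's start_char and take that span's page number. Isolated as a sketch:

from typing import List, Tuple

def page_for_offset(boundaries: List[Tuple[int, int, int]], start_char: int) -> int:
    """Return the page whose span contains start_char, or 0 if none does."""
    for start, end, page_number in boundaries:
        if start <= start_char < end:
            return page_number
    return 0

boundaries = [(0, 3500, 1), (3500, 7200, 2)]
print(page_for_offset(boundaries, 3499))  # 1
print(page_for_offset(boundaries, 3500))  # 2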
_utils/vector_stores/Vector_store_class.py
CHANGED
@@ -21,7 +21,7 @@ class VectorStore:
         # Prepare texts with context
         if is_contextualized_chunk:
             texts = [
-                f"Document_id: {chunk.id_do_processo}\nDocument_context: {chunk.context}\nDocument_content: {chunk.content}"
+                f"Document_id: {chunk.id_do_processo}\nDocument_context: {chunk.context}\n{chunk.contextual_summary}\nDocument_content: {chunk.content}"
                 for chunk in chunks
             ]
         else:
@@ -30,30 +30,19 @@ class VectorStore:
         # Create vector store
         metadatas = []
         for index, chunk in enumerate(chunks):
-            context = texts[index]
-            metadatas.append(
-                {
-                    "chunk_id": chunk.chunk_id,
-                    "id_do_processo": chunk.id_do_processo,
-                    "page": chunk.page_number,
-                    "start_char": chunk.start_char,
-                    "end_char": chunk.end_char,
-                    "context": context,
-                }
-            )
+            context = texts[index]
+            metadatas.append(
+                {
+                    "chunk_id": chunk.chunk_id,
+                    "id_do_processo": str(
+                        chunk.id_do_processo
+                    ),  # Se passar o id como um número o código quebra pelo valor inteiro ser maior do que o Chroma consegue lidar
+                    "page": chunk.page_number,
+                    "start_char": chunk.start_char,
+                    "end_char": chunk.end_char,
+                    "context": context,
+                }
+            )
 
         vector_store = Chroma.from_texts(
             texts=texts, metadatas=metadatas, embedding=self.embeddings
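The str() around id_do_processo follows from the comment in the diff: Chroma metadata values are limited to primitive types, and a process id wider than the integer range it handles breaks ingestion, while the same id stored as a string is safe. Illustrated without touching a real store (the sample id is made up):

# A 20-digit process id exceeds the 64-bit integer range, which is the kind
# of value the diff comment says Chroma cannot handle as a number.
id_do_processo = 50123456720238260001  # illustrative 20-digit id
metadata = {
    "id_do_processo": str(id_do_processo),  # stored as text: always safe
    "page": 3,
    "start_char": 7001,
}
print(metadata)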
gerar_documento/serializer.py
CHANGED
@@ -33,7 +33,7 @@ class GerarDocumentoSerializer(ResumoCursorSerializer):
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
     context_window = serializers.IntegerField(default=3)
-    chunk_overlap = serializers.IntegerField(default=
+    chunk_overlap = serializers.IntegerField(default=800)
     num_k_rerank = serializers.IntegerField(default=20)
     model_cohere_rerank = serializers.CharField(
         required=False, default="rerank-english-v2.0"
@@ -61,7 +61,7 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
     context_window = serializers.IntegerField(default=3)
-    chunk_overlap = serializers.IntegerField(default=
+    chunk_overlap = serializers.IntegerField(default=800)
     num_k_rerank = serializers.IntegerField(default=20)
     model_cohere_rerank = serializers.CharField(
         required=False, default="rerank-english-v2.0"