luanpoppe committed on
Commit e70ffc1 · 1 Parent(s): 8f3dc39

feat: add the ability to make requests with Gemini, add the ability to make requests with DeepSeek, make LlamaParse disabled by default, and change the default to send fewer contextual requests by handling responses that contain several chunks at once

.env.example CHANGED
@@ -6,4 +6,7 @@ LANGCHAIN_API_KEY=""
 CLAUDE_API_KEY=""
 COHERE_API_KEY=""
 BUBBLE_TOKEN=""
-LLAMA_CLOUD_API_KEY=""
+LLAMA_CLOUD_API_KEY_POPS=""
+LLAMA_CLOUD_API_KEY_PEIXE=""
+DEEPSEEKK_API_KEY=""
+GOOGLE_API_KEY_PEIXE=""
_utils/LLMs/LLM_class.py CHANGED
@@ -1,4 +1,13 @@
+# from langchain_openai import OpenAI
+from typing import cast
+from openai import OpenAI
+from pydantic import SecretStr
 from setup.environment import default_model
+from setup.easy_imports import ChatOpenAI, ChatGoogleGenerativeAI
+import os
+
+deepseek_api_key = cast(str, os.environ.get("DEEPSEEKK_API_KEY"))
+google_api_key = cast(str, os.environ.get("GOOGLE_API_KEY_PEIXE"))
 
 
 class LLM:
@@ -7,3 +16,20 @@ class LLM:
 
     # def create_GPT_model(self, model=default_model):
     #     return ChatOpen()
+
+    def deepseek(self):
+        return ChatOpenAI(
+            api_key=SecretStr(deepseek_api_key),
+            base_url="https://api.deepseek.com/v1",
+            model="deepseek-chat",
+        )
+
+    def googleGemini(self):
+        return ChatGoogleGenerativeAI(
+            api_key=SecretStr(google_api_key),
+            model="gemini-1.5-flash",
+            temperature=0,
+            max_tokens=None,
+            timeout=None,
+            max_retries=2,
+        )
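For reference, a minimal usage sketch (not part of the commit) of how the two new factory methods would be driven from async code; it assumes DEEPSEEKK_API_KEY and GOOGLE_API_KEY_PEIXE are set in the environment and that HumanMessage comes from langchain_core.messages:

import asyncio
from langchain_core.messages import HumanMessage
from _utils.LLMs.LLM_class import LLM

async def main():
    llms = LLM()
    # DeepSeek exposes an OpenAI-compatible API, so ChatOpenAI works once
    # base_url points at https://api.deepseek.com/v1.
    deepseek_reply = await llms.deepseek().ainvoke([HumanMessage(content="ping")])
    print(deepseek_reply.content)
    # Gemini goes through langchain-google-genai instead.
    gemini_reply = await llms.googleGemini().ainvoke([HumanMessage(content="ping")])
    print(gemini_reply.content)

asyncio.run(main())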
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py CHANGED
@@ -1,5 +1,6 @@
 import os
 
+from _utils.LLMs.LLM_class import LLM
 from _utils.gerar_relatorio_modelo_usuario.prompts import (
     prompt_auxiliar_do_contextual_prompt,
     create_prompt_auxiliar_do_contextual_prompt,
@@ -11,7 +12,7 @@ from _utils.prompts.Prompt_class import Prompt
 from _utils.splitters.Splitter_class import Splitter
 from setup.easy_imports import PyPDFLoader
 from langchain_openai import ChatOpenAI
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, cast
 from anthropic import Anthropic, AsyncAnthropic
 import logging
 from langchain.schema import Document
@@ -48,101 +49,211 @@ class ContextualRetriever:
         self.bm25 = None
         self.claude_context_model = claude_context_model
 
-    async def llm_generate_context(
-        self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
-    ) -> str:
-        """Generate contextual description using ChatOpenAI"""
-        try:
-            print("COMEÇOU A REQUISIÇÃO")
-            prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
-            # response = await aclaude_answer(
-            #     self.claude_client, self.claude_context_model, prompt
-            # )
-
-            response = await agpt_answer(prompt)
-            return response
-        except Exception as e:
-            self.logger.error(
-                f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
-            )
-            return ""
-
-    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
-    #     prompt = Prompt().create_prompt_template(
-    #         "", prompt_auxiliar_do_contextual_prompt
-    #     )
-    #     Chain(prompt, ChatOpenAI())
-    #     return
-
-    async def create_contextualized_chunk(
-        self, chunk, single_page_text, response_auxiliar_summary
-    ):
-        lista_contador.append(0)
-        print("contador: ", len(lista_contador))
-        # The commented code below reads the pages surrounding the chunk's current page
-        # page_content = ""
-        # for i in range(
-        #     max(0, chunk.page_number - 1),
-        #     min(len(single_page_text), chunk.page_number + 2),
-        # ):
-        #     page_content += single_page_text[i].page_content if single_page_text[i] else ""
-        page_number = chunk.page_number - 1
-        page_content = single_page_text[page_number].page_content
-
-        context = await self.llm_generate_context(
-            page_content, chunk, response_auxiliar_summary
-        )
-        return ContextualizedChunk(
-            content=chunk.content,
-            page_number=chunk.page_number,
-            chunk_id=chunk.chunk_id,
-            start_char=chunk.start_char,
-            end_char=chunk.end_char,
-            context=context,
-        )
-
     async def contextualize_all_chunks(
         self, full_text_as_array: List[Document], chunks: List[DocumentChunk]
     ) -> List[ContextualizedChunk]:
         """Add context to all chunks"""
         contextualized_chunks = []
-        lista_contador = []
         full_text = ""
         for x in full_text_as_array:
             full_text += x.page_content
 
-        # prompt_auxiliar_summary = prompt_obj.create_prompt_template(
-        #     "", prompt_auxiliar_do_contextual_prompt
-        # ).invoke({"PROCESSO_JURIDICO": full_text})
+        prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
+
+        print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
 
-        # response_auxiliar_summary = await ChatOpenAI(max_tokens=128000).ainvoke(
-        #     prompt_auxiliar_summary
+        # Claude call commented out because the request exceeded the token limit and raised an error
+        # response_auxiliar_summary = await aclaude_answer(
+        #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
         # )
 
-        prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
+        llms = LLM()
+        response_auxiliar_summary = await llms.googleGemini().ainvoke(
+            [HumanMessage(content=prompt_auxiliar_summary)]
+        )
 
-        print("\n\n\nprompt_auxiliar_summary: ", prompt_auxiliar_summary)
+        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary.content)
 
-        response_auxiliar_summary = await aclaude_answer(
-            self.claude_client, self.claude_context_model, prompt_auxiliar_summary
+        lista_de_listas_cada_com_20_chunks = [
+            chunks[i : i + 20] for i in range(0, len(chunks), 20)
+        ]
+        print(
+            "lista_de_listas_cada_com_20_chunks: ", lista_de_listas_cada_com_20_chunks
         )
 
-        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary)
-
         async with asyncio.TaskGroup() as tg:
             tasks = [
                 tg.create_task(
                     self.create_contextualized_chunk(
-                        chunk, full_text_as_array, response_auxiliar_summary
+                        chunk, full_text_as_array, response_auxiliar_summary.content
                     )
                 )
-                for chunk in chunks
+                # for chunk in chunks  # ORIGINAL
+                for chunk in lista_de_listas_cada_com_20_chunks
            ]
 
-        contextualized_chunks = [task.result() for task in tasks]
+        # contextualized_chunks = [task.result() for task in tasks]
+        contextualized_chunks = []
+        for task in tasks:
+            # print("\n\ntask", task)
+            # print("\n\ntask.result()", task.result())
+
+            contextualized_chunks = contextualized_chunks + task.result()
 
+        print("\n\ncontextualized_chunks", contextualized_chunks)
         return contextualized_chunks
 
+    # ORIGINAL
+    # async def create_contextualized_chunk(
+    #     self, chunk, single_page_text, response_auxiliar_summary
+    # ):
+    #     lista_contador.append(0)
+    #     print("contador: ", len(lista_contador))
+    #     page_number = chunk.page_number - 1
+    #     page_content = single_page_text[page_number].page_content
+
+    #     context = await self.llm_generate_context(
+    #         page_content, chunk, response_auxiliar_summary
+    #     )
+    #     print("context: ", context)
+    #     return ContextualizedChunk(
+    #         content=chunk.content,
+    #         page_number=chunk.page_number,
+    #         chunk_id=chunk.chunk_id,
+    #         start_char=chunk.start_char,
+    #         end_char=chunk.end_char,
+    #         context=context,
+    #     )
+
+    async def create_contextualized_chunk(
+        self, chunks: List[DocumentChunk], single_page_text, response_auxiliar_summary
+    ):
+        lista_contador.append(0)
+        print("contador: ", len(lista_contador))
+        all_pages_contents = ""
+        contador = 1
+        for chunk in chunks:
+            page_number = chunk.page_number - 1
+            page_content = single_page_text[page_number].page_content
+
+            all_pages_contents += page_content
+            contador += 1
+
+        context = await self.llm_generate_context(
+            all_pages_contents, chunks, response_auxiliar_summary
+        )
+
+        context = (
+            context.replace("document_id: ", "")
+            .replace("document_id:", "")
+            .replace("DOCUMENT_ID: ", "")
+            .replace("DOCUMENT_ID:", "")
+        )
+
+        # print("context: ", context)
+        import re
+
+        pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"  # Works when the LLM response does not spell out "document_id"
+        # pattern = r"\[\s*(?:document_id:\s*)?(\d+)\s*\] --- \[document_title:\s*(.+?)\s*\] --- \[(.+?)\]"
+        matches = re.findall(pattern, context, re.DOTALL)
+
+        # Convert matches to the desired format
+        result = [
+            [int(doc_id), title.strip(), content.strip()]
+            for doc_id, title, content in matches
+        ]
+        # print("\n\nresult", result)
+
+        lista_chunks = []
+        for index, chunk in enumerate(chunks):
+            lista_chunks.append(
+                ContextualizedChunk(
+                    content=chunk.content,
+                    page_number=chunk.page_number,
+                    chunk_id=result[index][0],
+                    start_char=chunk.start_char,
+                    end_char=chunk.end_char,
+                    context=" ".join(result[index][1:3]),
+                )
+            )
+
+        return lista_chunks
+
+    # ORIGINAL
+    # async def llm_generate_context(
+    #     self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
+    # ) -> str:
+    #     """Generate contextual description using ChatOpenAI"""
+    #     try:
+    #         print("COMEÇOU A REQUISIÇÃO")
+    #         prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
+    #         # response = await aclaude_answer(
+    #         #     self.claude_client, self.claude_context_model, prompt
+    #         # )
+
+    #         # response = await agpt_answer(prompt)
+    #         llms = LLM()
+    #         response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+    #         return cast(str, response.content)
+    #     except Exception as e:
+    #         self.logger.error(
+    #             f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
+    #         )
+    #         return ""
+
+    async def llm_generate_context(
+        self, page_text: str, chunks: List[DocumentChunk], resumo_auxiliar
+    ) -> str:
+        """Generate contextual description using ChatOpenAI"""
+        contador = 1
+        all_chunks_contents = ""
+
+        for chunk in chunks:
+            all_chunks_contents += f"\n\n CHUNK {contador}:\n"
+            all_chunks_contents += chunk.content
+            contador += 1
+
+        try:
+            print("COMEÇOU A REQUISIÇÃO")
+            prompt = contextual_prompt(page_text, resumo_auxiliar, all_chunks_contents)
+            # response = await aclaude_answer(
+            #     self.claude_client, self.claude_context_model, prompt
+            # )
+
+            response = await agpt_answer(prompt)
+            # llms = LLM()
+            # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+            # return cast(str, response.content)
+            return cast(str, response)
+        except Exception as e:
+            self.logger.error(f"Context generation failed for chunks .... : {str(e)}")
+            return ""
+
+    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
+    #     prompt = Prompt().create_prompt_template(
+    #         "", prompt_auxiliar_do_contextual_prompt
+    #     )
+    #     Chain(prompt, ChatOpenAI())
+    #     return
+
+
+# First function in this file to be called
+async def contextualize_chunk_based_on_serializer(
+    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
+):
+    if serializer["should_have_contextual_chunks"]:
+        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
+            pages, all_PDFs_chunks
+        )
+        chunks_passados = contextualized_chunks
+        is_contextualized_chunk = True
+    else:
+        chunks_passados = all_PDFs_chunks
+        is_contextualized_chunk = False
+
+    return chunks_passados, is_contextualized_chunk
+
 
 async def get_full_text_and_all_PDFs_chunks(
     listaPDFs: List[str],
@@ -159,7 +270,9 @@ async def get_full_text_and_all_PDFs_chunks(
             pages = pages + await return_document_list_with_llama_parser(pdf_path)
         else:
             pages = pages + get_pdf_from_bubble(pdf_path)
-        chunks = splitterObject.load_and_split_document(pdf_path, pages)
+        chunks = splitterObject.load_and_split_document(
+            pdf_path, pages, should_use_llama_parse
+        )
         all_PDFs_chunks = all_PDFs_chunks + chunks
         # Get full text for contextualization
         # loader = PyPDFLoader(pdf_path)
@@ -170,17 +283,10 @@ async def get_full_text_and_all_PDFs_chunks(
     return all_PDFs_chunks, pages  # , full_text
 
 
-async def contextualize_chunk_based_on_serializer(
-    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
-):
-    if serializer["should_have_contextual_chunks"]:
-        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
-            pages, all_PDFs_chunks
-        )
-        chunks_passados = contextualized_chunks
-        is_contextualized_chunk = True
-    else:
-        chunks_passados = all_PDFs_chunks
-        is_contextualized_chunk = False
-
-    return chunks_passados, is_contextualized_chunk
+# The commented code below reads the pages surrounding the chunk's current page
+# page_content = ""
+# for i in range(
+#     max(0, chunk.page_number - 1),
+#     min(len(single_page_text), chunk.page_number + 2),
+# ):
+#     page_content += single_page_text[i].page_content if single_page_text[i] else ""
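To make the new batched flow concrete, here is a self-contained sketch (with a fabricated LLM response, not real output) of the two mechanisms this file now relies on: slicing the chunk list into groups of 20, and parsing a multi-chunk response with the regex used in create_contextualized_chunk above:

import re

# Batching: chunks are sliced into sublists of at most 20 elements each.
chunks = list(range(47))  # stand-ins for DocumentChunk objects
batches = [chunks[i : i + 20] for i in range(0, len(chunks), 20)]
print([len(b) for b in batches])  # -> [20, 20, 7]

# Parsing: a fabricated response in the format the new prompt requests.
fake_response = """
1. <chunk_context>
[123] --- Petição Inicial --- Introduces the plaintiff's claims.
</chunk_context>
2. <chunk_context>
[124] --- Contestação --- Presents the defendant's answer.
</chunk_context>
"""
pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"
matches = re.findall(pattern, fake_response, re.DOTALL)
result = [
    [int(doc_id), title.strip(), content.strip()]
    for doc_id, title, content in matches
]
print(result)
# -> [[123, 'Petição Inicial', "Introduces the plaintiff's claims."],
#     [124, 'Contestação', "Presents the defendant's answer."]]

Note that create_contextualized_chunk indexes result[index] for every chunk in the batch, so it silently assumes the model returned exactly one well-formed <chunk_context> entry per chunk; a missing or malformed entry would raise an IndexError.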
 
 
 
 
 
 
 
_utils/gerar_relatorio_modelo_usuario/prompts.py CHANGED
@@ -117,40 +117,77 @@ Formate sua resposta da seguinte maneira:
 </resumo_final>"""
 
 
-def contextual_prompt(single_page_text, summary_text, chunk_content):
-    return f"""You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze a chunk of text from a larger document and provide a brief context for it.
-
+# ORIGINAL
+# def contextual_prompt(single_page_text, summary_text, chunk_content):
+#     return f"""You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze a chunk of text from a larger document and provide a brief context for it.
+
+# Here's the summary of the full text of the document:
+# <summary_text>
+# {summary_text}
+# </summary_text>
+
+# Here's the single page where the chunk is situated:
+
+# <single_page>
+# {single_page_text}
+# </single_page>
+
+# And here's the specific chunk to contextualize:
+# <chunk>
+# {chunk_content}
+# </chunk>
+
+# Follow these steps:
+# 1. Identify and quote the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+# 2. Summarize the main topics or themes of the single page and where it fit within the summary of the full text.
+# 3. Identify where the specific chunk fits within these themes.
+# 4. Create a concise context that situates the chunk within the document.
+
+# With this informations, your response should be a single, concise paragraph that includes:
+# - The document ID
+# - The document name
+# - A brief context for the chunk
+
+# Example final output structure (do not copy the content, only the format):
+# <chunk_context>
+# [Single paragraph with document ID, name, and chunk context]
+# </chunk_context>"""
+
+
+def contextual_prompt(all_pages_contents, summary_text, chunk_content):
+    return f"""
+You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze multiple chunks of text from a larger document and provide brief contexts for each of them.
 Here's the summary of the full text of the document:
 <summary_text>
 {summary_text}
 </summary_text>
-
-Here's the single page where the chunk is situated:
-
-<single_page>
-{single_page_text}
-</single_page>
-
-And here's the specific chunk to contextualize:
-<chunk>
-{chunk_content}
-</chunk>
-
-Follow these steps:
-1. Identify and quote the document ID (found between "NUM." and "- Pág") and the document name (from the header).
-2. Summarize the main topics or themes of the single page and where it fit within the summary of the full text.
+Here are the pages where the chunks are situated:
+<page>
+{all_pages_contents}
+</page>
+You will be given 20 specific chunks to contextualize. For each chunk, follow these steps:
+1. Identify the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+2. Summarize the main topics or themes of the single page and how they relate to the summary of the full text.
 3. Identify where the specific chunk fits within these themes.
 4. Create a concise context that situates the chunk within the document.
-
-With this informations, your response should be a single, concise paragraph that includes:
-- The document ID
-- The document name
-- A brief context for the chunk
-
-Example final output structure (do not copy the content, only the format):
-<chunk_context>
-[Single paragraph with document ID, name, and chunk context]
-</chunk_context>"""
+Your final output should be a numbered list of 20 chunk contexts, each containing a single, concise paragraph that includes:
+<final_output>
+[document_id] --- [document_name] --- [brief_context_for_the_chunk]
+</final_output>
+Here are the 20 chunks to analyze:
+<user_input>
+{chunk_content}
+</user_input>
+Example output structure (do not copy the content, only the format):
+1. <chunk_context>
+[document_id] --- [document_title] --- [brief_context_for_the_chunk]
+</chunk_context>
+2. <chunk_context>
+[document_id] --- [document_title] --- [brief_context_for_the_chunk]
+</chunk_context>
+[Continue for all 20 chunks]
+Please provide context for all 20 chunks, following this structure. It's OK for this section to be quite long.
+"""
 
 
 # return f"""You are a language model tasked with providing context to improve the retrieval of information from a chunk extracted from a document. Follow these steps internally (do not display reasoning or reflection in the final output):
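A short sketch (hypothetical inputs, not part of the commit) of how the new batched prompt would be rendered: chunk_content is the "CHUNK n:"-labelled concatenation built in llm_generate_context, and the other two arguments carry the page texts and the Gemini-generated summary:

from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt

# Hypothetical chunk texts; the real code concatenates DocumentChunk.content.
chunk_texts = ["First chunk text...", "Second chunk text..."]

all_chunks_contents = ""
for contador, text in enumerate(chunk_texts, start=1):
    all_chunks_contents += f"\n\n CHUNK {contador}:\n"
    all_chunks_contents += text

prompt = contextual_prompt(
    all_pages_contents="...text of the pages these chunks came from...",
    summary_text="...auxiliary summary returned by Gemini...",
    chunk_content=all_chunks_contents,
)
print(prompt[:300])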
_utils/handle_files.py CHANGED
@@ -19,7 +19,7 @@ def handle_pdf_files_from_serializer(files):
             temp_file.write(chunk)
         temp_file_path = temp_file.name  # Get the path of the temporary file
         listaPDFs.append(temp_file_path)
-    print("listaPDFs: ", listaPDFs)
+    print("\n\nlistaPDFs: ", listaPDFs)
     return listaPDFs
 
 
@@ -29,7 +29,7 @@ def remove_pdf_temp_files(listaPDFs):
 
 
 async def return_document_list_with_llama_parser(file: str):
-    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY")
+    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY_POPS")
     documents: List[LangchainDocument] = []
     if llama_parser_api:
         parser = LlamaParse(
@@ -39,7 +39,11 @@ async def return_document_list_with_llama_parser(file: str):
             verbose=True,
         )
 
-        parsed_document = await parser.aget_json(file)
+        try:
+            parsed_document = await parser.aget_json(file)
+        except Exception as e:
+            raise ValueError(f"Something went wrong in the LlamaParse parser: {e}")
+        print("parsed_document: ", parsed_document)
         for doc in parsed_document[0].get("pages"):  # type: ignore
             # documents.append(doc.to_langchain_format())
 
_utils/resumo_completo_cursor.py CHANGED
@@ -105,7 +105,7 @@ async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
     for x in structured_summaries:
         texto_completo = texto_completo + x["content"] + "\n"
 
-    print("\n\ntexto_completo: ", texto_completo)
+    print("\n\ntexto_completo[0:1000]: ", texto_completo[0:1000])
 
     return {
         "resultado": structured_summaries,
_utils/splitters/Splitter_class.py CHANGED
@@ -19,7 +19,7 @@ class Splitter:
         self.chunk_metadata = {}  # Store chunk metadata for tracing
 
     def load_and_split_document(
-        self, pdf_path: str, pages: List[Document] | None
+        self, pdf_path: str, pages: List[Document] | None, should_use_llama_parse: bool
     ) -> List[DocumentChunk]:
         """Load PDF and split into chunks with metadata"""
         # loader = PyPDFLoader(pdf_path)
@@ -43,10 +43,14 @@ class Splitter:
                 )  # Returns the position of the chunk within the full page
                 end_char = start_char + len(chunk)
 
+                if should_use_llama_parse:
+                    somar_pages = 0
+                else:
+                    somar_pages = 1
                 doc_chunk = DocumentChunk(  # Builds the chunk object with extra metadata, such as the chunk's position and id
                     content=chunk,
                     page_number=cast(int, page.metadata.get("page"))
-                    + 1,  # 1-based page numbering
+                    + somar_pages,  # 1-based page numbering
                     chunk_id=chunk_id,
                     start_char=char_count + start_char,
                     end_char=char_count + end_char,
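The somar_pages switch compensates for a page-numbering mismatch between the two loaders: PyPDF-style loaders store a 0-based page index in Document.metadata["page"], while the LlamaParse path is assumed here to already yield 1-based page numbers, so no offset is needed. A tiny sketch of the resulting rule:

from langchain.schema import Document

def resolved_page_number(page: Document, should_use_llama_parse: bool) -> int:
    # PyPDFLoader metadata is 0-based, so add 1; the LlamaParse path
    # (per this commit) is assumed to be 1-based already.
    somar_pages = 0 if should_use_llama_parse else 1
    return page.metadata.get("page") + somar_pages

print(resolved_page_number(Document(page_content="", metadata={"page": 0}), False))  # -> 1
print(resolved_page_number(Document(page_content="", metadata={"page": 1}), True))   # -> 1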
gerar_documento/serializer.py CHANGED
@@ -73,4 +73,4 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
     gpt_temperature = serializers.FloatField(default=0)
     id_modelo_do_usuario = serializers.IntegerField(required=False, default=11)
     should_have_contextual_chunks = serializers.BooleanField(default=False)  # type: ignore
-    should_use_llama_parse = serializers.BooleanField(required=False, default=True)  # type: ignore
+    should_use_llama_parse = serializers.BooleanField(required=False, default=False)  # type: ignore
setup/easy_imports.py CHANGED
@@ -14,6 +14,7 @@ from langchain.prompts import PromptTemplate
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import Chroma
+from langchain_google_genai import ChatGoogleGenerativeAI
 
 # from langchain_community.chat_models import ChatOpenAI
 from langchain_openai import ChatOpenAI