luanpoppe committed
Commit d07865c · 1 Parent(s): 39fc36b

feat: pequenas melhorias ("small improvements")

_utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py CHANGED
@@ -20,15 +20,12 @@ from _utils.models.gerar_relatorio import (
 )
 from modelos_usuarios.serializer import ModeloUsuarioSerializer
 from setup.environment import api_url
-from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
-    ContextualRetriever,
-)
+
 from asgiref.sync import sync_to_async
 
 
 class EnhancedDocumentSummarizer(DocumentSummarizer):
     openai_api_key = os.environ.get("OPENAI_API_KEY", "")
-    claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
 
     def __init__(
         self,
@@ -38,7 +35,6 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         chunk_overlap,
         num_k_rerank,
         model_cohere_rerank,
-        claude_context_model,
         prompt_auxiliar,
         gpt_model,
         gpt_temperature,
@@ -56,14 +52,10 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
             model_cohere_rerank,
         )
         self.config = config
-        self.contextual_retriever = ContextualRetriever(
-            config, self.claude_api_key, claude_context_model
-        )
         self.logger = logging.getLogger(__name__)
         self.prompt_auxiliar = prompt_auxiliar
         self.gpt_model = gpt_model
         self.gpt_temperature = gpt_temperature
-        # self.id_modelo_do_usuario = id_modelo_do_usuario
        self.prompt_gerar_documento = prompt_gerar_documento
         self.reciprocal_rank_fusion = reciprocal_rank_fusion
         self.resumo_gerado = ""
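
With these removals, EnhancedDocumentSummarizer no longer reads CLAUDE_API_KEY or builds its own ContextualRetriever; the retriever is constructed by the caller (see _utils/resumo_completo_cursor.py below) and owns its Claude configuration. A minimal sketch of that pattern, where ClaudeBackedComponent is a hypothetical stand-in for ContextualRetriever:

import os

from anthropic import AsyncAnthropic


class ClaudeBackedComponent:  # hypothetical stand-in for ContextualRetriever
    def __init__(self, model: str):
        # Same lookup the commit moves into ContextualRetriever.__init__;
        # the empty-string default mirrors the original code.
        self.claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
        self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
        self.claude_context_model = model

This keeps the summarizer free of Claude-specific state and lets callers or tests construct the retriever with nothing but an environment variable.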
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py CHANGED
@@ -1,33 +1,16 @@
 import os
-
-from _utils.LLMs.LLM_class import LLM
-from _utils.gerar_relatorio_modelo_usuario.prompts import (
-    prompt_auxiliar_do_contextual_prompt,
-    create_prompt_auxiliar_do_contextual_prompt,
-)
-from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
-from _utils.chains.Chain_class import Chain
 from _utils.gerar_relatorio_modelo_usuario.utils import (
+    get_response_from_auxiliar_contextual_prompt,
     validate_many_chunks_in_one_request,
 )
-from _utils.handle_files import return_document_list_with_llama_parser
-from _utils.prompts.Prompt_class import Prompt
-from _utils.splitters.Splitter_class import Splitter
-from setup.easy_imports import PyPDFLoader
-from langchain_openai import ChatOpenAI
 from typing import List, Dict, Tuple, Optional, cast
 from anthropic import Anthropic, AsyncAnthropic
 import logging
 from langchain.schema import Document
 from llama_index import Document as Llama_Index_Document
 import asyncio
-from langchain.prompts import PromptTemplate
 from typing import List
-from multiprocessing import Process, Barrier, Queue
 from dataclasses import dataclass
-from langchain_core.messages import HumanMessage
-from asgiref.sync import sync_to_async
-from setup.easy_imports import ChatPromptTemplate, ChatOpenAI
 
 from _utils.gerar_relatorio_modelo_usuario.llm_calls import aclaude_answer, agpt_answer
 from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
@@ -36,161 +19,30 @@ from _utils.models.gerar_relatorio import (
     DocumentChunk,
     RetrievalConfig,
 )
-from _utils.prompts.Prompt_class import prompt as prompt_obj
 
 lista_contador = []
 
 
 class ContextualRetriever:
-    def __init__(
-        self, config: RetrievalConfig, claude_api_key: str, claude_context_model: str
-    ):
+
+    def __init__(self, config: RetrievalConfig, claude_context_model: str):
         self.config = config
-        # self.claude_client = Anthropic(api_key=claude_api_key)
-        self.claude_client = AsyncAnthropic(api_key=claude_api_key)
         self.logger = logging.getLogger(__name__)
         self.bm25 = None
         self.claude_context_model = claude_context_model
 
+        self.claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
+        self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
+        # self.claude_client = Anthropic(api_key=claude_api_key)
-    async def contextualize_all_chunks(
-        self, full_text_as_array: List[str], chunks: List[DocumentChunk]
-    ) -> List[ContextualizedChunk]:
-        """Add context to all chunks"""
-        contextualized_chunks = []
-        full_text = ""
-        for x in full_text_as_array:
-            full_text += x
-
-        prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
-
-        print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
-
-        # Claude call commented out because the request exceeded the token limit and raised an error
-        # response_auxiliar_summary = await aclaude_answer(
-        #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
-        # )
-
-        llms = LLM()
-        response_auxiliar_summary = await llms.googleGemini().ainvoke(
-            [HumanMessage(content=prompt_auxiliar_summary)]
-        )
-
-        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary.content)
-
-        lista_de_listas_cada_com_20_chunks = [
-            chunks[i : i + 20] for i in range(0, len(chunks), 20)
-        ]
-        print(
-            "lista_de_listas_cada_com_20_chunks: ", lista_de_listas_cada_com_20_chunks
-        )
-
-        async with asyncio.TaskGroup() as tg:
-            tasks = [
-                tg.create_task(
-                    self.create_contextualized_chunk(
-                        chunk, full_text_as_array, response_auxiliar_summary.content
-                    )
-                )
-                # for chunk in chunks  # ORIGINAL
-                for chunk in lista_de_listas_cada_com_20_chunks
-            ]
-
-        # contextualized_chunks = [task.result() for task in tasks]
-        contextualized_chunks = []
-        for task in tasks:
-            # print("\n\ntask", task)
-            # print("\n\ntask.result()", task.result())
-
-            contextualized_chunks = contextualized_chunks + task.result()
-
-        return contextualized_chunks
-
-    # ORIGINAL
-    # async def create_contextualized_chunk(
-    #     self, chunk, single_page_text, response_auxiliar_summary
-    # ):
-    #     lista_contador.append(0)
-    #     print("contador: ", len(lista_contador))
-    #     page_number = chunk.page_number - 1
-    #     page_content = single_page_text[page_number].page_content
-
-    #     context = await self.llm_generate_context(
-    #         page_content, chunk, response_auxiliar_summary
-    #     )
-    #     print("context: ", context)
-    #     return ContextualizedChunk(
-    #         content=chunk.content,
-    #         page_number=chunk.page_number,
-    #         chunk_id=chunk.chunk_id,
-    #         start_char=chunk.start_char,
-    #         end_char=chunk.end_char,
-    #         context=context,
-    #     )
-
-    async def create_contextualized_chunk(
-        self, chunks: List[DocumentChunk], single_page_text, response_auxiliar_summary
-    ):
-
-        lista_contador.append(0)
-        print("contador: ", len(lista_contador))
-        # all_pages_contents = ""
-        # contador = 1
-        # for chunk in chunks:
-        #     page_number = chunk.page_number - 1
-        #     page_content = single_page_text[page_number].page_content
-
-        #     all_pages_contents += page_content
-        #     contador += 1
-
-        result = await self.llm_generate_context(chunks, response_auxiliar_summary)
-
-        lista_chunks = []
-        for index, chunk in enumerate(chunks):
-            lista_chunks.append(
-                ContextualizedChunk(
-                    contextual_summary=result[index][2],
-                    content=chunk.content,
-                    page_number=chunk.page_number,
-                    id_do_processo=int(result[index][0]),
-                    chunk_id=chunk.chunk_id,
-                    start_char=chunk.start_char,
-                    end_char=chunk.end_char,
-                    context=result[index][1],
-                )
-            )
-
-        return lista_chunks
-
-    # ORIGINAL
-    # async def llm_generate_context(
-    #     self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
-    # ) -> str:
-    #     """Generate contextual description using ChatOpenAI"""
-    #     try:
-    #         print("COMEÇOU A REQUISIÇÃO")
-    #         prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
-    #         # response = await aclaude_answer(
-    #         #     self.claude_client, self.claude_context_model, prompt
-    #         # )
-
-    #         # response = await agpt_answer(prompt)
-    #         llms = LLM()
-    #         response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
-    #         return cast(str, response.content)
-    #     except Exception as e:
-    #         self.logger.error(
-    #             f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
-    #         )
-    #         return ""
 
-    async def llm_generate_context(
-        self, chunks: List[DocumentChunk], resumo_auxiliar  # , page_text: str
+    async def llm_call_uma_lista_de_chunks(
+        self, lista_com_20_chunks: List[DocumentChunk], resumo_auxiliar
     ) -> str:
         """Generate contextual description using ChatOpenAI"""
         contador = 1
         all_chunks_contents = ""
 
-        for chunk in chunks:
+        for chunk in lista_com_20_chunks:
             all_chunks_contents += chunk.content
             all_chunks_contents += f"\n\n CHUNK {contador}:\n"
             contador += 1
@@ -203,7 +55,9 @@ class ContextualRetriever:
         # )
 
         for attempt in range(4):
-            print(f"\n\nTENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt}")
+            print(
+                f"\n\nTENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt}: {all_chunks_contents[0:500]}"
+            )
             raw_response = await agpt_answer(prompt)
             response = cast(str, raw_response)
             # llms = LLM()
@@ -211,7 +65,6 @@
             # return cast(str, response.content)
 
             matches = validate_many_chunks_in_one_request(response)
-            # Convert matches to the desired format
 
             if matches:
                 result = [
@@ -224,62 +77,61 @@ class ContextualRetriever:
             self.logger.error(f"Context generation failed for chunks .... : {str(e)}")
             return ""
 
-    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
-    #     prompt = Prompt().create_prompt_template(
-    #         "", prompt_auxiliar_do_contextual_prompt
-    #     )
-    #     Chain(prompt, ChatOpenAI())
-    #     return
-
+    async def contextualize_uma_lista_de_chunks(
+        self, lista_com_20_chunks: List[DocumentChunk], response_auxiliar_summary
+    ):
+        lista_contador.append(0)
+        print("contador: ", len(lista_contador))
 
-# First function called in this file
-async def contextualize_chunk_based_on_serializer(
-    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
-):
-    if serializer["should_have_contextual_chunks"]:
-        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
-            pages, all_PDFs_chunks
+        result = await self.llm_call_uma_lista_de_chunks(
+            lista_com_20_chunks, response_auxiliar_summary
         )
-        chunks_passados = contextualized_chunks
-        is_contextualized_chunk = True
-    else:
-        chunks_passados = all_PDFs_chunks
-        is_contextualized_chunk = False
-
-    return chunks_passados, is_contextualized_chunk
 
-
-async def get_full_text_and_all_PDFs_chunks(
-    listaPDFs: List[str],
-    splitterObject: Splitter,
-    should_use_llama_parse: bool,
-    isBubble: bool,
-):
-    all_PDFs_chunks = []
-
-    pages: List[str] = []
-
-    # Load and process document
-    for pdf_path in listaPDFs:
-        # if isBubble:
-        #     pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
-        # else:
-        #     if should_use_llama_parse:
-        #         pages = pages + await return_document_list_with_llama_parser(pdf_path)
-        #     else:
-        #         pages = pages + PyPDFLoader(pdf_path).load()
-
-        chunks, pages = await splitterObject.load_and_split_document(
-            pdf_path, should_use_llama_parse, isBubble
-        )
-        all_PDFs_chunks = all_PDFs_chunks + chunks
-        # Get full text for contextualization
-        # loader = PyPDFLoader(pdf_path)
-
-        # full_text = ""
-        # full_text = " ".join([page.page_content for page in pages])
-
-    return all_PDFs_chunks, pages  # , full_text
+        lista_chunks = []
+        for index, chunk in enumerate(lista_com_20_chunks):
+            lista_chunks.append(
+                ContextualizedChunk(
+                    contextual_summary=result[index][2],
+                    content=chunk.content,
+                    page_number=chunk.page_number,
+                    id_do_processo=int(result[index][0]),
+                    chunk_id=chunk.chunk_id,
+                    start_char=chunk.start_char,
+                    end_char=chunk.end_char,
+                    context=result[index][1],
+                )
+            )
+
+        return lista_chunks
+
+    async def contextualize_all_chunks(
+        self,
+        all_PDFs_chunks: List[DocumentChunk],
+        response_auxiliar_summary,
+    ) -> List[ContextualizedChunk]:
+        """Add context to all chunks"""
+
+        lista_de_listas_cada_com_20_chunks = [
+            all_PDFs_chunks[i : i + 20] for i in range(0, len(all_PDFs_chunks), 20)
+        ]
+
+        async with asyncio.TaskGroup() as tg:
+            tasks = [
+                tg.create_task(
+                    self.contextualize_uma_lista_de_chunks(
+                        lista_com_20_chunks,
+                        response_auxiliar_summary,
+                    )
+                )
+                for lista_com_20_chunks in lista_de_listas_cada_com_20_chunks
+            ]
+
+        # contextualized_chunks = [task.result() for task in tasks]
+        contextualized_chunks = []
+        for task in tasks:
+            contextualized_chunks = contextualized_chunks + task.result()
+
+        return contextualized_chunks
 
 
 # The commented-out code below reads the pages surrounding the chunk's current page
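
The rewritten contextualize_all_chunks fans the work out in fixed-size batches instead of issuing one request per chunk. A self-contained sketch of the same pattern (batches of 20, asyncio.TaskGroup, Python 3.11+); process_batch is a hypothetical stand-in for contextualize_uma_lista_de_chunks:

import asyncio
from typing import List

BATCH_SIZE = 20  # same batch size as lista_de_listas_cada_com_20_chunks


async def process_batch(batch: List[str]) -> List[str]:
    # Stands in for contextualize_uma_lista_de_chunks (one LLM request per batch).
    await asyncio.sleep(0)
    return [f"context for {item}" for item in batch]


async def contextualize_all(chunks: List[str]) -> List[str]:
    batches = [chunks[i : i + BATCH_SIZE] for i in range(0, len(chunks), BATCH_SIZE)]

    # TaskGroup waits for every task and cancels the rest if one raises,
    # which is why results are only read after the block exits.
    async with asyncio.TaskGroup() as tg:
        tasks = [tg.create_task(process_batch(batch)) for batch in batches]

    results: List[str] = []
    for task in tasks:
        results.extend(task.result())
    return results


if __name__ == "__main__":
    print(len(asyncio.run(contextualize_all([f"chunk {n}" for n in range(45)]))))

Batching cuts the request count by 20x at the cost of one larger prompt per request, which is why the retry-and-validate loop above checks that each response still contains one entry per chunk.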
_utils/gerar_relatorio_modelo_usuario/utils.py CHANGED
@@ -1,5 +1,12 @@
 from typing import List, Tuple
 from langchain_core.documents import Document
+from langchain_core.messages import HumanMessage
+
+from _utils.splitters.Splitter_class import Splitter
+from _utils.LLMs.LLM_class import LLM
+from _utils.gerar_relatorio_modelo_usuario.prompts import (
+    create_prompt_auxiliar_do_contextual_prompt,
+)
 
 
 def gerar_resposta_compilada(serializer):
@@ -69,3 +76,51 @@ def validate_many_chunks_in_one_request(response: str):
     if len(matches) == 0:
         return False
     return matches_as_list
+
+
+# This function generates the response that will be used in each chunk's request
+async def get_response_from_auxiliar_contextual_prompt(full_text_as_array: List[str]):
+    full_text = ""
+    for x in full_text_as_array:
+        full_text += x
+
+    prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
+
+    print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
+
+    # Claude call commented out because the request exceeded the token limit and raised an error
+    # response_auxiliar_summary = await aclaude_answer(
+    #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
+    # )
+
+    llms = LLM()
+    response_auxiliar_summary = await llms.googleGemini().ainvoke(
+        [HumanMessage(content=prompt_auxiliar_summary)]
+    )
+
+    print(
+        "\n\n\n\nresponse_auxiliar_summary.content[0:500]: ",
+        response_auxiliar_summary.content[0:500],
+    )
+
+    return response_auxiliar_summary.content
+
+
+async def get_full_text_and_all_PDFs_chunks(
+    listaPDFs: List[str],
+    splitterObject: Splitter,
+    should_use_llama_parse: bool,
+    isBubble: bool,
+):
+    all_PDFs_chunks = []
+
+    pages: List[str] = []
+
+    # Load and process document
+    for pdf_path in listaPDFs:
+        chunks, pages = await splitterObject.load_and_split_document(
+            pdf_path, should_use_llama_parse, isBubble
+        )
+        all_PDFs_chunks = all_PDFs_chunks + chunks
+
+    return all_PDFs_chunks, pages
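
validate_many_chunks_in_one_request here pairs with llm_call_uma_lista_de_chunks above: the LLM call is retried up to four times until the response parses. A sketch of that retry-until-parsable loop, where call_llm and parse are hypothetical stand-ins for agpt_answer and the real validator (whose actual expected format differs):

import asyncio
import re
from typing import List, Optional, Tuple


async def call_llm(prompt: str) -> str:
    # Stands in for agpt_answer; the real response may or may not follow
    # the required format, which is what forces the retry loop.
    await asyncio.sleep(0)
    return "1 | some context | some summary"


def parse(response: str) -> Optional[List[Tuple[str, str, str]]]:
    # Illustrative pattern only: extract (process id, context, summary) triples.
    matches = re.findall(r"(\d+) \| (.+?) \| (.+)", response)
    return matches or None


async def generate_with_retries(prompt: str, attempts: int = 4):
    for attempt in range(attempts):
        response = await call_llm(prompt)
        matches = parse(response)
        if matches:
            return matches
        print(f"attempt {attempt}: response did not match the expected format")
    return None  # caller decides how to handle a total failure


if __name__ == "__main__":
    print(asyncio.run(generate_with_retries("contextualize these chunks")))

As a side note, the full_text accumulation loop in get_response_from_auxiliar_contextual_prompt could be written more idiomatically as full_text = "".join(full_text_as_array).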
_utils/resumo_completo_cursor.py CHANGED
@@ -4,10 +4,13 @@ from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
     EnhancedDocumentSummarizer,
 )
 from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
-    contextualize_chunk_based_on_serializer,
+    ContextualRetriever,
+)
+from _utils.gerar_relatorio_modelo_usuario.utils import (
+    gerar_resposta_compilada,
     get_full_text_and_all_PDFs_chunks,
+    get_response_from_auxiliar_contextual_prompt,
 )
-from _utils.gerar_relatorio_modelo_usuario.utils import gerar_resposta_compilada
 from _utils.models.gerar_relatorio import (
     RetrievalConfig,
 )
@@ -51,6 +54,10 @@ async def get_llm_summary_answer_by_cursor_complete(
         chunk_overlap=serializer["chunk_overlap"],
     )
 
+    contextual_retriever = ContextualRetriever(
+        config, serializer["claude_context_model"]
+    )
+
     # Initialize enhanced summarizer
     summarizer = EnhancedDocumentSummarizer(
         config=config,
@@ -59,29 +66,35 @@
         chunk_size=serializer["chunk_size"],
         num_k_rerank=serializer["num_k_rerank"],
         model_cohere_rerank=serializer["model_cohere_rerank"],
-        claude_context_model=serializer["claude_context_model"],
         prompt_auxiliar=serializer["prompt_auxiliar"],
         gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
-        # id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
         prompt_gerar_documento=serializer["prompt_gerar_documento"],
         reciprocal_rank_fusion=reciprocal_rank_fusion,
     )
 
-    allPdfsChunks, pages = await get_full_text_and_all_PDFs_chunks(
+    all_PDFs_chunks, full_text_as_array = await get_full_text_and_all_PDFs_chunks(
         listaPDFs, summarizer.splitter, serializer["should_use_llama_parse"], isBubble
     )
 
-    chunks_passados, is_contextualized_chunk = (
-        await contextualize_chunk_based_on_serializer(
-            serializer, summarizer.contextual_retriever, pages, allPdfsChunks
+    is_contextualized_chunk = serializer["should_have_contextual_chunks"]
+
+    if is_contextualized_chunk:
+        response_auxiliar_summary = await get_response_from_auxiliar_contextual_prompt(
+            full_text_as_array
         )
-    )
+
+        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
+            all_PDFs_chunks, response_auxiliar_summary
+        )
+        chunks_processados = contextualized_chunks
+    else:
+        chunks_processados = all_PDFs_chunks
 
     # Create enhanced vector store and BM25 index
     vector_store, bm25, chunk_ids = (
         summarizer.vector_store.create_enhanced_vector_store(
-            chunks_passados, is_contextualized_chunk
+            chunks_processados, is_contextualized_chunk
         )
     )
 
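Taken together, the new top-level flow is: split the PDFs into chunks, optionally produce one auxiliary summary of the whole document, contextualize the chunks against it, then index. A condensed, self-contained sketch of that control flow with every helper stubbed out:

import asyncio
from typing import List


async def auxiliar_summary(full_text_as_array: List[str]) -> str:
    # Stands in for get_response_from_auxiliar_contextual_prompt (Gemini call).
    return "whole-document summary"


class RetrieverStub:
    # Stands in for ContextualRetriever.contextualize_all_chunks.
    async def contextualize_all_chunks(self, chunks, summary):
        return [f"{chunk} [context from: {summary}]" for chunk in chunks]


async def prepare_chunks(
    chunks: List[str],
    full_text_as_array: List[str],
    should_have_contextual_chunks: bool,
) -> List[str]:
    if should_have_contextual_chunks:
        # One auxiliary summary for the whole document, reused by every batch.
        summary = await auxiliar_summary(full_text_as_array)
        return await RetrieverStub().contextualize_all_chunks(chunks, summary)
    return chunks  # raw chunks go straight to the vector store


if __name__ == "__main__":
    print(asyncio.run(prepare_chunks(["chunk a", "chunk b"], ["page 1"], True)))
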
tests/gerar_relatorio_modelo_usuario/test_contextual_retriever.py ADDED
@@ -0,0 +1,2 @@
+class TestContextualRetriever:
+    pass
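
The added test file is only a stub. A minimal first test it could grow into, assuming the module imports cleanly in the test environment and that __init__ keeps the shape shown above ("claude-3-5-sonnet-latest" is just a placeholder model name):

from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
    ContextualRetriever,
)


class TestContextualRetriever:
    def test_init_reads_claude_key_from_environment(self, monkeypatch):
        # __init__ only stores config, so a placeholder object is enough here.
        monkeypatch.setenv("CLAUDE_API_KEY", "test-key")
        retriever = ContextualRetriever(object(), "claude-3-5-sonnet-latest")
        assert retriever.claude_api_key == "test-key"
        assert retriever.claude_context_model == "claude-3-5-sonnet-latest"
        assert retriever.bm25 is None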