Commit e70ffc1
luanpoppe committed · 1 Parent(s): 8f3dc39
feat: add the option to make requests with Gemini and with DeepSeek, default to not using LlamaParse, and change the contextual step to make fewer requests by handling responses containing several chunks at once
Browse files
- .env.example (+4 -1)
- _utils/LLMs/LLM_class.py (+26 -0)
- _utils/gerar_relatorio_modelo_usuario/contextual_retriever.py (+190 -84)
- _utils/gerar_relatorio_modelo_usuario/prompts.py (+65 -28)
- _utils/handle_files.py (+7 -3)
- _utils/resumo_completo_cursor.py (+1 -1)
- _utils/splitters/Splitter_class.py (+6 -2)
- gerar_documento/serializer.py (+1 -1)
- setup/easy_imports.py (+1 -0)
.env.example
CHANGED
@@ -6,4 +6,7 @@ LANGCHAIN_API_KEY=""
 CLAUDE_API_KEY=""
 COHERE_API_KEY=""
 BUBBLE_TOKEN=""
-
+LLAMA_CLOUD_API_KEY_POPS=""
+LLAMA_CLOUD_API_KEY_PEIXE=""
+DEEPSEEKK_API_KEY=""
+GOOGLE_API_KEY_PEIXE=""
_utils/LLMs/LLM_class.py
CHANGED
@@ -1,4 +1,13 @@
+# from langchain_openai import OpenAI
+from typing import cast
+from openai import OpenAI
+from pydantic import SecretStr
 from setup.environment import default_model
+from setup.easy_imports import ChatOpenAI, ChatGoogleGenerativeAI
+import os
+
+deepseek_api_key = cast(str, os.environ.get("DEEPSEEKK_API_KEY"))
+google_api_key = cast(str, os.environ.get("GOOGLE_API_KEY_PEIXE"))


 class LLM:
@@ -7,3 +16,20 @@ class LLM:

     # def create_GPT_model(self, model=default_model):
     #     return ChatOpen()
+
+    def deepseek(self):
+        return ChatOpenAI(
+            api_key=SecretStr(deepseek_api_key),
+            base_url="https://api.deepseek.com/v1",
+            model="deepseek-chat",
+        )
+
+    def googleGemini(self):
+        return ChatGoogleGenerativeAI(
+            api_key=SecretStr(google_api_key),
+            model="gemini-1.5-flash",
+            temperature=0,
+            max_tokens=None,
+            timeout=None,
+            max_retries=2,
+        )
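Both factory methods return LangChain chat models, so they share the async ainvoke interface the retriever already uses elsewhere in this commit. A minimal usage sketch, assuming DEEPSEEKK_API_KEY and GOOGLE_API_KEY_PEIXE are set in the environment and with a placeholder prompt:

    import asyncio
    from langchain_core.messages import HumanMessage
    from _utils.LLMs.LLM_class import LLM

    async def main():
        llms = LLM()
        # DeepSeek exposes an OpenAI-compatible endpoint, hence ChatOpenAI + base_url.
        deepseek_response = await llms.deepseek().ainvoke([HumanMessage(content="Hello")])
        # Gemini goes through langchain-google-genai instead.
        gemini_response = await llms.googleGemini().ainvoke([HumanMessage(content="Hello")])
        print(deepseek_response.content, gemini_response.content)

    asyncio.run(main())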
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py
CHANGED
@@ -1,5 +1,6 @@
 import os

+from _utils.LLMs.LLM_class import LLM
 from _utils.gerar_relatorio_modelo_usuario.prompts import (
     prompt_auxiliar_do_contextual_prompt,
     create_prompt_auxiliar_do_contextual_prompt,
@@ -11,7 +12,7 @@ from _utils.prompts.Prompt_class import Prompt
 from _utils.splitters.Splitter_class import Splitter
 from setup.easy_imports import PyPDFLoader
 from langchain_openai import ChatOpenAI
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, cast
 from anthropic import Anthropic, AsyncAnthropic
 import logging
 from langchain.schema import Document
@@ -48,101 +49,211 @@ class ContextualRetriever:
         self.bm25 = None
         self.claude_context_model = claude_context_model

-    async def llm_generate_context(
-        self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
-    ) -> str:
-        """Generate contextual description using ChatOpenAI"""
-        try:
-            print("COMEÇOU A REQUISIÇÃO")
-            prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
-            # response = await aclaude_answer(
-            #     self.claude_client, self.claude_context_model, prompt
-            # )
-
-            response = await agpt_answer(prompt)
-            return response
-        except Exception as e:
-            self.logger.error(
-                f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
-            )
-            return ""
-
-    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
-    #     prompt = Prompt().create_prompt_template(
-    #         "", prompt_auxiliar_do_contextual_prompt
-    #     )
-    #     Chain(prompt, ChatOpenAI())
-    #     return
-
-    async def create_contextualized_chunk(
-        self, chunk, single_page_text, response_auxiliar_summary
-    ):
-        lista_contador.append(0)
-        print("contador: ", len(lista_contador))
-        # The commented-out code below reads the pages surrounding the chunk's current page
-        # page_content = ""
-        # for i in range(
-        #     max(0, chunk.page_number - 1),
-        #     min(len(single_page_text), chunk.page_number + 2),
-        # ):
-        #     page_content += single_page_text[i].page_content if single_page_text[i] else ""
-        page_number = chunk.page_number - 1
-        page_content = single_page_text[page_number].page_content
-
-        context = await self.llm_generate_context(
-            page_content, chunk, response_auxiliar_summary
-        )
-        return ContextualizedChunk(
-            content=chunk.content,
-            page_number=chunk.page_number,
-            chunk_id=chunk.chunk_id,
-            start_char=chunk.start_char,
-            end_char=chunk.end_char,
-            context=context,
-        )
-
     async def contextualize_all_chunks(
         self, full_text_as_array: List[Document], chunks: List[DocumentChunk]
     ) -> List[ContextualizedChunk]:
         """Add context to all chunks"""
         contextualized_chunks = []
-        lista_contador = []
         full_text = ""
         for x in full_text_as_array:
             full_text += x.page_content

-        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary)
+        prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
+
+        print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
+
+        # Claude commented out because the request was exceeding the token limit and erroring out
+        # response_auxiliar_summary = await aclaude_answer(
+        #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
+        # )
+
+        llms = LLM()
+        response_auxiliar_summary = await llms.googleGemini().ainvoke(
+            [HumanMessage(content=prompt_auxiliar_summary)]
+        )
+
+        print("\n\n\n\nresponse_auxiliar_summary: ", response_auxiliar_summary.content)
+
+        lista_de_listas_cada_com_20_chunks = [
+            chunks[i : i + 20] for i in range(0, len(chunks), 20)
+        ]
+        print(
+            "lista_de_listas_cada_com_20_chunks: ", lista_de_listas_cada_com_20_chunks
+        )

         async with asyncio.TaskGroup() as tg:
             tasks = [
                 tg.create_task(
                     self.create_contextualized_chunk(
-                        chunk, full_text_as_array, response_auxiliar_summary
+                        chunk, full_text_as_array, response_auxiliar_summary.content
                     )
                 )
-                for chunk in chunks
+                # for chunk in chunks  # ORIGINAL
+                for chunk in lista_de_listas_cada_com_20_chunks
             ]

-        contextualized_chunks = [task.result() for task in tasks]
+        # contextualized_chunks = [task.result() for task in tasks]
+        contextualized_chunks = []
+        for task in tasks:
+            # print("\n\ntask", task)
+            # print("\n\ntask.result()", task.result())
+            contextualized_chunks = contextualized_chunks + task.result()
+
+        print("\n\ncontextualized_chunks", contextualized_chunks)
         return contextualized_chunks

+    # ORIGINAL
+    # async def create_contextualized_chunk(
+    #     self, chunk, single_page_text, response_auxiliar_summary
+    # ):
+    #     lista_contador.append(0)
+    #     print("contador: ", len(lista_contador))
+    #     page_number = chunk.page_number - 1
+    #     page_content = single_page_text[page_number].page_content
+
+    #     context = await self.llm_generate_context(
+    #         page_content, chunk, response_auxiliar_summary
+    #     )
+    #     print("context: ", context)
+    #     return ContextualizedChunk(
+    #         content=chunk.content,
+    #         page_number=chunk.page_number,
+    #         chunk_id=chunk.chunk_id,
+    #         start_char=chunk.start_char,
+    #         end_char=chunk.end_char,
+    #         context=context,
+    #     )
+
+    async def create_contextualized_chunk(
+        self, chunks: List[DocumentChunk], single_page_text, response_auxiliar_summary
+    ):
+
+        lista_contador.append(0)
+        print("contador: ", len(lista_contador))
+        all_pages_contents = ""
+        contador = 1
+        for chunk in chunks:
+            page_number = chunk.page_number - 1
+            page_content = single_page_text[page_number].page_content
+
+            all_pages_contents += page_content
+            contador += 1
+
+        context = await self.llm_generate_context(
+            page_content, chunks, response_auxiliar_summary
+        )
+
+        context = (
+            context.replace("document_id: ", "")
+            .replace("document_id:", "")
+            .replace("DOCUMENT_ID: ", "")
+            .replace("DOCUMENT_ID: ", "")
+        )
+
+        # print("context: ", context)
+        import re
+
+        pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"  # Works when the LLM response does not come with "document_id" written out
+        # pattern = r"\[\s*(?:document_id:\s*)?(\d+)\s*\] --- \[document_title:\s*(.+?)\s*\] --- \[(.+?)\]"
+        matches = re.findall(pattern, context, re.DOTALL)
+
+        # Convert matches to the desired format
+        result = [
+            [int(doc_id), title.strip(), content.strip()]
+            for doc_id, title, content in matches
+        ]
+        # print("\n\nresult", result)
+
+        lista_chunks = []
+        for index, chunk in enumerate(chunks):
+            lista_chunks.append(
+                ContextualizedChunk(
+                    content=chunk.content,
+                    page_number=chunk.page_number,
+                    chunk_id=result[index][0],
+                    start_char=chunk.start_char,
+                    end_char=chunk.end_char,
+                    context=" ".join(result[index][1:2]),
+                )
+            )
+
+        return lista_chunks
+
+    # ORIGINAL
+    # async def llm_generate_context(
+    #     self, page_text: str, chunk: DocumentChunk, resumo_auxiliar
+    # ) -> str:
+    #     """Generate contextual description using ChatOpenAI"""
+    #     try:
+    #         print("COMEÇOU A REQUISIÇÃO")
+    #         prompt = contextual_prompt(page_text, resumo_auxiliar, chunk.content)
+    #         # response = await aclaude_answer(
+    #         #     self.claude_client, self.claude_context_model, prompt
+    #         # )
+
+    #         # response = await agpt_answer(prompt)
+    #         llms = LLM()
+    #         response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+    #         return cast(str, response.content)
+    #     except Exception as e:
+    #         self.logger.error(
+    #             f"Context generation failed for chunk {chunk.chunk_id}: {str(e)}"
+    #         )
+    #         return ""
+
+    async def llm_generate_context(
+        self, page_text: str, chunks: List[DocumentChunk], resumo_auxiliar
+    ) -> str:
+        """Generate contextual description using ChatOpenAI"""
+        contador = 1
+        all_chunks_contents = ""
+
+        for chunk in chunks:
+            all_chunks_contents += chunk.content
+            all_chunks_contents += f"\n\n CHUNK {contador}:\n"
+            contador += 1
+
+        try:
+            print("COMEÇOU A REQUISIÇÃO")
+            prompt = contextual_prompt(page_text, resumo_auxiliar, all_chunks_contents)
+            # response = await aclaude_answer(
+            #     self.claude_client, self.claude_context_model, prompt
+            # )

+            response = await agpt_answer(prompt)
+            # llms = LLM()
+            # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
+            # return cast(str, response.content)
+            return cast(str, response)
+        except Exception as e:
+            self.logger.error(f"Context generation failed for chunks .... : {str(e)}")
+            return ""
+
+    # def gerar_resumo_auxiliar_do_contextual_embedding(self):
+    #     prompt = Prompt().create_prompt_template(
+    #         "", prompt_auxiliar_do_contextual_prompt
+    #     )
+    #     Chain(prompt, ChatOpenAI())
+    #     return
+
+
+# First function called in this file
+async def contextualize_chunk_based_on_serializer(
+    serializer, contextual_retriever: ContextualRetriever, pages, all_PDFs_chunks
+):
+    if serializer["should_have_contextual_chunks"]:
+        contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
+            pages, all_PDFs_chunks
+        )
+        chunks_passados = contextualized_chunks
+        is_contextualized_chunk = True
+    else:
+        chunks_passados = all_PDFs_chunks
+        is_contextualized_chunk = False
+
+    return chunks_passados, is_contextualized_chunk
+

 async def get_full_text_and_all_PDFs_chunks(
     listaPDFs: List[str],
@@ -159,7 +270,9 @@ async def get_full_text_and_all_PDFs_chunks(
         pages = pages + await return_document_list_with_llama_parser(pdf_path)
     else:
         pages = pages + get_pdf_from_bubble(pdf_path)
-    chunks = splitterObject.load_and_split_document(pdf_path, pages)
+    chunks = splitterObject.load_and_split_document(
+        pdf_path, pages, should_use_llama_parse
+    )
     all_PDFs_chunks = all_PDFs_chunks + chunks
     # Get full text for contextualization
     # loader = PyPDFLoader(pdf_path)
@@ -170,17 +283,10 @@ async def get_full_text_and_all_PDFs_chunks(
     return all_PDFs_chunks, pages  # , full_text


-        chunks_passados = contextualized_chunks
-        is_contextualized_chunk = True
-    else:
-        chunks_passados = all_PDFs_chunks
-        is_contextualized_chunk = False
-
-    return chunks_passados, is_contextualized_chunk
+# The commented-out code below reads the pages surrounding the chunk's current page
+# page_content = ""
+# for i in range(
+#     max(0, chunk.page_number - 1),
+#     min(len(single_page_text), chunk.page_number + 2),
+# ):
+#     page_content += single_page_text[i].page_content if single_page_text[i] else ""
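The heart of this change: instead of one LLM request per chunk, chunks are grouped 20 at a time, one request is made per group, and the single response is split back into per-chunk contexts with a regex. A condensed sketch of that round trip, using the exact pattern from the diff but a made-up response string:

    import re

    # Batching as in the diff: one LLM call per group of 20 chunks.
    chunks = list(range(45))  # stand-in for DocumentChunk objects
    batches = [chunks[i : i + 20] for i in range(0, len(chunks), 20)]
    assert [len(b) for b in batches] == [20, 20, 5]

    # Hypothetical LLM response in the <chunk_context> format the new prompt requests.
    response = """
    1. <chunk_context>
    [101] --- Initial Petition --- States the plaintiff's claims.</chunk_context>
    2. <chunk_context>
    [102] --- Expert Report --- Summarizes the expert's findings.</chunk_context>
    """

    pattern = r"\[(\d+)\] --- (.+?) --- (.+?)</chunk_context>"
    matches = re.findall(pattern, response, re.DOTALL)
    result = [
        [int(doc_id), title.strip(), content.strip()]
        for doc_id, title, content in matches
    ]
    # [[101, 'Initial Petition', "States the plaintiff's claims."],
    #  [102, 'Expert Report', "Summarizes the expert's findings."]]

One observation on the new code: `" ".join(result[index][1:2])` stores only the document title as the chunk's context, since the slice `[1:2]` selects a single element; slicing `[1:3]` would also include the brief context paragraph.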
_utils/gerar_relatorio_modelo_usuario/prompts.py
CHANGED
@@ -117,40 +117,77 @@ Formate sua resposta da seguinte maneira:
 </resumo_final>"""


-def contextual_prompt(single_page_text, summary_text, chunk_content):
-    return f"""You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze a chunk of text from a larger document and provide a brief context for it.
-
+# ORIGINAL
+# def contextual_prompt(single_page_text, summary_text, chunk_content):
+#     return f"""You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze a chunk of text from a larger document and provide a brief context for it.
+
+# Here's the summary of the full text of the document:
+# <summary_text>
+# {summary_text}
+# </summary_text>
+
+# Here's the single page where the chunk is situated:
+
+# <single_page>
+# {single_page_text}
+# </single_page>
+
+# And here's the specific chunk to contextualize:
+# <chunk>
+# {chunk_content}
+# </chunk>
+
+# Follow these steps:
+# 1. Identify and quote the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+# 2. Summarize the main topics or themes of the single page and where it fit within the summary of the full text.
+# 3. Identify where the specific chunk fits within these themes.
+# 4. Create a concise context that situates the chunk within the document.
+
+# With this informations, your response should be a single, concise paragraph that includes:
+# - The document ID
+# - The document name
+# - A brief context for the chunk
+
+# Example final output structure (do not copy the content, only the format):
+# <chunk_context>
+# [Single paragraph with document ID, name, and chunk context]
+# </chunk_context>"""
+
+
+def contextual_prompt(all_pages_contents, summary_text, chunk_content):
+    return f"""
+You are an AI assistant specialized in providing context for document retrieval. Your task is to analyze multiple chunks of text from a larger document and provide brief contexts for each of them.
 Here's the summary of the full text of the document:
 <summary_text>
 {summary_text}
 </summary_text>
-
-Here's the single page where the chunk is situated:
-
-<single_page>
-{single_page_text}
-</single_page>
-
-And here's the specific chunk to contextualize:
-<chunk>
-{chunk_content}
-</chunk>
-
-Follow these steps:
-1. Identify and quote the document ID (found between "NUM." and "- Pág") and the document name (from the header).
-2. Summarize the main topics or themes of the single page and where it fit within the summary of the full text.
+Here are the pages where the chunks are situated:
+<page>
+{all_pages_contents}
+</page>
+You will be given 20 specific chunks to contextualize. For each chunk, follow these steps:
+1. Identify the document ID (found between "NUM." and "- Pág") and the document name (from the header).
+2. Summarize the main topics or themes of the single page and how they relate to the summary of the full text.
 3. Identify where the specific chunk fits within these themes.
 4. Create a concise context that situates the chunk within the document.
-
-With this informations, your response should be a single, concise paragraph that includes:
-- The document ID
-- The document name
-- A brief context for the chunk
-
-Example final output structure (do not copy the content, only the format):
-<chunk_context>
-[Single paragraph with document ID, name, and chunk context]
-</chunk_context>"""
+Your final output should be a numbered list of 20 chunk contexts, each containing a single, concise paragraph that includes:
+<final_output>
+[document_id] --- [document_name] --- [brief_context_for_the_chunk]
+</final_output>
+Here are the 20 chunks to analyze:
+<user_input>
+{chunk_content}
+</user_input>
+Example output structure (do not copy the content, only the format):
+1. <chunk_context>
+[document_id] --- [document_title] --- [brief_context_for_the_chunk]
+</chunk_context>
+2. <chunk_context>
+[document_id] --- [document_title] --- [brief_context_for_the_chunk]
+</chunk_context>
+[Continue for all 20 chunks]
+Please provide context for all 20 chunks, following this structure. It's OK for this section to be quite long.
+"""


 # return f"""You are a language model tasked with providing context to improve the retrieval of information from a chunk extracted from a document. Follow these steps internally (do not display reasoning or reflection in the final output):
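For reference, this is roughly what the `chunk_content` argument looks like by the time `contextual_prompt` receives it: `llm_generate_context` concatenates the batch with "CHUNK n" separators. A small sketch with made-up chunk texts:

    # Hypothetical strings standing in for DocumentChunk.content values.
    chunk_texts = ["First chunk of page text...", "Second chunk of page text..."]

    contador = 1
    all_chunks_contents = ""
    for content in chunk_texts:
        all_chunks_contents += content
        all_chunks_contents += f"\n\n CHUNK {contador}:\n"
        contador += 1

    # The result is what gets interpolated into {chunk_content} in the prompt.
    print(all_chunks_contents)

Note that the loop appends each marker after the chunk it numbers, so "CHUNK 1:" actually trails the first chunk's text; the prompt nevertheless asks for 20 numbered contexts back, one per chunk.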
_utils/handle_files.py
CHANGED
@@ -19,7 +19,7 @@ def handle_pdf_files_from_serializer(files):
             temp_file.write(chunk)
         temp_file_path = temp_file.name  # Get the path of the temporary file
         listaPDFs.append(temp_file_path)
-    print("
+    print("\n\nlistaPDFs: ", listaPDFs)
     return listaPDFs


@@ -29,7 +29,7 @@ def remove_pdf_temp_files(listaPDFs):


 async def return_document_list_with_llama_parser(file: str):
-    llama_parser_api = os.getenv("
+    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY_POPS")
     documents: List[LangchainDocument] = []
     if llama_parser_api:
         parser = LlamaParse(
@@ -39,7 +39,11 @@ async def return_document_list_with_llama_parser(file: str):
             verbose=True,
         )

-        parsed_document = await parser.aget_json(file)
+        try:
+            parsed_document = await parser.aget_json(file)
+        except:
+            raise ValueError(f"ALGO DEU ERRADO NO PARSER DO LLAMA PARSE:")
+        print("parsed_document: ", parsed_document)
         for doc in parsed_document[0].get("pages"):  # type: ignore
             # documents.append(doc.to_langchain_format())
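One observation on the new guard: a bare `except:` discards the underlying error (and also catches KeyboardInterrupt), and the f-string carries no interpolation. A narrower variant, a sketch rather than what the commit does, would chain the cause so the LlamaParse failure survives in the traceback:

    try:
        parsed_document = await parser.aget_json(file)
    except Exception as e:  # narrower than a bare except
        # "from e" keeps the original exception attached as __cause__
        raise ValueError(f"ALGO DEU ERRADO NO PARSER DO LLAMA PARSE: {e}") from e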
_utils/resumo_completo_cursor.py
CHANGED
@@ -105,7 +105,7 @@ async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
     for x in structured_summaries:
         texto_completo = texto_completo + x["content"] + "\n"

-    print("\n\ntexto_completo: ", texto_completo)
+    print("\n\ntexto_completo[0: 1000]: ", texto_completo[0:1000])

     return {
         "resultado": structured_summaries,
_utils/splitters/Splitter_class.py
CHANGED
@@ -19,7 +19,7 @@ class Splitter:
         self.chunk_metadata = {}  # Store chunk metadata for tracing

     def load_and_split_document(
-        self, pdf_path: str, pages: List[Document] | None
+        self, pdf_path: str, pages: List[Document] | None, should_use_llama_parse: bool
     ) -> List[DocumentChunk]:
         """Load PDF and split into chunks with metadata"""
         # loader = PyPDFLoader(pdf_path)
@@ -43,10 +43,14 @@ class Splitter:
            )  # Returns the position of the chunk within the whole page
            end_char = start_char + len(chunk)

+           if should_use_llama_parse:
+               somar_pages = 0
+           else:
+               somar_pages = 1
            doc_chunk = DocumentChunk(  # Builds the chunk object with additional information, such as the chunk's position and id
                content=chunk,
                page_number=cast(int, page.metadata.get("page"))
-               + 1,
+               + somar_pages,  # 1-based page numbering
                chunk_id=chunk_id,
                start_char=char_count + start_char,
                end_char=char_count + end_char,
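The offset exists because the two loaders disagree on numbering: PyPDF-style loaders report page metadata starting at 0, while the LlamaParse path here is assumed to already yield 1-based pages. A tiny sketch of the resulting rule:

    def page_offset(should_use_llama_parse: bool) -> int:
        # PyPDF pages are 0-based and need +1; LlamaParse pages are assumed 1-based.
        return 0 if should_use_llama_parse else 1

    assert page_offset(False) == 1  # e.g. PyPDF page 0 becomes page_number 1
    assert page_offset(True) == 0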
gerar_documento/serializer.py
CHANGED
@@ -73,4 +73,4 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
     gpt_temperature = serializers.FloatField(default=0)
     id_modelo_do_usuario = serializers.IntegerField(required=False, default=11)
     should_have_contextual_chunks = serializers.BooleanField(default=False)  # type: ignore
-    should_use_llama_parse = serializers.BooleanField(required=False, default=True)  # type: ignore
+    should_use_llama_parse = serializers.BooleanField(required=False, default=False)  # type: ignore
setup/easy_imports.py
CHANGED
@@ -14,6 +14,7 @@ from langchain.prompts import PromptTemplate
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import Chroma
+from langchain_google_genai import ChatGoogleGenerativeAI

 # from langchain_community.chat_models import ChatOpenAI
 from langchain_openai import ChatOpenAI