Spaces:
Sleeping
Sleeping
luanpoppe
committed on
Commit
·
b374298
1
Parent(s):
fbd645e
feat: adicionando llama_parse e algumas tipagens
Browse files
- example.env → .env.example +2 -1
- _utils/bubble_integrations/obter_arquivo.py +2 -2
- _utils/gerar_relatorio_modelo_usuario/contextual_retriever.py +25 -19
- _utils/gerar_relatorio_modelo_usuario/llm_calls.py +25 -19
- _utils/handle_files.py +35 -1
- _utils/resumo_completo_cursor.py +3 -5
- _utils/splitters/Splitter_class.py +13 -10
- gerar_documento/serializer.py +3 -2
- gerar_documento/views.py +5 -2
example.env → .env.example
RENAMED
@@ -5,4 +5,5 @@ HUGGINGFACEHUB_API_TOKEN=""
|
|
5 |
LANGCHAIN_API_KEY=""
|
6 |
CLAUDE_API_KEY=""
|
7 |
COHERE_API_KEY=""
|
8 |
-
BUBBLE_TOKEN=""
|
|
|
|
5 |
LANGCHAIN_API_KEY=""
|
6 |
CLAUDE_API_KEY=""
|
7 |
COHERE_API_KEY=""
|
8 |
+
BUBBLE_TOKEN=""
|
9 |
+
LLAMA_CLOUD_API_KEY=""
|
_utils/bubble_integrations/obter_arquivo.py
CHANGED
@@ -16,8 +16,8 @@ headers = {"Authorization": f"Bearer {os.environ.get("BUBBLE_TOKEN")}"}
|
|
16 |
|
17 |
def get_pdf_from_bubble(
|
18 |
file_url=f"https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
|
19 |
-
bubble_editor_version="version-test",
|
20 |
):
|
|
|
21 |
result = PyPDFLoader(file_url, headers=headers)
|
22 |
|
23 |
-
return result
|
|
|
16 |
|
17 |
def get_pdf_from_bubble(
|
18 |
file_url=f"https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
|
|
|
19 |
):
|
20 |
+
|
21 |
result = PyPDFLoader(file_url, headers=headers)
|
22 |
|
23 |
+
return result.load()
|
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py
CHANGED
@@ -6,6 +6,7 @@ from _utils.gerar_relatorio_modelo_usuario.prompts import (
|
|
6 |
)
|
7 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
|
8 |
from _utils.chains.Chain_class import Chain
|
|
|
9 |
from _utils.prompts.Prompt_class import Prompt
|
10 |
from _utils.splitters.Splitter_class import Splitter
|
11 |
from setup.easy_imports import PyPDFLoader
|
@@ -14,6 +15,7 @@ from typing import List, Dict, Tuple, Optional
|
|
14 |
from anthropic import Anthropic, AsyncAnthropic
|
15 |
import logging
|
16 |
from langchain.schema import Document
|
|
|
17 |
import asyncio
|
18 |
from langchain.prompts import PromptTemplate
|
19 |
from typing import List
|
@@ -142,26 +144,30 @@ class ContextualRetriever:
|
|
142 |
return contextualized_chunks
|
143 |
|
144 |
|
145 |
-
def get_full_text_and_all_PDFs_chunks(
|
|
|
|
|
|
|
|
|
146 |
all_PDFs_chunks = []
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
return
|
165 |
|
166 |
|
167 |
async def contextualize_chunk_based_on_serializer(
|
|
|
6 |
)
|
7 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
|
8 |
from _utils.chains.Chain_class import Chain
|
9 |
+
from _utils.handle_files import return_document_list_with_llama_parser
|
10 |
from _utils.prompts.Prompt_class import Prompt
|
11 |
from _utils.splitters.Splitter_class import Splitter
|
12 |
from setup.easy_imports import PyPDFLoader
|
|
|
15 |
from anthropic import Anthropic, AsyncAnthropic
|
16 |
import logging
|
17 |
from langchain.schema import Document
|
18 |
+
from llama_index import Document as Llama_Index_Document
|
19 |
import asyncio
|
20 |
from langchain.prompts import PromptTemplate
|
21 |
from typing import List
|
|
|
144 |
return contextualized_chunks
|
145 |
|
146 |
|
147 |
+
async def get_full_text_and_all_PDFs_chunks(
|
148 |
+
listaPDFs: List[str],
|
149 |
+
splitterObject: Splitter,
|
150 |
+
should_use_llama_parse: bool,
|
151 |
+
):
|
152 |
all_PDFs_chunks = []
|
153 |
+
|
154 |
+
pages: List[Document] = []
|
155 |
+
|
156 |
+
# Load and process document
|
157 |
+
for pdf_path in listaPDFs:
|
158 |
+
if should_use_llama_parse:
|
159 |
+
pages = pages + await return_document_list_with_llama_parser(pdf_path)
|
160 |
+
else:
|
161 |
+
pages = pages + get_pdf_from_bubble(pdf_path)
|
162 |
+
chunks = splitterObject.load_and_split_document(pdf_path, pages)
|
163 |
+
all_PDFs_chunks = all_PDFs_chunks + chunks
|
164 |
+
# Get full text for contextualization
|
165 |
+
# loader = PyPDFLoader(pdf_path)
|
166 |
+
|
167 |
+
# full_text = ""
|
168 |
+
# full_text = " ".join([page.page_content for page in pages])
|
169 |
+
|
170 |
+
return all_PDFs_chunks, pages # , full_text
|
171 |
|
172 |
|
173 |
async def contextualize_chunk_based_on_serializer(
|
_utils/gerar_relatorio_modelo_usuario/llm_calls.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
import os
|
|
|
|
|
2 |
from setup.environment import default_model
|
3 |
from langchain_core.messages import HumanMessage
|
4 |
from langchain_openai import ChatOpenAI
|
@@ -17,14 +19,16 @@ async def aclaude_answer(claude_client, claude_context_model, prompt):
|
|
17 |
|
18 |
|
19 |
async def agpt_answer(prompt):
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
28 |
|
29 |
|
30 |
def gpt_answer(
|
@@ -34,14 +38,16 @@ def gpt_answer(
|
|
34 |
max_retries=5,
|
35 |
shouldReturnFullResponse=False,
|
36 |
):
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
1 |
import os
|
2 |
+
|
3 |
+
from pydantic import SecretStr
|
4 |
from setup.environment import default_model
|
5 |
from langchain_core.messages import HumanMessage
|
6 |
from langchain_openai import ChatOpenAI
|
|
|
19 |
|
20 |
|
21 |
async def agpt_answer(prompt):
|
22 |
+
api_key = os.environ.get("OPENAI_API_KEY")
|
23 |
+
if api_key:
|
24 |
+
gpt = ChatOpenAI(
|
25 |
+
temperature=0,
|
26 |
+
model="gpt-4o-mini",
|
27 |
+
api_key=SecretStr(api_key),
|
28 |
+
max_retries=5,
|
29 |
+
)
|
30 |
+
response = await gpt.ainvoke([HumanMessage(content=prompt)])
|
31 |
+
return response.content
|
32 |
|
33 |
|
34 |
def gpt_answer(
|
|
|
38 |
max_retries=5,
|
39 |
shouldReturnFullResponse=False,
|
40 |
):
|
41 |
+
api_key = os.environ.get("OPENAI_API_KEY")
|
42 |
+
if api_key:
|
43 |
+
gpt = ChatOpenAI(
|
44 |
+
temperature=temperature,
|
45 |
+
model=model,
|
46 |
+
api_key=SecretStr(api_key),
|
47 |
+
max_retries=max_retries,
|
48 |
+
)
|
49 |
+
response = gpt.invoke([HumanMessage(content=prompt)])
|
50 |
+
if shouldReturnFullResponse:
|
51 |
+
return response
|
52 |
+
else:
|
53 |
+
return response.content
|
_utils/handle_files.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1 |
import tempfile, os
|
2 |
-
|
|
|
|
|
3 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
|
|
|
4 |
|
5 |
|
6 |
def handle_pdf_files_from_serializer(files):
|
@@ -19,6 +22,37 @@ def handle_pdf_files_from_serializer(files):
|
|
19 |
print("listaPDFs: ", listaPDFs)
|
20 |
return listaPDFs
|
21 |
|
|
|
22 |
def remove_pdf_temp_files(listaPDFs):
|
23 |
for file in listaPDFs:
|
24 |
os.remove(file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import tempfile, os
|
2 |
+
from typing import List
|
3 |
+
from langchain_core.documents import Document as LangchainDocument
|
4 |
+
from llama_index import Document
|
5 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
|
6 |
+
from llama_parse import LlamaParse, ResultType
|
7 |
|
8 |
|
9 |
def handle_pdf_files_from_serializer(files):
|
|
|
22 |
print("listaPDFs: ", listaPDFs)
|
23 |
return listaPDFs
|
24 |
|
25 |
+
|
26 |
def remove_pdf_temp_files(listaPDFs):
|
27 |
for file in listaPDFs:
|
28 |
os.remove(file)
|
29 |
+
|
30 |
+
|
31 |
+
async def return_document_list_with_llama_parser(file: str):
|
32 |
+
llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY")
|
33 |
+
documents: List[LangchainDocument] = []
|
34 |
+
if llama_parser_api:
|
35 |
+
parser = LlamaParse(
|
36 |
+
api_key=llama_parser_api,
|
37 |
+
result_type=ResultType.JSON, # Options: 'text', 'markdown', 'json', 'structured'
|
38 |
+
language="pt",
|
39 |
+
verbose=True,
|
40 |
+
)
|
41 |
+
|
42 |
+
parsed_document = await parser.aget_json(file)
|
43 |
+
for doc in parsed_document[0].get("pages"): # type: ignore
|
44 |
+
# documents.append(doc.to_langchain_format())
|
45 |
+
|
46 |
+
langchain_document = LangchainDocument(
|
47 |
+
page_content=doc.get("md"), # type: ignore
|
48 |
+
metadata={
|
49 |
+
"page": doc.get("page"), # type: ignore
|
50 |
+
# **doc.get("metadata", {}), # type: ignore
|
51 |
+
}, # Include page number in metadata
|
52 |
+
)
|
53 |
+
|
54 |
+
documents.append(langchain_document)
|
55 |
+
|
56 |
+
return documents
|
57 |
+
else:
|
58 |
+
raise ValueError("Não foi possível obter a API_KEY do llama parser")
|
_utils/resumo_completo_cursor.py
CHANGED
@@ -38,9 +38,7 @@ os.environ.get("LANGCHAIN_API_KEY")
|
|
38 |
os.environ["LANGCHAIN_PROJECT"] = "VELLA"
|
39 |
|
40 |
|
41 |
-
async def get_llm_summary_answer_by_cursor_complete(
|
42 |
-
serializer, listaPDFs=None, contexto=None
|
43 |
-
):
|
44 |
"""Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
|
45 |
# Configuration
|
46 |
config = RetrievalConfig(
|
@@ -70,8 +68,8 @@ async def get_llm_summary_answer_by_cursor_complete(
|
|
70 |
reciprocal_rank_fusion=reciprocal_rank_fusion,
|
71 |
)
|
72 |
|
73 |
-
|
74 |
-
|
75 |
)
|
76 |
|
77 |
chunks_passados, is_contextualized_chunk = (
|
|
|
38 |
os.environ["LANGCHAIN_PROJECT"] = "VELLA"
|
39 |
|
40 |
|
41 |
+
async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
|
|
|
|
|
42 |
"""Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
|
43 |
# Configuration
|
44 |
config = RetrievalConfig(
|
|
|
68 |
reciprocal_rank_fusion=reciprocal_rank_fusion,
|
69 |
)
|
70 |
|
71 |
+
allPdfsChunks, pages = await get_full_text_and_all_PDFs_chunks(
|
72 |
+
listaPDFs, summarizer.splitter, serializer["should_use_llama_parse"]
|
73 |
)
|
74 |
|
75 |
chunks_passados, is_contextualized_chunk = (
|
_utils/splitters/Splitter_class.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
|
2 |
from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
|
3 |
-
from typing import List, Dict, Tuple, Optional
|
4 |
from _utils.models.gerar_relatorio import (
|
5 |
DocumentChunk,
|
6 |
)
|
@@ -18,13 +18,15 @@ class Splitter:
|
|
18 |
)
|
19 |
self.chunk_metadata = {} # Store chunk metadata for tracing
|
20 |
|
21 |
-
def load_and_split_document(
|
|
|
|
|
22 |
"""Load PDF and split into chunks with metadata"""
|
23 |
# loader = PyPDFLoader(pdf_path)
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
chunks = []
|
29 |
char_count = 0
|
30 |
|
@@ -43,7 +45,8 @@ class Splitter:
|
|
43 |
|
44 |
doc_chunk = DocumentChunk( # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
|
45 |
content=chunk,
|
46 |
-
page_number=page.metadata.get("page")
|
|
|
47 |
chunk_id=chunk_id,
|
48 |
start_char=char_count + start_char,
|
49 |
end_char=char_count + end_char,
|
@@ -71,8 +74,7 @@ class Splitter:
|
|
71 |
page_chunks = self.text_splitter.split_text(
|
72 |
text
|
73 |
) # Quebra o item que é um Document de UMA PÁGINA inteira em um lista onde cada item é referente a um chunk, que são pedaços menores do que uma página.
|
74 |
-
print("\n\n\
|
75 |
-
print("page_chunks: ", page_chunks)
|
76 |
|
77 |
for chunk in page_chunks:
|
78 |
chunk_id = str(uuid.uuid4())
|
@@ -83,7 +85,8 @@ class Splitter:
|
|
83 |
|
84 |
doc_chunk = DocumentChunk( # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
|
85 |
content=chunk,
|
86 |
-
page_number=page.metadata.get("page")
|
|
|
87 |
chunk_id=chunk_id,
|
88 |
start_char=char_count + start_char,
|
89 |
end_char=char_count + end_char,
|
|
|
1 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
|
2 |
from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
|
3 |
+
from typing import List, Dict, Tuple, Optional, cast
|
4 |
from _utils.models.gerar_relatorio import (
|
5 |
DocumentChunk,
|
6 |
)
|
|
|
18 |
)
|
19 |
self.chunk_metadata = {} # Store chunk metadata for tracing
|
20 |
|
21 |
+
def load_and_split_document(
|
22 |
+
self, pdf_path: str, pages: List[Document] | None
|
23 |
+
) -> List[DocumentChunk]:
|
24 |
"""Load PDF and split into chunks with metadata"""
|
25 |
# loader = PyPDFLoader(pdf_path)
|
26 |
+
if not pages:
|
27 |
+
pages = get_pdf_from_bubble(
|
28 |
+
pdf_path
|
29 |
+
) # Gera uma lista de objetos Document, sendo cada item da lista referente a UMA PÁGINA inteira do PDF.
|
30 |
chunks = []
|
31 |
char_count = 0
|
32 |
|
|
|
45 |
|
46 |
doc_chunk = DocumentChunk( # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
|
47 |
content=chunk,
|
48 |
+
page_number=cast(int, page.metadata.get("page"))
|
49 |
+
+ 1, # 1-based page numbering
|
50 |
chunk_id=chunk_id,
|
51 |
start_char=char_count + start_char,
|
52 |
end_char=char_count + end_char,
|
|
|
74 |
page_chunks = self.text_splitter.split_text(
|
75 |
text
|
76 |
) # Quebra o item que é um Document de UMA PÁGINA inteira em um lista onde cada item é referente a um chunk, que são pedaços menores do que uma página.
|
77 |
+
print("\n\n\npage_chunks: ", page_chunks)
|
|
|
78 |
|
79 |
for chunk in page_chunks:
|
80 |
chunk_id = str(uuid.uuid4())
|
|
|
85 |
|
86 |
doc_chunk = DocumentChunk( # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
|
87 |
content=chunk,
|
88 |
+
page_number=cast(int, page.metadata.get("page"))
|
89 |
+
+ 1, # 1-based page numbering
|
90 |
chunk_id=chunk_id,
|
91 |
start_char=char_count + start_char,
|
92 |
end_char=char_count + end_char,
|
gerar_documento/serializer.py
CHANGED
@@ -44,7 +44,7 @@ class GerarDocumentoSerializer(ResumoCursorSerializer):
|
|
44 |
)
|
45 |
gpt_temperature = serializers.FloatField(default=0)
|
46 |
id_modelo_do_usuario = serializers.IntegerField(required=False)
|
47 |
-
should_have_contextual_chunks = serializers.BooleanField(default=False)
|
48 |
|
49 |
|
50 |
class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
|
@@ -71,4 +71,5 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
|
|
71 |
)
|
72 |
gpt_temperature = serializers.FloatField(default=0)
|
73 |
id_modelo_do_usuario = serializers.IntegerField(required=False, default=11)
|
74 |
-
should_have_contextual_chunks = serializers.BooleanField(default=False)
|
|
|
|
44 |
)
|
45 |
gpt_temperature = serializers.FloatField(default=0)
|
46 |
id_modelo_do_usuario = serializers.IntegerField(required=False)
|
47 |
+
should_have_contextual_chunks = serializers.BooleanField(default=False) # type: ignore
|
48 |
|
49 |
|
50 |
class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
|
|
|
71 |
)
|
72 |
gpt_temperature = serializers.FloatField(default=0)
|
73 |
id_modelo_do_usuario = serializers.IntegerField(required=False, default=11)
|
74 |
+
should_have_contextual_chunks = serializers.BooleanField(default=False) # type: ignore
|
75 |
+
should_use_llama_parse = serializers.BooleanField(required=False, default=True) # type: ignore
|
gerar_documento/views.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
from setup.easy_imports import (
|
2 |
Response,
|
3 |
AsyncAPIView,
|
@@ -27,7 +28,9 @@ class GerarDocumentoView(AsyncAPIView):
|
|
27 |
print(f"\n\nDATA E HORA DA REQUISIÇÃO: {datetime.now()}")
|
28 |
serializer = GerarDocumentoSerializer(data=request.data)
|
29 |
if serializer.is_valid(raise_exception=True):
|
30 |
-
|
|
|
|
|
31 |
print("\n\ndata: ", data)
|
32 |
|
33 |
data["prompt_auxiliar"] = (
|
@@ -59,7 +62,7 @@ class GerarDocumentoComPDFProprioView(AsyncAPIView):
|
|
59 |
print(f"\n\nDATA E HORA DA REQUISIÇÃO: {datetime.now()}")
|
60 |
serializer = GerarDocumentoComPDFProprioSerializer(data=request.data)
|
61 |
if serializer.is_valid(raise_exception=True):
|
62 |
-
data = serializer.validated_data
|
63 |
print("\n\ndata: ", data)
|
64 |
|
65 |
listaPDFs = handle_pdf_files_from_serializer(data["files"])
|
|
|
1 |
+
from typing import Any, Dict, cast
|
2 |
from setup.easy_imports import (
|
3 |
Response,
|
4 |
AsyncAPIView,
|
|
|
28 |
print(f"\n\nDATA E HORA DA REQUISIÇÃO: {datetime.now()}")
|
29 |
serializer = GerarDocumentoSerializer(data=request.data)
|
30 |
if serializer.is_valid(raise_exception=True):
|
31 |
+
if serializer.validated_data:
|
32 |
+
raise ValueError("Erro no validated_data")
|
33 |
+
data = cast(Dict[str, Any], serializer.validated_data)
|
34 |
print("\n\ndata: ", data)
|
35 |
|
36 |
data["prompt_auxiliar"] = (
|
|
|
62 |
print(f"\n\nDATA E HORA DA REQUISIÇÃO: {datetime.now()}")
|
63 |
serializer = GerarDocumentoComPDFProprioSerializer(data=request.data)
|
64 |
if serializer.is_valid(raise_exception=True):
|
65 |
+
data = cast(Dict[str, Any], serializer.validated_data)
|
66 |
print("\n\ndata: ", data)
|
67 |
|
68 |
listaPDFs = handle_pdf_files_from_serializer(data["files"])
|