luanpoppe commited on
Commit
b374298
·
1 Parent(s): fbd645e

feat: adicionando llama_parse e algumas tipagens

Browse files
example.env → .env.example RENAMED
@@ -5,4 +5,5 @@ HUGGINGFACEHUB_API_TOKEN=""
5
  LANGCHAIN_API_KEY=""
6
  CLAUDE_API_KEY=""
7
  COHERE_API_KEY=""
8
- BUBBLE_TOKEN=""
 
 
5
  LANGCHAIN_API_KEY=""
6
  CLAUDE_API_KEY=""
7
  COHERE_API_KEY=""
8
+ BUBBLE_TOKEN=""
9
+ LLAMA_CLOUD_API_KEY=""
_utils/bubble_integrations/obter_arquivo.py CHANGED
@@ -16,8 +16,8 @@ headers = {"Authorization": f"Bearer {os.environ.get("BUBBLE_TOKEN")}"}
16
 
17
  def get_pdf_from_bubble(
18
  file_url=f"https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
19
- bubble_editor_version="version-test",
20
  ):
 
21
  result = PyPDFLoader(file_url, headers=headers)
22
 
23
- return result
 
16
 
17
  def get_pdf_from_bubble(
18
  file_url=f"https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
 
19
  ):
20
+
21
  result = PyPDFLoader(file_url, headers=headers)
22
 
23
+ return result.load()
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py CHANGED
@@ -6,6 +6,7 @@ from _utils.gerar_relatorio_modelo_usuario.prompts import (
6
  )
7
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
8
  from _utils.chains.Chain_class import Chain
 
9
  from _utils.prompts.Prompt_class import Prompt
10
  from _utils.splitters.Splitter_class import Splitter
11
  from setup.easy_imports import PyPDFLoader
@@ -14,6 +15,7 @@ from typing import List, Dict, Tuple, Optional
14
  from anthropic import Anthropic, AsyncAnthropic
15
  import logging
16
  from langchain.schema import Document
 
17
  import asyncio
18
  from langchain.prompts import PromptTemplate
19
  from typing import List
@@ -142,26 +144,30 @@ class ContextualRetriever:
142
  return contextualized_chunks
143
 
144
 
145
- def get_full_text_and_all_PDFs_chunks(contexto, listaPDFs, splitterObject: Splitter):
 
 
 
 
146
  all_PDFs_chunks = []
147
- full_text = ""
148
- if contexto:
149
- full_text = contexto
150
- chunks = splitterObject.load_and_split_text(full_text)
151
- all_PDFs_chunks = chunks
152
- else:
153
- # Load and process document
154
- for pdf in listaPDFs:
155
- pdf_path = pdf
156
- chunks = splitterObject.load_and_split_document(pdf_path)
157
- all_PDFs_chunks = all_PDFs_chunks + chunks
158
- # Get full text for contextualization
159
- # loader = PyPDFLoader(pdf_path)
160
- loader = get_pdf_from_bubble(pdf_path)
161
- pages = loader.load()
162
- full_text = " ".join([page.page_content for page in pages])
163
-
164
- return full_text, all_PDFs_chunks, pages
165
 
166
 
167
  async def contextualize_chunk_based_on_serializer(
 
6
  )
7
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
8
  from _utils.chains.Chain_class import Chain
9
+ from _utils.handle_files import return_document_list_with_llama_parser
10
  from _utils.prompts.Prompt_class import Prompt
11
  from _utils.splitters.Splitter_class import Splitter
12
  from setup.easy_imports import PyPDFLoader
 
15
  from anthropic import Anthropic, AsyncAnthropic
16
  import logging
17
  from langchain.schema import Document
18
+ from llama_index import Document as Llama_Index_Document
19
  import asyncio
20
  from langchain.prompts import PromptTemplate
21
  from typing import List
 
144
  return contextualized_chunks
145
 
146
 
147
async def get_full_text_and_all_PDFs_chunks(
    listaPDFs: List[str],
    splitterObject: Splitter,
    should_use_llama_parse: bool,
):
    """Load every PDF in ``listaPDFs`` and split each one into chunks.

    Args:
        listaPDFs: Paths/URLs of the PDFs to process, handled in order.
        splitterObject: Splitter whose ``load_and_split_document`` turns the
            pages of one PDF into chunks.
        should_use_llama_parse: When true, pages are obtained from
            ``return_document_list_with_llama_parser``; otherwise from
            ``get_pdf_from_bubble``.

    Returns:
        A tuple ``(all_PDFs_chunks, pages)``: the chunks of all PDFs
        concatenated in input order, and the pages of all PDFs concatenated
        in input order.
    """
    all_PDFs_chunks = []
    pages: List[Document] = []

    for pdf_path in listaPDFs:
        if should_use_llama_parse:
            pdf_pages = await return_document_list_with_llama_parser(pdf_path)
        else:
            pdf_pages = get_pdf_from_bubble(pdf_path)

        # Split only the pages of the current PDF. Handing the running
        # ``pages`` list to the splitter would re-split the pages of every
        # previously processed PDF and duplicate their chunks in the result.
        all_PDFs_chunks.extend(
            splitterObject.load_and_split_document(pdf_path, pdf_pages)
        )
        pages.extend(pdf_pages)

    return all_PDFs_chunks, pages
171
 
172
 
173
  async def contextualize_chunk_based_on_serializer(
_utils/gerar_relatorio_modelo_usuario/llm_calls.py CHANGED
@@ -1,4 +1,6 @@
1
  import os
 
 
2
  from setup.environment import default_model
3
  from langchain_core.messages import HumanMessage
4
  from langchain_openai import ChatOpenAI
@@ -17,14 +19,16 @@ async def aclaude_answer(claude_client, claude_context_model, prompt):
17
 
18
 
19
  async def agpt_answer(prompt):
20
- gpt = ChatOpenAI(
21
- temperature=0,
22
- model="gpt-4o-mini",
23
- api_key=os.environ.get("OPENAI_API_KEY"),
24
- max_retries=5,
25
- )
26
- response = await gpt.ainvoke([HumanMessage(content=prompt)])
27
- return response.content
 
 
28
 
29
 
30
  def gpt_answer(
@@ -34,14 +38,16 @@ def gpt_answer(
34
  max_retries=5,
35
  shouldReturnFullResponse=False,
36
  ):
37
- gpt = ChatOpenAI(
38
- temperature=temperature,
39
- model=model,
40
- api_key=os.environ.get("OPENAI_API_KEY"),
41
- max_retries=max_retries,
42
- )
43
- response = gpt.invoke([HumanMessage(content=prompt)])
44
- if shouldReturnFullResponse:
45
- return response
46
- else:
47
- return response.content
 
 
 
1
  import os
2
+
3
+ from pydantic import SecretStr
4
  from setup.environment import default_model
5
  from langchain_core.messages import HumanMessage
6
  from langchain_openai import ChatOpenAI
 
19
 
20
 
21
  async def agpt_answer(prompt):
22
+ api_key = os.environ.get("OPENAI_API_KEY")
23
+ if api_key:
24
+ gpt = ChatOpenAI(
25
+ temperature=0,
26
+ model="gpt-4o-mini",
27
+ api_key=SecretStr(api_key),
28
+ max_retries=5,
29
+ )
30
+ response = await gpt.ainvoke([HumanMessage(content=prompt)])
31
+ return response.content
32
 
33
 
34
  def gpt_answer(
 
38
  max_retries=5,
39
  shouldReturnFullResponse=False,
40
  ):
41
+ api_key = os.environ.get("OPENAI_API_KEY")
42
+ if api_key:
43
+ gpt = ChatOpenAI(
44
+ temperature=temperature,
45
+ model=model,
46
+ api_key=SecretStr(api_key),
47
+ max_retries=max_retries,
48
+ )
49
+ response = gpt.invoke([HumanMessage(content=prompt)])
50
+ if shouldReturnFullResponse:
51
+ return response
52
+ else:
53
+ return response.content
_utils/handle_files.py CHANGED
@@ -1,6 +1,9 @@
1
  import tempfile, os
2
-
 
 
3
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
 
4
 
5
 
6
  def handle_pdf_files_from_serializer(files):
@@ -19,6 +22,37 @@ def handle_pdf_files_from_serializer(files):
19
  print("listaPDFs: ", listaPDFs)
20
  return listaPDFs
21
 
 
22
  def remove_pdf_temp_files(listaPDFs):
23
  for file in listaPDFs:
24
  os.remove(file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import tempfile, os
2
+ from typing import List
3
+ from langchain_core.documents import Document as LangchainDocument
4
+ from llama_index import Document
5
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
6
+ from llama_parse import LlamaParse, ResultType
7
 
8
 
9
  def handle_pdf_files_from_serializer(files):
 
22
  print("listaPDFs: ", listaPDFs)
23
  return listaPDFs
24
 
25
+
26
  def remove_pdf_temp_files(listaPDFs):
27
  for file in listaPDFs:
28
  os.remove(file)
29
+
30
+
31
async def return_document_list_with_llama_parser(file: str):
    """Parse ``file`` with LlamaParse and return one LangChain document per page.

    Args:
        file: Path of the file handed to ``LlamaParse.aget_json``.

    Returns:
        A list of ``LangchainDocument`` objects, one per parsed page. Each
        document's ``page_content`` is the page's ``"md"`` field (empty string
        when absent) and its metadata holds the page's ``"page"`` field.

    Raises:
        ValueError: If ``LLAMA_CLOUD_API_KEY`` is unset or empty, or if the
            parser returns no result for ``file``.
    """
    llama_parser_api = os.getenv("LLAMA_CLOUD_API_KEY")
    if not llama_parser_api:
        raise ValueError("Não foi possível obter a API_KEY do llama parser")

    parser = LlamaParse(
        api_key=llama_parser_api,
        result_type=ResultType.JSON,  # the per-page "pages" list is read below
        language="pt",
        verbose=True,
    )

    parsed_document = await parser.aget_json(file)
    if not parsed_document:
        # Indexing [0] on an empty result would surface as an opaque
        # IndexError; report the failing file instead.
        raise ValueError(
            f"O llama parser não retornou nenhum resultado para o arquivo: {file}"
        )

    parsed_pages = parsed_document[0].get("pages") or []  # type: ignore

    # NOTE(review): Splitter.load_and_split_document adds 1 to
    # metadata["page"]; if LlamaParse already numbers pages from 1 (TODO
    # confirm), chunk page numbers built from these documents are off by one.
    documents: List[LangchainDocument] = [
        LangchainDocument(
            # A page without markdown must not break document construction.
            page_content=page.get("md") or "",  # type: ignore
            metadata={"page": page.get("page")},  # type: ignore
        )
        for page in parsed_pages
    ]
    return documents
_utils/resumo_completo_cursor.py CHANGED
@@ -38,9 +38,7 @@ os.environ.get("LANGCHAIN_API_KEY")
38
  os.environ["LANGCHAIN_PROJECT"] = "VELLA"
39
 
40
 
41
- async def get_llm_summary_answer_by_cursor_complete(
42
- serializer, listaPDFs=None, contexto=None
43
- ):
44
  """Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
45
  # Configuration
46
  config = RetrievalConfig(
@@ -70,8 +68,8 @@ async def get_llm_summary_answer_by_cursor_complete(
70
  reciprocal_rank_fusion=reciprocal_rank_fusion,
71
  )
72
 
73
- full_text, allPdfsChunks, pages = get_full_text_and_all_PDFs_chunks(
74
- contexto, listaPDFs, summarizer.splitter
75
  )
76
 
77
  chunks_passados, is_contextualized_chunk = (
 
38
  os.environ["LANGCHAIN_PROJECT"] = "VELLA"
39
 
40
 
41
+ async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
 
 
42
  """The "contexto" parameter no longer exists; the PDFs to process always come from "listaPDFs"."""
43
  # Configuration
44
  config = RetrievalConfig(
 
68
  reciprocal_rank_fusion=reciprocal_rank_fusion,
69
  )
70
 
71
+ allPdfsChunks, pages = await get_full_text_and_all_PDFs_chunks(
72
+ listaPDFs, summarizer.splitter, serializer["should_use_llama_parse"]
73
  )
74
 
75
  chunks_passados, is_contextualized_chunk = (
_utils/splitters/Splitter_class.py CHANGED
@@ -1,6 +1,6 @@
1
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
2
  from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
3
- from typing import List, Dict, Tuple, Optional
4
  from _utils.models.gerar_relatorio import (
5
  DocumentChunk,
6
  )
@@ -18,13 +18,15 @@ class Splitter:
18
  )
19
  self.chunk_metadata = {} # Store chunk metadata for tracing
20
 
21
- def load_and_split_document(self, pdf_path: str) -> List[DocumentChunk]:
 
 
22
  """Load PDF and split into chunks with metadata"""
23
  # loader = PyPDFLoader(pdf_path)
24
- loader = get_pdf_from_bubble(pdf_path)
25
- pages = (
26
- loader.load()
27
- ) # Gera uma lista de objetos Document, sendo cada item da lista referente a UMA PÁGINA inteira do PDF.
28
  chunks = []
29
  char_count = 0
30
 
@@ -43,7 +45,8 @@ class Splitter:
43
 
44
  doc_chunk = DocumentChunk( # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
45
  content=chunk,
46
- page_number=page.metadata.get("page") + 1, # 1-based page numbering
 
47
  chunk_id=chunk_id,
48
  start_char=char_count + start_char,
49
  end_char=char_count + end_char,
@@ -71,8 +74,7 @@ class Splitter:
71
  page_chunks = self.text_splitter.split_text(
72
  text
73
  ) # Quebra o item que é um Document de UMA PÁGINA inteira em um lista onde cada item é referente a um chunk, que são pedaços menores do que uma página.
74
- print("\n\n\n")
75
- print("page_chunks: ", page_chunks)
76
 
77
  for chunk in page_chunks:
78
  chunk_id = str(uuid.uuid4())
@@ -83,7 +85,8 @@ class Splitter:
83
 
84
  doc_chunk = DocumentChunk( # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
85
  content=chunk,
86
- page_number=page.metadata.get("page") + 1, # 1-based page numbering
 
87
  chunk_id=chunk_id,
88
  start_char=char_count + start_char,
89
  end_char=char_count + end_char,
 
1
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
2
  from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
3
+ from typing import List, Dict, Tuple, Optional, cast
4
  from _utils.models.gerar_relatorio import (
5
  DocumentChunk,
6
  )
 
18
  )
19
  self.chunk_metadata = {} # Store chunk metadata for tracing
20
 
21
+ def load_and_split_document(
22
+ self, pdf_path: str, pages: List[Document] | None
23
+ ) -> List[DocumentChunk]:
24
  """Load PDF and split into chunks with metadata"""
25
  # loader = PyPDFLoader(pdf_path)
26
+ if not pages:
27
+ pages = get_pdf_from_bubble(
28
+ pdf_path
29
+ ) # Gera uma lista de objetos Document, sendo cada item da lista referente a UMA PÁGINA inteira do PDF.
30
  chunks = []
31
  char_count = 0
32
 
 
45
 
46
  doc_chunk = DocumentChunk( # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
47
  content=chunk,
48
+ page_number=cast(int, page.metadata.get("page"))
49
+ + 1, # 1-based page numbering
50
  chunk_id=chunk_id,
51
  start_char=char_count + start_char,
52
  end_char=char_count + end_char,
 
74
  page_chunks = self.text_splitter.split_text(
75
  text
76
  ) # Quebra o item que é um Document de UMA PÁGINA inteira em um lista onde cada item é referente a um chunk, que são pedaços menores do que uma página.
77
+ print("\n\n\npage_chunks: ", page_chunks)
 
78
 
79
  for chunk in page_chunks:
80
  chunk_id = str(uuid.uuid4())
 
85
 
86
  doc_chunk = DocumentChunk( # Gera o objeto do chunk com informações adicionais, como a posição e id do chunk
87
  content=chunk,
88
+ page_number=cast(int, page.metadata.get("page"))
89
+ + 1, # 1-based page numbering
90
  chunk_id=chunk_id,
91
  start_char=char_count + start_char,
92
  end_char=char_count + end_char,
gerar_documento/serializer.py CHANGED
@@ -44,7 +44,7 @@ class GerarDocumentoSerializer(ResumoCursorSerializer):
44
  )
45
  gpt_temperature = serializers.FloatField(default=0)
46
  id_modelo_do_usuario = serializers.IntegerField(required=False)
47
- should_have_contextual_chunks = serializers.BooleanField(default=False)
48
 
49
 
50
  class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
@@ -71,4 +71,5 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
71
  )
72
  gpt_temperature = serializers.FloatField(default=0)
73
  id_modelo_do_usuario = serializers.IntegerField(required=False, default=11)
74
- should_have_contextual_chunks = serializers.BooleanField(default=False)
 
 
44
  )
45
  gpt_temperature = serializers.FloatField(default=0)
46
  id_modelo_do_usuario = serializers.IntegerField(required=False)
47
+ should_have_contextual_chunks = serializers.BooleanField(default=False) # type: ignore
48
 
49
 
50
  class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
 
71
  )
72
  gpt_temperature = serializers.FloatField(default=0)
73
  id_modelo_do_usuario = serializers.IntegerField(required=False, default=11)
74
+ should_have_contextual_chunks = serializers.BooleanField(default=False) # type: ignore
75
+ should_use_llama_parse = serializers.BooleanField(required=False, default=True) # type: ignore
gerar_documento/views.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from setup.easy_imports import (
2
  Response,
3
  AsyncAPIView,
@@ -27,7 +28,9 @@ class GerarDocumentoView(AsyncAPIView):
27
  print(f"\n\nDATA E HORA DA REQUISIÇÃO: {datetime.now()}")
28
  serializer = GerarDocumentoSerializer(data=request.data)
29
  if serializer.is_valid(raise_exception=True):
30
- data = serializer.validated_data
 
 
31
  print("\n\ndata: ", data)
32
 
33
  data["prompt_auxiliar"] = (
@@ -59,7 +62,7 @@ class GerarDocumentoComPDFProprioView(AsyncAPIView):
59
  print(f"\n\nDATA E HORA DA REQUISIÇÃO: {datetime.now()}")
60
  serializer = GerarDocumentoComPDFProprioSerializer(data=request.data)
61
  if serializer.is_valid(raise_exception=True):
62
- data = serializer.validated_data
63
  print("\n\ndata: ", data)
64
 
65
  listaPDFs = handle_pdf_files_from_serializer(data["files"])
 
1
+ from typing import Any, Dict, cast
2
  from setup.easy_imports import (
3
  Response,
4
  AsyncAPIView,
 
28
  print(f"\n\nDATA E HORA DA REQUISIÇÃO: {datetime.now()}")
29
  serializer = GerarDocumentoSerializer(data=request.data)
30
  if serializer.is_valid(raise_exception=True):
31
if not serializer.validated_data:
    # Reject only an empty validated payload. A non-empty payload must fall
    # through to the processing below; raising on truthy data would abort
    # every request that passed validation.
    raise ValueError("Erro no validated_data")
33
+ data = cast(Dict[str, Any], serializer.validated_data)
34
  print("\n\ndata: ", data)
35
 
36
  data["prompt_auxiliar"] = (
 
62
  print(f"\n\nDATA E HORA DA REQUISIÇÃO: {datetime.now()}")
63
  serializer = GerarDocumentoComPDFProprioSerializer(data=request.data)
64
  if serializer.is_valid(raise_exception=True):
65
+ data = cast(Dict[str, Any], serializer.validated_data)
66
  print("\n\ndata: ", data)
67
 
68
  listaPDFs = handle_pdf_files_from_serializer(data["files"])