luanpoppe committed · Commit aae4d3d · 1 Parent(s): 5fde427

feat: improving the gerar_documento refactor
_utils/gerar_documento.py CHANGED

@@ -1,30 +1,11 @@
 import os
-from
-from typing import Any, Union, cast
-from _utils.Utils_Class import UtilsClass
-from _utils.axiom_logs import AxiomLogs
-from _utils.langchain_utils.LLM_class import LLM
-from _utils.bubble_integrations.enviar_resposta_final import enviar_resposta_final
+from typing import Any, Union
 from _utils.custom_exception_handler import custom_exception_handler_without_api_handler
 from rest_framework.response import Response
 from _utils.gerar_documento_utils.GerarDocumento import (
     GerarDocumento,
 )
-from _utils.gerar_documento_utils.contextual_retriever import (
-    ContextualRetriever,
-)
-from _utils.gerar_documento_utils.utils import (
-    generate_document_title,
-    gerar_resposta_compilada,
-    get_response_from_auxiliar_contextual_prompt,
-)
-from _utils.models.gerar_documento import (
-    RetrievalConfig,
-)
-import markdown
 
-from _utils.langchain_utils.Prompt_class import Prompt
-from _utils.utils import convert_markdown_to_HTML
 from gerar_documento.serializer import (
     GerarDocumentoComPDFProprioSerializer,
     GerarDocumentoComPDFProprioSerializerData,

@@ -48,20 +29,12 @@ async def gerar_documento(
     isBubble=False,
 ):
     try:
-        axiom = axiom_instance.send_axiom
-        ax = AxiomLogs(axiom_instance)
-        utils = UtilsClass()
         summarizer = GerarDocumento(serializer, isBubble, axiom_instance)
+        summarizer.lista_pdfs = listaPDFs
 
-
-
-        is_contextualized_chunk = serializer.should_have_contextual_chunks
+        await summarizer.get_text_and_pdf_chunks()
 
-        response_auxiliar_summary = await get_response_from_auxiliar_contextual_prompt(
-            full_text_as_array
-        )
-        summarizer.resumo_auxiliar = response_auxiliar_summary
-        ax.resumo_inicial_processo(response_auxiliar_summary)
+        await summarizer.get_response_from_auxiliar_contextual_prompt()
 
         await summarizer.generate_chunks_processados()

@@ -85,7 +58,7 @@ async def gerar_documento(
             "texto_completo": summarizer.texto_completo_como_html,
             "titulo_do_documento": summarizer.titulo_do_documento,
             "resultado": structured_summaries,
-            "parametros-utilizados": gerar_resposta_compilada(serializer),
+            "parametros-utilizados": summarizer.gerar_resposta_compilada(),
         }
     except Exception as e:
         custom_exception_handler_without_api_handler(e, serializer, axiom_instance)
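Net effect of this file's change: gerar_documento no longer hand-wires helpers (AxiomLogs, UtilsClass, the free get_response_from_auxiliar_contextual_prompt) and instead drives a sequence of GerarDocumento methods. A minimal sketch of the resulting flow, assuming the view plumbing around it stays the same; every name below comes from the diff, and the function name itself is just illustrative:

# Assumed sketch of the refactored pipeline, not the exact file contents.
from _utils.gerar_documento_utils.GerarDocumento import GerarDocumento

async def gerar_documento_sketch(serializer, listaPDFs, axiom_instance, isBubble=False):
    summarizer = GerarDocumento(serializer, isBubble, axiom_instance)
    summarizer.lista_pdfs = listaPDFs  # state the class now owns

    await summarizer.get_text_and_pdf_chunks()                       # extract text and PDF chunks
    await summarizer.get_response_from_auxiliar_contextual_prompt()  # auxiliary summary
    await summarizer.generate_chunks_processados()                   # processed chunks

    return {
        "texto_completo": summarizer.texto_completo_como_html,
        "titulo_do_documento": summarizer.titulo_do_documento,
        "parametros-utilizados": summarizer.gerar_resposta_compilada(),
    }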
_utils/gerar_documento_utils/GerarDocumento.py CHANGED

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-import
+from langchain_core.messages import HumanMessage
 from typing import Any, List, Dict, Literal, Tuple, Optional, Union, cast
 
 from pydantic import SecretStr

@@ -9,6 +9,7 @@ from _utils.bubble_integrations.enviar_resposta_final import enviar_resposta_fin
 from _utils.gerar_documento_utils.contextual_retriever import ContextualRetriever
 from _utils.gerar_documento_utils.llm_calls import agemini_answer
 from _utils.gerar_documento_utils.prompts import (
+    create_prompt_auxiliar_do_contextual_prompt,
     prompt_gerar_query_dinamicamente,
     prompt_para_gerar_titulo,
 )

@@ -40,6 +41,7 @@ from _utils.langchain_utils.Splitter_class import Splitter
 import time
 from setup.tokens import openai_api_key, cohere_api_key
 from setup.logging import Axiom
+import tiktoken
 
 
 def reciprocal_rank_fusion(result_lists, weights=None):

@@ -124,6 +126,10 @@ class GerarDocumento:
     structured_output: List[Any]
     texto_completo_como_html: str
     titulo_do_documento: str
+    encoding_tiktoken = tiktoken.get_encoding("cl100k_base")
+    serializer: Union[
+        GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+    ]
 
     def __init__(
         self,

@@ -133,6 +139,7 @@ class GerarDocumento:
         isBubble: bool,
         axiom_instance: Axiom,
     ):
+        self.serializer = serializer
         self.config = self.gerar_documento_utils.create_retrieval_config(serializer)
         self.logger = logging.getLogger(__name__)
         # self.prompt_auxiliar = prompt_auxiliar

@@ -188,8 +195,10 @@ class GerarDocumento:
             else self.all_PDFs_chunks
         )
         self.chunks_processados = chunks_processados
-        self.
-
+        if len(self.chunks_processados) == 0:
+            self.chunks_processados = self.all_PDFs_chunks
+        self.ax.chunks_inicialmente(self.chunks_processados)
+        return self.chunks_processados
 
     async def generate_query_for_vector_store(self):
         prompt_para_gerar_query_dinamico = prompt_gerar_query_dinamicamente(

@@ -542,3 +551,77 @@ class GerarDocumento:
         self.axiom_instance.send_axiom(f"RESULTADO ETAPA 3: {documento_gerado}")
 
         return texto_final_juntando_as_etapas
+
+    # This function generates the response that will be used in each chunk's request
+    async def get_response_from_auxiliar_contextual_prompt(self):
+        llms = LLM()
+        responses = []
+
+        current_chunk = []
+        current_token_count = 0
+        chunk_counter = 1
+
+        for part in self.full_text_as_array:
+            part_tokens = len(self.encoding_tiktoken.encode(part))
+
+            # Check if adding this part would EXCEED the limit
+            if current_token_count + part_tokens > 600000:
+                # Process the accumulated chunk before it exceeds the limit
+                chunk_text = "".join(current_chunk)
+                print(
+                    f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
+                )
+
+                prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+                response = await llms.google_gemini().ainvoke(
+                    [HumanMessage(content=prompt)]
+                )
+                responses.append(response.content)
+
+                # Start new chunk with current part
+                current_chunk = [part]
+                current_token_count = part_tokens
+                chunk_counter += 1
+            else:
+                # Safe to add to current chunk
+                current_chunk.append(part)
+                current_token_count += part_tokens
+
+        # Process the final remaining chunk
+        if current_chunk:
+            chunk_text = "".join(current_chunk)
+            print(
+                f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
+            )
+            prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+            response = await llms.google_gemini().ainvoke(
+                [HumanMessage(content=prompt)]
+            )
+            responses.append(response.content)
+
+        self.resumo_auxiliar = "".join(responses)
+        self.ax.resumo_inicial_processo(self.resumo_auxiliar)
+
+        return self.resumo_auxiliar
+
+    def gerar_resposta_compilada(self):
+        serializer = self.serializer
+        return {
+            "num_chunks_retrieval": serializer.num_chunks_retrieval,
+            "embedding_weight": serializer.embedding_weight,
+            "bm25_weight": serializer.bm25_weight,
+            "context_window": serializer.context_window,
+            "chunk_overlap": serializer.chunk_overlap,
+            "num_k_rerank": serializer.num_k_rerank,
+            "model_cohere_rerank": serializer.model_cohere_rerank,
+            "more_initial_chunks_for_reranking": serializer.more_initial_chunks_for_reranking,
+            "claude_context_model": serializer.claude_context_model,
+            "gpt_temperature": serializer.gpt_temperature,
+            "user_message": serializer.user_message,
+            "model": serializer.model,
+            "hf_embedding": serializer.hf_embedding,
+            "chunk_size": serializer.chunk_size,
+            "chunk_overlap": serializer.chunk_overlap,
+            # "prompt_auxiliar": serializer.prompt_auxiliar,
+            "prompt_gerar_documento": serializer.prompt_gerar_documento[0:200],
+        }
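The new get_response_from_auxiliar_contextual_prompt method greedily packs the parts of full_text_as_array into batches of at most 600,000 tokens (counted with tiktoken's cl100k_base encoding) and sends each batch to Gemini once. The batching logic in isolation, as a hedged sketch; batch_by_tokens is illustrative and not part of the codebase:

# Standalone sketch of the token-window batching used above; the LLM call is
# replaced by simply returning the accumulated batches.
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

def batch_by_tokens(parts: list[str], max_tokens: int = 600_000) -> list[str]:
    batches, current, count = [], [], 0
    for part in parts:
        part_tokens = len(encoding.encode(part))
        if count + part_tokens > max_tokens and current:
            batches.append("".join(current))  # flush before exceeding the limit
            current, count = [part], part_tokens
        else:
            current.append(part)
            count += part_tokens
    if current:  # flush the final partial batch
        batches.append("".join(current))
    return batches

One design note: as committed, the flush branch appears to run even when current_chunk is empty, so a single part longer than 600k tokens would first trigger a prompt built from empty text; the sketch guards against that with the extra "and current" check.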
_utils/gerar_documento_utils/utils.py CHANGED

@@ -1,92 +1,10 @@
-from typing import Any, List, Tuple, Union
-from langchain_core.documents import Document
-from langchain_core.messages import HumanMessage
-
 from _utils.gerar_documento_utils.llm_calls import agemini_answer
-from _utils.
-from _utils.langchain_utils.LLM_class import LLM
-from _utils.gerar_documento_utils.prompts import (
-    create_prompt_auxiliar_do_contextual_prompt,
-    prompt_para_gerar_titulo,
-)
-
-from _utils.models.gerar_documento import DocumentChunk
-from gerar_documento.serializer import GerarDocumentoSerializerData
+from _utils.gerar_documento_utils.prompts import prompt_para_gerar_titulo
 import tiktoken
 
 encoding = tiktoken.get_encoding("cl100k_base")
 
 
-def gerar_resposta_compilada(serializer: Union[GerarDocumentoSerializerData, Any]):
-    return {
-        "num_chunks_retrieval": serializer.num_chunks_retrieval,
-        "embedding_weight": serializer.embedding_weight,
-        "bm25_weight": serializer.bm25_weight,
-        "context_window": serializer.context_window,
-        "chunk_overlap": serializer.chunk_overlap,
-        "num_k_rerank": serializer.num_k_rerank,
-        "model_cohere_rerank": serializer.model_cohere_rerank,
-        "more_initial_chunks_for_reranking": serializer.more_initial_chunks_for_reranking,
-        "claude_context_model": serializer.claude_context_model,
-        "gpt_temperature": serializer.gpt_temperature,
-        "user_message": serializer.user_message,
-        "model": serializer.model,
-        "hf_embedding": serializer.hf_embedding,
-        "chunk_size": serializer.chunk_size,
-        "chunk_overlap": serializer.chunk_overlap,
-        # "prompt_auxiliar": serializer.prompt_auxiliar,
-        "prompt_gerar_documento": serializer.prompt_gerar_documento[0:200],
-    }
-
-
-# This function generates the response that will be used in each chunk's request
-async def get_response_from_auxiliar_contextual_prompt(full_text_as_array: List[str]):
-    llms = LLM()
-    responses = []
-
-    current_chunk = []
-    current_token_count = 0
-    chunk_counter = 1
-
-    for part in full_text_as_array:
-        part_tokens = len(encoding.encode(part))
-
-        # Check if adding this part would EXCEED the limit
-        if current_token_count + part_tokens > 600000:
-            # Process the accumulated chunk before it exceeds the limit
-            chunk_text = "".join(current_chunk)
-            print(
-                f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
-            )
-
-            prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
-            response = await llms.google_gemini().ainvoke(
-                [HumanMessage(content=prompt)]
-            )
-            responses.append(response.content)
-
-            # Start new chunk with current part
-            current_chunk = [part]
-            current_token_count = part_tokens
-            chunk_counter += 1
-        else:
-            # Safe to add to current chunk
-            current_chunk.append(part)
-            current_token_count += part_tokens
-
-    # Process the final remaining chunk
-    if current_chunk:
-        chunk_text = "".join(current_chunk)
-        print(
-            f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
-        )
-        prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
-        response = await llms.google_gemini().ainvoke([HumanMessage(content=prompt)])
-        responses.append(response.content)
-
-    return "".join(responses)
-
-
 def split_text_by_tokens(full_text: str):
     tokens = encoding.encode(full_text)
     max_tokens = 600000
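The diff cuts split_text_by_tokens off after max_tokens = 600000. For reference, a splitter consistent with the visible lines typically slices the token stream into fixed-size windows and decodes each window back to text; the following is an assumed sketch, not the file's actual body:

# Assumed completion of split_text_by_tokens; the real continuation is truncated above.
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

def split_text_by_tokens_sketch(full_text: str) -> list[str]:
    tokens = encoding.encode(full_text)
    max_tokens = 600000
    # Slice the token list into max_tokens windows and decode each back to text.
    return [
        encoding.decode(tokens[i : i + max_tokens])
        for i in range(0, len(tokens), max_tokens)
    ]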
gerar_documento/views.py CHANGED

@@ -7,7 +7,6 @@ from _utils.gerar_documento_utils.GerarDocumento import GerarDocumento
 from _utils.langchain_utils.LLM_class import LLM
 from _utils.gerar_documento_utils.utils import (
     generate_document_title,
-    gerar_resposta_compilada,
     split_text_by_tokens,
 )
 from _utils.langchain_utils.Prompt_class import Prompt