luanpoppe committed
Commit · e725020
1 Parent(s): 3143cff

feat: separating the report generation and summary generation steps

Browse files:
- _utils/resumo_completo_cursor.py +22 -14
- resumos/serializer.py +17 -3
_utils/resumo_completo_cursor.py
CHANGED
@@ -317,15 +317,16 @@ class ContextualRetriever:
         return contextualized_chunks
 
 class EnhancedDocumentSummarizer(DocumentSummarizer):
-    def __init__(self, openai_api_key: str, claude_api_key: str, config: RetrievalConfig, embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank, claude_context_model, system_prompt, gpt_model, gpt_temperature, id_modelo_do_usuario):
+    def __init__(self, openai_api_key: str, claude_api_key: str, config: RetrievalConfig, embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank, claude_context_model, prompt_relatorio, gpt_model, gpt_temperature, id_modelo_do_usuario, prompt_modelo):
         super().__init__(openai_api_key, os.environ.get("COHERE_API_KEY"), embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank)
         self.config = config
         self.contextual_retriever = ContextualRetriever(config, claude_api_key, claude_context_model)
         self.logger = logging.getLogger(__name__)
-        self.system_prompt = system_prompt
+        self.prompt_relatorio = prompt_relatorio
         self.gpt_model = gpt_model
         self.gpt_temperature = gpt_temperature
         self.id_modelo_do_usuario = id_modelo_do_usuario
+        self.prompt_modelo = prompt_modelo
 
     def create_enhanced_vector_store(self, chunks: List[ContextualizedChunk]) -> Tuple[Chroma, BM25Okapi, List[str]]:
         """Create vector store and BM25 index with contextualized chunks"""
@@ -453,8 +454,6 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
                 'relevance_score': score,
                 'context': metadata.get('context', '')
             })
-
-        prompt_template = self.system_prompt
 
         url_request = f"{api_url}/modelo/{self.id_modelo_do_usuario}"
         resposta = requests.get(url_request)
@@ -464,11 +463,6 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
 
         modelo_buscado = resposta.json()["modelo"]
 
-        prompt = PromptTemplate(
-            template=prompt_template,
-            input_variables=["context", "modelo_usuario"]
-        )
-
         llm = ChatOpenAI(
             temperature=self.gpt_temperature,
             model_name=self.gpt_model,
@@ -476,10 +470,22 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
 
         )
 
-        response = llm.predict(prompt.format(context="\n\n".join(contexts), modelo_usuario=modelo_buscado))
+        prompt_gerar_relatorio = PromptTemplate(
+            template=self.prompt_relatorio,
+            input_variables=["context"]
+        )
+
+        relatorio_gerado = llm.predict(prompt_gerar_relatorio.format(context="\n\n".join(contexts)))
+
+        prompt_gerar_modelo = PromptTemplate(
+            template=self.prompt_modelo,
+            input_variables=["context", "modelo_usuario"]
+        )
+
+        modelo_gerado = llm.predict(prompt_gerar_modelo.format(context=relatorio_gerado, modelo_usuario=modelo_buscado))
 
         # Split the response into paragraphs
-        summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
+        summaries = [p.strip() for p in modelo_gerado.split('\n\n') if p.strip()]
 
         # Create structured output
         structured_output = []
@@ -525,10 +531,11 @@ def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs):
         num_k_rerank=serializer["num_k_rerank"],
         model_cohere_rerank=serializer["model_cohere_rerank"],
         claude_context_model=serializer["claude_context_model"],
-        system_prompt=serializer["system_prompt"],
+        prompt_relatorio=serializer["prompt_relatorio"],
         gpt_model=serializer["model"],
         gpt_temperature=serializer["gpt_temperature"],
-        id_modelo_do_usuario=serializer["id_modelo_do_usuario"]
+        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
+        prompt_modelo=serializer["prompt_modelo"]
     )
 
     # # Load and process document
@@ -582,5 +589,6 @@ def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs):
         "hf_embedding": serializer["hf_embedding"],
         "chunk_size": serializer["chunk_size"],
         "chunk_overlap": serializer["chunk_overlap"],
-        "system_prompt": serializer["system_prompt"]
+        "prompt_relatorio": serializer["prompt_relatorio"],
+        "prompt_modelo": serializer["prompt_modelo"]
     }}
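Net effect of the changes to _utils/resumo_completo_cursor.py: the single prompt/predict call is split into a two-stage chain, where the retrieved context is first condensed into a report (relatório) and that report is then rewritten against the user's template (modelo). A minimal self-contained sketch of the pattern, assuming the legacy langchain APIs this file already uses (PromptTemplate, ChatOpenAI.predict); the import paths, prompt strings, model name, and the contexts/modelo_buscado stand-ins are illustrative assumptions, not taken from the repo:

# Sketch of the two-stage chain this commit introduces (not the repo's code).
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

contexts = ["chunk one ...", "chunk two ..."]  # stand-in for the retrieved chunks
modelo_buscado = "1. Heading\n2. Body\n3. Conclusion"  # stand-in for the user's template

llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")  # illustrative model

# Stage 1: condense the retrieved context into a report (relatorio).
prompt_relatorio = PromptTemplate(
    template="Summarize the key points of the context below.\n\nContext: {context}",
    input_variables=["context"],
)
relatorio_gerado = llm.predict(prompt_relatorio.format(context="\n\n".join(contexts)))

# Stage 2: rewrite the report so it follows the user's template (modelo).
prompt_modelo = PromptTemplate(
    template="Rewrite the report below following the template.\n\nReport: {context}\n\nTemplate: {modelo_usuario}",
    input_variables=["context", "modelo_usuario"],
)
modelo_gerado = llm.predict(
    prompt_modelo.format(context=relatorio_gerado, modelo_usuario=modelo_buscado)
)

# Downstream, the final text is split into paragraphs exactly as before.
summaries = [p.strip() for p in modelo_gerado.split("\n\n") if p.strip()]

The design point worth noting: stage 2 receives only the stage-1 report as {context}, so the user's template is applied to an already-condensed text rather than to the full retrieval output.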
resumos/serializer.py
CHANGED
@@ -29,7 +29,20 @@ class ResumoCursorSerializer(serializers.Serializer):
     chunk_overlap = serializers.IntegerField(required=False, default=200)
 
 
-system_prompt = """
+system_prompt_relatorio = """
+Based on the following context, provide multiple key points from the document.
+For each point, create a new paragraph.
+Each paragraph should be a complete, self-contained insight.
+Include any relevant context provided.
+
+Context: {context}
+
+Key points:
+"""
+
+user_message = "What are the main points of this document?"
+
+system_prompt_modelo = """
 Based on the following context, provide multiple key points from the document.
 For each point, create a new paragraph.
 Each paragraph should be a complete, self-contained insight.
@@ -41,9 +54,10 @@ system_prompt = """
 
 Key points:
 """
-user_message = "What are the main points of this document?"
 class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
-    system_prompt = serializers.CharField(required=False, default=system_prompt)
+    system_prompt = None
+    prompt_relatorio = serializers.CharField(required=False, default=system_prompt_relatorio)
+    prompt_modelo = serializers.CharField(required=False, default=system_prompt_modelo)
     user_message = serializers.CharField(required=False, default=user_message)
     num_chunks_retrieval = serializers.IntegerField(default=5)
     embedding_weight = serializers.FloatField(default=0.5)
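On the serializer side, ResumoCursorCompeltoSerializer now masks the inherited system_prompt (assigning None on a subclass is the standard DRF way to remove a field declared on a parent serializer) and exposes the two prompts as independent optional fields backed by the module-level defaults above. A usage sketch, assuming Django REST Framework and assuming the remaining inherited fields either have defaults or are supplied; the payload values are hypothetical:

# Hypothetical payload: prompt_relatorio is omitted, so its declared
# default (system_prompt_relatorio) is filled in during validation.
data = {
    "model": "gpt-4o-mini",  # hypothetical values for inherited fields
    "gpt_temperature": 0.2,
    "id_modelo_do_usuario": 7,
    "prompt_modelo": "Report: {context}\n\nTemplate: {modelo_usuario}",
}

serializer = ResumoCursorCompeltoSerializer(data=data)
serializer.is_valid(raise_exception=True)

validated = serializer.validated_data
assert validated["prompt_relatorio"] == system_prompt_relatorio  # default applied
assert "system_prompt" not in validated  # removed by the None override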