# vella-backend/_utils/gerar_relatorio_modelo_usuario/DocumentSummarizer_simples.py
# Author: luanpoppe
# Commit 12d3e1a — feat: melhorias no código e refatorações
from typing import List, Dict, Tuple, Optional
from _utils.splitters.Splitter_class import Splitter
from setup.easy_imports import (
HuggingFaceEmbeddings,
Chroma,
ChatOpenAI,
PromptTemplate,
)
import logging
from cohere import Client
from _utils.models.gerar_relatorio import (
DocumentChunk,
)
class DocumentSummarizer:
    """Summarize documents via vector similarity search plus Cohere reranking.

    Pipeline: chunks are embedded into a Chroma vector store
    (``create_vector_store``), candidate chunks for a query are reranked with
    Cohere (``rerank_chunks``), and a GPT model produces a structured summary
    with source citations (``generate_summary_with_sources``).
    """

    def __init__(
        self,
        openai_api_key: str,
        cohere_api_key: str,
        embedding_model,
        chunk_size,
        chunk_overlap,
        num_k_rerank,
        model_cohere_rerank,
    ):
        """Store credentials and build the embedding/splitter helpers.

        Args:
            openai_api_key: API key used when invoking the ChatOpenAI model.
            cohere_api_key: API key for the Cohere rerank client.
            embedding_model: HuggingFace model name for sentence embeddings.
            chunk_size: Maximum characters per chunk (passed to Splitter).
            chunk_overlap: Overlap between consecutive chunks (passed to Splitter).
            num_k_rerank: How many top chunks to keep after reranking.
            model_cohere_rerank: Cohere rerank model identifier.
        """
        self.openai_api_key = openai_api_key
        self.cohere_client = Client(cohere_api_key)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.num_k_rerank = num_k_rerank
        self.model_cohere_rerank = model_cohere_rerank
        self.splitter = Splitter(chunk_size, chunk_overlap)
        # Populated by create_vector_store() and read by get_source_context().
        # BUGFIX: this attribute was previously never initialized, so
        # get_source_context() always raised AttributeError.
        self.chunk_metadata: Dict[str, Dict] = {}

    def create_vector_store(
        self, chunks: List[DocumentChunk]
    ) -> Chroma:  # NOTE: this function is currently unused
        """Create a Chroma vector store from chunks, keeping per-chunk metadata.

        Args:
            chunks: Document chunks carrying content and location metadata.

        Returns:
            A Chroma store whose documents carry chunk_id/page/char offsets.
        """
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]
        # Index metadata by chunk_id so get_source_context() can resolve it later.
        self.chunk_metadata = {meta["chunk_id"]: meta for meta in metadatas}
        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        return vector_store

    def rerank_chunks(  # NOTE: this function is currently unused
        self, chunks: List[Dict], query: str, k: int = 5
    ) -> List[Dict]:
        """
        Rerank chunks using Cohere's reranking model.

        Args:
            chunks: List of dictionaries containing chunks and their metadata
            query: Original search query
            k: Number of top chunks to return

        Returns:
            List of reranked chunks with updated relevance scores
        """
        try:
            # Prepare documents for reranking
            documents = [chunk["content"] for chunk in chunks]
            # Get reranking scores from Cohere
            results = self.cohere_client.rerank(
                query=query,
                documents=documents,
                top_n=k,
                model=self.model_cohere_rerank,
            )
            # Newer cohere SDKs return a response object whose hits live in
            # `.results`; older SDKs returned an iterable directly. Support
            # both — previously iterating the response object raised and the
            # call silently fell back to the un-reranked ordering.
            hits = getattr(results, "results", results)
            # Create reranked results with original metadata
            reranked_chunks = [
                {**chunks[hit.index], "relevance_score": hit.relevance_score}
                for hit in hits
            ]
            return reranked_chunks
        except Exception as e:
            logging.error(f"Reranking failed: {str(e)}")
            return chunks[:k]  # Fallback to original ordering

    def generate_summary_with_sources(  # NOTE: this function is currently unused
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate summary with source citations using reranking.

        Args:
            vector_store: Chroma store to search for relevant chunks.
            query: Search/summarization query.

        Returns:
            List of dicts, each with a summary paragraph ("content") and its
            most relevant "source" (page, text excerpt, relevance score).
        """
        # Retrieve more initial chunks than needed so reranking has candidates
        relevant_docs = vector_store.similarity_search_with_score(query, k=20)
        # Prepare chunks for reranking
        chunks = []
        for doc, score in relevant_docs:
            chunks.append(
                {
                    "content": doc.page_content,
                    "page": doc.metadata["page"],
                    "chunk_id": doc.metadata["chunk_id"],
                    "relevance_score": score,
                }
            )
        # Rerank chunks
        reranked_chunks = self.rerank_chunks(chunks, query, k=self.num_k_rerank)
        # Prepare context and sources from reranked chunks
        contexts = []
        sources = []
        for chunk in reranked_chunks:
            contexts.append(chunk["content"])
            sources.append(
                {
                    "content": chunk["content"],
                    "page": chunk["page"],
                    "chunk_id": chunk["chunk_id"],
                    "relevance_score": chunk["relevance_score"],
                }
            )
        prompt_template = """
Based on the following context, provide multiple key points from the document.
For each point, create a new paragraph.
Each paragraph should be a complete, self-contained insight.
Context: {context}
Key points:
"""
        prompt = PromptTemplate(template=prompt_template, input_variables=["context"])
        llm = ChatOpenAI(
            temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
        )
        response = llm.invoke(prompt.format(context="\n\n".join(contexts))).content
        # Split the response into paragraphs
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]
        # Create structured output
        structured_output = []
        for idx, summary in enumerate(summaries):
            # Associate each summary with the most relevant source; summaries
            # beyond the source count reuse the last (least relevant) source.
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": source["page"],
                        "text": source["content"][:200] + "...",
                        "relevance_score": source["relevance_score"],
                    },
                }
            )
        return structured_output

    def get_source_context(
        self, chunk_id: str, window: int = 100
    ) -> Optional[Dict]:  # NOTE: this function is currently unused
        """Return stored location metadata for a chunk, or None if unknown.

        Requires create_vector_store() to have been called first, since that
        is what fills ``self.chunk_metadata``.

        Args:
            chunk_id: Identifier of the chunk to look up.
            window: Currently unused; kept for interface compatibility.

        Returns:
            Dict with "page", "start_char" and "end_char", or None when the
            chunk_id is not known.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None
        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }