# NOTE: removed non-Python residue ("Spaces:" / "Running" status lines) left
# over from an export/extraction artifact at the top of this file.
from typing import List, Dict, Tuple, Optional | |
from _utils.splitters.Splitter_class import Splitter | |
from setup.easy_imports import ( | |
HuggingFaceEmbeddings, | |
Chroma, | |
ChatOpenAI, | |
PromptTemplate, | |
) | |
import logging | |
from cohere import Client | |
from _utils.models.gerar_relatorio import ( | |
DocumentChunk, | |
) | |
class DocumentSummarizer:
    """Summarize documents with source citations.

    Pipeline: embed text chunks into a Chroma vector store, retrieve
    candidate chunks for a query, rerank them with Cohere, then ask an
    OpenAI chat model for key points whose items point back to their
    source chunks (page, excerpt, relevance score).
    """

    def __init__(
        self,
        openai_api_key: str,
        cohere_api_key: str,
        embedding_model,
        chunk_size,
        chunk_overlap,
        num_k_rerank,
        model_cohere_rerank,
    ):
        self.openai_api_key = openai_api_key
        self.cohere_client = Client(cohere_api_key)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.num_k_rerank = num_k_rerank
        self.model_cohere_rerank = model_cohere_rerank
        self.splitter = Splitter(chunk_size, chunk_overlap)
        # Fix: get_source_context reads self.chunk_metadata, but it was never
        # initialized anywhere in the class, so that method always raised
        # AttributeError. Populated by create_vector_store (chunk_id -> meta).
        self.chunk_metadata: Dict[str, Dict] = {}

    def create_vector_store(
        self, chunks: List[DocumentChunk]
    ) -> Chroma:  # This function is never used anywhere
        """Create a Chroma vector store from chunks, preserving metadata.

        Besides building the store, this records each chunk's location
        metadata in ``self.chunk_metadata`` so ``get_source_context`` can
        resolve a ``chunk_id`` later.
        """
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]
        # Index metadata by chunk_id for later get_source_context lookups.
        for meta in metadatas:
            self.chunk_metadata[meta["chunk_id"]] = meta
        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        return vector_store

    def rerank_chunks(  # This function is never used anywhere
        self, chunks: List[Dict], query: str, k: int = 5
    ) -> List[Dict]:
        """
        Rerank chunks using Cohere's reranking model.

        Args:
            chunks: List of dictionaries containing chunks and their metadata
            query: Original search query
            k: Number of top chunks to return

        Returns:
            List of reranked chunks with updated relevance scores; on any
            reranking failure, falls back to the first ``k`` chunks in their
            original order.
        """
        try:
            # Prepare documents for reranking (Cohere takes plain strings).
            documents = [chunk["content"] for chunk in chunks]
            # Get reranking scores from Cohere.
            # NOTE(review): iterating the response directly assumes an older
            # cohere SDK where the rerank response is iterable; newer SDKs
            # expose the hits under `.results` — confirm against the pinned
            # cohere version before relying on this path.
            results = self.cohere_client.rerank(
                query=query,
                documents=documents,
                top_n=k,
                model=self.model_cohere_rerank,
            )
            # Re-attach each hit's original metadata, adding the new score.
            reranked_chunks = []
            for hit in results:
                original_chunk = chunks[hit.index]
                reranked_chunks.append(
                    {**original_chunk, "relevance_score": hit.relevance_score}
                )
            return reranked_chunks
        except Exception as e:
            logging.error(f"Reranking failed: {str(e)}")
            return chunks[:k]  # Fallback to original ordering

    def generate_summary_with_sources(  # This function is never used anywhere
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate summary with source citations using reranking.

        Returns a list of ``{"content": ..., "source": {...}}`` dicts, where
        each summary paragraph is paired with the source chunk at the same
        rank (clamped to the last source when there are more paragraphs than
        sources).
        """
        # Retrieve more initial chunks than needed so reranking has choices.
        relevant_docs = vector_store.similarity_search_with_score(query, k=20)

        # Prepare chunks for reranking.
        chunks = []
        for doc, score in relevant_docs:
            chunks.append(
                {
                    "content": doc.page_content,
                    "page": doc.metadata["page"],
                    "chunk_id": doc.metadata["chunk_id"],
                    "relevance_score": score,
                }
            )

        # Rerank chunks with Cohere.
        reranked_chunks = self.rerank_chunks(chunks, query, k=self.num_k_rerank)

        # Prepare context and sources from reranked chunks.
        contexts = []
        sources = []
        for chunk in reranked_chunks:
            contexts.append(chunk["content"])
            sources.append(
                {
                    "content": chunk["content"],
                    "page": chunk["page"],
                    "chunk_id": chunk["chunk_id"],
                    "relevance_score": chunk["relevance_score"],
                }
            )

        prompt_template = """
        Based on the following context, provide multiple key points from the document.
        For each point, create a new paragraph.
        Each paragraph should be a complete, self-contained insight.

        Context: {context}

        Key points:
        """
        prompt = PromptTemplate(template=prompt_template, input_variables=["context"])

        llm = ChatOpenAI(
            temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
        )
        response = llm.invoke(prompt.format(context="\n\n".join(contexts))).content

        # Split the response into paragraphs.
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]

        # Create structured output: pair paragraph idx with source idx,
        # clamping to the last source when summaries outnumber sources.
        structured_output = []
        for idx, summary in enumerate(summaries):
            src = sources[min(idx, len(sources) - 1)]
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": src["page"],
                        "text": src["content"][:200] + "...",
                        "relevance_score": src["relevance_score"],
                    },
                }
            )
        return structured_output

    def get_source_context(
        self, chunk_id: str, window: int = 100
    ) -> Optional[Dict]:  # This function is never used anywhere
        """Return the stored location metadata for a chunk, or None if unknown.

        Requires ``create_vector_store`` to have run first (it fills
        ``self.chunk_metadata``). NOTE(review): ``window`` is currently
        unused — the extended-context expansion it suggests was never
        implemented.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None
        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }