from typing import List, Dict, Tuple, Optional
from _utils.splitters.Splitter_class import Splitter
from setup.easy_imports import (
    HuggingFaceEmbeddings,
    Chroma,
    ChatOpenAI,
    PromptTemplate,
)
import logging
from cohere import Client
from _utils.models.gerar_relatorio import (
    DocumentChunk,
)


class DocumentSummarizer:
    """Summarize documents via vector retrieval, Cohere reranking and an OpenAI chat model.

    The instance holds the API credentials, an embeddings object, a Cohere
    client, a ``Splitter`` and a registry (``chunk_metadata``) of positional
    metadata for every chunk indexed through ``create_vector_store``.
    """

    def __init__(
        self,
        openai_api_key: str,
        cohere_api_key: str,
        embedding_model: str,
        chunk_size,
        chunk_overlap,
        num_k_rerank: int,
        model_cohere_rerank: str,
    ):
        """Store credentials and build the helper objects used by the other methods.

        Args:
            openai_api_key: Key forwarded to ``ChatOpenAI`` when summarizing.
            cohere_api_key: Key used to build the Cohere ``Client``.
            embedding_model: Model name passed to ``HuggingFaceEmbeddings``.
            chunk_size: Forwarded as-is to ``Splitter``.
            chunk_overlap: Forwarded as-is to ``Splitter``.
            num_k_rerank: Number of chunks kept after reranking.
            model_cohere_rerank: Cohere model name used for reranking.
        """
        self.openai_api_key = openai_api_key
        self.cohere_client = Client(cohere_api_key)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.num_k_rerank = num_k_rerank
        self.model_cohere_rerank = model_cohere_rerank
        self.splitter = Splitter(chunk_size, chunk_overlap)
        # Read by get_source_context; it must exist even before any chunk is
        # indexed, otherwise that method raises AttributeError.
        self.chunk_metadata: Dict[str, Dict] = {}

    # NOTE: this method is currently not used anywhere.
    def create_vector_store(self, chunks: List[DocumentChunk]) -> Chroma:
        """Create a Chroma vector store from ``chunks``, keeping their metadata.

        Each chunk's id, page number and character offsets are attached to
        its text in the store and are also recorded in ``self.chunk_metadata``
        (keyed by chunk id) so ``get_source_context`` can look them up.

        Args:
            chunks: Chunks exposing ``content``, ``chunk_id``, ``page_number``,
                ``start_char`` and ``end_char``.

        Returns:
            The vector store built with ``self.embeddings``.
        """
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]
        self.chunk_metadata.update(
            {metadata["chunk_id"]: metadata for metadata in metadatas}
        )

        return Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )

    # NOTE: this method is currently not used outside this class.
    def rerank_chunks(self, chunks: List[Dict], query: str, k: int = 5) -> List[Dict]:
        """Rerank chunks using Cohere's reranking model.

        Args:
            chunks: List of dictionaries containing chunks and their metadata;
                each must have a ``"content"`` key.
            query: Original search query.
            k: Number of top chunks to return.

        Returns:
            The reranked chunks (original keys preserved) with
            ``"relevance_score"`` set to the reranker's score. If reranking
            fails for any reason, the error is logged and the first ``k``
            chunks are returned in their original order, unmodified.
        """
        if not chunks:
            # Nothing to rerank: skip the API call (and the error log it
            # would otherwise trigger).
            return []

        try:
            documents = [chunk["content"] for chunk in chunks]
            response = self.cohere_client.rerank(
                query=query,
                documents=documents,
                top_n=k,
                model=self.model_cohere_rerank,
            )
            # Newer SDK responses expose the hits under ``.results``; older
            # ones are directly iterable. Support both.
            hits = getattr(response, "results", response)
            return [
                {**chunks[hit.index], "relevance_score": hit.relevance_score}
                for hit in hits
            ]
        except Exception as e:
            # Deliberate best-effort: reranking is an optimization, so fall
            # back to the original ordering instead of failing the request.
            logging.error("Reranking failed: %s", e)
            return chunks[:k]

    # NOTE: this method is currently not used anywhere.
    def generate_summary_with_sources(
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
        initial_k: int = 20,
    ) -> List[Dict]:
        """Generate key-point summaries with source citations using reranking.

        Args:
            vector_store: Store queried with ``similarity_search_with_score``;
                its documents must carry ``"page"`` and ``"chunk_id"`` metadata.
            query: Search query used for both retrieval and reranking.
            initial_k: Number of chunks retrieved before reranking.

        Returns:
            One dictionary per paragraph of the model's answer, with
            ``"content"`` (the paragraph) and ``"source"`` (``page``, the first
            200 characters of the chunk text followed by ``"..."``, and
            ``relevance_score``). Paragraphs are paired with the reranked
            chunks by position; paragraphs beyond the number of chunks reuse
            the last chunk. Returns an empty list when no chunk is retrieved.
        """
        # Retrieve more chunks than needed so the reranker has room to choose.
        relevant_docs = vector_store.similarity_search_with_score(query, k=initial_k)
        candidates = [
            {
                "content": doc.page_content,
                "page": doc.metadata["page"],
                "chunk_id": doc.metadata["chunk_id"],
                "relevance_score": score,
            }
            for doc, score in relevant_docs
        ]

        sources = self.rerank_chunks(candidates, query, k=self.num_k_rerank)
        if not sources:
            # No context to summarize and no source to cite: avoid a useless
            # LLM call and an IndexError when attaching sources below.
            return []

        prompt_template = """
        Based on the following context, provide multiple key points from the document.
        For each point, create a new paragraph.
        Each paragraph should be a complete, self-contained insight.

        Context: {context}
        Key points:
        """

        prompt = PromptTemplate(template=prompt_template, input_variables=["context"])
        llm = ChatOpenAI(
            temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
        )

        context = "\n\n".join(chunk["content"] for chunk in sources)
        response = llm.invoke(prompt.format(context=context)).content

        # Split the response into non-empty paragraphs.
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]

        last_index = len(sources) - 1
        structured_output = []
        for idx, summary in enumerate(summaries):
            # Positional pairing: the i-th paragraph cites the i-th reranked
            # chunk, clamped to the last chunk.
            source = sources[min(idx, last_index)]
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": source["page"],
                        "text": source["content"][:200] + "...",
                        "relevance_score": source["relevance_score"],
                    },
                }
            )

        return structured_output

    # NOTE: this method is currently not used anywhere.
    def get_source_context(self, chunk_id: str, window: int = 100) -> Optional[Dict]:
        """Return the positional metadata recorded for ``chunk_id``.

        Args:
            chunk_id: Identifier of the chunk to look up in
                ``self.chunk_metadata``.
            window: Currently unused; kept for interface compatibility.

        Returns:
            A dictionary with ``"page"``, ``"start_char"`` and ``"end_char"``,
            or ``None`` when the chunk id is unknown.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None

        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }