from typing import List, Dict, Optional

import logging

from cohere import Client

from _utils.splitters.Splitter_class import Splitter
from _utils.models.gerar_relatorio import DocumentChunk
from setup.easy_imports import (
    HuggingFaceEmbeddings,
    Chroma,
    ChatOpenAI,
    PromptTemplate,
)

class DocumentSummarizer:
    def __init__(
        self,
        openai_api_key: str,
        cohere_api_key: str,
        embedding_model: str,
        chunk_size: int,
        chunk_overlap: int,
        num_k_rerank: int,
        model_cohere_rerank: str,
    ):
        self.openai_api_key = openai_api_key
        self.cohere_client = Client(cohere_api_key)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.num_k_rerank = num_k_rerank
        self.model_cohere_rerank = model_cohere_rerank
        self.splitter = Splitter(chunk_size, chunk_overlap)
        # Mapping of chunk_id -> position metadata, read by get_source_context.
        # It was previously referenced there without ever being initialized; it
        # still needs to be populated when chunks are created.
        self.chunk_metadata: Dict[str, Dict] = {}

    def create_vector_store(
        self, chunks: List[DocumentChunk]
    ) -> Chroma:  # This function is currently unused
        """Create a vector store with per-chunk position metadata."""
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]
        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        return vector_store
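
    # A minimal sketch of exercising create_vector_store (illustrative only;
    # the DocumentChunk keyword arguments below are an assumption mirroring the
    # attributes read above, not a confirmed constructor signature):
    #
    #     chunk = DocumentChunk(content="Example passage.", chunk_id="c1",
    #                           page_number=1, start_char=0, end_char=16)
    #     store = summarizer.create_vector_store([chunk])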

    def rerank_chunks(  # This function is currently unused
        self, chunks: List[Dict], query: str, k: int = 5
    ) -> List[Dict]:
        """
        Rerank chunks using Cohere's reranking model.

        Args:
            chunks: List of dictionaries containing chunks and their metadata
            query: Original search query
            k: Number of top chunks to return

        Returns:
            List of reranked chunks with updated relevance scores
        """
        try:
            # Prepare documents for reranking
            documents = [chunk["content"] for chunk in chunks]

            # Get reranking scores from Cohere; current SDK versions return a
            # response object whose hits live under `.results`, so iterate over
            # that rather than over the response itself
            response = self.cohere_client.rerank(
                query=query,
                documents=documents,
                top_n=k,
                model=self.model_cohere_rerank,
            )

            # Re-attach the original metadata to each reranked hit
            reranked_chunks = []
            for hit in response.results:
                original_chunk = chunks[hit.index]
                reranked_chunks.append(
                    {**original_chunk, "relevance_score": hit.relevance_score}
                )

            return reranked_chunks
        except Exception as e:
            logging.error(f"Reranking failed: {str(e)}")
            return chunks[:k]  # Fall back to the original ordering
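
    # A minimal sketch of calling rerank_chunks directly (hypothetical data;
    # only the "content" key is required, any extra keys pass through):
    #
    #     passages = [
    #         {"content": "First passage...", "chunk_id": "c1"},
    #         {"content": "Second passage...", "chunk_id": "c2"},
    #     ]
    #     top = summarizer.rerank_chunks(passages, query="main findings", k=1)
    #     # Each returned dict gains a "relevance_score" key from Cohere.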

    def generate_summary_with_sources(  # This function is currently unused
        self,
        vector_store: Chroma,
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate a summary with source citations, using reranking."""
        # Retrieve more initial chunks than needed so the reranker has candidates
        relevant_docs = vector_store.similarity_search_with_score(query, k=20)

        # Prepare chunks for reranking
        chunks = []
        for doc, score in relevant_docs:
            chunks.append(
                {
                    "content": doc.page_content,
                    "page": doc.metadata["page"],
                    "chunk_id": doc.metadata["chunk_id"],
                    "relevance_score": score,
                }
            )

        # Rerank chunks
        reranked_chunks = self.rerank_chunks(chunks, query, k=self.num_k_rerank)

        # Prepare context and sources from the reranked chunks
        contexts = []
        sources = []
        for chunk in reranked_chunks:
            contexts.append(chunk["content"])
            sources.append(
                {
                    "content": chunk["content"],
                    "page": chunk["page"],
                    "chunk_id": chunk["chunk_id"],
                    "relevance_score": chunk["relevance_score"],
                }
            )

        prompt_template = """
        Based on the following context, provide multiple key points from the document.
        For each point, create a new paragraph.
        Each paragraph should be a complete, self-contained insight.

        Context: {context}

        Key points:
        """
        prompt = PromptTemplate(template=prompt_template, input_variables=["context"])

        llm = ChatOpenAI(
            temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
        )
        response = llm.invoke(prompt.format(context="\n\n".join(contexts))).content

        # Split the response into paragraphs
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]

        # Create structured output, pairing each summary paragraph with a source;
        # if there are more summaries than sources, the last source is reused
        structured_output = []
        for idx, summary in enumerate(summaries):
            source = sources[min(idx, len(sources) - 1)]
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": source["page"],
                        "text": source["content"][:200] + "...",
                        "relevance_score": source["relevance_score"],
                    },
                }
            )

        return structured_output
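
    # For reference, each item returned above has this shape (illustrative
    # values):
    #
    #     {
    #         "content": "<one summary paragraph>",
    #         "source": {
    #             "page": 3,
    #             "text": "<first 200 chars of the source chunk>...",
    #             "relevance_score": 0.87,
    #         },
    #     }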

    def get_source_context(  # This function is currently unused
        self, chunk_id: str, window: int = 100
    ) -> Optional[Dict]:
        """Get the stored position metadata for a specific chunk.

        Note: `window` is currently unused; the method returns the chunk's page
        and character offsets rather than an expanded text window.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None
        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }
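

# A minimal end-to-end sketch of wiring the class together. The API keys are
# placeholders, and the embedding / rerank model names are assumptions (real
# public model identifiers, but not values taken from this module):
#
#     summarizer = DocumentSummarizer(
#         openai_api_key="sk-...",
#         cohere_api_key="...",
#         embedding_model="sentence-transformers/all-MiniLM-L6-v2",
#         chunk_size=1000,
#         chunk_overlap=200,
#         num_k_rerank=5,
#         model_cohere_rerank="rerank-english-v3.0",
#     )
#     store = summarizer.create_vector_store(chunks)
#     for item in summarizer.generate_summary_with_sources(store):
#         print(item["content"], "->", item["source"]["page"])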