"""Build an LLM-generated summary report from a list of PDFs."""

import os

from _utils.gerar_relatorio_modelo_usuario.prompts import prompt_auxiliar_SEM_CONTEXT
from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
    EnhancedDocumentSummarizer,
)
from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
    contextualize_chunk_based_on_serializer,
    get_full_text_and_all_PDFs_chunks,
)
from _utils.gerar_relatorio_modelo_usuario.utils import gerar_resposta_compilada
from _utils.models.gerar_relatorio import (
    RetrievalConfig,
)


def reciprocal_rank_fusion(result_lists, weights=None):
    """Fuse several scored result lists into one ranking.

    NOTE: despite the name, this does not apply the classic ``1 / (k + rank)``
    formula; it sums ``weight * score`` per document across all lists.

    Args:
        result_lists: Sequence of iterables of ``(doc_id, score)`` pairs, one
            iterable per retrieval method.
        weights: Optional per-list multipliers, indexed in parallel with
            ``result_lists``. Defaults to ``1.0`` for every list.

    Returns:
        A list of ``(doc_id, fused_score)`` tuples sorted by fused score in
        descending order.

    Raises:
        IndexError: If ``weights`` has fewer entries than there are non-empty
            lists in ``result_lists``.
    """
    if weights is None:
        weights = [1.0] * len(result_lists)

    fused_scores = {}
    for list_index, results in enumerate(result_lists):
        for doc_id, score in results:
            # The weight is looked up lazily so empty lists never touch `weights`.
            fused_scores[doc_id] = (
                fused_scores.get(doc_id, 0) + weights[list_index] * score
            )

    # Sort by fused score, highest first.
    return sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)


# LangSmith tracing configuration, applied at import time.
# NOTE(review): LANGCHAIN_API_KEY is not set here; it is presumably supplied by
# the process environment - confirm in the deployment configuration.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "VELLA"


async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
    """Run the full summarization pipeline and return the structured summary.

    Steps: build the retrieval config and summarizer from ``serializer``, split
    the PDFs into chunks, optionally contextualize the chunks, index them
    (vector store + BM25) and ask the summarizer for the enhanced summary.

    Args:
        serializer: Mapping with the request settings. The keys read here are
            ``num_chunks_retrieval``, ``embedding_weight``, ``bm25_weight``,
            ``context_window``, ``chunk_overlap``, ``chunk_size``,
            ``hf_embedding``, ``num_k_rerank``, ``model_cohere_rerank``,
            ``claude_context_model``, ``prompt_auxiliar``, ``model``,
            ``gpt_temperature``, ``prompt_gerar_documento`` and
            ``should_use_llama_parse``; the whole mapping is also forwarded to
            helper functions.
        listaPDFs: PDFs to process, forwarded unchanged to
            ``get_full_text_and_all_PDFs_chunks``.

    Returns:
        On success, a dict with the keys ``"resultado"`` (the structured
        summaries), ``"texto_completo"`` (summary text followed by each
        item's ``"content"``) and ``"parametros-utilizados"``. If the
        summarizer does not return a list, a DRF ``Response`` of the form
        ``{"erro": <summarizer output>}`` is returned instead.
    """
    # Retrieval configuration
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )

    # Initialize the enhanced summarizer.
    # `id_modelo_do_usuario` is currently not forwarded to the summarizer.
    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_auxiliar=serializer["prompt_auxiliar"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        prompt_gerar_documento=serializer["prompt_gerar_documento"],
        reciprocal_rank_fusion=reciprocal_rank_fusion,
    )

    allPdfsChunks, pages = await get_full_text_and_all_PDFs_chunks(
        listaPDFs, summarizer.splitter, serializer["should_use_llama_parse"]
    )

    chunks_passados, is_contextualized_chunk = (
        await contextualize_chunk_based_on_serializer(
            serializer, summarizer.contextual_retriever, pages, allPdfsChunks
        )
    )

    # Create the enhanced vector store and BM25 index.
    vector_store, bm25, chunk_ids = (
        summarizer.vector_store.create_enhanced_vector_store(
            chunks_passados, is_contextualized_chunk
        )
    )

    # Generate the enhanced summary.
    # `serializer["user_message"]` is currently not passed to this call.
    structured_summaries = await summarizer.generate_enhanced_summary(
        vector_store,
        bm25,
        chunk_ids,
        prompt_auxiliar_SEM_CONTEXT,
    )

    if not isinstance(structured_summaries, list):
        # Imported lazily so the module itself does not require DRF at import.
        from rest_framework.response import Response

        return Response({"erro": structured_summaries})

    # Join once instead of growing the string inside a loop.
    texto_completo = (
        summarizer.resumo_gerado
        + "\n\n"
        + "".join(x["content"] + "\n" for x in structured_summaries)
    )

    print("\n\ntexto_completo[0: 1000]: ", texto_completo[0:1000])

    return {
        "resultado": structured_summaries,
        "texto_completo": texto_completo,
        "parametros-utilizados": gerar_resposta_compilada(serializer),
    }