import os
from _utils.gerar_relatorio_modelo_usuario.prompts import prompt_auxiliar_SEM_CONTEXT
from _utils.gerar_relatorio_modelo_usuario.EnhancedDocumentSummarizer import (
EnhancedDocumentSummarizer,
)
from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
ContextualRetriever,
)
from _utils.gerar_relatorio_modelo_usuario.utils import (
gerar_resposta_compilada,
get_full_text_and_all_PDFs_chunks,
get_response_from_auxiliar_contextual_prompt,
)
from _utils.models.gerar_relatorio import (
RetrievalConfig,
)


def reciprocal_rank_fusion(result_lists, weights=None):
    """Fuse multiple (doc_id, score) result lists into a single ranking.

    Despite the name, this sums the incoming scores weighted per list rather
    than applying the classic 1 / (k + rank) reciprocal-rank formula.
    """
    if weights is None:
        weights = [1.0] * len(result_lists)
    fused_scores = {}
    for weight, results in zip(weights, result_lists):
        for doc_id, score in results:
            fused_scores[doc_id] = fused_scores.get(doc_id, 0.0) + weight * score
    # Sort by fused score in descending order
    return sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
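

# A minimal reference sketch of classic reciprocal rank fusion, which scores
# documents by 1 / (k + rank) instead of their raw retrieval scores. It is
# not wired into the pipeline; the function name and the conventional k=60
# default are illustrative, not part of this codebase.
def classic_reciprocal_rank_fusion(result_lists, weights=None, k=60):
    if weights is None:
        weights = [1.0] * len(result_lists)
    fused_scores = {}
    for weight, results in zip(weights, result_lists):
        # rank is the 1-based position of the document within its list
        for rank, (doc_id, _score) in enumerate(results, start=1):
            fused_scores[doc_id] = fused_scores.get(doc_id, 0.0) + weight / (k + rank)
    return sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)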
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "VELLA"


async def get_llm_summary_answer_by_cursor_complete(
    serializer, listaPDFs, isBubble=False
):
    """The "contexto" parameter should only be passed when you want to run the
    ragas test and therefore do not want to pass PDFs."""
# Configuration
config = RetrievalConfig(
num_chunks=serializer["num_chunks_retrieval"],
embedding_weight=serializer["embedding_weight"],
bm25_weight=serializer["bm25_weight"],
context_window=serializer["context_window"],
chunk_overlap=serializer["chunk_overlap"],
)
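    # Retriever that enriches each chunk with document-level context before
    # indexing, using the Claude model configured in the serializer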
contextual_retriever = ContextualRetriever(
config, serializer["claude_context_model"]
)
# Initialize enhanced summarizer
summarizer = EnhancedDocumentSummarizer(
config=config,
embedding_model=serializer["hf_embedding"],
chunk_overlap=serializer["chunk_overlap"],
chunk_size=serializer["chunk_size"],
num_k_rerank=serializer["num_k_rerank"],
model_cohere_rerank=serializer["model_cohere_rerank"],
prompt_auxiliar=serializer["prompt_auxiliar"],
gpt_model=serializer["model"],
gpt_temperature=serializer["gpt_temperature"],
prompt_gerar_documento=serializer["prompt_gerar_documento"],
reciprocal_rank_fusion=reciprocal_rank_fusion,
)
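    # Split every PDF into chunks and keep the full text for the auxiliary
    # contextual prompt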
all_PDFs_chunks, full_text_as_array = await get_full_text_and_all_PDFs_chunks(
listaPDFs, summarizer.splitter, serializer["should_use_llama_parse"], isBubble
)
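    # Optionally rewrite each chunk with an LLM-generated context summary
    # before building the indexes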
is_contextualized_chunk = serializer["should_have_contextual_chunks"]
if is_contextualized_chunk:
response_auxiliar_summary = await get_response_from_auxiliar_contextual_prompt(
full_text_as_array
)
contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
all_PDFs_chunks, response_auxiliar_summary
)
chunks_processados = contextualized_chunks
else:
chunks_processados = all_PDFs_chunks
# Create enhanced vector store and BM25 index
vector_store, bm25, chunk_ids = (
summarizer.vector_store.create_enhanced_vector_store(
chunks_processados, is_contextualized_chunk
)
)
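    # LLM used for the final generation requests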
llm_ultimas_requests = serializer["llm_ultimas_requests"]
# Generate enhanced summary
structured_summaries = await summarizer.generate_enhanced_summary(
vector_store, bm25, chunk_ids, llm_ultimas_requests, prompt_auxiliar_SEM_CONTEXT
)
    if not isinstance(structured_summaries, list):
        # The summarizer returned an error message instead of a list of sections
        from rest_framework.response import Response

        return Response({"erro": structured_summaries})
    # Concatenate the generated sections into a single text
    texto_completo = summarizer.resumo_gerado + "\n\n"
    for summary in structured_summaries:
        texto_completo += summary["content"] + "\n"
    print("\n\ntexto_completo[0:1000]: ", texto_completo[0:1000])
return {
"resultado": structured_summaries,
"texto_completo": texto_completo,
"parametros-utilizados": gerar_resposta_compilada(serializer),
}
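
# Illustrative usage (assumed, not from this file): called from a DRF view
# with a validated serializer payload and a list of PDF locations.
# resultado = await get_llm_summary_answer_by_cursor_complete(
#     serializer.validated_data, listaPDFs, isBubble=False
# )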