luanpoppe
fix: adicionando opção de não utilizar o contextualized chunks temporariamente
1286e81
raw
history blame
6.79 kB
import os
from langchain_community.document_loaders import PyPDFLoader
from _utils.resumo_completo_cursor import EnhancedDocumentSummarizer, RetrievalConfig
from rest_framework.response import Response
from ragas import evaluate
from langchain.chains import SequentialChain
from langchain.prompts import PromptTemplate
# from langchain.schema import ChainResult
from langchain.memory import SimpleMemory
def test_ragas(serializer, listaPDFs):
    """Run a RAGAS metrics evaluation around the document-summarization pipeline.

    Builds an ``EnhancedDocumentSummarizer`` from the request parameters in
    ``serializer``, wires the summarization steps into a ``SequentialChain``,
    wraps that chain as the RAGAS evaluator, and evaluates it against the
    public ``explodinggradients/amnesty_qa`` dataset.

    Args:
        serializer: dict-like object holding tuning parameters
            (``num_chunks_retrieval``, ``embedding_weight``, ``bm25_weight``,
            ``chunk_size``, ``model``, prompts, etc.).
        listaPDFs: list of PDF file paths to summarize.

    Returns:
        DRF ``Response`` whose ``msg`` field carries the RAGAS evaluation
        results.
    """
    # Step 2: retrieval configuration and summarizer built from request params.
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )
    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_relatorio=serializer["prompt_relatorio"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
        prompt_modelo=serializer["prompt_modelo"],
    )

    # Step 1: pipeline components (each returns a dict keyed for the chain).
    def load_and_split_documents(pdf_list, summarizer):
        """Load every PDF and split it into chunks."""
        all_chunks = []
        for pdf_path in pdf_list:
            all_chunks.extend(summarizer.load_and_split_document(pdf_path))
        return {"chunks": all_chunks}

    def get_full_text_from_pdfs(pdf_list):
        """Concatenate the full text of all PDFs (input for contextualization)."""
        full_text = []
        for pdf_path in pdf_list:
            pages = PyPDFLoader(pdf_path).load()
            full_text.append(" ".join(page.page_content for page in pages))
        return {"full_text": " ".join(full_text)}

    def contextualize_all_chunks(full_text, chunks, contextual_retriever):
        """Add context to each chunk using Claude."""
        contextualized_chunks = contextual_retriever.contextualize_all_chunks(
            full_text, chunks
        )
        return {"contextualized_chunks": contextualized_chunks}

    def create_vector_store(contextualized_chunks, summarizer):
        """Create the enhanced vector store and the BM25 index."""
        vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
            contextualized_chunks
        )
        return {"vector_store": vector_store, "bm25": bm25, "chunk_ids": chunk_ids}

    def generate_summary(vector_store, bm25, chunk_ids, query, summarizer):
        """Generate structured summaries from the hybrid (vector + BM25) index."""
        structured_summaries = summarizer.generate_enhanced_summary(
            vector_store, bm25, chunk_ids, query
        )
        return {"structured_summaries": structured_summaries}

    # Step 3: sequential chain tying the components together.
    # NOTE(review): LangChain's SequentialChain validates that every element of
    # `chains` is a Chain instance; plain lambdas fail that validation at
    # construction time. Confirm whether this path is still exercised — it
    # likely needs porting to LCEL (e.g. RunnableLambda) to actually run.
    chain = SequentialChain(
        chains=[
            lambda inputs: load_and_split_documents(inputs["pdf_list"], summarizer),
            lambda inputs: get_full_text_from_pdfs(inputs["pdf_list"]),
            lambda inputs: contextualize_all_chunks(
                inputs["full_text"], inputs["chunks"], summarizer.contextual_retriever
            ),
            lambda inputs: create_vector_store(
                inputs["contextualized_chunks"], summarizer
            ),
            lambda inputs: generate_summary(
                inputs["vector_store"],
                inputs["bm25"],
                inputs["chunk_ids"],
                inputs["user_message"],
                summarizer,
            ),
        ],
        input_variables=["pdf_list", "user_message"],
        output_variables=["structured_summaries"],
    )

    # RAGAS evaluation setup. `evaluate` comes from the module-level
    # `from ragas import evaluate`; the duplicated in-function import and the
    # unused RagasEvaluatorChain import were removed.
    from ragas.metrics import (
        LLMContextRecall,
        Faithfulness,
        FactualCorrectness,
        SemanticSimilarity,  # kept for the commented-out embedding metric below
    )
    from ragas.llms import LangchainLLMWrapper

    # from ragas.embeddings import LangchainEmbeddingsWrapper
    # evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
    # NOTE(review): wrapping a SequentialChain as the evaluator "LLM" is
    # unusual — RAGAS normally expects a chat model here (see the commented
    # ChatOpenAI line above). Confirm this is intentional.
    evaluator_llm = LangchainLLMWrapper(chain)
    # evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

    from datasets import load_dataset

    dataset = load_dataset(
        "explodinggradients/amnesty_qa", "english_v3", trust_remote_code=True
    )

    from ragas import EvaluationDataset

    eval_dataset = EvaluationDataset.from_hf_dataset(dataset["eval"])
    metrics = [
        LLMContextRecall(llm=evaluator_llm),
        FactualCorrectness(llm=evaluator_llm),
        Faithfulness(llm=evaluator_llm),
        # SemanticSimilarity(embeddings=evaluator_embeddings)
    ]
    results = evaluate(dataset=eval_dataset, metrics=metrics)
    print("results: ", results)

    # Step 4: inputs for the (currently disabled) chain run below.
    inputs = {
        "pdf_list": listaPDFs,
        "user_message": serializer["user_message"],
    }
    # result = chain.run(inputs)
    return Response({"msg": results})

    # Step 5: Format the Output (kept as a reference for the intended shape
    # of the response once chain.run is re-enabled).
    # return {
    #     "resultado": result["structured_summaries"],
    #     "parametros-utilizados": {
    #         "num_chunks_retrieval": serializer["num_chunks_retrieval"],
    #         "embedding_weight": serializer["embedding_weight"],
    #         "bm25_weight": serializer["bm25_weight"],
    #         "context_window": serializer["context_window"],
    #         "chunk_overlap": serializer["chunk_overlap"],
    #         "num_k_rerank": serializer["num_k_rerank"],
    #         "model_cohere_rerank": serializer["model_cohere_rerank"],
    #         "more_initial_chunks_for_reranking": serializer["more_initial_chunks_for_reranking"],
    #         "claude_context_model": serializer["claude_context_model"],
    #         "gpt_temperature": serializer["gpt_temperature"],
    #         "user_message": serializer["user_message"],
    #         "model": serializer["model"],
    #         "hf_embedding": serializer["hf_embedding"],
    #         "chunk_size": serializer["chunk_size"],
    #         "chunk_overlap": serializer["chunk_overlap"],
    #         "prompt_relatorio": serializer["prompt_relatorio"],
    #         "prompt_modelo": serializer["prompt_modelo"],
    #     },
    # }