import os

from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import SequentialChain, TransformChain
from langchain_openai import ChatOpenAI
from rest_framework.response import Response

from _utils.resumo_completo_cursor import EnhancedDocumentSummarizer, RetrievalConfig

def test_ragas(serializer, listaPDFs):
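    """Builds the RAG pipeline over `listaPDFs` and scores a RAGAS benchmark.

    `serializer` is a dict-like payload carrying the retrieval and generation
    parameters read below; `listaPDFs` is a list of PDF file paths.
    """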
    # Step 1: set up the retrieval configuration and the document summarizer
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )
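    # embedding_weight and bm25_weight presumably balance dense-vector
    # similarity against BM25 lexical scores in the hybrid retrieval step.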

    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_relatorio=serializer["prompt_relatorio"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
        prompt_modelo=serializer["prompt_modelo"],
    )
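    # For illustration, a hypothetical payload (all values made up; only the
    # keys are the ones this function actually reads):
    #   {
    #       "num_chunks_retrieval": 10, "embedding_weight": 0.5,
    #       "bm25_weight": 0.5, "context_window": 3, "chunk_overlap": 200,
    #       "chunk_size": 1000, "num_k_rerank": 5,
    #       "model_cohere_rerank": "rerank-multilingual-v3.0",
    #       "claude_context_model": "claude-3-haiku-20240307",
    #       "model": "gpt-4o-mini", "gpt_temperature": 0.0,
    #       "hf_embedding": "sentence-transformers/all-MiniLM-L6-v2",
    #       "id_modelo_do_usuario": 1, "user_message": "Summarize the case.",
    #       "prompt_relatorio": "...", "prompt_modelo": "...",
    #   }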

    # Step 2: define the pipeline components
    def load_and_split_documents(pdf_list, summarizer):
        """Loads and splits PDF documents into chunks."""
        all_chunks = []
        for pdf_path in pdf_list:
            chunks = summarizer.load_and_split_document(pdf_path)
            all_chunks.extend(chunks)
        return {"chunks": all_chunks}

    def get_full_text_from_pdfs(pdf_list):
        """Gets the full text from PDFs for contextualization."""
        full_text = []
        for pdf_path in pdf_list:
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
            text = " ".join([page.page_content for page in pages])
            full_text.append(text)
        return {"full_text": " ".join(full_text)}

    def contextualize_all_chunks(full_text, chunks, contextual_retriever):
        """Adds context to chunks using Claude."""
        contextualized_chunks = contextual_retriever.contextualize_all_chunks(
            full_text, chunks
        )
        return {"contextualized_chunks": contextualized_chunks}

    def create_vector_store(contextualized_chunks, summarizer):
        """Creates an enhanced vector store and BM25 index."""
        vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
            contextualized_chunks
        )
        return {"vector_store": vector_store, "bm25": bm25, "chunk_ids": chunk_ids}

    def generate_summary(vector_store, bm25, chunk_ids, query, summarizer):
        """Generates an enhanced summary using the vector store and BM25 index."""
        structured_summaries = summarizer.generate_enhanced_summary(
            vector_store, bm25, chunk_ids, query
        )
        return {"structured_summaries": structured_summaries}

    # Step 3: assemble the sequential chain. SequentialChain only accepts
    # Chain objects, so each step is wrapped in a TransformChain instead of
    # being passed as a bare lambda.
    chain = SequentialChain(
        chains=[
            TransformChain(
                input_variables=["pdf_list"],
                output_variables=["chunks"],
                transform=lambda i: load_and_split_documents(i["pdf_list"], summarizer),
            ),
            TransformChain(
                input_variables=["pdf_list"],
                output_variables=["full_text"],
                transform=lambda i: get_full_text_from_pdfs(i["pdf_list"]),
            ),
            TransformChain(
                input_variables=["full_text", "chunks"],
                output_variables=["contextualized_chunks"],
                transform=lambda i: contextualize_all_chunks(
                    i["full_text"], i["chunks"], summarizer.contextual_retriever
                ),
            ),
            TransformChain(
                input_variables=["contextualized_chunks"],
                output_variables=["vector_store", "bm25", "chunk_ids"],
                transform=lambda i: create_vector_store(
                    i["contextualized_chunks"], summarizer
                ),
            ),
            TransformChain(
                input_variables=["vector_store", "bm25", "chunk_ids", "user_message"],
                output_variables=["structured_summaries"],
                transform=lambda i: generate_summary(
                    i["vector_store"], i["bm25"], i["chunk_ids"], i["user_message"], summarizer
                ),
            ),
        ],
        input_variables=["pdf_list", "user_message"],
        output_variables=["structured_summaries"],
    )
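    # Sketch of a direct invocation (not executed in this test, which scores a
    # public benchmark instead; see `inputs` below):
    #   outputs = chain.invoke(
    #       {"pdf_list": listaPDFs, "user_message": serializer["user_message"]}
    #   )
    #   summaries = outputs["structured_summaries"]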

    from ragas import evaluate
    from ragas.llms import LangchainLLMWrapper
    from ragas.metrics import (
        LLMContextRecall,
        Faithfulness,
        FactualCorrectness,
        SemanticSimilarity,
    )

    # from ragas.embeddings import LangchainEmbeddingsWrapper

    # The RAGAS judge must be a language model, not a chain, so the summarizer
    # pipeline above cannot be wrapped directly here.
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
    # evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
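    # SemanticSimilarity is embedding-based rather than LLM-judged; enabling it
    # in the metrics list below would also require uncommenting the embeddings
    # wrapper above and importing OpenAIEmbeddings from langchain_openai.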

    from datasets import load_dataset

    dataset = load_dataset(
        "explodinggradients/amnesty_qa", "english_v3", trust_remote_code=True
    )

    from ragas import EvaluationDataset

    eval_dataset = EvaluationDataset.from_hf_dataset(dataset["eval"])
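    # Each sample in this split carries a question, its retrieved contexts, a
    # generated answer, and a reference answer, which is the shape the metrics
    # below expect.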
    metrics = [
        LLMContextRecall(llm=evaluator_llm),
        FactualCorrectness(llm=evaluator_llm),
        Faithfulness(llm=evaluator_llm),
        # SemanticSimilarity(embeddings=evaluator_embeddings)
    ]
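    # LLMContextRecall checks whether the reference answer is covered by the
    # retrieved contexts, FactualCorrectness compares response claims against
    # the reference, and Faithfulness checks the response against its contexts.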
    results = evaluate(dataset=eval_dataset, metrics=metrics)
    print("results: ", results)

    # Step 4: inputs for a direct run of the chain (currently unused; the
    # evaluation above scores the public dataset instead)
    inputs = {
        "pdf_list": listaPDFs,
        "user_message": serializer["user_message"],
    }
    # result = chain.run(inputs)

    return Response({"msg": results})

    # Step 5: format the output
    # return {
    #     "resultado": result["structured_summaries"],
    #     "parametros-utilizados": {
    #         "num_chunks_retrieval": serializer["num_chunks_retrieval"],
    #         "embedding_weight": serializer["embedding_weight"],
    #         "bm25_weight": serializer["bm25_weight"],
    #         "context_window": serializer["context_window"],
    #         "chunk_overlap": serializer["chunk_overlap"],
    #         "num_k_rerank": serializer["num_k_rerank"],
    #         "model_cohere_rerank": serializer["model_cohere_rerank"],
    #         "more_initial_chunks_for_reranking": serializer["more_initial_chunks_for_reranking"],
    #         "claude_context_model": serializer["claude_context_model"],
    #         "gpt_temperature": serializer["gpt_temperature"],
    #         "user_message": serializer["user_message"],
    #         "model": serializer["model"],
    #         "hf_embedding": serializer["hf_embedding"],
    #         "chunk_size": serializer["chunk_size"],
    #         "chunk_overlap": serializer["chunk_overlap"],
    #         "prompt_relatorio": serializer["prompt_relatorio"],
    #         "prompt_modelo": serializer["prompt_modelo"],
    #     },
    # }