import os

from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import SequentialChain, TransformChain
from rest_framework.response import Response
from ragas import evaluate

from _utils.resumo_completo_cursor import EnhancedDocumentSummarizer, RetrievalConfig
def test_ragas(serializer, listaPDFs):
    """Builds the summarization pipeline and runs a RAGAS evaluation pass."""
    # Step 1: Set up RetrievalConfig and EnhancedDocumentSummarizer.
    config = RetrievalConfig(
        num_chunks=serializer["num_chunks_retrieval"],
        embedding_weight=serializer["embedding_weight"],
        bm25_weight=serializer["bm25_weight"],
        context_window=serializer["context_window"],
        chunk_overlap=serializer["chunk_overlap"],
    )
    summarizer = EnhancedDocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
        config=config,
        embedding_model=serializer["hf_embedding"],
        chunk_overlap=serializer["chunk_overlap"],
        chunk_size=serializer["chunk_size"],
        num_k_rerank=serializer["num_k_rerank"],
        model_cohere_rerank=serializer["model_cohere_rerank"],
        claude_context_model=serializer["claude_context_model"],
        prompt_relatorio=serializer["prompt_relatorio"],
        gpt_model=serializer["model"],
        gpt_temperature=serializer["gpt_temperature"],
        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
        prompt_modelo=serializer["prompt_modelo"],
    )
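    # For orientation, a minimal standalone use of the summarizer (a sketch,
    # assuming the _utils helpers behave as their names and the calls below
    # suggest) would look like:
    #
    #   chunks = summarizer.load_and_split_document("some_file.pdf")
    #   vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(chunks)
    #   summaries = summarizer.generate_enhanced_summary(
    #       vector_store, bm25, chunk_ids, "Summarize the key points."
    #   )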
    # Step 2: Define the pipeline components.
    def load_and_split_documents(pdf_list, summarizer):
        """Loads and splits PDF documents into chunks."""
        all_chunks = []
        for pdf_path in pdf_list:
            chunks = summarizer.load_and_split_document(pdf_path)
            all_chunks.extend(chunks)
        return {"chunks": all_chunks}
    def get_full_text_from_pdfs(pdf_list):
        """Gets the full text from the PDFs for contextualization."""
        full_text = []
        for pdf_path in pdf_list:
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
            text = " ".join([page.page_content for page in pages])
            full_text.append(text)
        return {"full_text": " ".join(full_text)}
    def contextualize_all_chunks(full_text, chunks, contextual_retriever):
        """Adds context to chunks using Claude."""
        contextualized_chunks = contextual_retriever.contextualize_all_chunks(
            full_text, chunks
        )
        return {"contextualized_chunks": contextualized_chunks}
    def create_vector_store(contextualized_chunks, summarizer):
        """Creates an enhanced vector store and BM25 index."""
        vector_store, bm25, chunk_ids = summarizer.create_enhanced_vector_store(
            contextualized_chunks
        )
        return {"vector_store": vector_store, "bm25": bm25, "chunk_ids": chunk_ids}
    def generate_summary(vector_store, bm25, chunk_ids, query, summarizer):
        """Generates an enhanced summary using the vector store and BM25 index."""
        structured_summaries = summarizer.generate_enhanced_summary(
            vector_store, bm25, chunk_ids, query
        )
        return {"structured_summaries": structured_summaries}
    # Step 3: Compose the pipeline as a SequentialChain. SequentialChain expects
    # Chain objects rather than bare callables, so each step is wrapped in a
    # TransformChain that maps named inputs to named outputs.
    split_chain = TransformChain(
        input_variables=["pdf_list"],
        output_variables=["chunks"],
        transform=lambda inputs: load_and_split_documents(inputs["pdf_list"], summarizer),
    )
    full_text_chain = TransformChain(
        input_variables=["pdf_list"],
        output_variables=["full_text"],
        transform=lambda inputs: get_full_text_from_pdfs(inputs["pdf_list"]),
    )
    contextualize_chain = TransformChain(
        input_variables=["full_text", "chunks"],
        output_variables=["contextualized_chunks"],
        transform=lambda inputs: contextualize_all_chunks(
            inputs["full_text"], inputs["chunks"], summarizer.contextual_retriever
        ),
    )
    vector_store_chain = TransformChain(
        input_variables=["contextualized_chunks"],
        output_variables=["vector_store", "bm25", "chunk_ids"],
        transform=lambda inputs: create_vector_store(inputs["contextualized_chunks"], summarizer),
    )
    summary_chain = TransformChain(
        input_variables=["vector_store", "bm25", "chunk_ids", "user_message"],
        output_variables=["structured_summaries"],
        transform=lambda inputs: generate_summary(
            inputs["vector_store"], inputs["bm25"], inputs["chunk_ids"],
            inputs["user_message"], summarizer,
        ),
    )
    chain = SequentialChain(
        chains=[split_chain, full_text_chain, contextualize_chain,
                vector_store_chain, summary_chain],
        input_variables=["pdf_list", "user_message"],
        output_variables=["structured_summaries"],
    )
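    # Given the declared variables, the whole pipeline can then be driven
    # with a single dict, e.g.:
    #   output = chain.invoke({"pdf_list": listaPDFs, "user_message": "..."})
    #   output["structured_summaries"]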
    # Step 4: Evaluate with RAGAS.
    from ragas.metrics import (
        LLMContextRecall,
        Faithfulness,
        FactualCorrectness,
        SemanticSimilarity,
    )
    from ragas.llms import LangchainLLMWrapper
    from langchain_openai import ChatOpenAI

    # RAGAS needs a judge LLM (an actual chat model); wrapping the RAG chain
    # itself in LangchainLLMWrapper would fail, since a chain is not a language model.
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
    # from ragas.embeddings import LangchainEmbeddingsWrapper
    # evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
    from datasets import load_dataset

    dataset = load_dataset(
        "explodinggradients/amnesty_qa", "english_v3", trust_remote_code=True
    )

    from ragas import EvaluationDataset

    eval_dataset = EvaluationDataset.from_hf_dataset(dataset["eval"])
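    # Each evaluation sample is expected to carry user_input, retrieved_contexts,
    # response and reference fields (the current RAGAS schema); the english_v3
    # split of amnesty_qa is published in roughly that shape.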
    metrics = [
        LLMContextRecall(llm=evaluator_llm),
        FactualCorrectness(llm=evaluator_llm),
        Faithfulness(llm=evaluator_llm),
        # SemanticSimilarity(embeddings=evaluator_embeddings),
    ]
    results = evaluate(dataset=eval_dataset, metrics=metrics)
    print("results: ", results)
    # Step 5: Run the chain (currently disabled; the RAGAS results are returned instead).
    inputs = {
        "pdf_list": listaPDFs,
        "user_message": serializer["user_message"],
    }
    # result = chain.invoke(inputs)

    # EvaluationResult is not JSON-serializable, so return its score summary as a string.
    return Response({"msg": str(results)})
    # Step 6: Format the output (kept for when the chain run above is re-enabled).
    # return {
    #     "resultado": result["structured_summaries"],
    #     "parametros-utilizados": {
    #         "num_chunks_retrieval": serializer["num_chunks_retrieval"],
    #         "embedding_weight": serializer["embedding_weight"],
    #         "bm25_weight": serializer["bm25_weight"],
    #         "context_window": serializer["context_window"],
    #         "chunk_overlap": serializer["chunk_overlap"],
    #         "num_k_rerank": serializer["num_k_rerank"],
    #         "model_cohere_rerank": serializer["model_cohere_rerank"],
    #         "more_initial_chunks_for_reranking": serializer["more_initial_chunks_for_reranking"],
    #         "claude_context_model": serializer["claude_context_model"],
    #         "gpt_temperature": serializer["gpt_temperature"],
    #         "user_message": serializer["user_message"],
    #         "model": serializer["model"],
    #         "hf_embedding": serializer["hf_embedding"],
    #         "chunk_size": serializer["chunk_size"],
    #         "prompt_relatorio": serializer["prompt_relatorio"],
    #         "prompt_modelo": serializer["prompt_modelo"],
    #     },
    # }