ric9176 committed
Commit b008f13 · 1 Parent(s): 58b3429

Add evaluate_rag.py

Files changed (1)
  1. evaluate_rag.py +135 -0
evaluate_rag.py ADDED
@@ -0,0 +1,135 @@
import os
import nltk
from typing import List
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity
)
from ragas import evaluate, RunConfig, EvaluationDataset

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain.prompts import ChatPromptTemplate
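
# Note: ChatOpenAI and OpenAIEmbeddings read the OPENAI_API_KEY environment
# variable, so it must be set before running this script.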

# Initialize the URLs (same as app.py)
urls = [
    "https://www.timeout.com/london/things-to-do-in-london-this-weekend",
    "https://www.timeout.com/london/london-events-in-march"
]

# Load documents
loader = WebBaseLoader(urls)
docs = loader.load()
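
# The pages are fetched live, so the scraped content (and with it the
# generated test set) can differ between runs.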

# Initialize generator models for RAGAS
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# Generate synthetic test dataset
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)
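
# Test set generation makes many LLM calls and is typically the slowest,
# most expensive step in this script; testset_size=10 keeps it manageable.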

# Print the generated test questions
print("\nGenerated Test Questions:")
for i, row in dataset.to_pandas().iterrows():
    print(f"{i+1}. {row['user_input']}")  # ragas stores the question in 'user_input'

# Set up the RAG pipeline for testing
# Split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(docs)

# Create vector store
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
client = QdrantClient(":memory:")
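
# text-embedding-3-small produces 1536-dimensional vectors by default,
# which must match the vector size configured for the collection below.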

client.create_collection(
    collection_name="london_events",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name="london_events",
    embedding=embeddings,
)

# Add documents to vector store
vector_store.add_documents(documents=split_documents)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
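
# k=5 sets the retrieval depth; it feeds directly into the context-dependent
# metrics evaluated below (context recall, entity recall, noise sensitivity).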

# Create RAG prompt
RAG_PROMPT = """
You are a helpful assistant who answers questions about events and activities in London.
Answer based only on the provided context. If you cannot find the answer, say so.

Question: {question}

Context: {context}

Answer:"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
llm = ChatOpenAI(model="gpt-4")

# Process each test question through the RAG pipeline
for test_row in dataset:
    # Retrieve relevant documents (invoke() supersedes the deprecated
    # get_relevant_documents())
    retrieved_docs = retriever.invoke(test_row.eval_sample.user_input)

    # Format context and generate response
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)
    messages = rag_prompt.format_messages(question=test_row.eval_sample.user_input, context=context)
    response = llm.invoke(messages)

    # Store results in dataset
    test_row.eval_sample.response = response.content
    test_row.eval_sample.retrieved_contexts = [doc.page_content for doc in retrieved_docs]
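
# Filling in response and retrieved_contexts gives ragas everything its
# metrics compare: the question, the retrieved context, the generated
# answer, and the reference answer created during test set generation.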

# Convert to evaluation dataset
evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

# Set up evaluator
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4"))
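
# A generous per-call timeout: each metric issues several judge-LLM calls
# per sample, and the default RunConfig timeout can be too short for gpt-4.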

# Run evaluation with all metrics
custom_run_config = RunConfig(timeout=360)

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity()
    ],
    llm=evaluator_llm,
    run_config=custom_run_config
)

# Print results
print("\nEvaluation Results:")
for metric, score in result.items():
    print(f"{metric}: {score:.4f}")

# Save results to file
os.makedirs("docs", exist_ok=True)  # make sure the output directory exists
with open("docs/evaluation_results.txt", "w") as f:
    f.write("RAG System Evaluation Results\n")
    f.write("=============================\n\n")
    f.write("Test Dataset Size: 10 questions\n\n")
    f.write("Metric Scores:\n")
    for metric, score in result.items():
        f.write(f"{metric}: {score:.4f}\n")