Update test output formats
- app.py +1 -1
- docs/evaluation_results.csv +0 -0
- docs/evaluation_results.md +0 -0
- docs/evaluation_results.txt +24 -0
- docs/test_questions.md +12 -0
- evaluate_rag.py +43 -17
- rag.py +1 -1
app.py
CHANGED
@@ -41,7 +41,7 @@ def retrieve_context(query: str) -> list[str]:
 tavily_tool = TavilySearchResults(max_results=5)
 tool_belt = [tavily_tool, retrieve_context]

-llm = ChatOpenAI(model="gpt-4o", temperature=0)
+llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
 model = llm.bind_tools(tool_belt)

 # Define system prompt
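For context on the hunk above: bind_tools wraps the chat model so it can request either tool in tool_belt. A minimal sketch of how the bound model is typically invoked downstream (the invoke call and sample message are illustrative assumptions, not part of this commit):

from langchain_core.messages import HumanMessage

# `model` is the tool-bound chat model from app.py above
response = model.invoke([HumanMessage(content="What's on at the Southbank Centre this weekend?")])

# If the model chose to call a tool (tavily_tool or retrieve_context),
# the requests appear as structured tool calls on the AI message
for tool_call in response.tool_calls:
    print(tool_call["name"], tool_call["args"])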
docs/evaluation_results.csv
ADDED
The diff for this file is too large to render. See raw diff.

docs/evaluation_results.md
ADDED
The diff for this file is too large to render. See raw diff.
docs/evaluation_results.txt
ADDED
@@ -0,0 +1,24 @@
+RAG System Evaluation Results
+==========================
+
+Test Dataset Size: 10 questions
+
+Test Questions:
+---------------
+1. Southbank Centre events this weekend
+2. What is the Imagine Festival at the Southbank Centre?
+3. What event is High Vis supporting at the Crystal Palace Bowl?
+4. What can you do for £24 in London?
+5. How can I get newsletters about the Chelsea Physic Garden's snowdrop festival and other events in London?
+6. Chelsea Physic Garden got what event this weekend?
+7. What activities are available at the Chelsea Physic Garden as mentioned in the newsletters?
+8. How can I find out about music concerts like the High Vis’s Electric Brixton show in London?
+9. What events in London in March 2025?
+10. What are some of the key events and activities planned in London for March 2025?
+11. What are some of the events and exhibitions happening in London this weekend that feature the work of Donald Rodney?
+12. What are some of the activities and events available at the Horniman Museum during the February half-term weekend for a tourist visiting London?
+
+Metric Scores:
+-------------
+| Metric | Score |
+|---------------------------|-------|
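The committed report pairs a plain-text question list with a pipe table of metric scores (the table here ends at its header row). A minimal sketch of writing that layout from a pandas DataFrame of scores; the write_report function, its signature, and the 'metric'/'score' column names are illustrative assumptions, not from the repo:

import pandas as pd

def write_report(df: pd.DataFrame, questions: list[str], path: str = "docs/evaluation_results.txt") -> None:
    # Mirrors the layout of the committed evaluation_results.txt
    with open(path, "w") as f:
        f.write("RAG System Evaluation Results\n")
        f.write("==========================\n\n")
        f.write(f"Test Dataset Size: {len(questions)} questions\n\n")
        f.write("Test Questions:\n")
        f.write("---------------\n")
        for i, q in enumerate(questions, start=1):
            f.write(f"{i}. {q}\n")
        f.write("\nMetric Scores:\n")
        f.write("-------------\n")
        f.write("| Metric | Score |\n")
        f.write("|---------------------------|-------|\n")
        for _, row in df.iterrows():
            f.write(f"| {row['metric']:<25} | {row['score']:.3f} |\n")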
docs/test_questions.md
ADDED
@@ -0,0 +1,12 @@
+# Test Questions
+
+1. What activities or events can one attend at Alexandra Palace during the weekend in London?
+2. What are some of the events happening in London this weekend?
+3. What can visitors expect to see at The Courtauld Gallery?
+4. Who are the creators of the play 'Kyoto' and what is it about?
+5. What's at Sea Containers?
+6. What are some workshops and cultural events happening in London in March 2025 that tourists or locals can attend?
+7. What are some of the art and culture workshops and events that tourists or locals can attend in London in March 2025?
+8. What queer cinema event is happening in London in March 2025?
+9. Wher can I watch the best new queer cinema in London in March 2025?
+10. What festivals and events in London can I go to in March 2025?
evaluate_rag.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+from dotenv import load_dotenv
 import nltk
 from typing import List
 from ragas.llms import LangchainLLMWrapper
@@ -14,6 +15,16 @@ from ragas.metrics import (
 )
 from ragas import evaluate, RunConfig, EvaluationDataset

+# Load environment variables
+load_dotenv()
+
+# Ensure OpenAI API key is set
+if not os.getenv("OPENAI_API_KEY"):
+    raise ValueError("OPENAI_API_KEY not found in environment variables")
+
+os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
+os.environ["RAGAS_APP_TOKEN"] = os.getenv("RAGAS_APP_TOKEN")
+
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langchain_community.document_loaders import WebBaseLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -40,10 +51,12 @@ generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
 generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
 dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

+print(dataset.upload())
+
 # Print the generated test questions
 print("\nGenerated Test Questions:")
-for i,
-print(f"{i+1}. {
+for i, test_row in enumerate(dataset):
+    print(f"{i+1}. {test_row.eval_sample.user_input}")

 # Set up the RAG pipeline for testing
 # Split documents
@@ -81,7 +94,7 @@ Context: {context}
 Answer:"""

 rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
-llm = ChatOpenAI(model="gpt-
+llm = ChatOpenAI(model="gpt-4o-mini")

 # Process each test question through the RAG pipeline
 for test_row in dataset:
@@ -100,8 +113,9 @@ for test_row in dataset:
 # Convert to evaluation dataset
 evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

+
 # Set up evaluator
-evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-
+evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

 # Run evaluation with all metrics
 custom_run_config = RunConfig(timeout=360)
@@ -120,16 +134,28 @@ result = evaluate(
 run_config=custom_run_config
 )

-
-print("
-
-
-
-
-
-
-
-
-
-
+print("RAW RESULT: ", result)
+print("Type of result: ", type(result))
+
+# Convert to pandas DataFrame for better formatting
+df = result.to_pandas()
+print("\nEvaluation Results as DataFrame:")
+print(df)
+
+try:
+    # Try to save as markdown
+    print("Attempting to save as markdown...")
+    import tabulate  # Try to import explicitly to verify installation
+    df.to_markdown("docs/evaluation_results.md", index=False)
+    print("Successfully saved as markdown!")
+except ImportError as e:
+    # Print detailed error message
+    print(f"Import Error: {e}")
+    print("Note: Install 'tabulate' package for markdown output. Falling back to CSV format.")
+    df.to_csv("docs/evaluation_results.csv", index=False)
+
+# Save test questions
+with open("docs/test_questions.md", "w") as f:
+    f.write("# Test Questions\n\n")
+    for i, test_row in enumerate(dataset):
+        f.write(f"{i+1}. {test_row.eval_sample.user_input}\n")
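The new dotenv bootstrap in evaluate_rag.py reads two variables from a local .env file. A minimal sketch of that file (values are placeholders; only the variable names come from the diff):

OPENAI_API_KEY=sk-your-openai-key
RAGAS_APP_TOKEN=your-ragas-app-token

Note that only OPENAI_API_KEY is guarded: if RAGAS_APP_TOKEN is absent, os.environ["RAGAS_APP_TOKEN"] = os.getenv("RAGAS_APP_TOKEN") assigns None and raises a TypeError.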
rag.py
CHANGED
@@ -44,7 +44,7 @@ def create_rag_pipeline(collection_name: str = "rag_collection"):

     # Create text splitter for chunking
     def tiktoken_len(text):
-        tokens = tiktoken.encoding_for_model("gpt-
+        tokens = tiktoken.encoding_for_model("gpt-4o-mini").encode(text)
         return len(tokens)

     text_splitter = RecursiveCharacterTextSplitter(
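A sketch of how a token-based length function like the repaired tiktoken_len typically plugs into the splitter; the chunk sizes and sample call are illustrative assumptions, and encoding_for_model("gpt-4o-mini") needs a tiktoken release that knows the gpt-4o model family:

import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

def tiktoken_len(text: str) -> int:
    # Count tokens with the model's own encoding so chunk sizes
    # line up with what gpt-4o-mini actually sees
    tokens = tiktoken.encoding_for_model("gpt-4o-mini").encode(text)
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,        # interpreted in tokens because of length_function
    chunk_overlap=50,
    length_function=tiktoken_len,
)
chunks = text_splitter.split_text("some long document text ...")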