Implemented processing of a dataset through the LangGraph, along with evaluation rules. This allows the LangGraph's outputs to be compared against RAGAS runs on the same dataset.
- experiments/README.md +45 -0
- experiments/evaluate_on_dataset.py +78 -0
- experiments/evaluate_predictions.py +67 -0
experiments/README.md
ADDED
# Experiments: Synthetic Data Generation & Evaluation

This folder contains scripts for running batch experiments and evaluations on your RAG pipeline using LangSmith.

## Contents

- `evaluate_on_dataset.py`: Runs your RAG pipeline on all questions in the LangSmith dataset and logs predictions.
- `evaluate_predictions.py`: Runs automated evaluation (Correctness, Helpfulness, Dopeness) on predictions using LangSmith evaluators.

## Prerequisites

- Python 3.10+
- All project dependencies installed (see project root requirements)
- API keys set as environment variables:
  - `OPENAI_API_KEY`
  - `LANGCHAIN_API_KEY`
- (Optional) **Vectorstore location:**
  - `VECTORSTORE_PATH` (default: `/tmp/vectorstore`)
- **LangSmith Tracing:**
  - `LANGCHAIN_TRACING_V2` (must be set to `true` to enable tracing in LangSmith)

## Usage

1. **Run the RAG pipeline and log predictions:**
   ```sh
   export OPENAI_API_KEY=sk-...
   export LANGCHAIN_API_KEY=ls-...
   export LANGCHAIN_TRACING_V2=true
   export VECTORSTORE_PATH=/tmp/vectorstore  # or your preferred path
   python evaluate_on_dataset.py
   ```
   This will process all questions in the LangSmith dataset and log your app's predictions.

2. **Run evaluation on predictions:**
   ```sh
   python evaluate_predictions.py
   ```
   This will score your predictions for correctness, helpfulness, and dopeness, and log results to LangSmith.

3. **View Results:**
   - Go to your [LangSmith dashboard](https://smith.langchain.com/) and open the relevant project/dataset to see experiment results and metrics.

## Notes

- Make sure your dataset name matches between the scripts and LangSmith (a quick check is sketched just after this README).
- You can rerun these scripts as you update your pipeline or data.
- The vectorstore will be stored in `/tmp/vectorstore` by default, which is suitable for cloud environments like Hugging Face Spaces. Set `VECTORSTORE_PATH` if you want to use a different location.
- **Tracing:** Setting `LANGCHAIN_TRACING_V2=true` is required for detailed trace logging in LangSmith. Without this, traces will not appear in your LangSmith dashboard.
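For the dataset-name note above, here is a minimal sketch of a quick check (assuming `LANGCHAIN_API_KEY` is already set; the dataset name mirrors the one used by both scripts):

```python
# Quick sanity check that the dataset name used by the scripts exists in LangSmith.
from langsmith import Client

client = Client()

# List every dataset visible to this API key...
for ds in client.list_datasets():
    print(ds.name, ds.id)

# ...or look the expected dataset up directly; this raises if the name does not match exactly.
dataset = client.read_dataset(dataset_name="State of AI Across the Years!")
print(dataset.id)
```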
experiments/evaluate_on_dataset.py
ADDED
import os
from dotenv import load_dotenv
from langsmith import Client
from graph.types import SDGState
from graph.build_graph import build_sdg_graph
from preprocess.embed_documents import create_or_load_vectorstore
from preprocess.html_to_documents import extract_documents_from_html
from langchain_openai import ChatOpenAI
from pathlib import Path
import pickle


load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))

# --- CONFIG ---
DATASET_NAME = "State of AI Across the Years!"
PROJECT_NAME = "State of AI Across the Years!"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY")

# --- SETUP ENV ---
os.environ["LANGCHAIN_PROJECT"] = PROJECT_NAME
if LANGCHAIN_API_KEY:
    os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# --- LOAD DOCUMENTS & VECTORSTORE ---
def load_docs():
    """Load cached documents, or extract them from the HTML sources and cache the result."""
    output_file = Path("generated/documents.pkl")
    if output_file.exists():
        with open(output_file, "rb") as f:
            return pickle.load(f)
    # Fallback: extract from HTML
    docs = []
    data_dir = Path("data")
    for html_file in data_dir.glob("*.html"):
        docs.extend(extract_documents_from_html(str(html_file), label=html_file.stem))
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "wb") as f:
        pickle.dump(docs, f)
    return docs


def main():
    # Load dataset from LangSmith
    client = Client()
    dataset = client.read_dataset(dataset_name=DATASET_NAME)
    # read_dataset returns a Dataset object, so access the id as an attribute
    examples = client.list_examples(dataset_id=dataset.id)

    # Load docs/vectorstore
    docs = load_docs()
    vectorstore_path = os.environ.get("VECTORSTORE_PATH", "/tmp/vectorstore")
    vectorstore = create_or_load_vectorstore(docs, path=vectorstore_path)
    llm = ChatOpenAI()
    graph = build_sdg_graph(docs, vectorstore, llm)

    # For each example, run the graph and log the prediction
    for example in examples:
        question = example.inputs["question"]
        # Prepare initial state
        state = SDGState(input=question)
        result = graph.invoke(state)
        if not isinstance(result, SDGState):
            result = SDGState(**dict(result))
        # Log the prediction to LangSmith; the reference answer stays on the example
        # and is linked to this run via reference_example_id
        client.create_run(
            name="SDG App Run",
            run_type="chain",
            inputs={"question": question},
            outputs={"output": result.answer},
            reference_example_id=example.id,
            project_name=PROJECT_NAME,
        )
        print(f"Processed: {question}\n → {result.answer}\n")


if __name__ == "__main__":
    main()
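A possible alternative to the per-example loop above (not what this commit does) is to hand a target callable to `langsmith.evaluation.evaluate`, which iterates the dataset itself and records each prediction as part of a named experiment. A hedged sketch, reusing the helpers defined in `evaluate_on_dataset.py`; the `experiment_prefix` value is an arbitrary placeholder:

```python
# Alternative sketch: let LangSmith drive the dataset loop via `evaluate`.
# Reuses DATASET_NAME, SDGState, load_docs, create_or_load_vectorstore and
# build_sdg_graph exactly as defined in evaluate_on_dataset.py above.
import os
from langchain_openai import ChatOpenAI
from langsmith.evaluation import evaluate


def build_target():
    docs = load_docs()
    vectorstore = create_or_load_vectorstore(
        docs, path=os.environ.get("VECTORSTORE_PATH", "/tmp/vectorstore")
    )
    graph = build_sdg_graph(docs, vectorstore, ChatOpenAI())

    def run_sdg_app(inputs: dict) -> dict:
        # One dataset example in, one prediction out.
        state = SDGState(input=inputs["question"])
        result = graph.invoke(state)
        if not isinstance(result, SDGState):
            result = SDGState(**dict(result))
        return {"output": result.answer}

    return run_sdg_app


if __name__ == "__main__":
    evaluate(
        build_target(),
        data=DATASET_NAME,                    # the LangSmith dataset name used above
        experiment_prefix="sdg-app",          # placeholder experiment name prefix
        metadata={"source": "evaluate_on_dataset"},
    )
```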
experiments/evaluate_predictions.py
ADDED
import os
from dotenv import load_dotenv
from langsmith.evaluation import LangChainStringEvaluator, evaluate
from langchain_openai import ChatOpenAI


load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))

# --- CONFIG ---
DATASET_NAME = "State of AI Across the Years!"
PROJECT_NAME = "State of AI Across the Years!"
EVAL_LLM_MODEL = "gpt-4.1"  # Match the notebook's model if possible

# --- SETUP ENV ---
# load_dotenv above already populates os.environ; fail fast if required keys are missing.
for key in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY"):
    if key not in os.environ:
        raise EnvironmentError(f"{key} must be set in the environment or in .env")

# --- EVALUATORS ---
eval_llm = ChatOpenAI(model=EVAL_LLM_MODEL)

qa_evaluator = LangChainStringEvaluator("qa", config={"llm": eval_llm})

labeled_helpfulness_evaluator = LangChainStringEvaluator(
    "labeled_criteria",
    config={
        "criteria": {
            "helpfulness": (
                "Is this submission helpful to the user,"
                " taking into account the correct reference answer?"
            )
        },
        "llm": eval_llm,
    },
    prepare_data=lambda run, example: {
        "prediction": run.outputs["output"],
        "reference": example.outputs["answer"],
        "input": example.inputs["question"],
    },
)

dope_or_nope_evaluator = LangChainStringEvaluator(
    "criteria",
    config={
        "criteria": {
            "dopeness": "Is this submission dope, lit, or cool?",
        },
        "llm": eval_llm,
    },
)

# --- RUN EVALUATION ---
if __name__ == "__main__":
    print("Running evaluation on predictions in LangSmith...")
    results = evaluate(
        None,  # No target chain to run here; the goal is to evaluate the existing predictions
        data=DATASET_NAME,
        evaluators=[
            qa_evaluator,
            labeled_helpfulness_evaluator,
            dope_or_nope_evaluator,
        ],
        project_name=PROJECT_NAME,
        metadata={"source": "app_evaluation"},
    )
    print("Evaluation complete! View results in your LangSmith dashboard.")
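The commit message also mentions comparing these results against RAGAS runs on the same dataset, which neither script does yet. A minimal, hedged sketch of what such a pass might look like, assuming a ragas 0.1-style API and that you have collected, for each question, the graph's answer, its retrieved contexts, and the reference answer (all values below are placeholders):

```python
# Hedged sketch of a RAGAS pass over the same questions, for comparison with the
# LangSmith evaluator scores above. Assumes ragas 0.1-style metrics and column names.
from datasets import Dataset
from ragas import evaluate as ragas_evaluate
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness

records = {
    "question": ["<question from the LangSmith dataset>"],          # placeholder
    "answer": ["<answer produced by the LangGraph app>"],           # placeholder
    "contexts": [["<retrieved chunk 1>", "<retrieved chunk 2>"]],   # placeholder
    "ground_truth": ["<reference answer from the dataset>"],        # placeholder
}

ragas_result = ragas_evaluate(
    Dataset.from_dict(records),
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)
print(ragas_result)  # per-metric scores to set beside Correctness/Helpfulness/Dopeness
```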