mwalker22 committed · Commit 5b1bd96 · 1 Parent(s): 1371d47

Implemented processing of a dataset through the LangGraph, along with evaluation rules. This allows comparing the LangGraph's output against RAGAS runs on the same dataset.

experiments/README.md ADDED
@@ -0,0 +1,45 @@
+ # Experiments: Synthetic Data Generation & Evaluation
+
+ This folder contains scripts for running batch experiments and evaluations on your RAG pipeline using LangSmith.
+
+ ## Contents
+ - `evaluate_on_dataset.py`: Runs your RAG pipeline on all questions in the LangSmith dataset and logs predictions.
+ - `evaluate_predictions.py`: Runs automated evaluation (Correctness, Helpfulness, Dopeness) on predictions using LangSmith evaluators.
+
+ ## Prerequisites
+ - Python 3.10+
+ - All project dependencies installed (see project root requirements)
+ - API keys set as environment variables:
+   - `OPENAI_API_KEY`
+   - `LANGCHAIN_API_KEY`
+ - (Optional) **Vectorstore location:**
+   - `VECTORSTORE_PATH` (default: `/tmp/vectorstore`)
+ - **LangSmith Tracing:**
+   - `LANGCHAIN_TRACING_V2` (must be set to `true` to enable tracing in LangSmith)
+
+ ## Usage
+
+ 1. **Run the RAG pipeline and log predictions:**
+    ```sh
+    export OPENAI_API_KEY=sk-...
+    export LANGCHAIN_API_KEY=ls-...
+    export LANGCHAIN_TRACING_V2=true
+    export VECTORSTORE_PATH=/tmp/vectorstore  # or your preferred path
+    python evaluate_on_dataset.py
+    ```
+    This processes all questions in the LangSmith dataset and logs your app's predictions.
+
+ 2. **Run evaluation on the predictions:**
+    ```sh
+    python evaluate_predictions.py
+    ```
+    This scores your predictions for correctness, helpfulness, and dopeness, and logs the results to LangSmith.
+
+ 3. **View results:**
+    - Go to your [LangSmith dashboard](https://smith.langchain.com/) and open the relevant project/dataset to see experiment results and metrics.
+
+ ## Notes
+ - Make sure your dataset name matches between the scripts and LangSmith (see the quick check below).
+ - You can rerun these scripts as you update your pipeline or data.
+ - The vectorstore is stored in `/tmp/vectorstore` by default, which is suitable for cloud environments like Hugging Face Spaces. Set `VECTORSTORE_PATH` if you want to use a different location.
+ - **Tracing:** Setting `LANGCHAIN_TRACING_V2=true` is required for detailed trace logging in LangSmith. Without it, traces will not appear in your LangSmith dashboard.
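
A quick way to confirm that the dataset name used by both scripts actually exists in LangSmith, as a minimal sketch (it relies only on the `langsmith.Client.read_dataset` call that `evaluate_on_dataset.py` itself uses):

```python
# Sanity check: confirm the LangSmith dataset referenced by the scripts exists.
from langsmith import Client

DATASET_NAME = "State of AI Across the Years!"  # must match DATASET_NAME in both scripts

client = Client()
dataset = client.read_dataset(dataset_name=DATASET_NAME)  # raises if no dataset with this name exists
print(f"Found dataset '{dataset.name}' (id: {dataset.id})")
```
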
experiments/evaluate_on_dataset.py ADDED
@@ -0,0 +1,78 @@
+ import os
+ from dotenv import load_dotenv
+ from langsmith import Client
+ from graph.types import SDGState
+ from graph.build_graph import build_sdg_graph
+ from preprocess.embed_documents import create_or_load_vectorstore
+ from preprocess.html_to_documents import extract_documents_from_html
+ from langchain_openai import ChatOpenAI
+ from pathlib import Path
+ import pickle
+
+
+ load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))
+
+ # --- CONFIG ---
+ DATASET_NAME = "State of AI Across the Years!"
+ PROJECT_NAME = "State of AI Across the Years!"
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+ LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY")
+
+ # --- SETUP ENV ---
+ os.environ["LANGCHAIN_PROJECT"] = PROJECT_NAME
+ if LANGCHAIN_API_KEY:
+     os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
+ if OPENAI_API_KEY:
+     os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
+
+ # --- LOAD DOCUMENTS & VECTORSTORE ---
+ def load_docs():
+     output_file = Path("generated/documents.pkl")
+     if output_file.exists():
+         with open(output_file, "rb") as f:
+             return pickle.load(f)
+     # Fallback: extract from HTML
+     docs = []
+     data_dir = Path("data")
+     for html_file in data_dir.glob("*.html"):
+         docs.extend(extract_documents_from_html(str(html_file), label=html_file.stem))
+     output_file.parent.mkdir(parents=True, exist_ok=True)
+     with open(output_file, "wb") as f:
+         pickle.dump(docs, f)
+     return docs
+
+ def main():
+     # Load the dataset from LangSmith
+     client = Client()
+     dataset = client.read_dataset(dataset_name=DATASET_NAME)
+     examples = client.list_examples(dataset_id=dataset.id)
+
+     # Load docs/vectorstore
+     docs = load_docs()
+     vectorstore_path = os.environ.get("VECTORSTORE_PATH", "/tmp/vectorstore")
+     vectorstore = create_or_load_vectorstore(docs, path=vectorstore_path)
+     llm = ChatOpenAI()
+     graph = build_sdg_graph(docs, vectorstore, llm)
+
+     # For each example, run the graph and log the prediction
+     for example in examples:
+         question = example.inputs["question"]
+         reference = example.outputs["answer"]
+         # Prepare the initial state and run the graph
+         state = SDGState(input=question)
+         result = graph.invoke(state)
+         if not isinstance(result, SDGState):
+             result = SDGState(**dict(result))
+         # Log the prediction to LangSmith, linked back to the dataset example
+         client.create_run(
+             name="SDG App Run",
+             run_type="chain",
+             inputs={"question": question},
+             outputs={"output": result.answer},
+             reference_example_id=example.id,
+             project_name=PROJECT_NAME,
+         )
+         print(f"Processed: {question}\n → {result.answer}\n (reference: {reference})\n")
+
+ if __name__ == "__main__":
+     main()
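
To spot-check what `evaluate_on_dataset.py` logged, a minimal sketch that lists the runs recorded under the project; it assumes only `langsmith.Client.list_runs` and the same `PROJECT_NAME` as the script above:

```python
# List the predictions that evaluate_on_dataset.py logged to the LangSmith project.
from langsmith import Client

PROJECT_NAME = "State of AI Across the Years!"  # same project name as in the script above

client = Client()
for run in client.list_runs(project_name=PROJECT_NAME):
    question = run.inputs.get("question")
    answer = (run.outputs or {}).get("output")
    print(f"{run.name}: {question!r} -> {answer!r}")
```
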
experiments/evaluate_predictions.py ADDED
@@ -0,0 +1,67 @@
+ import os
+ from dotenv import load_dotenv
+ from langsmith.evaluation import LangChainStringEvaluator, evaluate_existing
+ from langchain_openai import ChatOpenAI
+
+
+ load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))
+
+ # --- CONFIG ---
+ DATASET_NAME = "State of AI Across the Years!"  # must match the dataset used in evaluate_on_dataset.py
+ PROJECT_NAME = "State of AI Across the Years!"
+ EVAL_LLM_MODEL = "gpt-4.1"  # Match the notebook's model if possible
+
+ # --- SETUP ENV ---
+ # The API keys are read from the environment (or the .env file loaded above);
+ # warn early if either one is missing.
+ for key in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY"):
+     if key not in os.environ:
+         print(f"Warning: {key} is not set; the evaluation may fail.")
+
+ # --- EVALUATORS ---
+ eval_llm = ChatOpenAI(model=EVAL_LLM_MODEL)
+
+ qa_evaluator = LangChainStringEvaluator("qa", config={"llm": eval_llm})
+
+ labeled_helpfulness_evaluator = LangChainStringEvaluator(
+     "labeled_criteria",
+     config={
+         "criteria": {
+             "helpfulness": (
+                 "Is this submission helpful to the user,"
+                 " taking into account the correct reference answer?"
+             )
+         },
+         "llm": eval_llm
+     },
+     prepare_data=lambda run, example: {
+         "prediction": run.outputs["output"],
+         "reference": example.outputs["answer"],
+         "input": example.inputs["question"],
+     }
+ )
+
+ dope_or_nope_evaluator = LangChainStringEvaluator(
+     "criteria",
+     config={
+         "criteria": {
+             "dopeness": "Is this submission dope, lit, or cool?",
+         },
+         "llm": eval_llm
+     }
+ )
+
+ # --- RUN EVALUATION ---
+ if __name__ == "__main__":
+     print("Running evaluation on predictions in LangSmith...")
+     # Score the runs that evaluate_on_dataset.py already logged under PROJECT_NAME;
+     # evaluate_existing scores an existing experiment/project rather than re-running the app.
+     results = evaluate_existing(
+         PROJECT_NAME,
+         evaluators=[
+             qa_evaluator,
+             labeled_helpfulness_evaluator,
+             dope_or_nope_evaluator
+         ],
+         metadata={"source": "app_evaluation"},
+     )
+     print("Evaluation complete! View results in your LangSmith dashboard.")
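
As an alternative to scoring pre-logged runs, the same evaluators can be applied while re-running the pipeline over the dataset with `langsmith.evaluation.evaluate`, which is closer to how a RAGAS run on the same dataset would be set up for comparison. The sketch below is only an outline under stated assumptions: it imports `load_docs` from `evaluate_on_dataset.py` and the three evaluators from `evaluate_predictions.py` (both assumed importable as modules), and the `experiment_prefix` value is just an illustrative label.

```python
# Sketch: evaluate by re-running the LangGraph over the dataset instead of scoring pre-logged runs.
# Assumes load_docs() and the three evaluators from the two scripts above are importable.
import os
from langsmith.evaluation import evaluate
from langchain_openai import ChatOpenAI
from graph.types import SDGState
from graph.build_graph import build_sdg_graph
from preprocess.embed_documents import create_or_load_vectorstore
from evaluate_on_dataset import load_docs
from evaluate_predictions import qa_evaluator, labeled_helpfulness_evaluator, dope_or_nope_evaluator

# Build the graph the same way evaluate_on_dataset.py does.
docs = load_docs()
vectorstore = create_or_load_vectorstore(docs, path=os.environ.get("VECTORSTORE_PATH", "/tmp/vectorstore"))
graph = build_sdg_graph(docs, vectorstore, ChatOpenAI())

def run_app(inputs: dict) -> dict:
    # Invoke the graph on a single dataset question and return its answer.
    result = graph.invoke(SDGState(input=inputs["question"]))
    if not isinstance(result, SDGState):
        result = SDGState(**dict(result))
    return {"output": result.answer}

evaluate(
    run_app,
    data="State of AI Across the Years!",  # the same LangSmith dataset used by both scripts
    evaluators=[qa_evaluator, labeled_helpfulness_evaluator, dope_or_nope_evaluator],
    experiment_prefix="sdg-app",           # illustrative experiment name
    metadata={"source": "app_evaluation"},
)
```
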