mwalker22 committed · Commit 6f094a2 · 1 Parent(s): 1143fce

Add an experiment tag so evaluation can be restricted to a specific batch of runs. evaluate_on_dataset.py auto-generates the tag.
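
The resulting workflow, as implied by the diffs below: running python experiments/evaluate_on_dataset.py prints a fresh tag (e.g. exp_ab12cd34, a hypothetical value), and passing it back via python experiments/evaluate_predictions.py --experiment_tag exp_ab12cd34 scopes the evaluation to just that batch; omitting the flag evaluates everything as before.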

experiments/evaluate_on_dataset.py CHANGED
@@ -8,6 +8,7 @@ from preprocess.html_to_documents import extract_documents_from_html
 from langchain_openai import ChatOpenAI
 from pathlib import Path
 import pickle
+import uuid
 
 
 load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))
@@ -15,6 +16,8 @@ load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))
 # --- CONFIG ---
 DATASET_NAME = "State of AI Across the Years!"
 PROJECT_NAME = "State of AI Across the Years!"
+EXPERIMENT_TAG = f"exp_{uuid.uuid4().hex[:8]}"
+print(f"Experiment tag for this batch: {EXPERIMENT_TAG}")
 
 # --- LOAD DOCUMENTS & VECTORSTORE ---
 def load_docs():
@@ -62,7 +65,9 @@ def main():
         reference_outputs={"answer": reference},
         example_id=example.id,
         project_name=PROJECT_NAME,
+        metadata={"experiment_tag": EXPERIMENT_TAG},
     )
+    print(f"Logged run with experiment_tag: {EXPERIMENT_TAG}")
     print(f"Processed: {question}\n → {result.answer}\n")
 
 if __name__ == "__main__":
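
For context on the tag format introduced above: uuid4().hex is 32 hex characters, so the truncated tag carries 8 hex digits (about 4.3 billion possible values), which is more than enough to keep a handful of experiment batches distinct. A minimal standalone sketch of the same scheme:

import uuid

# Same scheme as EXPERIMENT_TAG in the diff: "exp_" plus the first
# 8 hex chars of a random UUID, e.g. "exp_3fa85f64".
tag = f"exp_{uuid.uuid4().hex[:8]}"
print(tag)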
experiments/evaluate_predictions.py CHANGED
@@ -2,6 +2,8 @@ import os
 from dotenv import load_dotenv
 from langsmith.evaluation import LangChainStringEvaluator, evaluate
 from langchain_openai import ChatOpenAI
+import argparse
+from langsmith import Client
 
 
 load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))
@@ -11,6 +13,20 @@ DATASET_NAME = "State of AI Across the Years!"
 PROJECT_NAME = "State of AI Across the Years!"
 EVAL_LLM_MODEL = "gpt-4.1" # Match the notebook's model if possible
 
+parser = argparse.ArgumentParser()
+parser.add_argument("--experiment_tag", type=str, help="Only evaluate runs with this experiment_tag")
+args = parser.parse_args()
+
+if args.experiment_tag:
+    print(f"Evaluating only runs with experiment_tag: {args.experiment_tag}")
+
+client = Client()
+runs = list(client.list_runs(
+    project_name=PROJECT_NAME,
+    dataset_name=DATASET_NAME,
+    filters={"metadata.experiment_tag": args.experiment_tag} if args.experiment_tag else None,
+))
+
 # --- EVALUATORS ---
 eval_llm = ChatOpenAI(model=EVAL_LLM_MODEL)
 
@@ -48,7 +64,7 @@ dope_or_nope_evaluator = LangChainStringEvaluator(
 if __name__ == "__main__":
     print("Running evaluation on predictions in LangSmith...")
     results = evaluate(
-        None, # No need to pass a chain, just evaluate existing runs
+        runs,
         data=DATASET_NAME,
         evaluators=[
             qa_evaluator,
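
Should the server-side filters= argument used above not behave as expected in a given langsmith SDK version, a client-side fallback is possible. The sketch below assumes only that Client.list_runs accepts project_name and that run metadata is stored under run.extra["metadata"] on the returned run objects; the tag value is a hypothetical example of what evaluate_on_dataset.py prints:

from langsmith import Client

client = Client()
tag = "exp_ab12cd34"  # hypothetical tag printed by evaluate_on_dataset.py

# Pull all runs for the project, then keep only the ones whose
# metadata carries the matching experiment_tag.
tagged_runs = [
    run
    for run in client.list_runs(project_name="State of AI Across the Years!")
    if (run.extra or {}).get("metadata", {}).get("experiment_tag") == tag
]
print(f"Found {len(tagged_runs)} runs tagged {tag}")

Filtering in Python trades extra network transfer for independence from the server's filter syntax, which is a reasonable trade-off at the scale of a single evaluation dataset.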