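"""Run the golden-dataset test suite against the workflow.

Loads (or regenerates) the golden dataset, executes each test case through the
workflow, validates the outputs, and writes a timestamped JSON results report.
"""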
import asyncio
import json
import os
from datetime import datetime

from dotenv import load_dotenv

from test_workflow import run_workflow
from workflow import create_workflow
from generate_test_dataset import GOLDEN_DATASET_DIR, validate_test_case

# Load environment variables
load_dotenv()
async def run_golden_tests():
    """Run tests using the golden dataset."""
    # Load the golden dataset
    dataset_path = os.path.join(GOLDEN_DATASET_DIR, "golden_dataset.json")
    if not os.path.exists(dataset_path):
        print("Golden dataset not found. Generating new dataset...")
        from generate_test_dataset import generate_golden_dataset
        generate_golden_dataset()

    with open(dataset_path, "r") as f:
        golden_dataset = json.load(f)

    # Initialize workflow
    workflow = create_workflow(os.getenv("TAVILY_API_KEY"))

    # Store test results
    test_results = {
        "metadata": {
            "timestamp": datetime.now().isoformat(),
            "dataset_version": golden_dataset["metadata"]["version"]
        },
        "results": []
    }
    # Run tests for each test case
    for test_case in golden_dataset["test_cases"]:
        print(f"\nRunning test case: {test_case['input']['query']}")
        try:
            # Run the workflow
            result = await run_workflow(
                workflow,
                test_case["input"]["query"],
                agent_type=test_case["input"]["agent_type"],
                context=test_case["input"]["context"]
            )

            # Validate the results
            validation_result = validate_test_case(test_case, result)
            success = all(v["passed"] for v in validation_result["validations"])

            # Add results
            test_results["results"].append({
                "test_case_id": test_case["id"],
                "query": test_case["input"]["query"],
                "success": success,
                "validation_results": validation_result,
                "workflow_output": result
            })

            # Print progress
            status = "✅ Passed" if success else "❌ Failed"
            print(f"{status} - {test_case['input']['query']}")
        except Exception as e:
            print(f"❌ Error running test case: {str(e)}")
            test_results["results"].append({
                "test_case_id": test_case["id"],
                "query": test_case["input"]["query"],
                "success": False,
                "error": str(e)
            })
    # Save test results
    results_dir = os.path.join(GOLDEN_DATASET_DIR, "results")
    os.makedirs(results_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(results_dir, f"test_results_{timestamp}.json")
    with open(output_file, "w") as f:
        json.dump(test_results, f, indent=2)

    # Print summary (guard against division by zero when the dataset is empty)
    total_tests = len(test_results["results"])
    passed_tests = sum(1 for r in test_results["results"] if r.get("success", False))
    success_rate = (passed_tests / total_tests) * 100 if total_tests else 0.0

    print("\n" + "=" * 50)
    print("Test Summary:")
    print(f"Total Tests: {total_tests}")
    print(f"Passed: {passed_tests}")
    print(f"Failed: {total_tests - passed_tests}")
    print(f"Success Rate: {success_rate:.2f}%")
    print("=" * 50)
    print(f"\nDetailed results saved to: {output_file}")
if __name__ == "__main__":
    print("\n" + "=" * 50)
    print("🧪 Running Golden Dataset Tests")
    print("=" * 50)
    try:
        asyncio.run(run_golden_tests())
    except Exception as e:
        print(f"\n❌ Critical error: {str(e)}")
        raise