Commit 5fc842f
Parent: f03f82b
update

Changed files:
- app.py +1 -1
- assessment-queue/langchain-ai_langchain_eval_request_FINISHED_v0.1.0.json +14 -0
- assessment-queue/microsoft_autogen_eval_request_FINISHED_v0.2.0.json +14 -0
- assessment-queue/pytorch_pytorch_eval_request_FINISHED_v2.1.0.json +14 -0
- src/display/utils.py +2 -2
- src/leaderboard/read_evals.py +2 -2
- src/populate.py +10 -6

app.py CHANGED
@@ -92,7 +92,7 @@ def init_leaderboard(dataframe):
     """Initialize the leaderboard component"""
     if dataframe is None or dataframe.empty:
         # Create an empty dataframe with the expected columns
-        all_columns = COLS + [task.
+        all_columns = COLS + [task.value.col_name for task in Tasks]
         empty_df = pd.DataFrame(columns=all_columns)
         print("Warning: Leaderboard DataFrame is empty. Using empty dataframe.")
         dataframe = empty_df
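
For context, this change assumes a Tasks enum whose members wrap a Task dataclass exposing both an internal benchmark key and a display column name. Those definitions are not part of this commit; the sketch below is a hypothetical reconstruction, consistent only with how task.value.col_name and task.value.benchmark are used in the diffs.

# Hypothetical sketch of the Task/Tasks structure assumed by this commit;
# the real definitions live elsewhere in the repo and are not shown here.
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # internal result key, e.g. "license_validation"
    col_name: str    # display column name, e.g. "License Risk"

class Tasks(Enum):
    # Example members only; the real enum lists the project's risk categories.
    license_validation = Task("license_validation", "License Risk")
    security_assessment = Task("security_assessment", "Security Risk")

# The commit builds the display columns from the enum:
BENCHMARK_COLS = [task.value.col_name for task in Tasks]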

assessment-queue/langchain-ai_langchain_eval_request_FINISHED_v0.1.0.json ADDED
@@ -0,0 +1,14 @@
+{
+  "library": "langchain-ai/langchain",
+  "version": "v0.1.0",
+  "repository_url": "https://github.com/langchain-ai/langchain",
+  "language": "Python",
+  "framework": "Python SDK",
+  "library_type": "llm framework",
+  "license": "MIT",
+  "stars": 74500,
+  "status": "FINISHED",
+  "submitted_time": "2025-04-30T10:00:00Z",
+  "last_updated": "2025-05-01T12:00:00Z",
+  "assessment_id": "abc123"
+}

assessment-queue/microsoft_autogen_eval_request_FINISHED_v0.2.0.json ADDED
@@ -0,0 +1,14 @@
+{
+  "library": "microsoft/autogen",
+  "version": "v0.2.0",
+  "repository_url": "https://github.com/microsoft/autogen",
+  "language": "Python",
+  "framework": "Agent Framework",
+  "library_type": "agent framework",
+  "license": "MIT",
+  "stars": 48700,
+  "status": "FINISHED",
+  "submitted_time": "2025-05-02T08:15:00Z",
+  "last_updated": "2025-05-03T09:15:00Z",
+  "assessment_id": "ghi789"
+}

assessment-queue/pytorch_pytorch_eval_request_FINISHED_v2.1.0.json ADDED
@@ -0,0 +1,14 @@
+{
+  "library": "pytorch/pytorch",
+  "version": "v2.1.0",
+  "repository_url": "https://github.com/pytorch/pytorch",
+  "language": "Python",
+  "framework": "Machine Learning",
+  "library_type": "machine learning",
+  "license": "BSD-3",
+  "stars": 72300,
+  "status": "FINISHED",
+  "submitted_time": "2025-05-01T16:30:00Z",
+  "last_updated": "2025-05-02T14:30:00Z",
+  "assessment_id": "def456"
+}
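
The three queue entries share the same flat schema. As a quick illustration (not code from the repository), they can be loaded and filtered on status with the standard library alone:

# Illustrative only: read the new assessment-queue entries and keep finished ones.
import json
from pathlib import Path

queue_dir = Path("assessment-queue")
finished = []
for path in sorted(queue_dir.glob("*_eval_request_FINISHED_*.json")):
    with path.open() as f:
        entry = json.load(f)
    if entry.get("status") == "FINISHED":
        finished.append(entry)

for entry in finished:
    print(entry["library"], entry["version"], entry["assessment_id"])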

src/display/utils.py CHANGED
@@ -105,6 +105,6 @@ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-# Task columns for benchmarking - use the
-BENCHMARK_COLS = [task.
+# Task columns for benchmarking - use the display column names from the Tasks enum
+BENCHMARK_COLS = [task.value.col_name for task in Tasks]
 

src/leaderboard/read_evals.py CHANGED
@@ -138,11 +138,11 @@ class AssessmentResult:
             AutoEvalColumn.availability.name: self.availability,
         }
 
-        # Add task-specific risk scores - map
+        # Add task-specific risk scores - map to display column names
         for task in Tasks:
             task_enum = task.value  # Task dataclass instance
             benchmark_key = task_enum.benchmark  # e.g., "license_validation"
-            col_name =
+            col_name = task_enum.col_name  # Use the display name, e.g., "License Risk"
             risk_score = self.results.get(benchmark_key, 10)  # Default to highest risk
             data_dict[col_name] = risk_score
 
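
For a sense of what the corrected loop produces, reusing the hypothetical Task/Tasks sketch above: a results dict keyed by benchmark name is re-keyed to display column names, and any benchmark missing from the results falls back to the highest risk score of 10.

# Illustrative only; Tasks here is the hypothetical enum sketched after the app.py diff.
results = {"license_validation": 2.5}   # "security_assessment" absent from the results file

data_dict = {}
for task in Tasks:
    task_enum = task.value
    data_dict[task_enum.col_name] = results.get(task_enum.benchmark, 10)

print(data_dict)   # {'License Risk': 2.5, 'Security Risk': 10}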

src/populate.py CHANGED
@@ -14,7 +14,7 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_co
         eval_results_path: Path to the assessment result files
         eval_requests_path: Path to the assessment request files
         cols: Columns names to include in the dataframe
-        benchmark_cols: Risk categories column names
+        benchmark_cols: Risk categories column names (display names)
 
     Returns:
         Pandas dataframe for the leaderboard
@@ -27,21 +27,25 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_co
             # Create dataframe from assessment results
             all_df = pd.DataFrame.from_records([r.to_dict() for r in assessment_results])
 
-            # Ensure the
-
-
+            # Ensure we have all the needed display columns
+            all_columns = set(all_df.columns)
+            for col in benchmark_cols:
+                if col not in all_columns:
+                    print(f"Warning: Column '{col}' missing, adding empty column")
+                    all_df[col] = 10.0  # Default to highest risk
+
             # Sort by overall risk score (ascending - lower is better)
             if AutoEvalColumn.overall_risk.name in all_df.columns:
                 all_df = all_df.sort_values(by=[AutoEvalColumn.overall_risk.name])
 
             return all_df
 
-        return pd.DataFrame(columns=cols)  # Empty dataframe with columns
+        return pd.DataFrame(columns=cols + benchmark_cols)  # Empty dataframe with all columns
     except Exception as e:
         print(f"Error reading evaluation results: {e}")
         import traceback
         traceback.print_exc()
-        return pd.DataFrame(columns=cols)  # Return empty dataframe
+        return pd.DataFrame(columns=cols + benchmark_cols)  # Return empty dataframe with all columns
 
 
 def get_evaluation_queue_df(eval_requests_path, eval_cols):
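
A minimal standalone sketch of the new get_leaderboard_df behaviour, using hypothetical column names, shows how missing benchmark columns are filled with the worst score of 10.0 before sorting by the overall risk column (lower is better):

# Illustrative only: default missing display columns, then sort by overall risk.
import pandas as pd

benchmark_cols = ["License Risk", "Security Risk"]   # hypothetical display names
all_df = pd.DataFrame([
    {"library": "langchain-ai/langchain", "Overall Risk": 3.1, "License Risk": 2.5},
    {"library": "pytorch/pytorch", "Overall Risk": 1.8, "License Risk": 1.0},
])

for col in benchmark_cols:
    if col not in set(all_df.columns):
        print(f"Warning: Column '{col}' missing, adding empty column")
        all_df[col] = 10.0   # default to highest risk, as in the commit

all_df = all_df.sort_values(by=["Overall Risk"])
print(all_df)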