pratikbhavsar commited on
Commit
ef63014
·
1 Parent(s): 073522f

added gpt 4.5 and flash lite

Browse files
Files changed (33) hide show
  1. data_loader.py +5 -5
  2. get_exp_data.ipynb +2 -4
  3. output/gemini-2.0-flash-lite-001/BFCL_v3_irrelevance.parquet +0 -0
  4. output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
  5. output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
  6. output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_composite.parquet +0 -0
  7. output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_long_context.parquet +0 -0
  8. output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_miss_func.parquet +0 -0
  9. output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_miss_param.parquet +0 -0
  10. output/gemini-2.0-flash-lite-001/tau_long_context.parquet +0 -0
  11. output/gemini-2.0-flash-lite-001/toolace_single_func_call_1.parquet +0 -0
  12. output/gemini-2.0-flash-lite-001/toolace_single_func_call_2.parquet +0 -0
  13. output/gemini-2.0-flash-lite-001/xlam_multiple_tool_multiple_call.parquet +0 -0
  14. output/gemini-2.0-flash-lite-001/xlam_multiple_tool_single_call.parquet +0 -0
  15. output/gemini-2.0-flash-lite-001/xlam_single_tool_multiple_call.parquet +0 -0
  16. output/gemini-2.0-flash-lite-001/xlam_single_tool_single_call.parquet +0 -0
  17. output/gemini-2.0-flash-lite-001/xlam_tool_miss.parquet +0 -0
  18. output/gpt-4.5-preview-2025-02-27/BFCL_v3_irrelevance.parquet +0 -0
  19. output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
  20. output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
  21. output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_composite.parquet +0 -0
  22. output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_long_context.parquet +0 -0
  23. output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_miss_func.parquet +0 -0
  24. output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_miss_param.parquet +0 -0
  25. output/gpt-4.5-preview-2025-02-27/tau_long_context.parquet +0 -0
  26. output/gpt-4.5-preview-2025-02-27/toolace_single_func_call_1.parquet +0 -0
  27. output/gpt-4.5-preview-2025-02-27/toolace_single_func_call_2.parquet +0 -0
  28. output/gpt-4.5-preview-2025-02-27/xlam_multiple_tool_multiple_call.parquet +0 -0
  29. output/gpt-4.5-preview-2025-02-27/xlam_multiple_tool_single_call.parquet +0 -0
  30. output/gpt-4.5-preview-2025-02-27/xlam_single_tool_multiple_call.parquet +0 -0
  31. output/gpt-4.5-preview-2025-02-27/xlam_single_tool_single_call.parquet +0 -0
  32. output/gpt-4.5-preview-2025-02-27/xlam_tool_miss.parquet +0 -0
  33. results.csv +3 -1
data_loader.py CHANGED
@@ -604,9 +604,9 @@ HEADER_CONTENT = (
604
 
605
  CARDS = """ <div class="metrics-grid">
606
  <div class="metric-card">
607
- <div class="metric-number metric-blue">18</div>
608
  <div class="metric-label">Total Models</div>
609
- <div class="metric-detail primary">13 Private</div>
610
  <div class="metric-detail primary">5 Open Source</div>
611
  </div>
612
 
@@ -1003,11 +1003,11 @@ METHODOLOGY = """
1003
  <tbody>
1004
  <tr>
1005
  <td>Performance Champion</td>
1006
- <td>Claude 3.7 Sonnet comes at the top with 0.953 but Gemini-2.0-flash dominates with 0.938 score at a very affordable cost, excelling in both complex tasks and safety features.</td>
1007
  </tr>
1008
  <tr>
1009
  <td>Price-Performance Paradox</td>
1010
- <td>Top 3 models span 10x price difference yet only 4% performance gap, challenging pricing assumptions</td>
1011
  </tr>
1012
  <tr>
1013
  <td>Open Vs Closed Source</td>
@@ -1278,7 +1278,7 @@ evaluate_handler.finish()
1278
  </div>
1279
  <h3 class="feature-title">Updated Periodically</h3>
1280
  <ul class="feature-list">
1281
- <li>12 private models evaluated</li>
1282
  <li>5 open source models included</li>
1283
  <li>Monthly model additions</li>
1284
  </ul>
 
604
 
605
  CARDS = """ <div class="metrics-grid">
606
  <div class="metric-card">
607
+ <div class="metric-number metric-blue">20</div>
608
  <div class="metric-label">Total Models</div>
609
+ <div class="metric-detail primary">15 Private</div>
610
  <div class="metric-detail primary">5 Open Source</div>
611
  </div>
612
 
 
1003
  <tbody>
1004
  <tr>
1005
  <td>Performance Champion</td>
1006
+ <td>Claude 3.7 Sonnet (0.953) comes at the top, but Gemini-2.0-flash (0.938) & Gemini-2.0-flash-lite (0.933) dominate at a very affordable cost, excelling in both complex tasks and safety features.</td>
1007
  </tr>
1008
  <tr>
1009
  <td>Price-Performance Paradox</td>
1010
+ <td>Top 3 models span a ~40x price difference yet only a 2% performance gap, challenging pricing assumptions</td>
1011
  </tr>
1012
  <tr>
1013
  <td>Open Vs Closed Source</td>
 
1278
  </div>
1279
  <h3 class="feature-title">Updated Periodically</h3>
1280
  <ul class="feature-list">
1281
+ <li>15 private models evaluated</li>
1282
  <li>5 open source models included</li>
1283
  <li>Monthly model additions</li>
1284
  </ul>
get_exp_data.ipynb CHANGED
@@ -22,7 +22,7 @@
22
  },
23
  {
24
  "cell_type": "code",
25
- "execution_count": 2,
26
  "metadata": {},
27
  "outputs": [],
28
  "source": [
@@ -38,8 +38,6 @@
38
  " rows = pq.get_rows(\n",
39
  " project_id=PROJECT_ID,\n",
40
  " run_id=run_id,\n",
41
- " task_type=None,\n",
42
- " config=None,\n",
43
  " starting_token=0,\n",
44
  " limit=1000,\n",
45
  " )\n",
@@ -127,7 +125,7 @@
127
  " ))\n",
128
  "\n",
129
  "\n",
130
- "models = [\"accounts/fireworks/models/qwen2p5-72b-instruct\", \"meta-llama/Llama-3.3-70B-Instruct-Turbo\", \"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo\"]\n",
131
  "# models = load_data()[\"Model\"]\n",
132
  "\n",
133
  "# Process each model sequentially, but datasets in parallel\n",
 
22
  },
23
  {
24
  "cell_type": "code",
25
+ "execution_count": 5,
26
  "metadata": {},
27
  "outputs": [],
28
  "source": [
 
38
  " rows = pq.get_rows(\n",
39
  " project_id=PROJECT_ID,\n",
40
  " run_id=run_id,\n",
 
 
41
  " starting_token=0,\n",
42
  " limit=1000,\n",
43
  " )\n",
 
125
  " ))\n",
126
  "\n",
127
  "\n",
128
+ "models = [\"gpt-4.5-preview-2025-02-27\", \"gemini-2.0-flash-lite-001\"]\n",
129
  "# models = load_data()[\"Model\"]\n",
130
  "\n",
131
  "# Process each model sequentially, but datasets in parallel\n",
output/gemini-2.0-flash-lite-001/BFCL_v3_irrelevance.parquet ADDED
Binary file (31.7 kB). View file
 
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED
Binary file (23.1 kB). View file
 
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED
Binary file (22.1 kB). View file
 
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_composite.parquet ADDED
Binary file (40.6 kB). View file
 
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_long_context.parquet ADDED
Binary file (38.5 kB). View file
 
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_miss_func.parquet ADDED
Binary file (40 kB). View file
 
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_miss_param.parquet ADDED
Binary file (42.2 kB). View file
 
output/gemini-2.0-flash-lite-001/tau_long_context.parquet ADDED
Binary file (42.8 kB). View file
 
output/gemini-2.0-flash-lite-001/toolace_single_func_call_1.parquet ADDED
Binary file (15.6 kB). View file
 
output/gemini-2.0-flash-lite-001/toolace_single_func_call_2.parquet ADDED
Binary file (12.3 kB). View file
 
output/gemini-2.0-flash-lite-001/xlam_multiple_tool_multiple_call.parquet ADDED
Binary file (107 kB). View file
 
output/gemini-2.0-flash-lite-001/xlam_multiple_tool_single_call.parquet ADDED
Binary file (43.6 kB). View file
 
output/gemini-2.0-flash-lite-001/xlam_single_tool_multiple_call.parquet ADDED
Binary file (32.8 kB). View file
 
output/gemini-2.0-flash-lite-001/xlam_single_tool_single_call.parquet ADDED
Binary file (47.7 kB). View file
 
output/gemini-2.0-flash-lite-001/xlam_tool_miss.parquet ADDED
Binary file (48.8 kB). View file
 
output/gpt-4.5-preview-2025-02-27/BFCL_v3_irrelevance.parquet ADDED
Binary file (49 kB). View file
 
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED
Binary file (23.8 kB). View file
 
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED
Binary file (22.7 kB). View file
 
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_composite.parquet ADDED
Binary file (44.3 kB). View file
 
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_long_context.parquet ADDED
Binary file (37.9 kB). View file
 
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_miss_func.parquet ADDED
Binary file (39.5 kB). View file
 
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_miss_param.parquet ADDED
Binary file (41.7 kB). View file
 
output/gpt-4.5-preview-2025-02-27/tau_long_context.parquet ADDED
Binary file (43.5 kB). View file
 
output/gpt-4.5-preview-2025-02-27/toolace_single_func_call_1.parquet ADDED
Binary file (16.1 kB). View file
 
output/gpt-4.5-preview-2025-02-27/toolace_single_func_call_2.parquet ADDED
Binary file (11.7 kB). View file
 
output/gpt-4.5-preview-2025-02-27/xlam_multiple_tool_multiple_call.parquet ADDED
Binary file (101 kB). View file
 
output/gpt-4.5-preview-2025-02-27/xlam_multiple_tool_single_call.parquet ADDED
Binary file (41.4 kB). View file
 
output/gpt-4.5-preview-2025-02-27/xlam_single_tool_multiple_call.parquet ADDED
Binary file (30.4 kB). View file
 
output/gpt-4.5-preview-2025-02-27/xlam_single_tool_single_call.parquet ADDED
Binary file (43.9 kB). View file
 
output/gpt-4.5-preview-2025-02-27/xlam_tool_miss.parquet ADDED
Binary file (54 kB). View file
 
results.csv CHANGED
@@ -1,7 +1,9 @@
1
  Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
  claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92,0.96,1,0.95,0.97,1,0.96,0.94,0.97,0.96,0.99,0.82,0.92,0.975
3
  gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
 
4
  gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
 
5
  gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
6
  gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
7
  o1-2024-12-17,Private,Reasoning,OpenAI,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
@@ -17,4 +19,4 @@ mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,
17
  ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
18
  Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
19
  open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
20
- Dataset Avg,,,,,,,0.83,0.80,0.81,0.79,0.78,0.89,0.81,0.96,0.62,0.81,0.82,0.82,0.92,0.85,0.74,0.81
 
1
  Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
  claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92,0.96,1,0.95,0.97,1,0.96,0.94,0.97,0.96,0.99,0.82,0.92,0.975
3
  gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
4
+ gemini-2.0-flash-lite-001,Private,Normal,Google,0.075,0.3,0.933,0.96,0.91,0.81,0.98,0.98,0.9,0.91,0.92,0.98,0.86,0.99,0.87,0.97,0.96,0.95,0.975
5
  gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
6
+ gpt-4.5-preview-2025-02-27,Private,Normal,OpenAI,75,150,0.900,0.93,0.87,0.85,0.91,0.92,0.97,0.92,0.99,0.67,0.85,0.98,0.85,1,0.98,0.8,0.915
7
  gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
8
  gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
9
  o1-2024-12-17,Private,Reasoning,OpenAI,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
 
19
  ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
20
  Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
21
  open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
22
+ Dataset Avg,,,,,,,0.84,0.81,0.82,0.81,0.79,0.89,0.82,0.96,0.64,0.82,0.84,0.83,0.93,0.86,0.76,0.82