updates
- app.py +12 -5
- tabs/run_benchmark.py +4 -4
app.py
CHANGED
@@ -17,14 +17,14 @@ from tabs.run_benchmark import run_benchmark_main
 demo = gr.Blocks()


-def run_benchmark_gradio(tool_name, model_name, openai_api_key, anthropic_api_key):
+def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key):
     """Run the benchmark using inputs."""
     if tool_name is None:
         return "Please enter the name of your tool."
     if openai_api_key is None and anthropic_api_key is None:
         return "Please enter either OpenAI or Anthropic API key."

-    result = run_benchmark_main(tool_name, model_name, openai_api_key, anthropic_api_key)
+    result = run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key)
     if result == 'completed':
         # get the results file in the results directory
         fns = glob('results/*.csv')
@@ -106,8 +106,8 @@ with demo:
         "claude-prediction-offline",
         "claude-prediction-online",
         'prediction-request-rag',
-
-
+        "prediction-with-research-conservative",
+        "prediction-with-research-bold",
         "prediction-request-reasoning-claude",
         "prediction-request-rag-claude",
         "prediction-url-cot-claude",
@@ -122,6 +122,13 @@ with demo:
     with gr.Row():
         openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API key here", type="password")
         anthropic_api_key = gr.Textbox(label="Anthropic API Key", placeholder="Enter your Anthropic API key here", type="password")
+    with gr.Row():
+        num_questions = gr.Slider(
+            minimum=1,
+            maximum=340,
+            value=10,
+            label="Number of questions to run the benchmark on",
+        )
     with gr.Row():
         run_button = gr.Button("Run Benchmark")
     with gr.Row():
@@ -132,7 +139,7 @@ with demo:
     summary = gr.Dataframe()

     run_button.click(run_benchmark_gradio,
-                     inputs=[tool_name, model_name, openai_api_key, anthropic_api_key],
+                     inputs=[tool_name, model_name, num_questions, openai_api_key, anthropic_api_key],
                      outputs=[result, summary])

 demo.queue(default_concurrency_limit=40).launch()
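For reference, here is a minimal, self-contained sketch of the Gradio wiring this change introduces: a gr.Slider whose value is forwarded to the click handler alongside the existing inputs. The benchmark call is stubbed out, and the plain textboxes for tool and model are illustrative stand-ins for the Space's real components; only the slider parameters and the handler signature mirror the diff.

```python
import gradio as gr

def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key):
    """Stubbed handler: validates inputs, then reports what would run."""
    if not tool_name:
        return "Please enter the name of your tool."
    if not (openai_api_key or anthropic_api_key):
        return "Please enter either OpenAI or Anthropic API key."
    # Stand-in for the real run_benchmark_main(...) call.
    return f"Would benchmark {tool_name} ({model_name}) on {num_questions} questions."

with gr.Blocks() as demo:
    tool_name = gr.Textbox(label="Tool name")
    model_name = gr.Textbox(label="Model name")
    with gr.Row():
        openai_api_key = gr.Textbox(label="OpenAI API Key", type="password")
        anthropic_api_key = gr.Textbox(label="Anthropic API Key", type="password")
    with gr.Row():
        num_questions = gr.Slider(
            minimum=1,
            maximum=340,
            value=10,
            label="Number of questions to run the benchmark on",
        )
    run_button = gr.Button("Run Benchmark")
    result = gr.Textbox(label="Result")
    # Input order must match the handler's positional parameters.
    run_button.click(run_benchmark_gradio,
                     inputs=[tool_name, model_name, num_questions, openai_api_key, anthropic_api_key],
                     outputs=[result])

if __name__ == "__main__":
    demo.launch()
```

Gradio passes component values positionally, which is why the diff inserts num_questions at the same position in both the handler signature and the inputs list.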
tabs/run_benchmark.py
CHANGED
@@ -2,16 +2,16 @@ import os
 from benchmark.run_benchmark import run_benchmark


-def run_benchmark_main(tool_name, model_name, openai_api_key, anthropic_api_key):
+def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key):
     """Run the benchmark using the provided function and API key."""
     # Empty the results directory
     os.system("rm -rf results/*")

-    print(f"Running benchmark with the following parameters: {tool_name}, {model_name}, {openai_api_key}, {anthropic_api_key}")
-
     # Set the benchmark parameters
     kwargs = {}
-
+    if not num_questions:
+        num_questions = 10
+    kwargs["num_questions"] = num_questions
     kwargs["tools"] = [tool_name]
     if model_name:
         kwargs["model"] = model_name
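The kwargs-building logic above can be exercised on its own. Below is a hedged sketch with run_benchmark replaced by a stub (its real signature lives in benchmark/run_benchmark.py and is not shown in this diff); the tool name comes from the Space's list, while the API-key value is a placeholder.

```python
def run_benchmark(**kwargs):
    # Stub standing in for benchmark.run_benchmark.run_benchmark.
    print(f"run_benchmark called with: {kwargs}")

def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key):
    kwargs = {}
    # A falsy value from the UI (None or 0) falls back to the slider default of 10.
    if not num_questions:
        num_questions = 10
    kwargs["num_questions"] = num_questions
    kwargs["tools"] = [tool_name]
    if model_name:  # only forwarded when a model was actually selected
        kwargs["model"] = model_name
    run_benchmark(**kwargs)

# No num_questions supplied, so the default of 10 kicks in:
run_benchmark_main("prediction-request-rag", None, None, "sk-placeholder", None)
# -> run_benchmark called with: {'num_questions': 10, 'tools': ['prediction-request-rag']}
```

Guarding with `if not num_questions` rather than `is None` also catches 0, which would otherwise request an empty benchmark run.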