Oleksandr Shchur committed · 079b094
Parent(s): a6d6654

Update leaderboard

Files changed:
- app.py (+36 -24)
- requirements.txt (+1 -1)
app.py CHANGED

@@ -35,48 +35,60 @@ summary_urls = [
     "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
 ]

-selected_cols = ["gmean_relative_error", "avg_rank", "median_inference_time_s"]
 rename_cols = {
     "gmean_relative_error": "Average relative error",
     "avg_rank": "Average rank",
     "median_inference_time_s": "Median inference time (s)",
+    "training_corpus_overlap": "Training corpus overlap (%)",
 }
+selected_cols = list(rename_cols.keys())

-
-
-    .
-
-
-
-
-
-
-    .
-
-    .reset_index()
-    .astype(str)
-)
+
+
+def is_zero_shot(model_name):
+    return model_name.startswith("chronos") or model_name in {"timesfm"}
+
+
+leaderboards = {}
+for metric in ["WQL", "MASE"]:
+    lb = fev.leaderboard(summary_urls, metric_column=metric)[selected_cols].rename(columns=rename_cols)
+    format_dict = {}
+    for col in lb.columns:
+        format_dict[col] = "{:.3f}" if col != "Training corpus overlap (%)" else "{:.1%}"
+    leaderboards[metric] = lb.reset_index().style.format(format_dict)


 with gr.Blocks() as demo:
-    with gr.Tab("
+    with gr.Tab("Chronos Benchmark II"):
         gr.Markdown("""
-        ## Chronos
+        ## Chronos Benchmark II results

-        This tab contains results for various forecasting models on the 28 datasets used in Benchmark II
+        This tab contains results for various forecasting models on the 28 datasets used in Benchmark II in the publication [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).
+
+        These datasets were used for zero-shot evaluation of Chronos models (i.e., Chronos models were not trained on these datasets), but some other models did include certain datasets in their training corpus.
+
+        Each table contains the following information:
+
+        * **Average relative error**: Geometric mean of the relative errors for each task. The relative error for each task is computed as `model_error / baseline_error`.
+        * **Average rank**: Arithmetic mean of the ranks achieved by each model on each task.
+        * **Median inference time (s)**: Median of the times required to make predictions for the entire dataset (in seconds).
+        * **Training corpus overlap (%)**: Percentage of the datasets used in the benchmark that were included in the model's training corpus.
+
+        Lower values are better for all of the above metrics.
+
+        Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information for the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).

-        Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot).
-        """)
-        gr.Markdown("""### Point forecast accuracy (measured by MASE)
         """)
+        gr.Markdown("### Probabilistic forecast accuracy\nMeasured by Weighted Quantile Loss (WQL).")
         gr.Dataframe(
-            value=
+            value=leaderboards["WQL"],
+            datatype=["str", "number", "number", "number"],
             interactive=False,
         )

-        gr.Markdown("###
+        gr.Markdown("""### Point forecast accuracy\nMeasured by Mean Absolute Scaled Error (MASE).
+        """)
         gr.Dataframe(
-            value=
+            value=leaderboards["MASE"],
             interactive=False,
         )
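The leaderboard code added here can also be run outside the Space. The sketch below reuses the `fev.leaderboard(summary_urls, metric_column=...)` call and column names from this commit; it assumes `fev==0.2.0` from requirements.txt, and only the last summary CSV URL is visible in the hunk above, so the list is abbreviated here.

```python
import fev

# Only the final URL appears in the diff hunk; app.py defines the full list of result CSVs.
summary_urls = [
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
]

# Same call as in the commit: aggregate the per-task summaries into a leaderboard table.
lb = fev.leaderboard(summary_urls, metric_column="WQL")
print(lb[["gmean_relative_error", "avg_rank", "median_inference_time_s"]])
```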
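The tab description defines the two aggregate accuracy columns in words; a minimal standalone sketch of those definitions (geometric mean of per-task `model_error / baseline_error`, and mean per-task rank) follows. The model names and error values are made up for illustration, and the actual computation lives inside the `fev` package.

```python
import numpy as np
import pandas as pd

# Hypothetical per-task errors (e.g., WQL or MASE); real values come from the summary CSVs.
errors = pd.DataFrame(
    {
        "chronos_base": [0.80, 1.10, 0.95],
        "baseline": [1.00, 1.00, 1.00],
        "other_model": [1.20, 0.70, 1.30],
    },
    index=["task_1", "task_2", "task_3"],
)

# Average relative error: geometric mean over tasks of model_error / baseline_error.
relative_error = errors.div(errors["baseline"], axis=0)
gmean_relative_error = np.exp(np.log(relative_error).mean())

# Average rank: rank the models within each task (1 = lowest error), then average over tasks.
avg_rank = errors.rank(axis=1).mean()

print(gmean_relative_error)
print(avg_rank)
```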
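The `format_dict` built in the new code renders most columns with three decimals but shows the training corpus overlap as a percentage. A small illustration of that pandas `Styler.format` behaviour on a made-up one-row frame:

```python
import pandas as pd

# Made-up row mimicking the renamed leaderboard columns.
lb = pd.DataFrame(
    {"Average relative error": [0.873], "Training corpus overlap (%)": [0.25]}
)

format_dict = {
    "Average relative error": "{:.3f}",       # 0.873 -> "0.873"
    "Training corpus overlap (%)": "{:.1%}",  # fraction 0.25 -> "25.0%"
}

# Styler.format applies the per-column format strings when the table is rendered (e.g., to HTML).
print(lb.style.format(format_dict).to_html())
```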
requirements.txt CHANGED

@@ -8,4 +8,4 @@ huggingface-hub>=0.18.0
 matplotlib
 numpy
 pandas
-fev==0.
+fev==0.2.0