Oleksandr Shchur committed · 079b094
Parent(s): a6d6654

Update leaderboard

Files changed:
- app.py (+36 -24)
- requirements.txt (+1 -1)
app.py CHANGED

@@ -35,48 +35,60 @@ summary_urls = [
     "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
 ]

-selected_cols = ["gmean_relative_error", "avg_rank", "median_inference_time_s"]
 rename_cols = {
     "gmean_relative_error": "Average relative error",
     "avg_rank": "Average rank",
     "median_inference_time_s": "Median inference time (s)",
+    "training_corpus_overlap": "Training corpus overlap (%)",
 }
+selected_cols = list(rename_cols.keys())

-
-
-    .
-
-
-
-
-
-
-    .
-
-    .reset_index()
-    .astype(str)
-)
+
+
+def is_zero_shot(model_name):
+    return model_name.startswith("chronos") or model_name in {"timesfm"}
+
+
+leaderboards = {}
+for metric in ["WQL", "MASE"]:
+    lb = fev.leaderboard(summary_urls, metric_column=metric)[selected_cols].rename(columns=rename_cols)
+    format_dict = {}
+    for col in lb.columns:
+        format_dict[col] = "{:.3f}" if col != "Training corpus overlap (%)" else "{:.1%}"
+    leaderboards[metric] = lb.reset_index().style.format(format_dict)


 with gr.Blocks() as demo:
-    with gr.Tab("
+    with gr.Tab("Chronos Benchmark II"):
         gr.Markdown("""
-        ## Chronos
+        ## Chronos Benchmark II results

-        This tab contains results for various forecasting models on the 28 datasets used in Benchmark II
+        This tab contains results for various forecasting models on the 28 datasets used in Benchmark II in the publication [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).
+
+        These datasets were used for zero-shot evaluation of Chronos models (i.e., Chronos models were not trained on these datasets), but some other models did include certain datasets in their training corpus.
+
+        Each table contains the following information:
+
+        * **Average relative error**: Geometric mean of the relative errors for each task. The relative error for each task is computed as `model_error / baseline_error`.
+        * **Average rank**: Arithmetic mean of the ranks achieved by each model on each task.
+        * **Median inference time (s)**: Median of the times required to make predictions for the entire dataset (in seconds).
+        * **Training corpus overlap (%)**: Percentage of the datasets used in the benchmark that were included in the model's training corpus.
+
+        Lower values are better for all of the above metrics.
+
+        Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information for the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).

-        Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot).
-        """)
-        gr.Markdown("""### Point forecast accuracy (measured by MASE)
         """)
+        gr.Markdown("### Probabilistic forecast accuracy\nMeasured by Weighted Quantile Loss (WQL).")
         gr.Dataframe(
-            value=
+            value=leaderboards["WQL"],
+            datatype=["str", "number", "number", "number"],
             interactive=False,
         )

-        gr.Markdown("###
+        gr.Markdown("""### Point forecast accuracy\nMeasured by Mean Absolute Scaled Error (MASE).
+        """)
         gr.Dataframe(
-            value=
+            value=leaderboards["MASE"],
             interactive=False,
         )
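The leaderboard code added here can also be run outside the Space. The sketch below reuses the `fev.leaderboard(summary_urls, metric_column=...)` call and column names from this commit; it assumes `fev==0.2.0` from requirements.txt, and only the last summary CSV URL is visible in the hunk above, so the list is abbreviated here.

```python
import fev

# Only the final URL appears in the diff hunk; app.py defines the full list of result CSVs.
summary_urls = [
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
]

# Same call as in the commit: aggregate the per-task summaries into a leaderboard table.
lb = fev.leaderboard(summary_urls, metric_column="WQL")
print(lb[["gmean_relative_error", "avg_rank", "median_inference_time_s"]])
```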
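The tab description defines the two aggregate accuracy columns in words; a minimal standalone sketch of those definitions (geometric mean of per-task `model_error / baseline_error`, and mean per-task rank) follows. The model names and error values are made up for illustration, and the actual computation lives inside the `fev` package.

```python
import numpy as np
import pandas as pd

# Hypothetical per-task errors (e.g., WQL or MASE); real values come from the summary CSVs.
errors = pd.DataFrame(
    {
        "chronos_base": [0.80, 1.10, 0.95],
        "baseline": [1.00, 1.00, 1.00],
        "other_model": [1.20, 0.70, 1.30],
    },
    index=["task_1", "task_2", "task_3"],
)

# Average relative error: geometric mean over tasks of model_error / baseline_error.
relative_error = errors.div(errors["baseline"], axis=0)
gmean_relative_error = np.exp(np.log(relative_error).mean())

# Average rank: rank the models within each task (1 = lowest error), then average over tasks.
avg_rank = errors.rank(axis=1).mean()

print(gmean_relative_error)
print(avg_rank)
```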
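The `format_dict` built in the new code renders most columns with three decimals but shows the training corpus overlap as a percentage. A small illustration of that pandas `Styler.format` behaviour on a made-up one-row frame:

```python
import pandas as pd

# Made-up row mimicking the renamed leaderboard columns.
lb = pd.DataFrame(
    {"Average relative error": [0.873], "Training corpus overlap (%)": [0.25]}
)

format_dict = {
    "Average relative error": "{:.3f}",       # 0.873 -> "0.873"
    "Training corpus overlap (%)": "{:.1%}",  # fraction 0.25 -> "25.0%"
}

# Styler.format applies the per-column format strings when the table is rendered (e.g., to HTML).
print(lb.style.format(format_dict).to_html())
```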
requirements.txt CHANGED

@@ -8,4 +8,4 @@ huggingface-hub>=0.18.0
 matplotlib
 numpy
 pandas
-fev==0.
+fev==0.2.0