From d7c7aba99cc2e6383af605aaa4671f4266fad979 Mon Sep 17 00:00:00 2001
From: Kai Fricke <krfricke@users.noreply.github.com>
Date: Wed, 9 Sep 2020 05:00:52 +0100
Subject: [PATCH] [tune] Tune experiment analysis improvements (#10645)

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
---
 README.rst                                    |   2 +-
 doc/source/tune/api_docs/analysis.rst         |   2 +-
 doc/source/tune/key-concepts.rst              |  14 +-
 python/ray/dashboard/dashboard.py             |   2 +-
 .../ray/tune/analysis/experiment_analysis.py  | 161 +++++++++++++++++-
 python/ray/tune/commands.py                   |   3 +-
 .../pbt_dcgan_mnist_trainable.py              |   2 +-
 python/ray/tune/suggest/bayesopt.py           |   6 +-
 python/ray/tune/tests/example.py              |   2 +-
 python/ray/tune/tests/test_api.py             |   9 +-
 .../tune/tests/test_experiment_analysis.py    |  55 ++++--
 .../tests/test_experiment_analysis_mem.py     |  10 +-
 .../tune/tests/test_trial_scheduler_pbt.py    |  15 +-
 python/ray/tune/tests/tutorial.py             |   2 +-
 14 files changed, 247 insertions(+), 38 deletions(-)

diff --git a/README.rst b/README.rst
index 1888eb37a..b4b48da3b 100644
--- a/README.rst
+++ b/README.rst
@@ -120,7 +120,7 @@ This example runs a parallel grid search to optimize an example objective functi
     print("Best config: ", analysis.get_best_config(metric="mean_loss"))
 
     # Get a dataframe for analyzing trial results.
-    df = analysis.dataframe()
+    df = analysis.results_df
 
 If TensorBoard is installed, automatically visualize all trial results:
 
diff --git a/doc/source/tune/api_docs/analysis.rst b/doc/source/tune/api_docs/analysis.rst
index 2ec32f686..c9468fbab 100644
--- a/doc/source/tune/api_docs/analysis.rst
+++ b/doc/source/tune/api_docs/analysis.rst
@@ -18,7 +18,7 @@ Here are some example operations for obtaining a summary of your experiment:
 .. code-block:: python
 
     # Get a dataframe for the last reported results of all of the trials
-    df = analysis.dataframe()
+    df = analysis.results_df
 
     # Get a dataframe for the max accuracy seen for each trial
     df = analysis.dataframe(metric="mean_accuracy", mode="max")
diff --git a/doc/source/tune/key-concepts.rst b/doc/source/tune/key-concepts.rst
index 11247895b..213d680a0 100644
--- a/doc/source/tune/key-concepts.rst
+++ b/doc/source/tune/key-concepts.rst
@@ -219,16 +219,24 @@ Analysis
 
     analysis = tune.run(trainable, search_alg=algo, stop={"training_iteration": 20})
 
-    # Get the best hyperparameters
-    best_hyperparameters = analysis.get_best_config()
+    best_trial = analysis.best_trial  # Get best trial
+    best_config = analysis.best_config  # Get best trial's hyperparameters
+    best_logdir = analysis.best_logdir  # Get best trial's logdir
+    best_checkpoint = analysis.best_checkpoint  # Get best trial's best checkpoint
+    best_result = analysis.best_result  # Get best trial's last results
+    best_result_df = analysis.best_result_df  # Get best result as pandas dataframe
 
 This object can also retrieve all training runs as dataframes, allowing you to do ad-hoc data analysis over your results.
 
 .. code-block:: python
 
-    # Get a dataframe for the max score seen for each trial
+    # Get a dataframe with the last results for each trial
+    df_results = analysis.results_df
+
+    # Get a dataframe for the max score seen for each trial
     df = analysis.dataframe(metric="score", mode="max")
 
+
 What's Next?
 -------------
 
diff --git a/python/ray/dashboard/dashboard.py b/python/ray/dashboard/dashboard.py
index 76a75d053..ee82a5bad 100644
--- a/python/ray/dashboard/dashboard.py
+++ b/python/ray/dashboard/dashboard.py
@@ -806,7 +806,7 @@ class TuneCollector(threading.Thread):
 
         # search through all the sub_directories in log directory
         analysis = Analysis(str(self._logdir))
-        df = analysis.dataframe()
+        df = analysis.dataframe(metric="episode_reward_mean", mode="max")
 
         if len(df) == 0 or "trial_id" not in df.columns:
             return
diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py
index 2da4c33e8..afa5de622 100644
--- a/python/ray/tune/analysis/experiment_analysis.py
+++ b/python/ray/tune/analysis/experiment_analysis.py
@@ -1,11 +1,17 @@
 import json
 import logging
 import os
+from typing import Dict
+
+from ray.tune.checkpoint_manager import Checkpoint
+from ray.tune.utils import flatten_dict
 
 try:
     import pandas as pd
+    from pandas import DataFrame
 except ImportError:
     pd = None
+    DataFrame = None
 
 from ray.tune.error import TuneError
 from ray.tune.result import EXPR_PROGRESS_FILE, EXPR_PARAM_FILE,\
@@ -80,6 +86,9 @@ class Analysis:
         Returns:
             pd.DataFrame: Constructed from a result dict of each trial.
         """
+        metric = self._validate_metric(metric)
+        mode = self._validate_mode(mode)
+
         rows = self._retrieve_rows(metric=metric, mode=mode)
         all_configs = self.get_all_configs(prefix=True)
         for path, config in all_configs.items():
@@ -227,6 +236,9 @@ class Analysis:
         mode = self._validate_mode(mode)
 
         checkpoint_paths = self.get_trial_checkpoints_paths(trial, metric)
+        if not checkpoint_paths:
+            logger.error(f"No checkpoints have been found for trial {trial}.")
+            return None
         if mode == "max":
             return max(checkpoint_paths, key=lambda x: x[1])[0]
         else:
@@ -316,7 +328,150 @@ class ExperimentAnalysis(Analysis):
             os.path.dirname(experiment_checkpoint_path), default_metric,
             default_mode)
 
-    def get_best_trial(self, metric=None, mode=None, scope="all"):
+    @property
+    def best_trial(self) -> Trial:
+        """Return the best trial of the experiment.
+
+        Trials are ranked by their last reported result, using the
+        `metric` and `mode` parameters that were passed to `tune.run()`.
+
+        If those parameters were not passed, call
+        `get_best_trial(metric, mode, scope)` instead.
+        """
+        if self.default_metric and self.default_mode:
+            return self.get_best_trial(self.default_metric, self.default_mode)
+        raise ValueError(
+            "To fetch the `best_trial`, pass a `metric` and `mode` "
+            "parameter to `tune.run()`. Alternatively, use the "
+            "`get_best_trial(metric, mode)` method to set the metric "
+            "and mode explicitly.")
+
+    @property
+    def best_config(self) -> Dict:
+        """Return the config of the experiment's best trial.
+
+        Trials are ranked by their last reported result, using the
+        `metric` and `mode` parameters that were passed to `tune.run()`.
+
+        If those parameters were not passed, call
+        `get_best_config(metric, mode, scope)` instead.
+        """
+        if self.default_metric and self.default_mode:
+            return self.get_best_config(self.default_metric, self.default_mode)
+        raise ValueError(
+            "To fetch the `best_config`, pass a `metric` and `mode` "
+            "parameter to `tune.run()`. Alternatively, use the "
+            "`get_best_config(metric, mode)` method to set the metric "
+            "and mode explicitly.")
+
+    @property
+    def best_checkpoint(self) -> str:
+        """Get the checkpoint path of the best trial of the experiment
+
+        The best trial is determined by comparing the last trial results
+        using the `metric` and `mode` parameters passed to `tune.run()`.
+        If you didn't pass these parameters, use
+        `get_best_checkpoint(trial, metric, mode)` instead.
+
+        Returns `None` if no checkpoints were found for the best trial.
+        """
+        if not self.default_metric or not self.default_mode:
+            raise ValueError(
+                "To fetch the `best_checkpoint`, pass a `metric` and `mode` "
+                "parameter to `tune.run()`. Alternatively, use the "
+                "`get_best_checkpoint(trial, metric, mode)` method to set the "
+                "metric and mode explicitly.")
+        return self.get_best_checkpoint(
+            self.best_trial, self.default_metric, self.default_mode)
+
+    @property
+    def best_logdir(self) -> str:
+        """Return the logdir of the experiment's best trial.
+
+        Trials are ranked by their last reported result, using the
+        `metric` and `mode` parameters that were passed to `tune.run()`.
+
+        If those parameters were not passed, call
+        `get_best_logdir(metric, mode)` instead.
+        """
+        if self.default_metric and self.default_mode:
+            return self.get_best_logdir(self.default_metric, self.default_mode)
+        raise ValueError(
+            "To fetch the `best_logdir`, pass a `metric` and `mode` "
+            "parameter to `tune.run()`. Alternatively, use the "
+            "`get_best_logdir(metric, mode, scope)` method to set the "
+            "metric and mode explicitly.")
+
+    @property
+    def best_dataframe(self) -> DataFrame:
+        """Get the full result dataframe of the best trial of the experiment
+
+        The best trial is determined by comparing the last trial results
+        using the `metric` and `mode` parameters passed to `tune.run()`.
+
+        If you didn't pass these parameters, use
+        `get_best_logdir(metric, mode)` and use it to look for the dataframe
+        in the `self.trial_dataframes` dict.
+        """
+        if not self.default_metric or not self.default_mode:
+            raise ValueError(
+                "To fetch the `best_dataframe`, pass a `metric` and `mode` "
+                "parameter to `tune.run()`.")
+        best_logdir = self.best_logdir
+        return self.trial_dataframes[best_logdir]
+
+    @property
+    def best_result(self) -> Dict:
+        """Return the last result reported by the experiment's best trial.
+
+        Trials are ranked by their last reported result, using the
+        `metric` and `mode` parameters that were passed to `tune.run()`.
+
+        If those parameters were not passed, use
+        `get_best_trial(metric, mode, scope).last_result` instead.
+        """
+        if not (self.default_metric and self.default_mode):
+            raise ValueError(
+                "To fetch the `best_result`, pass a `metric` and `mode` "
+                "parameter to `tune.run()`. Alternatively, use "
+                "`get_best_trial(metric, mode).last_result` to set "
+                "the metric and mode explicitly and fetch the last result.")
+        return self.best_trial.last_result
+
+    @property
+    def best_result_df(self) -> DataFrame:
+        """Return the best trial's last result as a pandas dataframe.
+
+        The best trial is chosen by comparing the last trial results with
+        the `metric` and `mode` parameters passed to `tune.run()`.
+
+        If those parameters were not passed, fall back to
+        `get_best_trial(metric, mode, scope).last_result`.
+        """
+        if not pd:
+            raise ValueError("`best_result_df` requires pandas. Install with "
+                             "`pip install pandas`.")
+        flat_result = flatten_dict(self.best_result, delimiter=".")
+        return pd.DataFrame.from_records([flat_result], index="trial_id")
+
+    @property
+    def results(self) -> Dict[str, Dict]:
+        """Map each trial's ID to the last result that trial reported."""
+        return {t.trial_id: t.last_result for t in self.trials}
+
+    @property
+    def results_df(self) -> DataFrame:
+        """Get the last result of all trials as a pandas dataframe."""
+        if not pd:
+            raise ValueError("`results_df` requires pandas. Install with "
+                             "`pip install pandas`.")
+        records = [
+            flatten_dict(trial.last_result, delimiter=".")
+            for trial in self.trials
+        ]
+        return pd.DataFrame.from_records(records, index="trial_id")
+
+    def get_best_trial(self, metric=None, mode=None, scope="last"):
         """Retrieve the best trial object.
 
         Compares all trials' scores on ``metric``.
@@ -380,7 +535,7 @@ class ExperimentAnalysis(Analysis):
                 "parameter?")
         return best_trial
 
-    def get_best_config(self, metric=None, mode=None, scope="all"):
+    def get_best_config(self, metric=None, mode=None, scope="last"):
         """Retrieve the best config corresponding to the trial.
 
         Compares all trials' scores on `metric`.
@@ -407,7 +562,7 @@ class ExperimentAnalysis(Analysis):
         best_trial = self.get_best_trial(metric, mode, scope)
         return best_trial.config if best_trial else None
 
-    def get_best_logdir(self, metric=None, mode=None, scope="all"):
+    def get_best_logdir(self, metric=None, mode=None, scope="last"):
         """Retrieve the logdir corresponding to the best trial.
 
         Compares all trials' scores on `metric`.
diff --git a/python/ray/tune/commands.py b/python/ray/tune/commands.py
index 2ab17e609..7fbbe9776 100644
--- a/python/ray/tune/commands.py
+++ b/python/ray/tune/commands.py
@@ -116,7 +116,8 @@ def list_trials(experiment_path,
     _check_tabulate()
 
     try:
-        checkpoints_df = Analysis(experiment_path).dataframe()
+        checkpoints_df = Analysis(experiment_path).dataframe(
+            metric="episode_reward_mean", mode="max")
     except TuneError:
         raise click.ClickException("No trial data found!")
 
diff --git a/python/ray/tune/examples/pbt_dcgan_mnist/pbt_dcgan_mnist_trainable.py b/python/ray/tune/examples/pbt_dcgan_mnist/pbt_dcgan_mnist_trainable.py
index 1d6b3b7e3..8dea4fbcd 100644
--- a/python/ray/tune/examples/pbt_dcgan_mnist/pbt_dcgan_mnist_trainable.py
+++ b/python/ray/tune/examples/pbt_dcgan_mnist/pbt_dcgan_mnist_trainable.py
@@ -160,6 +160,6 @@ if __name__ == "__main__":
 
     # demo of the trained Generators
     if not args.smoke_test:
-        logdirs = analysis.dataframe()["logdir"].tolist()
+        logdirs = analysis.results_df["logdir"].tolist()
         model_paths = [os.path.join(d, "exported_models") for d in logdirs]
         demo_gan(analysis, model_paths)
diff --git a/python/ray/tune/suggest/bayesopt.py b/python/ray/tune/suggest/bayesopt.py
index 8554d3a1d..d5c7684c1 100644
--- a/python/ray/tune/suggest/bayesopt.py
+++ b/python/ray/tune/suggest/bayesopt.py
@@ -285,8 +285,10 @@ class BayesOptSearch(Searcher):
             analysis (ExperimentAnalysis): Optionally, the previous analysis
                 to integrate.
         """
-        for (_, report), params in zip(analysis.dataframe().iterrows(),
-                                       analysis.get_all_configs().values()):
+        for (_, report), params in zip(
+                analysis.dataframe(metric=self._metric,
+                                   mode=self._mode).iterrows(),
+                analysis.get_all_configs().values()):
             # We add the obtained results to the
             # gaussian process optimizer
             self._register_result(params, report)
diff --git a/python/ray/tune/tests/example.py b/python/ray/tune/tests/example.py
index 69d1f854b..383dd5ecb 100644
--- a/python/ray/tune/tests/example.py
+++ b/python/ray/tune/tests/example.py
@@ -39,5 +39,5 @@ print("Best config: ", analysis.get_best_config(
     metric="mean_loss", mode="min"))
 
 # Get a dataframe for analyzing trial results.
-df = analysis.dataframe()
+df = analysis.results_df
 # __quick_start_end__
diff --git a/python/ray/tune/tests/test_api.py b/python/ray/tune/tests/test_api.py
index fa0213dd8..3dc3d9fb2 100644
--- a/python/ray/tune/tests/test_api.py
+++ b/python/ray/tune/tests/test_api.py
@@ -520,7 +520,8 @@ class TrainableFunctionApiTest(unittest.TestCase):
         analysis = tune.run(train, num_samples=10, stop=stopper)
         self.assertTrue(
             all(t.status == Trial.TERMINATED for t in analysis.trials))
-        self.assertTrue(len(analysis.dataframe()) <= top)
+        self.assertTrue(
+            len(analysis.dataframe(metric="test", mode="max")) <= top)
 
         patience = 5
         stopper = EarlyStopping("test", top=top, mode="min", patience=patience)
@@ -528,14 +529,16 @@ class TrainableFunctionApiTest(unittest.TestCase):
         analysis = tune.run(train, num_samples=20, stop=stopper)
         self.assertTrue(
             all(t.status == Trial.TERMINATED for t in analysis.trials))
-        self.assertTrue(len(analysis.dataframe()) <= patience)
+        self.assertTrue(
+            len(analysis.dataframe(metric="test", mode="max")) <= patience)
 
         stopper = EarlyStopping("test", top=top, mode="min")
 
         analysis = tune.run(train, num_samples=10, stop=stopper)
         self.assertTrue(
             all(t.status == Trial.TERMINATED for t in analysis.trials))
-        self.assertTrue(len(analysis.dataframe()) <= top)
+        self.assertTrue(
+            len(analysis.dataframe(metric="test", mode="max")) <= top)
 
     def testBadStoppingFunction(self):
         def train(config, reporter):
diff --git a/python/ray/tune/tests/test_experiment_analysis.py b/python/ray/tune/tests/test_experiment_analysis.py
index 5195c7825..bac891cc9 100644
--- a/python/ray/tune/tests/test_experiment_analysis.py
+++ b/python/ray/tune/tests/test_experiment_analysis.py
@@ -7,7 +7,7 @@ import pandas as pd
 from numpy import nan
 
 import ray
-from ray.tune import run, sample_from
+from ray import tune
 from ray.tune.examples.async_hyperband_example import MyTrainableClass
 
 
@@ -26,7 +26,7 @@ class ExperimentAnalysisSuite(unittest.TestCase):
         ray.shutdown()
 
     def run_test_exp(self):
-        self.ea = run(
+        self.ea = tune.run(
             MyTrainableClass,
             name=self.test_name,
             local_dir=self.test_dir,
@@ -34,13 +34,14 @@ class ExperimentAnalysisSuite(unittest.TestCase):
             checkpoint_freq=1,
             num_samples=self.num_samples,
             config={
-                "width": sample_from(
+                "width": tune.sample_from(
                     lambda spec: 10 + int(90 * random.random())),
-                "height": sample_from(lambda spec: int(100 * random.random())),
+                "height": tune.sample_from(
+                    lambda spec: int(100 * random.random())),
             })
 
     def nan_test_exp(self):
-        nan_ea = run(
+        nan_ea = tune.run(
             lambda x: nan,
             name="testing_nan",
             local_dir=self.test_dir,
@@ -48,14 +49,15 @@ class ExperimentAnalysisSuite(unittest.TestCase):
             checkpoint_freq=1,
             num_samples=self.num_samples,
             config={
-                "width": sample_from(
+                "width": tune.sample_from(
                     lambda spec: 10 + int(90 * random.random())),
-                "height": sample_from(lambda spec: int(100 * random.random())),
+                "height": tune.sample_from(
+                    lambda spec: int(100 * random.random())),
             })
         return nan_ea
 
     def testDataframe(self):
-        df = self.ea.dataframe()
+        df = self.ea.dataframe(self.metric, mode="max")
 
         self.assertTrue(isinstance(df, pd.DataFrame))
         self.assertEquals(df.shape[0], self.num_samples)
@@ -143,21 +145,50 @@ class ExperimentAnalysisSuite(unittest.TestCase):
             self.assertEqual(df.training_iteration.max(), 1)
 
     def testIgnoreOtherExperiment(self):
-        analysis = run(
+        analysis = tune.run(
             MyTrainableClass,
             name="test_example",
             local_dir=self.test_dir,
             stop={"training_iteration": 1},
             num_samples=1,
             config={
-                "width": sample_from(
+                "width": tune.sample_from(
                     lambda spec: 10 + int(90 * random.random())),
-                "height": sample_from(lambda spec: int(100 * random.random())),
+                "height": tune.sample_from(
+                    lambda spec: int(100 * random.random())),
             })
-        df = analysis.dataframe()
+        df = analysis.dataframe(self.metric, mode="max")
         self.assertEquals(df.shape[0], 1)
 
 
+class ExperimentAnalysisPropertySuite(unittest.TestCase):
+    def testBestProperties(self):
+        def train(config):
+            for i in range(10):
+                with tune.checkpoint_dir(i):
+                    pass
+                tune.report(res=config["base"] + i)
+
+        ea = tune.run(
+            train,
+            config={"base": tune.grid_search([100, 200, 300])},
+            metric="res",
+            mode="max")
+
+        trials = ea.trials
+
+        # Expects trials[2] to be the base=300 trial (last `res`: 300 + 9).
+        self.assertEqual(ea.best_trial, trials[2])
+        self.assertEqual(ea.best_config, trials[2].config)
+        self.assertEqual(ea.best_logdir, trials[2].logdir)
+        self.assertEqual(ea.best_checkpoint, trials[2].checkpoint.value)
+        self.assertTrue(
+            all(ea.best_dataframe["trial_id"] == trials[2].trial_id))
+        self.assertEqual(ea.results_df.loc[trials[2].trial_id, "res"], 309)
+        self.assertEqual(ea.best_result["res"], 309)
+        self.assertEqual(ea.best_result_df.loc[trials[2].trial_id, "res"], 309)
+
+
 if __name__ == "__main__":
     import pytest
     import sys
diff --git a/python/ray/tune/tests/test_experiment_analysis_mem.py b/python/ray/tune/tests/test_experiment_analysis_mem.py
index 4e299a758..4ef9a51f8 100644
--- a/python/ray/tune/tests/test_experiment_analysis_mem.py
+++ b/python/ray/tune/tests/test_experiment_analysis_mem.py
@@ -83,10 +83,10 @@ class ExperimentAnalysisInMemorySuite(unittest.TestCase):
             num_samples=1,
             config={"id": grid_search(list(range(5)))})
 
-        max_all = ea.get_best_trial("score",
-                                    "max").metric_analysis["score"]["max"]
-        min_all = ea.get_best_trial("score",
-                                    "min").metric_analysis["score"]["min"]
+        max_all = ea.get_best_trial("score", "max",
+                                    "all").metric_analysis["score"]["max"]
+        min_all = ea.get_best_trial("score", "min",
+                                    "all").metric_analysis["score"]["min"]
         max_last = ea.get_best_trial("score", "max",
                                      "last").metric_analysis["score"]["last"]
         max_avg = ea.get_best_trial("score", "max",
@@ -149,7 +149,7 @@ class AnalysisSuite(unittest.TestCase):
 
     def testDataframe(self):
         analysis = Analysis(self.test_dir)
-        df = analysis.dataframe()
+        df = analysis.dataframe(self.metric, mode="max")
         self.assertTrue(isinstance(df, pd.DataFrame))
         self.assertEqual(df.shape[0], self.num_samples * 2)
 
diff --git a/python/ray/tune/tests/test_trial_scheduler_pbt.py b/python/ray/tune/tests/test_trial_scheduler_pbt.py
index 740616e8c..5af7cb467 100644
--- a/python/ray/tune/tests/test_trial_scheduler_pbt.py
+++ b/python/ray/tune/tests/test_trial_scheduler_pbt.py
@@ -82,15 +82,24 @@ class PopulationBasedTrainingSynchTest(unittest.TestCase):
 
     def testAsynchFail(self):
         analysis = self.synchSetup(False)
-        self.assertTrue(any(analysis.dataframe()["mean_accuracy"] != 33))
+        self.assertTrue(
+            any(
+                analysis.dataframe(metric="mean_accuracy", mode="max")
+                ["mean_accuracy"] != 33))
 
     def testSynchPass(self):
         analysis = self.synchSetup(True)
-        self.assertTrue(all(analysis.dataframe()["mean_accuracy"] == 33))
+        self.assertTrue(
+            all(
+                analysis.dataframe(metric="mean_accuracy", mode="max")[
+                    "mean_accuracy"] == 33))
 
     def testSynchPassLast(self):
         analysis = self.synchSetup(True, param=[30, 20, 10])
-        self.assertTrue(all(analysis.dataframe()["mean_accuracy"] == 33))
+        self.assertTrue(
+            all(
+                analysis.dataframe(metric="mean_accuracy", mode="max")[
+                    "mean_accuracy"] == 33))
 
 
 class PopulationBasedTrainingConfigTest(unittest.TestCase):
diff --git a/python/ray/tune/tests/tutorial.py b/python/ray/tune/tests/tutorial.py
index f0e5fa5af..2aa442279 100644
--- a/python/ray/tune/tests/tutorial.py
+++ b/python/ray/tune/tests/tutorial.py
@@ -166,7 +166,7 @@ analysis = tune.run(train_mnist, num_samples=10, search_alg=hyperopt_search)
 # __run_analysis_begin__
 import os
 
-df = analysis.dataframe()
+df = analysis.results_df
 logdir = analysis.get_best_logdir("mean_accuracy", mode="max")
 state_dict = torch.load(os.path.join(logdir, "model.pth"))