From d9c4dea7cf57f653eb24833aec97e57b5a829a66 Mon Sep 17 00:00:00 2001
From: Kai Fricke <krfricke@users.noreply.github.com>
Date: Thu, 24 Sep 2020 18:00:48 +0100
Subject: [PATCH] [tune] strict metric checking (#10972)

---
 doc/source/tune/_tutorials/tune-xgboost.rst   | 216 ++++++++----------
 doc/source/tune/user-guide.rst                |  29 +++
 python/ray/tune/examples/xgboost_example.py   |  46 ++--
 python/ray/tune/progress_reporter.py          |   2 +-
 python/ray/tune/schedulers/trial_scheduler.py |  10 +
 python/ray/tune/suggest/search.py             |  10 +
 python/ray/tune/suggest/search_generator.py   |   4 +
 python/ray/tune/tests/test_api.py             |  48 ++++
 python/ray/tune/tests/test_sample.py          |   2 +-
 python/ray/tune/trial.py                      |   8 +-
 python/ray/tune/trial_runner.py               |  58 ++++-
 python/ray/tune/tune.py                       |   3 +-
 12 files changed, 275 insertions(+), 161 deletions(-)

diff --git a/doc/source/tune/_tutorials/tune-xgboost.rst b/doc/source/tune/_tutorials/tune-xgboost.rst
index d8ac98668..dbb768c63 100644
--- a/doc/source/tune/_tutorials/tune-xgboost.rst
+++ b/doc/source/tune/_tutorials/tune-xgboost.rst
@@ -73,7 +73,6 @@ Here is the full code to train a simple XGBoost model:
 
 .. code-block:: python
 
-    import numpy as np
     import sklearn.datasets
     import sklearn.metrics
     from sklearn.model_selection import train_test_split
@@ -90,31 +89,37 @@ Here is the full code to train a simple XGBoost model:
         train_set = xgb.DMatrix(train_x, label=train_y)
         test_set = xgb.DMatrix(test_x, label=test_y)
         # Train the classifier
-        bst = xgb.train(config, train_set, evals=[(test_set, "eval")], verbose_eval=False)
-        # Predict labels for the test set
-        preds = bst.predict(test_set)
-        pred_labels = np.rint(preds)
-        # Return prediction accuracy
-        accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels)
-        return accuracy
+        results = {}
+        bst = xgb.train(
+            config,
+            train_set,
+            evals=[(test_set, "eval")],
+            evals_result=results,
+            verbose_eval=False)
+        return results
 
 
     if __name__ == "__main__":
-        accuracy = train_breast_cancer({
-            "objective": "binary:logistic"
+        results = train_breast_cancer({
+            "objective": "binary:logistic",
+            "eval_metric": ["logloss", "error"]
         })
-        print("Accuracy: {:.2f}".format(accuracy))
+        accuracy = 1. - results["eval"]["error"][-1]
+        print(f"Accuracy: {accuracy:.4f}")
+
 
 As you can see, the code is quite simple. First, the dataset is loaded and split
-into a ``test`` and ``train`` set. The XGBoost model is trained with ``xgb.train()``
-and the predictions for the test set are obtained with ``bst.predict()``. Lastly, we
-return the accuracy of our predictions. Even in this simple example, most runs result
+into a ``test`` and ``train`` set. The XGBoost model is trained with ``xgb.train()``.
+XGBoost automatically evaluates the metrics we specified on the test set. In our case
+it calculates the *logloss* and the prediction *error*, which is the fraction of
+misclassified examples. To calculate the accuracy, we just have to subtract the error
+from ``1.0``. Even in this simple example, most runs result
 in a good accuracy of over ``0.90``.
 
 Maybe you have noticed the ``config`` parameter we pass to the XGBoost algorithm. This
 is a ``dict`` in which you can specify parameters for the XGBoost algorithm. In this
-simple example, the only parameter we passed is the ``objective`` parameter. The value
-``binary:logistic`` tells XGBoost that we aim to train a logistic regression model for
+simple example, the only parameters we passed are the ``objective`` and ``eval_metric`` parameters.
+The value ``binary:logistic`` tells XGBoost that we aim to train a logistic regression model for
 a binary classification task. You can find an overview over all valid objectives
 `here in the XGBoost documentation <https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters>`_.
 
@@ -228,13 +233,15 @@ Let's see how this looks like in code! We just need to adjust our ``config`` dic
     if __name__ == "__main__":
         config = {
             "objective": "binary:logistic",
+            "eval_metric": ["logloss", "error"],
             "max_depth": 2,
             "min_child_weight": 0,
             "subsample": 0.8,
             "eta": 0.2
         }
-        accuracy = train_breast_cancer(config)
-        print("Accuracy: {:.2f}".format(accuracy))
+        results = train_breast_cancer(config)
+        accuracy = 1. - results["eval"]["error"][-1]
+        print(f"Accuracy: {accuracy:.4f}")
 
 The rest stays the same. Please note that we do not adjust the ``num_boost_rounds`` here.
 The result should also show a high accuracy of over 90%.
@@ -261,9 +268,8 @@ Let's start with a basic example on how to use Tune for this. We just need to ma
 a few changes to our code-block:
 
 .. code-block:: python
-   :emphasize-lines: 26,32,33,34,35,37,38,39,40,41
+   :emphasize-lines: 26-28,35-38,40-44
 
-    import numpy as np
     import sklearn.datasets
     import sklearn.metrics
     from sklearn.model_selection import train_test_split
@@ -282,29 +288,34 @@ a few changes to our code-block:
         train_set = xgb.DMatrix(train_x, label=train_y)
         test_set = xgb.DMatrix(test_x, label=test_y)
         # Train the classifier
-        bst = xgb.train(config, train_set, evals=[(test_set, "eval")], verbose_eval=False)
-        # Predict labels for the test set
-        preds = bst.predict(test_set)
-        pred_labels = np.rint(preds)
+        results = {}
+        xgb.train(
+            config,
+            train_set,
+            evals=[(test_set, "eval")],
+            evals_result=results,
+            verbose_eval=False)
         # Return prediction accuracy
-        accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels)
+        accuracy = 1. - results["eval"]["error"][-1]
         tune.report(mean_accuracy=accuracy, done=True)
 
 
     if __name__ == "__main__":
         config = {
             "objective": "binary:logistic",
+            "eval_metric": ["logloss", "error"],
             "max_depth": tune.randint(1, 9),
             "min_child_weight": tune.choice([1, 2, 3]),
             "subsample": tune.uniform(0.5, 1.0),
             "eta": tune.loguniform(1e-4, 1e-1)
         }
-        tune.run(
+        analysis = tune.run(
             train_breast_cancer,
             resources_per_trial={"cpu": 1},
             config=config,
             num_samples=10)
 
+
 As you can see, the changes in the actual training function are minimal. Instead of
 returning the accuracy value, we report it back to Tune using ``tune.report()``.
 Our ``config`` dictionary only changed slightly. Instead of passing hard-coded
@@ -332,26 +343,27 @@ hyperparameter configurations from this search space.
 The output of our training run coud look like this:
 
 .. code-block:: bash
-   :emphasize-lines: 10
+   :emphasize-lines: 14
 
+    Number of trials: 10/10 (10 TERMINATED)
     +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+
     | Trial name                      | status     | loc   |         eta |   max_depth |   min_child_weight |   subsample |      acc |   iter |   total time (s) |
     |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------|
-    | train_breast_cancer_c817a_00000 | TERMINATED |       | 0.00334038  |           8 |                  1 |    0.640256 | 0.93007  |      1 |        0.050081  |
-    | train_breast_cancer_c817a_00001 | TERMINATED |       | 0.00285335  |           4 |                  3 |    0.951621 | 0.93007  |      1 |        0.0453899 |
-    | train_breast_cancer_c817a_00002 | TERMINATED |       | 0.0597631   |           5 |                  2 |    0.96479  | 0.986014 |      1 |        0.0503612 |
-    | train_breast_cancer_c817a_00003 | TERMINATED |       | 0.000650095 |           6 |                  2 |    0.923812 | 0.951049 |      1 |        0.0588872 |
-    | train_breast_cancer_c817a_00004 | TERMINATED |       | 0.00753275  |           1 |                  1 |    0.973499 | 0.881119 |      1 |        0.0347321 |
-    | train_breast_cancer_c817a_00005 | TERMINATED |       | 0.000411214 |           5 |                  1 |    0.672503 | 0.958042 |      1 |        0.0477931 |
-    | train_breast_cancer_c817a_00006 | TERMINATED |       | 0.0940201   |           5 |                  2 |    0.711124 | 0.972028 |      1 |        0.069901  |
-    | train_breast_cancer_c817a_00007 | TERMINATED |       | 0.0372492   |           1 |                  1 |    0.76303  | 0.895105 |      1 |        0.0496318 |
-    | train_breast_cancer_c817a_00008 | TERMINATED |       | 0.000140322 |           1 |                  2 |    0.885415 | 0.909091 |      1 |        0.045424  |
-    | train_breast_cancer_c817a_00009 | TERMINATED |       | 0.000341654 |           5 |                  3 |    0.720523 | 0.937063 |      1 |        0.0657773 |
+    | train_breast_cancer_b63aa_00000 | TERMINATED |       | 0.000117625 |           2 |                  2 |    0.616347 | 0.916084 |      1 |        0.0306492 |
+    | train_breast_cancer_b63aa_00001 | TERMINATED |       | 0.0382954   |           8 |                  2 |    0.581549 | 0.937063 |      1 |        0.0357082 |
+    | train_breast_cancer_b63aa_00002 | TERMINATED |       | 0.000217926 |           1 |                  3 |    0.528428 | 0.874126 |      1 |        0.0264609 |
+    | train_breast_cancer_b63aa_00003 | TERMINATED |       | 0.000120929 |           8 |                  1 |    0.634508 | 0.958042 |      1 |        0.036406  |
+    | train_breast_cancer_b63aa_00004 | TERMINATED |       | 0.00839715  |           5 |                  1 |    0.730624 | 0.958042 |      1 |        0.0389378 |
+    | train_breast_cancer_b63aa_00005 | TERMINATED |       | 0.000732948 |           8 |                  2 |    0.915863 | 0.958042 |      1 |        0.0382841 |
+    | train_breast_cancer_b63aa_00006 | TERMINATED |       | 0.000856226 |           4 |                  1 |    0.645209 | 0.916084 |      1 |        0.0357089 |
+    | train_breast_cancer_b63aa_00007 | TERMINATED |       | 0.00769908  |           7 |                  1 |    0.729443 | 0.909091 |      1 |        0.0390737 |
+    | train_breast_cancer_b63aa_00008 | TERMINATED |       | 0.00186339  |           5 |                  3 |    0.595744 | 0.944056 |      1 |        0.0343912 |
+    | train_breast_cancer_b63aa_00009 | TERMINATED |       | 0.000950272 |           3 |                  2 |    0.835504 | 0.965035 |      1 |        0.0348201 |
     +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+
 
-The best configuration we found used ``eta=0.0940201``, ``max_depth=5``,
-``min_child_weight=2``, ``subsample=0.711124`` and reached an accuracy of
-``0.972028``.
+The best configuration we found used ``eta=0.000950272``, ``max_depth=3``,
+``min_child_weight=2``, ``subsample=0.835504`` and reached an accuracy of
+``0.965035``.
 
 Early stopping
 --------------
@@ -385,97 +397,49 @@ Lastly, we have to report the loss metric to Tune. We do this with a ``Callback`
 XGBoost accepts and calls after each evaluation round. Ray Tune comes
 with :ref:`two XGBoost callbacks <tune-integration-xgboost>`
 we can use for this. The ``TuneReportCallback`` just reports the evaluation
-metrics back to Tune. The ``TuneReportCheckpointCallback`` would also save
-checkpoints after each evaluation round. We will just use the former in this
-example.
+metrics back to Tune. The ``TuneReportCheckpointCallback`` also saves
+checkpoints after each evaluation round. We will just use the latter in this
+example so that we can retrieve the saved model later.
 
-We also tell XGBoost which loss metrics to calculate in the ``eval_metric``
-parameter in the config. These parameters are then reported to Tune
-via the callback.
+The metrics from the ``eval_metric`` configuration setting are then automatically
+reported to Tune via the callback. Here, the raw error will be reported, not the accuracy.
+To display the best accuracy reached, we will later subtract the error from ``1.0``.
 
-.. code-block:: python
-   :emphasize-lines: 9,26,42,44-49
+We will also load the best checkpointed model so that we can use it for predictions.
+The best model is selected with respect to the ``metric`` and ``mode`` parameters we
+pass to ``tune.run()``.
 
-    import numpy as np
-    import sklearn.datasets
-    import sklearn.metrics
-    from ray.tune.schedulers import ASHAScheduler
-    from sklearn.model_selection import train_test_split
-    import xgboost as xgb
-
-    from ray import tune
-    from ray.tune.integration.xgboost import TuneReportCallback
-
-    def train_breast_cancer(config):
-        # Load dataset
-        data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
-        # Split into train and test set
-        train_x, test_x, train_y, test_y = train_test_split(
-            data, labels, test_size=0.25)
-        # Build input matrices for XGBoost
-        train_set = xgb.DMatrix(train_x, label=train_y)
-        test_set = xgb.DMatrix(test_x, label=test_y)
-        # Train the classifier
-        bst = xgb.train(
-            config,
-            train_set,
-            evals=[(test_set, "eval")],
-            verbose_eval=False,
-            callbacks=[TuneReportCallback()])
-        # Predict labels for the test set
-        preds = bst.predict(test_set)
-        pred_labels = np.rint(preds)
-        # Return prediction accuracy
-        accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels)
-        tune.report(mean_accuracy=accuracy, done=True)
-
-
-    if __name__ == "__main__":
-        config = {
-            "objective": "binary:logistic",
-            "max_depth": tune.randint(1, 9),
-            "min_child_weight": tune.choice([1, 2, 3]),
-            "subsample": tune.uniform(0.5, 1.0),
-            "eta": tune.loguniform(1e-4, 1e-1),
-            "eval_metric": ["auc", "ams@0", "logloss"]
-        }
-        scheduler = ASHAScheduler(
-            metric="eval-logloss",  # The `eval` prefix is defined in xgb.train
-            mode="min",  # Retain configurations with a low logloss
-            max_t=11,  # 10 training iterations + 1 final evaluation
-            grace_period=1,  # Number of minimum iterations for each trial
-            reduction_factor=2)  # How aggressively to stop trials
-        tune.run(
-            train_breast_cancer,
-            resources_per_trial={"cpu": 1},
-            config=config,
-            num_samples=10,
-            scheduler=scheduler)
+.. literalinclude:: /../../python/ray/tune/examples/xgboost_example.py
+   :language: python
+   :emphasize-lines: 8,25,37-40,44-45,49,51-57
 
 The output of our run could look like this:
 
 .. code-block:: bash
-   :emphasize-lines: 13
+   :emphasize-lines: 7
 
-    +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+
-    | Trial name                      | status     | loc   |         eta |   max_depth |   min_child_weight |   subsample |      acc |   iter |   total time (s) |
-    |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------|
-    | train_breast_cancer_806ea_00000 | TERMINATED |       | 0.0371055   |           2 |                  1 |    0.611729 | 0.951049 |     11 |        0.339279  |
-    | train_breast_cancer_806ea_00001 | TERMINATED |       | 0.0324613   |           3 |                  2 |    0.643815 |          |      4 |        0.230338  |
-    | train_breast_cancer_806ea_00002 | TERMINATED |       | 0.0100875   |           4 |                  3 |    0.985147 |          |      2 |        0.0661929 |
-    | train_breast_cancer_806ea_00003 | TERMINATED |       | 0.00124263  |           1 |                  3 |    0.890299 |          |      1 |        0.0201721 |
-    | train_breast_cancer_806ea_00004 | TERMINATED |       | 0.000230373 |           5 |                  3 |    0.627611 |          |      1 |        0.0265107 |
-    | train_breast_cancer_806ea_00005 | TERMINATED |       | 0.000186942 |           5 |                  2 |    0.831801 |          |      1 |        0.026082  |
-    | train_breast_cancer_806ea_00006 | TERMINATED |       | 0.00871051  |           2 |                  3 |    0.721523 | 0.958042 |     11 |        0.299392  |
-    | train_breast_cancer_806ea_00007 | TERMINATED |       | 0.00440949  |           2 |                  3 |    0.606252 |          |      1 |        0.0210171 |
-    | train_breast_cancer_806ea_00008 | TERMINATED |       | 0.00948289  |           5 |                  2 |    0.892979 |          |      2 |        0.140424  |
-    | train_breast_cancer_806ea_00009 | TERMINATED |       | 0.0514017   |           2 |                  1 |    0.859864 | 0.972028 |     11 |        0.365437  |
-    +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+
+    Number of trials: 10/10 (10 TERMINATED)
+    +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------+
+    | Trial name                      | status     | loc   |         eta |   max_depth |   min_child_weight |   subsample |   iter |   total time (s) |   eval-logloss |   eval-error |
+    |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------|
+    | train_breast_cancer_ba275_00000 | TERMINATED |       | 0.00205087  |           2 |                  1 |    0.898391 |     10 |        0.380619  |       0.678039 |     0.090909 |
+    | train_breast_cancer_ba275_00001 | TERMINATED |       | 0.000183834 |           4 |                  3 |    0.924939 |      1 |        0.0228798 |       0.693009 |     0.111888 |
+    | train_breast_cancer_ba275_00002 | TERMINATED |       | 0.0242721   |           7 |                  2 |    0.501551 |     10 |        0.376154  |       0.54472  |     0.06993  |
+    | train_breast_cancer_ba275_00003 | TERMINATED |       | 0.000449692 |           5 |                  3 |    0.890212 |      1 |        0.0234981 |       0.692811 |     0.090909 |
+    | train_breast_cancer_ba275_00004 | TERMINATED |       | 0.000376393 |           7 |                  2 |    0.883609 |      1 |        0.0231569 |       0.692847 |     0.062937 |
+    | train_breast_cancer_ba275_00005 | TERMINATED |       | 0.00231942  |           3 |                  3 |    0.877464 |      2 |        0.104867  |       0.689541 |     0.083916 |
+    | train_breast_cancer_ba275_00006 | TERMINATED |       | 0.000542326 |           1 |                  2 |    0.578584 |      1 |        0.0213971 |       0.692765 |     0.083916 |
+    | train_breast_cancer_ba275_00007 | TERMINATED |       | 0.0016801   |           1 |                  2 |    0.975302 |      1 |        0.02226   |       0.691999 |     0.083916 |
+    | train_breast_cancer_ba275_00008 | TERMINATED |       | 0.000595756 |           8 |                  3 |    0.58429  |      1 |        0.0221152 |       0.692657 |     0.06993  |
+    | train_breast_cancer_ba275_00009 | TERMINATED |       | 0.000357845 |           8 |                  1 |    0.637776 |      1 |        0.022635  |       0.692859 |     0.090909 |
+    +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------+
 
-As you can see, four trials have been stopped after just one iteration, two after two iterations,
-one after four iterations, and the three most promising configurations have been run for
-ten iterations. The 11 is due to the fact that we finally report the accuracy after
-training the full model, which is internally interpreted as another iteration.
+
+    Best model parameters: {'objective': 'binary:logistic', 'eval_metric': ['logloss', 'error'], 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.5015513240240503, 'eta': 0.024272050872920895}
+    Best model total accuracy: 0.9301
+
+As you can see, most trials have been stopped after only a few iterations. Only the
+two most promising trials were run for the full 10 iterations.
 
 Using fractional GPUs
 ---------------------
@@ -487,16 +451,16 @@ Tune supports *fractional GPUs*. This means that each task is assigned a fractio
 of the GPU memory for training. For 10 tasks, this could look like this:
 
 .. code-block:: python
-   :emphasize-lines: 8,12
+   :emphasize-lines: 4,12
 
     config = {
         "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
+        "tree_method": "gpu_hist",
         "max_depth": tune.randint(1, 9),
         "min_child_weight": tune.choice([1, 2, 3]),
         "subsample": tune.uniform(0.5, 1.0),
-        "eta": tune.loguniform(1e-4, 1e-1),
-        "eval_metric": ["auc", "ams@0", "logloss"],
-        "tree_method": "gpu_hist"
+        "eta": tune.loguniform(1e-4, 1e-1)
     }
     tune.run(
         train_breast_cancer,
diff --git a/doc/source/tune/user-guide.rst b/doc/source/tune/user-guide.rst
index 082f74eb5..57ab549ef 100644
--- a/doc/source/tune/user-guide.rst
+++ b/doc/source/tune/user-guide.rst
@@ -535,6 +535,35 @@ By default, ``tune.run`` will continue executing until all trials have terminate
 
 This is useful when you are trying to setup a large hyperparameter experiment.
 
+Environment variables
+---------------------
+Some of Ray Tune's behavior can be configured using environment variables.
+These are the environment variables Ray Tune currently considers:
+
+* **TUNE_CLUSTER_SSH_KEY**: SSH key used by the Tune driver process to connect
+  to remote cluster machines for checkpoint syncing. If this is not set,
+  ``~/ray_bootstrap_key.pem`` will be used.
+* **TUNE_DISABLE_AUTO_INIT**: Disable automatically calling ``ray.init()`` if
+  not attached to a Ray session.
+* **TUNE_DISABLE_STRICT_METRIC_CHECKING**: When you report metrics to Tune via
+  ``tune.report()`` and have passed a ``metric`` parameter to ``tune.run()``, a scheduler,
+  or a search algorithm, Tune will raise an error
+  if that metric is not present in the reported result. Setting this environment variable
+  to ``1`` will disable this check.
+* **TUNE_GLOBAL_CHECKPOINT_S**: Time in seconds that limits how often Tune's
+  experiment state is checkpointed. If not set this will default to ``10``.
+* **TUNE_MAX_LEN_IDENTIFIER**: Maximum length of trial subdirectory names (those
+  with the parameter values in them). If not set, this will default to ``130``.
+* **TUNE_RESULT_DIR**: Directory where Tune trial results are stored. If this
+  is not set, ``~/ray_results`` will be used.
+
+
+There are some environment variables that are mostly relevant for integrated libraries:
+
+* **SIGOPT_KEY**: SigOpt API access key.
+* **WANDB_API_KEY**: Weights and Biases API key. You can also use ``wandb login``
+  instead.
+
 
 Further Questions or Issues?
 ----------------------------
diff --git a/python/ray/tune/examples/xgboost_example.py b/python/ray/tune/examples/xgboost_example.py
index 73303e221..285734722 100644
--- a/python/ray/tune/examples/xgboost_example.py
+++ b/python/ray/tune/examples/xgboost_example.py
@@ -1,4 +1,3 @@
-import numpy as np
 import sklearn.datasets
 import sklearn.metrics
 from ray.tune.schedulers import ASHAScheduler
@@ -6,7 +5,7 @@ from sklearn.model_selection import train_test_split
 import xgboost as xgb
 
 from ray import tune
-from ray.tune.integration.xgboost import TuneReportCallback
+from ray.tune.integration.xgboost import TuneReportCheckpointCallback
 
 
 def train_breast_cancer(config):
@@ -19,39 +18,44 @@ def train_breast_cancer(config):
     train_set = xgb.DMatrix(train_x, label=train_y)
     test_set = xgb.DMatrix(test_x, label=test_y)
     # Train the classifier
-    bst = xgb.train(
+    xgb.train(
         config,
         train_set,
         evals=[(test_set, "eval")],
         verbose_eval=False,
-        callbacks=[TuneReportCallback()])
-    # Predict labels for the test set
-    preds = bst.predict(test_set)
-    pred_labels = np.rint(preds)
-    # Return prediction accuracy
-    accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels)
-    tune.report(mean_accuracy=accuracy, done=True)
+        callbacks=[TuneReportCheckpointCallback(filename="model.xgb")])
 
 
 if __name__ == "__main__":
     config = {
         "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
         "max_depth": tune.randint(1, 9),
         "min_child_weight": tune.choice([1, 2, 3]),
         "subsample": tune.uniform(0.5, 1.0),
-        "eta": tune.loguniform(1e-4, 1e-1),
-        "eval_metric": ["auc", "ams@0", "logloss"]
+        "eta": tune.loguniform(1e-4, 1e-1)
     }
-    # The ASHAScheduler stops bad performing configurations early
     scheduler = ASHAScheduler(
-        metric="eval-logloss",  # The `eval` prefix is defined in xgb.train
-        mode="min",  # Retain configurations with a low logloss
-        max_t=11,  # 10 training iterations + 1 final evaluation
-        grace_period=1,  # Number of minimum iterations for each trial
-        reduction_factor=2)  # How aggressively to stop trials
-    tune.run(
-        train_breast_cancer,  # your training function
+        max_t=10,  # 10 training iterations
+        grace_period=1,
+        reduction_factor=2)
+
+    analysis = tune.run(
+        train_breast_cancer,
+        metric="eval-logloss",
+        mode="min",
         resources_per_trial={"cpu": 1},  # You can add "gpu": 0.1 here
         config=config,
-        num_samples=10,  # number of parameter configurations to try
+        num_samples=10,
         scheduler=scheduler)
+
+    # Load the best model checkpoint
+    import os
+    best_bst = xgb.Booster()
+    best_bst.load_model(os.path.join(analysis.best_checkpoint, "model.xgb"))
+    accuracy = 1. - analysis.best_result["eval-error"]
+    print(f"Best model parameters: {analysis.best_config}")
+    print(f"Best model total accuracy: {accuracy:.4f}")
+
+    # You could now do further predictions with
+    # best_bst.predict(...)
diff --git a/python/ray/tune/progress_reporter.py b/python/ray/tune/progress_reporter.py
index f54a8ea58..531c1da57 100644
--- a/python/ray/tune/progress_reporter.py
+++ b/python/ray/tune/progress_reporter.py
@@ -278,7 +278,7 @@ class TuneReporterBase(ProgressReporter):
                 continue
             if not best_metric or \
                t.last_result[metric] * metric_op > best_metric:
-                best_metric = t.last_result[metric]
+                best_metric = t.last_result[metric] * metric_op
                 best_trial = t
         return best_trial, metric
 
diff --git a/python/ray/tune/schedulers/trial_scheduler.py b/python/ray/tune/schedulers/trial_scheduler.py
index 9b61287b7..56df73943 100644
--- a/python/ray/tune/schedulers/trial_scheduler.py
+++ b/python/ray/tune/schedulers/trial_scheduler.py
@@ -11,6 +11,12 @@ class TrialScheduler:
     PAUSE = "PAUSE"  #: Status for pausing trial execution
     STOP = "STOP"  #: Status for stopping trial execution
 
+    _metric = None
+
+    @property
+    def metric(self):
+        return self._metric
+
     def set_search_properties(self, metric: Optional[str],
                               mode: Optional[str]) -> bool:
         """Pass search properties to scheduler.
@@ -22,6 +28,10 @@ class TrialScheduler:
             metric (str): Metric to optimize
             mode (str): One of ["min", "max"]. Direction to optimize.
         """
+        if self._metric and metric:
+            return False
+        if metric:
+            self._metric = metric
         return True
 
     def on_trial_add(self, trial_runner: "trial_runner.TrialRunner",
diff --git a/python/ray/tune/suggest/search.py b/python/ray/tune/suggest/search.py
index 7654924e5..f667bf615 100644
--- a/python/ray/tune/suggest/search.py
+++ b/python/ray/tune/suggest/search.py
@@ -17,6 +17,12 @@ class SearchAlgorithm:
     """
     _finished = False
 
+    _metric = None
+
+    @property
+    def metric(self):
+        return self._metric
+
     def set_search_properties(self, metric: Optional[str], mode: Optional[str],
                               config: Dict) -> bool:
         """Pass search properties to search algorithm.
@@ -33,6 +39,10 @@ class SearchAlgorithm:
             mode (str): One of ["min", "max"]. Direction to optimize.
             config (dict): Tune config dict.
         """
+        if self._metric and metric:
+            return False
+        if metric:
+            self._metric = metric
         return True
 
     @property
diff --git a/python/ray/tune/suggest/search_generator.py b/python/ray/tune/suggest/search_generator.py
index 19dfa8be3..15b24286d 100644
--- a/python/ray/tune/suggest/search_generator.py
+++ b/python/ray/tune/suggest/search_generator.py
@@ -70,6 +70,10 @@ class SearchGenerator(SearchAlgorithm):
         self._total_samples = 0  # int: total samples to evaluate.
         self._finished = False
 
+    @property
+    def metric(self):
+        return self.searcher.metric
+
     def set_search_properties(self, metric: Optional[str], mode: Optional[str],
                               config: Dict) -> bool:
         return self.searcher.set_search_properties(metric, mode, config)
diff --git a/python/ray/tune/tests/test_api.py b/python/ray/tune/tests/test_api.py
index f362967ad..b612ba4db 100644
--- a/python/ray/tune/tests/test_api.py
+++ b/python/ray/tune/tests/test_api.py
@@ -1146,6 +1146,54 @@ class TrainableFunctionApiTest(unittest.TestCase):
         diff = time.time() - start
         self.assertLess(diff, 9)
 
+    def testMetricCheckingEndToEnd(self):
+        from ray import tune
+
+        def train(config):
+            tune.report(val=4, second=8)
+
+        def train2(config):
+            return
+
+        os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "0"
+        # `acc` is not reported, should raise
+        with self.assertRaises(TuneError):
+            # The trial runner raises a ValueError, but the experiment fails
+            # with a TuneError
+            tune.run(train, metric="acc")
+
+        # `val` is reported, should not raise
+        tune.run(train, metric="val")
+
+        # Run does not report anything, should not raise
+        tune.run(train2, metric="val")
+
+        # Only the scheduler requires a metric
+        with self.assertRaises(TuneError):
+            tune.run(
+                train,
+                scheduler=AsyncHyperBandScheduler(metric="acc", mode="max"))
+
+        tune.run(
+            train, scheduler=AsyncHyperBandScheduler(metric="val", mode="max"))
+
+        # Only the search alg requires a metric
+        with self.assertRaises(TuneError):
+            tune.run(
+                train,
+                config={"a": tune.choice([1, 2])},
+                search_alg=HyperOptSearch(metric="acc", mode="max"))
+
+        # Metric is passed
+        tune.run(
+            train,
+            config={"a": tune.choice([1, 2])},
+            search_alg=HyperOptSearch(metric="val", mode="max"))
+
+        os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "1"
+        # With strict metric checking disabled, this should not raise
+        tune.run(train, metric="acc")
+
 
 class ShimCreationTest(unittest.TestCase):
     def testCreateScheduler(self):
diff --git a/python/ray/tune/tests/test_sample.py b/python/ray/tune/tests/test_sample.py
index c534d3144..e2f9bff50 100644
--- a/python/ray/tune/tests/test_sample.py
+++ b/python/ray/tune/tests/test_sample.py
@@ -252,7 +252,7 @@ class SearchSpaceTest(unittest.TestCase):
         with self.assertRaises(ValueError):
             searcher.set_search_properties("none", "max", invalid_config)
 
-        searcher = BayesOptSearch(metric="a", mode="max")
+        searcher = BayesOptSearch(metric="b", mode="max")
         analysis = tune.run(
             _mock_objective, config=config, search_alg=searcher, num_samples=1)
         trial = analysis.trials[0]
diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py
index f67cf0fcf..8db3a0e72 100644
--- a/python/ray/tune/trial.py
+++ b/python/ray/tune/trial.py
@@ -26,8 +26,14 @@ from ray.tune.utils import flatten_dict
 from ray.utils import binary_to_hex, hex_to_binary
 
 DEBUG_PRINT_INTERVAL = 5
-MAX_LEN_IDENTIFIER = int(os.environ.get("MAX_LEN_IDENTIFIER", 130))
 logger = logging.getLogger(__name__)
+if os.getenv("MAX_LEN_IDENTIFIER") is not None:
+    logger.error(
+        "The MAX_LEN_IDENTIFIER environment variable is deprecated and will "
+        "be removed in the future. Use TUNE_MAX_LEN_IDENTIFIER instead.")
+# TUNE_MAX_LEN_IDENTIFIER wins; the deprecated name is only a fallback.
+MAX_LEN_IDENTIFIER = int(
+    os.getenv("TUNE_MAX_LEN_IDENTIFIER", os.getenv("MAX_LEN_IDENTIFIER", 130)))
 
 
 def date_str():
diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py
index d13073ad4..593993e0a 100644
--- a/python/ray/tune/trial_runner.py
+++ b/python/ray/tune/trial_runner.py
@@ -132,15 +132,20 @@ class TrialRunner:
                  fail_fast=False,
                  verbose=True,
                  checkpoint_period=None,
-                 trial_executor=None):
+                 trial_executor=None,
+                 metric=None):
         self._search_alg = search_alg or BasicVariantGenerator()
         self._scheduler_alg = scheduler or FIFOScheduler()
         self.trial_executor = trial_executor or RayTrialExecutor()
 
-        # For debugging, it may be useful to halt trials after some time has
-        # elapsed. TODO(ekl) consider exposing this in the API.
-        self._global_time_limit = float(
-            os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float("inf")))
+        self._metric = metric
+
+        if "TRIALRUNNER_WALLTIME_LIMIT" in os.environ:
+            raise ValueError(
+                "The TRIALRUNNER_WALLTIME_LIMIT environment variable is "
+                "deprecated. "
+                "Use `tune.run(time_budget_s=limit)` instead.")
+
         self._total_time = 0
         self._iteration = 0
         self._has_errored = False
@@ -349,11 +354,6 @@ class TrialRunner:
 
     def is_finished(self):
         """Returns whether all trials have finished running."""
-        if self._total_time > self._global_time_limit:
-            logger.warning("Exceeded global time limit {} / {}".format(
-                self._total_time, self._global_time_limit))
-            return True
-
         trials_done = all(trial.is_finished() for trial in self._trials)
         return trials_done and self._search_alg.is_finished()
 
@@ -527,6 +527,7 @@ class TrialRunner:
                 result = trial.last_result
                 result.update(done=True)
 
+            self._validate_result_metrics(result)
             self._total_time += result.get(TIME_THIS_ITER_S, 0)
 
             flat_result = flatten_dict(result)
@@ -572,6 +573,43 @@ class TrialRunner:
                 raise
             self._process_trial_failure(trial, traceback.format_exc())
 
+    def _validate_result_metrics(self, result):
+        """Raise if a required metric is missing from a trial result.
+
+        The metrics of `tune.run()`, the scheduler and the search algorithm
+        are checked in that order; metrics reported in nested dicts are
+        looked up in the flattened result. If the only item is `done`, no
+        result was ever reported and the trial just returned, which is okay.
+
+        Args:
+            result (dict): Last result of the trial.
+
+        Raises:
+            ValueError: If a required metric is not part of the result.
+        """
+        if int(os.environ.get("TUNE_DISABLE_STRICT_METRIC_CHECKING", 0)) == 1:
+            return
+        if len(result) <= 1 and "done" in result:
+            return
+
+        required = [
+            (self._metric, "tune.run()"),
+            (self._scheduler_alg.metric, type(self._scheduler_alg).__name__),
+            (self._search_alg.metric, type(self._search_alg).__name__),
+        ]
+        for metric, location in required:
+            # Flatten only if the direct lookup fails; results may be nested
+            if (metric and metric not in result
+                    and metric not in flatten_dict(result)):
+                raise ValueError(
+                    "Trial returned a result which did not include the "
+                    "specified metric `{}` that `{}` expects. "
+                    "Make sure your calls to `tune.report()` include the "
+                    "metric, or set the "
+                    "TUNE_DISABLE_STRICT_METRIC_CHECKING "
+                    "environment variable to 1. Result: {}".format(
+                        metric, location, result))
+
     def _process_trial_save(self, trial):
         """Processes a trial save.
 
diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py
index a4117599b..bc7d48a44 100644
--- a/python/ray/tune/tune.py
+++ b/python/ray/tune/tune.py
@@ -374,7 +374,8 @@ def run(
         server_port=server_port,
         verbose=bool(verbose > 1),
         fail_fast=fail_fast,
-        trial_executor=trial_executor)
+        trial_executor=trial_executor,
+        metric=metric)
 
     if not runner.resumed:
         for exp in experiments: