From d9c4dea7cf57f653eb24833aec97e57b5a829a66 Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Thu, 24 Sep 2020 18:00:48 +0100 Subject: [PATCH] [tune] strict metric checking (#10972) --- doc/source/tune/_tutorials/tune-xgboost.rst | 216 ++++++++---------- doc/source/tune/user-guide.rst | 29 +++ python/ray/tune/examples/xgboost_example.py | 46 ++-- python/ray/tune/progress_reporter.py | 2 +- python/ray/tune/schedulers/trial_scheduler.py | 10 + python/ray/tune/suggest/search.py | 10 + python/ray/tune/suggest/search_generator.py | 4 + python/ray/tune/tests/test_api.py | 48 ++++ python/ray/tune/tests/test_sample.py | 2 +- python/ray/tune/trial.py | 8 +- python/ray/tune/trial_runner.py | 58 ++++- python/ray/tune/tune.py | 3 +- 12 files changed, 275 insertions(+), 161 deletions(-) diff --git a/doc/source/tune/_tutorials/tune-xgboost.rst b/doc/source/tune/_tutorials/tune-xgboost.rst index d8ac98668..dbb768c63 100644 --- a/doc/source/tune/_tutorials/tune-xgboost.rst +++ b/doc/source/tune/_tutorials/tune-xgboost.rst @@ -73,7 +73,6 @@ Here is the full code to train a simple XGBoost model: .. 
code-block:: python - import numpy as np import sklearn.datasets import sklearn.metrics from sklearn.model_selection import train_test_split @@ -90,31 +89,37 @@ Here is the full code to train a simple XGBoost model: train_set = xgb.DMatrix(train_x, label=train_y) test_set = xgb.DMatrix(test_x, label=test_y) # Train the classifier - bst = xgb.train(config, train_set, evals=[(test_set, "eval")], verbose_eval=False) - # Predict labels for the test set - preds = bst.predict(test_set) - pred_labels = np.rint(preds) - # Return prediction accuracy - accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels) - return accuracy + results = {} + bst = xgb.train( + config, + train_set, + evals=[(test_set, "eval")], + evals_result=results, + verbose_eval=False) + return results if __name__ == "__main__": - accuracy = train_breast_cancer({ - "objective": "binary:logistic" + results = train_breast_cancer({ + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"] }) - print("Accuracy: {:.2f}".format(accuracy)) + accuracy = 1. - results["eval"]["error"][-1] + print(f"Accuracy: {accuracy:.4f}") + As you can see, the code is quite simple. First, the dataset is loaded and split -into a ``test`` and ``train`` set. The XGBoost model is trained with ``xgb.train()`` -and the predictions for the test set are obtained with ``bst.predict()``. Lastly, we -return the accuracy of our predictions. Even in this simple example, most runs result +into a ``test`` and ``train`` set. The XGBoost model is trained with ``xgb.train()``. +XGBoost automatically evaluates metrics we specified on the test set. In our case +it calculates the *logloss* and the prediction *error*, which is the percentage of +misclassified examples. To calculate the accuracy, we just have to subtract the error +from ``1.0``. Even in this simple example, most runs result in a good accuracy of over ``0.90``. Maybe you have noticed the ``config`` parameter we pass to the XGBoost algorithm. 
This is a ``dict`` in which you can specify parameters for the XGBoost algorithm. In this -simple example, the only parameter we passed is the ``objective`` parameter. The value -``binary:logistic`` tells XGBoost that we aim to train a logistic regression model for +simple example, the only parameters we passed are the ``objective`` and ``eval_metric`` parameters. +The value ``binary:logistic`` tells XGBoost that we aim to train a logistic regression model for a binary classification task. You can find an overview over all valid objectives `here in the XGBoost documentation `_. @@ -228,13 +233,15 @@ Let's see how this looks like in code! We just need to adjust our ``config`` dic if __name__ == "__main__": config = { "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], "max_depth": 2, "min_child_weight": 0, "subsample": 0.8, "eta": 0.2 } - accuracy = train_breast_cancer(config) - print("Accuracy: {:.2f}".format(accuracy)) + results = train_breast_cancer(config) + accuracy = 1. - results["eval"]["error"][-1] + print(f"Accuracy: {accuracy:.4f}") The rest stays the same. Please note that we do not adjust the ``num_boost_rounds`` here. The result should also show a high accuracy of over 90%. @@ -261,9 +268,8 @@ Let's start with a basic example on how to use Tune for this. We just need to ma a few changes to our code-block: .. 
code-block:: python - :emphasize-lines: 26,32,33,34,35,37,38,39,40,41 + :emphasize-lines: 26-28,35-38,40-44 - import numpy as np import sklearn.datasets import sklearn.metrics from sklearn.model_selection import train_test_split @@ -282,29 +288,34 @@ a few changes to our code-block: train_set = xgb.DMatrix(train_x, label=train_y) test_set = xgb.DMatrix(test_x, label=test_y) # Train the classifier - bst = xgb.train(config, train_set, evals=[(test_set, "eval")], verbose_eval=False) - # Predict labels for the test set - preds = bst.predict(test_set) - pred_labels = np.rint(preds) + results = {} + xgb.train( + config, + train_set, + evals=[(test_set, "eval")], + evals_result=results, + verbose_eval=False) # Return prediction accuracy - accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels) + accuracy = 1. - results["eval"]["error"][-1] tune.report(mean_accuracy=accuracy, done=True) if __name__ == "__main__": config = { "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], "max_depth": tune.randint(1, 9), "min_child_weight": tune.choice([1, 2, 3]), "subsample": tune.uniform(0.5, 1.0), "eta": tune.loguniform(1e-4, 1e-1) } - tune.run( + analysis = tune.run( train_breast_cancer, resources_per_trial={"cpu": 1}, config=config, num_samples=10) + As you can see, the changes in the actual training function are minimal. Instead of returning the accuracy value, we report it back to Tune using ``tune.report()``. Our ``config`` dictionary only changed slightly. Instead of passing hard-coded @@ -332,26 +343,27 @@ hyperparameter configurations from this search space. The output of our training run coud look like this: .. 
code-block:: bash - :emphasize-lines: 10 + :emphasize-lines: 14 + Number of trials: 10/10 (10 TERMINATED) +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+ | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | acc | iter | total time (s) | |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------| - | train_breast_cancer_c817a_00000 | TERMINATED | | 0.00334038 | 8 | 1 | 0.640256 | 0.93007 | 1 | 0.050081 | - | train_breast_cancer_c817a_00001 | TERMINATED | | 0.00285335 | 4 | 3 | 0.951621 | 0.93007 | 1 | 0.0453899 | - | train_breast_cancer_c817a_00002 | TERMINATED | | 0.0597631 | 5 | 2 | 0.96479 | 0.986014 | 1 | 0.0503612 | - | train_breast_cancer_c817a_00003 | TERMINATED | | 0.000650095 | 6 | 2 | 0.923812 | 0.951049 | 1 | 0.0588872 | - | train_breast_cancer_c817a_00004 | TERMINATED | | 0.00753275 | 1 | 1 | 0.973499 | 0.881119 | 1 | 0.0347321 | - | train_breast_cancer_c817a_00005 | TERMINATED | | 0.000411214 | 5 | 1 | 0.672503 | 0.958042 | 1 | 0.0477931 | - | train_breast_cancer_c817a_00006 | TERMINATED | | 0.0940201 | 5 | 2 | 0.711124 | 0.972028 | 1 | 0.069901 | - | train_breast_cancer_c817a_00007 | TERMINATED | | 0.0372492 | 1 | 1 | 0.76303 | 0.895105 | 1 | 0.0496318 | - | train_breast_cancer_c817a_00008 | TERMINATED | | 0.000140322 | 1 | 2 | 0.885415 | 0.909091 | 1 | 0.045424 | - | train_breast_cancer_c817a_00009 | TERMINATED | | 0.000341654 | 5 | 3 | 0.720523 | 0.937063 | 1 | 0.0657773 | + | train_breast_cancer_b63aa_00000 | TERMINATED | | 0.000117625 | 2 | 2 | 0.616347 | 0.916084 | 1 | 0.0306492 | + | train_breast_cancer_b63aa_00001 | TERMINATED | | 0.0382954 | 8 | 2 | 0.581549 | 0.937063 | 1 | 0.0357082 | + | train_breast_cancer_b63aa_00002 | TERMINATED | | 0.000217926 | 1 | 3 | 0.528428 | 0.874126 | 1 | 0.0264609 | + | 
train_breast_cancer_b63aa_00003 | TERMINATED | | 0.000120929 | 8 | 1 | 0.634508 | 0.958042 | 1 | 0.036406 | + | train_breast_cancer_b63aa_00004 | TERMINATED | | 0.00839715 | 5 | 1 | 0.730624 | 0.958042 | 1 | 0.0389378 | + | train_breast_cancer_b63aa_00005 | TERMINATED | | 0.000732948 | 8 | 2 | 0.915863 | 0.958042 | 1 | 0.0382841 | + | train_breast_cancer_b63aa_00006 | TERMINATED | | 0.000856226 | 4 | 1 | 0.645209 | 0.916084 | 1 | 0.0357089 | + | train_breast_cancer_b63aa_00007 | TERMINATED | | 0.00769908 | 7 | 1 | 0.729443 | 0.909091 | 1 | 0.0390737 | + | train_breast_cancer_b63aa_00008 | TERMINATED | | 0.00186339 | 5 | 3 | 0.595744 | 0.944056 | 1 | 0.0343912 | + | train_breast_cancer_b63aa_00009 | TERMINATED | | 0.000950272 | 3 | 2 | 0.835504 | 0.965035 | 1 | 0.0348201 | +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+ -The best configuration we found used ``eta=0.0940201``, ``max_depth=5``, -``min_child_weight=2``, ``subsample=0.711124`` and reached an accuracy of -``0.972028``. +The best configuration we found used ``eta=0.000950272``, ``max_depth=3``, +``min_child_weight=2``, ``subsample=0.835504`` and reached an accuracy of +``0.965035``. Early stopping -------------- @@ -385,97 +397,49 @@ Lastly, we have to report the loss metric to Tune. We do this with a ``Callback` XGBoost accepts and calls after each evaluation round. Ray Tune comes with :ref:`two XGBoost callbacks ` we can use for this. The ``TuneReportCallback`` just reports the evaluation -metrics back to Tune. The ``TuneReportCheckpointCallback`` would also save -checkpoints after each evaluation round. We will just use the former in this -example. +metrics back to Tune. The ``TuneReportCheckpointCallback`` also saves +checkpoints after each evaluation round. We will just use the latter in this +example so that we can retrieve the saved model later. 
-We also tell XGBoost which loss metrics to calculate in the ``eval_metric`` -parameter in the config. These parameters are then reported to Tune -via the callback. +The metrics listed in the ``eval_metric`` configuration setting are then automatically +reported to Tune via the callback. Here, the raw error will be reported, not the accuracy. +To display the best reached accuracy, we will subtract the error from ``1.0`` later. -.. code-block:: python - :emphasize-lines: 9,26,42,44-49 +We will also load the best checkpointed model so that we can use it for predictions. +The best model is selected with respect to the ``metric`` and ``mode`` parameters we +pass to ``tune.run()``. - import numpy as np - import sklearn.datasets - import sklearn.metrics - from ray.tune.schedulers import ASHAScheduler - from sklearn.model_selection import train_test_split - import xgboost as xgb - - from ray import tune - from ray.tune.integration.xgboost import TuneReportCallback - - def train_breast_cancer(config): - # Load dataset - data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True) - # Split into train and test set - train_x, test_x, train_y, test_y = train_test_split( - data, labels, test_size=0.25) - # Build input matrices for XGBoost - train_set = xgb.DMatrix(train_x, label=train_y) - test_set = xgb.DMatrix(test_x, label=test_y) - # Train the classifier - bst = xgb.train( - config, - train_set, - evals=[(test_set, "eval")], - verbose_eval=False, - callbacks=[TuneReportCallback()]) - # Predict labels for the test set - preds = bst.predict(test_set) - pred_labels = np.rint(preds) - # Return prediction accuracy - accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels) - tune.report(mean_accuracy=accuracy, done=True) - - - if __name__ == "__main__": - config = { - "objective": "binary:logistic", - "max_depth": tune.randint(1, 9), - "min_child_weight": tune.choice([1, 2, 3]), - "subsample": tune.uniform(0.5, 1.0), - "eta": tune.loguniform(1e-4, 1e-1), - "eval_metric": ["auc", 
"ams@0", "logloss"] - } - scheduler = ASHAScheduler( - metric="eval-logloss", # The `eval` prefix is defined in xgb.train - mode="min", # Retain configurations with a low logloss - max_t=11, # 10 training iterations + 1 final evaluation - grace_period=1, # Number of minimum iterations for each trial - reduction_factor=2) # How aggressively to stop trials - tune.run( - train_breast_cancer, - resources_per_trial={"cpu": 1}, - config=config, - num_samples=10, - scheduler=scheduler) +.. literalinclude:: /../../python/ray/tune/examples/xgboost_example.py + :language: python + :emphasize-lines: 8,25,37-40,44-45,49,51-57 The output of our run could look like this: .. code-block:: bash - :emphasize-lines: 13 + :emphasize-lines: 7 - +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+ - | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | acc | iter | total time (s) | - |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------| - | train_breast_cancer_806ea_00000 | TERMINATED | | 0.0371055 | 2 | 1 | 0.611729 | 0.951049 | 11 | 0.339279 | - | train_breast_cancer_806ea_00001 | TERMINATED | | 0.0324613 | 3 | 2 | 0.643815 | | 4 | 0.230338 | - | train_breast_cancer_806ea_00002 | TERMINATED | | 0.0100875 | 4 | 3 | 0.985147 | | 2 | 0.0661929 | - | train_breast_cancer_806ea_00003 | TERMINATED | | 0.00124263 | 1 | 3 | 0.890299 | | 1 | 0.0201721 | - | train_breast_cancer_806ea_00004 | TERMINATED | | 0.000230373 | 5 | 3 | 0.627611 | | 1 | 0.0265107 | - | train_breast_cancer_806ea_00005 | TERMINATED | | 0.000186942 | 5 | 2 | 0.831801 | | 1 | 0.026082 | - | train_breast_cancer_806ea_00006 | TERMINATED | | 0.00871051 | 2 | 3 | 0.721523 | 0.958042 | 11 | 0.299392 | - | train_breast_cancer_806ea_00007 | TERMINATED | | 0.00440949 | 2 | 3 | 0.606252 | | 1 | 
0.0210171 | - | train_breast_cancer_806ea_00008 | TERMINATED | | 0.00948289 | 5 | 2 | 0.892979 | | 2 | 0.140424 | - | train_breast_cancer_806ea_00009 | TERMINATED | | 0.0514017 | 2 | 1 | 0.859864 | 0.972028 | 11 | 0.365437 | - +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+ + Number of trials: 10/10 (10 TERMINATED) + +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------+ + | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | iter | total time (s) | eval-logloss | eval-error | + |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------| + | train_breast_cancer_ba275_00000 | TERMINATED | | 0.00205087 | 2 | 1 | 0.898391 | 10 | 0.380619 | 0.678039 | 0.090909 | + | train_breast_cancer_ba275_00001 | TERMINATED | | 0.000183834 | 4 | 3 | 0.924939 | 1 | 0.0228798 | 0.693009 | 0.111888 | + | train_breast_cancer_ba275_00002 | TERMINATED | | 0.0242721 | 7 | 2 | 0.501551 | 10 | 0.376154 | 0.54472 | 0.06993 | + | train_breast_cancer_ba275_00003 | TERMINATED | | 0.000449692 | 5 | 3 | 0.890212 | 1 | 0.0234981 | 0.692811 | 0.090909 | + | train_breast_cancer_ba275_00004 | TERMINATED | | 0.000376393 | 7 | 2 | 0.883609 | 1 | 0.0231569 | 0.692847 | 0.062937 | + | train_breast_cancer_ba275_00005 | TERMINATED | | 0.00231942 | 3 | 3 | 0.877464 | 2 | 0.104867 | 0.689541 | 0.083916 | + | train_breast_cancer_ba275_00006 | TERMINATED | | 0.000542326 | 1 | 2 | 0.578584 | 1 | 0.0213971 | 0.692765 | 0.083916 | + | train_breast_cancer_ba275_00007 | TERMINATED | | 0.0016801 | 1 | 2 | 0.975302 | 1 | 0.02226 | 0.691999 | 0.083916 | + | train_breast_cancer_ba275_00008 | TERMINATED | | 0.000595756 | 8 | 3 | 0.58429 | 1 | 
0.0221152 | 0.692657 | 0.06993 | + | train_breast_cancer_ba275_00009 | TERMINATED | | 0.000357845 | 8 | 1 | 0.637776 | 1 | 0.022635 | 0.692859 | 0.090909 | + +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------+ -As you can see, four trials have been stopped after just one iteration, two after two iterations, -one after four iterations, and the three most promising configurations have been run for -ten iterations. The 11 is due to the fact that we finally report the accuracy after -training the full model, which is internally interpreted as another iteration. + + Best model parameters: {'objective': 'binary:logistic', 'eval_metric': ['logloss', 'error'], 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.5015513240240503, 'eta': 0.024272050872920895} + Best model total accuracy: 0.9301 + +As you can see, most trials have been stopped only after a few iterations. Only the +two most promising trials were run for the full 10 iterations. Using fractional GPUs --------------------- @@ -487,16 +451,16 @@ Tune supports *fractional GPUs*. This means that each task is assigned a fractio of the GPU memory for training. For 10 tasks, this could look like this: .. 
code-block:: python - :emphasize-lines: 8,12 + :emphasize-lines: 4,12 config = { "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + "tree_method": "gpu_hist", "max_depth": tune.randint(1, 9), "min_child_weight": tune.choice([1, 2, 3]), "subsample": tune.uniform(0.5, 1.0), - "eta": tune.loguniform(1e-4, 1e-1), - "eval_metric": ["auc", "ams@0", "logloss"], - "tree_method": "gpu_hist" + "eta": tune.loguniform(1e-4, 1e-1) } tune.run( train_breast_cancer, diff --git a/doc/source/tune/user-guide.rst b/doc/source/tune/user-guide.rst index 082f74eb5..57ab549ef 100644 --- a/doc/source/tune/user-guide.rst +++ b/doc/source/tune/user-guide.rst @@ -535,6 +535,35 @@ By default, ``tune.run`` will continue executing until all trials have terminate This is useful when you are trying to setup a large hyperparameter experiment. +Environment variables +--------------------- +Some of Ray Tune's behavior can be configured using environment variables. +These are the environment variables Ray Tune currently considers: + +* **TUNE_CLUSTER_SSH_KEY**: SSH key used by the Tune driver process to connect + to remote cluster machines for checkpoint syncing. If this is not set, + ``~/ray_bootstrap_key.pem`` will be used. +* **TUNE_DISABLE_AUTO_INIT**: Disable automatically calling ``ray.init()`` if + not attached to a Ray session. +* **TUNE_DISABLE_STRICT_METRIC_CHECKING**: When you report metrics to Tune via + ``tune.report()`` and have passed a ``metric`` parameter to ``tune.run()``, a scheduler, + or a search algorithm, Tune will raise an error + if the metric is not reported in the result. Setting this environment variable + to ``1`` will disable this check. +* **TUNE_GLOBAL_CHECKPOINT_S**: Time in seconds that limits how often Tune's + experiment state is checkpointed. If not set, this will default to ``10``. 
+* **TUNE_MAX_LEN_IDENTIFIER**: Maximum length of trial subdirectory names (those + with the parameter values in them). If this is not set, ``130`` will be used. +* **TUNE_RESULT_DIR**: Directory where Tune trial results are stored. If this + is not set, ``~/ray_results`` will be used. + + +There are some environment variables that are mostly relevant for integrated libraries: + +* **SIGOPT_KEY**: SigOpt API access key. +* **WANDB_API_KEY**: Weights and Biases API key. You can also use ``wandb login`` + instead. + Further Questions or Issues? ---------------------------- diff --git a/python/ray/tune/examples/xgboost_example.py b/python/ray/tune/examples/xgboost_example.py index 73303e221..285734722 100644 --- a/python/ray/tune/examples/xgboost_example.py +++ b/python/ray/tune/examples/xgboost_example.py @@ -1,4 +1,3 @@ -import numpy as np import sklearn.datasets import sklearn.metrics from ray.tune.schedulers import ASHAScheduler @@ -6,7 +5,7 @@ from sklearn.model_selection import train_test_split import xgboost as xgb from ray import tune -from ray.tune.integration.xgboost import TuneReportCallback +from ray.tune.integration.xgboost import TuneReportCheckpointCallback def train_breast_cancer(config): @@ -19,39 +18,44 @@ def train_breast_cancer(config): train_set = xgb.DMatrix(train_x, label=train_y) test_set = xgb.DMatrix(test_x, label=test_y) # Train the classifier - bst = xgb.train( + xgb.train( config, train_set, evals=[(test_set, "eval")], verbose_eval=False, - callbacks=[TuneReportCallback()]) - # Predict labels for the test set - preds = bst.predict(test_set) - pred_labels = np.rint(preds) - # Return prediction accuracy - accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels) - tune.report(mean_accuracy=accuracy, done=True) + callbacks=[TuneReportCheckpointCallback(filename="model.xgb")]) if __name__ == "__main__": config = { "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], "max_depth": tune.randint(1, 9), "min_child_weight": tune.choice([1, 2, 3]), "subsample": 
tune.uniform(0.5, 1.0), - "eta": tune.loguniform(1e-4, 1e-1), - "eval_metric": ["auc", "ams@0", "logloss"] + "eta": tune.loguniform(1e-4, 1e-1) } - # The ASHAScheduler stops bad performing configurations early scheduler = ASHAScheduler( - metric="eval-logloss", # The `eval` prefix is defined in xgb.train - mode="min", # Retain configurations with a low logloss - max_t=11, # 10 training iterations + 1 final evaluation - grace_period=1, # Number of minimum iterations for each trial - reduction_factor=2) # How aggressively to stop trials - tune.run( - train_breast_cancer, # your training function + max_t=10, # 10 training iterations + grace_period=1, + reduction_factor=2) + + analysis = tune.run( + train_breast_cancer, + metric="eval-logloss", + mode="min", resources_per_trial={"cpu": 1}, # You can add "gpu": 0.1 here config=config, - num_samples=10, # number of parameter configurations to try + num_samples=10, scheduler=scheduler) + + # Load the best model checkpoint + import os + best_bst = xgb.Booster() + best_bst.load_model(os.path.join(analysis.best_checkpoint, "model.xgb")) + accuracy = 1. - analysis.best_result["eval-error"] + print(f"Best model parameters: {analysis.best_config}") + print(f"Best model total accuracy: {accuracy:.4f}") + + # You could now do further predictions with + # best_bst.predict(...) 
diff --git a/python/ray/tune/progress_reporter.py b/python/ray/tune/progress_reporter.py index f54a8ea58..531c1da57 100644 --- a/python/ray/tune/progress_reporter.py +++ b/python/ray/tune/progress_reporter.py @@ -278,7 +278,7 @@ class TuneReporterBase(ProgressReporter): continue if not best_metric or \ t.last_result[metric] * metric_op > best_metric: - best_metric = t.last_result[metric] + best_metric = t.last_result[metric] * metric_op best_trial = t return best_trial, metric diff --git a/python/ray/tune/schedulers/trial_scheduler.py b/python/ray/tune/schedulers/trial_scheduler.py index 9b61287b7..56df73943 100644 --- a/python/ray/tune/schedulers/trial_scheduler.py +++ b/python/ray/tune/schedulers/trial_scheduler.py @@ -11,6 +11,12 @@ class TrialScheduler: PAUSE = "PAUSE" #: Status for pausing trial execution STOP = "STOP" #: Status for stopping trial execution + _metric = None + + @property + def metric(self): + return self._metric + def set_search_properties(self, metric: Optional[str], mode: Optional[str]) -> bool: """Pass search properties to scheduler. @@ -22,6 +28,10 @@ class TrialScheduler: metric (str): Metric to optimize mode (str): One of ["min", "max"]. Direction to optimize. """ + if self._metric and metric: + return False + if metric: + self._metric = metric return True def on_trial_add(self, trial_runner: "trial_runner.TrialRunner", diff --git a/python/ray/tune/suggest/search.py b/python/ray/tune/suggest/search.py index 7654924e5..f667bf615 100644 --- a/python/ray/tune/suggest/search.py +++ b/python/ray/tune/suggest/search.py @@ -17,6 +17,12 @@ class SearchAlgorithm: """ _finished = False + _metric = None + + @property + def metric(self): + return self._metric + def set_search_properties(self, metric: Optional[str], mode: Optional[str], config: Dict) -> bool: """Pass search properties to search algorithm. @@ -33,6 +39,10 @@ class SearchAlgorithm: mode (str): One of ["min", "max"]. Direction to optimize. config (dict): Tune config dict. 
""" + if self._metric and metric: + return False + if metric: + self._metric = metric return True @property diff --git a/python/ray/tune/suggest/search_generator.py b/python/ray/tune/suggest/search_generator.py index 19dfa8be3..15b24286d 100644 --- a/python/ray/tune/suggest/search_generator.py +++ b/python/ray/tune/suggest/search_generator.py @@ -70,6 +70,10 @@ class SearchGenerator(SearchAlgorithm): self._total_samples = 0 # int: total samples to evaluate. self._finished = False + @property + def metric(self): + return self.searcher.metric + def set_search_properties(self, metric: Optional[str], mode: Optional[str], config: Dict) -> bool: return self.searcher.set_search_properties(metric, mode, config) diff --git a/python/ray/tune/tests/test_api.py b/python/ray/tune/tests/test_api.py index f362967ad..b612ba4db 100644 --- a/python/ray/tune/tests/test_api.py +++ b/python/ray/tune/tests/test_api.py @@ -1146,6 +1146,54 @@ class TrainableFunctionApiTest(unittest.TestCase): diff = time.time() - start self.assertLess(diff, 9) + def testMetricCheckingEndToEnd(self): + from ray import tune + + def train(config): + tune.report(val=4, second=8) + + def train2(config): + return + + os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "0" + # `acc` is not reported, should raise + with self.assertRaises(TuneError): + # The trial runner raises a ValueError, but the experiment fails + # with a TuneError + tune.run(train, metric="acc") + + # `val` is reported, should not raise + tune.run(train, metric="val") + + # Run does not report anything, should not raise + tune.run(train2, metric="val") + + # Only the scheduler requires a metric + with self.assertRaises(TuneError): + tune.run( + train, + scheduler=AsyncHyperBandScheduler(metric="acc", mode="max")) + + tune.run( + train, scheduler=AsyncHyperBandScheduler(metric="val", mode="max")) + + # Only the search alg requires a metric + with self.assertRaises(TuneError): + tune.run( + train, + config={"a": tune.choice([1, 2])}, + 
search_alg=HyperOptSearch(metric="acc", mode="max")) + + # Metric is passed + tune.run( + train, + config={"a": tune.choice([1, 2])}, + search_alg=HyperOptSearch(metric="val", mode="max")) + + os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "1" + # With strict metric checking disabled, this should not raise + tune.run(train, metric="acc") + class ShimCreationTest(unittest.TestCase): def testCreateScheduler(self): diff --git a/python/ray/tune/tests/test_sample.py b/python/ray/tune/tests/test_sample.py index c534d3144..e2f9bff50 100644 --- a/python/ray/tune/tests/test_sample.py +++ b/python/ray/tune/tests/test_sample.py @@ -252,7 +252,7 @@ class SearchSpaceTest(unittest.TestCase): with self.assertRaises(ValueError): searcher.set_search_properties("none", "max", invalid_config) - searcher = BayesOptSearch(metric="a", mode="max") + searcher = BayesOptSearch(metric="b", mode="max") analysis = tune.run( _mock_objective, config=config, search_alg=searcher, num_samples=1) trial = analysis.trials[0] diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index f67cf0fcf..8db3a0e72 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -26,8 +26,14 @@ from ray.tune.utils import flatten_dict from ray.utils import binary_to_hex, hex_to_binary DEBUG_PRINT_INTERVAL = 5 -MAX_LEN_IDENTIFIER = int(os.environ.get("MAX_LEN_IDENTIFIER", 130)) logger = logging.getLogger(__name__) +if "MAX_LEN_IDENTIFIER" in os.environ: + logger.error( + "The MAX_LEN_IDENTIFIER environment variable is deprecated and will " + "be removed in the future. 
Use TUNE_MAX_LEN_IDENTIFIER instead.") +MAX_LEN_IDENTIFIER = int( + os.environ.get("TUNE_MAX_LEN_IDENTIFIER", + os.environ.get("MAX_LEN_IDENTIFIER", 130))) def date_str(): diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py index d13073ad4..593993e0a 100644 --- a/python/ray/tune/trial_runner.py +++ b/python/ray/tune/trial_runner.py @@ -132,15 +132,20 @@ class TrialRunner: fail_fast=False, verbose=True, checkpoint_period=None, - trial_executor=None): + trial_executor=None, + metric=None): self._search_alg = search_alg or BasicVariantGenerator() self._scheduler_alg = scheduler or FIFOScheduler() self.trial_executor = trial_executor or RayTrialExecutor() - # For debugging, it may be useful to halt trials after some time has - # elapsed. TODO(ekl) consider exposing this in the API. - self._global_time_limit = float( - os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float("inf"))) + self._metric = metric + + if "TRIALRUNNER_WALLTIME_LIMIT" in os.environ: + raise ValueError( + "The TRIALRUNNER_WALLTIME_LIMIT environment variable is " + "deprecated. 
" + "Use `tune.run(time_budget_s=limit)` instead.") + self._total_time = 0 self._iteration = 0 self._has_errored = False @@ -349,11 +354,6 @@ class TrialRunner: def is_finished(self): """Returns whether all trials have finished running.""" - if self._total_time > self._global_time_limit: - logger.warning("Exceeded global time limit {} / {}".format( - self._total_time, self._global_time_limit)) - return True - trials_done = all(trial.is_finished() for trial in self._trials) return trials_done and self._search_alg.is_finished() @@ -527,6 +527,7 @@ class TrialRunner: result = trial.last_result result.update(done=True) + self._validate_result_metrics(result) self._total_time += result.get(TIME_THIS_ITER_S, 0) flat_result = flatten_dict(result) @@ -572,6 +573,43 @@ class TrialRunner: raise self._process_trial_failure(trial, traceback.format_exc()) + def _validate_result_metrics(self, result): + """ + Check if any of the required metrics was not reported + in the last result. If the only item is `done=True`, this + means that no result was ever received and the trial just + returned. This is also okay and will not raise an error. + """ + if int(os.environ.get("TUNE_DISABLE_STRICT_METRIC_CHECKING", + 0)) != 1 and (len(result) > 1 + or "done" not in result): + base_metric = self._metric + scheduler_metric = self._scheduler_alg.metric + search_metric = self._search_alg.metric + + if base_metric and base_metric not in result: + report_metric = base_metric + location = "tune.run()" + elif scheduler_metric and scheduler_metric not in result: + report_metric = scheduler_metric + location = type(self._scheduler_alg).__name__ + elif search_metric and search_metric not in result: + report_metric = search_metric + location = type(self._search_alg).__name__ + else: + report_metric = None + location = None + + if report_metric: + raise ValueError( + "Trial returned a result which did not include the " + "specified metric `{}` that `{}` expects. 
" + "Make sure your calls to `tune.report()` include the " + "metric, or set the " + "TUNE_DISABLE_STRICT_METRIC_CHECKING " + "environment variable to 1. Result: {}".format( + report_metric, location, result)) + def _process_trial_save(self, trial): """Processes a trial save. diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index a4117599b..bc7d48a44 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -374,7 +374,8 @@ def run( server_port=server_port, verbose=bool(verbose > 1), fail_fast=fail_fast, - trial_executor=trial_executor) + trial_executor=trial_executor, + metric=metric) if not runner.resumed: for exp in experiments: