From 34f6d2fc5c9bb924fe23145285f6f949316f2c10 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Wed, 4 Sep 2019 12:44:42 -0700 Subject: [PATCH] [tune] Update trainable docs and support hparams (#5558) --- doc/source/rllib-concepts.rst | 2 +- doc/source/rllib-training.rst | 2 +- doc/source/tune-tutorial.rst | 6 +- doc/source/tune-usage.rst | 190 ++++++++++++-------- doc/source/tune.rst | 11 +- python/ray/tune/examples/pbt_ppo_example.py | 5 +- python/ray/tune/logger.py | 31 +++- python/ray/tune/sample.py | 30 +++- python/ray/tune/suggest/basic_variant.py | 1 + python/ray/tune/suggest/suggestion.py | 1 + python/ray/tune/trainable.py | 99 +++++++--- python/ray/tune/trial.py | 4 + 12 files changed, 269 insertions(+), 113 deletions(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index 7dbac3c8a..fc18ae056 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -594,7 +594,7 @@ This is how the example in the previous section looks when written using a polic Trainers -------- -Trainers are the boilerplate classes that put the above components together, making algorithms accessible via Python API and the command line. They manage algorithm configuration, setup of the rollout workers and optimizer, and collection of training metrics. Trainers also implement the `Trainable API `__ for easy experiment management. +Trainers are the boilerplate classes that put the above components together, making algorithms accessible via Python API and the command line. They manage algorithm configuration, setup of the rollout workers and optimizer, and collection of training metrics. Trainers also implement the `Trainable API `__ for easy experiment management. 
Example of three equivalent ways of interacting with the PPO trainer, all of which log results in ``~/ray_results``: diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst index 2d58ad89f..5bf8dade7 100644 --- a/doc/source/rllib-training.rst +++ b/doc/source/rllib-training.rst @@ -178,7 +178,7 @@ Tune will schedule the trials to run in parallel on your Ray cluster: Custom Training Workflows ~~~~~~~~~~~~~~~~~~~~~~~~~ -In the `basic training example `__, Tune will call ``train()`` on your trainer once per iteration and report the new training results. Sometimes, it is desirable to have full control over training, but still run inside Tune. Tune supports `custom trainable functions `__ that can be used to implement `custom training workflows (example) `__. +In the `basic training example `__, Tune will call ``train()`` on your trainer once per iteration and report the new training results. Sometimes, it is desirable to have full control over training, but still run inside Tune. Tune supports `custom trainable functions `__ that can be used to implement `custom training workflows (example) `__. For even finer-grained control over training, you can use RLlib's lower-level `building blocks `__ directly to implement `fully customized training workflows `__. diff --git a/doc/source/tune-tutorial.rst b/doc/source/tune-tutorial.rst index 17fdcb3a6..a5a90cae9 100644 --- a/doc/source/tune-tutorial.rst +++ b/doc/source/tune-tutorial.rst @@ -1,5 +1,5 @@ -Tune Example Walkthrough -======================== +Tune Walkthrough +================ This tutorial will walk you through the following process to setup a Tune experiment. Specifically, we'll leverage ASHA and Bayesian Optimization (via HyperOpt) via the following steps: @@ -60,6 +60,8 @@ We can then plot the performance of this trial. :start-after: __plot_begin__ :end-before: __plot_end__ +.. important:: Tune will automatically run parallel trials across all available cores/GPUs on your machine or cluster. 
To limit the number of cores that Tune uses, you can call ``ray.init(num_cpus=, num_gpus=)`` before ``tune.run``. + Early Stopping with ASHA ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/tune-usage.rst b/doc/source/tune-usage.rst index 809972d99..2a3db7aa1 100644 --- a/doc/source/tune-usage.rst +++ b/doc/source/tune-usage.rst @@ -4,26 +4,30 @@ Tune User Guide Tune Overview ------------- +Tune takes a user-defined Python function or class and evaluates it on a set of hyperparameter configurations. + +Each hyperparameter configuration evaluation is called a *trial*, and multiple trials are run in parallel. Configurations are either generated by Tune or drawn from a user-specified **search algorithm**. The trials are scheduled and managed by a **trial scheduler**. + .. image:: images/tune-api.svg -Tune schedules a number of *trials* in a cluster. Each trial runs a user-defined Python function or class and is parameterized either by a *config* variation from Tune's Variant Generator or a user-specified **search algorithm**. The trials are scheduled and managed by a **trial scheduler**. +More information about Tune's `search algorithms can be found here `__. More information about Tune's `trial schedulers can be found here `__. You can check out our `examples page `__ for more code examples. -More information about Tune's `search algorithms can be found here `__. More information about Tune's `trial schedulers can be found here `__. +Tune Training API +----------------- -Experiment Configuration ------------------------- +The Tune training API [``tune.run(Trainable)``] has two concepts: -This section will cover the main steps needed to modify your code to run Tune: using the `Training API `__ and `executing your Tune experiment `__. +1. The `Trainable `__ API, and +2. `tune.run `__. -You can checkout out our `examples page `__ for more code examples. +Training can be done with either the Trainable **Class API** or **function-based API**. 
-Training API -~~~~~~~~~~~~ +Trainable API +~~~~~~~~~~~~~ -Training can be done with either the **Trainable Class API** or **function-based API**. - -**Python classes** passed into Tune will need to subclass ``ray.tune.Trainable``. The Trainable interface `can be found here `__. Here is an example: +The class-based API will require users to subclass ``ray.tune.Trainable``. The Trainable interface `can be found here `__. +Here is an example: .. code-block:: python @@ -36,7 +40,14 @@ Training can be done with either the **Trainable Class API** or **function-based result_dict = {"accuracy": 0.5, "f1": 0.1, ...} return result_dict -**Python functions** will need to have the following signature and call ``tune.track.log``, which will allow you to report metrics used for scheduling, search, or early stopping.: + +.. autoclass:: ray.tune.Trainable + :noindex: + +Tune function-based API +~~~~~~~~~~~~~~~~~~~~~~~ + +User-defined functions will need to have following signature and call ``tune.track.log``, which will allow you to report metrics used for scheduling, search, or early stopping: .. code-block:: python @@ -59,22 +70,21 @@ Both the Trainable and function-based API will have `autofilled metrics `_, which provides more details about GPU usage and trials that are distributed: + +.. code-block:: python + + # If you have 4 CPUs on your machine and 1 GPU, this will run 1 trial at a time. + tune.run(trainable, num_samples=10, resources_per_trial={"cpu": 2, "gpu": 1}) + + +To attach to a Ray cluster or use ``ray.init`` manual resource overrides, simply run ``ray.init`` before ``tune.run``: + +.. code-block:: python + + # Setup a local ray cluster and override resources. This will run 50 trials in parallel: + ray.init(num_cpus=100) + tune.run(trainable, num_samples=100, resources_per_trial={"cpu": 2}) + + # Connect to an existing distributed Ray cluster + ray.init(address=) + tune.run(trainable, num_samples=100, resources_per_trial={"cpu": 2, "gpu": 1}) + +.. 
tip:: To run everything sequentially, use `Ray Local Mode `_. + Analyzing Results ----------------- @@ -149,6 +196,22 @@ You can use ``tune.grid_search`` to specify an axis of a grid search. By default .. note:: If you specify an explicit Search Algorithm such as any SuggestionAlgorithm, you may not be able to specify lambdas or grid search with this interface, as the search algorithm may require a different search space declaration. + +Use ``tune.sample_from()`` to sample a value for a hyperparameter. The ``func`` should take in a ``spec`` object, which has a ``config`` namespace from which you can access other hyperparameters. This is useful for conditional distributions: + +.. code-block:: python + + tune.run( + ..., + config={ + "alpha": tune.sample_from(lambda spec: np.random.uniform(100)), + "beta": tune.sample_from(lambda spec: spec.config.alpha * np.random.normal()) + } + ) + +Tune provides a couple helper functions for common parameter distributions, wrapping numpy random utilities such as ``np.random.uniform``, ``np.random.choice``, and ``np.random.randn``. See the `Package Reference `_ for more details. + + The following shows grid search over two nested parameters combined with random sampling from two lambda functions, generating 9 different trials. Note that the value of ``beta`` depends on the value of ``alpha``, which is represented by referencing ``spec.config.alpha`` in the lambda function. This lets you specify conditional parameter distributions. .. code-block:: python @@ -167,11 +230,6 @@ The following shows grid search over two nested parameters combined with random } ) -.. note:: - Use ``tune.sample_from(...)`` to sample from a function during trial variant generation. - -For more information on variant generation, see `basic_variant.py `__. - Custom Trial Names ------------------ @@ -224,7 +282,7 @@ By default, each random variable and grid search point is sampled once. To take E.g. 
in the above, ``num_samples=10`` repeats the 3x3 grid search 10 times, for a total of 90 trials, each with randomly sampled values of ``alpha`` and ``beta``. -Using GPUs (Resource Allocation) +Resource Allocation (Using GPUs) -------------------------------- Tune will allocate the specified GPU and CPU ``resources_per_trial`` to each individual trial (defaulting to 1 CPU per trial). Under the hood, Tune runs each trial as a Ray actor, using Ray's resource handling to allocate resources and place actors. A trial will not be scheduled unless at least that amount of resources is available in the cluster, preventing the cluster from being overloaded. @@ -234,8 +292,10 @@ Fractional values are also supported, (i.e., ``"gpu": 0.2``). You can find an ex If GPU resources are not requested, the ``CUDA_VISIBLE_DEVICES`` environment variable will be set as empty, disallowing GPU access. Otherwise, it will be set to the GPUs in the list (this is managed by Ray). +Advanced Resource Allocation +---------------------------- -If your trainable function / class creates further Ray actors or tasks that also consume CPU / GPU resources, you will also want to set ``extra_cpu`` or ``extra_gpu`` to reserve extra resource slots for the actors you will create. For example, if a trainable class requires 1 GPU itself, but will launch 4 actors each using another GPU, then it should set ``"gpu": 1, "extra_gpu": 4``. +Trainables can themselves be distributed. If your trainable function / class creates further Ray actors or tasks that also consume CPU / GPU resources, you will also want to set ``extra_cpu`` or ``extra_gpu`` to reserve extra resource slots for the actors you will create. For example, if a trainable class requires 1 GPU itself, but will launch 4 actors each using another GPU, then it should set ``"gpu": 1, "extra_gpu": 4``. .. 
code-block:: python :emphasize-lines: 4-8 @@ -250,6 +310,12 @@ If your trainable function / class creates further Ray actors or tasks that also } ) +The ``Trainable`` also provides the ``default_resource_request`` interface to automatically declare the ``resources_per_trial`` based on the given configuration. + +.. automethod:: ray.tune.Trainable.default_resource_request + :noindex: + + Save and Restore ---------------- @@ -260,34 +326,32 @@ When running a hyperparameter search, Tune can automatically and periodically sa * fault-tolerance in experiments with pre-emptible machines. * enables certain Trial Schedulers such as HyperBand and PBT. -To enable checkpointing, you must implement a `Trainable class `__ (Trainable functions are not checkpointable, since they never return control back to their caller). The easiest way to do this is to subclass the pre-defined ``Trainable`` class and implement ``_save``, and ``_restore`` abstract methods, as seen in `this example `__. +To enable checkpointing, you must implement a `Trainable class `__ (Trainable functions are not checkpointable, since they never return control back to their caller). The easiest way to do this is to subclass the pre-defined ``Trainable`` class and implement ``_save``, and ``_restore`` abstract methods, as seen in `this example `__. -For TensorFlow model training, this would look something like this `tensorflow example `__: +For PyTorch model training, this would look something like this `PyTorch example `__: .. code-block:: python class MyTrainableClass(Trainable): - def _setup(self, config): - self.saver = tf.train.Saver() - self.sess = ... 
+ def _save(self, tmp_checkpoint_dir): + checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.pth") + torch.save(self.model.state_dict(), checkpoint_path) + return tmp_checkpoint_dir - def _train(self): - return {"mean_accuracy: self.sess.run(...)} + def _restore(self, tmp_checkpoint_dir): + checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.pth") + self.model.load_state_dict(torch.load(checkpoint_path)) - def _save(self, checkpoint_dir): - return self.saver.save(self.sess, os.path.join(checkpoint_dir, save)) +Checkpoints will be saved by training iteration to ``local_dir/exp_name/trial_name/checkpoint_<iter>``. You can restore a single trial checkpoint by using ``tune.run(restore=<checkpoint_dir>)``. - def _restore(self, checkpoint_prefix): - self.saver.restore(self.sess, checkpoint_prefix) +Tune also generates temporary checkpoints for pausing and switching between trials. For this purpose, it is important not to depend on absolute paths in the implementation of ``_save``. See the below reference: -Checkpoints will be saved by training iteration to ``local_dir/exp_name/trial_name/checkpoint_<iter>``. You can restore a single trial checkpoint by using ``tune.run(restore=<checkpoint_dir>)``. To test if your Trainable will checkpoint and restore correctly, you can use ``tune.util.validate_save_restore`` as follows: +.. automethod:: ray.tune.Trainable._save + :noindex: - .. code-block:: python - from ray.tune.util import validate_save_restore - - validate_save_restore(MyTrainableClass) - validate_save_restore(MyTrainableClass, use_object_store=True) +.. automethod:: ray.tune.Trainable._restore + :noindex: Trainable (Trial) Checkpointing @@ -562,8 +626,8 @@ And stopping a trial (``PUT /trials/:id``): $ curl -X PUT http://
:/trials/ -Debugging (Single Process) --------------------------- +Debugging +--------- By default, Tune will run hyperparameter evaluations on multiple processes. However, if you need to debug your training process, it may be easier to do everything on a single process. You can force all Ray functions to occur on a single process with ``local_mode`` by calling the following before ``tune.run``. @@ -614,34 +678,6 @@ Here are a few examples of command line calls. Dropped columns: ['status', 'last_update_time'] Please increase your terminal size to view remaining columns. -- ``tune list-experiments``: List tabular information about experiments within a project. Empty columns will be dropped by default. Add the ``--sort`` flag to sort the output by specific columns. Add the ``--filter`` flag to filter the output in the format ``" "``. Add the ``--output`` flag to write the trial information to a specific file (CSV or Pickle). Add the ``--columns`` flag to select specific columns to display. - -.. code-block:: bash - - $ tune list-experiments [PROJECT_DIR] --output note.csv - - +----------------------+----------------+------------------+---------------------+ - | name | total_trials | running_trials | terminated_trials | - |----------------------+----------------+------------------+---------------------| - | pbt_test | 10 | 0 | 0 | - | test | 1 | 0 | 0 | - | hyperband_test | 1 | 0 | 1 | - +----------------------+----------------+------------------+---------------------+ - Dropped columns: ['error_trials', 'last_updated'] - Please increase your terminal size to view remaining columns. 
- Output saved at: note.csv - - $ tune list-experiments [PROJECT_DIR] --filter "total_trials <= 1" --sort name - - +----------------------+----------------+------------------+---------------------+ - | name | total_trials | running_trials | terminated_trials | - |----------------------+----------------+------------------+---------------------| - | hyperband_test | 1 | 0 | 1 | - | test | 1 | 0 | 0 | - +----------------------+----------------+------------------+---------------------+ - Dropped columns: ['error_trials', 'last_updated'] - Please increase your terminal size to view remaining columns. - Further Questions or Issues? ---------------------------- diff --git a/doc/source/tune.rst b/doc/source/tune.rst index b83247eb3..81c9124fd 100644 --- a/doc/source/tune.rst +++ b/doc/source/tune.rst @@ -1,14 +1,16 @@ -Tune: Scalable Hyperparameter Tuning -==================================== +Tune: A Scalable Hyperparameter Tuning Library +============================================== + +.. important:: Take the 3 minute `2019 Ray Tune User Survey `_! .. image:: images/tune.png :scale: 30% :align: center -Tune is a library for hyperparameter tuning at any scale. +Tune is a Python library for hyperparameter tuning at any scale. Core features: * Launch a multi-node distributed hyperparameter sweep in less than 10 lines of code. - * Supports any deep learning framework, including PyTorch, TensorFlow, and Keras. + * Supports any machine learning framework, including PyTorch, XGBoost, MXNet, and Keras. * Visualize results with `TensorBoard `__. * Choose among scalable SOTA algorithms such as `Population Based Training (PBT)`_, `Vizier's Median Stopping Rule`_, `HyperBand/ASHA`_. * Tune integrates with many optimization libraries such as `Facebook Ax `_, `HyperOpt `_, and `Bayesian Optimization `_ and enables you to scale them transparently. @@ -17,6 +19,7 @@ Tune is a library for hyperparameter tuning at any scale. .. 
_`Vizier's Median Stopping Rule`: tune-schedulers.html#median-stopping-rule .. _`HyperBand/ASHA`: tune-schedulers.html#asynchronous-hyperband + Quick Start ----------- diff --git a/python/ray/tune/examples/pbt_ppo_example.py b/python/ray/tune/examples/pbt_ppo_example.py index b555a68ac..7957488a3 100755 --- a/python/ray/tune/examples/pbt_ppo_example.py +++ b/python/ray/tune/examples/pbt_ppo_example.py @@ -4,6 +4,9 @@ Note that this requires a cluster with at least 8 GPUs in order for all trials to run concurrently, otherwise PBT will round-robin train the trials which is less efficient (or you can set {"gpu": 0} to use CPUs for SGD instead). + +Note that Tune in general does not need 8 GPUs, and this is just a more +computationally demanding example. """ from __future__ import absolute_import @@ -51,9 +54,9 @@ if __name__ == "__main__": name="pbt_humanoid_test", scheduler=pbt, **{ - "env": "Humanoid-v1", "num_samples": 8, "config": { + "env": "Humanoid-v1", "kl_coeff": 1.0, "num_workers": 8, "num_gpus": 1, diff --git a/python/ray/tune/logger.py b/python/ray/tune/logger.py index 71b58f121..059769995 100644 --- a/python/ray/tune/logger.py +++ b/python/ray/tune/logger.py @@ -136,6 +136,7 @@ class JsonLogger(Logger): def tf2_compat_logger(config, logdir): + """Chooses TensorBoard logger depending on imported TF version.""" global tf if "RLLIB_TEST_NO_TF_IMPORT" in os.environ: logger.warning("Not importing TensorFlow for test purposes") @@ -153,6 +154,16 @@ def tf2_compat_logger(config, logdir): class TF2Logger(Logger): + """TensorBoard Logger for TF version >= 1.14. + + Automatically flattens nested dicts to show on TensorBoard: + + {"a": {"b": 1, "c": 2}} -> {"a/b": 1, "a/c": 2} + + If you need to do more advanced logging, it is recommended + to use a Summary Writer in the Trainable yourself. + """ + def _init(self): self._file_writer = None @@ -202,6 +213,16 @@ def to_tf_values(result, path): class TFLogger(Logger): + """TensorBoard Logger for TF version < 1.14. 
+ + Automatically flattens nested dicts to show on TensorBoard: + + {"a": {"b": 1, "c": 2}} -> {"a/b": 1, "a/c": 2} + + If you need to do more advanced logging, it is recommended + to use a Summary Writer in the Trainable yourself. + """ + def _init(self): logger.info("Initializing TFLogger instead of TF2Logger.") self._file_writer = tf.compat.v1.summary.FileWriter(self.logdir) @@ -232,9 +253,17 @@ class TFLogger(Logger): class CSVLogger(Logger): + """Logs results to progress.csv under the trial directory. + + Automatically flattens nested dicts in the result dict before writing + to csv: + + {"a": {"b": 1, "c": 2}} -> {"a/b": 1, "a/c": 2} + + """ + def _init(self): """CSV outputted with Headers as first set of results.""" - # Note that we assume params.json was already created by JsonLogger progress_file = os.path.join(self.logdir, EXPR_PROGRESS_FILE) self._continuing = os.path.exists(progress_file) self._file = open(progress_file, "a") diff --git a/python/ray/tune/sample.py b/python/ray/tune/sample.py index b919f264c..ae457f374 100644 --- a/python/ray/tune/sample.py +++ b/python/ray/tune/sample.py @@ -33,7 +33,12 @@ def function(func): def uniform(*args, **kwargs): - """A wrapper around np.random.uniform.""" + """Wraps tune.sample_from around ``np.random.uniform``. + + ``tune.uniform(1, 10)`` is equivalent to + ``tune.sample_from(lambda _: np.random.uniform(1, 10))`` + + """ return sample_from(lambda _: np.random.uniform(*args, **kwargs)) @@ -44,7 +49,7 @@ def loguniform(min_bound, max_bound, base=10): min_bound (float): Lower boundary of the output interval (1e-4) max_bound (float): Upper boundary of the output interval (1e-2) base (float): Base of the log. Defaults to 10. - """ + """ logmin = np.log(min_bound) / np.log(base) logmax = np.log(max_bound) / np.log(base) @@ -55,15 +60,30 @@ def loguniform(min_bound, max_bound, base=10): def choice(*args, **kwargs): - """A wrapper around np.random.choice.""" + """Wraps tune.sample_from around ``np.random.choice``. 
+ + ``tune.choice(10)`` is equivalent to + ``tune.sample_from(lambda _: np.random.choice(10))`` + + """ return sample_from(lambda _: np.random.choice(*args, **kwargs)) def randint(*args, **kwargs): - """A wrapper around np.random.randint.""" + """Wraps tune.sample_from around ``np.random.randint``. + + ``tune.randint(10)`` is equivalent to + ``tune.sample_from(lambda _: np.random.randint(10))`` + + """ return sample_from(lambda _: np.random.randint(*args, **kwargs)) def randn(*args, **kwargs): - """A wrapper around np.random.randn.""" + """Wraps tune.sample_from around ``np.random.randn``. + + ``tune.randn(10)`` is equivalent to + ``tune.sample_from(lambda _: np.random.randn(10))`` + + """ return sample_from(lambda _: np.random.randn(*args, **kwargs)) diff --git a/python/ray/tune/suggest/basic_variant.py b/python/ray/tune/suggest/basic_variant.py index 30da21c16..47e820b63 100644 --- a/python/ray/tune/suggest/basic_variant.py +++ b/python/ray/tune/suggest/basic_variant.py @@ -84,6 +84,7 @@ class BasicVariantGenerator(SearchAlgorithm): spec, output_path, self._parser, + evaluated_params=resolved_vars, experiment_tag=experiment_tag) def is_finished(self): diff --git a/python/ray/tune/suggest/suggestion.py b/python/ray/tune/suggest/suggestion.py index 64f702191..49b706ef9 100644 --- a/python/ray/tune/suggest/suggestion.py +++ b/python/ray/tune/suggest/suggestion.py @@ -98,6 +98,7 @@ class SuggestionAlgorithm(SearchAlgorithm): spec, output_path, self._parser, + evaluated_params=list(suggested_config), experiment_tag=tag, trial_id=trial_id) diff --git a/python/ray/tune/trainable.py b/python/ray/tune/trainable.py index 475616ca7..f2c7fb95a 100644 --- a/python/ray/tune/trainable.py +++ b/python/ray/tune/trainable.py @@ -40,14 +40,11 @@ class Trainable(object): Calling ``save()`` should save the training state of a trainable to disk, and ``restore(path)`` should restore a trainable to the given state. 
- Generally you only need to implement ``_train``, ``_save``, and - ``_restore`` here when subclassing Trainable. + Generally you only need to implement ``_setup``, ``_train``, + ``_save``, and ``_restore`` when subclassing Trainable. - Note that, if you don't require checkpoint/restore functionality, then - instead of implementing this class you can also get away with supplying - just a ``my_train(config, reporter)`` function to the config. - The function will be automatically converted to this interface - (sans checkpoint functionality). + Other implementation methods that may be helpful to override are + ``_log_result``, ``reset_config``, ``_stop``, and ``_export_model``. When using Tune, Tune will convert this class into a Ray actor, which runs on a separate process. Tune will also change the current working @@ -112,6 +109,14 @@ class Trainable(object): This can be overriden by sub-classes to set the correct trial resource allocation, so the user does not need to. + + Example: + >>> def default_resource_request(cls, config): + return Resources( + cpu=0, + gpu=0, + extra_cpu=config["workers"], + extra_gpu=int(config["use_gpu"]) * config["workers"]) """ return None @@ -451,7 +456,7 @@ class Trainable(object): The return value will be automatically passed to the loggers. Users can also return `tune.result.DONE` or `tune.result.SHOULD_CHECKPOINT` - to manually trigger termination of this trial or checkpointing of this + as a key to manually trigger termination or checkpointing of this trial. Note that manual checkpointing only works when subclassing Trainables. @@ -462,26 +467,38 @@ class Trainable(object): raise NotImplementedError - def _save(self, checkpoint_dir): - """Subclasses should override this to implement save(). + def _save(self, tmp_checkpoint_dir): + """Subclasses should override this to implement ``save()``. + + Warning: + Do not rely on absolute paths in the implementation of ``_save`` + and ``_restore``. 
+ + Use ``validate_save_restore`` to catch ``_save``/``_restore`` errors + before execution. + + >>> from ray.tune.util import validate_save_restore + >>> validate_save_restore(MyTrainableClass) + >>> validate_save_restore(MyTrainableClass, use_object_store=True) Args: - checkpoint_dir (str): The directory where the checkpoint - file must be stored. In a Tune run, this defaults to - `/checkpoint_` (which is the same as - `local_dir/exp_name/trial_name/checkpoint_`). + tmp_checkpoint_dir (str): The directory where the checkpoint + file must be stored. In a Tune run, if the trial is paused, + the provided path may be temporary and moved. Returns: - checkpoint (str | dict): If string, the return value is - expected to be the checkpoint path or prefix to be passed to - `_restore()`. If dict, the return value will be automatically - serialized by Tune and passed to `_restore()`. + A dict or string. If string, the return value is expected to be + prefixed by `tmp_checkpoint_dir`. If dict, the return value will + be automatically serialized by Tune and passed to `_restore()`. Examples: >>> print(trainable1._save("/tmp/checkpoint_1")) "/tmp/checkpoint_1/my_checkpoint_file" >>> print(trainable2._save("/tmp/checkpoint_2")) {"some": "data"} + + >>> trainable._save("/tmp/bad_example") + "/tmp/NEW_CHECKPOINT_PATH/my_checkpoint_file" # This will error. """ raise NotImplementedError @@ -489,9 +506,42 @@ class Trainable(object): def _restore(self, checkpoint): """Subclasses should override this to implement restore(). + Warning: + In this method, do not rely on absolute paths. The absolute + path of the checkpoint_dir used in ``_save`` may be changed. + + If ``_save`` returned a prefixed string, the prefix of the checkpoint + string returned by ``_save`` may be changed. This is because trial + pausing depends on temporary directories. + + The directory structure under the checkpoint_dir provided to ``_save`` + is preserved. + + See the example below. + + .. 
code-block:: python + + class Example(Trainable): + def _save(self, checkpoint_path): + print(checkpoint_path) + return os.path.join(checkpoint_path, "my/check/point") + + def _restore(self, checkpoint): + print(checkpoint) + + >>> trainer = Example() + >>> obj = trainer.save_to_object() # This is used when PAUSED. + /tmpc8k_c_6hsave_to_object/checkpoint_0/my/check/point + >>> trainer.restore_from_object(obj) # Note the different prefix. + /tmpb87b5axfrestore_from_object/checkpoint_0/my/check/point + + Args: - checkpoint (str | dict): Value as returned by `_save`. - If a string, then it is the checkpoint path. + checkpoint (str|dict): If dict, the return value is as + returned by `_save`. If a string, then it is a checkpoint path + that may have a different prefix than that returned by `_save`. + The directory structure underneath the `checkpoint_dir` + `_save` is preserved. """ raise NotImplementedError @@ -514,7 +564,14 @@ class Trainable(object): self._result_logger.on_result(result) def _stop(self): - """Subclasses should override this for any cleanup on stop.""" + """Subclasses should override this for any cleanup on stop. + + If any Ray actors are launched in the Trainable (i.e., with a RLlib + trainer), be sure to kill the Ray actor process here. + + You can kill a Ray actor by calling `actor.__ray_terminate__.remote()` + on the actor. + """ pass def _export_model(self, export_formats, export_dir): diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 5773c2dff..6793ad1c1 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -108,6 +108,7 @@ class Trial(object): config=None, trial_id=None, local_dir=DEFAULT_RESULTS_DIR, + evaluated_params=None, experiment_tag="", resources=None, stopping_criterion=None, @@ -133,6 +134,9 @@ class Trial(object): self.trial_id = Trial.generate_id() if trial_id is None else trial_id self.config = config or {} self.local_dir = local_dir # This remains unexpanded for syncing. 
+ + #: Parameters that Tune varies across searches. + self.evaluated_params = evaluated_params or [] self.experiment_tag = experiment_tag trainable_cls = self._get_trainable_cls() if trainable_cls and hasattr(trainable_cls,