diff --git a/doc/source/tune-package-ref.rst b/doc/source/tune-package-ref.rst index d6f13cd98..d2531254b 100644 --- a/doc/source/tune-package-ref.rst +++ b/doc/source/tune-package-ref.rst @@ -12,6 +12,11 @@ ray.tune :members: :private-members: + +.. autoclass:: ray.tune.function_runner.StatusReporter + :members: __call__ + + ray.tune.schedulers ------------------- diff --git a/doc/source/tune-usage.rst b/doc/source/tune-usage.rst index 7d80875f3..dc030bf6c 100644 --- a/doc/source/tune-usage.rst +++ b/doc/source/tune-usage.rst @@ -8,10 +8,7 @@ Tune Overview Tune schedules a number of *trials* in a cluster. Each trial runs a user-defined Python function or class and is parameterized either by a *config* variation from Tune's Variant Generator or a user-specified **search algorithm**. The trials are scheduled and managed by a **trial scheduler**. -More information about Tune's `search algorithms can be found here `__. - -More information about Tune's `trial schedulers can be found here `__. - +More information about Tune's `search algorithms can be found here `__. More information about Tune's `trial schedulers can be found here `__. Start by installing, importing, and initializing Ray. @@ -22,30 +19,49 @@ Start by installing, importing, and initializing Ray. ray.init() -Tune provides a ``run_experiments`` function that generates and runs the trials as described by the `experiment specification `__. - -.. autofunction:: ray.tune.run_experiments - :noindex: - -This function will report status on the command line until all Trials stop: - -:: - - == Status == - Using FIFO scheduling algorithm. - Resources used: 4/8 CPUs, 0/0 GPUs - Result logdir: ~/ray_results/my_experiment - - train_func_0_lr=0.2,momentum=1: RUNNING [pid=6778], 209 s, 20604 ts, 7.29 acc - - train_func_1_lr=0.4,momentum=1: RUNNING [pid=6780], 208 s, 20522 ts, 53.1 acc - - train_func_2_lr=0.6,momentum=1: TERMINATED [pid=6789], 21 s, 2190 ts, 100 acc - - train_func_3_lr=0.2,momentum=2: RUNNING [pid=6791], 208 s, 41004 ts, 8.37 acc - - train_func_4_lr=0.4,momentum=2: RUNNING [pid=6800], 209 s, 41204 ts, 70.1 acc - - train_func_5_lr=0.6,momentum=2: TERMINATED [pid=6809], 10 s, 2164 ts, 100 acc - Experiment Configuration ------------------------ +This section will cover the main steps needed to modify your code to run Tune: using the `Training API `__ and `executing your Tune experiment `__. + +You can checkout out our `examples page `__ for more code examples. + +Training API +~~~~~~~~~~~~ + +Training can be done with either the **function-based API** or **Trainable API**. + +**Python functions** will need to have the following signature: + +.. code-block:: python + + def trainable(config, reporter): + """ + Args: + config (dict): Parameters provided from the search algorithm + or variant generation. + reporter (Reporter): Handle to report intermediate metrics to Tune. + """ + + while True: + # ... + reporter(**kwargs) + +The reporter will allow you to report metrics used for scheduling, search, or early stopping. + +Tune will run this function on a separate thread in a Ray actor process. Note that this API is not checkpointable, since the thread will never return control back to its caller. The reporter documentation can be `found here `__. + +.. note:: + If you have a lambda function that you want to train, you will need to first register the function: ``tune.register_trainable("lambda_id", lambda x: ...)``. You can then use ``lambda_id`` in place of ``my_trainable``. + +**Python classes** passed into Tune will need to subclass ``ray.tune.Trainable``. The Trainable interface `can be found here `__. + +Both the Trainable and function-based API will have `autofilled metrics `__ in addition to the metrics reported. + +See the `experiment specification `__ section on how to specify and execute your training. + + Specifying Experiments ~~~~~~~~~~~~~~~~~~~~~~ @@ -79,54 +95,33 @@ dictionary. Tune will convert the dict into an ``ray.tune.Experiment`` object. "max_failures": 2 } } - run_experiments(experiment_spec) + +Tune provides a ``run_experiments`` function that generates and runs the trials. + +.. autofunction:: ray.tune.run_experiments + :noindex: + +This function will report status on the command line until all Trials stop: + +:: + + == Status == + Using FIFO scheduling algorithm. + Resources used: 4/8 CPUs, 0/0 GPUs + Result logdir: ~/ray_results/my_experiment + - train_func_0_lr=0.2,momentum=1: RUNNING [pid=6778], 209 s, 20604 ts, 7.29 acc + - train_func_1_lr=0.4,momentum=1: RUNNING [pid=6780], 208 s, 20522 ts, 53.1 acc + - train_func_2_lr=0.6,momentum=1: TERMINATED [pid=6789], 21 s, 2190 ts, 100 acc + - train_func_3_lr=0.2,momentum=2: RUNNING [pid=6791], 208 s, 41004 ts, 8.37 acc + - train_func_4_lr=0.4,momentum=2: RUNNING [pid=6800], 209 s, 41204 ts, 70.1 acc + - train_func_5_lr=0.6,momentum=2: TERMINATED [pid=6809], 10 s, 2164 ts, 100 acc An example of this can be found in `async_hyperband_example.py `__. -Model API -~~~~~~~~~ - -You can either pass in a Python function or Python class for model training as follows, each requiring a specific signature/interface: - -.. code-block:: python - :emphasize-lines: 3,8 - - experiment_spec = { - "my_experiment_name": { - "run": my_trainable - } - } - - # or with the Experiment API - experiment_spec = Experiment("my_experiment_name", my_trainable) - - run_experiments(experiments=experiment_spec) - - -**Python functions** will need to have the following signature: - -.. code-block:: python - - def trainable(config, reporter): - """ - Args: - config (dict): Parameters provided from the search algorithm - or variant generation. - reporter (Reporter): Handle to report intermediate metrics to Tune. - """ - -Tune will run this function on a separate thread in a Ray actor process. Note that trainable functions are not checkpointable, since they never return control back to their caller. See `Trial Checkpointing for more details `__. - -.. note:: - If you have a lambda function that you want to train, you will need to first register the function: ``tune.register_trainable("lambda_id", lambda x: ...)``. You can then use ``lambda_id`` in place of ``my_trainable``. - -**Python classes** passed into Tune will need to subclass ``ray.tune.Trainable``. - -.. autoclass:: ray.tune.Trainable - :members: __init__, _save, _restore, _train, _setup, _stop - :noindex: +Training Features +----------------- Tune Search Space (Default) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -219,7 +214,7 @@ If your trainable function / class creates further Ray actors or tasks that also Trial Checkpointing ~~~~~~~~~~~~~~~~~~~ -To enable checkpointing, you must implement a `Trainable class `__ (Trainable functions are not checkpointable, since they never return control back to their caller). The easiest way to do this is to subclass the pre-defined ``Trainable`` class and implement its ``_train``, ``_save``, and ``_restore`` abstract methods `(example) `__. Implementing this interface is required to support resource multiplexing in Trial Schedulers such as HyperBand and PBT. +To enable checkpointing, you must implement a `Trainable class `__ (Trainable functions are not checkpointable, since they never return control back to their caller). The easiest way to do this is to subclass the pre-defined ``Trainable`` class and implement its ``_train``, ``_save``, and ``_restore`` abstract methods `(example) `__. Implementing this interface is required to support resource multiplexing in Trial Schedulers such as HyperBand and PBT. For TensorFlow model training, this would look something like this `(full tensorflow example) `__: @@ -300,6 +295,28 @@ You often will want to compute a large object (e.g., training data, model weight } }) +Auto-Filled Results +------------------- + +During training, Tune will automatically fill certain fields if not already provided. All of these can be used as stopping conditions or in the Scheduler/Search Algorithm specification. + +.. literalinclude:: ../../python/ray/tune/result.py + :language: python + :start-after: __sphinx_doc_begin__ + :end-before: __sphinx_doc_end__ + +The following fields will automatically show up on the console output, if provided: + +1. ``episode_reward_mean`` +2. ``mean_loss`` +3. ``mean_accuracy`` +4. ``timesteps_this_iter`` (aggregated into ``timesteps_total``). + +.. code-block:: bash + + Example_0: TERMINATED [pid=68248], 179 s, 2 iter, 60000 ts, 94 rew + + Logging and Visualizing Results ------------------------------- @@ -363,12 +380,6 @@ Then, on the client side, you can use the following class. The server address de For an example notebook for using the Client API, see the `Client API Example `__. -Examples --------- - -You can find a comprehensive of examples `using Tune and its various features here `__, including examples using Keras, TensorFlow, and Population-Based Training. - - Further Questions or Issues? ---------------------------- diff --git a/python/ray/tune/function_runner.py b/python/ray/tune/function_runner.py index 1b93d3b6c..47593f221 100644 --- a/python/ray/tune/function_runner.py +++ b/python/ray/tune/function_runner.py @@ -14,11 +14,12 @@ logger = logging.getLogger(__name__) class StatusReporter(object): - """Object passed into your main() that you can report status through. + """Object passed into your function that you can report status through. Example: - >>> reporter = StatusReporter() - >>> reporter(timesteps_total=1) + >>> def trainable_function(config, reporter): + >>> assert isinstance(reporter, StatusReporter) + >>> reporter(timesteps_total=1) """ def __init__(self): @@ -33,6 +34,9 @@ class StatusReporter(object): Args: kwargs: Latest training result status. + + Example: + >>> reporter(mean_accuracy=1, training_iteration=4) """ with self._lock: diff --git a/python/ray/tune/result.py b/python/ray/tune/result.py index a30124536..0d5aeb0d0 100644 --- a/python/ray/tune/result.py +++ b/python/ray/tune/result.py @@ -4,6 +4,8 @@ from __future__ import print_function import os +# yapf: disable +# __sphinx_doc_begin__ # (Optional/Auto-filled) training is terminated. Filled only if not provided. DONE = "done" @@ -37,6 +39,8 @@ TIME_TOTAL_S = "time_total_s" # (Auto-filled) The index of this training iteration. TRAINING_ITERATION = "training_iteration" +# __sphinx_doc_end__ +# yapf: enable # Where Tune writes result files by default DEFAULT_RESULTS_DIR = (os.environ.get("TUNE_RESULT_DIR")