diff --git a/ci/jenkins_tests/run_tune_tests.sh b/ci/jenkins_tests/run_tune_tests.sh index 9368930b6..86b3e6a50 100755 --- a/ci/jenkins_tests/run_tune_tests.sh +++ b/ci/jenkins_tests/run_tune_tests.sh @@ -40,6 +40,12 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ pytest /ray/python/ray/tune/tests/test_tune_restore.py +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/tests/example.py + +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/tests/tutorial.py + $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/tune_mnist_ray.py \ --smoke-test @@ -94,7 +100,7 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/mnist_pytorch_trainable.py \ - --smoke-test --no-cuda + --smoke-test $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/genetic_example.py \ diff --git a/doc/source/images/tune-df-plot.png b/doc/source/images/tune-df-plot.png new file mode 100644 index 000000000..1014fee55 Binary files /dev/null and b/doc/source/images/tune-df-plot.png differ diff --git a/doc/source/images/tune-start-tb.png b/doc/source/images/tune-start-tb.png new file mode 100644 index 000000000..15be99b18 Binary files /dev/null and b/doc/source/images/tune-start-tb.png differ diff --git a/doc/source/images/tune-upload.png b/doc/source/images/tune-upload.png new file mode 100644 index 000000000..6cb94184a Binary files /dev/null and b/doc/source/images/tune-upload.png differ diff --git a/doc/source/index.rst b/doc/source/index.rst index 574220ccb..7166e5862 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -80,7 +80,9 @@ Ray comes with libraries that accelerate deep learning and reinforcement learnin :caption: Tune tune.rst + tune-tutorial.rst tune-usage.rst + tune-distributed.rst tune-schedulers.rst tune-searchalg.rst tune-package-ref.rst diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index e1f96f949..7a37f833b 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -148,7 +148,7 @@ We can create a `Trainer <#trainers>`__ and try running this policy on a toy env MyTrainer = build_trainer( name="MyCustomTrainer", default_policy=MyTFPolicy) - + ray.init() tune.run(MyTrainer, config={"env": "CartPole-v0", "num_workers": 2}) @@ -250,7 +250,7 @@ Suppose we want to customize PPO to use an asynchronous-gradient optimization st The ``with_updates`` method that we use here is also available for Torch and TF policies built from templates. - + Now let's take a look at the ``update_kl`` function. This is used to adaptively adjust the KL penalty coefficient on the PPO loss, which bounds the policy change per training step. You'll notice the code handles both single and multi-agent cases (where there are be multiple policies each with different KL coeffs): .. code-block:: python @@ -604,7 +604,7 @@ This is how the example in the previous section looks when written using a polic policy=CustomPolicy, env_creator=lambda c: gym.make("CartPole-v0"), num_workers=10) - + # this optimizer implements the IMPALA architecture optimizer = AsyncSamplesOptimizer(workers, train_batch_size=500) @@ -615,7 +615,7 @@ This is how the example in the previous section looks when written using a polic Trainers -------- -Trainers are the boilerplate classes that put the above components together, making algorithms accessible via Python API and the command line. They manage algorithm configuration, setup of the rollout workers and optimizer, and collection of training metrics. Trainers also implement the `Trainable API `__ for easy experiment management. +Trainers are the boilerplate classes that put the above components together, making algorithms accessible via Python API and the command line. They manage algorithm configuration, setup of the rollout workers and optimizer, and collection of training metrics. Trainers also implement the `Trainable API `__ for easy experiment management. Example of three equivalent ways of interacting with the PPO trainer, all of which log results in ``~/ray_results``: @@ -630,6 +630,6 @@ Example of three equivalent ways of interacting with the PPO trainer, all of whi rllib train --run=PPO --env=CartPole-v0 --config='{"train_batch_size": 4000}' .. code-block:: python - + from ray import tune tune.run(PPOTrainer, config={"env": "CartPole-v0", "train_batch_size": 4000}) diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst index 1bb77aefe..b4c45d58b 100644 --- a/doc/source/rllib-training.rst +++ b/doc/source/rllib-training.rst @@ -258,7 +258,7 @@ You can provide callback functions to be called at points during policy evaluati info["trainer"].__name__, info["result"]["episodes_this_iter"])) ray.init() - trials = tune.run( + analysis = tune.run( "PG", config={ "env": "CartPole-v0", diff --git a/doc/source/tune-distributed.rst b/doc/source/tune-distributed.rst new file mode 100644 index 000000000..e521e1b07 --- /dev/null +++ b/doc/source/tune-distributed.rst @@ -0,0 +1,239 @@ +Tune Distributed Experiments +============================ + +Tune is commonly used for large-scale distributed hyperparameter optimization. Tune and Ray provide many utilities that enable an effective workflow for interacting with a cluster, including fast file mounting, one-line cluster launching, and result uploading to cloud storage. + +This page will overview the tooling for distributed experiments, covering how to connect to a cluster, how to launch a distributed experiment, and commonly used commands. + +Connecting to a cluster +----------------------- + +One common approach to modifying an existing Tune experiment to go distributed is to set an `argparse` variable so that toggling between distributed and single-node is seamless. This allows Tune to utilize all the resources available to the Ray cluster. + +.. code-block:: python + + import ray + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--ray-redis-address") + args = parser.parse_args() + ray.init(redis_address=args.ray_redis_address) + +Note that connecting to cluster requires a pre-existing Ray cluster to be started already (`Manual Cluster Setup `_). The script should be run on the head node of the Ray cluster. Below, ``tune_script.py`` can be any script that runs a Tune hyperparameter search. + +.. code-block:: bash + + # Single-node execution + $ python tune_script.py + + # On the head node, connect to an existing ray cluster + $ python tune_script.py --ray-redis-address=localhost:XXXX + +.. literalinclude:: ../../python/ray/tune/examples/mnist_pytorch.py + :language: python + :start-after: if __name__ == "__main__": + + +Launching a cloud cluster +------------------------- + +.. tip:: + + If you have already have a list of nodes, skip down to the `Local Cluster Setup`_ section. + +Ray currently supports AWS and GCP. Below, we will launch nodes on AWS that will default to using the Deep Learning AMI. See the `cluster setup documentation `_. + +.. literalinclude:: ../../python/ray/tune/examples/tune-default.yaml + :language: yaml + :name: tune-default.yaml + +This code starts a cluster as specified by the given cluster configuration YAML file, uploads ``tune_script.py`` to the cluster, and runs ``python tune_script.py``. + +.. code-block:: bash + + ray submit tune-default.yaml tune_script.py --start + +.. image:: images/tune-upload.png + :scale: 50% + :align: center + +Analyze your results on TensorBoard by starting TensorBoard on the remote head machine. + +.. code-block:: bash + + # Go to http://localhost:6006 to access TensorBoard. + ray exec tune-default.yaml 'tensorboard --logdir=~/ray_results/ --port 6006' --port-forward 6006 + + +Note that you can customize the directory of results by running: ``tune.run(local_dir=..)``. You can then point TensorBoard to that directory to visualize results. You can also use `awless `_ for easy cluster management on AWS. + +Local Cluster Setup +------------------- + +If you run into issues (or want to add nodes manually), you can use the manual cluster setup `documentation here `__. At a glance, On the head node, run the following. + +.. code-block:: bash + + # If the ``--redis-port`` argument is omitted, Ray will choose a port at random. + $ ray start --head --redis-port=6379 + +The command will print out the address of the Redis server that was started (and some other address information). + +**Then on all of the other nodes**, run the following. Make sure to replace ```` with the value printed by the command on the head node (it should look something like ``123.45.67.89:6379``). + +.. code-block:: bash + + $ ray start --redis-address= + +If you have already have a list of nodes, you can follow the private autoscaling cluster setup `instructions here `_. + + +Pre-emptible Instances (Cloud) +------------------------------ + +Running on spot instances (or pre-emptible instances) can reduce the cost of your experiment. You can enable spot instances in AWS via the following configuration modification: + +.. code-block:: yaml + + # Provider-specific config for worker nodes, e.g. instance type. + worker_nodes: + InstanceType: m5.large + ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0 + + # Run workers on spot by default. Comment this out to use on-demand. + InstanceMarketOptions: + MarketType: spot + SpotOptions: + MaxPrice: 1.0 # Max Hourly Price + +In GCP, you can use the following configuration modification: + +.. code-block:: yaml + + worker_nodes: + machineType: n1-standard-2 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + # See https://cloud.google.com/compute/docs/images for more images + sourceImage: projects/deeplearning-platform-release/global/images/family/tf-1-13-cpu + + # Run workers on preemtible instances. + scheduling: + - preemptible: true + +Spot instances may be removed suddenly while trials are still running. Often times this may be difficult to deal with when using other distributed hyperparameter optimization frameworks. Tune allows users to mitigate the effects of this by preserving the progress of your model training through checkpointing. + +The easiest way to do this is to subclass the pre-defined ``Trainable`` class and implement ``_save``, and ``_restore`` abstract methods, as seen in the example below: + +.. literalinclude:: ../../python/ray/tune/examples/mnist_pytorch_trainable.py + :language: python + :start-after: __trainable_example_begin__ + :end-before: __trainable_example_end__ + +This can then be used similarly to the Function API as before: + +.. literalinclude:: ../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __trainable_run_begin__ + :end-before: __trainable_run_end__ + + +Example for using spot instances (AWS) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Here is an example for running Tune on spot instances. This assumes your AWS credentials have already been setup (``aws configure``): + +1. Download a full example Tune experiment script here. This includes a Trainable with checkpointing: :download:`mnist_pytorch_trainable.py <../../python/ray/tune/examples/mnist_pytorch_trainable.py>`. To run this example, you will need to install the following: + +.. code-block:: bash + + $ pip install ray torch torchvision filelock + +2. Download an example cluster yaml here: :download:`tune-default.yaml <../../python/ray/tune/examples/tune-default.yaml>` +3. Run ``ray submit`` as below to run Tune across them. Append ``[--start]`` if the cluster is not up yet. Append ``[--stop]`` to automatically shutdown your nodes after running. + +.. code-block:: bash + + ray submit tune-default.yaml mnist_pytorch_trainable.py \ + --args="--ray-redis-address=localhost:6379" \ + --start + +4. Optionally for testing on AWS or GCP, you can use the following to kill a random worker node after all the worker nodes are up + +.. code-block:: bash + + $ ray kill-random-node tune-default.yaml --hard + +To summarize, here are the commands to run: + +.. code-block:: bash + + wget https://raw.githubusercontent.com/ray-project/ray/master/python/ray/tune/examples/mnist_pytorch_trainable.py + wget https://raw.githubusercontent.com/ray-project/ray/master/python/ray/tune/tune-default.yaml + ray submit tune-default.yaml mnist_pytorch_trainable.py --args="--ray-redis-address=localhost:6379" --start + + # wait a while until after all nodes have started + ray kill-random-node tune-default.yaml --hard + +You should see Tune eventually continue the trials on a different worker node. See the `Saving and Recovery `__ section for more details. + +You can also specify ``tune.run(upload_dir=...)`` to sync results with a cloud storage like S3, persisting results in case you want to start and stop your cluster automatically. + +Common Commands +--------------- + +Below are some commonly used commands for submitting experiments. Please see the `Autoscaler page `__ to see find more comprehensive documentation of commands. + +.. code-block:: bash + + # Upload `tune_experiment.py` from your local machine onto the cluster. Then, + # run `python tune_experiment.py --redis-address=localhost:6379` on the remote machine. + $ ray submit CLUSTER.YAML tune_experiment.py --args="--redis-address=localhost:6379" + + # Start a cluster and run an experiment in a detached tmux session, + # and shut down the cluster as soon as the experiment completes. + # In `tune_experiment.py`, set `tune.run(upload_dir="s3://...")` to persist results + $ ray submit CLUSTER.YAML --tmux --start --stop tune_experiment.py --args="--redis-address=localhost:6379" + + # To start or update your cluster: + $ ray up CLUSTER.YAML [-y] + + # Shut-down all instances of your cluster: + $ ray down CLUSTER.YAML [-y] + + # Run Tensorboard and forward the port to your own machine. + $ ray exec CLUSTER.YAML 'tensorboard --logdir ~/ray_results/ --port 6006' --port-forward 6006 + + # Run Jupyter Lab and forward the port to your own machine. + $ ray exec CLUSTER.YAML 'jupyter lab --port 6006' --port-forward 6006 + + # Get a summary of all the experiments and trials that have executed so far. + $ ray exec CLUSTER.YAML 'tune ls ~/ray_results' + + # Upload and sync file_mounts up to the cluster with this command. + $ ray rsync-up CLUSTER.YAML + + # Download the results directory from your cluster head node to your local machine on ``~/cluster_results``. + $ ray rsync-down CLUSTER.YAML '~/ray_results' ~/cluster_results + + # Launching multiple clusters using the same configuration. + $ ray up CLUSTER.YAML -n="cluster1" + $ ray up CLUSTER.YAML -n="cluster2" + $ ray up CLUSTER.YAML -n="cluster3" + +Troubleshooting +--------------- + +Sometimes, your program may freeze. Run this to restart the Ray cluster without running any of the installation commands. + +.. code-block:: bash + + $ ray up CLUSTER.YAML --restart-only + + +.. Local Cluster Setup: tune-distributed.html#local-cluster-setup diff --git a/doc/source/tune-package-ref.rst b/doc/source/tune-package-ref.rst index 7788ceed9..c2c24f396 100644 --- a/doc/source/tune-package-ref.rst +++ b/doc/source/tune-package-ref.rst @@ -6,16 +6,15 @@ ray.tune .. automodule:: ray.tune :members: + :show-inheritance: :exclude-members: TuneError, Trainable .. autoclass:: ray.tune.Trainable :members: :private-members: - .. autoclass:: ray.tune.function_runner.StatusReporter - :members: __call__ - + :members: __call__, logdir ray.tune.schedulers ------------------- @@ -37,10 +36,10 @@ ray.tune.suggest :private-members: :show-inheritance: -ray.tune.analysis ------------------ +ray.tune.track +-------------- -.. autoclass:: ray.tune.analysis.ExperimentAnalysis +.. automodule:: ray.tune.track :members: diff --git a/doc/source/tune-schedulers.rst b/doc/source/tune-schedulers.rst index 2f6957f3f..cb3f1aced 100644 --- a/doc/source/tune-schedulers.rst +++ b/doc/source/tune-schedulers.rst @@ -35,7 +35,7 @@ Tune includes a distributed implementation of `Population Based Training (PBT) < }) tune.run( ... , scheduler=pbt_scheduler) -When the PBT scheduler is enabled, each trial variant is treated as a member of the population. Periodically, top-performing trials are checkpointed (this requires your Trainable to support `checkpointing `__). Low-performing trials clone the checkpoints of top performers and perturb the configurations in the hope of discovering an even better variation. +When the PBT scheduler is enabled, each trial variant is treated as a member of the population. Periodically, top-performing trials are checkpointed (this requires your Trainable to support `save and restore `__). Low-performing trials clone the checkpoints of top performers and perturb the configurations in the hope of discovering an even better variation. You can run this `toy PBT example `__ to get an idea of how how PBT operates. When training in PBT mode, a single trial may see many different hyperparameters over its lifetime, which is recorded in its ``result.json`` file. The following figure generated by the example shows PBT with optimizing a LR schedule over the course of a single experiment: @@ -69,7 +69,7 @@ Compared to the original version of HyperBand, this implementation provides bett HyperBand --------- -.. note:: Note that the HyperBand scheduler requires your trainable to support checkpointing, which is described in `Tune User Guide `__. Checkpointing enables the scheduler to multiplex many concurrent trials onto a limited size cluster. +.. note:: Note that the HyperBand scheduler requires your trainable to support saving and restoring, which is described in `Tune User Guide `__. Checkpointing enables the scheduler to multiplex many concurrent trials onto a limited size cluster. Tune also implements the `standard version of HyperBand `__. You can use it as such: diff --git a/doc/source/tune-tutorial.rst b/doc/source/tune-tutorial.rst new file mode 100644 index 000000000..17fdcb3a6 --- /dev/null +++ b/doc/source/tune-tutorial.rst @@ -0,0 +1,121 @@ +Tune Example Walkthrough +======================== + +This tutorial will walk you through the following process to setup a Tune experiment. Specifically, we'll leverage ASHA and Bayesian Optimization (via HyperOpt) via the following steps: + + 1. Integrating Tune into your workflow + 2. Specifying a TrialScheduler + 3. Adding a SearchAlgorithm + 4. Getting the best model and analyzing results + +.. note:: + + To run this example, you will need to install the following: + + .. code-block:: bash + + $ pip install ray torch torchvision filelock + +We first run some imports: + +.. literalinclude:: ../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __tutorial_imports_begin__ + :end-before: __tutorial_imports_end__ + + +Below, we have some boiler plate code for a PyTorch training function. + +.. literalinclude:: ../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __train_func_begin__ + :end-before: __train_func_end__ + +Notice that there's a couple helper functions in the above training script. You can take a look at these functions in the imported module `examples/mnist_pytorch `__; there's no black magic happening. For example, ``train`` is simply a for loop over the data loader. + +.. code:: python + + def train(model, optimizer, train_loader): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + if batch_idx * len(data) > EPOCH_SIZE: + return + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + +Let's run 1 trial, randomly sampling from a uniform distribution for learning rate and momentum. + +.. literalinclude:: ../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __eval_func_begin__ + :end-before: __eval_func_end__ + +We can then plot the performance of this trial. + +.. literalinclude:: ../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __plot_begin__ + :end-before: __plot_end__ + + +Early Stopping with ASHA +~~~~~~~~~~~~~~~~~~~~~~~~ + +Let's integrate an early stopping algorithm to our search - ASHA, a scalable algorithm for principled early stopping. + +How does it work? On a high level, it terminates trials that are less promising and +allocates more time and resources to more promising trials. See `this blog post `__ for more details. + +We can afford to **increase the search space by 5x**, by adjusting the parameter ``num_samples``. See the `Trial Scheduler section `__ for more details of available schedulers and library integrations. + +.. literalinclude:: ../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __run_scheduler_begin__ + :end-before: __run_scheduler_end__ + +You can run the below in a Jupyter notebook to visualize trial progress. + +.. literalinclude:: ../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __plot_scheduler_begin__ + :end-before: __plot_scheduler_end__ + +.. image:: images/tune-df-plot.png + :scale: 50% + :align: center + +You can also use Tensorboard for visualizing results. + +.. code:: bash + + $ tensorboard --logdir {logdir} + + +Search Algorithms in Tune +~~~~~~~~~~~~~~~~~~~~~~~~~ + +With Tune you can combine powerful hyperparameter search libraries such as `HyperOpt `_ and `Ax `_ with state-of-the-art algorithms such as HyperBand without modifying any model training code. Tune allows you to use different search algorithms in combination with different trial schedulers. See the `Search Algorithm section `__ for more details of available algorithms and library integrations. + +.. literalinclude:: ../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __run_searchalg_begin__ + :end-before: __run_searchalg_end__ + + +Evaluate your model +~~~~~~~~~~~~~~~~~~~ + +You can evaluate best trained model using the Analysis object to retrieve the best model: + +.. literalinclude:: ../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __run_analysis_begin__ + :end-before: __run_analysis_end__ + + +Next Steps +---------- +Take a look at the `Usage Guide `__ for more comprehensive overview of Tune features. diff --git a/doc/source/tune-usage.rst b/doc/source/tune-usage.rst index 35cb8eef4..ac0ce698d 100644 --- a/doc/source/tune-usage.rst +++ b/doc/source/tune-usage.rst @@ -10,16 +10,6 @@ Tune schedules a number of *trials* in a cluster. Each trial runs a user-defined More information about Tune's `search algorithms can be found here `__. More information about Tune's `trial schedulers can be found here `__. -Start by installing, importing, and initializing Ray. - -.. code-block:: python - - import ray - import ray.tune as tune - - ray.init() - - Experiment Configuration ------------------------ @@ -30,36 +20,47 @@ You can checkout out our `examples page `__ for more code ex Training API ~~~~~~~~~~~~ -Training can be done with either the **function-based API** or **Trainable API**. +Training can be done with either the **Trainable Class API** or **function-based API**. + +**Python classes** passed into Tune will need to subclass ``ray.tune.Trainable``. The Trainable interface `can be found here `__. Here is an example: -**Python functions** will need to have the following signature: .. code-block:: python - def trainable(config, reporter): + class Example(Trainable): + def _setup(self, config): + ... + + def _train(self): + # run training code + result_dict = {"accuracy": 0.5, "f1": 0.1, ...} + return result_dict + +**Python functions** will need to have the following signature and call ``tune.track.log``, which will allow you to report metrics used for scheduling, search, or early stopping.: + +.. code-block:: python + + def trainable(config): """ Args: config (dict): Parameters provided from the search algorithm or variant generation. - reporter (Reporter): Handle to report intermediate metrics to Tune. """ while True: # ... - reporter(**kwargs) + tune.track.log(**kwargs) -The reporter will allow you to report metrics used for scheduling, search, or early stopping. -Tune will run this function on a separate thread in a Ray actor process. Note that this API is not checkpointable, since the thread will never return control back to its caller. The reporter documentation can be `found here `__. +Tune will run this function on a separate thread in a Ray actor process. Note that this API is not checkpointable, since the thread will never return control back to its caller. ``tune.track`` documentation can be `found here `__. + +Both the Trainable and function-based API will have `autofilled metrics `__ in addition to the metrics reported. .. note:: If you have a lambda function that you want to train, you will need to first register the function: ``tune.register_trainable("lambda_id", lambda x: ...)``. You can then use ``lambda_id`` in place of ``my_trainable``. -**Python classes** passed into Tune will need to subclass ``ray.tune.Trainable``. The Trainable interface `can be found here `__. - -Both the Trainable and function-based API will have `autofilled metrics `__ in addition to the metrics reported. - -See the `experiment specification `__ section on how to specify and execute your training. +.. note:: + See previous versions of the documentation for the ``reporter`` API. Launching an Experiment @@ -67,8 +68,13 @@ Launching an Experiment Tune provides a ``run`` function that generates and runs the trials. -.. autofunction:: ray.tune.run - :noindex: +.. code-block:: python + + tune.run( + trainable, + name="example-experiment", + num_samples=10, + ) This function will report status on the command line until all Trials stop: @@ -86,34 +92,53 @@ This function will report status on the command line until all Trials stop: - train_func_5_lr=0.6,momentum=2: TERMINATED [pid=6809], 10 s, 2164 ts, 100 acc -Custom Trial Names -~~~~~~~~~~~~~~~~~~ +All results reported by the trainable will be logged locally to a unique directory per experiment, e.g. ``~/ray_results/example-experiment`` in the above example. On a cluster, incremental results will be synced to local disk on the head node. -To specify custom trial names, you can pass use the ``trial_name_creator`` argument -to `tune.run`. This takes a function with the following signature, and -be sure to wrap it with `tune.function`: + +Analyzing Results +----------------- + +Tune provides an ``ExperimentAnalysis`` object for analyzing results from ``tune.run``. .. code-block:: python - def trial_name_string(trial): - """ - Args: - trial (Trial): A generated trial object. - - Returns: - trial_name (str): String representation of Trial. - """ - return str(trial) - - tune.run( - MyTrainableClass, - name="hyperband_test", - num_samples=1, - trial_name_creator=tune.function(trial_name_string) + analysis = tune.run( + trainable, + name="example-experiment", + num_samples=10, ) -An example can be found in `logging_example.py `__. +You can use the ``ExperimentAnalysis`` object to obtain the best configuration of the experiment: +.. code-block:: python + + >>> print("Best config is", analysis.get_best_config(metric="mean_accuracy")) + Best config is: {'lr': 0.011537575723482687, 'momentum': 0.8921971713692662} + +Here are some example operations for obtaining a summary of your experiment: + +.. code-block:: python + + # Get a dataframe for the last reported results of all of the trials + df = analysis.dataframe() + + # Get a dataframe for the max accuracy seen for each trial + df = analysis.dataframe(metric="mean_accuracy", mode="max") + + # Get a dict mapping {trial logdir -> dataframes} for all trials in the experiment. + all_dataframes = analysis.trial_dataframes + + # Get a list of trials + trials = analysis.trials + +You may want to get a summary of multiple experiments that point to the same ``local_dir``. For this, you can use the ``Analysis`` class. + +.. code-block:: python + + from ray.tune import Analysis + analysis = Analysis("~/ray_results/example-experiment") + +See the `full documentation `_ for the ``Analysis`` object. Training Features ----------------- @@ -151,6 +176,34 @@ The following shows grid search over two nested parameters combined with random For more information on variant generation, see `basic_variant.py `__. +Custom Trial Names +~~~~~~~~~~~~~~~~~~ + +To specify custom trial names, you can pass use the ``trial_name_creator`` argument +to `tune.run`. This takes a function with the following signature, and +be sure to wrap it with `tune.function`: + +.. code-block:: python + + def trial_name_string(trial): + """ + Args: + trial (Trial): A generated trial object. + + Returns: + trial_name (str): String representation of Trial. + """ + return str(trial) + + tune.run( + MyTrainableClass, + name="example-experiment", + num_samples=1, + trial_name_creator=tune.function(trial_name_string) + ) + +An example can be found in `logging_example.py `__. + Sampling Multiple Times ~~~~~~~~~~~~~~~~~~~~~~~ @@ -202,23 +255,29 @@ If your trainable function / class creates further Ray actors or tasks that also } ) +Saving and Recovery +------------------- -Trial Checkpointing -~~~~~~~~~~~~~~~~~~~ +When running a hyperparameter search, Tune can automatically and periodically save/checkpoint your model. Checkpointing is used for -To enable checkpointing, you must implement a `Trainable class `__ (Trainable functions are not checkpointable, since they never return control back to their caller). The easiest way to do this is to subclass the pre-defined ``Trainable`` class and implement its ``_train``, ``_save``, and ``_restore`` abstract methods `(example) `__. Implementing this interface is required to support resource multiplexing in Trial Schedulers such as HyperBand and PBT. + * saving a model at the end of training + * modifying a model in the middle of training + * fault-tolerance in experiments with pre-emptible machines. + * enables certain Trial Schedulers such as HyperBand and PBT. -For TensorFlow model training, this would look something like this `(full tensorflow example) `__: +To enable checkpointing, you must implement a `Trainable class `__ (Trainable functions are not checkpointable, since they never return control back to their caller). The easiest way to do this is to subclass the pre-defined ``Trainable`` class and implement ``_save``, and ``_restore`` abstract methods, as seen in `this example `__. + +For TensorFlow model training, this would look something like this `tensorflow example `__: .. code-block:: python - class MyClass(Trainable): + class MyTrainableClass(Trainable): def _setup(self, config): self.saver = tf.train.Saver() self.sess = ... def _train(self): - self.sess.run(...) + return {"mean_accuracy: self.sess.run(...)} def _save(self, checkpoint_dir): return self.saver.save(self.sess, os.path.join(checkpoint_dir, save)) @@ -226,11 +285,36 @@ For TensorFlow model training, this would look something like this `(full tensor def _restore(self, checkpoint_prefix): self.saver.restore(self.sess, checkpoint_prefix) +Checkpoints will be saved by training iteration to ``local_dir/exp_name/trial_name/checkpoint_``. You can restore a single trial checkpoint by using ``tune.run(restore=)``. To test if your Trainable will checkpoint and restore correctly, you can use ``tune.util.validate_save_restore`` as follows: -Additionally, checkpointing can be used to provide fault-tolerance for experiments. This can be enabled by setting ``checkpoint_freq=N`` and ``max_failures=M`` to checkpoint trials every *N* iterations and recover from up to *M* crashes per trial, e.g.: + .. code-block:: python + + from ray.tune.util import validate_save_restore + + validate_save_restore(MyTrainableClass) + validate_save_restore(MyTrainableClass, use_object_store=True) + + +Trainable (Trial) Checkpointing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Checkpointing assumes that the model state will be saved to disk on whichever node the Trainable is running on. You can checkpoint with three different mechanisms: manually, periodically, and at termination. + +**Manual Checkpointing**: A custom Trainable can manually trigger checkpointing by returning ``should_checkpoint: True`` (or ``tune.result.SHOULD_CHECKPOINT: True``) in the result dictionary of `_train`. This can be especially helpful in spot instances: + +.. code-block:: python + + def _train(self): + # training code + result = {"mean_accuracy": accuracy} + if detect_instance_preemption(): + result.update(should_checkpoint=True) + return result + + +**Periodic Checkpointing**: periodic checkpointing can be used to provide fault-tolerance for experiments. This can be enabled by setting ``checkpoint_freq=`` and ``max_failures=`` to checkpoint trials every *N* iterations and recover from up to *M* crashes per trial, e.g.: .. code-block:: python - :emphasize-lines: 4,5 tune.run( my_trainable, @@ -238,8 +322,8 @@ Additionally, checkpointing can be used to provide fault-tolerance for experimen max_failures=5, ) -The checkpoint_freq may not coincide with the exact end of an experiment. If you want a checkpoint to be created at the end -of a trial, you can additionally set the checkpoint_at_end to True. An example is shown below: +**Checkpointing at Termination**: The checkpoint_freq may not coincide with the exact end of an experiment. If you want a checkpoint to be created at the end +of a trial, you can additionally set the ``checkpoint_at_end=True``: .. code-block:: python :emphasize-lines: 5 @@ -251,11 +335,41 @@ of a trial, you can additionally set the checkpoint_at_end to True. An example i max_failures=5, ) +The checkpoint will be saved at a path that looks like ``local_dir/exp_name/trial_name/checkpoint_x/``, where the x is the number of iterations so far when the checkpoint is saved. To restore the checkpoint, you can use the ``restore`` argument and specify a checkpoint file. By doing this, you can change whatever experiments' configuration such as the experiment's name, the training iteration or so: -Recovering From Failures (Experimental) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. code-block:: python -Tune automatically persists the progress of your experiments, so if an experiment crashes or is otherwise cancelled, it can be resumed by passing one of True, False, "LOCAL", "REMOTE", or "PROMPT" to ``tune.run(resume=...)``. The default setting of ``resume=False`` creates a new experiment. ``resume="LOCAL"`` and ``resume=True`` restore the experiment from ``local_dir/[experiment_name]``. ``resume="REMOTE"`` syncs the upload dir down to the local dir and then restores the experiment from ``local_dir/experiment_name``. ``resume="PROMPT"`` will cause Tune to prompt you for whether you want to resume. You can always force a new experiment to be created by changing the experiment name. + # Restored previous trial from the given checkpoint + tune.run( + "PG", + name="RestoredExp", # The name can be different. + stop={"training_iteration": 10}, # train 5 more iterations than previous + restore="~/ray_results/Original/PG_/checkpoint_5/checkpoint-5", + config={"env": "CartPole-v0"}, + ) + +Fault Tolerance +~~~~~~~~~~~~~~~ + +Tune will automatically restart trials from the last checkpoint in case of trial failures/error (if ``max_failures`` is set), both in the single node and distributed setting. + +In the distributed setting, if using the autoscaler with ``rsync`` enabled, Tune will automatically sync the trial folder with the driver. For example, if a node is lost while a trial (specifically, the corresponding Trainable actor of the trial) is still executing on that node and a checkpoint of the trial exists, Tune will wait until available resources are available to begin executing the trial again. + +If the trial/actor is placed on a different node, Tune will automatically push the previous checkpoint file to that node and restore the remote trial actor state, allowing the trial to resume from the latest checkpoint even after failure. + +Take a look at `an example `_. + +Recovering From Failures +~~~~~~~~~~~~~~~~~~~~~~~~ + +Tune automatically persists the progress of your entire experiment (a ``tune.run`` session), so if an experiment crashes or is otherwise cancelled, it can be resumed by passing one of True, False, "LOCAL", "REMOTE", or "PROMPT" to ``tune.run(resume=...)``. Note that this only works if trial checkpoints are detected, whether it be by manual or periodic checkpointing. + +**Settings:** + + - The default setting of ``resume=False`` creates a new experiment. + - ``resume="LOCAL"`` and ``resume=True`` restore the experiment from ``local_dir/[experiment_name]``. + - ``resume="REMOTE"`` syncs the upload dir down to the local dir and then restores the experiment from ``local_dir/experiment_name``. + - ``resume="PROMPT"`` will cause Tune to prompt you for whether you want to resume. You can always force a new experiment to be created by changing the experiment name. Note that trials will be restored to their last checkpoint. If trial checkpointing is not enabled, unfinished trials will be restarted from scratch. @@ -270,10 +384,11 @@ E.g.: resume=True ) +Upon a second run, this will restore the entire experiment state from ``~/path/to/results/my_experiment_name``. Importantly, any changes to the experiment specification upon resume will be ignored. For example, if the previous experiment has reached its termination, then resuming it with a new stop criterion makes no effect: the new experiment will terminate immediately after initialization. If you want to change the configuration, such as training more iterations, you can do so restore the checkpoint by setting ``restore=`` - note that this only works for a single trial. -Upon a second run, this will restore the entire experiment state from ``~/path/to/results/my_experiment_name``. Importantly, any changes to the experiment specification upon resume will be ignored. +.. warning:: -This feature is still experimental, so any provided Trial Scheduler or Search Algorithm will not be preserved. Only ``FIFOScheduler`` and ``BasicVariantGenerator`` will be supported. + This feature is still experimental, so any provided Trial Scheduler or Search Algorithm will not be preserved. Only ``FIFOScheduler`` and ``BasicVariantGenerator`` will be supported. Handling Large Datasets @@ -323,21 +438,8 @@ The following fields will automatically show up on the console output, if provid Example_0: TERMINATED [pid=68248], 179 s, 2 iter, 60000 ts, 94 rew -Logging, Analyzing, and Visualizing Results -------------------------------------------- - -All results reported by the trainable will be logged locally to a unique directory per experiment, e.g. ``~/ray_results/my_experiment`` in the above example. On a cluster, incremental results will be synced to local disk on the head node. - -Tune provides an ``ExperimentAnalysis`` object for analyzing results which can be used by providing the directory path as follows: - -.. code-block:: python - - from ray.tune.analysis import ExperimentAnalysis - - ea = ExperimentAnalysis("~/ray_results/my_experiment") - trials_dataframe = ea.dataframe() - -You can check out `experiment_analysis.py `__ for more interesting analysis operations. +Visualizing Results +------------------- To visualize learning in tensorboard, install TensorFlow: @@ -369,8 +471,8 @@ To use rllab's VisKit (you may have to install some dependencies), run: .. image:: ray-tune-viskit.png -Custom Loggers -~~~~~~~~~~~~~~ +Logging +------- You can pass in your own logging mechanisms to output logs in custom formats as follows: @@ -384,16 +486,10 @@ You can pass in your own logging mechanisms to output logs in custom formats as loggers=DEFAULT_LOGGERS + (CustomLogger1, CustomLogger2) ) -These loggers will be called along with the default Tune loggers. All loggers must inherit the `Logger interface `__. +These loggers will be called along with the default Tune loggers. All loggers must inherit the `Logger interface `__. Tune has default loggers for Tensorboard, CSV, and JSON formats. You can also check out `logger.py `__ for implementation details. An example can be found in `logging_example.py `__. -Tune has default loggers for Tensorboard, CSV, and JSON formats. - -You can also check out `logger.py `__ for implementation details. - -An example can be found in `logging_example.py `__. - -Custom Sync/Upload Commands -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Uploading/Syncing +----------------- Tune automatically syncs the trial folder on remote nodes back to the head node. This requires the ray cluster to be started with the `autoscaler `__. By default, local syncing requires rsync to be installed. You can customize the sync command with the ``sync_to_driver`` argument in ``tune.run`` by providing either a function or a string. @@ -457,14 +553,25 @@ The API also supports curl. Here are the examples for getting trials (``GET /tri .. code-block:: bash - curl http://
:/trials - curl http://
:/trials/ + $ curl http://
:/trials + $ curl http://
:/trials/ And stopping a trial (``PUT /trials/:id``): .. code-block:: bash - curl -X PUT http://
:/trials/ + $ curl -X PUT http://
:/trials/ + +Debugging (Single Process) +-------------------------- + +By default, Tune will run hyperparameter evaluations on multiple processes. However, if you need to debug your training process, it may be easier to do everything on a single process. You can force all Ray functions to occur on a single process with ``local_mode`` by calling the following before ``tune.run``. + +.. code-block:: python + + ray.init(local_mode=True) + +Note that some behavior such as writing to files by depending on the current working directory in a Trainable and setting global process variables may not work as expected. Local mode with multiple configuration evaluations will interleave computation, so it is most naturally used when running a single configuration evaluation. Tune CLI (Experimental) diff --git a/doc/source/tune.rst b/doc/source/tune.rst index 2674f5b06..63b69f4dc 100644 --- a/doc/source/tune.rst +++ b/doc/source/tune.rst @@ -5,91 +5,98 @@ Tune: Scalable Hyperparameter Search :scale: 30% :align: center -Tune is a scalable framework for hyperparameter search with a focus on deep learning and deep reinforcement learning. +Tune is a scalable framework for hyperparameter search and model training with a focus on deep learning and deep reinforcement learning. -You can find the code for Tune `here on GitHub `__. To get started with Tune, try going through `our tutorial of using Tune with Keras `__. + * Scale to running on a large distributed cluster without changing your code. + * Launch a multi-node Tune experiment in less than 10 lines of code. + * Supports any deep learning framework, including PyTorch, TensorFlow, and Keras. + * Visualize results with `TensorBoard `__. + * Choose among scalable SOTA algorithms such as `Population Based Training (PBT)`_, `Vizier's Median Stopping Rule`_, `HyperBand/ASHA`_. -(Experimental): You can try out `the above tutorial on a free hosted server via Binder `__. +.. _`Population Based Training (PBT)`: tune-schedulers.html#population-based-training-pbt +.. _`Vizier's Median Stopping Rule`: tune-schedulers.html#median-stopping-rule +.. _`HyperBand/ASHA`: tune-schedulers.html#asynchronous-hyperband + +Quick Start +----------- + +.. note:: + + To run this example, you will need to install the following: + + .. code-block:: bash + + $ pip install ray torch torchvision filelock -Features --------- +This example runs a small grid search to train a CNN using PyTorch and Tune. -* Supports any deep learning framework, including PyTorch, TensorFlow, and Keras. +.. literalinclude:: ../../python/ray/tune/tests/example.py + :language: python + :start-after: __quick_start_begin__ + :end-before: __quick_start_end__ -* Choose among scalable hyperparameter and model search techniques such as: +If TensorBoard is installed, automatically visualize all trial results: - - `Population Based Training (PBT) `__ +.. code-block:: bash - - `Median Stopping Rule `__ + tensorboard --logdir ~/ray_results - - `HyperBand `__ -* Mix and match different hyperparameter optimization approaches - such as using `HyperOpt with HyperBand`_ or `Nevergrad with HyperBand`_. +.. image:: images/tune-start-tb.png -* Visualize results with `TensorBoard `__ and `rllab's VisKit `__. +Distributed Quick Start +----------------------- -* Scale to running on a large distributed cluster without changing your code. +.. note:: -* Parallelize training for models with GPU requirements or algorithms that may themselves be parallel and distributed, using Tune's `resource-aware scheduling `__, + This assumes that you have already setup your AWS account and AWS credentials (``aws configure``). To run this example, you will need to install the following: -Take a look at `the User Guide `__ for a comprehensive overview on how to use Tune's features. + .. code-block:: bash + + $ pip install ray torch torchvision filelock + +1. Import and initialize Ray by appending the following to your example script. + +.. code-block:: python + + # Append to top of your script + import ray + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--ray-redis-address") + args = parser.parse_args() + ray.init(redis_address=args.ray_redis_address) + +Alternatively, download a full example script here: :download:`mnist_pytorch.py <../../python/ray/tune/examples/mnist_pytorch.py>` + +2. Download an example cluster yaml here: :download:`tune-default.yaml <../../python/ray/tune/examples/tune-default.yaml>` +3. Run ``ray submit`` like the following. + +.. code-block:: bash + + ray submit tune-default.yaml mnist_pytorch.py --args="--ray-redis-address=localhost:6379" --start + +This will start 3 AWS machines and run a distributed hyperparameter search across them. Append ``[--stop]`` to automatically shutdown your nodes afterwards. + +To summarize, here are the full set of commands: + +.. code-block:: bash + + wget https://raw.githubusercontent.com/ray-project/ray/master/python/ray/tune/examples/mnist_pytorch.py + wget https://raw.githubusercontent.com/ray-project/ray/master/python/ray/tune/tune-default.yaml + ray submit tune-default.yaml mnist_pytorch.py --args="--ray-redis-address=localhost:6379" --start + + +Take a look at the `Distributed Experiments `_ documentation for more details, including setting up distributed experiments on local machines, using GCP, adding resilience to spot instance usage, and more. Getting Started --------------- -Installation -~~~~~~~~~~~~ - -You'll need to first `install ray `__ to import Tune. - -.. code-block:: bash - - pip install ray # also recommended: ray[debug] - - -Quick Start -~~~~~~~~~~~ - -This example runs a small grid search over a neural network training function using Tune, reporting status on the command line until the stopping condition of ``mean_accuracy >= 99`` is reached. Tune works with any deep learning framework. - -Tune uses Ray as a backend, so we will first import and initialize Ray. - -.. code-block:: python - - import ray - from ray import tune - - ray.init() - - -For the function you wish to tune, pass in a ``reporter`` object: - -.. code-block:: python - :emphasize-lines: 1,9 - - def train_func(config, reporter): # add a reporter arg - model = ( ... ) - optimizer = SGD(model.parameters(), - momentum=config["momentum"]) - dataset = ( ... ) - - for idx, (data, target) in enumerate(dataset): - accuracy = model.fit(data, target) - reporter(mean_accuracy=accuracy) # report metrics - -**Finally**, configure your search and execute it on your Ray cluster: - -.. code-block:: python - - all_trials = tune.run( - train_func, - name="quick-start", - stop={"mean_accuracy": 99}, - config={"momentum": tune.grid_search([0.1, 0.2])} - ) - -Tune can be used anywhere Ray can, e.g. on your laptop with ``ray.init()`` embedded in a Python script, or in an `auto-scaling cluster `__ for massive parallelism. + * `Code `__: GitHub repository for Tune. + * `User Guide `__: A comprehensive overview on how to use Tune's features. + * `Tutorial Notebook `__: Our tutorial notebooks of using Tune with Keras or PyTorch. Contribute to Tune ------------------ diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index f501d57a0..a34710972 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -47,38 +47,6 @@ class Analysis(object): self._trial_dataframes = {} self.fetch_trial_dataframes() - def fetch_trial_dataframes(self): - fail_count = 0 - for path in self._get_trial_paths(): - try: - self.trial_dataframes[path] = pd.read_csv( - os.path.join(path, EXPR_PROGRESS_FILE)) - except Exception: - fail_count += 1 - - if fail_count: - logger.debug( - "Couldn't read results from {} paths".format(fail_count)) - return self.trial_dataframes - - def get_all_configs(self, prefix=False): - fail_count = 0 - for path in self._get_trial_paths(): - try: - with open(os.path.join(path, EXPR_PARAM_FILE)) as f: - config = json.load(f) - if prefix: - for k in list(config): - config["config:" + k] = config.pop(k) - self._configs[path] = config - except Exception: - fail_count += 1 - - if fail_count: - logger.warning( - "Couldn't read config from {} paths".format(fail_count)) - return self._configs - def dataframe(self, metric=None, mode=None): """Returns a pandas.DataFrame object constructed from the trials. @@ -110,6 +78,58 @@ class Analysis(object): best_path = compare_op(rows, key=lambda k: rows[k][metric]) return all_configs[best_path] + def get_best_logdir(self, metric, mode="max"): + """Retrieve the logdir corresponding to the best trial. + + Args: + metric (str): Key for trial info to order on. + mode (str): One of [min, max]. + + """ + df = self.dataframe() + if mode == "max": + return df.iloc[df[metric].idxmax()].logdir + elif mode == "min": + return df.iloc[df[metric].idxmin()].logdir + + def fetch_trial_dataframes(self): + fail_count = 0 + for path in self._get_trial_paths(): + try: + self.trial_dataframes[path] = pd.read_csv( + os.path.join(path, EXPR_PROGRESS_FILE)) + except Exception: + fail_count += 1 + + if fail_count: + logger.debug( + "Couldn't read results from {} paths".format(fail_count)) + return self.trial_dataframes + + def get_all_configs(self, prefix=False): + """Returns a list of all configurations. + + Parameters: + prefix (bool): If True, flattens the config dict + and prepends `config/`. + """ + fail_count = 0 + for path in self._get_trial_paths(): + try: + with open(os.path.join(path, EXPR_PARAM_FILE)) as f: + config = json.load(f) + if prefix: + for k in list(config): + config["config/" + k] = config.pop(k) + self._configs[path] = config + except Exception: + fail_count += 1 + + if fail_count: + logger.warning( + "Couldn't read config from {} paths".format(fail_count)) + return self._configs + def _retrieve_rows(self, metric=None, mode=None): assert mode is None or mode in ["max", "min"] rows = {} @@ -135,15 +155,9 @@ class Analysis(object): self._experiment_dir)) return _trial_paths - def get_best_logdir(self, metric, mode="max"): - df = self.dataframe() - if mode == "max": - return df.iloc[df[metric].idxmax()].logdir - elif mode == "min": - return df.iloc[df[metric].idxmin()].logdir - @property def trial_dataframes(self): + """List of all dataframes of the trials.""" return self._trial_dataframes @@ -189,9 +203,15 @@ class ExperimentAnalysis(Analysis): def _get_trial_paths(self): """Overwrites Analysis to only have trials of one experiment.""" - _trial_paths = [ - checkpoint["logdir"] for checkpoint in self._checkpoints - ] + if self.trials: + _trial_paths = [t.logdir for t in self.trials] + else: + logger.warning("No `self.trials`. Drawing logdirs from checkpoint " + "file. This may result in some information that is " + "out of sync, as checkpointing is periodic.") + _trial_paths = [ + checkpoint["logdir"] for checkpoint in self._checkpoints + ] if not _trial_paths: raise TuneError("No trials found.") return _trial_paths diff --git a/python/ray/tune/commands.py b/python/ray/tune/commands.py index 631935908..5fc23e573 100644 --- a/python/ray/tune/commands.py +++ b/python/ray/tune/commands.py @@ -148,7 +148,7 @@ def list_trials(experiment_path, info_keys = DEFAULT_EXPERIMENT_INFO_KEYS col_keys = [ k for k in checkpoints_df.columns - if k in info_keys or k.startswith("config:") + if k in info_keys or k.startswith("config/") ] checkpoints_df = checkpoints_df[col_keys] diff --git a/python/ray/tune/examples/mnist_pytorch.py b/python/ray/tune/examples/mnist_pytorch.py index b791f7415..be64b977b 100644 --- a/python/ray/tune/examples/mnist_pytorch.py +++ b/python/ray/tune/examples/mnist_pytorch.py @@ -4,8 +4,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os import numpy as np import argparse +from filelock import FileLock import torch import torch.nn as nn import torch.nn.functional as F @@ -22,9 +24,9 @@ EPOCH_SIZE = 512 TEST_SIZE = 256 -class Net(nn.Module): - def __init__(self, config): - super(Net, self).__init__() +class ConvNet(nn.Module): + def __init__(self): + super(ConvNet, self).__init__() self.conv1 = nn.Conv2d(1, 3, kernel_size=3) self.fc = nn.Linear(192, 10) @@ -35,7 +37,7 @@ class Net(nn.Module): return F.log_softmax(x, dim=1) -def train(model, optimizer, train_loader, device): +def train(model, optimizer, train_loader, device=torch.device("cpu")): model.train() for batch_idx, (data, target) in enumerate(train_loader): if batch_idx * len(data) > EPOCH_SIZE: @@ -48,7 +50,7 @@ def train(model, optimizer, train_loader, device): optimizer.step() -def test(model, data_loader, device): +def test(model, data_loader, device=torch.device("cpu")): model.eval() correct = 0 total = 0 @@ -70,11 +72,18 @@ def get_data_loaders(): [transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, ))]) - train_loader = torch.utils.data.DataLoader( - datasets.MNIST( - "~/data", train=True, download=True, transform=mnist_transforms), - batch_size=64, - shuffle=True) + # We add FileLock here because multiple workers will want to + # download data, and this may cause overwrites since + # DataLoader is not threadsafe. + with FileLock(os.path.expanduser("~/data.lock")): + train_loader = torch.utils.data.DataLoader( + datasets.MNIST( + "~/data", + train=True, + download=True, + transform=mnist_transforms), + batch_size=64, + shuffle=True) test_loader = torch.utils.data.DataLoader( datasets.MNIST("~/data", train=False, transform=mnist_transforms), batch_size=64, @@ -86,7 +95,7 @@ def train_mnist(config): use_cuda = config.get("use_gpu") and torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") train_loader, test_loader = get_data_loaders() - model = Net(config).to(device) + model = ConvNet().to(device) optimizer = optim.SGD( model.parameters(), lr=config["lr"], momentum=config["momentum"]) @@ -112,24 +121,25 @@ if __name__ == "__main__": args = parser.parse_args() if args.ray_redis_address: ray.init(redis_address=args.ray_redis_address) - datasets.MNIST("~/data", train=True, download=True) sched = AsyncHyperBandScheduler( time_attr="training_iteration", metric="mean_accuracy") - tune.run( + analysis = tune.run( train_mnist, name="exp", scheduler=sched, stop={ "mean_accuracy": 0.98, - "training_iteration": 5 if args.smoke_test else 20 + "training_iteration": 5 if args.smoke_test else 100 }, resources_per_trial={ "cpu": 2, "gpu": int(args.cuda) }, - num_samples=1 if args.smoke_test else 10, + num_samples=1 if args.smoke_test else 50, config={ "lr": tune.sample_from(lambda spec: 10**(-10 * np.random.rand())), "momentum": tune.uniform(0.1, 0.9), "use_gpu": int(args.cuda) }) + + print("Best config is:", analysis.get_best_config(metric="mean_accuracy")) diff --git a/python/ray/tune/examples/mnist_pytorch_trainable.py b/python/ray/tune/examples/mnist_pytorch_trainable.py index 9ce36e97d..65b756666 100644 --- a/python/ray/tune/examples/mnist_pytorch_trainable.py +++ b/python/ray/tune/examples/mnist_pytorch_trainable.py @@ -5,12 +5,13 @@ from __future__ import print_function import argparse import os import torch -import torch.nn as nn -import torch.nn.functional as F import torch.optim as optim -from torchvision import datasets, transforms -from ray.tune import Trainable +import ray +from ray import tune +from ray.tune.schedulers import ASHAScheduler +from ray.tune.examples.mnist_pytorch import (train, test, get_data_loaders, + ConvNet) # Change these values if you want the training to run quicker or slower. EPOCH_SIZE = 512 @@ -19,155 +20,35 @@ TEST_SIZE = 256 # Training settings parser = argparse.ArgumentParser(description="PyTorch MNIST Example") parser.add_argument( - "--batch-size", - type=int, - default=64, - metavar="N", - help="input batch size for training (default: 64)") -parser.add_argument( - "--test-batch-size", - type=int, - default=1000, - metavar="N", - help="input batch size for testing (default: 1000)") -parser.add_argument( - "--epochs", - type=int, - default=1, - metavar="N", - help="number of epochs to train (default: 1)") -parser.add_argument( - "--lr", - type=float, - default=0.01, - metavar="LR", - help="learning rate (default: 0.01)") -parser.add_argument( - "--momentum", - type=float, - default=0.5, - metavar="M", - help="SGD momentum (default: 0.5)") -parser.add_argument( - "--no-cuda", + "--use-gpu", action="store_true", default=False, - help="disables CUDA training") + help="enables CUDA training") parser.add_argument( - "--redis-address", - default=None, - type=str, - help="The Redis address of the cluster.") -parser.add_argument( - "--seed", - type=int, - default=1, - metavar="S", - help="random seed (default: 1)") + "--ray-redis-address", type=str, help="The Redis address of the cluster.") parser.add_argument( "--smoke-test", action="store_true", help="Finish quickly for testing") -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x, dim=1) - - -class TrainMNIST(Trainable): +# Below comments are for documentation purposes only. +# yapf: disable +# __trainable_example_begin__ +class TrainMNIST(tune.Trainable): def _setup(self, config): - args = config.pop("args", parser.parse_args([])) - vars(args).update(config) - args.cuda = not args.no_cuda and torch.cuda.is_available() - - torch.manual_seed(args.seed) - if args.cuda: - torch.cuda.manual_seed(args.seed) - - kwargs = {"num_workers": 1, "pin_memory": True} if args.cuda else {} - self.train_loader = torch.utils.data.DataLoader( - datasets.MNIST( - "~/data", - train=True, - download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307, ), (0.3081, )) - ])), - batch_size=args.batch_size, - shuffle=True, - **kwargs) - self.test_loader = torch.utils.data.DataLoader( - datasets.MNIST( - "~/data", - train=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307, ), (0.3081, )) - ])), - batch_size=args.test_batch_size, - shuffle=True, - **kwargs) - - self.model = Net() - if args.cuda: - self.model.cuda() - + use_cuda = config.get("use_gpu") and torch.cuda.is_available() + self.device = torch.device("cuda" if use_cuda else "cpu") + self.train_loader, self.test_loader = get_data_loaders() + self.model = ConvNet().to(self.device) self.optimizer = optim.SGD( - self.model.parameters(), lr=args.lr, momentum=args.momentum) - self.args = args - - def _train_iteration(self): - self.model.train() - for batch_idx, (data, target) in enumerate(self.train_loader): - if batch_idx * len(data) > EPOCH_SIZE: - return - if self.args.cuda: - data, target = data.cuda(), target.cuda() - self.optimizer.zero_grad() - output = self.model(data) - loss = F.nll_loss(output, target) - loss.backward() - self.optimizer.step() - - def _test(self): - self.model.eval() - test_loss = 0 - correct = 0 - with torch.no_grad(): - for batch_idx, (data, target) in enumerate(self.test_loader): - if batch_idx * len(data) > TEST_SIZE: - break - if self.args.cuda: - data, target = data.cuda(), target.cuda() - output = self.model(data) - # sum up batch loss - test_loss += F.nll_loss(output, target, reduction="sum").item() - # get the index of the max log-probability - pred = output.argmax(dim=1, keepdim=True) - correct += pred.eq( - target.data.view_as(pred)).long().cpu().sum() - - test_loss = test_loss / len(self.test_loader.dataset) - accuracy = correct.item() / len(self.test_loader.dataset) - return {"mean_loss": test_loss, "mean_accuracy": accuracy} + self.model.parameters(), + lr=config.get("lr", 0.01), + momentum=config.get("momentum", 0.9)) def _train(self): - self._train_iteration() - return self._test() + train( + self.model, self.optimizer, self.train_loader, device=self.device) + acc = test(self.model, self.test_loader, self.device) + return {"mean_accuracy": acc} def _save(self, checkpoint_dir): checkpoint_path = os.path.join(checkpoint_dir, "model.pth") @@ -178,34 +59,33 @@ class TrainMNIST(Trainable): self.model.load_state_dict(torch.load(checkpoint_path)) +# __trainable_example_end__ +# yapf: enable + if __name__ == "__main__": - datasets.MNIST("~/data", train=True, download=True) args = parser.parse_args() - - import ray - from ray import tune - from ray.tune.schedulers import HyperBandScheduler - - ray.init(redis_address=args.redis_address) - sched = HyperBandScheduler( - time_attr="training_iteration", metric="mean_loss", mode="min") - tune.run( + ray.init(redis_address=args.ray_redis_address) + sched = ASHAScheduler(metric="mean_accuracy") + analysis = tune.run( TrainMNIST, scheduler=sched, **{ "stop": { "mean_accuracy": 0.95, - "training_iteration": 1 if args.smoke_test else 20, + "training_iteration": 3 if args.smoke_test else 20, }, "resources_per_trial": { "cpu": 3, - "gpu": int(not args.no_cuda) + "gpu": int(args.use_gpu) }, "num_samples": 1 if args.smoke_test else 20, "checkpoint_at_end": True, + "checkpoint_freq": 3, "config": { "args": args, "lr": tune.uniform(0.001, 0.1), "momentum": tune.uniform(0.1, 0.9), } }) + + print("Best config is:", analysis.get_best_config(metric="mean_accuracy")) diff --git a/python/ray/tune/examples/tune-default.yaml b/python/ray/tune/examples/tune-default.yaml new file mode 100644 index 000000000..877ab1c11 --- /dev/null +++ b/python/ray/tune/examples/tune-default.yaml @@ -0,0 +1,51 @@ +# An unique identifier for the head node and workers of this cluster. +cluster_name: tune-example + +# The minimum number of workers nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 2 + +# The maximum number of workers nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: 2 + +# Cloud-provider specific configuration. +provider: + type: aws + region: us-west-2 + # Availability zone(s), comma-separated, that nodes may be launched in. + # Nodes are currently spread between zones by a round-robin approach, + # however this implementation detail should not be relied upon. + availability_zone: us-west-2a,us-west-2b + +# How Ray will authenticate with newly launched nodes. +# By default Ray creates a new private keypair, but you can also use your own. +auth: + ssh_user: ubuntu + +# Provider-specific config for the head node, e.g. instance type. +head_node: + InstanceType: c5.xlarge + ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0 + +# Provider-specific config for worker nodes, e.g. instance type. +worker_nodes: + InstanceType: c5.xlarge + ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0 + + # Run workers on spot by default. Comment this out to use on-demand. + InstanceMarketOptions: + MarketType: spot + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# List of shell commands to run to set up each node. +setup_commands: + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev3-cp36-cp36m-manylinux1_x86_64.whl + - pip install torch torchvision tabulate tensorboard filelock + diff --git a/python/ray/tune/function_runner.py b/python/ray/tune/function_runner.py index 1e27915df..2b99a5c4c 100644 --- a/python/ray/tune/function_runner.py +++ b/python/ray/tune/function_runner.py @@ -33,10 +33,11 @@ class StatusReporter(object): >>> reporter(timesteps_this_iter=1) """ - def __init__(self, result_queue, continue_semaphore): + def __init__(self, result_queue, continue_semaphore, logdir=None): self._queue = result_queue self._last_report_time = None self._continue_semaphore = continue_semaphore + self._logdir = logdir def __call__(self, **kwargs): """Report updated training status. @@ -77,6 +78,10 @@ class StatusReporter(object): def _start(self): self._last_report_time = time.time() + @property + def logdir(self): + return self._logdir + class _RunnerThread(threading.Thread): """Supervisor thread that runs your script.""" @@ -131,8 +136,8 @@ class FunctionRunner(Trainable): # reporting to block until finished. self._error_queue = queue.Queue(1) - self._status_reporter = StatusReporter(self._results_queue, - self._continue_semaphore) + self._status_reporter = StatusReporter( + self._results_queue, self._continue_semaphore, self.logdir) self._last_result = {} config = config.copy() diff --git a/python/ray/tune/logger.py b/python/ray/tune/logger.py index 9118663ec..54a79f43c 100644 --- a/python/ray/tune/logger.py +++ b/python/ray/tune/logger.py @@ -52,7 +52,7 @@ class Logger(object): raise NotImplementedError def update_config(self, config): - """Updates the config for all loggers.""" + """Updates the config for logger.""" pass diff --git a/python/ray/tune/schedulers/async_hyperband.py b/python/ray/tune/schedulers/async_hyperband.py index 0370d03d3..d3e56d305 100644 --- a/python/ray/tune/schedulers/async_hyperband.py +++ b/python/ray/tune/schedulers/async_hyperband.py @@ -45,7 +45,7 @@ class AsyncHyperBandScheduler(FIFOScheduler): metric="episode_reward_mean", mode="max", max_t=100, - grace_period=10, + grace_period=1, reduction_factor=4, brackets=1): assert max_t > 0, "Max (time_attr) not valid!" diff --git a/python/ray/tune/tests/example.py b/python/ray/tune/tests/example.py new file mode 100644 index 000000000..b6cb1f885 --- /dev/null +++ b/python/ray/tune/tests/example.py @@ -0,0 +1,36 @@ +# flake8: noqa + +# This is an example quickstart for Tune. +# To connect to a cluster, uncomment below: + +# import ray +# import argparse +# parser = argparse.ArgumentParser() +# parser.add_argument("--redis-address") +# args = parser.parse_args() +# ray.init(redis_address=args.redis_address) + +# __quick_start_begin__ +import torch.optim as optim +from ray import tune +from ray.tune.examples.mnist_pytorch import get_data_loaders, ConvNet, train, test + + +def train_mnist(config): + train_loader, test_loader = get_data_loaders() + model = ConvNet() + optimizer = optim.SGD(model.parameters(), lr=config["lr"]) + for i in range(10): + train(model, optimizer, train_loader) + acc = test(model, test_loader) + tune.track.log(mean_accuracy=acc) + + +analysis = tune.run( + train_mnist, config={"lr": tune.grid_search([0.001, 0.01, 0.1])}) + +print("Best config: ", analysis.get_best_config(metric="mean_accuracy")) + +# Get a dataframe for analyzing trial results. +df = analysis.dataframe() +# __quick_start_end__ diff --git a/python/ray/tune/tests/test_experiment_analysis.py b/python/ray/tune/tests/test_experiment_analysis.py index fc026cc9a..ea9ee8cbb 100644 --- a/python/ray/tune/tests/test_experiment_analysis.py +++ b/python/ray/tune/tests/test_experiment_analysis.py @@ -34,7 +34,6 @@ class ExperimentAnalysisSuite(unittest.TestCase): global_checkpoint_period=0, name=self.test_name, local_dir=self.test_dir, - return_trials=False, stop={"training_iteration": 1}, num_samples=self.num_samples, config={ diff --git a/python/ray/tune/tests/tutorial.py b/python/ray/tune/tests/tutorial.py new file mode 100644 index 000000000..be5b5520b --- /dev/null +++ b/python/ray/tune/tests/tutorial.py @@ -0,0 +1,106 @@ +# flake8: noqa +# Original Code: https://github.com/pytorch/examples/blob/master/mnist/main.py + +# yapf: disable +# __tutorial_imports_begin__ +import numpy as np +import torch +import torch.optim as optim +from torchvision import datasets + +from ray import tune +from ray.tune import track +from ray.tune.schedulers import ASHAScheduler +from ray.tune.examples.mnist_pytorch import get_data_loaders, ConvNet, train, test +# __tutorial_imports_end__ +# yapf: enable + + +# yapf: disable +# __train_func_begin__ +def train_mnist(config): + model = ConvNet() + train_loader, test_loader = get_data_loaders() + optimizer = optim.SGD( + model.parameters(), lr=config["lr"], momentum=config["momentum"]) + for i in range(10): + train(model, optimizer, train_loader) + acc = test(model, test_loader) + track.log(mean_accuracy=acc) + if i % 5 == 0: + # This saves the model to the trial directory + torch.save(model, "./model.pth") +# __train_func_end__ +# yapf: enable + +# __eval_func_begin__ +search_space = { + "lr": tune.sample_from(lambda spec: 10**(-10 * np.random.rand())), + "momentum": tune.uniform(0.1, 0.9) +} + +# Uncomment this to enable distributed execution +# `ray.init(redis_address=...)` + +analysis = tune.run(train_mnist, config=search_space) +# __eval_func_end__ + +#__plot_begin__ +dfs = analysis.trial_dataframes +[d.mean_accuracy.plot() for d in dfs.values()] +#__plot_end__ + +# __run_scheduler_begin__ +analysis = tune.run( + train_mnist, + num_samples=30, + scheduler=ASHAScheduler(metric="mean_accuracy", mode="max"), + config=search_space) + +# Obtain a trial dataframe from all run trials of this `tune.run` call. +dfs = analysis.trial_dataframes +# __run_scheduler_end__ + +# yapf: disable +# __plot_scheduler_begin__ +# Plot by epoch +ax = None # This plots everything on the same plot +for d in dfs.values(): + ax = d.mean_accuracy.plot(ax=ax, legend=False) +# __plot_scheduler_end__ +# yapf: enable + +# __run_searchalg_begin__ +from hyperopt import hp +from ray.tune.suggest.hyperopt import HyperOptSearch + +space = { + "lr": hp.loguniform("lr", 1e-10, 0.1), + "momentum": hp.uniform("momentum", 0.1, 0.9), +} + +hyperopt_search = HyperOptSearch( + space, max_concurrent=2, reward_attr="mean_accuracy") + +analysis = tune.run(train_mnist, num_samples=10, search_alg=hyperopt_search) +# __run_searchalg_end__ + +# __run_analysis_begin__ +import os + +df = analysis.dataframe() +logdir = analysis.get_best_logdir("mean_accuracy", mode="max") +model = torch.load(os.path.join(logdir, "model.pth")) +# __run_analysis_end__ + +from ray.tune.examples.mnist_pytorch_trainable import TrainMNIST + +# __trainable_run_begin__ +search_space = { + "lr": tune.sample_from(lambda spec: 10**(-10 * np.random.rand())), + "momentum": tune.uniform(0.1, 0.9) +} + +analysis = tune.run( + TrainMNIST, config=search_space, stop={"training_iteration": 10}) +# __trainable_run_end__ diff --git a/python/ray/tune/track/session.py b/python/ray/tune/track/session.py index faf850e5f..65301afde 100644 --- a/python/ray/tune/track/session.py +++ b/python/ray/tune/track/session.py @@ -51,6 +51,7 @@ class TrackSession(object): self.trial_id = trial_name + "_" + self.trial_id if self.is_tune_session: self._logger = _ReporterHook(_tune_reporter) + self._logdir = _tune_reporter.logdir else: self._initialize_logging(trial_name, experiment_dir, upload_dir, trial_config) @@ -60,6 +61,8 @@ class TrackSession(object): experiment_dir=None, upload_dir=None, trial_config=None): + if upload_dir: + raise NotImplementedError("Upload Dir is not yet implemented.") # TODO(rliaw): In other parts of the code, this is `local_dir`. if experiment_dir is None: @@ -74,11 +77,10 @@ class TrackSession(object): # misc metadata to save as well self.trial_config["trial_id"] = self.trial_id - self._logger = UnifiedLogger(self.trial_config, self._logdir, - self._upload_dir) + self._logger = UnifiedLogger(self.trial_config, self._logdir) def log(self, **metrics): - """Logs all named arguments specified in **metrics. + """Logs all named arguments specified in `metrics`. This will log trial metrics locally, and they will be synchronized with the driver periodically through ray. @@ -86,10 +88,9 @@ class TrackSession(object): Arguments: metrics: named arguments with corresponding values to log. """ - + self._iteration += 1 # TODO: Implement a batching mechanism for multiple calls to `log` # within the same iteration. - self._iteration += 1 metrics_dict = metrics.copy() metrics_dict.update({"trial_id": self.trial_id}) diff --git a/python/ray/tune/trainable.py b/python/ray/tune/trainable.py index 04ebbf527..5398e6f77 100644 --- a/python/ray/tune/trainable.py +++ b/python/ray/tune/trainable.py @@ -459,7 +459,9 @@ class Trainable(object): Args: checkpoint_dir (str): The directory where the checkpoint - file must be stored. + file must be stored. In a Tune run, this defaults to + `/checkpoint_` (which is the same as + `local_dir/exp_name/trial_name/checkpoint_`). Returns: checkpoint (str | dict): If string, the return value is diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index 6f1913bd1..049306305 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -266,6 +266,9 @@ def run(run_or_experiment, trials = runner.get_trials() if return_trials: return trials + logger.info("Returning an analysis object by default. You can call " + "`analysis.trials` to retrieve a list of trials. " + "This message will be removed in future versions of Tune.") return ExperimentAnalysis(runner.checkpoint_file, trials=trials) diff --git a/python/ray/tune/util.py b/python/ray/tune/util.py index 06cb4f0eb..c6851f26e 100644 --- a/python/ray/tune/util.py +++ b/python/ray/tune/util.py @@ -180,7 +180,7 @@ def deep_update(original, new_dict, new_keys_allowed, whitelist): return original -def flatten_dict(dt, delimiter=":"): +def flatten_dict(dt, delimiter="/"): dt = copy.deepcopy(dt) while any(isinstance(v, dict) for v in dt.values()): remove = []