diff --git a/doc/source/serialization.rst b/doc/source/serialization.rst index 3aec586ea..41c83110f 100644 --- a/doc/source/serialization.rst +++ b/doc/source/serialization.rst @@ -1,3 +1,5 @@ +.. _serialization-guide: + Serialization ============= @@ -45,7 +47,7 @@ Serialization notes l.append(l) # Try to put this list that recursively contains itself in the object store. - ray.put(l) # ok + ray.put(l) # ok - For non-native objects, Ray will always keep a single copy even it is referred multiple times in an object: diff --git a/doc/source/tune.rst b/doc/source/tune.rst index cf0314549..c3b246ab1 100644 --- a/doc/source/tune.rst +++ b/doc/source/tune.rst @@ -19,6 +19,9 @@ Tune is a Python library for experiment execution and hyperparameter tuning at a **Want to get started?** Head over to the :ref:`60 second Tune tutorial `. +.. tip:: Join the `Ray community slack `_ to discuss Ray Tune (and other Ray libraries)! + + Quick Start ----------- @@ -49,8 +52,6 @@ If using TF2 and TensorBoard, Tune will also automatically generate TensorBoard :align: center -.. tip:: Join the `Ray community slack `_ to discuss Ray Tune (and other Ray libraries)! - Why choose Tune? ---------------- diff --git a/doc/source/tune/_tutorials/overview.rst b/doc/source/tune/_tutorials/overview.rst index 7a5783a55..7a09874a3 100644 --- a/doc/source/tune/_tutorials/overview.rst +++ b/doc/source/tune/_tutorials/overview.rst @@ -25,11 +25,6 @@ Take a look at any of the below tutorials to get started with Tune. :figure: /images/tune.png :description: :doc:`A walkthrough to setup your first Tune experiment ` -.. customgalleryitem:: - :tooltip: Tuning XGBoost parameters. - :figure: /images/xgboost_logo.png - :description: :doc:`A guide to tuning XGBoost parameters with Tune ` - .. raw:: html @@ -39,8 +34,6 @@ Take a look at any of the below tutorials to get started with Tune. 
tune-60-seconds.rst tune-tutorial.rst - tune-pytorch-lightning.rst - tune-xgboost.rst User Guides @@ -72,6 +65,11 @@ These pages will demonstrate the various features and configurations of Tune. :figure: /images/pytorch_lightning_small.png :description: :doc:`Tuning PyTorch Lightning modules ` +.. customgalleryitem:: + :tooltip: Tuning XGBoost parameters. + :figure: /images/xgboost_logo.png + :description: :doc:`A guide to tuning XGBoost parameters with Tune ` + .. raw:: html @@ -83,6 +81,8 @@ These pages will demonstrate the various features and configurations of Tune. tune-usage.rst tune-advanced-tutorial.rst tune-distributed.rst + tune-pytorch-lightning.rst + tune-xgboost.rst Colab Exercises --------------- diff --git a/doc/source/tune/_tutorials/tune-pytorch-lightning.rst b/doc/source/tune/_tutorials/tune-pytorch-lightning.rst index 0a3845c97..e6ceeea9f 100644 --- a/doc/source/tune/_tutorials/tune-pytorch-lightning.rst +++ b/doc/source/tune/_tutorials/tune-pytorch-lightning.rst @@ -8,6 +8,7 @@ aims to avoid boilerplate code, so you don't have to write the same training loops all over again when building a new model. .. image:: /images/pytorch_lightning_full.png + :align: center The main abstraction of PyTorch Lightning is the ``LightningModule`` class, which should be extended by your application. There is `a great post on how to transfer diff --git a/doc/source/tune/_tutorials/tune-tutorial.rst b/doc/source/tune/_tutorials/tune-tutorial.rst index a17927ddc..1808d77fe 100644 --- a/doc/source/tune/_tutorials/tune-tutorial.rst +++ b/doc/source/tune/_tutorials/tune-tutorial.rst @@ -3,81 +3,85 @@ A Basic Tune Tutorial ===================== -.. image:: /images/tune-api.svg +This tutorial will walk you through the process of setting up Tune. Specifically, we'll leverage early stopping and Bayesian Optimization (via HyperOpt) to optimize your PyTorch model. -This tutorial will walk you through the following process to setup a Tune experiment using Pytorch. 
Specifically, we'll leverage ASHA and Bayesian Optimization (via HyperOpt) via the following steps: - 1. Integrating Tune into your workflow - 2. Specifying a TrialScheduler - 3. Adding a SearchAlgorithm - 4. Getting the best model and analyzing results +.. tip:: If you have suggestions as to how to improve this tutorial, please `let us know `_! -.. note:: +To run this example, you will need to install the following: - To run this example, you will need to install the following: +.. code-block:: bash - .. code-block:: bash + $ pip install ray torch torchvision - $ pip install ray torch torchvision +Pytorch Model Setup +~~~~~~~~~~~~~~~~~~~ -We first run some imports: +To start off, let's first import some dependencies: .. literalinclude:: /../../python/ray/tune/tests/tutorial.py :language: python :start-after: __tutorial_imports_begin__ :end-before: __tutorial_imports_end__ +Then, let's define the PyTorch model that we'll be training. -Below, we have some boiler plate code for a PyTorch training function. +.. literalinclude:: /../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __model_def_begin__ + :end-before: __model_def_end__ + + +Below, we have some boiler plate code for training and evaluating your model in Pytorch. :ref:`Skip ahead to the Tune usage `. + +.. literalinclude:: /../../python/ray/tune/tests/tutorial.py + :language: python + :start-after: __train_def_begin__ + :end-before: __train_def_end__ + +.. _tutorial-tune-setup: + +Setting up Tune +~~~~~~~~~~~~~~~ + +Below, we define a function that trains the Pytorch model for multiple epochs. This function will be executed on a separate :ref:`Ray Actor (process) ` underneath the hood, so we need to communicate the performance of the model back to Tune (which is on the main Python process). + +To do this, we call :ref:`tune.report ` in our training function, which sends the performance value back to Tune. + +.. 
tip:: Since the function is executed on the separate process, make sure that the function is :ref:`serializable by Ray `. .. literalinclude:: /../../python/ray/tune/tests/tutorial.py :language: python :start-after: __train_func_begin__ :end-before: __train_func_end__ -Notice that there's a couple helper functions in the above training script. You can take a look at these functions in the imported module `examples/mnist_pytorch `__; there's no black magic happening. For example, ``train`` is simply a for loop over the data loader. - -.. code:: python - - EPOCH_SIZE = 20 - - def train(model, optimizer, train_loader): - model.train() - for batch_idx, (data, target) in enumerate(train_loader): - if batch_idx * len(data) > EPOCH_SIZE: - return - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - -Let's run 1 trial, randomly sampling from a uniform distribution for learning rate and momentum. +Let's run 1 trial by calling :ref:`tune.run ` and :ref:`randomly sample ` from a uniform distribution for learning rate and momentum. .. literalinclude:: /../../python/ray/tune/tests/tutorial.py :language: python :start-after: __eval_func_begin__ :end-before: __eval_func_end__ -We can then plot the performance of this trial. +``tune.run`` returns an :ref:`Analysis object `. You can use this to plot the performance of this trial. .. literalinclude:: /../../python/ray/tune/tests/tutorial.py :language: python :start-after: __plot_begin__ :end-before: __plot_end__ -.. note:: Tune will automatically run parallel trials across all available cores/GPUs on your machine or cluster. To limit the number of cores that Tune uses, you can call ``ray.init(num_cpus=, num_gpus=)`` before ``tune.run``. +.. note:: Tune will automatically run parallel trials across all available cores/GPUs on your machine or cluster. To limit the number of cores that Tune uses, you can call ``ray.init(num_cpus=, num_gpus=)`` before ``tune.run``. 
If you're using a Search Algorithm like Bayesian Optimization, you'll want to use the :ref:`ConcurrencyLimiter `. Early Stopping with ASHA ~~~~~~~~~~~~~~~~~~~~~~~~ -Let's integrate a Trial Scheduler to our search - ASHA, a scalable algorithm for principled early stopping. +Let's integrate early stopping into our optimization process. Let's use :ref:`ASHA `, a scalable algorithm for `principled early stopping`_. -How does it work? On a high level, it terminates trials that are less promising and -allocates more time and resources to more promising trials. See `this blog post `__ for more details. +.. _`principled early stopping`: https://blog.ml.cmu.edu/2018/12/12/massively-parallel-hyperparameter-optimization/ -We can afford to **increase the search space by 5x**, by adjusting the parameter ``num_samples``. See :ref:`tune-schedulers` for more details of available schedulers and library integrations. +On a high level, ASHA terminates trials that are less promising and allocates more time and resources to more promising trials. As our optimization process becomes more efficient, we can afford to **increase the search space by 5x**, by adjusting the parameter ``num_samples``. + +ASHA is implemented in Tune as a "Trial Scheduler". These Trial Schedulers can early terminate bad trials, pause trials, clone trials, and alter hyperparameters of a running trial. See :ref:`the TrialScheduler documentation ` for more details of available schedulers and library integrations. .. literalinclude:: /../../python/ray/tune/tests/tutorial.py :language: python @@ -95,7 +99,7 @@ You can run the below in a Jupyter notebook to visualize trial progress. :scale: 50% :align: center -You can also use Tensorboard for visualizing results. +You can also use :ref:`Tensorboard ` for visualizing results. .. code:: bash @@ -105,18 +109,21 @@ You can also use Tensorboard for visualizing results. 
Search Algorithms in Tune ~~~~~~~~~~~~~~~~~~~~~~~~~ -With Tune you can combine powerful hyperparameter search libraries such as `HyperOpt `_ and `Ax `_ with state-of-the-art algorithms such as HyperBand without modifying any model training code. Tune allows you to use different search algorithms in combination with different trial schedulers. See :ref:`tune-search-alg` for more details of available algorithms and library integrations. +In addition to :ref:`TrialSchedulers `, you can further optimize your hyperparameters by using an intelligent search technique like Bayesian Optimization. To do this, you can use a Tune :ref:`Search Algorithm `. Search Algorithms leverage optimization algorithms to intelligently navigate the given hyperparameter space. + +Note that each library has a specific way of defining the search space. .. literalinclude:: /../../python/ray/tune/tests/tutorial.py :language: python :start-after: __run_searchalg_begin__ :end-before: __run_searchalg_end__ +.. note:: Tune allows you to use some search algorithms in combination with different trial schedulers. See :ref:`this page for more details `. Evaluate your model ~~~~~~~~~~~~~~~~~~~ -You can evaluate best trained model using the Analysis object to retrieve the best model: +You can evaluate the best trained model using the :ref:`Analysis object ` to retrieve the best model: .. literalinclude:: /../../python/ray/tune/tests/tutorial.py :language: python @@ -126,4 +133,7 @@ You can evaluate best trained model using the Analysis object to retrieve the be Next Steps ---------- -Take a look at the :ref:`tune-user-guide` for a more comprehensive overview of Tune's features. + +* Take a look at the :ref:`tune-user-guide` for a more comprehensive overview of Tune's features. +* Browse our :ref:`gallery of examples ` to see how to use Tune with PyTorch, XGBoost, Tensorflow, etc. +* `Let us know `__ if you ran into issues or have any questions by opening an issue on our Github.
diff --git a/doc/source/tune/_tutorials/tune-usage.rst b/doc/source/tune/_tutorials/tune-usage.rst index cc9feb3d4..63541cc2a 100644 --- a/doc/source/tune/_tutorials/tune-usage.rst +++ b/doc/source/tune/_tutorials/tune-usage.rst @@ -272,8 +272,8 @@ Note that in the above example the currently running trials will not stop immedi .. _tune-logging: -Logging/Tensorboard -------------------- +Logging +------- Tune by default will log results for Tensorboard, CSV, and JSON formats. If you need to log something lower level like model weights or gradients, see :ref:`Trainable Logging `. @@ -288,6 +288,44 @@ Tune will log the results of each trial to a subfolder under a specified local d # trainable_name and trial_name are autogenerated. tune.run(trainable, num_samples=2) +You can specify the ``local_dir`` and ``trainable_name``: + +.. code-block:: python + + # This logs to 2 different trial folders: + # ./results/test_experiment/trial_name_1 and ./results/test_experiment/trial_name_2 + # Only trial_name is autogenerated. + tune.run(trainable, num_samples=2, local_dir="./results", name="test_experiment") + +To specify custom trial folder names, you can use the ``trial_name_creator`` argument +to `tune.run`. This takes a function with the following signature: + +.. code-block:: python + + def trial_name_string(trial): + """ + Args: + trial (Trial): A generated trial object. + + Returns: + trial_name (str): String representation of Trial. + """ + return str(trial) + + tune.run( + MyTrainableClass, + name="example-experiment", + num_samples=1, + trial_name_creator=trial_name_string + ) + +See the documentation on Trials: :ref:`trial-docstring`. + +.. _tensorboard: + +Tensorboard (Logging) +--------------------- + Tune automatically outputs Tensorboard files during ``tune.run``. To visualize learning in tensorboard, install tensorboardX: ..
code-block:: bash diff --git a/doc/source/tune/api_docs/logging.rst b/doc/source/tune/api_docs/logging.rst index fa9341b3f..46efb6485 100644 --- a/doc/source/tune/api_docs/logging.rst +++ b/doc/source/tune/api_docs/logging.rst @@ -126,52 +126,6 @@ Use ``self.logdir`` (only for Class API) or ``tune.track.logdir`` (only for Func In the distributed case, these logs will be sync'ed back to the driver under your logger path. This will allow you to visualize and analyze logs of all distributed training workers on a single machine. -Log Directory -------------- - -Tune will log the results of each trial to a subfolder under a specified local dir, which defaults to ``~/ray_results``. - -.. code-block:: python - - # This logs to 2 different trial folders: - # ~/ray_results/trainable_name/trial_name_1 and ~/ray_results/trainable_name/trial_name_2 - # trainable_name and trial_name are autogenerated. - tune.run(trainable, num_samples=2) - -You can specify the ``local_dir`` and ``trainable_name``: - -.. code-block:: python - - # This logs to 2 different trial folders: - # ./results/test_experiment/trial_name_1 and ./results/test_experiment/trial_name_2 - # Only trial_name is autogenerated. - tune.run(trainable, num_samples=2, local_dir="./results", name="test_experiment") - -To specify custom trial folder names, you can pass use the ``trial_name_creator`` argument -to `tune.run`. This takes a function with the following signature: - -.. code-block:: python - - def trial_name_string(trial): - """ - Args: - trial (Trial): A generated trial object. - - Returns: - trial_name (str): String representation of Trial. - """ - return str(trial) - - tune.run( - MyTrainableClass, - name="example-experiment", - num_samples=1, - trial_name_creator=trial_name_string - ) - -See the documentation on Trials: :ref:`trial-docstring`. 
- - Viskit ------ diff --git a/doc/source/tune/api_docs/schedulers.rst b/doc/source/tune/api_docs/schedulers.rst index f530aa4f0..bfabe0640 100644 --- a/doc/source/tune/api_docs/schedulers.rst +++ b/doc/source/tune/api_docs/schedulers.rst @@ -20,13 +20,13 @@ Tune includes distributed implementations of early stopping algorithms such as ` .. tip:: The easiest scheduler to start with is the ``ASHAScheduler`` which will aggressively terminate low-performing trials. -When using schedulers, you may face compatibility issues, as shown in the below compatibility matrix. Certain schedulers cannot be used with Search Algorithms, and certain schedulers are only compatible with the :ref:`tune-class-api`. +When using schedulers, you may face compatibility issues, as shown in the below compatibility matrix. Certain schedulers cannot be used with Search Algorithms, and certain schedulers require :ref:`checkpointing to be implemented `. .. list-table:: TrialScheduler Feature Compatibility Matrix :header-rows: 1 * - Scheduler - - Class API Required? + - Need Checkpointing? - SearchAlg Compatible? - Example * - :ref:`ASHA ` diff --git a/doc/source/tune/api_docs/trainable.rst b/doc/source/tune/api_docs/trainable.rst index ecd11ce20..e9ee28874 100644 --- a/doc/source/tune/api_docs/trainable.rst +++ b/doc/source/tune/api_docs/trainable.rst @@ -256,7 +256,7 @@ The ``Trainable`` also provides the ``default_resource_requests`` interface to a -.. _track-docstring: +.. _tune-function-docstring: tune.report / tune.checkpoint (Function API) -------------------------------------------- diff --git a/python/ray/tune/examples/ax_example.py b/python/ray/tune/examples/ax_example.py index e6e3c60b4..6f5f06fb5 100644 --- a/python/ray/tune/examples/ax_example.py +++ b/python/ray/tune/examples/ax_example.py @@ -3,9 +3,10 @@ It also checks that it is usable with a separate scheduler.
""" import numpy as np +import time import ray -from ray.tune import run +from ray import tune from ray.tune.schedulers import AsyncHyperBandScheduler from ray.tune.suggest.ax import AxSearch @@ -33,12 +34,10 @@ def hartmann6(x): return y -def easy_objective(config, reporter): - import time - time.sleep(0.2) +def easy_objective(config): for i in range(config["iterations"]): x = np.array([config.get("x{}".format(i + 1)) for i in range(6)]) - reporter( + tune.report( timesteps_total=i, hartmann6=hartmann6(x), l2norm=np.sqrt((x**2).sum())) @@ -109,7 +108,8 @@ if __name__ == "__main__": ) algo = AxSearch(client, max_concurrent=4) scheduler = AsyncHyperBandScheduler(metric="hartmann6", mode="min") - run(easy_objective, + tune.run( + easy_objective, name="ax", search_alg=algo, scheduler=scheduler, diff --git a/python/ray/tune/examples/bayesopt_example.py b/python/ray/tune/examples/bayesopt_example.py index a88268f5a..afa6e81c2 100644 --- a/python/ray/tune/examples/bayesopt_example.py +++ b/python/ray/tune/examples/bayesopt_example.py @@ -2,20 +2,28 @@ It also checks that it is usable with a separate scheduler. """ +import time + import ray -from ray.tune import run +from ray import tune from ray.tune.schedulers import AsyncHyperBandScheduler from ray.tune.suggest.bayesopt import BayesOptSearch -def easy_objective(config, reporter): - import time - time.sleep(0.2) - for i in range(config["iterations"]): - reporter( - timesteps_total=i, - mean_loss=(config["height"] - 14)**2 - abs(config["width"] - 3)) - time.sleep(0.02) +def evaluation_fn(step, width, height): + return (0.1 + width * step / 100)**(-1) + height * 0.1 + + +def easy_objective(config): + # Hyperparameters + width, height = config["width"], config["height"] + + for step in range(config["steps"]): + # Iterative training function - can be any arbitrary training procedure + intermediate_score = evaluation_fn(step, width, height) + # Feed the score back back to Tune. 
+ tune.report(iterations=step, mean_loss=intermediate_score) + time.sleep(0.1) if __name__ == "__main__": @@ -32,10 +40,7 @@ if __name__ == "__main__": config = { "num_samples": 10 if args.smoke_test else 1000, "config": { - "iterations": 100, - }, - "stop": { - "timesteps_total": 100 + "steps": 100, } } algo = BayesOptSearch( @@ -48,7 +53,8 @@ if __name__ == "__main__": "xi": 0.0 }) scheduler = AsyncHyperBandScheduler(metric="mean_loss", mode="min") - run(easy_objective, + tune.run( + easy_objective, name="my_exp", search_alg=algo, scheduler=scheduler, diff --git a/python/ray/tune/examples/dragonfly_example.py b/python/ray/tune/examples/dragonfly_example.py index 011a38bca..4e363b90a 100644 --- a/python/ray/tune/examples/dragonfly_example.py +++ b/python/ray/tune/examples/dragonfly_example.py @@ -6,16 +6,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np +import time + import ray -from ray.tune import run +from ray import tune from ray.tune.schedulers import AsyncHyperBandScheduler from ray.tune.suggest.dragonfly import DragonflySearch -def objective(config, reporter): - import numpy as np - import time - time.sleep(0.2) +def objective(config): for i in range(config["iterations"]): vol1 = config["point"][0] # LiNO3 vol2 = config["point"][1] # Li2SO4 @@ -25,7 +25,7 @@ def objective(config, reporter): conductivity = vol1 + 0.1 * (vol2 + vol3)**2 + 2.3 * vol4 * (vol1**1.5) # Add Gaussian noise to simulate experimental noise conductivity += np.random.normal() * 0.01 - reporter(timesteps_total=i, objective=conductivity) + tune.report(timesteps_total=i, objective=conductivity) time.sleep(0.02) @@ -46,9 +46,6 @@ if __name__ == "__main__": "config": { "iterations": 100, }, - "stop": { - "timesteps_total": 100 - }, } domain_vars = [{ @@ -75,7 +72,8 @@ if __name__ == "__main__": optimizer = EuclideanGPBandit(func_caller, ask_tell_mode=True) algo = DragonflySearch(optimizer, 
metric="objective", mode="max") scheduler = AsyncHyperBandScheduler(metric="objective", mode="max") - run(objective, + tune.run( + objective, name="dragonfly_search", search_alg=algo, scheduler=scheduler, diff --git a/python/ray/tune/examples/genetic_example.py b/python/ray/tune/examples/genetic_example.py index 3b16de0d4..cc566eca9 100644 --- a/python/ray/tune/examples/genetic_example.py +++ b/python/ray/tune/examples/genetic_example.py @@ -3,7 +3,7 @@ It also checks that it is usable with a separate scheduler. """ import ray -from ray.tune import run +from ray import tune from ray.tune.schedulers import AsyncHyperBandScheduler from ray.tune.automl import GeneticSearch from ray.tune.automl import ContinuousSpace, DiscreteSpace, SearchSpace @@ -20,7 +20,7 @@ def michalewicz_function(config, reporter): y = np.dot(sin_x, sin_z) # Negate y since we want to minimize y value - reporter(timesteps_total=1, neg_mean_loss=-y) + tune.report(timesteps_total=1, neg_mean_loss=-y) if __name__ == "__main__": @@ -47,7 +47,8 @@ if __name__ == "__main__": max_generation=2 if args.smoke_test else 10, population_size=10 if args.smoke_test else 50) scheduler = AsyncHyperBandScheduler(metric="neg_mean_loss", mode="max") - run(michalewicz_function, + tune.run( + michalewicz_function, name="my_exp", search_alg=algo, scheduler=scheduler, diff --git a/python/ray/tune/examples/hyperopt_example.py b/python/ray/tune/examples/hyperopt_example.py index b81680b4c..3ba120e81 100644 --- a/python/ray/tune/examples/hyperopt_example.py +++ b/python/ray/tune/examples/hyperopt_example.py @@ -2,22 +2,28 @@ It also checks that it is usable with a separate scheduler. 
""" +import time + import ray -from ray.tune import run +from ray import tune from ray.tune.schedulers import AsyncHyperBandScheduler from ray.tune.suggest.hyperopt import HyperOptSearch -def easy_objective(config, reporter): - import time - time.sleep(0.2) - assert type(config["activation"]) == str, \ - "Config is incorrect: {}".format(type(config["activation"])) - for i in range(config["iterations"]): - reporter( - timesteps_total=i, - mean_loss=(config["height"] - 14)**2 - abs(config["width"] - 3)) - time.sleep(0.02) +def evaluation_fn(step, width, height): + return (0.1 + width * step / 100)**(-1) + height * 0.1 + + +def easy_objective(config): + # Hyperparameters + width, height = config["width"], config["height"] + + for step in range(config["steps"]): + # Iterative training function - can be any arbitrary training procedure + intermediate_score = evaluation_fn(step, width, height) + # Feed the score back back to Tune. + tune.report(iterations=step, mean_loss=intermediate_score) + time.sleep(0.1) if __name__ == "__main__": @@ -33,6 +39,7 @@ if __name__ == "__main__": space = { "width": hp.uniform("width", 0, 20), "height": hp.uniform("height", -100, 100), + # This is an ignored parameter. 
"activation": hp.choice("activation", ["relu", "tanh"]) } @@ -52,11 +59,8 @@ if __name__ == "__main__": config = { "num_samples": 10 if args.smoke_test else 1000, "config": { - "iterations": 100, - }, - "stop": { - "timesteps_total": 100 - }, + "steps": 100, + } } algo = HyperOptSearch( space, @@ -64,4 +68,4 @@ if __name__ == "__main__": mode="min", points_to_evaluate=current_best_params) scheduler = AsyncHyperBandScheduler(metric="mean_loss", mode="min") - run(easy_objective, search_alg=algo, scheduler=scheduler, **config) + tune.run(easy_objective, search_alg=algo, scheduler=scheduler, **config) diff --git a/python/ray/tune/examples/mlflow_example.py b/python/ray/tune/examples/mlflow_example.py index d3d1c417f..68be24139 100644 --- a/python/ray/tune/examples/mlflow_example.py +++ b/python/ray/tune/examples/mlflow_example.py @@ -15,13 +15,20 @@ from ray import tune from ray.tune.logger import MLFLowLogger, DEFAULT_LOGGERS +def evaluation_fn(step, width, height): + return (0.1 + width * step / 100)**(-1) + height * 0.1 + + def easy_objective(config): - for i in range(20): - result = dict( - timesteps_total=i, - mean_loss=(config["height"] - 14)**2 - abs(config["width"] - 3)) - tune.report(**result) - time.sleep(0.02) + # Hyperparameters + width, height = config["width"], config["height"] + + for step in range(config.get("steps", 100)): + # Iterative training function - can be any arbitrary training procedure + intermediate_score = evaluation_fn(step, width, height) + # Feed the score back back to Tune. + tune.report(iterations=step, mean_loss=intermediate_score) + time.sleep(0.1) if __name__ == "__main__": diff --git a/python/ray/tune/examples/nevergrad_example.py b/python/ray/tune/examples/nevergrad_example.py index d47fe5132..991e0451c 100644 --- a/python/ray/tune/examples/nevergrad_example.py +++ b/python/ray/tune/examples/nevergrad_example.py @@ -2,20 +2,28 @@ It also checks that it is usable with a separate scheduler. 
""" +import time + import ray -from ray.tune import run +from ray import tune from ray.tune.schedulers import AsyncHyperBandScheduler from ray.tune.suggest.nevergrad import NevergradSearch -def easy_objective(config, reporter): - import time - time.sleep(0.2) - for i in range(config["iterations"]): - reporter( - timesteps_total=i, - mean_loss=(config["height"] - 14)**2 - abs(config["width"] - 3)) - time.sleep(0.02) +def evaluation_fn(step, width, height): + return (0.1 + width * step / 100)**(-1) + height * 0.1 + + +def easy_objective(config): + # Hyperparameters + width, height = config["width"], config["height"] + + for step in range(config["steps"]): + # Iterative training function - can be any arbitrary training procedure + intermediate_score = evaluation_fn(step, width, height) + # Feed the score back back to Tune. + tune.report(iterations=step, mean_loss=intermediate_score) + time.sleep(0.1) if __name__ == "__main__": @@ -31,10 +39,7 @@ if __name__ == "__main__": config = { "num_samples": 10 if args.smoke_test else 50, "config": { - "iterations": 100, - }, - "stop": { - "timesteps_total": 100 + "steps": 100, } } instrumentation = 2 @@ -49,7 +54,8 @@ if __name__ == "__main__": algo = NevergradSearch( optimizer, parameter_names, metric="mean_loss", mode="min") scheduler = AsyncHyperBandScheduler(metric="mean_loss", mode="min") - run(easy_objective, + tune.run( + easy_objective, name="nevergrad", search_alg=algo, scheduler=scheduler, diff --git a/python/ray/tune/examples/sigopt_example.py b/python/ray/tune/examples/sigopt_example.py index b82aa3eeb..ab8ce8281 100644 --- a/python/ray/tune/examples/sigopt_example.py +++ b/python/ray/tune/examples/sigopt_example.py @@ -2,20 +2,28 @@ It also checks that it is usable with a separate scheduler. 
""" +import time + import ray -from ray.tune import run +from ray import tune from ray.tune.schedulers import AsyncHyperBandScheduler from ray.tune.suggest.sigopt import SigOptSearch -def easy_objective(config, reporter): - import time - time.sleep(0.2) - for i in range(config["iterations"]): - reporter( - timesteps_total=i, - mean_loss=(config["height"] - 14)**2 - abs(config["width"] - 3)) - time.sleep(0.02) +def evaluate(step, width, height): + return (0.1 + width * step / 100)**(-1) + height * 0.01 + + +def easy_objective(config): + # Hyperparameters + width, height = config["width"], config["height"] + + for step in range(config["steps"]): + # Iterative training function - can be any arbitrary training procedure + intermediate_score = evaluate(step, width, height) + # Feed the score back back to Tune. + tune.report(iterations=step, mean_loss=intermediate_score) + time.sleep(0.1) if __name__ == "__main__": @@ -53,11 +61,8 @@ if __name__ == "__main__": config = { "num_samples": 10 if args.smoke_test else 1000, "config": { - "iterations": 100, - }, - "stop": { - "timesteps_total": 100 - }, + "steps": 10 + } } algo = SigOptSearch( space, @@ -66,7 +71,8 @@ if __name__ == "__main__": metric="mean_loss", mode="min") scheduler = AsyncHyperBandScheduler(metric="mean_loss", mode="min") - run(easy_objective, + tune.run( + easy_objective, name="my_exp", search_alg=algo, scheduler=scheduler, diff --git a/python/ray/tune/examples/skopt_example.py b/python/ray/tune/examples/skopt_example.py index de91fd08e..03ced1e87 100644 --- a/python/ray/tune/examples/skopt_example.py +++ b/python/ray/tune/examples/skopt_example.py @@ -2,20 +2,28 @@ It also checks that it is usable with a separate scheduler. 
""" +import time + import ray -from ray.tune import run +from ray import tune from ray.tune.schedulers import AsyncHyperBandScheduler from ray.tune.suggest.skopt import SkOptSearch -def easy_objective(config, reporter): - import time - time.sleep(0.2) - for i in range(config["iterations"]): - reporter( - timesteps_total=i, - mean_loss=(config["height"] - 14)**2 - abs(config["width"] - 3)) - time.sleep(0.02) +def evaluation_fn(step, width, height): + return (0.1 + width * step / 100)**(-1) + height * 0.1 + + +def easy_objective(config): + # Hyperparameters + width, height = config["width"], config["height"] + + for step in range(config["steps"]): + # Iterative training function - can be any arbitrary training procedure + intermediate_score = evaluation_fn(step, width, height) + # Feed the score back back to Tune. + tune.report(iterations=step, mean_loss=intermediate_score) + time.sleep(0.1) if __name__ == "__main__": @@ -31,11 +39,8 @@ if __name__ == "__main__": config = { "num_samples": 10 if args.smoke_test else 50, "config": { - "iterations": 100, - }, - "stop": { - "timesteps_total": 100 - }, + "steps": 100, + } } optimizer = Optimizer([(0, 20), (-100, 100)]) previously_run_params = [[10, 0], [15, -20]] @@ -47,7 +52,8 @@ if __name__ == "__main__": points_to_evaluate=previously_run_params, evaluated_rewards=known_rewards) scheduler = AsyncHyperBandScheduler(metric="mean_loss", mode="min") - run(easy_objective, + tune.run( + easy_objective, name="skopt_exp_with_warmstart", search_alg=algo, scheduler=scheduler, @@ -61,7 +67,8 @@ if __name__ == "__main__": mode="min", points_to_evaluate=previously_run_params) scheduler = AsyncHyperBandScheduler(metric="mean_loss", mode="min") - run(easy_objective, + tune.run( + easy_objective, name="skopt_exp", search_alg=algo, scheduler=scheduler, diff --git a/python/ray/tune/examples/tune_cifar10_gluon.py b/python/ray/tune/examples/tune_cifar10_gluon.py index 30727a999..49d14574a 100644 --- 
a/python/ray/tune/examples/tune_cifar10_gluon.py +++ b/python/ray/tune/examples/tune_cifar10_gluon.py @@ -13,6 +13,8 @@ from mxnet.gluon.data.vision import transforms from gluoncv.model_zoo import get_model from gluoncv.data import transforms as gcv_transforms +from ray import tune + # Training settings parser = argparse.ArgumentParser(description="CIFAR-10 Example") parser.add_argument( @@ -86,7 +88,8 @@ parser.add_argument( args = parser.parse_args() -def train_cifar10(args, config, reporter): +def train_cifar10(config): + args = config.pop("args") vars(args).update(config) np.random.seed(args.seed) random.seed(args.seed) @@ -172,18 +175,18 @@ def train_cifar10(args, config, reporter): _, test_acc = metric.get() test_loss /= len(test_data) - reporter(mean_loss=test_loss, mean_accuracy=test_acc) + return test_loss, test_acc for epoch in range(1, args.epochs + 1): train(epoch) - test() + test_loss, test_acc = test() + tune.report(mean_loss=test_loss, mean_accuracy=test_acc) if __name__ == "__main__": args = parser.parse_args() import ray - from ray import tune from ray.tune.schedulers import AsyncHyperBandScheduler, FIFOScheduler ray.init() @@ -198,11 +201,8 @@ if __name__ == "__main__": grace_period=60) else: raise NotImplementedError - tune.register_trainable( - "TRAIN_FN", - lambda config, reporter: train_cifar10(args, config, reporter)) tune.run( - "TRAIN_FN", + train_cifar10, name=args.expname, verbose=2, scheduler=sched, @@ -216,6 +216,7 @@ if __name__ == "__main__": }, num_samples=1 if args.smoke_test else args.num_samples, config={ + "args": args, "lr": tune.sample_from( lambda spec: np.power(10.0, np.random.uniform(-4, -1))), "momentum": tune.sample_from( diff --git a/python/ray/tune/examples/zoopt_example.py b/python/ray/tune/examples/zoopt_example.py index 385b69c59..160c9e6e0 100644 --- a/python/ray/tune/examples/zoopt_example.py +++ b/python/ray/tune/examples/zoopt_example.py @@ -2,21 +2,29 @@ It also checks that it is usable with a separate scheduler. 
""" +import time + import ray -from ray.tune import run +from ray import tune from ray.tune.suggest.zoopt import ZOOptSearch from ray.tune.schedulers import AsyncHyperBandScheduler from zoopt import ValueType -def easy_objective(config, reporter): - import time - time.sleep(0.2) - for i in range(config["iterations"]): - reporter( - timesteps_total=i, - mean_loss=(config["height"] - 14)**2 - abs(config["width"] - 3)) - time.sleep(0.02) +def evaluation_fn(step, width, height): + return (0.1 + width * step / 100)**(-1) + height * 0.1 + + +def easy_objective(config): + # Hyperparameters + width, height = config["width"], config["height"] + + for step in range(config["steps"]): + # Iterative training function - can be any arbitrary training procedure + intermediate_score = evaluation_fn(step, width, height) + # Feed the score back to Tune. + tune.report(iterations=step, mean_loss=intermediate_score) + time.sleep(0.1) if __name__ == "__main__": @@ -40,10 +48,7 @@ if __name__ == "__main__": config = { "num_samples": 10 if args.smoke_test else 1000, "config": { - "iterations": 10, # evaluation times - }, - "stop": { - "timesteps_total": 10 # cumstom stop rules + "steps": 10, # evaluation times } } @@ -56,7 +61,8 @@ if __name__ == "__main__": scheduler = AsyncHyperBandScheduler(metric="mean_loss", mode="min") - run(easy_objective, + tune.run( + easy_objective, search_alg=zoopt_search, name="zoopt_search", scheduler=scheduler, diff --git a/python/ray/tune/tests/example.py b/python/ray/tune/tests/example.py index 5c3eca607..be0bd2d17 100644 --- a/python/ray/tune/tests/example.py +++ b/python/ray/tune/tests/example.py @@ -11,25 +11,31 @@ # ray.init(address=args.address) # __quick_start_begin__ -import torch.optim as optim from ray import tune -from ray.tune.examples.mnist_pytorch import get_data_loaders, ConvNet, train, test -def train_mnist(config): - train_loader, test_loader = get_data_loaders() - model = ConvNet() - optimizer = optim.SGD(model.parameters(), 
lr=config["lr"]) - for i in range(10): - train(model, optimizer, train_loader) - acc = test(model, test_loader) - tune.report(mean_accuracy=acc) +def objective(step, alpha, beta): + return (0.1 + alpha * step / 100)**(-1) + beta * 0.1 + + +def training_function(config): + # Hyperparameters + alpha, beta = config["alpha"], config["beta"] + for step in range(10): + # Iterative training function - can be any arbitrary training procedure. + intermediate_score = objective(step, alpha, beta) + # Feed the score back to Tune. + tune.report(mean_loss=intermediate_score) analysis = tune.run( - train_mnist, config={"lr": tune.grid_search([0.001, 0.01, 0.1])}) + training_function, + config={ + "alpha": tune.grid_search([0.001, 0.01, 0.1]), + "beta": tune.choice([1, 2, 3]) + }) -print("Best config: ", analysis.get_best_config(metric="mean_accuracy")) +print("Best config: ", analysis.get_best_config(metric="mean_loss")) # Get a dataframe for analyzing trial results. df = analysis.dataframe() diff --git a/python/ray/tune/tests/tutorial.py b/python/ray/tune/tests/tutorial.py index 02994bf2c..f0e5fa5af 100644 --- a/python/ray/tune/tests/tutorial.py +++ b/python/ray/tune/tests/tutorial.py @@ -6,29 +6,106 @@ import numpy as np import torch import torch.optim as optim -from torchvision import datasets +import torch.nn as nn +from torchvision import datasets, transforms +from torch.utils.data import DataLoader +import torch.nn.functional as F from ray import tune from ray.tune.schedulers import ASHAScheduler -from ray.tune.examples.mnist_pytorch import get_data_loaders, ConvNet, train, test # __tutorial_imports_end__ # yapf: enable # yapf: disable +# __model_def_begin__ +class ConvNet(nn.Module): + def __init__(self): + super(ConvNet, self).__init__() + # In this example, we don't change the model architecture + # for simplicity. 
+ self.conv1 = nn.Conv2d(1, 3, kernel_size=3) + self.fc = nn.Linear(192, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 3)) + x = x.view(-1, 192) + x = self.fc(x) + return F.log_softmax(x, dim=1) +# __model_def_end__ +# yapf: enable + +# yapf: disable +# __train_def_begin__ + +# Change these values if you want the training to run quicker or slower. +EPOCH_SIZE = 512 +TEST_SIZE = 256 + +def train(model, optimizer, train_loader): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + # We set this just for the example to run quickly. + if batch_idx * len(data) > EPOCH_SIZE: + return + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + + +def test(model, data_loader): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.eval() + correct = 0 + total = 0 + with torch.no_grad(): + for batch_idx, (data, target) in enumerate(data_loader): + # We set this just for the example to run quickly. 
+ if batch_idx * len(data) > TEST_SIZE: + break + data, target = data.to(device), target.to(device) + outputs = model(data) + _, predicted = torch.max(outputs.data, 1) + total += target.size(0) + correct += (predicted == target).sum().item() + + return correct / total +# __train_def_end__ + + # __train_func_begin__ def train_mnist(config): + # Data Setup + mnist_transforms = transforms.Compose( + [transforms.ToTensor(), + transforms.Normalize((0.1307, ), (0.3081, ))]) + + train_loader = DataLoader( + datasets.MNIST("~/data", train=True, download=True, transform=mnist_transforms), + batch_size=64, + shuffle=True) + test_loader = DataLoader( + datasets.MNIST("~/data", train=False, transform=mnist_transforms), + batch_size=64, + shuffle=True) + model = ConvNet() - train_loader, test_loader = get_data_loaders() optimizer = optim.SGD( model.parameters(), lr=config["lr"], momentum=config["momentum"]) for i in range(10): train(model, optimizer, train_loader) acc = test(model, test_loader) + + # Send the current training result back to Tune tune.report(mean_accuracy=acc) + if i % 5 == 0: # This saves the model to the trial directory - torch.save(model, "./model.pth") + torch.save(model.state_dict(), "./model.pth") # __train_func_end__ # yapf: enable @@ -39,7 +116,10 @@ search_space = { } # Uncomment this to enable distributed execution -# `ray.init(address=...)` +# `ray.init(address="auto")` + +# Download the dataset first +datasets.MNIST("~/data", train=True, download=True) analysis = tune.run(train_mnist, config=search_space) # __eval_func_end__ @@ -52,7 +132,7 @@ dfs = analysis.trial_dataframes # __run_scheduler_begin__ analysis = tune.run( train_mnist, - num_samples=30, + num_samples=20, scheduler=ASHAScheduler(metric="mean_accuracy", mode="max"), config=search_space) @@ -88,7 +168,10 @@ import os df = analysis.dataframe() logdir = analysis.get_best_logdir("mean_accuracy", mode="max") -model = torch.load(os.path.join(logdir, "model.pth")) +state_dict = 
torch.load(os.path.join(logdir, "model.pth")) + +model = ConvNet() +model.load_state_dict(state_dict) # __run_analysis_end__ from ray.tune.examples.mnist_pytorch_trainable import TrainMNIST