From 5a40299d425fa65bdcf18268d370ff37876c5cc2 Mon Sep 17 00:00:00 2001 From: krfricke Date: Wed, 15 Jul 2020 19:30:20 +0200 Subject: [PATCH] [tune] extend PTL template (GPU, typing fixes, tensorboard) (#9451) Co-authored-by: Kai Fricke --- .../_tutorials/tune-pytorch-lightning.rst | 62 +++++++++++-- .../tune/examples/mnist_pytorch_lightning.py | 86 +++++++++++++------ python/ray/tune/sample.py | 10 ++- 3 files changed, 119 insertions(+), 39 deletions(-) diff --git a/doc/source/tune/_tutorials/tune-pytorch-lightning.rst b/doc/source/tune/_tutorials/tune-pytorch-lightning.rst index e6ceeea9f..27984e504 100644 --- a/doc/source/tune/_tutorials/tune-pytorch-lightning.rst +++ b/doc/source/tune/_tutorials/tune-pytorch-lightning.rst @@ -102,14 +102,22 @@ The callback just reports some metrics back to Tune after each validation epoch: :start-after: __tune_callback_begin__ :end-before: __tune_callback_end__ +Note that we have to explicitly convert the metrics from a tensor to a Python value. + Adding the Tune training function ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Then we specify our training function. Note that we added the ``data_dir`` as a config -parameter here, even though it should not be tuned. We just need to specify it to avoid +Then we specify our training function. Note that we added the ``data_dir`` as a +parameter here to avoid that each training run downloads the full MNIST dataset. Instead, we want to access a shared data location. +We are also able to specify the number of epochs to train each model, and the number +of GPUs we want to use for training. We also create a TensorBoard logger that writes +logfiles directly into Tune's root trial directory - if we didn't do that PyTorch +Lightning would create subdirectories, and each trial would thus be shown twice in +TensorBoard, one time for Tune's logs, and another time for PyTorch Lightning's logs. + .. 
literalinclude:: /../../python/ray/tune/examples/mnist_pytorch_lightning.py :language: python :start-after: __tune_train_begin__ @@ -134,7 +142,7 @@ We also delete this data after training to avoid filling up our disk or memory s :language: python :start-after: __tune_asha_begin__ :end-before: __tune_asha_end__ - :lines: 27 + :lines: 36 :dedent: 4 Configuring the search space @@ -150,7 +158,7 @@ we are able to also sample small values. :language: python :start-after: __tune_asha_begin__ :end-before: __tune_asha_end__ - :lines: 4-10 + :lines: 5-10 :dedent: 4 Selecting a scheduler @@ -165,7 +173,7 @@ configurations. :language: python :start-after: __tune_asha_begin__ :end-before: __tune_asha_end__ - :lines: 11-16 + :lines: 12-17 :dedent: 4 @@ -173,17 +181,53 @@ Changing the CLI output ~~~~~~~~~~~~~~~~~~~~~~~ We instantiate a ``CLIReporter`` to specify which metrics we would like to see in our -output tables in the command line. If we didn't specify this, Tune would print all -hyperparameters by default, but since ``data_dir`` is not a real hyperparameter, we -can avoid printing it by omitting it in the ``parameter_columns`` parameter. +output tables in the command line. This is optional, but can be used to make sure our +output tables only include information we would like to see. .. literalinclude:: /../../python/ray/tune/examples/mnist_pytorch_lightning.py :language: python :start-after: __tune_asha_begin__ :end-before: __tune_asha_end__ - :lines: 17-19 + :lines: 19-21 :dedent: 4 +Passing constants to the train function +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``data_dir``, ``num_epochs`` and ``num_gpus`` we pass to the training function +are constants. To avoid including them as non-configurable parameters in the ``config`` +specification, we can use ``functools.partial`` to wrap around the training function. + +.. 
literalinclude:: /../../python/ray/tune/examples/mnist_pytorch_lightning.py + :language: python + :start-after: __tune_asha_begin__ + :end-before: __tune_asha_end__ + :lines: 24-28 + :dedent: 8 + +Training with GPUs +~~~~~~~~~~~~~~~~~~ +We can specify how many resources Tune should request for each trial. +This also includes GPUs. + +PyTorch Lightning takes care of moving the training to the GPUs. We +already made sure that our code is compatible with that, so there's +nothing more to do here other than to specify the number of GPUs +we would like to use: + +.. literalinclude:: /../../python/ray/tune/examples/mnist_pytorch_lightning.py + :language: python + :start-after: __tune_asha_begin__ + :end-before: __tune_asha_end__ + :lines: 29 + :dedent: 4 + +Please note that in the current state of PyTorch Lightning, training +on :doc:`fractional GPUs </using-ray-with-gpus>` or +multiple GPUs requires some workarounds. We will address these in a +separate tutorial - for now this example works with no or exactly one +GPU. 
+ Putting it together ~~~~~~~~~~~~~~~~~~~ diff --git a/python/ray/tune/examples/mnist_pytorch_lightning.py b/python/ray/tune/examples/mnist_pytorch_lightning.py index 4f02c05e3..0074bf3c7 100644 --- a/python/ray/tune/examples/mnist_pytorch_lightning.py +++ b/python/ray/tune/examples/mnist_pytorch_lightning.py @@ -13,8 +13,10 @@ import os # __import_tune_begin__ import shutil +from functools import partial from tempfile import mkdtemp from pytorch_lightning.callbacks import Callback +from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.utilities.cloud_io import load as pl_load from ray import tune from ray.tune import CLIReporter @@ -74,7 +76,7 @@ class LightningMNISTClassifier(pl.LightningModule): loss = self.cross_entropy_loss(logits, y) accuracy = self.accuracy(logits, y) - logs = {"train_loss": loss, "train_accuracy": accuracy} + logs = {"ptl/train_loss": loss, "ptl/train_accuracy": accuracy} return {"loss": loss, "log": logs} def validation_step(self, val_batch, batch_idx): @@ -88,12 +90,12 @@ class LightningMNISTClassifier(pl.LightningModule): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean() - tensorboard_logs = {"val_loss": avg_loss, "val_accuracy": avg_acc} + logs = {"ptl/val_loss": avg_loss, "ptl/val_accuracy": avg_acc} return { "avg_val_loss": avg_loss, "avg_val_accuracy": avg_acc, - "log": tensorboard_logs + "log": logs } @staticmethod @@ -133,16 +135,19 @@ def train_mnist(config): class TuneReportCallback(Callback): def on_validation_end(self, trainer, pl_module): tune.report( - loss=trainer.callback_metrics["avg_val_loss"], - mean_accuracy=trainer.callback_metrics["avg_val_accuracy"]) + loss=trainer.callback_metrics["avg_val_loss"].item(), + mean_accuracy=trainer.callback_metrics["avg_val_accuracy"].item()) # __tune_callback_end__ # __tune_train_begin__ -def train_mnist_tune(config): - model = 
LightningMNISTClassifier(config, config["data_dir"]) +def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0): + model = LightningMNISTClassifier(config, data_dir) trainer = pl.Trainer( - max_epochs=10, + max_epochs=num_epochs, + gpus=num_gpus, + logger=TensorBoardLogger( + save_dir=tune.get_trial_dir(), name="", version="."), progress_bar_refresh_rate=0, callbacks=[TuneReportCallback()]) @@ -160,9 +165,17 @@ class CheckpointCallback(Callback): # __tune_train_checkpoint_begin__ -def train_mnist_tune_checkpoint(config, checkpoint=None): +def train_mnist_tune_checkpoint( + config, + checkpoint=None, + data_dir=None, + num_epochs=10, + num_gpus=0): trainer = pl.Trainer( - max_epochs=10, + max_epochs=num_epochs, + gpus=num_gpus, + logger=TensorBoardLogger( + save_dir=tune.get_trial_dir(), name="", version="."), progress_bar_refresh_rate=0, callbacks=[CheckpointCallback(), TuneReportCallback()]) @@ -178,54 +191,64 @@ def train_mnist_tune_checkpoint(config, checkpoint=None): trainer.current_epoch = ckpt["epoch"] else: model = LightningMNISTClassifier( - config=config, data_dir=config["data_dir"]) + config=config, data_dir=data_dir) trainer.fit(model) # __tune_train_checkpoint_end__ # __tune_asha_begin__ -def tune_mnist_asha(num_samples=10, max_num_epochs=10): +def tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0): data_dir = mkdtemp(prefix="mnist_data_") LightningMNISTClassifier.download_data(data_dir) + config = { "layer_1_size": tune.choice([32, 64, 128]), "layer_2_size": tune.choice([64, 128, 256]), "lr": tune.loguniform(1e-4, 1e-1), "batch_size": tune.choice([32, 64, 128]), - "data_dir": data_dir } + scheduler = ASHAScheduler( metric="loss", mode="min", - max_t=max_num_epochs, + max_t=num_epochs, grace_period=1, reduction_factor=2) + reporter = CLIReporter( parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"], metric_columns=["loss", "mean_accuracy", "training_iteration"]) + tune.run( - train_mnist_tune, - 
resources_per_trial={"cpu": 1}, + partial( + train_mnist_tune, + data_dir=data_dir, + num_epochs=num_epochs, + num_gpus=gpus_per_trial), + resources_per_trial={"cpu": 1, "gpu": gpus_per_trial}, config=config, num_samples=num_samples, scheduler=scheduler, - progress_reporter=reporter) + progress_reporter=reporter, + name="tune_mnist_asha") + shutil.rmtree(data_dir) # __tune_asha_end__ # __tune_pbt_begin__ -def tune_mnist_pbt(): +def tune_mnist_pbt(num_samples=10, num_epochs=10, gpus_per_trial=0): data_dir = mkdtemp(prefix="mnist_data_") LightningMNISTClassifier.download_data(data_dir) + config = { "layer_1_size": tune.choice([32, 64, 128]), "layer_2_size": tune.choice([64, 128, 256]), "lr": 1e-3, "batch_size": 64, - "data_dir": data_dir } + scheduler = PopulationBasedTraining( time_attr="training_iteration", metric="loss", @@ -235,16 +258,24 @@ def tune_mnist_pbt(): "lr": lambda: tune.loguniform(1e-4, 1e-1).func(None), "batch_size": [32, 64, 128] }) + reporter = CLIReporter( parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"], metric_columns=["loss", "mean_accuracy", "training_iteration"]) + tune.run( - train_mnist_tune_checkpoint, - resources_per_trial={"cpu": 1}, + partial( + train_mnist_tune_checkpoint, + data_dir=data_dir, + num_epochs=num_epochs, + num_gpus=gpus_per_trial), + resources_per_trial={"cpu": 1, "gpu": gpus_per_trial}, config=config, - num_samples=10, + num_samples=num_samples, scheduler=scheduler, - progress_reporter=reporter) + progress_reporter=reporter, + name="tune_mnist_pbt") + shutil.rmtree(data_dir) # __tune_pbt_end__ @@ -258,7 +289,10 @@ if __name__ == "__main__": args, _ = parser.parse_known_args() if args.smoke_test: - tune_mnist_asha(1, 1) + tune_mnist_asha(num_samples=1, num_epochs=1, gpus_per_trial=0) + tune_mnist_pbt(num_samples=1, num_epochs=1, gpus_per_trial=0) else: - tune_mnist_asha() # ASHA scheduler - tune_mnist_pbt() # population based training + # ASHA scheduler + tune_mnist_asha(num_samples=10, 
num_epochs=10, gpus_per_trial=0) + # Population based training + tune_mnist_pbt(num_samples=10, num_epochs=10, gpus_per_trial=0) diff --git a/python/ray/tune/sample.py b/python/ray/tune/sample.py index bb8c9cb55..dc952f7b5 100644 --- a/python/ray/tune/sample.py +++ b/python/ray/tune/sample.py @@ -1,4 +1,6 @@ import logging +import random + import numpy as np logger = logging.getLogger(__name__) @@ -56,13 +58,13 @@ def loguniform(min_bound, max_bound, base=10): def choice(*args, **kwargs): - """Wraps tune.sample_from around ``np.random.choice``. + """Wraps tune.sample_from around ``random.choice``. - ``tune.choice(10)`` is equivalent to - ``tune.sample_from(lambda _: np.random.choice(10))`` + ``tune.choice([1, 2])`` is equivalent to + ``tune.sample_from(lambda _: random.choice([1, 2]))`` """ - return sample_from(lambda _: np.random.choice(*args, **kwargs)) + return sample_from(lambda _: random.choice(*args, **kwargs)) def randint(*args, **kwargs):