diff --git a/doc/source/conf.py b/doc/source/conf.py index d710fe7d7..5df6e8be1 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -44,6 +44,13 @@ MOCK_MODULES = [ "mxnet", "mxnet.model", "psutil", + "pytorch_lightning.core.step_result", + "pytorch_lightning.overrides.data_parallel", + "pytorch_lightning.utilities.model_utils", + "pytorch_lightning.trainer.model_hooks", + "pytorch_lightning.trainer.optimizers", + "pytorch_lightning.utilities.exceptions", + "pytorch_lightning.utilities.memory", "ray._raylet", "ray.core.generated", "ray.core.generated.common_pb2", @@ -76,6 +83,7 @@ MOCK_MODULES = [ "wandb", "zoopt", ] + import scipy.stats import scipy.linalg @@ -87,6 +95,30 @@ sys.modules["tensorflow"].VERSION = "9.9.9" sys.modules["tensorflow.keras.callbacks"] = ChildClassMock() sys.modules["pytorch_lightning"] = ChildClassMock() + +class SimpleClass(object): + pass + + +class SimpleClass2(object): + pass + + +# ray.util.sgd.torch.lightning_operator.LightningOperator extends +# TrainingOperator, pytorch_lightning.TrainerOptimizersMixin, +# and pytorch_lightning.TrainerModelHooksMixin. +# But, we are mocking all pytorch_lightning modules, causing the ptl base +# classes to have a different metaclass than TrainingOperator. +# To fix this, we replace the base classes with dummy classes that extend +# object. +# We have to create 2 dummy classes, one for TrainerOptimizersMixin and one +# for TrainerModelHooksMixin so that we don't extend from the same base +# class twice. +setattr(sys.modules["pytorch_lightning.trainer.optimizers"], + "TrainerOptimizersMixin", SimpleClass) +setattr(sys.modules["pytorch_lightning.trainer.model_hooks"], + "TrainerModelHooksMixin", SimpleClass2) + # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
diff --git a/doc/source/images/sgd_ptl.png b/doc/source/images/sgd_ptl.png new file mode 100644 index 000000000..232f6e0d7 Binary files /dev/null and b/doc/source/images/sgd_ptl.png differ diff --git a/doc/source/index.rst b/doc/source/index.rst index a2fd85b34..8d1e1aed8 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -274,6 +274,7 @@ Papers raysgd/raysgd_pytorch.rst raysgd/raysgd_tensorflow.rst raysgd/raysgd_dataset.rst + raysgd/raysgd_ptl.rst raysgd/raysgd_tune.rst raysgd/raysgd_ref.rst diff --git a/doc/source/raysgd/raysgd_ptl.rst b/doc/source/raysgd/raysgd_ptl.rst new file mode 100644 index 000000000..6a1e3a3da --- /dev/null +++ b/doc/source/raysgd/raysgd_ptl.rst @@ -0,0 +1,117 @@ +Pytorch Lightning with RaySGD +============================== +.. image:: /images/sgd_ptl.png + :align: center + :scale: 50 % + + +RaySGD includes an integration with Pytorch Lightning's `LightningModule `_. +Easily take your existing ``LightningModule``, and use it with Ray SGD's ``TorchTrainer`` to take advantage of all of Ray SGD's distributed training features with minimal code changes. + +.. tip:: This LightningModule integration is currently under active development. If you encounter any bugs, please raise an issue on `Github `_! + +.. note:: Not all Pytorch Lightning features are supported. A full list of unsupported model hooks is given :ref:`below <ptl-unsupported-features>`. Please post any feature requests on `Github `_ and we will get to it shortly! + +.. contents:: + :local: + +Quick Start +----------- +Step 1: Define your ``LightningModule`` just like how you would with Pytorch Lightning. + +.. code-block:: python + + from pytorch_lightning.core.lightning import LightningModule + + class MyLightningModule(LightningModule): + ... + +Step 2: Use the ``TrainingOperator.from_ptl`` method to convert the ``LightningModule`` to a Ray SGD compatible ``LightningOperator``. + +.. 
code-block:: python + + from ray.util.sgd.torch import TrainingOperator + + MyLightningOperator = TrainingOperator.from_ptl(MyLightningModule) + +Step 3: Use the Operator with Ray SGD's ``TorchTrainer``, just like how you would normally. See :ref:`torch-guide` for a more complete guide to ``TorchTrainer``. + +.. code-block:: python + + import ray + from ray.util.sgd.torch import TorchTrainer + + ray.init() + trainer = TorchTrainer(training_operator_cls=MyLightningOperator, num_workers=4, use_gpu=True) + train_stats = trainer.train() + +And that's it! For a more comprehensive guide, see the MNIST tutorial :ref:`below <ptl-mnist>`. + +.. _ptl-mnist: + +MNIST Tutorial +-------------- +In this walkthrough we will go through how to train an MNIST classifier with Pytorch Lightning's ``LightningModule`` and Ray SGD. + +We will follow `this tutorial from the PyTorch Lightning documentation +`_ for specifying our MNIST LightningModule. + +Setup / Imports +~~~~~~~~~~~~~~~ +Let's start with some basic imports: + +.. literalinclude:: /../../python/ray/util/sgd/torch/examples/pytorch-lightning/mnist-ptl.py + :language: python + :start-after: __import_begin__ + :end-before: __import_end__ + +Most of these imports are needed for building our Pytorch model and training components. +Only a few additional imports are needed for Ray and Pytorch Lightning. + +MNIST LightningModule +~~~~~~~~~~~~~~~~~~~~~ +We now define our Pytorch Lightning ``LightningModule``: + +.. literalinclude:: /../../python/ray/util/sgd/torch/examples/pytorch-lightning/mnist-ptl.py + :language: python + :start-after: __ptl_begin__ + :end-before: __ptl_end__ + +This is the same code that would normally be used in Pytorch Lightning, and is taken directly from `this PTL guide `_. +The only difference here is that the ``__init__`` method can optionally take in a ``config`` argument, +as a way to pass in hyperparameters to your model, optimizer, or schedulers. The ``config`` will be passed in directly from +the TorchTrainer. 
Or if using Ray SGD in conjunction with Tune (:ref:`raysgd-tune`), it will come directly from the config in your +``tune.run`` call. + +Training with Ray SGD +~~~~~~~~~~~~~~~~~~~~~ +We now can define our training function using our LitMNIST module and Ray SGD. + +.. literalinclude:: /../../python/ray/util/sgd/torch/examples/pytorch-lightning/mnist-ptl.py + :language: python + :start-after: __train_begin__ + :end-before: __train_end__ + +With just a single ``from_ptl`` call, we can convert our LightningModule to a ``TrainingOperator`` class that's compatible +with Ray SGD. Now we can take full advantage of all of Ray SGD's distributed training features without having to rewrite our existing +LightningModule. + +The last thing to do is initialize Ray, and run our training function! + +.. code-block:: python + + # Use ray.init(address="auto") if running on a Ray cluster. + ray.init() + train_mnist(num_workers=32, use_gpu=True, num_epochs=5) + +.. _ptl-unsupported-features: + +Unsupported Features +-------------------- +This integration is currently under active development, so not all Pytorch Lightning features are supported. +Please post any feature requests on `Github +`_ and we will get to it shortly! + +A list of unsupported model hooks (as of v1.0.0) is as follows: +``test_dataloader``, ``on_test_batch_start``, ``on_test_epoch_start``, ``on_test_batch_end``, ``on_test_epoch_end``, +``get_progress_bar_dict``, ``on_fit_end``, ``on_pretrain_routine_end``, ``manual_backward``, ``tbtt_split_batch``. diff --git a/doc/source/raysgd/raysgd_pytorch.rst b/doc/source/raysgd/raysgd_pytorch.rst index 093a24a79..17e79896d 100644 --- a/doc/source/raysgd/raysgd_pytorch.rst +++ b/doc/source/raysgd/raysgd_pytorch.rst @@ -1,3 +1,5 @@ +.. 
_torch-guide: + Distributed PyTorch =================== @@ -467,7 +469,6 @@ After connecting, you can scale up the number of workers seamlessly across multi trainer.train() model = trainer.get_model() - Advanced: Fault Tolerance ------------------------- diff --git a/doc/source/raysgd/raysgd_ref.rst b/doc/source/raysgd/raysgd_ref.rst index 692571a36..0f72ab53f 100644 --- a/doc/source/raysgd/raysgd_ref.rst +++ b/doc/source/raysgd/raysgd_ref.rst @@ -1,10 +1,13 @@ -RaySGD API Documentation -======================== +RaySGD API Reference +==================== + +PyTorch +------- .. _ref-torch-trainer: TorchTrainer ------------- +~~~~~~~~~~~~ .. autoclass:: ray.util.sgd.torch.TorchTrainer :members: @@ -12,40 +15,66 @@ TorchTrainer .. _ref-torch-operator: PyTorch TrainingOperator ------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: ray.util.sgd.torch.TrainingOperator :members: +.. _ref-creator-operator: + +CreatorOperator +~~~~~~~~~~~~~~~~ + +.. autoclass:: ray.util.sgd.torch.training_operator.CreatorOperator + :members: + :exclude-members: setup + +.. _ref-lightning-operator: + +Pytorch Lightning LightningOperator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: ray.util.sgd.torch.lightning_operator.LightningOperator + :members: + :exclude-members: setup, train_epoch, train_batch, validate, validate_batch, state_dict, load_state_dict + .. _BaseTorchTrainable-doc: BaseTorchTrainable ------------------- +~~~~~~~~~~~~~~~~~~ .. autoclass:: ray.util.sgd.torch.BaseTorchTrainable :members: :private-members: +Tensorflow +---------- + TFTrainer ---------- +~~~~~~~~~ .. autoclass:: ray.util.sgd.tf.TFTrainer :members: .. automethod:: __init__ +RaySGD Dataset +--------------- + Dataset -------- +~~~~~~~ .. autoclass:: ray.util.sgd.data.Dataset :members: .. automethod:: __init__ +RaySGD Utils +------------- .. _ref-utils: Utils ------ +~~~~~ .. 
autoclass:: ray.util.sgd.utils.AverageMeter :members: diff --git a/doc/source/raysgd/raysgd_tune.rst b/doc/source/raysgd/raysgd_tune.rst index 0826ad97b..cacaea0a2 100644 --- a/doc/source/raysgd/raysgd_tune.rst +++ b/doc/source/raysgd/raysgd_tune.rst @@ -1,3 +1,5 @@ +.. _raysgd-tune: + RaySGD Hyperparameter Tuning ============================ diff --git a/python/ray/util/sgd/torch/examples/pytorch-lightning/mnist-ptl.py b/python/ray/util/sgd/torch/examples/pytorch-lightning/mnist-ptl.py index da7731403..031975f19 100644 --- a/python/ray/util/sgd/torch/examples/pytorch-lightning/mnist-ptl.py +++ b/python/ray/util/sgd/torch/examples/pytorch-lightning/mnist-ptl.py @@ -1,18 +1,29 @@ import argparse +# __import_begin__ +import os + +# Pytorch imports import torch -from ray.util.sgd import TorchTrainer -from ray.util.sgd.torch import TrainingOperator -from torch.nn import functional as F -from pytorch_lightning.core.lightning import LightningModule from torch.optim import Adam from torch.utils.data import DataLoader, random_split -from torchvision.datasets import MNIST -import os +from torch.nn import functional as F from torchvision import transforms +from torchvision.datasets import MNIST + +# Ray imports +from ray.util.sgd import TorchTrainer +from ray.util.sgd.torch import TrainingOperator + +# PTL imports +from pytorch_lightning.core.lightning import LightningModule + +# __import_end__ +# __ptl_begin__ class LitMNIST(LightningModule): + # We take in an additional config parameter here. But this is not required. 
def __init__(self, config): super().__init__() @@ -77,6 +88,10 @@ class LitMNIST(LightningModule): return {"val_loss": loss.item(), "val_acc": num_correct / num_samples} +# __ptl_end__ + + +# __train_begin__ def train_mnist(num_workers=1, use_gpu=False, num_epochs=5): Operator = TrainingOperator.from_ptl(LitMNIST) trainer = TorchTrainer( @@ -101,6 +116,8 @@ def train_mnist(num_workers=1, use_gpu=False, num_epochs=5): print("success!") +# __train_end__ + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( diff --git a/python/ray/util/sgd/torch/ptl_operator.py b/python/ray/util/sgd/torch/lightning_operator.py similarity index 96% rename from python/ray/util/sgd/torch/ptl_operator.py rename to python/ray/util/sgd/torch/lightning_operator.py index 46d3a09d0..610ed92cd 100644 --- a/python/ray/util/sgd/torch/ptl_operator.py +++ b/python/ray/util/sgd/torch/lightning_operator.py @@ -27,6 +27,24 @@ logger = logging.getLogger(__name__) class LightningOperator(TrainingOperator, TrainerModelHooksMixin, TrainerOptimizersMixin): + """A subclass of TrainingOperator created from a PTL ``LightningModule``. + + This class is returned by `TrainingOperator.from_ptl` and its training + state is defined by the Pytorch Lightning ``LightningModule`` that is + passed into `from_ptl`. Training and validation functionality have + already been implemented according to + Pytorch Lightning's Trainer. But if you need to modify training, + you should subclass this class and override the appropriate methods + before passing in the subclass to `TorchTrainer`. + + .. code-block:: python + + MyLightningOperator = TrainingOperator.from_ptl( + MyLightningModule) + trainer = TorchTrainer(training_operator_cls=MyLightningOperator, + ...) 
+ """ + def _configure_amp(self, amp, models, optimizers, apex_args=None): assert len(models) == 1 model = models[0] @@ -356,11 +374,7 @@ class LightningOperator(TrainingOperator, TrainerModelHooksMixin, model.on_after_backward() with self.timers.record("apply"): - model.optimizer_step( - epoch=epoch_idx, - batch_idx=batch_idx, - optimizer=optimizer, - optimizer_idx=0) + optimizer.step() model.on_before_zero_grad(optimizer) diff --git a/python/ray/util/sgd/torch/training_operator.py b/python/ray/util/sgd/torch/training_operator.py index 595d7adf0..1c59c68b2 100644 --- a/python/ray/util/sgd/torch/training_operator.py +++ b/python/ray/util/sgd/torch/training_operator.py @@ -777,7 +777,14 @@ class TrainingOperator: lightning_module_cls, train_dataloader=None, val_dataloader=None): - """Creates a TrainingOperator from a Pytorch Lightning Module. + """Create a custom TrainingOperator class from a LightningModule. + + .. code-block:: python + + MyLightningOperator = TrainingOperator.from_ptl( + MyLightningModule) + trainer = TorchTrainer(training_operator_cls=MyLightningOperator, + ...) Args: lightning_module_cls: Your LightningModule class. An object of @@ -793,7 +800,7 @@ class TrainingOperator: A TrainingOperator class properly configured given the LightningModule. """ - from ray.util.sgd.torch.ptl_operator import LightningOperator + from ray.util.sgd.torch.lightning_operator import LightningOperator class CustomLightningOperator(LightningOperator): _lightning_module_cls = lightning_module_cls @@ -810,12 +817,20 @@ class TrainingOperator: loss_creator=None, scheduler_creator=None, serialize_data_creation=True): - """A utility method to create a custom TrainingOperator class from - creator functions. This is useful for backwards compatibility with + """Create a custom TrainingOperator class from creator functions. + + This method is useful for backwards compatibility with previous versions of Ray. 
To provide custom training and validation, you should subclass the class that is returned by this method instead of ``TrainingOperator``. + .. code-block:: python + + MyCreatorOperator = TrainingOperator.from_creators( + model_creator, optimizer_creator) + trainer = TorchTrainer(training_operator_cls=MyCreatorOperator, + ...) + Args: model_creator (dict -> Model(s)): Constructor function that takes in config and returns the model(s) to be optimized. These @@ -853,8 +868,8 @@ class TrainingOperator: system). Defaults to True. Returns: - A TrainingOperator class with a ``setup`` method that utilizes - the passed in creator functions. + A CreatorOperator class, a subclass of TrainingOperator with a + ``setup`` method that utilizes the passed in creator functions. """ if not (callable(model_creator) and callable(optimizer_creator)): @@ -929,8 +944,21 @@ class TrainingOperator: class CreatorOperator(TrainingOperator): - """A subclass of TrainingOperator specifically for defining training - state using creator functions. + """A subclass of TrainingOperator with training defined by creator funcs. + + This class allows for backwards compatibility with pre-1.0 versions of Ray. + + This class is returned by `TrainingOperator.from_creators(...)`. If you + need to add custom functionality, you should subclass this class, + implement the appropriate methods and pass the subclass into + `TorchTrainer`. + + .. code-block:: python + + MyCreatorOperator = TrainingOperator.from_creators( + model_creator, optimizer_creator) + trainer = TorchTrainer(training_operator_cls=MyCreatorOperator, + ...) """ def _validate_loaders(self, loaders):