From fa7eecf48a208329c0d42d0329be9b14ec023dad Mon Sep 17 00:00:00 2001
From: Richard Liaw <rliaw@berkeley.edu>
Date: Tue, 21 Apr 2020 01:01:04 -0700
Subject: [PATCH] [sgd] Avoid parameter "gotcha" for learning rate scheduler
 (#8107)

* with-scheduler-creator

* none

* add_freq

* runner

* torch
---
 python/ray/util/sgd/tests/test_torch.py       | 38 ++++++++++++-------
 python/ray/util/sgd/torch/constants.py        |  5 ++-
 .../torch/examples/raysgd_torch_signatures.py |  1 +
 python/ray/util/sgd/torch/torch_runner.py     |  2 +-
 python/ray/util/sgd/torch/torch_trainer.py    | 22 ++++++-----
 5 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/python/ray/util/sgd/tests/test_torch.py b/python/ray/util/sgd/tests/test_torch.py
index 8d67892f4..0eec95a02 100644
--- a/python/ray/util/sgd/tests/test_torch.py
+++ b/python/ray/util/sgd/tests/test_torch.py
@@ -248,6 +248,7 @@ def test_multi_model_matrix(ray_start_2_cpus, num_workers):  # noqa: F811
                     optimizer_creator=multi_optimizer_creator,
                     loss_creator=nn.MSELoss,
                     scheduler_creator=multi_scheduler_creator,
+                    scheduler_step_freq="epoch",
                     training_operator_cls=_TestingOperator,
                     num_workers=num_workers,
                     config={
@@ -260,7 +261,7 @@ def test_multi_model_matrix(ray_start_2_cpus, num_workers):  # noqa: F811
                 trainer.shutdown()
 
 
-@pytest.mark.parametrize("scheduler_freq", ["epoch", "batch"])
+@pytest.mark.parametrize("scheduler_freq", ["epoch", "batch", "manual", None])
 def test_scheduler_freq(ray_start_2_cpus, scheduler_freq):  # noqa: F811
     def train_epoch(self, iterator, info):
         assert info[SCHEDULER_STEP] == scheduler_freq
@@ -270,19 +271,29 @@ def test_scheduler_freq(ray_start_2_cpus, scheduler_freq):  # noqa: F811
         return torch.optim.lr_scheduler.StepLR(
             optimizer, step_size=30, gamma=0.1)
 
-    trainer = TorchTrainer(
-        model_creator=model_creator,
-        data_creator=data_creator,
-        optimizer_creator=optimizer_creator,
-        loss_creator=lambda config: nn.MSELoss(),
-        config={"custom_func": train_epoch},
-        training_operator_cls=_TestingOperator,
-        scheduler_creator=scheduler_creator,
-        scheduler_step_freq=scheduler_freq)
+    if scheduler_freq is None:
+        with pytest.raises(ValueError):
+            trainer = TorchTrainer(
+                model_creator=model_creator,
+                data_creator=data_creator,
+                optimizer_creator=optimizer_creator,
+                loss_creator=lambda config: nn.MSELoss(),
+                scheduler_creator=scheduler_creator,
+                scheduler_step_freq=scheduler_freq)
+    else:
+        trainer = TorchTrainer(
+            model_creator=model_creator,
+            data_creator=data_creator,
+            optimizer_creator=optimizer_creator,
+            loss_creator=lambda config: nn.MSELoss(),
+            config={"custom_func": train_epoch},
+            training_operator_cls=_TestingOperator,
+            scheduler_creator=scheduler_creator,
+            scheduler_step_freq=scheduler_freq)
 
-    for i in range(3):
-        trainer.train()
-    trainer.shutdown()
+        for i in range(3):
+            trainer.train()
+        trainer.shutdown()
 
 
 def test_profiling(ray_start_2_cpus):  # noqa: F811
@@ -459,6 +470,7 @@ def test_scheduler_validate(ray_start_2_cpus):  # noqa: F811
         optimizer_creator=optimizer_creator,
         loss_creator=lambda config: nn.MSELoss(),
         scheduler_creator=lambda optimizer, cfg: ReduceLROnPlateau(optimizer),
+        scheduler_step_freq="manual",
         training_operator_cls=_TestingOperator)
     trainer.update_scheduler(0.5)
     trainer.update_scheduler(0.5)
diff --git a/python/ray/util/sgd/torch/constants.py b/python/ray/util/sgd/torch/constants.py
index 2cf774a25..cf3a7dc8f 100644
--- a/python/ray/util/sgd/torch/constants.py
+++ b/python/ray/util/sgd/torch/constants.py
@@ -5,6 +5,9 @@ NUM_STEPS = "__num_steps__"
 SCHEDULER_STEP = "scheduler_step"
 SCHEDULER_STEP_BATCH = "batch"
 SCHEDULER_STEP_EPOCH = "epoch"
+SCHEDULER_STEP_MANUAL = "manual"
 NCCL_TIMEOUT_S = env_integer("NCCL_TIMEOUT_S", 10)
 
-VALID_SCHEDULER_STEP = {SCHEDULER_STEP_BATCH, SCHEDULER_STEP_EPOCH}
+VALID_SCHEDULER_STEP = {
+    SCHEDULER_STEP_BATCH, SCHEDULER_STEP_EPOCH, SCHEDULER_STEP_MANUAL
+}
diff --git a/python/ray/util/sgd/torch/examples/raysgd_torch_signatures.py b/python/ray/util/sgd/torch/examples/raysgd_torch_signatures.py
index 657e68ce5..7d0f819e1 100644
--- a/python/ray/util/sgd/torch/examples/raysgd_torch_signatures.py
+++ b/python/ray/util/sgd/torch/examples/raysgd_torch_signatures.py
@@ -119,6 +119,7 @@ trainer = TorchTrainer(
     optimizer_creator=optimizer_creator,
     loss_creator=nn.MSELoss,
     scheduler_creator=scheduler_creator,
+    scheduler_step_freq="epoch",  # required if scheduler_creator is set
     config={"lr": 0.001, "batch_size": 64})
 
 # __torch_trainer_end__
diff --git a/python/ray/util/sgd/torch/torch_runner.py b/python/ray/util/sgd/torch/torch_runner.py
index 2ce819d3e..84dbf216b 100644
--- a/python/ray/util/sgd/torch/torch_runner.py
+++ b/python/ray/util/sgd/torch/torch_runner.py
@@ -44,7 +44,7 @@ class TorchRunner:
                  use_fp16=False,
                  use_tqdm=False,
                  apex_args=None,
-                 scheduler_step_freq="batch"):
+                 scheduler_step_freq=None):
         self.model_creator = model_creator
         self.optimizer_creator = optimizer_creator
         self.loss_creator = loss_creator
diff --git a/python/ray/util/sgd/torch/torch_trainer.py b/python/ray/util/sgd/torch/torch_trainer.py
index 1b4e5f738..2fbb043cd 100644
--- a/python/ray/util/sgd/torch/torch_trainer.py
+++ b/python/ray/util/sgd/torch/torch_trainer.py
@@ -23,11 +23,10 @@ RESIZE_COOLDOWN_S = 10
 
 
 def _validate_scheduler_step_freq(scheduler_step_freq):
-    if scheduler_step_freq:
-        if scheduler_step_freq not in VALID_SCHEDULER_STEP:
-            raise ValueError(
-                "Scheduler step freq must be in {}. Got {}".format(
-                    VALID_SCHEDULER_STEP, scheduler_step_freq))
+    """This validation check only happens if a scheduler is passed in."""
+    if scheduler_step_freq not in VALID_SCHEDULER_STEP:
+        raise ValueError("Scheduler step freq must be in {}. Got {}".format(
+            VALID_SCHEDULER_STEP, scheduler_step_freq))
 
 
 def _remind_gpu_usage(use_gpu):
@@ -148,10 +147,13 @@ class TorchTrainer:
             See https://nvidia.github.io/apex/amp.html#module-apex.amp. By
             default, the models and optimizers are passed in. Consider using
             "num_losses" if operating over multiple models and optimizers.
-        scheduler_step_freq: "batch", "epoch", or None. This will
+        scheduler_step_freq: "batch", "epoch", "manual", or None. This will
             determine when ``scheduler.step`` is called. If "batch",
             ``step`` will be called after every optimizer step. If "epoch",
-            ``step`` will be called after one pass of the DataLoader.
+            ``step`` will be called after one pass of the DataLoader. If
+            "manual", the scheduler will not be incremented automatically -
+            you are expected to call ``trainer.update_scheduler`` manually.
+            If a scheduler is passed in, this value must not be None.
 
     """
 
@@ -180,7 +182,7 @@ class TorchTrainer:
             use_tqdm=False,
             apex_args=None,
             add_dist_sampler=True,
-            scheduler_step_freq="batch",
+            scheduler_step_freq=None,
             num_replicas=None,
             batch_size=None,
             data_loader_args=None,
@@ -259,7 +261,9 @@ class TorchTrainer:
         self.local_worker = DeactivatedRunner()
         self.remote_workers = []
 
-        _validate_scheduler_step_freq(scheduler_step_freq)
+        if scheduler_creator:
+            _validate_scheduler_step_freq(scheduler_step_freq)
+
         self.scheduler_step_freq = scheduler_step_freq
 
         if not ray.is_initialized() and self.max_replicas > 1: