From 1b357533b14b9f57442a224aa74b450de39d82c2 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Sat, 24 Oct 2020 01:08:46 -0700 Subject: [PATCH] [tune] Try to enable PTL, SKlearn tests (#11542) --- ci/travis/install-dependencies.sh | 15 +++++------ doc/BUILD | 16 +++++------ doc/source/tune/_tutorials/tune-sklearn.py | 4 ++- python/ray/tune/integration/horovod.py | 1 + .../test_integration_pytorch_lightning.py | 2 +- python/ray/util/sgd/BUILD | 2 +- python/ray/util/sgd/tests/test_ptl.py | 6 +++++ python/ray/util/sgd/tf/tf_runner.py | 3 +-- .../sgd/torch/distributed_torch_runner.py | 2 +- python/ray/util/sgd/torch/ptl_operator.py | 27 ++++++++++++------- python/ray/util/sgd/torch/utils.py | 2 +- python/requirements_tune.txt | 2 +- 12 files changed, 49 insertions(+), 33 deletions(-) diff --git a/ci/travis/install-dependencies.sh b/ci/travis/install-dependencies.sh index 64cd380d8..0193f4835 100755 --- a/ci/travis/install-dependencies.sh +++ b/ci/travis/install-dependencies.sh @@ -295,18 +295,10 @@ install_dependencies() { pip install -r "${WORKSPACE_DIR}"/python/requirements_tune.txt fi - # Additional Tune dependency for Horovod. - if [ "${INSTALL_HOROVOD-}" = 1 ]; then - # TODO: eventually pin this to master. - HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install -U git+https://github.com/horovod/horovod.git - fi - # Additional RaySGD test dependencies. if [ "${SGD_TESTING-}" = 1 ]; then pip install -r "${WORKSPACE_DIR}"/python/requirements_tune.txt # TODO: eventually have a separate requirements file for Ray SGD. - # Fix PTL version to 0.10 for now. - pip install -U pytorch-lightning==0.10.0 fi # Additional Doc test dependencies. @@ -328,6 +320,13 @@ install_dependencies() { tensorflow=="${TF_VERSION-2.2.0}" gym fi + # Additional Tune dependency for Horovod. + # This must be run last (i.e., torch cannot be re-installed after this) + if [ "${INSTALL_HOROVOD-}" = 1 ]; then + # TODO: eventually pin this to master. + HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install -U git+https://github.com/horovod/horovod.git + fi + if [ -n "${PYTHON-}" ] || [ -n "${LINT-}" ] || [ "${MAC_WHEELS-}" = 1 ]; then install_node fi diff --git a/doc/BUILD b/doc/BUILD index 0aff5fcac..de5d1e2eb 100644 --- a/doc/BUILD +++ b/doc/BUILD @@ -54,14 +54,14 @@ py_test( # Please keep these sorted alphabetically. # -------------------------------------------------------------------- -# py_test( -# name = "tune_sklearn", -# size = "medium", -# main = "source/tune/_tutorials/tune-sklearn.py", -# srcs = ["source/tune/_tutorials/tune-sklearn.py"], -# tags = ["exclusive", "example"], -# args = ["--smoke-test"] -# ) +py_test( + name = "tune_sklearn", + size = "medium", + main = "source/tune/_tutorials/tune-sklearn.py", + srcs = ["source/tune/_tutorials/tune-sklearn.py"], + tags = ["exclusive", "example"], + args = ["--smoke-test"] +) py_test( name = "tune_serve_integration_mnist", diff --git a/doc/source/tune/_tutorials/tune-sklearn.py b/doc/source/tune/_tutorials/tune-sklearn.py index fb38c7597..2f81235c1 100644 --- a/doc/source/tune/_tutorials/tune-sklearn.py +++ b/doc/source/tune/_tutorials/tune-sklearn.py @@ -36,7 +36,9 @@ Let's compare Tune's Scikit-Learn APIs to the standard scikit-learn GridSearchCV To start out, change the import statement to get tune-scikit-learn’s grid search cross validation interface: """ -# from sklearn.model_selection import GridSearchCV +# Keep this here for https://github.com/ray-project/ray/issues/11547 +from sklearn.model_selection import GridSearchCV +# Replace above line with: from ray.tune.sklearn import TuneGridSearchCV ####################################################################### diff --git a/python/ray/tune/integration/horovod.py b/python/ray/tune/integration/horovod.py index e3603de2b..d1eda4070 100644 --- a/python/ray/tune/integration/horovod.py +++ b/python/ray/tune/integration/horovod.py @@ -18,6 +18,7 @@ logger = logging.getLogger(__name__) def get_rank() -> str: + """Returns rank of worker.""" return os.environ["HOROVOD_RANK"] diff --git a/python/ray/tune/tests/test_integration_pytorch_lightning.py b/python/ray/tune/tests/test_integration_pytorch_lightning.py index c1ec09f1d..2d550aba9 100644 --- a/python/ray/tune/tests/test_integration_pytorch_lightning.py +++ b/python/ray/tune/tests/test_integration_pytorch_lightning.py @@ -155,4 +155,4 @@ class PyTorchLightningIntegrationTest(unittest.TestCase): if __name__ == "__main__": import pytest import sys - sys.exit(pytest.main(["-v", __file__])) + sys.exit(pytest.main(sys.argv[1:] + ["-v", __file__])) diff --git a/python/ray/util/sgd/BUILD b/python/ray/util/sgd/BUILD index 31191ac7a..896560136 100644 --- a/python/ray/util/sgd/BUILD +++ b/python/ray/util/sgd/BUILD @@ -4,7 +4,7 @@ # -------------------------------------------------------------------- py_test( name = "test_ptl", - size = "small", + size = "large", srcs = ["tests/test_ptl.py"], tags = ["exclusive", "pytorch-lightning", "pytorch"], deps = [":sgd_lib"], diff --git a/python/ray/util/sgd/tests/test_ptl.py b/python/ray/util/sgd/tests/test_ptl.py index e0e4dd3db..a70a8cb3f 100644 --- a/python/ray/util/sgd/tests/test_ptl.py +++ b/python/ray/util/sgd/tests/test_ptl.py @@ -209,3 +209,9 @@ def test_correctness(ray_start_2_cpus, num_workers, use_local): assert train1_stats["train_loss"] == train2_stats["train_loss"] assert val1_stats["val_loss"] == val2_stats["val_loss"] assert val1_stats["val_acc"] == val2_stats["val_accuracy"] + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(sys.argv[1:] + ["-v", __file__])) diff --git a/python/ray/util/sgd/tf/tf_runner.py b/python/ray/util/sgd/tf/tf_runner.py index 0b56465f5..1e6c9f909 100644 --- a/python/ray/util/sgd/tf/tf_runner.py +++ b/python/ray/util/sgd/tf/tf_runner.py @@ -3,7 +3,6 @@ import json import os import ray -import ray._private.services from ray.util.sgd import utils logger = logging.getLogger(__name__) @@ -148,7 +147,7 @@ class TFRunner: def get_node_ip(self): """Returns the IP address of the current node.""" - return ray._private.services.get_node_ip_address() + return ray.services.get_node_ip_address() def find_free_port(self): """Finds a free port on the current node.""" diff --git a/python/ray/util/sgd/torch/distributed_torch_runner.py b/python/ray/util/sgd/torch/distributed_torch_runner.py index fc9015b68..fd338c9cd 100644 --- a/python/ray/util/sgd/torch/distributed_torch_runner.py +++ b/python/ray/util/sgd/torch/distributed_torch_runner.py @@ -168,7 +168,7 @@ def clear_dummy_actor(): def reserve_resources(num_cpus, num_gpus, retries=20): - ip = ray._private.services.get_node_ip_address() + ip = ray.services.get_node_ip_address() reserved_cuda_device = None diff --git a/python/ray/util/sgd/torch/ptl_operator.py b/python/ray/util/sgd/torch/ptl_operator.py index a484de7f7..3403f2559 100644 --- a/python/ray/util/sgd/torch/ptl_operator.py +++ b/python/ray/util/sgd/torch/ptl_operator.py @@ -57,9 +57,12 @@ class LightningOperator(TrainingOperator, TrainerModelHooksMixin, """Returns list of scheduler dictionaries. List is empty if no schedulers are returned in the - configure_optimizers method of your LightningModule. Default - configuration is used if configure_optimizers returns scheduler - objects instead of scheduler dicts. See + configure_optimizers method of your LightningModule. + + Default configuration is used if configure_optimizers + returns scheduler objects. + + See https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html#configure-optimizers """ return self._scheduler_dicts @@ -266,7 +269,8 @@ class LightningOperator(TrainingOperator, TrainerModelHooksMixin, return_output = meter_collection.summary() if self.is_function_implemented("on_train_epoch_end", model): - model.on_train_epoch_end() + model.on_train_epoch_end( + [eo.get("raw_output") for eo in epoch_outputs]) for s_dict, scheduler in zip(self.scheduler_dicts, self.schedulers): if s_dict["interval"] == SCHEDULER_STEP_EPOCH: @@ -345,10 +349,9 @@ class LightningOperator(TrainingOperator, TrainerModelHooksMixin, with self.timers.record("grad"): if self.use_fp16: with self._amp.scale_loss(loss, optimizer) as scaled_loss: - model.backward( - self, scaled_loss, optimizer, optimizer_idx=0) + model.backward(scaled_loss, optimizer, optimizer_idx=0) else: - model.backward(self, loss, optimizer, optimizer_idx=0) + model.backward(loss, optimizer, optimizer_idx=0) if self.is_function_implemented("on_after_backward", model): model.on_after_backward() @@ -370,7 +373,10 @@ class LightningOperator(TrainingOperator, TrainerModelHooksMixin, if self.is_function_implemented("on_train_batch_end", model): model.on_train_batch_end( - batch=batch, batch_idx=batch_idx, dataloader_idx=0) + outputs=output, + batch=batch, + batch_idx=batch_idx, + dataloader_idx=0) return { "signal": 0, @@ -468,7 +474,10 @@ class LightningOperator(TrainingOperator, TrainerModelHooksMixin, if self.is_function_implemented("on_validation_batch_end", model): model.on_validation_batch_end( - batch=batch, batch_idx=batch_idx, dataloader_idx=0) + outputs=output, + batch=batch, + batch_idx=batch_idx, + dataloader_idx=0) return { "raw_output": output, # NUM_SAMPLES: len(batch) diff --git a/python/ray/util/sgd/torch/utils.py b/python/ray/util/sgd/torch/utils.py index aaef617cf..c255a6e91 100644 --- a/python/ray/util/sgd/torch/utils.py +++ b/python/ray/util/sgd/torch/utils.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) def setup_address(): - ip = ray._private.services.get_node_ip_address() + ip = ray.services.get_node_ip_address() port = find_free_port() return f"tcp://{ip}:{port}" diff --git a/python/requirements_tune.txt b/python/requirements_tune.txt index e7bb4134d..50979f55e 100644 --- a/python/requirements_tune.txt +++ b/python/requirements_tune.txt @@ -24,7 +24,7 @@ sigopt smart_open tensorflow-probability timm -torch>=1.5.0 +torch>=1.6.0 torchvision>=0.6.0 # transformers git+git://github.com/huggingface/transformers.git@bdcc4b78a27775d1ec8f3fd297cb679c257289db#transformers