From 58efec0f2bc7e8de2ea9b89ec638d4a0a2d60537 Mon Sep 17 00:00:00 2001
From: Richard Liaw
Date: Fri, 12 Jun 2020 13:53:32 -0700
Subject: [PATCH] [sgd] simplify cuda visible device setting (#8775)

---
Note (not part of the commit message): relative to the upstream commit,
_try_reserve_and_set_cuda now splits CUDA_VISIBLE_DEVICES before the
membership check. The upstream check was a substring test on the raw
string, so a reserved device "1" passed against "10,11" and the later
list.index() call failed with a bare ValueError instead of the
RuntimeError below. Line counts are unchanged; the post-image blob id
on the second "index" line therefore no longer matches exactly.

 .../ray/util/sgd/tests/test_torch_runner.py   | 31 +++++++++----------
 .../sgd/torch/distributed_torch_runner.py     | 28 ++++++++++-------
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/python/ray/util/sgd/tests/test_torch_runner.py b/python/ray/util/sgd/tests/test_torch_runner.py
index 30c42d437..cabea8c93 100644
--- a/python/ray/util/sgd/tests/test_torch_runner.py
+++ b/python/ray/util/sgd/tests/test_torch_runner.py
@@ -198,8 +198,10 @@ class TestLocalDistributedRunner(unittest.TestCase):
         self.assertEquals(len(env_set_device), 1)
 
         if preset_devices:
-            self.assertIn(env_set_device, preset_devices.split(","))
-            self.assertEquals(local_device, "0")
+            visible_devices = preset_devices.split(",")
+            self.assertIn(env_set_device, visible_devices)
+            device_int = int(local_device)
+            self.assertLess(device_int, len(visible_devices))
         else:
             self.assertEquals(local_device, env_set_device)
 
@@ -220,29 +222,26 @@ class TestLocalDistributedRunner(unittest.TestCase):
             init_mock.return_value = True
             self._testWithInitialized(init_mock)
 
-    def _testNotInitialized(self, init_mock):
-        mock_runner = MagicMock()
-        mock_runner._set_cuda_device = MagicMock()
-        LocalDistributedRunner._try_reserve_and_set_cuda(mock_runner)
-        mock_runner._set_cuda_device.assert_called_with("0")
-        self.assertEquals(len(os.environ["CUDA_VISIBLE_DEVICES"]), 1)
-
-    def testNoVisibleNotInitialized(self):
-        with patch("torch.cuda.is_initialized") as init_mock:
-            init_mock.return_value = False
-            self._testNotInitialized(init_mock)
-
     def test2VisibleNotInitialized(self):
         os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
         with patch("torch.cuda.is_initialized") as init_mock:
             init_mock.return_value = False
-            self._testNotInitialized(init_mock)
+            mock_runner = MagicMock()
+            mock_runner._set_cuda_device = MagicMock()
+            LocalDistributedRunner._try_reserve_and_set_cuda(mock_runner)
+            args, _ = mock_runner._set_cuda_device.call_args
+            self.assertTrue(("1" in args) or "0" in args)
+            self.assertEquals(len(os.environ["CUDA_VISIBLE_DEVICES"]), 1)
 
     def test1VisibleNotInitialized(self):
         os.environ["CUDA_VISIBLE_DEVICES"] = "0"
         with patch("torch.cuda.is_initialized") as init_mock:
             init_mock.return_value = False
-            self._testNotInitialized(init_mock)
+            mock_runner = MagicMock()
+            mock_runner._set_cuda_device = MagicMock()
+            LocalDistributedRunner._try_reserve_and_set_cuda(mock_runner)
+            mock_runner._set_cuda_device.assert_called_with("0")
+            self.assertEquals(len(os.environ["CUDA_VISIBLE_DEVICES"]), 1)
 
     @patch("torch.cuda.set_device")
     def testSetDevice(self, set_mock):
diff --git a/python/ray/util/sgd/torch/distributed_torch_runner.py b/python/ray/util/sgd/torch/distributed_torch_runner.py
index 225490bf6..c4181c5ca 100644
--- a/python/ray/util/sgd/torch/distributed_torch_runner.py
+++ b/python/ray/util/sgd/torch/distributed_torch_runner.py
@@ -276,22 +276,28 @@ class LocalDistributedRunner(DistributedTorchRunner):
         super(LocalDistributedRunner, self).__init__(*args, **kwargs)
 
     def _try_reserve_and_set_cuda(self):
-        use_found_device = os.environ.get("CUDA_VISIBLE_DEVICES") is None \
-            and torch.cuda.is_initialized()
-        device = reserve_cuda_device()
+        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
+        reserved_device = reserve_cuda_device()
         # This needs to be set even if torch.cuda is already
         # initialized because the env var is used later when
         # starting the DDP setup.
-        os.environ["CUDA_VISIBLE_DEVICES"] = device
-        if use_found_device:
+        os.environ["CUDA_VISIBLE_DEVICES"] = reserved_device
+        if visible_devices:
+            # Match whole ids, then use the index within the visible list.
+            devices = visible_devices.split(",")
+            if reserved_device not in devices:
+                raise RuntimeError(
+                    "TorchTrainer reserved a device {} that was not in the "
+                    "CUDA_VISIBLE_DEVICES {}. This may be because the "
+                    "Ray cluster is not set with the right env vars. "
+                    "If that is not the issue, please raise a "
+                    "Github issue.".format(reserved_device, visible_devices))
+            scoped_index = devices.index(reserved_device)
+            self._set_cuda_device(str(scoped_index))
+        else:
             # Once cuda is initialized, torch.device ignores the os.env
             # so we have to set the right actual device.
-            self._set_cuda_device(device)
-        else:
-            # if CUDA is not initialized, we can set the os.env.
-            # Even if initialized, we want to set the device to use BatchNorm.
-            # and make Torch think it only sees 1 GPU.
-            self._set_cuda_device("0")
+            self._set_cuda_device(reserved_device)
 
     def _set_cuda_device(self, device_str):
         """Sets the CUDA device for this current local worker."""