From 58efec0f2bc7e8de2ea9b89ec638d4a0a2d60537 Mon Sep 17 00:00:00 2001
From: Richard Liaw <rliaw@berkeley.edu>
Date: Fri, 12 Jun 2020 13:53:32 -0700
Subject: [PATCH] [sgd] simplify cuda visible device setting (#8775)

---
 .../ray/util/sgd/tests/test_torch_runner.py   | 31 +++++++++----------
 .../sgd/torch/distributed_torch_runner.py     | 28 ++++++++++-------
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/python/ray/util/sgd/tests/test_torch_runner.py b/python/ray/util/sgd/tests/test_torch_runner.py
index 30c42d437..cabea8c93 100644
--- a/python/ray/util/sgd/tests/test_torch_runner.py
+++ b/python/ray/util/sgd/tests/test_torch_runner.py
@@ -198,8 +198,10 @@ class TestLocalDistributedRunner(unittest.TestCase):
         self.assertEquals(len(env_set_device), 1)
 
         if preset_devices:
-            self.assertIn(env_set_device, preset_devices.split(","))
-            self.assertEquals(local_device, "0")
+            visible_devices = preset_devices.split(",")
+            self.assertIn(env_set_device, visible_devices)
+            device_int = int(local_device)
+            self.assertLess(device_int, len(visible_devices))
         else:
             self.assertEquals(local_device, env_set_device)
 
@@ -220,29 +222,26 @@ class TestLocalDistributedRunner(unittest.TestCase):
             init_mock.return_value = True
             self._testWithInitialized(init_mock)
 
-    def _testNotInitialized(self, init_mock):
-        mock_runner = MagicMock()
-        mock_runner._set_cuda_device = MagicMock()
-        LocalDistributedRunner._try_reserve_and_set_cuda(mock_runner)
-        mock_runner._set_cuda_device.assert_called_with("0")
-        self.assertEquals(len(os.environ["CUDA_VISIBLE_DEVICES"]), 1)
-
-    def testNoVisibleNotInitialized(self):
-        with patch("torch.cuda.is_initialized") as init_mock:
-            init_mock.return_value = False
-            self._testNotInitialized(init_mock)
-
     def test2VisibleNotInitialized(self):
         os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
         with patch("torch.cuda.is_initialized") as init_mock:
             init_mock.return_value = False
-            self._testNotInitialized(init_mock)
+            # Expect an index into "2,3" and an env var narrowed to one ID.
+            mock_runner = MagicMock()
+            LocalDistributedRunner._try_reserve_and_set_cuda(mock_runner)
+            args, _ = mock_runner._set_cuda_device.call_args
+            self.assertIn(args[0], ("0", "1"))
+            self.assertEqual(len(os.environ["CUDA_VISIBLE_DEVICES"]), 1)
 
     def test1VisibleNotInitialized(self):
         os.environ["CUDA_VISIBLE_DEVICES"] = "0"
         with patch("torch.cuda.is_initialized") as init_mock:
             init_mock.return_value = False
-            self._testNotInitialized(init_mock)
+            # With one visible GPU the runner must select local index "0".
+            mock_runner = MagicMock()
+            LocalDistributedRunner._try_reserve_and_set_cuda(mock_runner)
+            mock_runner._set_cuda_device.assert_called_with("0")
+            self.assertEqual(len(os.environ["CUDA_VISIBLE_DEVICES"]), 1)
 
     @patch("torch.cuda.set_device")
     def testSetDevice(self, set_mock):
diff --git a/python/ray/util/sgd/torch/distributed_torch_runner.py b/python/ray/util/sgd/torch/distributed_torch_runner.py
index 225490bf6..c4181c5ca 100644
--- a/python/ray/util/sgd/torch/distributed_torch_runner.py
+++ b/python/ray/util/sgd/torch/distributed_torch_runner.py
@@ -276,22 +276,28 @@ class LocalDistributedRunner(DistributedTorchRunner):
         super(LocalDistributedRunner, self).__init__(*args, **kwargs)
 
     def _try_reserve_and_set_cuda(self):
-        use_found_device = os.environ.get("CUDA_VISIBLE_DEVICES") is None \
-                           and torch.cuda.is_initialized()
-        device = reserve_cuda_device()
+        """Reserve a GPU for this worker and select it as the torch device."""
+        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
+        reserved_device = reserve_cuda_device()
         # This needs to be set even if torch.cuda is already
         # initialized because the env var is used later when
         # starting the DDP setup.
-        os.environ["CUDA_VISIBLE_DEVICES"] = device
-        if use_found_device:
+        os.environ["CUDA_VISIBLE_DEVICES"] = reserved_device
+        if visible_devices:
+            # Set the index within the visible list, matching whole IDs only.
+            devices = visible_devices.split(",")
+            if reserved_device not in devices:
+                raise RuntimeError(
+                    "TorchTrainer reserved a device {} that was not in the "
+                    "CUDA_VISIBLE_DEVICES {}. This may be because the "
+                    "Ray cluster is not set with the right env vars. "
+                    "If that is not the issue, please raise a "
+                    "Github issue.".format(reserved_device, visible_devices))
+            self._set_cuda_device(str(devices.index(reserved_device)))
+        else:
             # Once cuda is initialized, torch.device ignores the os.env
             # so we have to set the right actual device.
-            self._set_cuda_device(device)
-        else:
-            # if CUDA is not initialized, we can set the os.env.
-            # Even if initialized, we want to set the device to use BatchNorm.
-            # and make Torch think it only sees 1 GPU.
-            self._set_cuda_device("0")
+            self._set_cuda_device(reserved_device)
 
     def _set_cuda_device(self, device_str):
         """Sets the CUDA device for this current local worker."""