From a52a1e893fc44581e0de0e5c02ec4a21e5f8031b Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 6 Oct 2017 18:38:08 -0700 Subject: [PATCH] Automatically set CUDA_VISIBLE_DEVICES when worker gets task. (#1044) * Automatically set CUDA_VISIBLE_DEVICES when worker gets task. * Add test. --- python/ray/worker.py | 5 ++++ test/runtest.py | 54 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/python/ray/worker.py b/python/ray/worker.py index 7aa1843c6..97e67ed57 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -868,6 +868,11 @@ class Worker(object): """ with log_span("ray:get_task", worker=self): task = self.local_scheduler_client.get_task() + + # Automatically restrict the GPUs available to this task. + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( + [str(i) for i in ray.get_gpu_ids()]) + return task def main_loop(self): diff --git a/test/runtest.py b/test/runtest.py index a340f80bc..ebe7556eb 100644 --- a/test/runtest.py +++ b/test/runtest.py @@ -1225,6 +1225,8 @@ class ResourcesTest(unittest.TestCase): time.sleep(0.1) gpu_ids = ray.get_gpu_ids() assert len(gpu_ids) == 0 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == + ",".join([str(i) for i in gpu_ids])) for gpu_id in gpu_ids: assert gpu_id in range(num_gpus) return gpu_ids @@ -1234,6 +1236,8 @@ class ResourcesTest(unittest.TestCase): time.sleep(0.1) gpu_ids = ray.get_gpu_ids() assert len(gpu_ids) == 1 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == + ",".join([str(i) for i in gpu_ids])) for gpu_id in gpu_ids: assert gpu_id in range(num_gpus) return gpu_ids @@ -1243,6 +1247,8 @@ class ResourcesTest(unittest.TestCase): time.sleep(0.1) gpu_ids = ray.get_gpu_ids() assert len(gpu_ids) == 2 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == + ",".join([str(i) for i in gpu_ids])) for gpu_id in gpu_ids: assert gpu_id in range(num_gpus) return gpu_ids @@ -1252,6 +1258,8 @@ class ResourcesTest(unittest.TestCase): time.sleep(0.1) gpu_ids = ray.get_gpu_ids() assert len(gpu_ids) == 3 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == + ",".join([str(i) for i in gpu_ids])) for gpu_id in gpu_ids: assert gpu_id in range(num_gpus) return gpu_ids @@ -1261,6 +1269,8 @@ class ResourcesTest(unittest.TestCase): time.sleep(0.1) gpu_ids = ray.get_gpu_ids() assert len(gpu_ids) == 4 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == + ",".join([str(i) for i in gpu_ids])) for gpu_id in gpu_ids: assert gpu_id in range(num_gpus) return gpu_ids @@ -1270,6 +1280,8 @@ class ResourcesTest(unittest.TestCase): time.sleep(0.1) gpu_ids = ray.get_gpu_ids() assert len(gpu_ids) == 5 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == + ",".join([str(i) for i in gpu_ids])) for gpu_id in gpu_ids: assert gpu_id in range(num_gpus) return gpu_ids @@ -1299,6 +1311,48 @@ class ResourcesTest(unittest.TestCase): all_ids = [gpu_id for gpu_ids in list_of_ids for gpu_id in gpu_ids] self.assertEqual(set(all_ids), set(range(10))) + # Test that actors have CUDA_VISIBLE_DEVICES set properly. + + @ray.remote + class Actor0(object): + def __init__(self): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 0 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == + ",".join([str(i) for i in gpu_ids])) + # Set self.x to make sure that we got here. + self.x = 1 + + def test(self): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 0 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == + ",".join([str(i) for i in gpu_ids])) + return self.x + + @ray.remote(num_gpus=1) + class Actor1(object): + def __init__(self): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 1 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == + ",".join([str(i) for i in gpu_ids])) + # Set self.x to make sure that we got here. + self.x = 1 + + def test(self): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 1 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == + ",".join([str(i) for i in gpu_ids])) + return self.x + + a0 = Actor0.remote() + ray.get(a0.test.remote()) + + a1 = Actor1.remote() + ray.get(a1.test.remote()) + ray.worker.cleanup() def testMultipleLocalSchedulers(self):