Automatically set CUDA_VISIBLE_DEVICES when worker gets task. (#1044)

* Automatically set CUDA_VISIBLE_DEVICES when worker gets task.

* Add test.
This commit is contained in:
Robert Nishihara
2017-10-06 18:38:08 -07:00
committed by Philipp Moritz
parent 4669c59fa8
commit a52a1e893f
2 changed files with 59 additions and 0 deletions
+5
View File
@@ -868,6 +868,11 @@ class Worker(object):
"""
with log_span("ray:get_task", worker=self):
task = self.local_scheduler_client.get_task()
# Automatically restrict the GPUs available to this task.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
[str(i) for i in ray.get_gpu_ids()])
return task
def main_loop(self):
+54
View File
@@ -1225,6 +1225,8 @@ class ResourcesTest(unittest.TestCase):
time.sleep(0.1)
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 0
assert (os.environ["CUDA_VISIBLE_DEVICES"] ==
",".join([str(i) for i in gpu_ids]))
for gpu_id in gpu_ids:
assert gpu_id in range(num_gpus)
return gpu_ids
@@ -1234,6 +1236,8 @@ class ResourcesTest(unittest.TestCase):
time.sleep(0.1)
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 1
assert (os.environ["CUDA_VISIBLE_DEVICES"] ==
",".join([str(i) for i in gpu_ids]))
for gpu_id in gpu_ids:
assert gpu_id in range(num_gpus)
return gpu_ids
@@ -1243,6 +1247,8 @@ class ResourcesTest(unittest.TestCase):
time.sleep(0.1)
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 2
assert (os.environ["CUDA_VISIBLE_DEVICES"] ==
",".join([str(i) for i in gpu_ids]))
for gpu_id in gpu_ids:
assert gpu_id in range(num_gpus)
return gpu_ids
@@ -1252,6 +1258,8 @@ class ResourcesTest(unittest.TestCase):
time.sleep(0.1)
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 3
assert (os.environ["CUDA_VISIBLE_DEVICES"] ==
",".join([str(i) for i in gpu_ids]))
for gpu_id in gpu_ids:
assert gpu_id in range(num_gpus)
return gpu_ids
@@ -1261,6 +1269,8 @@ class ResourcesTest(unittest.TestCase):
time.sleep(0.1)
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 4
assert (os.environ["CUDA_VISIBLE_DEVICES"] ==
",".join([str(i) for i in gpu_ids]))
for gpu_id in gpu_ids:
assert gpu_id in range(num_gpus)
return gpu_ids
@@ -1270,6 +1280,8 @@ class ResourcesTest(unittest.TestCase):
time.sleep(0.1)
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 5
assert (os.environ["CUDA_VISIBLE_DEVICES"] ==
",".join([str(i) for i in gpu_ids]))
for gpu_id in gpu_ids:
assert gpu_id in range(num_gpus)
return gpu_ids
@@ -1299,6 +1311,48 @@ class ResourcesTest(unittest.TestCase):
all_ids = [gpu_id for gpu_ids in list_of_ids for gpu_id in gpu_ids]
self.assertEqual(set(all_ids), set(range(10)))
# Test that actors have CUDA_VISIBLE_DEVICES set properly.
@ray.remote
class Actor0(object):
    """Actor declared without a GPU request.

    Both the constructor and ``test`` assert that no GPU IDs are reported
    and that CUDA_VISIBLE_DEVICES equals the comma-joined reported IDs.
    """

    def __init__(self):
        """Check the GPU assignment at construction time and set ``x``."""
        assigned = ray.get_gpu_ids()
        assert len(assigned) == 0
        expected_visible = ",".join(map(str, assigned))
        assert os.environ["CUDA_VISIBLE_DEVICES"] == expected_visible
        # Set self.x to make sure that we got here.
        self.x = 1

    def test(self):
        """Re-check the GPU assignment inside a method call; return ``x``."""
        assigned = ray.get_gpu_ids()
        assert len(assigned) == 0
        expected_visible = ",".join(map(str, assigned))
        assert os.environ["CUDA_VISIBLE_DEVICES"] == expected_visible
        return self.x
@ray.remote(num_gpus=1)
class Actor1(object):
    """Actor declared with ``num_gpus=1``.

    Both the constructor and ``test`` assert that exactly one GPU ID is
    reported and that CUDA_VISIBLE_DEVICES equals the comma-joined
    reported IDs.
    """

    def __init__(self):
        """Check the GPU assignment at construction time and set ``x``."""
        assigned = ray.get_gpu_ids()
        assert len(assigned) == 1
        expected_visible = ",".join(map(str, assigned))
        assert os.environ["CUDA_VISIBLE_DEVICES"] == expected_visible
        # Set self.x to make sure that we got here.
        self.x = 1

    def test(self):
        """Re-check the GPU assignment inside a method call; return ``x``."""
        assigned = ray.get_gpu_ids()
        assert len(assigned) == 1
        expected_visible = ",".join(map(str, assigned))
        assert os.environ["CUDA_VISIBLE_DEVICES"] == expected_visible
        return self.x
a0 = Actor0.remote()
ray.get(a0.test.remote())
a1 = Actor1.remote()
ray.get(a1.test.remote())
ray.worker.cleanup()
def testMultipleLocalSchedulers(self):