diff --git a/doc/source/actors.rst b/doc/source/actors.rst index 864fe7a78..e3839b593 100644 --- a/doc/source/actors.rst +++ b/doc/source/actors.rst @@ -72,8 +72,8 @@ Resources with Actors You can specify that an actor requires CPUs or GPUs in the decorator. While Ray has built-in support for CPUs and GPUs, Ray can also handle custom resources. When using GPUs, Ray will automatically set the environment variable ``CUDA_VISIBLE_DEVICES`` for the actor after instantiated. The actor will have access to a list of the IDs of the GPUs -that it is allowed to use via ``ray.get_gpu_ids()``. This is a list of integers, -like ``[]``, or ``[1]``, or ``[2, 5, 6]``. +that it is allowed to use via ``ray.get_gpu_ids(as_str=True)``. This is a list of strings, +like ``[]``, or ``['1']``, or ``['2', '5', '6']``. Under some circumstances, the IDs of GPUs could be given as UUID strings instead of indices (see the `CUDA programming guide <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`__). .. code-block:: python diff --git a/doc/source/using-ray-with-gpus.rst b/doc/source/using-ray-with-gpus.rst index 9607f34b0..05c1b9ab9 100644 --- a/doc/source/using-ray-with-gpus.rst +++ b/doc/source/using-ray-with-gpus.rst @@ -33,8 +33,8 @@ remote decorator. print("ray.get_gpu_ids(): {}".format(ray.get_gpu_ids())) print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"])) -Inside of the remote function, a call to ``ray.get_gpu_ids()`` will return a -list of integers indicating which GPUs the remote function is allowed to use. +Inside of the remote function, a call to ``ray.get_gpu_ids(as_str=True)`` will return a +list of strings indicating which GPUs the remote function is allowed to use. Typically, it is not necessary to call ``ray.get_gpu_ids()`` because Ray will automatically set the ``CUDA_VISIBLE_DEVICES`` environment variable. 
diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 94fd062e5..36e0655b2 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -353,7 +353,7 @@ cdef execute_task( CFiberEvent task_done_event # Automatically restrict the GPUs available to this task. - ray.utils.set_cuda_visible_devices(ray.get_gpu_ids()) + ray.utils.set_cuda_visible_devices(ray.get_gpu_ids(as_str=True)) function_descriptor = CFunctionDescriptorToPython( ray_function.GetFunctionDescriptor()) diff --git a/python/ray/tests/test_actor_resources.py b/python/ray/tests/test_actor_resources.py index 345595fed..37c1dcb01 100644 --- a/python/ray/tests/test_actor_resources.py +++ b/python/ray/tests/test_actor_resources.py @@ -95,10 +95,10 @@ def test_actor_gpus(ray_start_cluster): @ray.remote(num_gpus=1) class Actor1: def __init__(self): - self.gpu_ids = ray.get_gpu_ids() + self.gpu_ids = ray.get_gpu_ids(as_str=True) def get_location_and_ids(self): - assert ray.get_gpu_ids() == self.gpu_ids + assert ray.get_gpu_ids(as_str=True) == self.gpu_ids return (ray.worker.global_worker.node.unique_id, tuple(self.gpu_ids)) diff --git a/python/ray/tests/test_advanced_2.py b/python/ray/tests/test_advanced_2.py index 81f9b1514..19f400534 100644 --- a/python/ray/tests/test_advanced_2.py +++ b/python/ray/tests/test_advanced_2.py @@ -633,6 +633,25 @@ def save_gpu_ids_shutdown_only(): del os.environ["CUDA_VISIBLE_DEVICES"] +@pytest.mark.parametrize("as_str", [False, True]) +def test_gpu_ids_as_str(save_gpu_ids_shutdown_only, as_str): + allowed_gpu_ids = [4, 5, 6] + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( + str(i) for i in allowed_gpu_ids) + ray.init() + + @ray.remote + def get_gpu_ids(as_str): + gpu_ids = ray.get_gpu_ids(as_str) + for gpu_id in gpu_ids: + if as_str: + assert isinstance(gpu_id, str) + else: + assert isinstance(gpu_id, int) + + ray.get([get_gpu_ids.remote(as_str) for _ in range(10)]) + + def test_specific_gpus(save_gpu_ids_shutdown_only): allowed_gpu_ids = [4, 5, 6] 
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( @@ -643,14 +662,14 @@ def test_specific_gpus(save_gpu_ids_shutdown_only): def f(): gpu_ids = ray.get_gpu_ids() assert len(gpu_ids) == 1 - assert gpu_ids[0] in allowed_gpu_ids + assert int(gpu_ids[0]) in allowed_gpu_ids @ray.remote(num_gpus=2) def g(): gpu_ids = ray.get_gpu_ids() assert len(gpu_ids) == 2 - assert gpu_ids[0] in allowed_gpu_ids - assert gpu_ids[1] in allowed_gpu_ids + assert int(gpu_ids[0]) in allowed_gpu_ids + assert int(gpu_ids[1]) in allowed_gpu_ids ray.get([f.remote() for _ in range(100)]) ray.get([g.remote() for _ in range(100)]) @@ -671,7 +690,7 @@ def test_local_mode_gpus(save_gpu_ids_shutdown_only): gpu_ids = ray.get_gpu_ids() assert len(gpu_ids) == 3 for gpu in gpu_ids: - assert gpu in allowed_gpu_ids + assert int(gpu) in allowed_gpu_ids ray.get([f.remote() for _ in range(100)]) diff --git a/python/ray/utils.py b/python/ray/utils.py index 38965db9c..6c0e297dd 100644 --- a/python/ray/utils.py +++ b/python/ray/utils.py @@ -271,9 +271,9 @@ def get_cuda_visible_devices(): """Get the device IDs in the CUDA_VISIBLE_DEVICES environment variable. Returns: - if CUDA_VISIBLE_DEVICES is set, this returns a list of integers with - the IDs of the GPUs. If it is not set or is set to NoDevFiles, - this returns None. + devices (List[str]): If CUDA_VISIBLE_DEVICES is set, returns a + list of strings representing the IDs of the visible GPUs. + If it is not set or is set to NoDevFiles, returns empty list. """ gpu_ids_str = os.environ.get("CUDA_VISIBLE_DEVICES", None) @@ -286,7 +286,8 @@ def get_cuda_visible_devices(): if gpu_ids_str == "NoDevFiles": return [] - return [int(i) for i in gpu_ids_str.split(",")] + # GPU identifiers are given as strings representing integers or UUIDs. + return list(gpu_ids_str.split(",")) last_set_gpu_ids = None @@ -296,7 +297,7 @@ def set_cuda_visible_devices(gpu_ids): """Set the CUDA_VISIBLE_DEVICES environment variable. 
Args: - gpu_ids: This is a list of integers representing GPU IDs. + gpu_ids (List[str]): List of strings representing GPU IDs. """ global last_set_gpu_ids diff --git a/python/ray/worker.py b/python/ray/worker.py index 6861de455..749917c54 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -373,7 +373,7 @@ class Worker: sys.exit(0) -def get_gpu_ids(): +def get_gpu_ids(as_str=False): """Get the IDs of the GPUs that are available to the worker. If the CUDA_VISIBLE_DEVICES environment variable was set when the worker @@ -381,6 +381,10 @@ def get_gpu_ids(): IDs in CUDA_VISIBLE_DEVICES. If not, the IDs will fall in the range [0, NUM_GPUS - 1], where NUM_GPUS is the number of GPUs that the node has. + Args: + as_str (Boolean): If true, return gpu ids in string format. By default, + it is False. This will change to default to True in the future. + Returns: A list of GPU IDs. """ @@ -400,7 +404,17 @@ def get_gpu_ids(): # Give all GPUs in local_mode. if global_worker.mode == LOCAL_MODE: max_gpus = global_worker.node.get_resource_spec().num_gpus - return global_worker.original_gpu_ids[:max_gpus] + assigned_ids = global_worker.original_gpu_ids[:max_gpus] + + if not as_str: + from ray.util.debug import log_once + if log_once("ray.get_gpu_ids.as_str"): + logger.warning( + "ray.get_gpu_ids() will return a list of strings by default" + " in a future version of Ray for compatibility with CUDA. 
" + "To enable the forward-compatible behavior, use " + "`ray.get_gpu_ids(as_str=True)`.") + assigned_ids = [int(assigned_id) for assigned_id in assigned_ids] return assigned_ids diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index 7276a0ec5..f198364d5 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -413,7 +413,7 @@ class RolloutWorker(ParallelIteratorWorker): if (ray.is_initialized() and ray.worker._mode() != ray.worker.LOCAL_MODE): # Check available number of GPUs - if not ray.get_gpu_ids(): + if not ray.get_gpu_ids(as_str=True): logger.debug("Creating policy evaluation worker {}".format( worker_index) + " on CPU (please ignore any CUDA init errors)") diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index 53002b5fd..412347702 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -98,7 +98,7 @@ class TorchPolicy(Policy): """ self.framework = "torch" super().__init__(observation_space, action_space, config) - if torch.cuda.is_available() and ray.get_gpu_ids(): + if torch.cuda.is_available() and ray.get_gpu_ids(as_str=True): self.device = torch.device("cuda") else: self.device = torch.device("cpu")