mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 13:19:38 +08:00
raylet command line resource configuration plumbing (#1882)
* raylet command line resource configuration plumbing * Small changes.
This commit is contained in:
committed by
Philipp Moritz
parent
85d3963172
commit
39cf6ff6e1
+56
-25
@@ -801,6 +801,49 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
|
||||
return webui_url
|
||||
|
||||
|
||||
def check_and_update_resources(resources):
    """Sanity check a resource dictionary and add sensible defaults.

    The caller's dictionary is never modified; a shallow copy is filled in
    and returned. A missing "CPU" entry defaults to psutil.cpu_count(). A
    missing "GPU" entry defaults to the autodetected GPU count, capped at
    the number of IDs reported by ray.utils.get_cuda_visible_devices()
    when that call returns something other than None.

    Args:
        resources: A dictionary mapping resource names to resource
            quantities, or None to start from an empty dictionary.

    Returns:
        A new resource dictionary.

    Raises:
        Exception: If an explicitly requested "GPU" quantity is larger than
            the number of IDs in CUDA_VISIBLE_DEVICES.
        AssertionError: If any resource quantity is not an int or a float.
    """
    resources = {} if resources is None else resources.copy()

    if "CPU" not in resources:
        # By default, use the number of hardware execution threads for the
        # number of cores.
        resources["CPU"] = psutil.cpu_count()

    # See if CUDA_VISIBLE_DEVICES has already been set.
    gpu_ids = ray.utils.get_cuda_visible_devices()

    if "GPU" in resources:
        # Check that the number of GPUs that the local scheduler wants
        # doesn't exceed the amount allowed by CUDA_VISIBLE_DEVICES.
        if gpu_ids is not None and resources["GPU"] > len(gpu_ids):
            raise Exception("Attempting to start local scheduler with {} GPUs, "
                            "but CUDA_VISIBLE_DEVICES contains {}.".format(
                                resources["GPU"], gpu_ids))
    else:
        # Try to automatically detect the number of GPUs.
        resources["GPU"] = _autodetect_num_gpus()
        # Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
        if gpu_ids is not None:
            resources["GPU"] = min(resources["GPU"], len(gpu_ids))

    # Check types. The error is raised explicitly rather than through an
    # `assert` statement so that the check still runs under `python -O`;
    # AssertionError is kept so callers see the same exception type as before.
    for resource_name, resource_quantity in resources.items():
        if not isinstance(resource_quantity, (int, float)):
            raise AssertionError(
                "Resource {} has quantity {!r}, but resource quantities "
                "must be ints or floats.".format(resource_name,
                                                 resource_quantity))

    return resources
|
||||
|
||||
|
||||
def start_local_scheduler(redis_address,
|
||||
node_ip_address,
|
||||
plasma_store_name,
|
||||
@@ -839,30 +882,7 @@ def start_local_scheduler(redis_address,
|
||||
Return:
|
||||
The name of the local scheduler socket.
|
||||
"""
|
||||
if resources is None:
|
||||
resources = {}
|
||||
if "CPU" not in resources:
|
||||
# By default, use the number of hardware execution threads for the
|
||||
# number of cores.
|
||||
resources["CPU"] = psutil.cpu_count()
|
||||
|
||||
# See if CUDA_VISIBLE_DEVICES has already been set.
|
||||
gpu_ids = ray.utils.get_cuda_visible_devices()
|
||||
|
||||
# Check that the number of GPUs that the local scheduler wants doesn't
|
||||
# exceed the amount allowed by CUDA_VISIBLE_DEVICES.
|
||||
if ("GPU" in resources and gpu_ids is not None
|
||||
and resources["GPU"] > len(gpu_ids)):
|
||||
raise Exception("Attempting to start local scheduler with {} GPUs, "
|
||||
"but CUDA_VISIBLE_DEVICES contains {}.".format(
|
||||
resources["GPU"], gpu_ids))
|
||||
|
||||
if "GPU" not in resources:
|
||||
# Try to automatically detect the number of GPUs.
|
||||
resources["GPU"] = _autodetect_num_gpus()
|
||||
# Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
|
||||
if gpu_ids is not None:
|
||||
resources["GPU"] = min(resources["GPU"], len(gpu_ids))
|
||||
resources = check_and_update_resources(resources)
|
||||
|
||||
print("Starting local scheduler with the following resources: {}."
|
||||
.format(resources))
|
||||
@@ -889,6 +909,7 @@ def start_raylet(redis_address,
|
||||
node_ip_address,
|
||||
plasma_store_name,
|
||||
worker_path,
|
||||
resources=None,
|
||||
stdout_file=None,
|
||||
stderr_file=None,
|
||||
cleanup=True):
|
||||
@@ -913,6 +934,15 @@ def start_raylet(redis_address,
|
||||
Returns:
|
||||
The raylet socket name.
|
||||
"""
|
||||
static_resources = check_and_update_resources(resources)
|
||||
|
||||
# Format the resource argument in a form like 'CPU,1.0,GPU,0,Custom,3'.
|
||||
resource_argument = ",".join([
|
||||
"{},{}".format(resource_name, resource_value)
|
||||
for resource_name, resource_value in zip(static_resources.keys(),
|
||||
static_resources.values())
|
||||
])
|
||||
|
||||
gcs_ip_address, gcs_port = redis_address.split(":")
|
||||
raylet_name = "/tmp/raylet{}".format(random_name())
|
||||
|
||||
@@ -927,7 +957,7 @@ def start_raylet(redis_address,
|
||||
|
||||
command = [
|
||||
RAYLET_EXECUTABLE, raylet_name, plasma_store_name, node_ip_address,
|
||||
gcs_ip_address, gcs_port, start_worker_command
|
||||
gcs_ip_address, gcs_port, start_worker_command, resource_argument
|
||||
]
|
||||
pid = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
|
||||
|
||||
@@ -1437,6 +1467,7 @@ def start_ray_processes(address_info=None,
|
||||
node_ip_address,
|
||||
object_store_addresses[i].name,
|
||||
worker_path,
|
||||
resources=resources[i],
|
||||
stdout_file=None,
|
||||
stderr_file=None,
|
||||
cleanup=cleanup)
|
||||
|
||||
Reference in New Issue
Block a user