raylet command line resource configuration plumbing (#1882)

* raylet command line resource configuration plumbing

* Small changes.
This commit is contained in:
Alexey Tumanov
2018-04-12 02:37:15 -07:00
committed by Philipp Moritz
parent 85d3963172
commit 39cf6ff6e1
2 changed files with 70 additions and 27 deletions
+56 -25
View File
@@ -801,6 +801,49 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
return webui_url
def check_and_update_resources(resources):
    """Validate a resource dictionary and fill in default CPU/GPU counts.

    The caller's dictionary is never modified; a shallow copy is populated
    and returned instead.

    Args:
        resources: A dictionary mapping resource names to resource
            quantities, or None to start from an empty dictionary.

    Returns:
        A new resource dictionary that always contains "CPU" and "GPU"
        entries.

    Raises:
        Exception: If an explicitly requested "GPU" quantity is larger than
            the number of devices listed in CUDA_VISIBLE_DEVICES.
        AssertionError: If any resource quantity is neither an int nor a
            float.
    """
    updated = {} if resources is None else resources.copy()

    # Default the CPU count to the number of hardware execution threads.
    if "CPU" not in updated:
        updated["CPU"] = psutil.cpu_count()

    # None here means CUDA_VISIBLE_DEVICES has not been set.
    visible_gpu_ids = ray.utils.get_cuda_visible_devices()

    if "GPU" in updated:
        # An explicit GPU request must not exceed what CUDA_VISIBLE_DEVICES
        # allows.
        if (visible_gpu_ids is not None
                and updated["GPU"] > len(visible_gpu_ids)):
            raise Exception("Attempting to start local scheduler with {} GPUs, "
                            "but CUDA_VISIBLE_DEVICES contains {}.".format(
                                updated["GPU"], visible_gpu_ids))
    else:
        # No explicit request: autodetect, then cap the result at the number
        # of devices allowed by CUDA_VISIBLE_DEVICES.
        num_gpus = _autodetect_num_gpus()
        if visible_gpu_ids is not None:
            num_gpus = min(num_gpus, len(visible_gpu_ids))
        updated["GPU"] = num_gpus

    # Check types.
    # NOTE(review): assert statements are skipped when Python runs with -O,
    # so this check is not enforced in optimized mode.
    for quantity in updated.values():
        assert isinstance(quantity, (int, float))

    return updated
def start_local_scheduler(redis_address,
node_ip_address,
plasma_store_name,
@@ -839,30 +882,7 @@ def start_local_scheduler(redis_address,
Return:
The name of the local scheduler socket.
"""
if resources is None:
resources = {}
if "CPU" not in resources:
# By default, use the number of hardware execution threads for the
# number of cores.
resources["CPU"] = psutil.cpu_count()
# See if CUDA_VISIBLE_DEVICES has already been set.
gpu_ids = ray.utils.get_cuda_visible_devices()
# Check that the number of GPUs that the local scheduler wants doesn't
# exceed the amount allowed by CUDA_VISIBLE_DEVICES.
if ("GPU" in resources and gpu_ids is not None
and resources["GPU"] > len(gpu_ids)):
raise Exception("Attempting to start local scheduler with {} GPUs, "
"but CUDA_VISIBLE_DEVICES contains {}.".format(
resources["GPU"], gpu_ids))
if "GPU" not in resources:
# Try to automatically detect the number of GPUs.
resources["GPU"] = _autodetect_num_gpus()
# Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
if gpu_ids is not None:
resources["GPU"] = min(resources["GPU"], len(gpu_ids))
resources = check_and_update_resources(resources)
print("Starting local scheduler with the following resources: {}."
.format(resources))
@@ -889,6 +909,7 @@ def start_raylet(redis_address,
node_ip_address,
plasma_store_name,
worker_path,
resources=None,
stdout_file=None,
stderr_file=None,
cleanup=True):
@@ -913,6 +934,15 @@ def start_raylet(redis_address,
Returns:
The raylet socket name.
"""
static_resources = check_and_update_resources(resources)
# Format the resource argument in a form like 'CPU,1.0,GPU,0,Custom,3'.
resource_argument = ",".join([
"{},{}".format(resource_name, resource_value)
for resource_name, resource_value in zip(static_resources.keys(),
static_resources.values())
])
gcs_ip_address, gcs_port = redis_address.split(":")
raylet_name = "/tmp/raylet{}".format(random_name())
@@ -927,7 +957,7 @@ def start_raylet(redis_address,
command = [
RAYLET_EXECUTABLE, raylet_name, plasma_store_name, node_ip_address,
gcs_ip_address, gcs_port, start_worker_command
gcs_ip_address, gcs_port, start_worker_command, resource_argument
]
pid = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
@@ -1437,6 +1467,7 @@ def start_ray_processes(address_info=None,
node_ip_address,
object_store_addresses[i].name,
worker_path,
resources=resources[i],
stdout_file=None,
stderr_file=None,
cleanup=cleanup)