Allow scheduling with arbitrary user-defined resource labels. (#1236)

* Enable scheduling with custom resource labels.

* Fix.

* Minor fixes and ref counting fix.

* Linting

* Use .data() instead of .c_str().

* Fix linting.

* Fix ResourcesTest.testGPUIDs test by waiting for workers to start up.

* Sleep in test so that all tasks are submitted before any completes.
This commit is contained in:
Robert Nishihara
2017-12-01 11:41:40 -08:00
committed by Philipp Moritz
parent ac64631043
commit c21e189371
42 changed files with 1073 additions and 806 deletions
+24 -10
View File
@@ -3,6 +3,7 @@ from __future__ import division
from __future__ import print_function
import click
import json
import subprocess
import ray.services as services
@@ -56,13 +57,15 @@ def cli():
help=("The initial number of workers to start on this node, "
"note that the local scheduler may start additional "
"workers. If you wish to control the total number of "
"concurrent tasks, then use --num-cpus instead."))
"concurrent tasks, then use --resources instead and "
"specify the CPU field."))
@click.option("--num-cpus", required=False, type=int,
help="the number of CPUs on this node")
@click.option("--num-gpus", required=False, type=int,
help="the number of GPUs on this node")
@click.option("--num-custom-resource", required=False, type=int,
help="the amount of a user-defined custom resource on this node")
@click.option("--resources", required=False, default="{}", type=str,
help="a JSON serialized dictionary mapping resource name to "
"resource quantity")
@click.option("--head", is_flag=True, default=False,
help="provide this argument for the head node")
@click.option("--no-ui", is_flag=True, default=False,
@@ -75,7 +78,7 @@ def cli():
help="enable support for huge pages in the object store")
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
redis_max_clients, object_manager_port, num_workers, num_cpus,
num_gpus, num_custom_resource, head, no_ui, block, plasma_directory,
num_gpus, resources, head, no_ui, block, plasma_directory,
huge_pages):
# Note that we redirect stdout and stderr to /dev/null because otherwise
# attempts to print may cause exceptions if a process is started inside of
@@ -89,6 +92,21 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
if redis_address is not None:
redis_address = services.address_to_ip(redis_address)
try:
resources = json.loads(resources)
except Exception as e:
raise Exception("Unable to parse the --resources argument using "
"json.loads. Try using a format like\n\n"
" --resources='{\"CustomResource1\": 3, "
"\"CustomResource2\": 2}'")
assert "CPU" not in resources, "Use the --num-cpus argument."
assert "GPU" not in resources, "Use the --num-gpus argument."
if num_cpus is not None:
resources["CPU"] = num_cpus
if num_gpus is not None:
resources["GPU"] = num_gpus
if head:
# Start Ray on the head node.
if redis_address is not None:
@@ -115,9 +133,7 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
num_workers=num_workers,
cleanup=False,
redirect_output=True,
num_cpus=num_cpus,
num_gpus=num_gpus,
num_custom_resource=num_custom_resource,
resources=resources,
num_redis_shards=num_redis_shards,
redis_max_clients=redis_max_clients,
include_webui=(not no_ui),
@@ -181,9 +197,7 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
num_workers=num_workers,
cleanup=False,
redirect_output=True,
num_cpus=num_cpus,
num_gpus=num_gpus,
num_custom_resource=num_custom_resource,
resources=resources,
plasma_directory=plasma_directory,
huge_pages=huge_pages)
print(address_info)