Remove num_local_schedulers argument from ray.worker._init. (#3704)

* Remove num_local_schedulers argument from ray.worker._init.

* Fix

* Fix tests.
This commit is contained in:
Robert Nishihara
2019-01-07 12:44:49 -08:00
committed by Philipp Moritz
parent e78562b2e8
commit c9d70f0dda
18 changed files with 388 additions and 513 deletions
+22 -18
View File
@@ -32,12 +32,8 @@ class RayParams(object):
ignored.
redis_shard_ports: A list of the ports to use for the non-primary Redis
shards.
num_cpus (int): Number of cpus the user wishes all local schedulers to
be configured with.
num_gpus (int): Number of gpus the user wishes all local schedulers to
be configured with.
num_local_schedulers (int): The number of local schedulers to start.
This is only provided if start_ray_local is True.
num_cpus (int): Number of CPUs to configure the raylet with.
num_gpus (int): Number of GPUs to configure the raylet with.
resources: A dictionary mapping the name of a resource to the quantity
of that resource available.
object_store_memory: The amount of memory (in bytes) to start the
@@ -46,12 +42,8 @@ class RayParams(object):
to use, or None for no limit. Once the limit is exceeded, redis
will start LRU eviction of entries. This only applies to the
sharded redis tables (task and object tables).
object_manager_ports (list): A list of the ports to use for the object
managers. There should be one per object manager being started on
this node (typically just one).
node_manager_ports (list): A list of the ports to use for the node
managers. There should be one per node manager being started on
this node (typically just one).
object_manager_port (int): The port to use for the object manager.
node_manager_port (int): The port to use for the node manager.
node_ip_address (str): The IP address of the node that we are on.
object_id_seed (int): Used to seed the deterministic generation of
object IDs. The same value can be used across multiple runs of the
@@ -97,14 +89,13 @@ class RayParams(object):
redis_address=None,
num_cpus=None,
num_gpus=None,
num_local_schedulers=None,
resources=None,
object_store_memory=None,
redis_max_memory=None,
redis_port=None,
redis_shard_ports=None,
object_manager_ports=None,
node_manager_ports=None,
object_manager_port=None,
node_manager_port=None,
node_ip_address=None,
object_id_seed=None,
num_workers=None,
@@ -133,14 +124,13 @@ class RayParams(object):
self.redis_address = redis_address
self.num_cpus = num_cpus
self.num_gpus = num_gpus
self.num_local_schedulers = num_local_schedulers
self.resources = resources
self.object_store_memory = object_store_memory
self.redis_max_memory = redis_max_memory
self.redis_port = redis_port
self.redis_shard_ports = redis_shard_ports
self.object_manager_ports = object_manager_ports
self.node_manager_ports = node_manager_ports
self.object_manager_port = object_manager_port
self.node_manager_port = node_manager_port
self.node_ip_address = node_ip_address
self.num_workers = num_workers
self.local_mode = local_mode
@@ -160,6 +150,7 @@ class RayParams(object):
self.include_log_monitor = include_log_monitor
self.autoscaling_config = autoscaling_config
self._internal_config = _internal_config
self._check_usage()
def update(self, **kwargs):
"""Update the settings according to the keyword arguments.
@@ -174,6 +165,8 @@ class RayParams(object):
raise ValueError("Invalid RayParams parameter in"
" update: %s" % arg)
self._check_usage()
def update_if_absent(self, **kwargs):
"""Update the settings when the target fields are None.
@@ -187,3 +180,14 @@ class RayParams(object):
else:
raise ValueError("Invalid RayParams parameter in"
" update_if_absent: %s" % arg)
self._check_usage()
def _check_usage(self):
if self.resources is not None:
assert "CPU" not in self.resources, (
"'CPU' should not be included in the resource dictionary. Use "
"num_cpus instead.")
assert "GPU" not in self.resources, (
"'GPU' should not be included in the resource dictionary. Use "
"num_gpus instead.")
+3 -3
View File
@@ -52,7 +52,7 @@ def create_parser(parser_creator=None):
type=int,
help="--num-gpus to use if starting a new cluster.")
parser.add_argument(
"--ray-num-local-schedulers",
"--ray-num-nodes",
default=None,
type=int,
help="Emulate multiple cluster nodes for debugging.")
@@ -122,9 +122,9 @@ def run(args, parser):
if not exp.get("env") and not exp.get("config", {}).get("env"):
parser.error("the following arguments are required: --env")
if args.ray_num_local_schedulers:
if args.ray_num_nodes:
cluster = Cluster()
for _ in range(args.ray_num_local_schedulers):
for _ in range(args.ray_num_nodes):
cluster.add_node(
resources={
"num_cpus": args.ray_num_cpus or 1,
+4 -8
View File
@@ -231,21 +231,17 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
" --resources='{\"CustomResource1\": 3, "
"\"CustomReseource2\": 2}'")
assert "CPU" not in resources, "Use the --num-cpus argument."
assert "GPU" not in resources, "Use the --num-gpus argument."
if num_cpus is not None:
resources["CPU"] = num_cpus
if num_gpus is not None:
resources["GPU"] = num_gpus
ray_params = RayParams(
node_ip_address=node_ip_address,
object_manager_ports=[object_manager_port],
node_manager_ports=[node_manager_port],
object_manager_port=object_manager_port,
node_manager_port=node_manager_port,
num_workers=num_workers,
object_store_memory=object_store_memory,
redis_password=redis_password,
redirect_worker_output=not no_redirect_worker_output,
redirect_output=not no_redirect_output,
num_cpus=num_cpus,
num_gpus=num_gpus,
resources=resources,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
+63 -86
View File
@@ -828,10 +828,12 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
return webui_url
def check_and_update_resources(resources):
def check_and_update_resources(num_cpus, num_gpus, resources):
"""Sanity check a resource dictionary and add sensible defaults.
Args:
num_cpus: The number of CPUs.
num_gpus: The number of GPUs.
resources: A dictionary mapping resource names to resource quantities.
Returns:
@@ -840,6 +842,13 @@ def check_and_update_resources(resources):
if resources is None:
resources = {}
resources = resources.copy()
assert "CPU" not in resources
assert "GPU" not in resources
if num_cpus is not None:
resources["CPU"] = num_cpus
if num_gpus is not None:
resources["GPU"] = num_gpus
if "CPU" not in resources:
# By default, use the number of hardware execution threads for the
# number of cores.
@@ -879,10 +888,9 @@ def check_and_update_resources(resources):
def start_raylet(ray_params,
index,
raylet_name,
plasma_store_name,
num_workers=0,
num_initial_workers=0,
use_valgrind=False,
use_profiler=False,
stdout_file=None,
@@ -894,15 +902,13 @@ def start_raylet(ray_params,
Args:
ray_params (ray.params.RayParams): The RayParams instance. The
following parameters could be checked: redis_address,
node_ip_address, worker_path, resources, object_manager_ports,
node_manager_ports, redis_password
index (int): Usually, this index is 0. When index > 0, it means
starting multiple raylet locally. The index will be used in
resources, object_manager_ports, node_manager_ports.
node_ip_address, worker_path, resources, num_cpus, num_gpus,
object_manager_port, node_manager_port, redis_password.
raylet_name (str): The name of the raylet socket to create.
plasma_store_name (str): The name of the plasma store socket to connect
to.
num_workers (int): The number of workers to start.
num_initial_workers (int): The number of workers to start initially.
use_valgrind (bool): True if the raylet should be started inside
of valgrind. If this is True, use_profiler must be False.
use_profiler (bool): True if the raylet should be started inside
@@ -926,7 +932,8 @@ def start_raylet(ray_params,
if use_valgrind and use_profiler:
raise Exception("Cannot use valgrind and profiler at the same time.")
static_resources = check_and_update_resources(ray_params.resources[index])
static_resources = check_and_update_resources(
ray_params.num_cpus, ray_params.num_gpus, ray_params.resources)
# Limit the number of workers that can be started in parallel by the
# raylet. However, make sure it is at least 1.
@@ -956,23 +963,23 @@ def start_raylet(ray_params,
# If the object manager port is None, then use 0 to cause the object
# manager to choose its own port.
if ray_params.object_manager_ports[index] is None:
ray_params.object_manager_ports[index] = 0
if ray_params.object_manager_port is None:
ray_params.object_manager_port = 0
# If the node manager port is None, then use 0 to cause the node manager
# to choose its own port.
if ray_params.node_manager_ports[index] is None:
ray_params.node_manager_ports[index] = 0
if ray_params.node_manager_port is None:
ray_params.node_manager_port = 0
command = [
RAYLET_EXECUTABLE,
raylet_name,
plasma_store_name,
str(ray_params.object_manager_ports[index]),
str(ray_params.node_manager_ports[index]),
str(ray_params.object_manager_port),
str(ray_params.node_manager_port),
ray_params.node_ip_address,
gcs_ip_address,
gcs_port,
str(num_workers),
str(num_initial_workers),
str(maximum_startup_concurrency),
resource_argument,
config_str,
@@ -1289,9 +1296,8 @@ def start_ray_processes(ray_params, cleanup=True):
Args:
ray_params (ray.params.RayParams): The RayParams instance. The
following parameters will be set to default values if it's None:
node_ip_address("127.0.0.1"), num_local_schedulers(1),
include_webui(False), worker_path(path of default_worker.py),
include_log_monitor(False)
node_ip_address("127.0.0.1"), include_webui(False),
worker_path(path of default_worker.py), include_log_monitor(False)
cleanup (bool): If cleanup is true, then the processes started here
will be killed by services.cleanup() when the Python process that
called this method exits.
@@ -1312,23 +1318,16 @@ def start_ray_processes(ray_params, cleanup=True):
ray_params.update_if_absent(
include_log_monitor=False,
resources={},
num_local_schedulers=1,
include_webui=False,
node_ip_address="127.0.0.1")
if not isinstance(ray_params.resources, list):
ray_params.resources = ray_params.num_local_schedulers * [
ray_params.resources
]
if ray_params.num_workers is not None:
raise Exception("The 'num_workers' argument is deprecated. Please use "
"'num_cpus' instead.")
else:
workers_per_local_scheduler = []
for resource_dict in ray_params.resources:
cpus = resource_dict.get("CPU")
workers_per_local_scheduler.append(cpus if cpus is not None else
multiprocessing.cpu_count())
num_initial_workers = (ray_params.num_cpus
if ray_params.num_cpus is not None else
multiprocessing.cpu_count())
ray_params.update_if_absent(
address_info={},
@@ -1402,37 +1401,16 @@ def start_ray_processes(ray_params, cleanup=True):
redis_password=ray_params.redis_password)
# Initialize with existing services.
if "object_store_addresses" not in ray_params.address_info:
ray_params.address_info["object_store_addresses"] = []
object_store_addresses = ray_params.address_info["object_store_addresses"]
if "raylet_socket_names" not in ray_params.address_info:
ray_params.address_info["raylet_socket_names"] = []
raylet_socket_names = ray_params.address_info["raylet_socket_names"]
object_store_address = ray_params.address_info.get("object_store_address")
raylet_socket_name = ray_params.address_info.get("raylet_socket_name")
# Get the ports to use for the object managers if any are provided.
if not isinstance(ray_params.object_manager_ports, list):
assert (ray_params.object_manager_ports is None
or ray_params.num_local_schedulers == 1)
ray_params.object_manager_ports = (ray_params.num_local_schedulers *
[ray_params.object_manager_ports])
assert len(
ray_params.object_manager_ports) == ray_params.num_local_schedulers
if not isinstance(ray_params.node_manager_ports, list):
assert (ray_params.node_manager_ports is None
or ray_params.num_local_schedulers == 1)
ray_params.node_manager_ports = (
ray_params.num_local_schedulers * [ray_params.node_manager_ports])
assert len(
ray_params.node_manager_ports) == ray_params.num_local_schedulers
# Start any object stores that do not yet exist.
for i in range(ray_params.num_local_schedulers -
len(object_store_addresses)):
# Start an object store if it does not yet exist.
if object_store_address is None:
# Start Plasma.
plasma_store_stdout_file, plasma_store_stderr_file = (
new_plasma_store_log_file(i, ray_params.redirect_output))
new_plasma_store_log_file(ray_params.redirect_output))
object_store_address = start_plasma_store(
ray_params.address_info["object_store_address"] = start_plasma_store(
ray_params.node_ip_address,
ray_params.redis_address,
store_stdout_file=plasma_store_stdout_file,
@@ -1443,25 +1421,25 @@ def start_ray_processes(ray_params, cleanup=True):
huge_pages=ray_params.huge_pages,
plasma_store_socket_name=ray_params.plasma_store_socket_name,
redis_password=ray_params.redis_password)
object_store_addresses.append(object_store_address)
time.sleep(0.1)
else:
raise Exception("JUST CHECKING IF THIS CODE GETS HIT.")
# Start any raylets that do not exist yet.
for raylet_index in range(
len(raylet_socket_names), ray_params.num_local_schedulers):
if raylet_socket_name is None:
raylet_stdout_file, raylet_stderr_file = new_raylet_log_file(
raylet_index, redirect_output=ray_params.redirect_worker_output)
ray_params.address_info["raylet_socket_names"].append(
start_raylet(
ray_params,
raylet_index,
ray_params.raylet_socket_name or get_raylet_socket_name(),
object_store_addresses[raylet_index],
num_workers=workers_per_local_scheduler[raylet_index],
stdout_file=raylet_stdout_file,
stderr_file=raylet_stderr_file,
cleanup=cleanup,
config=config))
redirect_output=ray_params.redirect_worker_output)
ray_params.address_info["raylet_socket_name"] = start_raylet(
ray_params,
ray_params.raylet_socket_name or get_raylet_socket_name(),
ray_params.address_info["object_store_address"],
num_initial_workers=num_initial_workers,
stdout_file=raylet_stdout_file,
stderr_file=raylet_stderr_file,
cleanup=cleanup,
config=config)
else:
raise Exception("JUST CHECKING IF THIS CODE GETS HIT.")
# Try to start the web UI.
if ray_params.include_webui:
@@ -1486,12 +1464,11 @@ def start_ray_node(ray_params, cleanup=True):
Args:
ray_params (ray.params.RayParams): The RayParams instance. The
following parameters could be checked: node_ip_address,
redis_address, object_manager_ports, node_manager_ports,
num_workers, num_local_schedulers, object_store_memory,
redis_password, worker_path, cleanup, redirect_worker_output,
redirect_output, resources, plasma_directory, huge_pages,
plasma_store_socket_name, raylet_socket_name, temp_dir,
_internal_config
redis_address, object_manager_port, node_manager_port,
num_workers, object_store_memory, redis_password, worker_path,
cleanup, redirect_worker_output, redirect_output, resources,
plasma_directory, huge_pages, plasma_store_socket_name,
raylet_socket_name, temp_dir, _internal_config.
cleanup (bool): If cleanup is true, then the processes started here
will be killed by services.cleanup() when the Python process that
called this method exits.
@@ -1513,14 +1490,14 @@ def start_ray_head(ray_params, cleanup=True):
Args:
ray_params (ray.params.RayParams): The RayParams instance. The
following parameters could be checked: address_info,
object_manager_ports, node_manager_ports, node_ip_address,
redis_port, redis_shard_ports, num_workers, num_local_schedulers,
object_store_memory, redis_max_memory, worker_path, cleanup,
redirect_worker_output, redirect_output,
start_workers_from_local_scheduler, resources, num_redis_shards,
redis_max_clients, redis_password, include_webui, huge_pages,
plasma_directory, autoscaling_config, plasma_store_socket_name,
raylet_socket_name, temp_dir, _internal_config
object_manager_port, node_manager_port, node_ip_address,
redis_port, redis_shard_ports, num_workers, object_store_memory,
redis_max_memory, worker_path, cleanup, redirect_worker_output,
redirect_output, start_workers_from_local_scheduler, resources,
num_redis_shards, redis_max_clients, redis_password, include_webui,
huge_pages, plasma_directory, autoscaling_config,
plasma_store_socket_name, raylet_socket_name, temp_dir,
_internal_config.
cleanup (bool): If cleanup is true, then the processes started here
will be killed by services.cleanup() when the Python process that
called this method exits.
+4 -5
View File
@@ -194,11 +194,10 @@ def new_redis_log_file(redirect_output, shard_number=None):
return redis_stdout_file, redis_stderr_file
def new_raylet_log_file(local_scheduler_index, redirect_output):
def new_raylet_log_file(redirect_output):
"""Create new logging files for raylet."""
raylet_stdout_file, raylet_stderr_file = new_log_files(
"raylet_{}".format(local_scheduler_index),
redirect_output=redirect_output)
"raylet", redirect_output=redirect_output)
return raylet_stdout_file, raylet_stderr_file
@@ -223,10 +222,10 @@ def new_log_monitor_log_file():
return log_monitor_stdout_file, log_monitor_stderr_file
def new_plasma_store_log_file(local_scheduler_index, redirect_output):
def new_plasma_store_log_file(redirect_output):
"""Create new logging files for the plasma store."""
plasma_store_stdout_file, plasma_store_stderr_file = new_log_files(
"plasma_store_{}".format(local_scheduler_index), redirect_output)
"plasma_store", redirect_output)
return plasma_store_stdout_file, plasma_store_stderr_file
+6 -8
View File
@@ -63,7 +63,7 @@ class Cluster(object):
All nodes are by default started with the following settings:
cleanup=True,
resources={"CPU": 1},
num_cpus=1,
object_store_memory=100 * (2**20) # 100 MB
Args:
@@ -74,9 +74,7 @@ class Cluster(object):
Node object of the added Ray node.
"""
node_kwargs = {
"resources": {
"CPU": 1
},
"num_cpus": 1,
"object_store_memory": 100 * (2**20) # 100 MB
}
node_kwargs.update(override_kwargs)
@@ -103,7 +101,7 @@ class Cluster(object):
node = Node(address_info, process_dict_copy)
self.worker_nodes[node] = address_info
logger.info("Starting Node with raylet socket {}".format(
address_info["raylet_socket_names"]))
address_info["raylet_socket_name"]))
return node
@@ -125,10 +123,10 @@ class Cluster(object):
assert not node.any_processes_alive(), (
"There are zombie processes left over after killing.")
def wait_for_nodes(self, retries=30):
def wait_for_nodes(self, retries=100):
"""Waits for all nodes to be registered with global state.
By default, waits for 3 seconds.
By default, waits for 10 seconds.
Args:
retries (int): Number of times to retry checking client table.
@@ -239,4 +237,4 @@ class Node(object):
Assuming one plasma store per raylet, this may be used as a unique
identifier for a node.
"""
return self.address_info['object_store_addresses'][0]
return self.address_info['object_store_address']
+3 -3
View File
@@ -30,7 +30,7 @@ def cluster_start():
initialize_head=True,
connect=True,
head_node_args={
"resources": dict(CPU=1),
"num_cpus": 1,
"_internal_config": json.dumps({
"num_heartbeats_timeout": 10
})
@@ -94,7 +94,7 @@ def test_add_remove_cluster_resources(cluster_start):
cluster = cluster_start
assert ray.global_state.cluster_resources()["CPU"] == 1
nodes = []
nodes += [cluster.add_node(resources=dict(CPU=1))]
nodes += [cluster.add_node(num_cpus=1)]
assert cluster.wait_for_nodes()
assert ray.global_state.cluster_resources()["CPU"] == 2
@@ -103,6 +103,6 @@ def test_add_remove_cluster_resources(cluster_start):
assert ray.global_state.cluster_resources()["CPU"] == 1
for i in range(5):
nodes += [cluster.add_node(resources=dict(CPU=1))]
nodes += [cluster.add_node(num_cpus=1)]
assert cluster.wait_for_nodes()
assert ray.global_state.cluster_resources()["CPU"] == 6
+14 -14
View File
@@ -30,7 +30,7 @@ def _start_new_cluster():
initialize_head=True,
connect=True,
head_node_args={
"resources": dict(CPU=1),
"num_cpus": 1,
"_internal_config": json.dumps({
"num_heartbeats_timeout": 10
})
@@ -58,7 +58,7 @@ def start_connected_emptyhead_cluster():
initialize_head=True,
connect=True,
head_node_args={
"resources": dict(CPU=0),
"num_cpus": 0,
"_internal_config": json.dumps({
"num_heartbeats_timeout": 10
})
@@ -84,7 +84,7 @@ def test_counting_resources(start_connected_cluster):
runner.add_trial(t)
runner.step() # run 1
nodes += [cluster.add_node(resources=dict(CPU=1))]
nodes += [cluster.add_node(num_cpus=1)]
assert cluster.wait_for_nodes()
assert ray.global_state.cluster_resources()["CPU"] == 2
cluster.remove_node(nodes.pop())
@@ -94,7 +94,7 @@ def test_counting_resources(start_connected_cluster):
assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 1
for i in range(5):
nodes += [cluster.add_node(resources=dict(CPU=1))]
nodes += [cluster.add_node(num_cpus=1)]
assert cluster.wait_for_nodes()
assert ray.global_state.cluster_resources()["CPU"] == 6
@@ -105,7 +105,7 @@ def test_counting_resources(start_connected_cluster):
def test_remove_node_before_result(start_connected_emptyhead_cluster):
"""Tune continues when node is removed before trial returns."""
cluster = start_connected_emptyhead_cluster
node = cluster.add_node(resources=dict(CPU=1))
node = cluster.add_node(num_cpus=1)
assert cluster.wait_for_nodes()
runner = TrialRunner(BasicVariantGenerator())
@@ -122,7 +122,7 @@ def test_remove_node_before_result(start_connected_emptyhead_cluster):
runner.step() # run 1
assert trial.status == Trial.RUNNING
cluster.remove_node(node)
cluster.add_node(resources=dict(CPU=1))
cluster.add_node(num_cpus=1)
cluster.wait_for_nodes()
assert ray.global_state.cluster_resources()["CPU"] == 1
@@ -144,7 +144,7 @@ def test_trial_migration(start_connected_emptyhead_cluster):
The trial state should also be consistent with the checkpoint.
"""
cluster = start_connected_emptyhead_cluster
node = cluster.add_node(resources=dict(CPU=1))
node = cluster.add_node(num_cpus=1)
assert cluster.wait_for_nodes()
runner = TrialRunner(BasicVariantGenerator())
@@ -162,7 +162,7 @@ def test_trial_migration(start_connected_emptyhead_cluster):
runner.step() # start
runner.step() # 1 result
assert t.last_result is not None
node2 = cluster.add_node(resources=dict(CPU=1))
node2 = cluster.add_node(num_cpus=1)
cluster.remove_node(node)
assert cluster.wait_for_nodes()
runner.step() # Recovery step
@@ -183,7 +183,7 @@ def test_trial_migration(start_connected_emptyhead_cluster):
runner.step() # 1 result
runner.step() # 2 result and checkpoint
assert t2.has_checkpoint()
node3 = cluster.add_node(resources=dict(CPU=1))
node3 = cluster.add_node(num_cpus=1)
cluster.remove_node(node2)
assert cluster.wait_for_nodes()
runner.step() # Recovery step
@@ -198,7 +198,7 @@ def test_trial_migration(start_connected_emptyhead_cluster):
runner.add_trial(t3)
runner.step() # start
runner.step() # 1 result
cluster.add_node(resources=dict(CPU=1))
cluster.add_node(num_cpus=1)
cluster.remove_node(node3)
assert cluster.wait_for_nodes()
runner.step() # Error handling step
@@ -215,7 +215,7 @@ def test_trial_migration(start_connected_emptyhead_cluster):
def test_trial_requeue(start_connected_emptyhead_cluster):
"""Removing a node in full cluster causes Trial to be requeued."""
cluster = start_connected_emptyhead_cluster
node = cluster.add_node(resources=dict(CPU=1))
node = cluster.add_node(num_cpus=1)
assert cluster.wait_for_nodes()
runner = TrialRunner(BasicVariantGenerator())
@@ -246,7 +246,7 @@ def test_trial_requeue(start_connected_emptyhead_cluster):
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster):
"""Test checks that trial restarts if checkpoint is lost w/ node fail."""
cluster = start_connected_emptyhead_cluster
node = cluster.add_node(resources=dict(CPU=1))
node = cluster.add_node(num_cpus=1)
assert cluster.wait_for_nodes()
runner = TrialRunner(BasicVariantGenerator())
@@ -265,7 +265,7 @@ def test_migration_checkpoint_removal(start_connected_emptyhead_cluster):
runner.step() # 1 result
runner.step() # 2 result and checkpoint
assert t1.has_checkpoint()
cluster.add_node(resources=dict(CPU=1))
cluster.add_node(num_cpus=1)
cluster.remove_node(node)
assert cluster.wait_for_nodes()
shutil.rmtree(os.path.dirname(t1._checkpoint.value))
@@ -280,7 +280,7 @@ def test_migration_checkpoint_removal(start_connected_emptyhead_cluster):
def test_cluster_down_simple(start_connected_cluster, tmpdir):
"""Tests that TrialRunner save/restore works on cluster shutdown."""
cluster = start_connected_cluster
cluster.add_node(resources=dict(CPU=1))
cluster.add_node(num_cpus=1)
assert cluster.wait_for_nodes()
dirpath = str(tmpdir)
+9 -62
View File
@@ -1204,17 +1204,14 @@ def get_address_info_from_redis_helper(redis_address,
if len(raylets) == 0:
raise Exception(
"Redis has started but no raylets have registered yet.")
object_store_addresses = [
ray.utils.decode(raylet.ObjectStoreSocketName()) for raylet in raylets
]
raylet_socket_names = [
ray.utils.decode(raylet.RayletSocketName()) for raylet in raylets
]
object_store_address = ray.utils.decode(raylets[0].ObjectStoreSocketName())
raylet_socket_name = ray.utils.decode(raylets[0].RayletSocketName())
return {
"node_ip_address": node_ip_address,
"redis_address": redis_address,
"object_store_addresses": object_store_addresses,
"raylet_socket_names": raylet_socket_names,
"object_store_address": object_store_address,
"raylet_socket_name": raylet_socket_name,
# Web UI should be running.
"webui_url": _webui_url_helper(redis_client)
}
@@ -1242,44 +1239,6 @@ def get_address_info_from_redis(redis_address,
counter += 1
def _normalize_resource_arguments(num_cpus, num_gpus, resources,
num_local_schedulers):
"""Stick the CPU and GPU arguments into the resources dictionary.
This also checks that the arguments are well-formed.
Args:
num_cpus: Either a number of CPUs or a list of numbers of CPUs.
num_gpus: Either a number of CPUs or a list of numbers of CPUs.
resources: Either a dictionary of resource mappings or a list of
dictionaries of resource mappings.
num_local_schedulers: The number of local schedulers.
Returns:
A list of dictionaries of resources of length num_local_schedulers.
"""
if resources is None:
resources = {}
if not isinstance(num_cpus, list):
num_cpus = num_local_schedulers * [num_cpus]
if not isinstance(num_gpus, list):
num_gpus = num_local_schedulers * [num_gpus]
if not isinstance(resources, list):
resources = num_local_schedulers * [resources]
new_resources = [r.copy() for r in resources]
for i in range(num_local_schedulers):
assert "CPU" not in new_resources[i], "Use the 'num_cpus' argument."
assert "GPU" not in new_resources[i], "Use the 'num_gpus' argument."
if num_cpus[i] is not None:
new_resources[i]["CPU"] = num_cpus[i]
if num_gpus[i] is not None:
new_resources[i]["GPU"] = num_gpus[i]
return new_resources
def _init(ray_params, driver_id=None):
"""Helper method to connect to an existing Ray cluster or start a new one.
@@ -1291,8 +1250,8 @@ def _init(ray_params, driver_id=None):
ray_params (ray.params.RayParams): The RayParams instance. The
following parameters could be checked: address_info,
start_ray_local, object_id_seed, num_workers,
num_local_schedulers, object_store_memory, redis_max_memory,
local_mode, redirect_worker_output, driver_mode, redirect_output,
object_store_memory, redis_max_memory, local_mode,
redirect_worker_output, driver_mode, redirect_output,
start_workers_from_local_scheduler, num_cpus, num_gpus, resources,
num_redis_shards, redis_max_clients, redis_password,
plasma_directory, huge_pages, include_webui, driver_id,
@@ -1333,18 +1292,9 @@ def _init(ray_params, driver_id=None):
# are already registered in address_info.
ray_params.update_if_absent(
node_ip_address=ray.services.get_node_ip_address())
# Use 1 local scheduler if num_local_schedulers is not provided. If
# existing local schedulers are provided, use that count as
# num_local_schedulers.
ray_params.update_if_absent(num_local_schedulers=1)
# Use 1 additional redis shard if num_redis_shards is not provided.
ray_params.update_if_absent(num_redis_shards=1)
# Stick the CPU and GPU resources into the resource dictionary.
ray_params.resources = _normalize_resource_arguments(
ray_params.num_cpus, ray_params.num_gpus, ray_params.resources,
ray_params.num_local_schedulers)
# Start the scheduler, object store, and some workers. These will be
# killed by the call to shutdown(), which happens when the Python
# script exits.
@@ -1356,9 +1306,6 @@ def _init(ray_params, driver_id=None):
if ray_params.num_workers is not None:
raise Exception("When connecting to an existing cluster, "
"num_workers must not be provided.")
if ray_params.num_local_schedulers is not None:
raise Exception("When connecting to an existing cluster, "
"num_local_schedulers must not be provided.")
if ray_params.num_cpus is not None or ray_params.num_gpus is not None:
raise Exception("When connecting to an existing cluster, num_cpus "
"and num_gpus must not be provided.")
@@ -1417,11 +1364,11 @@ def _init(ray_params, driver_id=None):
"node_ip_address": ray_params.node_ip_address,
"redis_address": ray_params.address_info["redis_address"],
"store_socket_name": ray_params.address_info[
"object_store_addresses"][0],
"object_store_address"],
"webui_url": ray_params.address_info["webui_url"],
}
driver_address_info["raylet_socket_name"] = (
ray_params.address_info["raylet_socket_names"][0])
ray_params.address_info["raylet_socket_name"])
# We only pass `temp_dir` to a worker (WORKER_MODE).
# It can't be a worker here.