mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 17:34:51 +08:00
Remove num_local_schedulers argument from ray.worker._init. (#3704)
* Remove num_local_schedulers argument from ray.worker._init. * Fix * Fix tests.
This commit is contained in:
committed by
Philipp Moritz
parent
e78562b2e8
commit
c9d70f0dda
+22
-18
@@ -32,12 +32,8 @@ class RayParams(object):
|
||||
ignored.
|
||||
redis_shard_ports: A list of the ports to use for the non-primary Redis
|
||||
shards.
|
||||
num_cpus (int): Number of cpus the user wishes all local schedulers to
|
||||
be configured with.
|
||||
num_gpus (int): Number of gpus the user wishes all local schedulers to
|
||||
be configured with.
|
||||
num_local_schedulers (int): The number of local schedulers to start.
|
||||
This is only provided if start_ray_local is True.
|
||||
num_cpus (int): Number of CPUs to configure the raylet with.
|
||||
num_gpus (int): Number of GPUs to configure the raylet with.
|
||||
resources: A dictionary mapping the name of a resource to the quantity
|
||||
of that resource available.
|
||||
object_store_memory: The amount of memory (in bytes) to start the
|
||||
@@ -46,12 +42,8 @@ class RayParams(object):
|
||||
to use, or None for no limit. Once the limit is exceeded, redis
|
||||
will start LRU eviction of entries. This only applies to the
|
||||
sharded redis tables (task and object tables).
|
||||
object_manager_ports (list): A list of the ports to use for the object
|
||||
managers. There should be one per object manager being started on
|
||||
this node (typically just one).
|
||||
node_manager_ports (list): A list of the ports to use for the node
|
||||
managers. There should be one per node manager being started on
|
||||
this node (typically just one).
|
||||
object_manager_port (int): The port to use for the object manager.
|
||||
node_manager_port: The port to use for the node manager.
|
||||
node_ip_address (str): The IP address of the node that we are on.
|
||||
object_id_seed (int): Used to seed the deterministic generation of
|
||||
object IDs. The same value can be used across multiple runs of the
|
||||
@@ -97,14 +89,13 @@ class RayParams(object):
|
||||
redis_address=None,
|
||||
num_cpus=None,
|
||||
num_gpus=None,
|
||||
num_local_schedulers=None,
|
||||
resources=None,
|
||||
object_store_memory=None,
|
||||
redis_max_memory=None,
|
||||
redis_port=None,
|
||||
redis_shard_ports=None,
|
||||
object_manager_ports=None,
|
||||
node_manager_ports=None,
|
||||
object_manager_port=None,
|
||||
node_manager_port=None,
|
||||
node_ip_address=None,
|
||||
object_id_seed=None,
|
||||
num_workers=None,
|
||||
@@ -133,14 +124,13 @@ class RayParams(object):
|
||||
self.redis_address = redis_address
|
||||
self.num_cpus = num_cpus
|
||||
self.num_gpus = num_gpus
|
||||
self.num_local_schedulers = num_local_schedulers
|
||||
self.resources = resources
|
||||
self.object_store_memory = object_store_memory
|
||||
self.redis_max_memory = redis_max_memory
|
||||
self.redis_port = redis_port
|
||||
self.redis_shard_ports = redis_shard_ports
|
||||
self.object_manager_ports = object_manager_ports
|
||||
self.node_manager_ports = node_manager_ports
|
||||
self.object_manager_port = object_manager_port
|
||||
self.node_manager_port = node_manager_port
|
||||
self.node_ip_address = node_ip_address
|
||||
self.num_workers = num_workers
|
||||
self.local_mode = local_mode
|
||||
@@ -160,6 +150,7 @@ class RayParams(object):
|
||||
self.include_log_monitor = include_log_monitor
|
||||
self.autoscaling_config = autoscaling_config
|
||||
self._internal_config = _internal_config
|
||||
self._check_usage()
|
||||
|
||||
def update(self, **kwargs):
|
||||
"""Update the settings according to the keyword arguments.
|
||||
@@ -174,6 +165,8 @@ class RayParams(object):
|
||||
raise ValueError("Invalid RayParams parameter in"
|
||||
" update: %s" % arg)
|
||||
|
||||
self._check_usage()
|
||||
|
||||
def update_if_absent(self, **kwargs):
|
||||
"""Update the settings when the target fields are None.
|
||||
|
||||
@@ -187,3 +180,14 @@ class RayParams(object):
|
||||
else:
|
||||
raise ValueError("Invalid RayParams parameter in"
|
||||
" update_if_absent: %s" % arg)
|
||||
|
||||
self._check_usage()
|
||||
|
||||
def _check_usage(self):
|
||||
if self.resources is not None:
|
||||
assert "CPU" not in self.resources, (
|
||||
"'CPU' should not be included in the resource dictionary. Use "
|
||||
"num_cpus instead.")
|
||||
assert "GPU" not in self.resources, (
|
||||
"'GPU' should not be included in the resource dictionary. Use "
|
||||
"num_gpus instead.")
|
||||
|
||||
@@ -52,7 +52,7 @@ def create_parser(parser_creator=None):
|
||||
type=int,
|
||||
help="--num-gpus to use if starting a new cluster.")
|
||||
parser.add_argument(
|
||||
"--ray-num-local-schedulers",
|
||||
"--ray-num-nodes",
|
||||
default=None,
|
||||
type=int,
|
||||
help="Emulate multiple cluster nodes for debugging.")
|
||||
@@ -122,9 +122,9 @@ def run(args, parser):
|
||||
if not exp.get("env") and not exp.get("config", {}).get("env"):
|
||||
parser.error("the following arguments are required: --env")
|
||||
|
||||
if args.ray_num_local_schedulers:
|
||||
if args.ray_num_nodes:
|
||||
cluster = Cluster()
|
||||
for _ in range(args.ray_num_local_schedulers):
|
||||
for _ in range(args.ray_num_nodes):
|
||||
cluster.add_node(
|
||||
resources={
|
||||
"num_cpus": args.ray_num_cpus or 1,
|
||||
|
||||
@@ -231,21 +231,17 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
|
||||
" --resources='{\"CustomResource1\": 3, "
|
||||
"\"CustomReseource2\": 2}'")
|
||||
|
||||
assert "CPU" not in resources, "Use the --num-cpus argument."
|
||||
assert "GPU" not in resources, "Use the --num-gpus argument."
|
||||
if num_cpus is not None:
|
||||
resources["CPU"] = num_cpus
|
||||
if num_gpus is not None:
|
||||
resources["GPU"] = num_gpus
|
||||
ray_params = RayParams(
|
||||
node_ip_address=node_ip_address,
|
||||
object_manager_ports=[object_manager_port],
|
||||
node_manager_ports=[node_manager_port],
|
||||
object_manager_port=object_manager_port,
|
||||
node_manager_port=node_manager_port,
|
||||
num_workers=num_workers,
|
||||
object_store_memory=object_store_memory,
|
||||
redis_password=redis_password,
|
||||
redirect_worker_output=not no_redirect_worker_output,
|
||||
redirect_output=not no_redirect_output,
|
||||
num_cpus=num_cpus,
|
||||
num_gpus=num_gpus,
|
||||
resources=resources,
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages,
|
||||
|
||||
+63
-86
@@ -828,10 +828,12 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
|
||||
return webui_url
|
||||
|
||||
|
||||
def check_and_update_resources(resources):
|
||||
def check_and_update_resources(num_cpus, num_gpus, resources):
|
||||
"""Sanity check a resource dictionary and add sensible defaults.
|
||||
|
||||
Args:
|
||||
num_cpus: The number of CPUs.
|
||||
num_gpus: The number of GPUs.
|
||||
resources: A dictionary mapping resource names to resource quantities.
|
||||
|
||||
Returns:
|
||||
@@ -840,6 +842,13 @@ def check_and_update_resources(resources):
|
||||
if resources is None:
|
||||
resources = {}
|
||||
resources = resources.copy()
|
||||
assert "CPU" not in resources
|
||||
assert "GPU" not in resources
|
||||
if num_cpus is not None:
|
||||
resources["CPU"] = num_cpus
|
||||
if num_gpus is not None:
|
||||
resources["GPU"] = num_gpus
|
||||
|
||||
if "CPU" not in resources:
|
||||
# By default, use the number of hardware execution threads for the
|
||||
# number of cores.
|
||||
@@ -879,10 +888,9 @@ def check_and_update_resources(resources):
|
||||
|
||||
|
||||
def start_raylet(ray_params,
|
||||
index,
|
||||
raylet_name,
|
||||
plasma_store_name,
|
||||
num_workers=0,
|
||||
num_initial_workers=0,
|
||||
use_valgrind=False,
|
||||
use_profiler=False,
|
||||
stdout_file=None,
|
||||
@@ -894,15 +902,13 @@ def start_raylet(ray_params,
|
||||
Args:
|
||||
ray_params (ray.params.RayParams): The RayParams instance. The
|
||||
following parameters could be checked: redis_address,
|
||||
node_ip_address, worker_path, resources, object_manager_ports,
|
||||
node_manager_ports, redis_password
|
||||
index (int): Usually, this index is 0. When index > 0, it means
|
||||
starting multiple raylets locally. The index will be used in
|
||||
resources, object_manager_ports, node_manager_ports.
|
||||
node_ip_address, worker_path, resources, num_cpus, num_gpus,
|
||||
object_manager_port, node_manager_port, redis_password.
|
||||
resources, object_manager_port, node_manager_port.
|
||||
raylet_name (str): The name of the raylet socket to create.
|
||||
plasma_store_name (str): The name of the plasma store socket to connect
|
||||
to.
|
||||
num_workers (int): The number of workers to start.
|
||||
num_initial_workers (int): The number of workers to start initially.
|
||||
use_valgrind (bool): True if the raylet should be started inside
|
||||
of valgrind. If this is True, use_profiler must be False.
|
||||
use_profiler (bool): True if the raylet should be started inside
|
||||
@@ -926,7 +932,8 @@ def start_raylet(ray_params,
|
||||
if use_valgrind and use_profiler:
|
||||
raise Exception("Cannot use valgrind and profiler at the same time.")
|
||||
|
||||
static_resources = check_and_update_resources(ray_params.resources[index])
|
||||
static_resources = check_and_update_resources(
|
||||
ray_params.num_cpus, ray_params.num_gpus, ray_params.resources)
|
||||
|
||||
# Limit the number of workers that can be started in parallel by the
|
||||
# raylet. However, make sure it is at least 1.
|
||||
@@ -956,23 +963,23 @@ def start_raylet(ray_params,
|
||||
|
||||
# If the object manager port is None, then use 0 to cause the object
|
||||
# manager to choose its own port.
|
||||
if ray_params.object_manager_ports[index] is None:
|
||||
ray_params.object_manager_ports[index] = 0
|
||||
if ray_params.object_manager_port is None:
|
||||
ray_params.object_manager_port = 0
|
||||
# If the node manager port is None, then use 0 to cause the node manager
|
||||
# to choose its own port.
|
||||
if ray_params.node_manager_ports[index] is None:
|
||||
ray_params.node_manager_ports[index] = 0
|
||||
if ray_params.node_manager_port is None:
|
||||
ray_params.node_manager_port = 0
|
||||
|
||||
command = [
|
||||
RAYLET_EXECUTABLE,
|
||||
raylet_name,
|
||||
plasma_store_name,
|
||||
str(ray_params.object_manager_ports[index]),
|
||||
str(ray_params.node_manager_ports[index]),
|
||||
str(ray_params.object_manager_port),
|
||||
str(ray_params.node_manager_port),
|
||||
ray_params.node_ip_address,
|
||||
gcs_ip_address,
|
||||
gcs_port,
|
||||
str(num_workers),
|
||||
str(num_initial_workers),
|
||||
str(maximum_startup_concurrency),
|
||||
resource_argument,
|
||||
config_str,
|
||||
@@ -1289,9 +1296,8 @@ def start_ray_processes(ray_params, cleanup=True):
|
||||
Args:
|
||||
ray_params (ray.params.RayParams): The RayParams instance. The
|
||||
following parameters will be set to default values if they are None:
|
||||
node_ip_address("127.0.0.1"), num_local_schedulers(1),
|
||||
include_webui(False), worker_path(path of default_worker.py),
|
||||
include_log_monitor(False)
|
||||
node_ip_address("127.0.0.1"), include_webui(False),
|
||||
worker_path(path of default_worker.py), include_log_monitor(False)
|
||||
cleanup (bool): If cleanup is true, then the processes started here
|
||||
will be killed by services.cleanup() when the Python process that
|
||||
called this method exits.
|
||||
@@ -1312,23 +1318,16 @@ def start_ray_processes(ray_params, cleanup=True):
|
||||
ray_params.update_if_absent(
|
||||
include_log_monitor=False,
|
||||
resources={},
|
||||
num_local_schedulers=1,
|
||||
include_webui=False,
|
||||
node_ip_address="127.0.0.1")
|
||||
if not isinstance(ray_params.resources, list):
|
||||
ray_params.resources = ray_params.num_local_schedulers * [
|
||||
ray_params.resources
|
||||
]
|
||||
|
||||
if ray_params.num_workers is not None:
|
||||
raise Exception("The 'num_workers' argument is deprecated. Please use "
|
||||
"'num_cpus' instead.")
|
||||
else:
|
||||
workers_per_local_scheduler = []
|
||||
for resource_dict in ray_params.resources:
|
||||
cpus = resource_dict.get("CPU")
|
||||
workers_per_local_scheduler.append(cpus if cpus is not None else
|
||||
multiprocessing.cpu_count())
|
||||
num_initial_workers = (ray_params.num_cpus
|
||||
if ray_params.num_cpus is not None else
|
||||
multiprocessing.cpu_count())
|
||||
|
||||
ray_params.update_if_absent(
|
||||
address_info={},
|
||||
@@ -1402,37 +1401,16 @@ def start_ray_processes(ray_params, cleanup=True):
|
||||
redis_password=ray_params.redis_password)
|
||||
|
||||
# Initialize with existing services.
|
||||
if "object_store_addresses" not in ray_params.address_info:
|
||||
ray_params.address_info["object_store_addresses"] = []
|
||||
object_store_addresses = ray_params.address_info["object_store_addresses"]
|
||||
if "raylet_socket_names" not in ray_params.address_info:
|
||||
ray_params.address_info["raylet_socket_names"] = []
|
||||
raylet_socket_names = ray_params.address_info["raylet_socket_names"]
|
||||
object_store_address = ray_params.address_info.get("object_store_address")
|
||||
raylet_socket_name = ray_params.address_info.get("raylet_socket_name")
|
||||
|
||||
# Get the ports to use for the object managers if any are provided.
|
||||
if not isinstance(ray_params.object_manager_ports, list):
|
||||
assert (ray_params.object_manager_ports is None
|
||||
or ray_params.num_local_schedulers == 1)
|
||||
ray_params.object_manager_ports = (ray_params.num_local_schedulers *
|
||||
[ray_params.object_manager_ports])
|
||||
assert len(
|
||||
ray_params.object_manager_ports) == ray_params.num_local_schedulers
|
||||
if not isinstance(ray_params.node_manager_ports, list):
|
||||
assert (ray_params.node_manager_ports is None
|
||||
or ray_params.num_local_schedulers == 1)
|
||||
ray_params.node_manager_ports = (
|
||||
ray_params.num_local_schedulers * [ray_params.node_manager_ports])
|
||||
assert len(
|
||||
ray_params.node_manager_ports) == ray_params.num_local_schedulers
|
||||
|
||||
# Start any object stores that do not yet exist.
|
||||
for i in range(ray_params.num_local_schedulers -
|
||||
len(object_store_addresses)):
|
||||
# Start an object store if it does not yet exist.
|
||||
if object_store_address is None:
|
||||
# Start Plasma.
|
||||
plasma_store_stdout_file, plasma_store_stderr_file = (
|
||||
new_plasma_store_log_file(i, ray_params.redirect_output))
|
||||
new_plasma_store_log_file(ray_params.redirect_output))
|
||||
|
||||
object_store_address = start_plasma_store(
|
||||
ray_params.address_info["object_store_address"] = start_plasma_store(
|
||||
ray_params.node_ip_address,
|
||||
ray_params.redis_address,
|
||||
store_stdout_file=plasma_store_stdout_file,
|
||||
@@ -1443,25 +1421,25 @@ def start_ray_processes(ray_params, cleanup=True):
|
||||
huge_pages=ray_params.huge_pages,
|
||||
plasma_store_socket_name=ray_params.plasma_store_socket_name,
|
||||
redis_password=ray_params.redis_password)
|
||||
object_store_addresses.append(object_store_address)
|
||||
time.sleep(0.1)
|
||||
else:
|
||||
raise Exception("JUST CHECKING IF THIS CODE GETS HIT.")
|
||||
|
||||
# Start any raylets that do not exist yet.
|
||||
for raylet_index in range(
|
||||
len(raylet_socket_names), ray_params.num_local_schedulers):
|
||||
if raylet_socket_name is None:
|
||||
raylet_stdout_file, raylet_stderr_file = new_raylet_log_file(
|
||||
raylet_index, redirect_output=ray_params.redirect_worker_output)
|
||||
ray_params.address_info["raylet_socket_names"].append(
|
||||
start_raylet(
|
||||
ray_params,
|
||||
raylet_index,
|
||||
ray_params.raylet_socket_name or get_raylet_socket_name(),
|
||||
object_store_addresses[raylet_index],
|
||||
num_workers=workers_per_local_scheduler[raylet_index],
|
||||
stdout_file=raylet_stdout_file,
|
||||
stderr_file=raylet_stderr_file,
|
||||
cleanup=cleanup,
|
||||
config=config))
|
||||
redirect_output=ray_params.redirect_worker_output)
|
||||
ray_params.address_info["raylet_socket_name"] = start_raylet(
|
||||
ray_params,
|
||||
ray_params.raylet_socket_name or get_raylet_socket_name(),
|
||||
ray_params.address_info["object_store_address"],
|
||||
num_initial_workers=num_initial_workers,
|
||||
stdout_file=raylet_stdout_file,
|
||||
stderr_file=raylet_stderr_file,
|
||||
cleanup=cleanup,
|
||||
config=config)
|
||||
else:
|
||||
raise Exception("JUST CHECKING IF THIS CODE GETS HIT.")
|
||||
|
||||
# Try to start the web UI.
|
||||
if ray_params.include_webui:
|
||||
@@ -1486,12 +1464,11 @@ def start_ray_node(ray_params, cleanup=True):
|
||||
Args:
|
||||
ray_params (ray.params.RayParams): The RayParams instance. The
|
||||
following parameters could be checked: node_ip_address,
|
||||
redis_address, object_manager_ports, node_manager_ports,
|
||||
num_workers, num_local_schedulers, object_store_memory,
|
||||
redis_password, worker_path, cleanup, redirect_worker_output,
|
||||
redirect_output, resources, plasma_directory, huge_pages,
|
||||
plasma_store_socket_name, raylet_socket_name, temp_dir,
|
||||
_internal_config
|
||||
redis_address, object_manager_port, node_manager_port,
|
||||
num_workers, object_store_memory, redis_password, worker_path,
|
||||
cleanup, redirect_worker_output, redirect_output, resources,
|
||||
plasma_directory, huge_pages, plasma_store_socket_name,
|
||||
raylet_socket_name, temp_dir, _internal_config.
|
||||
cleanup (bool): If cleanup is true, then the processes started here
|
||||
will be killed by services.cleanup() when the Python process that
|
||||
called this method exits.
|
||||
@@ -1513,14 +1490,14 @@ def start_ray_head(ray_params, cleanup=True):
|
||||
Args:
|
||||
ray_params (ray.params.RayParams): The RayParams instance. The
|
||||
following parameters could be checked: address_info,
|
||||
object_manager_ports, node_manager_ports, node_ip_address,
|
||||
redis_port, redis_shard_ports, num_workers, num_local_schedulers,
|
||||
object_store_memory, redis_max_memory, worker_path, cleanup,
|
||||
redirect_worker_output, redirect_output,
|
||||
start_workers_from_local_scheduler, resources, num_redis_shards,
|
||||
redis_max_clients, redis_password, include_webui, huge_pages,
|
||||
plasma_directory, autoscaling_config, plasma_store_socket_name,
|
||||
raylet_socket_name, temp_dir, _internal_config
|
||||
object_manager_port, node_manager_port, node_ip_address,
|
||||
redis_port, redis_shard_ports, num_workers, object_store_memory,
|
||||
redis_max_memory, worker_path, cleanup, redirect_worker_output,
|
||||
redirect_output, start_workers_from_local_scheduler, resources,
|
||||
num_redis_shards, redis_max_clients, redis_password, include_webui,
|
||||
huge_pages, plasma_directory, autoscaling_config,
|
||||
plasma_store_socket_name, raylet_socket_name, temp_dir,
|
||||
_internal_config.
|
||||
cleanup (bool): If cleanup is true, then the processes started here
|
||||
will be killed by services.cleanup() when the Python process that
|
||||
called this method exits.
|
||||
|
||||
@@ -194,11 +194,10 @@ def new_redis_log_file(redirect_output, shard_number=None):
|
||||
return redis_stdout_file, redis_stderr_file
|
||||
|
||||
|
||||
def new_raylet_log_file(local_scheduler_index, redirect_output):
|
||||
def new_raylet_log_file(redirect_output):
|
||||
"""Create new logging files for raylet."""
|
||||
raylet_stdout_file, raylet_stderr_file = new_log_files(
|
||||
"raylet_{}".format(local_scheduler_index),
|
||||
redirect_output=redirect_output)
|
||||
"raylet", redirect_output=redirect_output)
|
||||
return raylet_stdout_file, raylet_stderr_file
|
||||
|
||||
|
||||
@@ -223,10 +222,10 @@ def new_log_monitor_log_file():
|
||||
return log_monitor_stdout_file, log_monitor_stderr_file
|
||||
|
||||
|
||||
def new_plasma_store_log_file(local_scheduler_index, redirect_output):
|
||||
def new_plasma_store_log_file(redirect_output):
|
||||
"""Create new logging files for the plasma store."""
|
||||
plasma_store_stdout_file, plasma_store_stderr_file = new_log_files(
|
||||
"plasma_store_{}".format(local_scheduler_index), redirect_output)
|
||||
"plasma_store", redirect_output)
|
||||
return plasma_store_stdout_file, plasma_store_stderr_file
|
||||
|
||||
|
||||
|
||||
@@ -63,7 +63,7 @@ class Cluster(object):
|
||||
|
||||
All nodes are by default started with the following settings:
|
||||
cleanup=True,
|
||||
resources={"CPU": 1},
|
||||
num_cpus=1,
|
||||
object_store_memory=100 * (2**20) # 100 MB
|
||||
|
||||
Args:
|
||||
@@ -74,9 +74,7 @@ class Cluster(object):
|
||||
Node object of the added Ray node.
|
||||
"""
|
||||
node_kwargs = {
|
||||
"resources": {
|
||||
"CPU": 1
|
||||
},
|
||||
"num_cpus": 1,
|
||||
"object_store_memory": 100 * (2**20) # 100 MB
|
||||
}
|
||||
node_kwargs.update(override_kwargs)
|
||||
@@ -103,7 +101,7 @@ class Cluster(object):
|
||||
node = Node(address_info, process_dict_copy)
|
||||
self.worker_nodes[node] = address_info
|
||||
logger.info("Starting Node with raylet socket {}".format(
|
||||
address_info["raylet_socket_names"]))
|
||||
address_info["raylet_socket_name"]))
|
||||
|
||||
return node
|
||||
|
||||
@@ -125,10 +123,10 @@ class Cluster(object):
|
||||
assert not node.any_processes_alive(), (
|
||||
"There are zombie processes left over after killing.")
|
||||
|
||||
def wait_for_nodes(self, retries=30):
|
||||
def wait_for_nodes(self, retries=100):
|
||||
"""Waits for all nodes to be registered with global state.
|
||||
|
||||
By default, waits for 3 seconds.
|
||||
By default, waits for 10 seconds.
|
||||
|
||||
Args:
|
||||
retries (int): Number of times to retry checking client table.
|
||||
@@ -239,4 +237,4 @@ class Node(object):
|
||||
Assuming one plasma store per raylet, this may be used as a unique
|
||||
identifier for a node.
|
||||
"""
|
||||
return self.address_info['object_store_addresses'][0]
|
||||
return self.address_info['object_store_address']
|
||||
|
||||
@@ -30,7 +30,7 @@ def cluster_start():
|
||||
initialize_head=True,
|
||||
connect=True,
|
||||
head_node_args={
|
||||
"resources": dict(CPU=1),
|
||||
"num_cpus": 1,
|
||||
"_internal_config": json.dumps({
|
||||
"num_heartbeats_timeout": 10
|
||||
})
|
||||
@@ -94,7 +94,7 @@ def test_add_remove_cluster_resources(cluster_start):
|
||||
cluster = cluster_start
|
||||
assert ray.global_state.cluster_resources()["CPU"] == 1
|
||||
nodes = []
|
||||
nodes += [cluster.add_node(resources=dict(CPU=1))]
|
||||
nodes += [cluster.add_node(num_cpus=1)]
|
||||
assert cluster.wait_for_nodes()
|
||||
assert ray.global_state.cluster_resources()["CPU"] == 2
|
||||
|
||||
@@ -103,6 +103,6 @@ def test_add_remove_cluster_resources(cluster_start):
|
||||
assert ray.global_state.cluster_resources()["CPU"] == 1
|
||||
|
||||
for i in range(5):
|
||||
nodes += [cluster.add_node(resources=dict(CPU=1))]
|
||||
nodes += [cluster.add_node(num_cpus=1)]
|
||||
assert cluster.wait_for_nodes()
|
||||
assert ray.global_state.cluster_resources()["CPU"] == 6
|
||||
|
||||
@@ -30,7 +30,7 @@ def _start_new_cluster():
|
||||
initialize_head=True,
|
||||
connect=True,
|
||||
head_node_args={
|
||||
"resources": dict(CPU=1),
|
||||
"num_cpus": 1,
|
||||
"_internal_config": json.dumps({
|
||||
"num_heartbeats_timeout": 10
|
||||
})
|
||||
@@ -58,7 +58,7 @@ def start_connected_emptyhead_cluster():
|
||||
initialize_head=True,
|
||||
connect=True,
|
||||
head_node_args={
|
||||
"resources": dict(CPU=0),
|
||||
"num_cpus": 0,
|
||||
"_internal_config": json.dumps({
|
||||
"num_heartbeats_timeout": 10
|
||||
})
|
||||
@@ -84,7 +84,7 @@ def test_counting_resources(start_connected_cluster):
|
||||
runner.add_trial(t)
|
||||
|
||||
runner.step() # run 1
|
||||
nodes += [cluster.add_node(resources=dict(CPU=1))]
|
||||
nodes += [cluster.add_node(num_cpus=1)]
|
||||
assert cluster.wait_for_nodes()
|
||||
assert ray.global_state.cluster_resources()["CPU"] == 2
|
||||
cluster.remove_node(nodes.pop())
|
||||
@@ -94,7 +94,7 @@ def test_counting_resources(start_connected_cluster):
|
||||
assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 1
|
||||
|
||||
for i in range(5):
|
||||
nodes += [cluster.add_node(resources=dict(CPU=1))]
|
||||
nodes += [cluster.add_node(num_cpus=1)]
|
||||
assert cluster.wait_for_nodes()
|
||||
assert ray.global_state.cluster_resources()["CPU"] == 6
|
||||
|
||||
@@ -105,7 +105,7 @@ def test_counting_resources(start_connected_cluster):
|
||||
def test_remove_node_before_result(start_connected_emptyhead_cluster):
|
||||
"""Tune continues when node is removed before trial returns."""
|
||||
cluster = start_connected_emptyhead_cluster
|
||||
node = cluster.add_node(resources=dict(CPU=1))
|
||||
node = cluster.add_node(num_cpus=1)
|
||||
assert cluster.wait_for_nodes()
|
||||
|
||||
runner = TrialRunner(BasicVariantGenerator())
|
||||
@@ -122,7 +122,7 @@ def test_remove_node_before_result(start_connected_emptyhead_cluster):
|
||||
runner.step() # run 1
|
||||
assert trial.status == Trial.RUNNING
|
||||
cluster.remove_node(node)
|
||||
cluster.add_node(resources=dict(CPU=1))
|
||||
cluster.add_node(num_cpus=1)
|
||||
cluster.wait_for_nodes()
|
||||
assert ray.global_state.cluster_resources()["CPU"] == 1
|
||||
|
||||
@@ -144,7 +144,7 @@ def test_trial_migration(start_connected_emptyhead_cluster):
|
||||
The trial state should also be consistent with the checkpoint.
|
||||
"""
|
||||
cluster = start_connected_emptyhead_cluster
|
||||
node = cluster.add_node(resources=dict(CPU=1))
|
||||
node = cluster.add_node(num_cpus=1)
|
||||
assert cluster.wait_for_nodes()
|
||||
|
||||
runner = TrialRunner(BasicVariantGenerator())
|
||||
@@ -162,7 +162,7 @@ def test_trial_migration(start_connected_emptyhead_cluster):
|
||||
runner.step() # start
|
||||
runner.step() # 1 result
|
||||
assert t.last_result is not None
|
||||
node2 = cluster.add_node(resources=dict(CPU=1))
|
||||
node2 = cluster.add_node(num_cpus=1)
|
||||
cluster.remove_node(node)
|
||||
assert cluster.wait_for_nodes()
|
||||
runner.step() # Recovery step
|
||||
@@ -183,7 +183,7 @@ def test_trial_migration(start_connected_emptyhead_cluster):
|
||||
runner.step() # 1 result
|
||||
runner.step() # 2 result and checkpoint
|
||||
assert t2.has_checkpoint()
|
||||
node3 = cluster.add_node(resources=dict(CPU=1))
|
||||
node3 = cluster.add_node(num_cpus=1)
|
||||
cluster.remove_node(node2)
|
||||
assert cluster.wait_for_nodes()
|
||||
runner.step() # Recovery step
|
||||
@@ -198,7 +198,7 @@ def test_trial_migration(start_connected_emptyhead_cluster):
|
||||
runner.add_trial(t3)
|
||||
runner.step() # start
|
||||
runner.step() # 1 result
|
||||
cluster.add_node(resources=dict(CPU=1))
|
||||
cluster.add_node(num_cpus=1)
|
||||
cluster.remove_node(node3)
|
||||
assert cluster.wait_for_nodes()
|
||||
runner.step() # Error handling step
|
||||
@@ -215,7 +215,7 @@ def test_trial_migration(start_connected_emptyhead_cluster):
|
||||
def test_trial_requeue(start_connected_emptyhead_cluster):
|
||||
"""Removing a node in full cluster causes Trial to be requeued."""
|
||||
cluster = start_connected_emptyhead_cluster
|
||||
node = cluster.add_node(resources=dict(CPU=1))
|
||||
node = cluster.add_node(num_cpus=1)
|
||||
assert cluster.wait_for_nodes()
|
||||
|
||||
runner = TrialRunner(BasicVariantGenerator())
|
||||
@@ -246,7 +246,7 @@ def test_trial_requeue(start_connected_emptyhead_cluster):
|
||||
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster):
|
||||
"""Test checks that trial restarts if checkpoint is lost w/ node fail."""
|
||||
cluster = start_connected_emptyhead_cluster
|
||||
node = cluster.add_node(resources=dict(CPU=1))
|
||||
node = cluster.add_node(num_cpus=1)
|
||||
assert cluster.wait_for_nodes()
|
||||
|
||||
runner = TrialRunner(BasicVariantGenerator())
|
||||
@@ -265,7 +265,7 @@ def test_migration_checkpoint_removal(start_connected_emptyhead_cluster):
|
||||
runner.step() # 1 result
|
||||
runner.step() # 2 result and checkpoint
|
||||
assert t1.has_checkpoint()
|
||||
cluster.add_node(resources=dict(CPU=1))
|
||||
cluster.add_node(num_cpus=1)
|
||||
cluster.remove_node(node)
|
||||
assert cluster.wait_for_nodes()
|
||||
shutil.rmtree(os.path.dirname(t1._checkpoint.value))
|
||||
@@ -280,7 +280,7 @@ def test_migration_checkpoint_removal(start_connected_emptyhead_cluster):
|
||||
def test_cluster_down_simple(start_connected_cluster, tmpdir):
|
||||
"""Tests that TrialRunner save/restore works on cluster shutdown."""
|
||||
cluster = start_connected_cluster
|
||||
cluster.add_node(resources=dict(CPU=1))
|
||||
cluster.add_node(num_cpus=1)
|
||||
assert cluster.wait_for_nodes()
|
||||
|
||||
dirpath = str(tmpdir)
|
||||
|
||||
+9
-62
@@ -1204,17 +1204,14 @@ def get_address_info_from_redis_helper(redis_address,
|
||||
if len(raylets) == 0:
|
||||
raise Exception(
|
||||
"Redis has started but no raylets have registered yet.")
|
||||
object_store_addresses = [
|
||||
ray.utils.decode(raylet.ObjectStoreSocketName()) for raylet in raylets
|
||||
]
|
||||
raylet_socket_names = [
|
||||
ray.utils.decode(raylet.RayletSocketName()) for raylet in raylets
|
||||
]
|
||||
|
||||
object_store_address = ray.utils.decode(raylets[0].ObjectStoreSocketName())
|
||||
raylet_socket_name = ray.utils.decode(raylets[0].RayletSocketName())
|
||||
return {
|
||||
"node_ip_address": node_ip_address,
|
||||
"redis_address": redis_address,
|
||||
"object_store_addresses": object_store_addresses,
|
||||
"raylet_socket_names": raylet_socket_names,
|
||||
"object_store_address": object_store_address,
|
||||
"raylet_socket_name": raylet_socket_name,
|
||||
# Web UI should be running.
|
||||
"webui_url": _webui_url_helper(redis_client)
|
||||
}
|
||||
@@ -1242,44 +1239,6 @@ def get_address_info_from_redis(redis_address,
|
||||
counter += 1
|
||||
|
||||
|
||||
def _normalize_resource_arguments(num_cpus, num_gpus, resources,
|
||||
num_local_schedulers):
|
||||
"""Stick the CPU and GPU arguments into the resources dictionary.
|
||||
|
||||
This also checks that the arguments are well-formed.
|
||||
|
||||
Args:
|
||||
num_cpus: Either a number of CPUs or a list of numbers of CPUs.
|
||||
num_gpus: Either a number of GPUs or a list of numbers of GPUs.
|
||||
resources: Either a dictionary of resource mappings or a list of
|
||||
dictionaries of resource mappings.
|
||||
num_local_schedulers: The number of local schedulers.
|
||||
|
||||
Returns:
|
||||
A list of dictionaries of resources of length num_local_schedulers.
|
||||
"""
|
||||
if resources is None:
|
||||
resources = {}
|
||||
if not isinstance(num_cpus, list):
|
||||
num_cpus = num_local_schedulers * [num_cpus]
|
||||
if not isinstance(num_gpus, list):
|
||||
num_gpus = num_local_schedulers * [num_gpus]
|
||||
if not isinstance(resources, list):
|
||||
resources = num_local_schedulers * [resources]
|
||||
|
||||
new_resources = [r.copy() for r in resources]
|
||||
|
||||
for i in range(num_local_schedulers):
|
||||
assert "CPU" not in new_resources[i], "Use the 'num_cpus' argument."
|
||||
assert "GPU" not in new_resources[i], "Use the 'num_gpus' argument."
|
||||
if num_cpus[i] is not None:
|
||||
new_resources[i]["CPU"] = num_cpus[i]
|
||||
if num_gpus[i] is not None:
|
||||
new_resources[i]["GPU"] = num_gpus[i]
|
||||
|
||||
return new_resources
|
||||
|
||||
|
||||
def _init(ray_params, driver_id=None):
|
||||
"""Helper method to connect to an existing Ray cluster or start a new one.
|
||||
|
||||
@@ -1291,8 +1250,8 @@ def _init(ray_params, driver_id=None):
|
||||
ray_params (ray.params.RayParams): The RayParams instance. The
|
||||
following parameters could be checked: address_info,
|
||||
start_ray_local, object_id_seed, num_workers,
|
||||
num_local_schedulers, object_store_memory, redis_max_memory,
|
||||
local_mode, redirect_worker_output, driver_mode, redirect_output,
|
||||
object_store_memory, redis_max_memory, local_mode,
|
||||
redirect_worker_output, driver_mode, redirect_output,
|
||||
start_workers_from_local_scheduler, num_cpus, num_gpus, resources,
|
||||
num_redis_shards, redis_max_clients, redis_password,
|
||||
plasma_directory, huge_pages, include_webui, driver_id,
|
||||
@@ -1333,18 +1292,9 @@ def _init(ray_params, driver_id=None):
|
||||
# are already registered in address_info.
|
||||
ray_params.update_if_absent(
|
||||
node_ip_address=ray.services.get_node_ip_address())
|
||||
# Use 1 local scheduler if num_local_schedulers is not provided. If
|
||||
# existing local schedulers are provided, use that count as
|
||||
# num_local_schedulers.
|
||||
ray_params.update_if_absent(num_local_schedulers=1)
|
||||
# Use 1 additional redis shard if num_redis_shards is not provided.
|
||||
ray_params.update_if_absent(num_redis_shards=1)
|
||||
|
||||
# Stick the CPU and GPU resources into the resource dictionary.
|
||||
ray_params.resources = _normalize_resource_arguments(
|
||||
ray_params.num_cpus, ray_params.num_gpus, ray_params.resources,
|
||||
ray_params.num_local_schedulers)
|
||||
|
||||
# Start the scheduler, object store, and some workers. These will be
|
||||
# killed by the call to shutdown(), which happens when the Python
|
||||
# script exits.
|
||||
@@ -1356,9 +1306,6 @@ def _init(ray_params, driver_id=None):
|
||||
if ray_params.num_workers is not None:
|
||||
raise Exception("When connecting to an existing cluster, "
|
||||
"num_workers must not be provided.")
|
||||
if ray_params.num_local_schedulers is not None:
|
||||
raise Exception("When connecting to an existing cluster, "
|
||||
"num_local_schedulers must not be provided.")
|
||||
if ray_params.num_cpus is not None or ray_params.num_gpus is not None:
|
||||
raise Exception("When connecting to an existing cluster, num_cpus "
|
||||
"and num_gpus must not be provided.")
|
||||
@@ -1417,11 +1364,11 @@ def _init(ray_params, driver_id=None):
|
||||
"node_ip_address": ray_params.node_ip_address,
|
||||
"redis_address": ray_params.address_info["redis_address"],
|
||||
"store_socket_name": ray_params.address_info[
|
||||
"object_store_addresses"][0],
|
||||
"object_store_address"],
|
||||
"webui_url": ray_params.address_info["webui_url"],
|
||||
}
|
||||
driver_address_info["raylet_socket_name"] = (
|
||||
ray_params.address_info["raylet_socket_names"][0])
|
||||
ray_params.address_info["raylet_socket_name"])
|
||||
|
||||
# We only pass `temp_dir` to a worker (WORKER_MODE).
|
||||
# It can't be a worker here.
|
||||
|
||||
Reference in New Issue
Block a user