By default, start a number of workers equal to the number of CPUs. (#430)

* By default, start a number of workers equal to the number of CPUs.

* Fix stress tests.
This commit is contained in:
Robert Nishihara
2017-04-06 00:02:58 -07:00
committed by Philipp Moritz
parent fa363a5a3a
commit 320109a5bd
7 changed files with 33 additions and 33 deletions
+6
View File
@@ -35,6 +35,12 @@ If there are GPUs available on the machine, you should specify this with the
By default, Ray will use ``psutil.cpu_count()`` to determine the number of CPUs,
and by default the number of GPUs will be zero.
Instead of thinking about the number of "worker" processes on each node, we
prefer to think in terms of the quantities of CPU and GPU resources on each
node and to provide the illusion of an infinite pool of workers. Tasks will be
assigned to workers based on the availability of resources so as to avoid
contention and not based on the number of available worker processes.
Connecting to an existing cluster
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+1 -1
View File
@@ -87,7 +87,7 @@ This can be addressed by calling ``ray.register_class(Foo)``.
import ray
ray.init(num_workers=10)
ray.init()
# Define a custom class.
class Foo(object):
+10 -9
View File
@@ -3,7 +3,6 @@ from __future__ import division
from __future__ import print_function
from collections import namedtuple, OrderedDict
import multiprocessing
import os
import psutil
import random
@@ -161,7 +160,7 @@ def all_processes_alive(exclude=[]):
# an exit code of None indicates that the process is still alive.
processes_alive = [p.poll() is None for p in processes]
if (not all(processes_alive) and process_type not in exclude):
print("A process of type {} has dead.".format(process_type))
print("A process of type {} has died.".format(process_type))
return False
return True
@@ -511,10 +510,12 @@ def start_local_scheduler(redis_address,
if num_cpus is None:
# By default, use the number of hardware execution threads for the number
# of cores.
num_cpus = multiprocessing.cpu_count()
num_cpus = psutil.cpu_count()
if num_gpus is None:
# By default, assume this node has no GPUs.
num_gpus = 0
print("Starting local scheduler with {} CPUs and {} GPUs.".format(num_cpus,
num_gpus))
local_scheduler_name, p = ray.local_scheduler.start_local_scheduler(
plasma_store_name,
plasma_manager_name,
@@ -693,7 +694,7 @@ def start_monitor(redis_address, node_ip_address, stdout_file=None,
def start_ray_processes(address_info=None,
node_ip_address="127.0.0.1",
num_workers=0,
num_workers=None,
num_local_schedulers=1,
worker_path=None,
cleanup=True,
@@ -753,6 +754,11 @@ def start_ray_processes(address_info=None,
assert len(num_cpus) == num_local_schedulers
assert len(num_gpus) == num_local_schedulers
if num_workers is not None:
workers_per_local_scheduler = num_local_schedulers * [num_workers]
else:
workers_per_local_scheduler = num_local_schedulers * [psutil.cpu_count()]
if address_info is None:
address_info = {}
address_info["node_ip_address"] = node_ip_address
@@ -854,11 +860,6 @@ def start_ray_processes(address_info=None,
object_store_addresses.append(object_store_address)
time.sleep(0.1)
# Determine how many workers to start for each local scheduler.
workers_per_local_scheduler = [0] * num_local_schedulers
for i in range(num_workers):
workers_per_local_scheduler[i % num_local_schedulers] += 1
# Start any local schedulers that do not yet exist.
for i in range(len(local_scheduler_socket_names), num_local_schedulers):
# Connect the local scheduler to the object store at the same index.
+1 -3
View File
@@ -723,7 +723,7 @@ def check_connected(worker=global_worker):
if not worker.connected:
raise RayConnectionError("This command cannot be called before Ray has "
"been started. You can start Ray with "
"'ray.init(num_workers=10)'.")
"'ray.init()'.")
def print_failed_task(task_status):
@@ -956,8 +956,6 @@ def _init(address_info=None,
# Use the address 127.0.0.1 in local mode.
node_ip_address = ("127.0.0.1" if node_ip_address is None
else node_ip_address)
# Use 1 worker if num_workers is not provided.
num_workers = 10 if num_workers is None else num_workers
# Use 1 local scheduler if num_local_schedulers is not provided. If
# existing local schedulers are provided, use that count as
# num_local_schedulers.
+10 -15
View File
@@ -8,17 +8,17 @@ import redis
import ray.services as services
parser = argparse.ArgumentParser(
description="Parse addresses for the worker to connect to.")
description="Start the Ray processes on a node.")
parser.add_argument("--node-ip-address", required=False, type=str,
help="the IP address of the worker's node")
help="the IP address of this node")
parser.add_argument("--redis-address", required=False, type=str,
help="the address to use for connecting to Redis")
parser.add_argument("--redis-port", required=False, type=str,
help="the port to use for starting Redis")
parser.add_argument("--object-manager-port", required=False, type=int,
help="the port to use for starting the object manager")
parser.add_argument("--num-workers", default=10, required=False, type=int,
help="the number of workers to start on this node")
parser.add_argument("--num-workers", required=False, type=int,
help="the initial number of workers to start on this node")
parser.add_argument("--num-cpus", required=False, type=int,
help="the number of CPUs on this node")
parser.add_argument("--num-gpus", required=False, type=int,
@@ -93,10 +93,8 @@ if __name__ == "__main__":
num_cpus=args.num_cpus,
num_gpus=args.num_gpus)
print(address_info)
print("\nStarted Ray with {} workers on this node. A different number of "
"workers can be set with the --num-workers flag (but you have to "
"first terminate the existing cluster). You can add additional "
"nodes to the cluster by calling\n\n"
print("\nStarted Ray on this node. You can add additional nodes to the "
"cluster by calling\n\n"
" ./scripts/start_ray.sh --redis-address {}\n\n"
"from the node you wish to add. You can connect a driver to the "
"cluster from Python by running\n\n"
@@ -105,8 +103,7 @@ if __name__ == "__main__":
"If you have trouble connecting from a different machine, check "
"that your firewall is configured properly. If you wish to "
"terminate the processes that have been started, run\n\n"
" ./scripts/stop_ray.sh".format(args.num_workers,
address_info["redis_address"],
" ./scripts/stop_ray.sh".format(address_info["redis_address"],
address_info["redis_address"]))
else:
# Start Ray on a non-head node.
@@ -140,8 +137,6 @@ if __name__ == "__main__":
num_cpus=args.num_cpus,
num_gpus=args.num_gpus)
print(address_info)
print("\nStarted {} workers on this node. A different number of workers "
"can be set with the --num-workers flag (but you have to first "
"terminate the existing cluster). If you wish to terminate the "
"processes that have been started, run\n\n"
" ./scripts/stop_ray.sh".format(args.num_workers))
print("\nStarted Ray on this node. If you wish to terminate the processes "
"that have been started, run\n\n"
" ./scripts/stop_ray.sh")
+4 -4
View File
@@ -21,7 +21,7 @@ class RemoteArrayTest(unittest.TestCase):
for module in [ra.core, ra.random, ra.linalg, da.core, da.random,
da.linalg]:
reload(module)
ray.init(num_workers=1)
ray.init()
# test eye
object_id = ra.eye.remote(3)
@@ -57,7 +57,7 @@ class DistributedArrayTest(unittest.TestCase):
for module in [ra.core, ra.random, ra.linalg, da.core, da.random,
da.linalg]:
reload(module)
ray.init(num_workers=1)
ray.init()
a = ra.ones.remote([da.BLOCK_SIZE, da.BLOCK_SIZE])
b = ra.zeros.remote([da.BLOCK_SIZE, da.BLOCK_SIZE])
@@ -72,8 +72,8 @@ class DistributedArrayTest(unittest.TestCase):
for module in [ra.core, ra.random, ra.linalg, da.core, da.random,
da.linalg]:
reload(module)
ray.worker._init(start_ray_local=True, num_workers=10,
num_local_schedulers=2, num_cpus=[10, 10])
ray.worker._init(start_ray_local=True, num_local_schedulers=2,
num_cpus=[10, 10])
x = da.zeros.remote([9, 25, 51], "float")
assert_equal(ray.get(da.assemble.remote(x)), np.zeros([9, 25, 51]))
+1 -1
View File
@@ -154,7 +154,7 @@ class ReconstructionTests(unittest.TestCase):
# Start the rest of the services in the Ray cluster.
ray.worker._init(address_info=address_info, start_ray_local=True,
num_workers=self.num_local_schedulers,
num_workers=1,
num_local_schedulers=self.num_local_schedulers,
num_cpus=[1] * self.num_local_schedulers,
driver_mode=ray.SILENT_MODE)