By default, start a number of workers equal to the number of CPUs. (#430)

* By default, start a number of workers equal to the number of CPUs.

* Fix stress tests.
This commit is contained in:
Robert Nishihara
2017-04-06 00:02:58 -07:00
committed by Philipp Moritz
parent fa363a5a3a
commit 320109a5bd
7 changed files with 33 additions and 33 deletions
+10 -9
View File
@@ -3,7 +3,6 @@ from __future__ import division
from __future__ import print_function
from collections import namedtuple, OrderedDict
import multiprocessing
import os
import psutil
import random
@@ -161,7 +160,7 @@ def all_processes_alive(exclude=[]):
# an exit code of None indicates that the process is still alive.
processes_alive = [p.poll() is None for p in processes]
if (not all(processes_alive) and process_type not in exclude):
print("A process of type {} has dead.".format(process_type))
print("A process of type {} has died.".format(process_type))
return False
return True
@@ -511,10 +510,12 @@ def start_local_scheduler(redis_address,
if num_cpus is None:
# By default, use the number of hardware execution threads for the number
# of cores.
num_cpus = multiprocessing.cpu_count()
num_cpus = psutil.cpu_count()
if num_gpus is None:
# By default, assume this node has no GPUs.
num_gpus = 0
print("Starting local scheduler with {} CPUs and {} GPUs.".format(num_cpus,
num_gpus))
local_scheduler_name, p = ray.local_scheduler.start_local_scheduler(
plasma_store_name,
plasma_manager_name,
@@ -693,7 +694,7 @@ def start_monitor(redis_address, node_ip_address, stdout_file=None,
def start_ray_processes(address_info=None,
node_ip_address="127.0.0.1",
num_workers=0,
num_workers=None,
num_local_schedulers=1,
worker_path=None,
cleanup=True,
@@ -753,6 +754,11 @@ def start_ray_processes(address_info=None,
assert len(num_cpus) == num_local_schedulers
assert len(num_gpus) == num_local_schedulers
if num_workers is not None:
workers_per_local_scheduler = num_local_schedulers * [num_workers]
else:
workers_per_local_scheduler = num_local_schedulers * [psutil.cpu_count()]
if address_info is None:
address_info = {}
address_info["node_ip_address"] = node_ip_address
@@ -854,11 +860,6 @@ def start_ray_processes(address_info=None,
object_store_addresses.append(object_store_address)
time.sleep(0.1)
# Determine how many workers to start for each local scheduler.
workers_per_local_scheduler = [0] * num_local_schedulers
for i in range(num_workers):
workers_per_local_scheduler[i % num_local_schedulers] += 1
# Start any local schedulers that do not yet exist.
for i in range(len(local_scheduler_socket_names), num_local_schedulers):
# Connect the local scheduler to the object store at the same index.
+1 -3
View File
@@ -723,7 +723,7 @@ def check_connected(worker=global_worker):
if not worker.connected:
raise RayConnectionError("This command cannot be called before Ray has "
"been started. You can start Ray with "
"'ray.init(num_workers=10)'.")
"'ray.init()'.")
def print_failed_task(task_status):
@@ -956,8 +956,6 @@ def _init(address_info=None,
# Use the address 127.0.0.1 in local mode.
node_ip_address = ("127.0.0.1" if node_ip_address is None
else node_ip_address)
# Use 10 workers if num_workers is not provided.
num_workers = 10 if num_workers is None else num_workers
# Use 1 local scheduler if num_local_schedulers is not provided. If
# existing local schedulers are provided, use that count as
# num_local_schedulers.