Support concurrent Actor calls in Ray (#6053)

This commit is contained in:
Eric Liang
2019-11-04 01:14:35 -08:00
committed by GitHub
parent fbad6f543b
commit 8485304e83
21 changed files with 287 additions and 86 deletions
+36 -23
View File
@@ -6,6 +6,7 @@
from cpython.exc cimport PyErr_CheckSignals
import numpy
import threading
import time
import logging
import os
@@ -647,28 +648,34 @@ cdef CRayStatus task_execution_handler(
with gil:
try:
# The call to execute_task should never raise an exception. If it
# does, that indicates that there was an unexpected internal error.
execute_task(task_type, ray_function, c_resources, c_args,
c_arg_reference_ids, c_return_ids,
return_results_directly, returns)
except Exception:
traceback_str = traceback.format_exc() + (
"An unexpected internal error occurred while the worker was"
"executing a task.")
ray.utils.push_error_to_driver(
ray.worker.global_worker,
"worker_crash",
traceback_str,
job_id=None)
# TODO(rkn): Note that if the worker was in the middle of executing
# a task, then any worker or driver that is blocking in a get call
# and waiting for the output of that task will hang. We need to
# address this.
sys.exit(1)
try:
# The call to execute_task should never raise an exception. If
# it does, that indicates that there was an internal error.
execute_task(task_type, ray_function, c_resources, c_args,
c_arg_reference_ids, c_return_ids,
return_results_directly, returns)
except Exception:
traceback_str = traceback.format_exc() + (
"An unexpected internal error occurred while the worker "
"was executing a task.")
ray.utils.push_error_to_driver(
ray.worker.global_worker,
"worker_crash",
traceback_str,
job_id=None)
sys.exit(1)
except SystemExit:
if isinstance(threading.current_thread(), threading._MainThread):
raise
else:
# We cannot exit from a non-main thread, so return a special
# status that tells the core worker to call sys.exit() on the
# main thread instead. This only applies to direct actor calls.
return CRayStatus.SystemExit()
return CRayStatus.OK()
cdef CRayStatus check_signals() nogil:
with gil:
try:
@@ -678,6 +685,11 @@ cdef CRayStatus check_signals() nogil:
return CRayStatus.OK()
# Exit callback with no arguments and no return value. It is declared
# ``nogil`` so it can be invoked without the GIL held; it re-acquires the GIL
# and then calls ``sys.exit(0)``. It is passed to the core worker constructor
# together with ``task_execution_handler`` and ``check_signals``.
# NOTE(review): presumably the core worker invokes this on the main thread
# after a task handler returns the SystemExit status -- confirm on the C++ side.
cdef void exit_handler() nogil:
with gil:
sys.exit(0)
cdef void push_objects_into_return_vector(
py_objects,
c_vector[shared_ptr[CRayObject]] *returns):
@@ -733,7 +745,7 @@ cdef class CoreWorker:
raylet_socket.encode("ascii"), job_id.native(),
gcs_options.native()[0], log_dir.encode("utf-8"),
node_ip_address.encode("utf-8"), task_execution_handler,
check_signals))
check_signals, exit_handler))
def disconnect(self):
with nogil:
@@ -966,6 +978,7 @@ cdef class CoreWorker:
resources,
placement_resources,
c_bool is_direct_call,
int32_t max_concurrency,
c_bool is_detached):
cdef:
CRayFunction ray_function
@@ -986,9 +999,9 @@ cdef class CoreWorker:
check_status(self.core_worker.get().CreateActor(
ray_function, args_vector,
CActorCreationOptions(
max_reconstructions, is_direct_call, c_resources,
c_placement_resources, dynamic_worker_options,
is_detached),
max_reconstructions, is_direct_call, max_concurrency,
c_resources, c_placement_resources,
dynamic_worker_options, is_detached),
&c_actor_id))
return ActorID(c_actor_id.Binary())
+35 -1
View File
@@ -326,6 +326,26 @@ class ActorClass(object):
"""
return self._remote(args=args, kwargs=kwargs)
def options(self, **options):
    """Bind actor-creation options now and supply constructor args later.

    Accepts the same keyword options as ``Actor._remote()`` (everything
    except ``args``/``kwargs``) and returns a small wrapper object whose
    plain ``.remote(*args, **kwargs)`` forwards to ``_remote`` with those
    options applied.

    Examples:
        # The following two calls are equivalent.
        >>> Actor._remote(num_cpus=4, max_concurrency=8, args=[x, y])
        >>> Actor.options(num_cpus=4, max_concurrency=8).remote(x, y)
    """
    # Keep a handle on the actor class under a different name: inside the
    # wrapper's method, ``self`` refers to the wrapper instance instead.
    bound_actor_class = self

    class ActorOptionWrapper(object):
        def remote(self, *ctor_args, **ctor_kwargs):
            # Positional and keyword constructor arguments are passed
            # through as ``args``/``kwargs``; the captured options are
            # expanded as additional keyword arguments.
            return bound_actor_class._remote(
                args=ctor_args, kwargs=ctor_kwargs, **options)

    return ActorOptionWrapper()
def _remote(self,
args=None,
kwargs=None,
@@ -335,6 +355,7 @@ class ActorClass(object):
object_store_memory=None,
resources=None,
is_direct_call=None,
max_concurrency=None,
name=None,
detached=False):
"""Create an actor.
@@ -354,6 +375,8 @@ class ActorClass(object):
resources: The custom resources required by the actor creation
task.
is_direct_call: Use direct actor calls.
max_concurrency: The max number of concurrent calls to allow for
this actor. This only works with direct actor calls.
name: The globally unique name for the actor.
detached: Whether the actor should be kept alive after driver
exits.
@@ -365,6 +388,16 @@ class ActorClass(object):
args = []
if kwargs is None:
kwargs = {}
if is_direct_call is None:
is_direct_call = False
if max_concurrency is None:
max_concurrency = 1
if max_concurrency > 1 and not is_direct_call:
raise ValueError(
"setting max_concurrency requires is_direct_call=True")
if max_concurrency < 1:
raise ValueError("max_concurrency must be >= 1")
worker = ray.worker.get_global_worker()
if worker.mode is None:
@@ -452,7 +485,8 @@ class ActorClass(object):
actor_id = worker.core_worker.create_actor(
function_descriptor.get_function_descriptor_list(),
creation_args, meta.max_reconstructions, resources,
actor_placement_resources, is_direct_call, detached)
actor_placement_resources, is_direct_call, max_concurrency,
detached)
actor_handle = ActorHandle(
actor_id,
+5 -1
View File
@@ -2,7 +2,7 @@ from libcpp cimport bool as c_bool
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.string cimport string as c_string
from libc.stdint cimport uint8_t, uint64_t, int64_t
from libc.stdint cimport uint8_t, int32_t, uint64_t, int64_t
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector as c_vector
@@ -76,6 +76,9 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
@staticmethod
CRayStatus Interrupted(const c_string &msg)
@staticmethod
CRayStatus SystemExit()
c_bool ok()
c_bool IsOutOfMemory()
c_bool IsKeyError()
@@ -205,6 +208,7 @@ cdef extern from "ray/core_worker/common.h" nogil:
CActorCreationOptions()
CActorCreationOptions(
uint64_t max_reconstructions, c_bool is_direct_call,
int32_t max_concurrency,
const unordered_map[c_string, double] &resources,
const unordered_map[c_string, double] &placement_resources,
const c_vector[c_string] &dynamic_worker_options,
+2 -1
View File
@@ -64,7 +64,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
const c_vector[CObjectID] &return_ids,
c_bool is_direct_call,
c_vector[shared_ptr[CRayObject]] *returns) nogil,
CRayStatus() nogil)
CRayStatus() nogil,
void () nogil)
void Disconnect()
CWorkerType &GetWorkerType()
CLanguage &GetLanguage()
+29 -6
View File
@@ -142,14 +142,28 @@ def main():
def actor_sync():
ray.get(a.small_value.remote())
timeit("single client actor calls sync", actor_sync)
timeit("1:1 actor calls sync", actor_sync)
a = Actor.remote()
def actor_async():
ray.get([a.small_value.remote() for _ in range(1000)])
timeit("single client actor calls async", actor_async, 1000)
timeit("1:1 actor calls async", actor_async, 1000)
a = Actor.options(is_direct_call=True).remote()
def actor_concurrent():
ray.get([a.small_value.remote() for _ in range(1000)])
timeit("1:1 direct actor calls async", actor_concurrent, 1000)
a = Actor.options(is_direct_call=True, max_concurrency=16).remote()
def actor_concurrent():
ray.get([a.small_value.remote() for _ in range(1000)])
timeit("1:1 direct actor calls concurrent", actor_concurrent, 1000)
n_cpu = multiprocessing.cpu_count() // 2
a = [Actor.remote() for _ in range(n_cpu)]
@@ -161,7 +175,7 @@ def main():
def actor_multi2():
ray.get([work.remote(a) for _ in range(m)])
timeit("multi client actor calls async", actor_multi2, m * n)
timeit("n:n actor calls async", actor_multi2, m * n)
n = 5000
n_cpu = multiprocessing.cpu_count() // 2
@@ -171,15 +185,24 @@ def main():
def actor_async_direct():
ray.get(client.small_value_batch.remote(n))
timeit("single client direct actor calls async", actor_async_direct,
n * len(actors))
timeit("1:n direct actor calls async", actor_async_direct, n * len(actors))
clients = [Client.remote(a) for a in actors]
def actor_multi2_direct():
ray.get([c.small_value_batch.remote(n) for c in clients])
timeit("multi client direct actor calls async", actor_multi2_direct,
timeit("n:n direct actor calls async", actor_multi2_direct,
n * len(clients))
n = 1000
actors = [Actor._remote(is_direct_call=True) for _ in range(n_cpu)]
clients = [Client.remote(a) for a in actors]
def actor_multi2_direct_arg():
ray.get([c.small_value_batch_arg.remote(n) for c in clients])
timeit("n:n direct actor calls with arg async", actor_multi2_direct_arg,
n * len(clients))
n = 1000
+20
View File
@@ -110,6 +110,26 @@ class RemoteFunction(object):
num_gpus=num_gpus,
resources=resources)
def options(self, **options):
    """Bind task options now and supply the call arguments later.

    Accepts the same keyword options as ``func._remote()`` (everything
    except ``args``/``kwargs``) and returns a small wrapper object whose
    plain ``.remote(*args, **kwargs)`` forwards to ``_remote`` with those
    options applied.

    Examples:
        # The following two calls are equivalent.
        >>> func._remote(num_cpus=4, args=[x, y])
        >>> func.options(num_cpus=4).remote(x, y)
    """
    # Keep a handle on the remote function under a different name: inside
    # the wrapper's method, ``self`` refers to the wrapper instance instead.
    bound_remote_function = self

    class FuncWrapper(object):
        def remote(self, *call_args, **call_kwargs):
            # Call arguments are passed through as ``args``/``kwargs``; the
            # captured options are expanded as additional keyword arguments.
            return bound_remote_function._remote(
                args=call_args, kwargs=call_kwargs, **options)

    return FuncWrapper()
def _remote(self,
args=None,
kwargs=None,
+26 -1
View File
@@ -1318,6 +1318,32 @@ def test_direct_actor_recursive(ray_start_regular):
assert result == [x * 2 for x in range(100)]
def test_direct_actor_concurrent(ray_start_regular):
    """Check that a direct-call actor runs up to max_concurrency calls at once.

    Each ``add`` call blocks on a shared event until three items have been
    collected, so the three calls can only all return if they are executed
    concurrently inside the same actor.
    """

    @ray.remote
    class Batcher(object):
        def __init__(self):
            self.batch = []
            self.event = threading.Event()

        def add(self, x):
            self.batch.append(x)
            if len(self.batch) < 3:
                # Not enough items yet: wait (without timeout) for the call
                # that completes the batch to release everyone.
                self.event.wait()
            else:
                self.event.set()
            return sorted(self.batch)

    batcher = Batcher.options(is_direct_call=True, max_concurrency=3).remote()

    # Submit all three calls before fetching any result.
    first_ref = batcher.add.remote(1)
    second_ref = batcher.add.remote(2)
    third_ref = batcher.add.remote(3)

    first = ray.get(first_ref)
    second = ray.get(second_ref)
    third = ray.get(third_ref)

    # Every call must observe the complete batch.
    assert first == [1, 2, 3]
    assert first == second == third
def test_wait(ray_start_regular):
@ray.remote
def f(delay):
@@ -1516,7 +1542,6 @@ def test_profiling_api(ray_start_2_cpus):
profile_data = ray.timeline()
event_types = {event["cat"] for event in profile_data}
expected_types = [
"worker_idle",
"task",
"task:deserialize_arguments",
"task:execute",