mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 11:01:06 +08:00
Support concurrent Actor calls in Ray (#6053)
This commit is contained in:
+36
-23
@@ -6,6 +6,7 @@
|
||||
from cpython.exc cimport PyErr_CheckSignals
|
||||
|
||||
import numpy
|
||||
import threading
|
||||
import time
|
||||
import logging
|
||||
import os
|
||||
@@ -647,28 +648,34 @@ cdef CRayStatus task_execution_handler(
|
||||
|
||||
with gil:
|
||||
try:
|
||||
# The call to execute_task should never raise an exception. If it
|
||||
# does, that indicates that there was an unexpected internal error.
|
||||
execute_task(task_type, ray_function, c_resources, c_args,
|
||||
c_arg_reference_ids, c_return_ids,
|
||||
return_results_directly, returns)
|
||||
except Exception:
|
||||
traceback_str = traceback.format_exc() + (
|
||||
"An unexpected internal error occurred while the worker was"
|
||||
"executing a task.")
|
||||
ray.utils.push_error_to_driver(
|
||||
ray.worker.global_worker,
|
||||
"worker_crash",
|
||||
traceback_str,
|
||||
job_id=None)
|
||||
# TODO(rkn): Note that if the worker was in the middle of executing
|
||||
# a task, then any worker or driver that is blocking in a get call
|
||||
# and waiting for the output of that task will hang. We need to
|
||||
# address this.
|
||||
sys.exit(1)
|
||||
try:
|
||||
# The call to execute_task should never raise an exception. If
|
||||
# it does, that indicates that there was an internal error.
|
||||
execute_task(task_type, ray_function, c_resources, c_args,
|
||||
c_arg_reference_ids, c_return_ids,
|
||||
return_results_directly, returns)
|
||||
except Exception:
|
||||
traceback_str = traceback.format_exc() + (
|
||||
"An unexpected internal error occurred while the worker "
|
||||
"was executing a task.")
|
||||
ray.utils.push_error_to_driver(
|
||||
ray.worker.global_worker,
|
||||
"worker_crash",
|
||||
traceback_str,
|
||||
job_id=None)
|
||||
sys.exit(1)
|
||||
except SystemExit:
|
||||
if isinstance(threading.current_thread(), threading._MainThread):
|
||||
raise
|
||||
else:
|
||||
# We cannot exit from a non-main thread, so return a special
|
||||
# status that tells the core worker to call sys.exit() on the
|
||||
# main thread instead. This only applies to direct actor calls.
|
||||
return CRayStatus.SystemExit()
|
||||
|
||||
return CRayStatus.OK()
|
||||
|
||||
|
||||
cdef CRayStatus check_signals() nogil:
|
||||
with gil:
|
||||
try:
|
||||
@@ -678,6 +685,11 @@ cdef CRayStatus check_signals() nogil:
|
||||
return CRayStatus.OK()
|
||||
|
||||
|
||||
# Exit callback for the core worker: it is passed, together with
# task_execution_handler and check_signals, when the core worker is created.
# The function is declared nogil, so it must take the GIL before it can call
# into Python; it then raises SystemExit with status 0 via sys.exit().
# NOTE(review): presumably the core worker runs this on the main thread after
# a task returned CRayStatus.SystemExit() -- confirm against the C++ side.
cdef void exit_handler() nogil:
|
||||
with gil:
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
cdef void push_objects_into_return_vector(
|
||||
py_objects,
|
||||
c_vector[shared_ptr[CRayObject]] *returns):
|
||||
@@ -733,7 +745,7 @@ cdef class CoreWorker:
|
||||
raylet_socket.encode("ascii"), job_id.native(),
|
||||
gcs_options.native()[0], log_dir.encode("utf-8"),
|
||||
node_ip_address.encode("utf-8"), task_execution_handler,
|
||||
check_signals))
|
||||
check_signals, exit_handler))
|
||||
|
||||
def disconnect(self):
|
||||
with nogil:
|
||||
@@ -966,6 +978,7 @@ cdef class CoreWorker:
|
||||
resources,
|
||||
placement_resources,
|
||||
c_bool is_direct_call,
|
||||
int32_t max_concurrency,
|
||||
c_bool is_detached):
|
||||
cdef:
|
||||
CRayFunction ray_function
|
||||
@@ -986,9 +999,9 @@ cdef class CoreWorker:
|
||||
check_status(self.core_worker.get().CreateActor(
|
||||
ray_function, args_vector,
|
||||
CActorCreationOptions(
|
||||
max_reconstructions, is_direct_call, c_resources,
|
||||
c_placement_resources, dynamic_worker_options,
|
||||
is_detached),
|
||||
max_reconstructions, is_direct_call, max_concurrency,
|
||||
c_resources, c_placement_resources,
|
||||
dynamic_worker_options, is_detached),
|
||||
&c_actor_id))
|
||||
|
||||
return ActorID(c_actor_id.Binary())
|
||||
|
||||
+35
-1
@@ -326,6 +326,26 @@ class ActorClass(object):
|
||||
"""
|
||||
return self._remote(args=args, kwargs=kwargs)
|
||||
|
||||
def options(self, **options):
    """Convenience method for creating an actor with options.

    Same arguments as Actor._remote(), but returns a wrapped actor class
    that a non-underscore .remote() can be called on.

    Args:
        **options: Keyword options forwarded unchanged to ``_remote()`` on
            every ``.remote()`` call of the returned wrapper. ``args`` and
            ``kwargs`` are rejected because the wrapper fills them in from
            the arguments given to ``.remote()``.

    Returns:
        An object whose ``remote(*args, **kwargs)`` method calls
        ``self._remote(args=args, kwargs=kwargs, **options)``.

    Raises:
        TypeError: If ``args`` or ``kwargs`` is passed as an option.

    Examples:
        # The following two calls are equivalent.
        >>> Actor._remote(num_cpus=4, max_concurrency=8, args=[x, y])
        >>> Actor.options(num_cpus=4, max_concurrency=8).remote(x, y)
    """
    # Fail here instead of inside .remote(): forwarding these keys would
    # collide with the args=/kwargs= the wrapper supplies and only show up
    # later as "got multiple values for keyword argument".
    reserved = [key for key in ("args", "kwargs") if key in options]
    if reserved:
        raise TypeError(
            "options() does not accept {}; pass the actor constructor "
            "arguments to .remote() instead".format(", ".join(reserved)))

    actor_cls = self

    class ActorOptionWrapper(object):
        def remote(self, *args, **kwargs):
            return actor_cls._remote(args=args, kwargs=kwargs, **options)

    return ActorOptionWrapper()
|
||||
|
||||
def _remote(self,
|
||||
args=None,
|
||||
kwargs=None,
|
||||
@@ -335,6 +355,7 @@ class ActorClass(object):
|
||||
object_store_memory=None,
|
||||
resources=None,
|
||||
is_direct_call=None,
|
||||
max_concurrency=None,
|
||||
name=None,
|
||||
detached=False):
|
||||
"""Create an actor.
|
||||
@@ -354,6 +375,8 @@ class ActorClass(object):
|
||||
resources: The custom resources required by the actor creation
|
||||
task.
|
||||
is_direct_call: Use direct actor calls.
|
||||
max_concurrency: The max number of concurrent calls to allow for
|
||||
this actor. This only works with direct actor calls.
|
||||
name: The globally unique name for the actor.
|
||||
detached: Whether the actor should be kept alive after driver
|
||||
exits.
|
||||
@@ -365,6 +388,16 @@ class ActorClass(object):
|
||||
args = []
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
if is_direct_call is None:
|
||||
is_direct_call = False
|
||||
if max_concurrency is None:
|
||||
max_concurrency = 1
|
||||
|
||||
if max_concurrency > 1 and not is_direct_call:
|
||||
raise ValueError(
|
||||
"setting max_concurrency requires is_direct_call=True")
|
||||
if max_concurrency < 1:
|
||||
raise ValueError("max_concurrency must be >= 1")
|
||||
|
||||
worker = ray.worker.get_global_worker()
|
||||
if worker.mode is None:
|
||||
@@ -452,7 +485,8 @@ class ActorClass(object):
|
||||
actor_id = worker.core_worker.create_actor(
|
||||
function_descriptor.get_function_descriptor_list(),
|
||||
creation_args, meta.max_reconstructions, resources,
|
||||
actor_placement_resources, is_direct_call, detached)
|
||||
actor_placement_resources, is_direct_call, max_concurrency,
|
||||
detached)
|
||||
|
||||
actor_handle = ActorHandle(
|
||||
actor_id,
|
||||
|
||||
@@ -2,7 +2,7 @@ from libcpp cimport bool as c_bool
|
||||
from libcpp.memory cimport shared_ptr, unique_ptr
|
||||
from libcpp.string cimport string as c_string
|
||||
|
||||
from libc.stdint cimport uint8_t, uint64_t, int64_t
|
||||
from libc.stdint cimport uint8_t, int32_t, uint64_t, int64_t
|
||||
from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.vector cimport vector as c_vector
|
||||
|
||||
@@ -76,6 +76,9 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
|
||||
@staticmethod
|
||||
CRayStatus Interrupted(const c_string &msg)
|
||||
|
||||
@staticmethod
|
||||
CRayStatus SystemExit()
|
||||
|
||||
c_bool ok()
|
||||
c_bool IsOutOfMemory()
|
||||
c_bool IsKeyError()
|
||||
@@ -205,6 +208,7 @@ cdef extern from "ray/core_worker/common.h" nogil:
|
||||
CActorCreationOptions()
|
||||
CActorCreationOptions(
|
||||
uint64_t max_reconstructions, c_bool is_direct_call,
|
||||
int32_t max_concurrency,
|
||||
const unordered_map[c_string, double] &resources,
|
||||
const unordered_map[c_string, double] &placement_resources,
|
||||
const c_vector[c_string] &dynamic_worker_options,
|
||||
|
||||
@@ -64,7 +64,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
const c_vector[CObjectID] &return_ids,
|
||||
c_bool is_direct_call,
|
||||
c_vector[shared_ptr[CRayObject]] *returns) nogil,
|
||||
CRayStatus() nogil)
|
||||
CRayStatus() nogil,
|
||||
void () nogil)
|
||||
void Disconnect()
|
||||
CWorkerType &GetWorkerType()
|
||||
CLanguage &GetLanguage()
|
||||
|
||||
+29
-6
@@ -142,14 +142,28 @@ def main():
|
||||
def actor_sync():
|
||||
ray.get(a.small_value.remote())
|
||||
|
||||
timeit("single client actor calls sync", actor_sync)
|
||||
timeit("1:1 actor calls sync", actor_sync)
|
||||
|
||||
a = Actor.remote()
|
||||
|
||||
def actor_async():
|
||||
ray.get([a.small_value.remote() for _ in range(1000)])
|
||||
|
||||
timeit("single client actor calls async", actor_async, 1000)
|
||||
timeit("1:1 actor calls async", actor_async, 1000)
|
||||
|
||||
a = Actor.options(is_direct_call=True).remote()
|
||||
|
||||
def actor_concurrent():
|
||||
ray.get([a.small_value.remote() for _ in range(1000)])
|
||||
|
||||
timeit("1:1 direct actor calls async", actor_concurrent, 1000)
|
||||
|
||||
a = Actor.options(is_direct_call=True, max_concurrency=16).remote()
|
||||
|
||||
def actor_concurrent():
|
||||
ray.get([a.small_value.remote() for _ in range(1000)])
|
||||
|
||||
timeit("1:1 direct actor calls concurrent", actor_concurrent, 1000)
|
||||
|
||||
n_cpu = multiprocessing.cpu_count() // 2
|
||||
a = [Actor.remote() for _ in range(n_cpu)]
|
||||
@@ -161,7 +175,7 @@ def main():
|
||||
def actor_multi2():
|
||||
ray.get([work.remote(a) for _ in range(m)])
|
||||
|
||||
timeit("multi client actor calls async", actor_multi2, m * n)
|
||||
timeit("n:n actor calls async", actor_multi2, m * n)
|
||||
|
||||
n = 5000
|
||||
n_cpu = multiprocessing.cpu_count() // 2
|
||||
@@ -171,15 +185,24 @@ def main():
|
||||
def actor_async_direct():
|
||||
ray.get(client.small_value_batch.remote(n))
|
||||
|
||||
timeit("single client direct actor calls async", actor_async_direct,
|
||||
n * len(actors))
|
||||
timeit("1:n direct actor calls async", actor_async_direct, n * len(actors))
|
||||
|
||||
clients = [Client.remote(a) for a in actors]
|
||||
|
||||
def actor_multi2_direct():
|
||||
ray.get([c.small_value_batch.remote(n) for c in clients])
|
||||
|
||||
timeit("multi client direct actor calls async", actor_multi2_direct,
|
||||
timeit("n:n direct actor calls async", actor_multi2_direct,
|
||||
n * len(clients))
|
||||
|
||||
n = 1000
|
||||
actors = [Actor._remote(is_direct_call=True) for _ in range(n_cpu)]
|
||||
clients = [Client.remote(a) for a in actors]
|
||||
|
||||
def actor_multi2_direct_arg():
|
||||
ray.get([c.small_value_batch_arg.remote(n) for c in clients])
|
||||
|
||||
timeit("n:n direct actor calls with arg async", actor_multi2_direct_arg,
|
||||
n * len(clients))
|
||||
|
||||
n = 1000
|
||||
|
||||
@@ -110,6 +110,26 @@ class RemoteFunction(object):
|
||||
num_gpus=num_gpus,
|
||||
resources=resources)
|
||||
|
||||
def options(self, **options):
    """Convenience method for executing a task with options.

    Same arguments as func._remote(), but returns a wrapped function
    that a non-underscore .remote() can be called on.

    Args:
        **options: Keyword options forwarded unchanged to ``_remote()`` on
            every ``.remote()`` call of the returned wrapper. ``args`` and
            ``kwargs`` are rejected because the wrapper fills them in from
            the arguments given to ``.remote()``.

    Returns:
        An object whose ``remote(*args, **kwargs)`` method calls
        ``self._remote(args=args, kwargs=kwargs, **options)``.

    Raises:
        TypeError: If ``args`` or ``kwargs`` is passed as an option.

    Examples:
        # The following two calls are equivalent.
        >>> func._remote(num_cpus=4, args=[x, y])
        >>> func.options(num_cpus=4).remote(x, y)
    """
    # Fail here instead of inside .remote(): forwarding these keys would
    # collide with the args=/kwargs= the wrapper supplies and only show up
    # later as "got multiple values for keyword argument".
    reserved = [key for key in ("args", "kwargs") if key in options]
    if reserved:
        raise TypeError(
            "options() does not accept {}; pass the task arguments "
            "to .remote() instead".format(", ".join(reserved)))

    func_cls = self

    class FuncWrapper(object):
        def remote(self, *args, **kwargs):
            return func_cls._remote(args=args, kwargs=kwargs, **options)

    return FuncWrapper()
|
||||
|
||||
def _remote(self,
|
||||
args=None,
|
||||
kwargs=None,
|
||||
|
||||
@@ -1318,6 +1318,32 @@ def test_direct_actor_recursive(ray_start_regular):
|
||||
assert result == [x * 2 for x in range(100)]
|
||||
|
||||
|
||||
def test_direct_actor_concurrent(ray_start_regular):
    """Three overlapping direct calls on one actor must all complete.

    The first two ``add`` calls block on an event that only the third call
    sets, so the test can finish only if the actor really runs the calls
    concurrently (max_concurrency=3).
    """

    @ray.remote
    class Batcher(object):
        def __init__(self):
            self.batch = []
            self.event = threading.Event()

        def add(self, x):
            self.batch.append(x)
            if len(self.batch) < 3:
                # Not the last arrival yet: park until the batch is full.
                self.event.wait()
            else:
                # Third arrival releases every waiting call.
                self.event.set()
            return sorted(self.batch)

    actor = Batcher.options(is_direct_call=True, max_concurrency=3).remote()

    # Submit all three calls before fetching any result, in the same order.
    pending = [actor.add.remote(value) for value in (1, 2, 3)]
    first, second, third = [ray.get(ref) for ref in pending]

    assert first == [1, 2, 3]
    assert first == second == third
|
||||
|
||||
|
||||
def test_wait(ray_start_regular):
|
||||
@ray.remote
|
||||
def f(delay):
|
||||
@@ -1516,7 +1542,6 @@ def test_profiling_api(ray_start_2_cpus):
|
||||
profile_data = ray.timeline()
|
||||
event_types = {event["cat"] for event in profile_data}
|
||||
expected_types = [
|
||||
"worker_idle",
|
||||
"task",
|
||||
"task:deserialize_arguments",
|
||||
"task:execute",
|
||||
|
||||
Reference in New Issue
Block a user