[core worker] Submit Python actor tasks through core worker (#5750)

* Submit actor tasks through core worker
* Fix java
* add comment
* Remove task builder
* Check negative
* Increase -> Increment
* pass by reference
* fix signal
* Clean up c++ actor handle
* more cleanup
* Clean up headers
* Fix unique_ptr construction
* Fix java
* Move profiling to c++
* dedup
* fix error
* comments
* fix java
* Fix tests
* wait for actor to exit
* Start after constructor
* ignore java build
* fix comment
* always init logging
* Fix logging
* fix logging issue
* shared_ptr for profiler
* DEBUG -> WARNING
* fix killed_ init
* Fix flaky checkpointing tests
* -v flag for tune tests
* Fix checkpoint test logic
* Fix exception matching
* timeout exception
* Fix test exception info
* Fix import
* fix build
* Fix test
* shared_ptr
2026-07-02 01:37:40 +08:00 · 2019-10-07 15:42:19 -07:00
parent 04e997fe0d
commit 08e4e3a153
24 changed files with 659 additions and 888 deletions
@@ -3,76 +3,24 @@ from __future__ import division
 from __future__ import print_function

 import copy
-import hashlib
 import inspect
 import logging
 import six
 import sys
-import threading

 from abc import ABCMeta, abstractmethod
 from collections import namedtuple

 from ray.function_manager import FunctionDescriptor
 import ray.ray_constants as ray_constants
+import ray._raylet
 import ray.signature as signature
 import ray.worker
-from ray import (ObjectID, ActorID, ActorHandleID, ActorClassID, TaskID)
+from ray import ActorID, ActorHandleID, ActorClassID, profiling

 logger = logging.getLogger(__name__)


-def compute_actor_handle_id(actor_handle_id, num_forks):
-    """Deterministically compute an actor handle ID.
-
-    A new actor handle ID is generated when it is forked from another actor
-    handle. The new handle ID is computed as hash(old_handle_id || num_forks).
-
-    Args:
-        actor_handle_id (common.ObjectID): The original actor handle ID.
-        num_forks: The number of times the original actor handle has been
-                   forked so far.
-
-    Returns:
-        An ID for the new actor handle.
-    """
-    assert isinstance(actor_handle_id, ActorHandleID)
-    handle_id_hash = hashlib.sha1()
-    handle_id_hash.update(actor_handle_id.binary())
-    handle_id_hash.update(str(num_forks).encode("ascii"))
-    handle_id = handle_id_hash.digest()
-    return ActorHandleID(handle_id)
-
-
-def compute_actor_handle_id_non_forked(actor_handle_id, current_task_id):
-    """Deterministically compute an actor handle ID in the non-forked case.
-
-    This code path is used whenever an actor handle is pickled and unpickled
-    (for example, if a remote function closes over an actor handle). Then,
-    whenever the actor handle is used, a new actor handle ID will be generated
-    on the fly as a deterministic function of the actor ID, the previous actor
-    handle ID and the current task ID.
-
-    TODO(rkn): It may be possible to cause problems by closing over multiple
-    actor handles in a remote function, which then get unpickled and give rise
-    to the same actor handle IDs.
-
-    Args:
-        actor_handle_id: The original actor handle ID.
-        current_task_id: The ID of the task that is unpickling the handle.
-
-    Returns:
-        An ID for the new actor handle.
-    """
-    assert isinstance(actor_handle_id, ActorHandleID)
-    assert isinstance(current_task_id, TaskID)
-    handle_id_hash = hashlib.sha1()
-    handle_id_hash.update(actor_handle_id.binary())
-    handle_id_hash.update(current_task_id.binary())
-    handle_id = handle_id_hash.digest()
-    return ActorHandleID(handle_id)
-
-
 def method(*args, **kwargs):
    """Annotate an actor method.

@@ -359,14 +307,6 @@ class ActorClass(object):
            raise Exception("Actors cannot be created before ray.init() "
                            "has been called.")

-        actor_id = ActorID.of(worker.current_job_id, worker.current_task_id,
-                              worker.task_context.task_index + 1)
-        # The actor cursor is a dummy object representing the most recent
-        # actor method invocation. For each subsequent method invocation,
-        # the current cursor should be added as a dependency, and then
-        # updated to reflect the new invocation.
-        actor_cursor = None
-
        # Set the actor's default resources if not already set. First three
        # conditions are to check that no resources were specified in the
        # decorator. Last three conditions are to check that no resources were
@@ -386,12 +326,23 @@ class ActorClass(object):
                           if self._num_cpus is None else self._num_cpus)
            actor_method_cpu = ray_constants.DEFAULT_ACTOR_METHOD_CPU_SPECIFIED

+        function_name = "__init__"
+        function_descriptor = FunctionDescriptor(
+            self._modified_class.__module__, function_name,
+            self._modified_class.__name__)
+
        # Do not export the actor class or the actor if run in LOCAL_MODE
        # Instead, instantiate the actor locally and add it to the worker's
        # dictionary
        if worker.mode == ray.LOCAL_MODE:
+            actor_id = ActorID.of(worker.current_job_id,
+                                  worker.current_task_id,
+                                  worker.task_context.task_index + 1)
            worker.actors[actor_id] = self._modified_class(
                *copy.deepcopy(args), **copy.deepcopy(kwargs))
+            core_handle = ray._raylet.ActorHandle(
+                actor_id, ActorHandleID.nil(), worker.current_job_id,
+                function_descriptor.get_function_descriptor_list())
        else:
            # Export the actor.
            if (self._last_export_session_and_job !=
@@ -418,32 +369,25 @@ class ActorClass(object):
                actor_placement_resources = resources.copy()
                actor_placement_resources["CPU"] += 1

-            function_name = "__init__"
            function_signature = self._method_signatures[function_name]
            creation_args = signature.extend_args(function_signature, args,
                                                  kwargs)
-            function_descriptor = FunctionDescriptor(
-                self._modified_class.__module__, function_name,
-                self._modified_class.__name__)
-            [actor_cursor] = worker.submit_task(
-                function_descriptor,
-                creation_args,
-                actor_creation_id=actor_id,
-                max_actor_reconstructions=self._max_reconstructions,
-                num_return_vals=1,
-                resources=resources,
-                placement_resources=actor_placement_resources)
-            assert isinstance(actor_cursor, ObjectID)
+            core_handle = worker.core_worker.create_actor(
+                function_descriptor.get_function_descriptor_list(),
+                creation_args, self._max_reconstructions, resources,
+                actor_placement_resources)

        actor_handle = ActorHandle(
-            actor_id, self._modified_class.__module__, self._class_name,
-            actor_cursor, self._actor_method_names, self._method_decorators,
-            self._method_signatures, self._actor_method_num_return_vals,
-            actor_cursor, actor_method_cpu, worker.current_job_id,
-            worker.current_session_and_job)
-        # We increment the actor counter by 1 to account for the actor creation
-        # task.
-        actor_handle._ray_actor_counter += 1
+            core_handle,
+            self._modified_class.__module__,
+            self._class_name,
+            self._actor_method_names,
+            self._method_decorators,
+            self._method_signatures,
+            self._actor_method_num_return_vals,
+            actor_method_cpu,
+            worker.current_session_and_job,
+            original_handle=True)

        return actor_handle

@@ -464,23 +408,8 @@ class ActorHandle(object):
    cloudpickle).

    Attributes:
-        _ray_actor_id: The ID of the corresponding actor.
+        _ray_core_handle: Core worker actor handle for this actor.
        _ray_module_name: The module name of this actor.
-        _ray_actor_handle_id: The ID of this handle. If this is the "original"
-            handle for an actor (as opposed to one created by passing another
-            handle into a task), then this ID must be NIL_ID. If this
-            ActorHandle was created by forking an existing ActorHandle, then
-            this ID must be computed deterministically via
-            compute_actor_handle_id. If this ActorHandle was created by an
-            out-of-band mechanism (e.g., pickling), then this must be None (in
-            this case, a new actor handle ID will be generated on the fly every
-            time a method is invoked).
-        _ray_actor_cursor: The actor cursor is a dummy object representing the
-            most recent actor method invocation. For each subsequent method
-            invocation, the current cursor should be added as a dependency, and
-            then updated to reflect the new invocation.
-        _ray_actor_counter: The number of actor method invocations that we've
-            called so far.
        _ray_actor_method_names: The names of the actor methods.
        _ray_method_decorators: Optional decorators for the function
            invocation. This can be used to change the behavior on the
@@ -490,63 +419,33 @@ class ActorHandle(object):
        _ray_method_num_return_vals: The default number of return values for
            each method.
        _ray_class_name: The name of the actor class.
-        _ray_actor_forks: The number of times this handle has been forked.
-        _ray_actor_creation_dummy_object_id: The dummy object ID from the actor
-            creation task.
        _ray_actor_method_cpus: The number of CPUs required by actor methods.
        _ray_original_handle: True if this is the original actor handle for a
            given actor. If this is true, then the actor will be destroyed when
            this handle goes out of scope.
-        _ray_actor_job_id: The ID of the job that created the actor
-            (it is possible that this ActorHandle exists on a job with a
-            different job ID).
-        _ray_new_actor_handles: The new actor handles that were created from
-            this handle since the last task on this handle was submitted. This
-            is used to garbage-collect dummy objects that are no longer
-            necessary in the backend.
    """

    def __init__(self,
-                 actor_id,
+                 core_handle,
                 module_name,
                 class_name,
-                 actor_cursor,
                 actor_method_names,
                 method_decorators,
                 method_signatures,
                 method_num_return_vals,
-                 actor_creation_dummy_object_id,
                 actor_method_cpus,
-                 actor_job_id,
                 session_and_job,
-                 actor_handle_id=None):
-        assert isinstance(actor_id, ActorID)
-        assert isinstance(actor_job_id, ray.JobID)
-        self._ray_actor_id = actor_id
+                 original_handle=False):
+        self._ray_core_handle = core_handle
        self._ray_module_name = module_name
-        # False if this actor handle was created by forking or pickling. True
-        # if it was created by the _serialization_helper function.
-        self._ray_original_handle = actor_handle_id is None
-        if self._ray_original_handle:
-            self._ray_actor_handle_id = ActorHandleID.nil()
-        else:
-            assert isinstance(actor_handle_id, ActorHandleID)
-            self._ray_actor_handle_id = actor_handle_id
-        self._ray_actor_cursor = actor_cursor
-        self._ray_actor_counter = 0
+        self._ray_original_handle = original_handle
        self._ray_actor_method_names = actor_method_names
        self._ray_method_decorators = method_decorators
        self._ray_method_signatures = method_signatures
        self._ray_method_num_return_vals = method_num_return_vals
        self._ray_class_name = class_name
-        self._ray_actor_forks = 0
-        self._ray_actor_creation_dummy_object_id = (
-            actor_creation_dummy_object_id)
        self._ray_actor_method_cpus = actor_method_cpus
-        self._ray_actor_job_id = actor_job_id
        self._ray_session_and_job = session_and_job
-        self._ray_new_actor_handles = []
-        self._ray_actor_lock = threading.Lock()

    def _actor_method_call(self,
                           method_name,
@@ -584,38 +483,16 @@ class ActorHandle(object):
        function_descriptor = FunctionDescriptor(
            self._ray_module_name, method_name, self._ray_class_name)

-        if worker.mode == ray.LOCAL_MODE:
-            function = getattr(worker.actors[self._ray_actor_id], method_name)
-            object_ids = worker.local_mode_manager.execute(
-                function, function_descriptor, args, num_return_vals)
-        else:
-            with self._ray_actor_lock:
-                object_ids = worker.submit_task(
-                    function_descriptor,
-                    args,
-                    actor_id=self._ray_actor_id,
-                    actor_handle_id=self._ray_actor_handle_id,
-                    actor_counter=self._ray_actor_counter,
-                    actor_creation_dummy_object_id=(
-                        self._ray_actor_creation_dummy_object_id),
-                    previous_actor_task_dummy_object_id=self._ray_actor_cursor,
-                    new_actor_handles=self._ray_new_actor_handles,
-                    # We add one for the dummy return ID.
-                    num_return_vals=num_return_vals + 1,
-                    resources={"CPU": self._ray_actor_method_cpus},
-                    placement_resources={},
-                    job_id=self._ray_actor_job_id,
-                )
-                # Update the actor counter and cursor to reflect the most
-                # recent invocation.
-                self._ray_actor_counter += 1
-                # The last object returned is the dummy object that should be
-                # passed in to the next actor method. Do not return it to the
-                # user.
-                self._ray_actor_cursor = object_ids.pop()
-                # We have notified the backend of the new actor handles to
-                # expect since the last task was submitted, so clear the list.
-                self._ray_new_actor_handles = []
+        with profiling.profile("submit_task"):
+            if worker.mode == ray.LOCAL_MODE:
+                function = getattr(worker.actors[self._actor_id], method_name)
+                object_ids = worker.local_mode_manager.execute(
+                    function, function_descriptor, args, num_return_vals)
+            else:
+                object_ids = worker.core_worker.submit_actor_task(
+                    self._ray_core_handle,
+                    function_descriptor.get_function_descriptor_list(), args,
+                    num_return_vals, {"CPU": self._ray_actor_method_cpus})

        if len(object_ids) == 1:
            object_ids = object_ids[0]
@@ -654,7 +531,7 @@ class ActorHandle(object):

    def __repr__(self):
        return "Actor({}, {})".format(self._ray_class_name,
-                                      self._ray_actor_id.hex())
+                                      self._actor_id.hex())

    def __del__(self):
        """Kill the worker that is running this actor."""
@@ -674,8 +551,8 @@ class ActorHandle(object):
            # and we don't need to send `__ray_terminate__` again.
            logger.warning(
                "Actor is garbage collected in the wrong driver." +
-                " Actor id = %s, class name = %s.", self._ray_actor_id,
-                self._ray_class_name)
+                " Actor id = %s, class name = %s.",
+                self._ray_core_handle.actor_id(), self._ray_class_name)
            return
        if worker.connected and self._ray_original_handle:
            # TODO(rkn): Should we be passing in the actor cursor as a
@@ -684,11 +561,11 @@ class ActorHandle(object):

    @property
    def _actor_id(self):
-        return self._ray_actor_id
+        return self._ray_core_handle.actor_id()

    @property
    def _actor_handle_id(self):
-        return self._ray_actor_handle_id
+        return self._ray_core_handle.actor_handle_id()

    def _serialization_helper(self, ray_forking):
        """This is defined in order to make pickling work.
@@ -700,48 +577,17 @@ class ActorHandle(object):
        Returns:
            A dictionary of the information needed to reconstruct the object.
        """
-        if ray_forking:
-            actor_handle_id = compute_actor_handle_id(
-                self._ray_actor_handle_id, self._ray_actor_forks)
-        else:
-            actor_handle_id = self._ray_actor_handle_id
-
-        # Note: _ray_actor_cursor and _ray_actor_creation_dummy_object_id
-        # could be None.
        state = {
-            "actor_id": self._ray_actor_id,
-            "actor_handle_id": actor_handle_id,
+            "core_handle": self._ray_core_handle.fork(ray_forking).to_bytes(),
            "module_name": self._ray_module_name,
            "class_name": self._ray_class_name,
-            "actor_cursor": self._ray_actor_cursor,
            "actor_method_names": self._ray_actor_method_names,
            "method_decorators": self._ray_method_decorators,
            "method_signatures": self._ray_method_signatures,
            "method_num_return_vals": self._ray_method_num_return_vals,
-            # Actors in local mode don't have dummy objects.
-            "actor_creation_dummy_object_id": self.
-            _ray_actor_creation_dummy_object_id,
-            "actor_method_cpus": self._ray_actor_method_cpus,
-            "actor_job_id": self._ray_actor_job_id,
-            "ray_forking": ray_forking
+            "actor_method_cpus": self._ray_actor_method_cpus
        }

-        if ray_forking:
-            self._ray_actor_forks += 1
-            new_actor_handle_id = actor_handle_id
-        else:
-            # The execution dependency for a pickled actor handle is never safe
-            # to release, since it could be unpickled and submit another
-            # dependent task at any time. Therefore, we notify the backend of a
-            # random handle ID that will never actually be used.
-            new_actor_handle_id = ActorHandleID.from_random()
-        # Notify the backend to expect this new actor handle. The backend will
-        # not release the cursor for any new handles until the first task for
-        # each of the new handles is submitted.
-        # NOTE(swang): There is currently no garbage collection for actor
-        # handles until the actor itself is removed.
-        self._ray_new_actor_handles.append(new_actor_handle_id)
-
        return state

    def _deserialization_helper(self, state, ray_forking):
@@ -755,39 +601,19 @@ class ActorHandle(object):
        worker = ray.worker.get_global_worker()
        worker.check_connected()

-        if state["ray_forking"]:
-            actor_handle_id = state["actor_handle_id"]
-        else:
-            # Right now, if the actor handle has been pickled, we create a
-            # temporary actor handle id for invocations.
-            # TODO(pcm): This still leads to a lot of actor handles being
-            # created, there should be a better way to handle pickled
-            # actor handles.
+        self.__init__(
            # TODO(swang): Accessing the worker's current task ID is not
            # thread-safe.
-            # TODO(swang): Unpickling the same actor handle twice in the same
-            # task will break the application, and unpickling it twice in the
-            # same actor is likely a performance bug. We should consider
-            # logging a warning in these cases.
-            actor_handle_id = compute_actor_handle_id_non_forked(
-                state["actor_handle_id"], worker.current_task_id)
-
-        self.__init__(
-            state["actor_id"],
+            ray._raylet.ActorHandle.from_bytes(state["core_handle"],
+                                               worker.current_task_id),
            state["module_name"],
            state["class_name"],
-            state["actor_cursor"],
            state["actor_method_names"],
            state["method_decorators"],
            state["method_signatures"],
            state["method_num_return_vals"],
-            state["actor_creation_dummy_object_id"],
            state["actor_method_cpus"],
-            # This is the ID of the job that owns the actor, not
-            # necessarily the job that owns this actor handle.
-            state["actor_job_id"],
-            worker.current_session_and_job,
-            actor_handle_id=actor_handle_id)
+            worker.current_session_and_job)

    def __getstate__(self):
        """This code path is used by pickling but not by Ray forking."""