[core worker] Python core worker task execution (#5783)

Executes tasks via the event loop in the C++ core worker. Also handles signals properly (including KeyboardInterrupt), so Ctrl-C now works in a Python interactive shell when connected to an existing cluster.
2026-06-27 22:53:20 +08:00 · 2019-10-22 20:15:59 -07:00
parent 95241f6686
commit 02931e08f3
38 changed files with 830 additions and 678 deletions
@@ -3,11 +3,21 @@
 # cython: embedsignature = True
 # cython: language_level = 3

+from cpython.exc cimport PyErr_CheckSignals
+
 import numpy
 import time
 import logging
+import os
+import sys

-from libc.stdint cimport uint8_t, int32_t, int64_t, uint64_t
+from libc.stdint cimport (
+    int32_t,
+    int64_t,
+    INT64_MAX,
+    uint64_t,
+    uint8_t,
+)
 from libcpp cimport bool as c_bool
 from libcpp.memory cimport (
    dynamic_pointer_cast,
@@ -28,6 +38,7 @@ from ray.includes.common cimport (
    CRayStatus,
    CGcsClientOptions,
    CTaskArg,
+    CTaskType,
    CRayFunction,
    LocalMemoryBuffer,
    move,
@@ -35,6 +46,9 @@ from ray.includes.common cimport (
    LANGUAGE_JAVA,
    LANGUAGE_PYTHON,
    LocalMemoryBuffer,
+    TASK_TYPE_NORMAL_TASK,
+    TASK_TYPE_ACTOR_CREATION_TASK,
+    TASK_TYPE_ACTOR_TASK,
    WORKER_TYPE_WORKER,
    WORKER_TYPE_DRIVER,
 )
@@ -42,10 +56,10 @@ from ray.includes.libraylet cimport (
    CRayletClient,
    GCSProfileEvent,
    GCSProfileTableData,
-    ResourceMappingType,
    WaitResultPair,
 )
 from ray.includes.unique_ids cimport (
+    CActorID,
    CActorCheckpointID,
    CObjectID,
    CClientID,
@@ -54,12 +68,22 @@ from ray.includes.libcoreworker cimport (
    CActorCreationOptions,
    CCoreWorker,
    CTaskOptions,
+    ResourceMappingType,
 )
 from ray.includes.task cimport CTaskSpec
 from ray.includes.ray_config cimport RayConfig
+
 import ray
+import ray.experimental.signal as ray_signal
+import ray.ray_constants as ray_constants
 from ray import profiling
-from ray.exceptions import RayletError, ObjectStoreFullError
+from ray.exceptions import (
+    RayError,
+    RayletError,
+    RayTaskError,
+    ObjectStoreFullError
+)
+from ray.function_manager import FunctionDescriptor
 from ray.utils import decode
 from ray.ray_constants import (
    DEFAULT_PUT_OBJECT_DELAY,
@@ -105,9 +129,30 @@ cdef int check_status(const CRayStatus& status) nogil except -1:

    if status.IsObjectStoreFull():
        raise ObjectStoreFullError(message)
+    elif status.IsInterrupted():
+        raise KeyboardInterrupt()
    else:
        raise RayletError(message)

+cdef RayObjectsToDataMetadataPairs(
+        const c_vector[shared_ptr[CRayObject]] objects):
+    """Convert C++ RayObject pointers into a list of (data, metadata) tuples.
+
+    The result has one tuple per entry of `objects`, in the same order.
+    A null pointer yields `(None, None)`. Otherwise `data` is a `Buffer`
+    wrapping the object's data (or None when `HasData()` is false) and
+    `metadata` is the metadata converted to Python bytes (or None when
+    `HasMetadata()` is false).
+
+    NOTE(review): `objects` is declared without `&`, so the vector looks
+    like it is passed by value -- confirm whether a const reference was
+    intended.
+    """
+    data_metadata_pairs = []
+    for i in range(objects.size()):
+        # core_worker will return a nullptr for objects that couldn't be
+        # retrieved from the store or if an object was an exception.
+        if not objects[i].get():
+            data_metadata_pairs.append((None, None))
+        else:
+            data = None
+            metadata = None
+            if objects[i].get().HasData():
+                data = Buffer.make(objects[i].get().GetData())
+            if objects[i].get().HasMetadata():
+                # Metadata is copied out to bytes; data stays a Buffer.
+                metadata = Buffer.make(
+                    objects[i].get().GetMetadata()).to_pybytes()
+            data_metadata_pairs.append((data, metadata))
+    return data_metadata_pairs
+

 cdef VectorToObjectIDs(const c_vector[CObjectID] &object_ids):
    result = []
@@ -327,17 +372,6 @@ cdef class RayletClient:
        # initialized before the raylet client.
        self.client = &core_worker.core_worker.get().GetRayletClient()

-    def get_task(self):
-        cdef:
-            unique_ptr[CTaskSpec] task_spec
-
-        with nogil:
-            check_status(self.client.GetTask(&task_spec))
-        return TaskSpec.make(task_spec)
-
-    def task_done(self):
-        check_status(self.client.TaskDone())
-
    def fetch_or_reconstruct(self, object_ids,
                             c_bool fetch_only,
                             TaskID current_task_id=TaskID.nil()):
@@ -345,27 +379,6 @@ cdef class RayletClient:
        check_status(self.client.FetchOrReconstruct(
            fetch_ids, fetch_only, current_task_id.native()))

-    def resource_ids(self):
-        cdef:
-            ResourceMappingType resource_mapping = (
-                self.client.GetResourceIDs())
-            unordered_map[
-                c_string, c_vector[pair[int64_t, double]]
-            ].iterator iterator = resource_mapping.begin()
-            c_vector[pair[int64_t, double]] c_value
-
-        resources_dict = {}
-        while iterator != resource_mapping.end():
-            key = decode(dereference(iterator).first)
-            c_value = dereference(iterator).second
-            ids_and_fractions = []
-            for i in range(c_value.size()):
-                ids_and_fractions.append(
-                    (c_value[i].first, c_value[i].second))
-            resources_dict[key] = ids_and_fractions
-            postincrement(iterator)
-        return resources_dict
-
    def push_error(self, JobID job_id, error_type, error_message,
                   double timestamp):
        check_status(self.client.PushError(job_id.native(),
@@ -403,6 +416,272 @@ cdef class RayletClient:
    def is_worker(self):
        return self.client.IsWorker()

+cdef deserialize_args(
+        const c_vector[shared_ptr[CRayObject]] &c_args,
+        const c_vector[CObjectID] &arg_reference_ids):
+    """Turn the raw task arguments into Python call arguments.
+
+    `c_args` and `arg_reference_ids` are indexed in lockstep. An argument
+    whose reference ID is nil is treated as passed by value and is decoded
+    here; any other argument is treated as passed by reference and is
+    deserialized in one batch through
+    `ray.worker.global_worker.deserialize_objects`.
+
+    Returns whatever `ray.signature.recover_args` returns for the flat
+    argument list (the caller in this file unpacks it as `args, kwargs`).
+
+    Raises the first argument that is itself a `RayError` instance.
+
+    NOTE(review): `pickle` and `RAW_BUFFER_METADATA` are not imported in
+    the hunks visible here -- confirm they are in scope at module level.
+    """
+    cdef:
+        c_vector[shared_ptr[CRayObject]] by_reference_objects
+
+    args = []
+    by_reference_ids = []
+    by_reference_indices = []
+    for i in range(c_args.size()):
+        # Passed by value.
+        if arg_reference_ids[i].IsNil():
+            data = Buffer.make(c_args[i].get().GetData())
+            # Raw-buffer arguments are handed over as the Buffer itself;
+            # everything else is unpickled from the data bytes.
+            if (c_args[i].get().HasMetadata()
+                and Buffer.make(
+                    c_args[i].get().GetMetadata()).to_pybytes()
+                    == RAW_BUFFER_METADATA):
+                args.append(data)
+            else:
+                # NOTE(review): pickle.loads executes arbitrary code from
+                # the payload; this assumes task arguments are trusted.
+                args.append(pickle.loads(data.to_pybytes()))
+        # Passed by reference.
+        else:
+            by_reference_ids.append(
+                ObjectID(arg_reference_ids[i].Binary()))
+            by_reference_indices.append(i)
+            by_reference_objects.push_back(c_args[i])
+            # Placeholder; overwritten below once the batch is deserialized.
+            args.append(None)
+
+    data_metadata_pairs = RayObjectsToDataMetadataPairs(
+        by_reference_objects)
+    # Write each deserialized by-reference value back into its original
+    # argument position.
+    for i, arg in enumerate(
+        ray.worker.global_worker.deserialize_objects(
+            data_metadata_pairs, by_reference_ids)):
+        args[by_reference_indices[i]] = arg
+
+    # An argument that deserialized to an error is raised instead of being
+    # passed to the function.
+    for arg in args:
+        if isinstance(arg, RayError):
+            raise arg
+
+    return ray.signature.recover_args(args)
+
+cdef _check_worker_state(worker, CTaskType task_type, JobID job_id):
+    """Assert the worker is idle and record or verify the job for this task.
+
+    The worker must have a nil current task ID and freshly reset task and
+    put indices. For normal and actor-creation tasks the worker's
+    `current_job_id` must be nil and is then set to `job_id`; for any other
+    task type it must already equal `job_id`.
+    """
+    assert worker.current_task_id.is_nil()
+    context = worker.task_context
+    assert context.task_index == 0
+    assert context.put_index == 1
+
+    starts_new_job = <int>task_type in (<int>TASK_TYPE_NORMAL_TASK,
+                                        <int>TASK_TYPE_ACTOR_CREATION_TASK)
+    if not starts_new_job:
+        # The worker is an actor, so current_job_id was not reset after the
+        # previous task; it has to match the job of the task being run.
+        assert worker.current_job_id == job_id
+        return
+
+    # Not an actor method call: the previous task must have reset
+    # `current_job_id` when it finished.
+    assert worker.current_job_id.is_nil()
+    # Remember which job this task belongs to so that, if the task throws,
+    # the error message is propagated to the correct driver.
+    worker.current_job_id = job_id
+
+
+cdef _store_task_outputs(worker, return_ids, outputs):
+    """Store each task output under its corresponding return ID.
+
+    `outputs[i]` is stored under `return_ids[i]` via `worker.put_object`.
+    An output that is `ray.experimental.no_return.NoReturn` is not stored;
+    instead the object must already exist according to
+    `worker.core_worker.object_exists`.
+
+    Raises:
+        Exception: if an output is an actor handle.
+        RuntimeError: if an output is `NoReturn` but the object for its
+            return ID does not exist.
+    """
+    # Index explicitly rather than zip(): an `outputs` sequence shorter than
+    # `return_ids` must raise instead of silently leaving return objects
+    # unset.
+    for i in range(len(return_ids)):
+        return_id, output = return_ids[i], outputs[i]
+        if isinstance(output, ray.actor.ActorHandle):
+            raise Exception("Returning an actor handle from a remote "
+                            "function is not allowed.")
+        if output is ray.experimental.no_return.NoReturn:
+            if not worker.core_worker.object_exists(return_id):
+                raise RuntimeError(
+                    "Attempting to return 'ray.experimental.NoReturn' "
+                    "from a remote function, but the corresponding "
+                    "ObjectID does not exist in the local object store.")
+        else:
+            worker.put_object(return_id, output)
+
+
+cdef execute_task(
+        CTaskType task_type,
+        const CRayFunction &ray_function,
+        const CJobID &c_job_id,
+        const CActorID &c_actor_id,
+        const unordered_map[c_string, double] &c_resources,
+        const c_vector[shared_ptr[CRayObject]] &c_args,
+        const c_vector[CObjectID] &c_arg_reference_ids,
+        const c_vector[CObjectID] &c_return_ids,
+        c_vector[shared_ptr[CRayObject]] *returns):
+    """Run one task on `ray.worker.global_worker` and store its outputs.
+
+    In order, this function:
+      1. checks the worker state and records the current task ID;
+      2. for actor-creation tasks, loads the actor class and creates the
+         actor instance with `__new__`;
+      3. looks up the execution info for the function descriptor;
+      4. deserializes the arguments, calls the function and stores the
+         outputs under `c_return_ids`;
+      5. on an `Exception` in step 4, stores a `RayTaskError` under every
+         return ID, pushes it to the driver and sends an `ErrorSignal`;
+      6. resets the per-task worker state and increases the task counter;
+      7. disconnects and calls `sys.exit(0)` when the task counter equals
+         `execution_info.max_calls`.
+
+    `returns` is not referenced anywhere in this body.
+
+    NOTE(review): `traceback` is used in the error path but its import is
+    not visible in this diff -- confirm it is imported at module level.
+    """
+
+    worker = ray.worker.global_worker
+
+    actor_id = ActorID(c_actor_id.Binary())
+    job_id = JobID(c_job_id.Binary())
+    task_id = worker.core_worker.get_current_task_id()
+
+    # Check that the worker is in the expected state to execute the task.
+    _check_worker_state(worker, task_type, job_id)
+    worker.task_context.current_task_id = task_id
+
+    # Automatically restrict the GPUs available to this task.
+    ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
+
+    function_descriptor = FunctionDescriptor.from_bytes_list(
+        ray_function.GetFunctionDescriptor())
+
+    if <int>task_type == <int>TASK_TYPE_ACTOR_CREATION_TASK:
+        worker.actor_id = actor_id
+        actor_class = worker.function_actor_manager.load_actor_class(
+            job_id, function_descriptor)
+        # Only allocate the instance here; `__init__` is not called by
+        # `__new__`.
+        worker.actors[actor_id] = actor_class.__new__(actor_class)
+        worker.actor_checkpoint_info[actor_id] = (
+            ray.worker.ActorCheckpointInfo(
+                num_tasks_since_last_checkpoint=0,
+                last_checkpoint_timestamp=int(1000 * time.time()),
+                checkpoint_ids=[]))
+
+    execution_info = worker.function_actor_manager.get_execution_info(
+        job_id, function_descriptor)
+    function_name = execution_info.function_name
+    extra_data = {"name": function_name, "task_id": task_id.hex()}
+
+    # `title` is the process title while the task runs and `next_title` is
+    # the second argument given to `_changeproctitle` below.
+    if <int>task_type == <int>TASK_TYPE_NORMAL_TASK:
+        title = "ray_worker:{}()".format(function_name)
+        next_title = "ray_worker"
+        function_executor = execution_info.function
+    else:
+        actor = worker.actors[actor_id]
+        class_name = actor.__class__.__name__
+        title = "ray_{}:{}()".format(class_name, function_name)
+        next_title = "ray_{}".format(class_name)
+        worker_name = "ray_{}_{}".format(class_name, os.getpid())
+        # Apply the memory limits requested in the task's resources, if any.
+        if c_resources.find(b"memory") != c_resources.end():
+            worker.memory_monitor.set_heap_limit(
+                worker_name,
+                ray_constants.from_memory_units(
+                    dereference(c_resources.find(b"memory")).second))
+        if c_resources.find(b"object_store_memory") != c_resources.end():
+            worker._set_object_store_client_options(
+                worker_name,
+                int(ray_constants.from_memory_units(
+                        dereference(
+                            c_resources.find(b"object_store_memory")).second)))
+
+        # Actor methods get the actor instance as their first argument.
+        def function_executor(*arguments, **kwarguments):
+            return execution_info.function(actor, *arguments, **kwarguments)
+
+    return_ids = VectorToObjectIDs(c_return_ids)
+    with profiling.profile("task", extra_data=extra_data):
+        try:
+            # `task_exception` is True only while the user function itself
+            # is running; it is passed to `format_error_message` below.
+            task_exception = False
+            if not (<int>task_type == <int>TASK_TYPE_ACTOR_TASK
+                    and function_name == "__ray_terminate__"):
+                worker.reraise_actor_init_error()
+                worker.memory_monitor.raise_if_low_memory()
+
+            with profiling.profile("task:deserialize_arguments"):
+                args, kwargs = deserialize_args(c_args, c_arg_reference_ids)
+
+            # Execute the task.
+            with ray.worker._changeproctitle(title, next_title):
+                with profiling.profile("task:execute"):
+                    task_exception = True
+                    outputs = function_executor(*args, **kwargs)
+                    task_exception = False
+                    # Wrap a single return value so it can be indexed like
+                    # the multi-return case when the outputs are stored.
+                    if len(return_ids) == 1:
+                        outputs = (outputs,)
+
+            # Store the outputs in the object store.
+            with profiling.profile("task:store_outputs"):
+                _store_task_outputs(worker, return_ids, outputs)
+        except Exception as error:
+            if (<int>task_type == <int>TASK_TYPE_ACTOR_CREATION_TASK):
+                worker.mark_actor_init_failed(error)
+
+            backtrace = ray.utils.format_error_message(
+                traceback.format_exc(), task_exception=task_exception)
+            if isinstance(error, RayTaskError):
+                # Avoid recursive nesting of RayTaskError.
+                failure_object = RayTaskError(function_name, backtrace,
+                                              error.cause_cls)
+            else:
+                failure_object = RayTaskError(function_name, backtrace,
+                                              error.__class__)
+            # Every return ID receives the same failure object.
+            _store_task_outputs(
+                worker, return_ids, [failure_object] * len(return_ids))
+            ray.utils.push_error_to_driver(
+                worker,
+                ray_constants.TASK_PUSH_ERROR,
+                str(failure_object),
+                job_id=worker.current_job_id)
+
+            # Send signal with the error.
+            ray_signal.send(ray_signal.ErrorSignal(str(failure_object)))
+
+    # Reset the state fields so the next task can run.
+    worker.task_context.current_task_id = TaskID.nil()
+    worker.core_worker.set_current_task_id(TaskID.nil())
+    worker.task_context.task_index = 0
+    worker.task_context.put_index = 1
+
+    # Don't need to reset `current_job_id` if the worker is an
+    # actor. Because the following tasks should all have the
+    # same driver id.
+    if <int>task_type == <int>TASK_TYPE_NORMAL_TASK:
+        worker.current_job_id = JobID.nil()
+        worker.core_worker.set_current_job_id(JobID.nil())
+
+        # Reset signal counters so that the next task can get
+        # all past signals.
+        ray_signal.reset()
+
+    # Reset the state of the worker for the next task to execute.
+    # Increase the task execution counter.
+    worker.function_actor_manager.increase_task_counter(
+        job_id, function_descriptor)
+
+    # If we've reached the max number of executions for this worker, exit.
+    reached_max_executions = (
+        worker.function_actor_manager.get_task_counter(
+            job_id, function_descriptor) == execution_info.max_calls)
+    if reached_max_executions:
+        worker.core_worker.disconnect()
+        sys.exit(0)
+
+cdef CRayStatus task_execution_handler(
+        CTaskType task_type,
+        const CRayFunction &ray_function,
+        const CJobID &c_job_id,
+        const CActorID &c_actor_id,
+        const unordered_map[c_string, double] &c_resources,
+        const c_vector[shared_ptr[CRayObject]] &c_args,
+        const c_vector[CObjectID] &c_arg_reference_ids,
+        const c_vector[CObjectID] &c_return_ids,
+        c_vector[shared_ptr[CRayObject]] *returns) nogil:
+    # Task callback handed to the CCoreWorker constructor. It acquires the
+    # GIL, forwards every argument unchanged to `execute_task`, and returns
+    # CRayStatus.OK() when that call completes.
+    #
+    # If `execute_task` raises an Exception, the traceback is pushed to the
+    # driver as a "worker_crash" error and `sys.exit(1)` is called.
+    #
+    # NOTE(review): SystemExit (raised by `sys.exit` here and inside
+    # `execute_task`) is not an Exception subclass, and this function
+    # returns a C++ value without an `except` clause -- confirm how Cython
+    # propagates it to the caller.
+
+    with gil:
+        try:
+            # The call to execute_task should never raise an exception. If it
+            # does, that indicates that there was an unexpected internal error.
+            execute_task(task_type, ray_function, c_job_id,
+                         c_actor_id, c_resources, c_args,
+                         c_arg_reference_ids, c_return_ids, returns)
+        except Exception:
+            # Adjacent literals are concatenated, so the first one needs a
+            # trailing space to keep "was executing" as two words.
+            traceback_str = traceback.format_exc() + (
+                "An unexpected internal error occurred while the worker was "
+                "executing a task.")
+            ray.utils.push_error_to_driver(
+                ray.worker.global_worker,
+                "worker_crash",
+                traceback_str,
+                job_id=None)
+            # TODO(rkn): Note that if the worker was in the middle of executing
+            # a task, then any worker or driver that is blocking in a get call
+            # and waiting for the output of that task will hang. We need to
+            # address this.
+            sys.exit(1)
+
+    return CRayStatus.OK()
+
+cdef CRayStatus check_signals() nogil:
+    # Signal-check callback handed to the CCoreWorker constructor. It
+    # acquires the GIL and calls PyErr_CheckSignals(); a resulting
+    # KeyboardInterrupt is translated into CRayStatus.Interrupted with an
+    # empty message, and anything else falls through to CRayStatus.OK().
+    with gil:
+        try:
+            PyErr_CheckSignals()
+        except KeyboardInterrupt:
+            return CRayStatus.Interrupted(b"")
+    return CRayStatus.OK()

 cdef class CoreWorker:
    cdef unique_ptr[CCoreWorker] core_worker
@@ -419,12 +698,20 @@ cdef class CoreWorker:
            LANGUAGE_PYTHON, store_socket.encode("ascii"),
            raylet_socket.encode("ascii"), job_id.native(),
            gcs_options.native()[0], log_dir.encode("utf-8"),
-            node_ip_address.encode("utf-8"), NULL, False))
+            node_ip_address.encode("utf-8"), task_execution_handler,
+            check_signals, False))

    def disconnect(self):
        with nogil:
            self.core_worker.get().Disconnect()

+    def run_task_loop(self):
+        """Call `Execution().Run()` on the core worker with the GIL released.
+
+        NOTE(review): presumably this blocks while the C++ side dispatches
+        tasks to `task_execution_handler` -- confirm against the C++ code.
+        """
+        with nogil:
+            self.core_worker.get().Execution().Run()
+
+    def get_current_task_id(self):
+        """Return the core worker's current task ID as a Python `TaskID`."""
+        return TaskID(self.core_worker.get().GetCurrentTaskId().Binary())
+
    def set_current_task_id(self, TaskID task_id):
        cdef:
            CTaskID c_task_id = task_id.native()
@@ -432,15 +719,8 @@ cdef class CoreWorker:
        with nogil:
            self.core_worker.get().SetCurrentTaskId(c_task_id)

-    def set_actor_id(self, ActorID actor_id):
-        cdef:
-            CActorID c_actor_id = actor_id.native()
-
-        with nogil:
-            self.core_worker.get().SetActorId(c_actor_id)
-
-    def get_current_task_id(self):
-        return TaskID(self.core_worker.get().GetCurrentTaskId().Binary())
+    def get_current_job_id(self):
+        """Return the core worker's current job ID as a Python `JobID`."""
+        return JobID(self.core_worker.get().GetCurrentJobId().Binary())

    def set_current_job_id(self, JobID job_id):
        cdef:
@@ -449,7 +729,8 @@ cdef class CoreWorker:
        with nogil:
            self.core_worker.get().SetCurrentJobId(c_job_id)

-    def get_objects(self, object_ids, TaskID current_task_id):
+    def get_objects(self, object_ids, TaskID current_task_id,
+                    int64_t timeout_ms=-1):
        cdef:
            c_vector[shared_ptr[CRayObject]] results
            CTaskID c_task_id = current_task_id.native()
@@ -457,25 +738,9 @@ cdef class CoreWorker:

        with nogil:
            check_status(self.core_worker.get().Objects().Get(
-                c_object_ids, -1, &results))
+                c_object_ids, timeout_ms, &results))

-        data_metadata_pairs = []
-        for result in results:
-            # core_worker will return a nullptr for objects that couldn't be
-            # retrieved from the store or if an object was an exception.
-            if not result.get():
-                data_metadata_pairs.append((None, None))
-            else:
-                data = None
-                metadata = None
-                if result.get().HasData():
-                    data = Buffer.make(result.get().GetData())
-                if result.get().HasMetadata():
-                    metadata = Buffer.make(
-                        result.get().GetMetadata()).to_pybytes()
-                data_metadata_pairs.append((data, metadata))
-
-        return data_metadata_pairs
+        return RayObjectsToDataMetadataPairs(results)

    def object_exists(self, ObjectID object_id):
        cdef:
@@ -570,7 +835,7 @@ cdef class CoreWorker:
        with nogil:
            check_status(self.core_worker.get().Objects().Seal(c_object_id))

-    def wait(self, object_ids, int num_returns, int64_t timeout_milliseconds,
+    def wait(self, object_ids, int num_returns, int64_t timeout_ms,
             TaskID current_task_id):
        cdef:
            WaitResultPair result
@@ -581,7 +846,7 @@ cdef class CoreWorker:
        wait_ids = ObjectIDsToVector(object_ids)
        with nogil:
            check_status(self.core_worker.get().Objects().Wait(
-                wait_ids, num_returns, timeout_milliseconds, &results))
+                wait_ids, num_returns, timeout_ms, &results))

        assert len(results) == len(object_ids)

@@ -704,6 +969,28 @@ cdef class CoreWorker:

            return VectorToObjectIDs(return_ids)

+    def resource_ids(self):
+        """Return the core worker's resource ID mapping as a Python dict.
+
+        Each key is the decoded resource name and each value is a list of
+        `(int, float)` tuples copied from the C++ mapping returned by
+        `GetResourceIDs()`. The local naming suggests each tuple is a
+        (resource id, fraction) pair -- confirm against the C++ side.
+        """
+        cdef:
+            ResourceMappingType resource_mapping = (
+                self.core_worker.get().GetResourceIDs())
+            unordered_map[
+                c_string, c_vector[pair[int64_t, double]]
+            ].iterator iterator = resource_mapping.begin()
+            c_vector[pair[int64_t, double]] c_value
+
+        resources_dict = {}
+        # Walk the C++ map by iterator, copying each entry into Python
+        # objects.
+        while iterator != resource_mapping.end():
+            key = decode(dereference(iterator).first)
+            c_value = dereference(iterator).second
+            ids_and_fractions = []
+            for i in range(c_value.size()):
+                ids_and_fractions.append(
+                    (c_value[i].first, c_value[i].second))
+            resources_dict[key] = ids_and_fractions
+            postincrement(iterator)
+
+        return resources_dict
+
    def profile_event(self, event_type, dict extra_data):
        cdef:
            c_string c_event_type = event_type.encode("ascii")
@@ -199,8 +199,8 @@ class PlasmaEventHandler:
        del self._waiting_dict[fut.object_id]

    def _complete_future(self, fut):
-        obj = self._worker.retrieve_and_deserialize(
-            [ray.ObjectID(fut.object_id.binary())], 0)[0]
+        obj = self._worker.get_objects([ray.ObjectID(
+            fut.object_id.binary())])[0]
        fut.set_result(obj)

    def as_future(self, object_id, check_ready=True):
@@ -69,11 +69,10 @@ def send(signal):
    Args:
        signal: Signal to be sent.
    """
-    if hasattr(ray.worker.global_worker, "actor_creation_task_id"):
-        source_key = ray.worker.global_worker.actor_id.hex()
-    else:
-        # No actors; this function must have been called from a task
+    if ray.worker.global_worker.actor_id.is_nil():
        source_key = ray.worker.global_worker.current_task_id.hex()
+    else:
+        source_key = ray.worker.global_worker.actor_id.hex()

    encoded_signal = ray.utils.binary_to_hex(cloudpickle.dumps(signal))
    ray.worker.global_worker.redis_client.execute_command(
@@ -763,7 +763,7 @@ class FunctionActorManager(object):
                worker's internal state to record the executed method.
        """

-        def actor_method_executor(dummy_return_id, actor, *args, **kwargs):
+        def actor_method_executor(actor, *args, **kwargs):
            # Update the actor's task counter to reflect the task we're about
            # to execute.
            self._worker.actor_task_counter += 1
@@ -47,31 +47,34 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
        CRayStatus OK()

        @staticmethod
-        CRayStatus OutOfMemory()
+        CRayStatus OutOfMemory(const c_string &msg)

        @staticmethod
-        CRayStatus KeyError()
+        CRayStatus KeyError(const c_string &msg)

        @staticmethod
-        CRayStatus Invalid()
+        CRayStatus Invalid(const c_string &msg)

        @staticmethod
-        CRayStatus IOError()
+        CRayStatus IOError(const c_string &msg)

        @staticmethod
-        CRayStatus TypeError()
+        CRayStatus TypeError(const c_string &msg)

        @staticmethod
-        CRayStatus UnknownError()
+        CRayStatus UnknownError(const c_string &msg)

        @staticmethod
-        CRayStatus NotImplemented()
+        CRayStatus NotImplemented(const c_string &msg)

        @staticmethod
-        CRayStatus RedisError()
+        CRayStatus ObjectStoreFull(const c_string &msg)

        @staticmethod
-        CRayStatus ObjectStoreFull()
+        CRayStatus RedisError(const c_string &msg)
+
+        @staticmethod
+        CRayStatus Interrupted(const c_string &msg)

        c_bool ok()
        c_bool IsOutOfMemory()
@@ -81,8 +84,9 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
        c_bool IsTypeError()
        c_bool IsUnknownError()
        c_bool IsNotImplemented()
-        c_bool IsRedisError()
        c_bool IsObjectStoreFull()
+        c_bool IsRedisError()
+        c_bool IsInterrupted()

        c_string ToString()
        c_string CodeAsString()
@@ -92,6 +96,7 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
    # We can later add more of the common status factory methods as needed
    cdef CRayStatus RayStatus_OK "Status::OK"()
    cdef CRayStatus RayStatus_Invalid "Status::Invalid"()
+    cdef CRayStatus RayStatus_NotImplemented "Status::NotImplemented"()


 cdef extern from "ray/common/status.h" namespace "ray::StatusCode" nogil:
@@ -117,6 +122,8 @@ cdef extern from "ray/protobuf/common.pb.h" nogil:
        pass
    cdef cppclass CWorkerType "ray::WorkerType":
        pass
+    cdef cppclass CTaskType "ray::TaskType":
+        pass


 # This is a workaround for C++ enum class since Cython has no corresponding
@@ -130,6 +137,11 @@ cdef extern from "ray/protobuf/common.pb.h" nogil:
    cdef CWorkerType WORKER_TYPE_WORKER "ray::WorkerType::WORKER"
    cdef CWorkerType WORKER_TYPE_DRIVER "ray::WorkerType::DRIVER"

+cdef extern from "ray/protobuf/common.pb.h" nogil:
+    cdef CTaskType TASK_TYPE_NORMAL_TASK "ray::TaskType::NORMAL_TASK"
+    cdef CTaskType TASK_TYPE_ACTOR_CREATION_TASK "ray::TaskType::ACTOR_CREATION_TASK"  # noqa: E501
+    cdef CTaskType TASK_TYPE_ACTOR_TASK "ray::TaskType::ACTOR_TASK"
+

 cdef extern from "ray/common/task/scheduling_resources.h" nogil:
    cdef cppclass ResourceSet "ray::ResourceSet":
@@ -1,7 +1,13 @@
+# cython: profile = False
+# distutils: language = c++
+# cython: embedsignature = True
+
 from libc.stdint cimport int64_t
 from libcpp cimport bool as c_bool
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string as c_string
+from libcpp.unordered_map cimport unordered_map
+from libcpp.utility cimport pair
 from libcpp.vector cimport vector as c_vector

 from ray.includes.unique_ids cimport (
@@ -18,12 +24,30 @@ from ray.includes.common cimport (
    CRayStatus,
    CTaskArg,
    CTaskOptions,
+    CTaskType,
    CWorkerType,
    CLanguage,
    CGcsClientOptions,
 )
+from ray.includes.task cimport CTaskSpec
 from ray.includes.libraylet cimport CRayletClient

+ctypedef unordered_map[c_string, c_vector[pair[int64_t, double]]] \
+    ResourceMappingType
+
+cdef extern from "ray/core_worker/task_execution.h" namespace "ray" nogil:
+    cdef cppclass CTaskExecutionInterface "CoreWorkerTaskExecutionInterface":
+        void Run()
+        void Stop()
+
+cdef extern from "ray/core_worker/profiling.h" nogil:
+    cdef cppclass CProfiler "ray::worker::Profiler":
+        void Start()
+
+    cdef cppclass CProfileEvent "ray::worker::ProfileEvent":
+        CProfileEvent(const shared_ptr[CProfiler] profiler,
+                      const c_string &event_type)
+        void SetExtraData(const c_string &extra_data)

 cdef extern from "ray/core_worker/profiling.h" nogil:
    cdef cppclass CProfileEvent "ray::worker::ProfileEvent":
@@ -54,12 +78,23 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
                    const c_string &raylet_socket, const CJobID &job_id,
                    const CGcsClientOptions &gcs_options,
                    const c_string &log_dir, const c_string &node_ip_address,
-                    void* execution_callback,
+                    CRayStatus (
+                        CTaskType task_type,
+                        const CRayFunction &ray_function,
+                        const CJobID &job_id,
+                        const CActorID &actor_id,
+                        const unordered_map[c_string, double] &resources,
+                        const c_vector[shared_ptr[CRayObject]] &args,
+                        const c_vector[CObjectID] &arg_reference_ids,
+                        const c_vector[CObjectID] &return_ids,
+                        c_vector[shared_ptr[CRayObject]] *returns) nogil,
+                    CRayStatus() nogil,
                    c_bool use_memory_store_)
        void Disconnect()
        CWorkerType &GetWorkerType()
        CLanguage &GetLanguage()
        CObjectInterface &Objects()
+        CTaskExecutionInterface &Execution()

        CRayStatus SubmitTask(
            const CRayFunction &function, const c_vector[CTaskArg] &args,
@@ -72,7 +107,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
            const c_vector[CTaskArg] &args, const CTaskOptions &options,
            c_vector[CObjectID] *return_ids)

-        # CTaskExecutionInterface &Execution()
        unique_ptr[CProfileEvent] CreateProfileEvent(
            const c_string &event_type)

@@ -81,12 +115,13 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
        CRayletClient &GetRayletClient()
        # TODO(edoakes): remove these once the Python core worker uses the task
        # interfaces
+        CJobID GetCurrentJobId()
        void SetCurrentJobId(const CJobID &job_id)
        CTaskID GetCurrentTaskId()
        void SetCurrentTaskId(const CTaskID &task_id)
-        void SetActorId(const CActorID &actor_id)
        const CActorID &GetActorId()
        CTaskID GetCallerId()
+        const ResourceMappingType &GetResourceIDs() const
        CActorID DeserializeAndRegisterActorHandle(const c_string &bytes)
        CRayStatus SerializeActorHandle(const CActorID &actor_id, c_string
                                        *bytes)
@@ -3,7 +3,6 @@ from libcpp cimport bool as c_bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string as c_string
 from libcpp.utility cimport pair
-from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector as c_vector

 from ray.includes.common cimport (
@@ -38,8 +37,6 @@ cdef extern from "ray/protobuf/gcs.pb.h" nogil:
        GCSProfileTableData()


-ctypedef unordered_map[c_string, c_vector[pair[int64_t, double]]] \
-    ResourceMappingType
 ctypedef pair[c_vector[CObjectID], c_vector[CObjectID]] WaitResultPair


@@ -78,4 +75,3 @@ cdef extern from "ray/raylet/raylet_client.h" nogil:
        CWorkerID GetWorkerID() const
        CJobID GetJobID() const
        c_bool IsWorker() const
-        const ResourceMappingType &GetResourceIDs() const
@@ -14,12 +14,6 @@ cdef class TaskSpec:
    cdef:
        unique_ptr[CTaskSpec] task_spec

-    @staticmethod
-    cdef make(unique_ptr[CTaskSpec]& task_spec):
-        cdef TaskSpec self = TaskSpec.__new__(TaskSpec)
-        self.task_spec.reset(task_spec.release())
-        return self
-
    @staticmethod
    def from_string(const c_string& task_spec_str):
        """Convert a string to a Ray task specification Python object.
@@ -82,23 +76,23 @@ cdef class TaskSpec:
    def arguments(self):
        """Return the arguments for the task."""
        cdef:
-            CTaskSpec*task_spec = self.task_spec.get()
-            int64_t num_args = task_spec.NumArgs()
-            int32_t lang = <int32_t>task_spec.GetLanguage()
+            int64_t num_args = self.task_spec.get().NumArgs()
+            int32_t lang = <int32_t>self.task_spec.get().GetLanguage()
            int count
        arg_list = []

        if lang == <int32_t>LANGUAGE_PYTHON:
            for i in range(num_args):
-                count = task_spec.ArgIdCount(i)
+                count = self.task_spec.get().ArgIdCount(i)
                if count > 0:
                    assert count == 1
                    arg_list.append(
-                        ObjectID(task_spec.ArgId(i, 0).Binary()))
+                        ObjectID(self.task_spec.get().ArgId(i, 0).Binary()))
                else:
-                    data = task_spec.ArgData(i)[:task_spec.ArgDataSize(i)]
-                    metadata = task_spec.ArgMetadata(i)[
-                        :task_spec.ArgMetadataSize(i)]
+                    data = self.task_spec.get().ArgData(i)[
+                        :self.task_spec.get().ArgDataSize(i)]
+                    metadata = self.task_spec.get().ArgMetadata(i)[
+                        :self.task_spec.get().ArgMetadataSize(i)]
                    if metadata == RAW_BUFFER_METADATA:
                        obj = data
                    else:
@@ -111,10 +105,10 @@ cdef class TaskSpec:

    def returns(self):
        """Return the object IDs for the return values of the task."""
-        cdef CTaskSpec *task_spec = self.task_spec.get()
        return_id_list = []
-        for i in range(task_spec.NumReturns()):
-            return_id_list.append(ObjectID(task_spec.ReturnId(i).Binary()))
+        for i in range(self.task_spec.get().NumReturns()):
+            return_id_list.append(
+                ObjectID(self.task_spec.get().ReturnId(i).Binary()))
        return return_id_list

    def required_resources(self):
@@ -505,6 +505,10 @@ class GlobalState(object):
            node_ip_address = profile_table_message.node_ip_address

            for profile_event_message in profile_table_message.profile_events:
+                try:
+                    extra_data = json.loads(profile_event_message.extra_data)
+                except ValueError:
+                    extra_data = {}
                profile_event = {
                    "event_type": profile_event_message.event_type,
                    "component_id": component_id,
@@ -512,7 +516,7 @@ class GlobalState(object):
                    "component_type": component_type,
                    "start_time": profile_event_message.start_time,
                    "end_time": profile_event_message.end_time,
-                    "extra_data": json.loads(profile_event_message.extra_data),
+                    "extra_data": extra_data
                }

                profile_events.append(profile_event)
@@ -106,7 +106,7 @@ class Cluster(object):

        return node

-    def remove_node(self, node, allow_graceful=False):
+    def remove_node(self, node, allow_graceful=True):
        """Kills all processes associated with worker node.

        Args:
@@ -47,4 +47,3 @@ def test_raylet_gdb(ray_gdb_start):
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        assert pgrep_command.communicate()[0]
-        subprocess.call(["pkill", "-f", "gdb.*{}".format(process_name)])
@@ -292,7 +292,7 @@ def test_incorrect_method_calls(ray_start_regular):
 def test_worker_raising_exception(ray_start_regular):
    @ray.remote
    def f():
-        ray.worker.global_worker._get_next_task_from_raylet = None
+        ray.worker.global_worker.function_actor_manager = None

    # Running this task should cause the worker to raise an exception after
    # the task has successfully completed.
@@ -618,12 +618,17 @@ def test_warning_for_too_many_nested_tasks(shutdown_only):
        time.sleep(1000)
        return 1

+    @ray.remote
+    def h():
+        time.sleep(1)
+        ray.get(f.remote())
+
    @ray.remote
    def g():
        # Sleep so that the f tasks all get submitted to the scheduler after
        # the g tasks.
        time.sleep(1)
-        ray.get(f.remote())
+        ray.get(h.remote())

    [g.remote() for _ in range(num_cpus * 4)]
    wait_for_errors(ray_constants.WORKER_POOL_LARGE_ERROR, 1)
@@ -705,8 +710,6 @@ def test_warning_for_dead_node(ray_start_cluster_2_nodes):


 def test_raylet_crash_when_get(ray_start_regular):
-    nonexistent_id = ray.ObjectID.from_random()
-
    def sleep_to_kill_raylet():
        # Don't kill raylet before default workers get connected.
        time.sleep(2)
@@ -715,14 +718,14 @@ def test_raylet_crash_when_get(ray_start_regular):
    thread = threading.Thread(target=sleep_to_kill_raylet)
    thread.start()
    with pytest.raises(ray.exceptions.UnreconstructableError):
-        ray.get(nonexistent_id)
+        ray.get(ray.ObjectID.from_random())
    thread.join()


 def test_connect_with_disconnected_node(shutdown_only):
    config = json.dumps({
        "num_heartbeats_timeout": 50,
-        "heartbeat_timeout_milliseconds": 10,
+        "raylet_heartbeat_timeout_milliseconds": 10,
    })
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=config)
@@ -52,7 +52,7 @@ def test_internal_config(ray_start_cluster_head):
    worker = cluster.add_node()
    cluster.wait_for_nodes()

-    cluster.remove_node(worker)
+    cluster.remove_node(worker, allow_graceful=False)
    time.sleep(1)
    assert ray.cluster_resources()["CPU"] == 2

@@ -1,86 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pytest
-
-import ray
-import ray.exceptions
-import ray.experimental.no_return
-import ray.worker
-
-
-def test_set_single_output(ray_start_regular):
-    @ray.remote
-    def f():
-        return_object_ids = ray.worker.global_worker._current_task.returns()
-        ray.worker.global_worker.put_object(return_object_ids[0], 123)
-        return ray.experimental.no_return.NoReturn
-
-    assert ray.get(f.remote()) == 123
-
-
-def test_set_multiple_outputs(ray_start_regular):
-    @ray.remote(num_return_vals=3)
-    def f(set_out0, set_out1, set_out2):
-        returns = []
-        return_object_ids = ray.worker.global_worker._current_task.returns()
-        for i, set_out in enumerate([set_out0, set_out1, set_out2]):
-            if set_out:
-                ray.worker.global_worker.put_object(return_object_ids[i], True)
-                returns.append(ray.experimental.no_return.NoReturn)
-            else:
-                returns.append(False)
-        return tuple(returns)
-
-    for set_out0 in [True, False]:
-        for set_out1 in [True, False]:
-            for set_out2 in [True, False]:
-                result_object_ids = f.remote(set_out0, set_out1, set_out2)
-                assert ray.get(result_object_ids) == [
-                    set_out0, set_out1, set_out2
-                ]
-
-
-def test_set_actor_method(ray_start_regular):
-    @ray.remote
-    class Actor(object):
-        def __init__(self):
-            pass
-
-        def ping(self):
-            return_object_ids = ray.worker.global_worker._current_task.returns(
-            )
-            ray.worker.global_worker.put_object(return_object_ids[0], 123)
-            return ray.experimental.no_return.NoReturn
-
-    actor = Actor.remote()
-    assert ray.get(actor.ping.remote()) == 123
-
-
-def test_exception(ray_start_regular):
-    @ray.remote(num_return_vals=2)
-    def f():
-        return_object_ids = ray.worker.global_worker._current_task.returns()
-        # The first return value is successfully stored in the object store
-        ray.worker.global_worker.put_object(return_object_ids[0], 123)
-        raise Exception("Error")
-        # The exception is stored at the second return objcet ID.
-        return ray.experimental.no_return.NoReturn, 456
-
-    object_id, exception_id = f.remote()
-
-    assert ray.get(object_id) == 123
-    with pytest.raises(ray.exceptions.RayTaskError):
-        ray.get(exception_id)
-
-
-def test_no_set_and_no_return(ray_start_regular):
-    @ray.remote
-    def f():
-        return ray.experimental.no_return.NoReturn
-
-    object_id = f.remote()
-    with pytest.raises(ray.exceptions.RayTaskError) as e:
-        ray.get(object_id)
-    assert "Attempting to return 'ray.experimental.NoReturn'" in str(e.value)
@@ -26,7 +26,6 @@ import random
 import pyarrow
 import pyarrow.plasma as plasma
 import ray.cloudpickle as pickle
-import ray.experimental.signal as ray_signal
 import ray.experimental.no_return
 import ray.gcs_utils
 import ray.memory_monitor as memory_monitor
@@ -41,7 +40,6 @@ import ray.state

 from ray import (
    ActorID,
-    WorkerID,
    JobID,
    ObjectID,
    TaskID,
@@ -60,10 +58,7 @@ from ray.exceptions import (
    UnreconstructableError,
    RAY_EXCEPTION_TYPES,
 )
-from ray.function_manager import (
-    FunctionActorManager,
-    FunctionDescriptor,
-)
+from ray.function_manager import FunctionActorManager
 from ray.utils import (
    _random_string,
    check_oversized_pickle,
@@ -156,7 +151,6 @@ class Worker(object):
        # Index of the current session. This number will
        # increment every time when `ray.shutdown` is called.
        self._session_index = 0
-        self._current_task = None
        # Functions to run to process the values returned by ray.get. Each
        # postprocessor must take two arguments ("object_ids", and "values").
        self._post_get_hooks = []
@@ -473,9 +467,10 @@ class Worker(object):
            logger.warning(warning_message)
            self.store_and_register(object_id, value)

-    def retrieve_and_deserialize(self, object_ids, error_timeout=10):
-        data_metadata_pairs = self.core_worker.get_objects(
-            object_ids, self.current_task_id)
+    def deserialize_objects(self,
+                            data_metadata_pairs,
+                            object_ids,
+                            error_timeout=10):
        assert len(data_metadata_pairs) == len(object_ids)

        start_time = time.time()
@@ -571,9 +566,9 @@ class Worker(object):
        if self.mode == LOCAL_MODE:
            return self.local_mode_manager.get_objects(object_ids)

-        results = self.retrieve_and_deserialize(object_ids)
-        assert len(results) == len(object_ids)
-        return results
+        data_metadata_pairs = self.core_worker.get_objects(
+            object_ids, self.current_task_id)
+        return self.deserialize_objects(data_metadata_pairs, object_ids)

    def run_function_on_all_workers(self, function,
                                    run_on_other_drivers=False):
@@ -679,149 +674,6 @@ class Worker(object):

        return ray.signature.recover_args(arguments)

-    def _store_outputs_in_object_store(self, object_ids, outputs):
-        """Store the outputs of a remote function in the local object store.
-
-        This stores the values that were returned by a remote function in the
-        local object store. If any of the return values are object IDs, then
-        these object IDs are aliased with the object IDs that the scheduler
-        assigned for the return values. This is called by the worker that
-        executes the remote function.
-
-        Note:
-            The arguments object_ids and outputs should have the same length.
-
-        Args:
-            object_ids (List[ObjectID]): The object IDs that were assigned to
-                the outputs of the remote function call.
-            outputs (Tuple): The value returned by the remote function. If the
-                remote function was supposed to only return one value, then its
-                output was wrapped in a tuple with one element prior to being
-                passed into this function.
-        """
-        for i in range(len(object_ids)):
-            if isinstance(outputs[i], ray.actor.ActorHandle):
-                raise Exception("Returning an actor handle from a remote "
-                                "function is not allowed).")
-            if outputs[i] is ray.experimental.no_return.NoReturn:
-                if not self.core_worker.object_exists(object_ids[i]):
-                    raise RuntimeError(
-                        "Attempting to return 'ray.experimental.NoReturn' "
-                        "from a remote function, but the corresponding "
-                        "ObjectID does not exist in the local object store.")
-            else:
-                self.put_object(object_ids[i], outputs[i])
-
-    def _process_task(self, task, function_execution_info):
-        """Execute a task assigned to this worker.
-
-        This method deserializes a task from the scheduler, and attempts to
-        execute the task. If the task succeeds, the outputs are stored in the
-        local object store. If the task throws an exception, RayTaskError
-        objects are stored in the object store to represent the failed task
-        (these will be retrieved by calls to get or by subsequent tasks that
-        use the outputs of this task).
-        """
-        assert self.current_task_id.is_nil()
-        assert self.task_context.task_index == 0
-        assert self.task_context.put_index == 1
-        if not task.is_actor_task():
-            # If this worker is not an actor, check that `current_job_id`
-            # was reset when the worker finished the previous task.
-            assert self.current_job_id.is_nil()
-            # Set the driver ID of the current running task. This is
-            # needed so that if the task throws an exception, we propagate
-            # the error message to the correct driver.
-            self.current_job_id = task.job_id()
-            self.core_worker.set_current_job_id(task.job_id())
-        else:
-            # If this worker is an actor, current_job_id wasn't reset.
-            # Check that current task's driver ID equals the previous one.
-            assert self.current_job_id == task.job_id()
-
-        self.task_context.current_task_id = task.task_id()
-        self.core_worker.set_current_task_id(task.task_id())
-
-        function_descriptor = FunctionDescriptor.from_bytes_list(
-            task.function_descriptor_list())
-        serialized_args = task.arguments()
-        return_object_ids = task.returns()
-        if task.is_actor_task() or task.is_actor_creation_task():
-            dummy_return_id = return_object_ids.pop()
-        function_executor = function_execution_info.function
-        function_name = function_execution_info.function_name
-
-        # Get task arguments from the object store.
-        try:
-            if function_name != "__ray_terminate__":
-                self.reraise_actor_init_error()
-                self.memory_monitor.raise_if_low_memory()
-            with profiling.profile("task:deserialize_arguments"):
-                function_args, function_kwargs = (
-                    self._get_arguments_for_execution(function_name,
-                                                      serialized_args))
-        except Exception as e:
-            self._handle_process_task_failure(
-                function_descriptor, return_object_ids, e,
-                ray.utils.format_error_message(traceback.format_exc()))
-            return
-
-        # Execute the task.
-        try:
-            self._current_task = task
-            with profiling.profile("task:execute"):
-                if task.is_normal_task():
-                    outputs = function_executor(*function_args,
-                                                **function_kwargs)
-                else:
-                    if task.is_actor_task():
-                        key = task.actor_id()
-                    else:
-                        key = task.actor_creation_id()
-                    worker_name = "ray_{}_{}".format(
-                        self.actors[key].__class__.__name__, os.getpid())
-                    if "memory" in task.required_resources():
-                        self.memory_monitor.set_heap_limit(
-                            worker_name,
-                            ray_constants.from_memory_units(
-                                task.required_resources()["memory"]))
-                    if "object_store_memory" in task.required_resources():
-                        self._set_object_store_client_options(
-                            worker_name,
-                            int(
-                                ray_constants.from_memory_units(
-                                    task.required_resources()[
-                                        "object_store_memory"])))
-                    outputs = function_executor(
-                        dummy_return_id, self.actors[key], *function_args,
-                        **function_kwargs)
-        except Exception as e:
-            # Determine whether the exception occured during a task, not an
-            # actor method.
-            task_exception = not task.is_actor_task()
-            traceback_str = ray.utils.format_error_message(
-                traceback.format_exc(), task_exception=task_exception)
-            self._handle_process_task_failure(
-                function_descriptor, return_object_ids, e, traceback_str)
-            return
-        finally:
-            self._current_task = None
-
-        # Store the outputs in the local object store.
-        try:
-            with profiling.profile("task:store_outputs"):
-                # If this is an actor task, then the last object ID returned by
-                # the task is a dummy output, not returned by the function
-                # itself. Decrement to get the correct number of return values.
-                num_returns = len(return_object_ids)
-                if num_returns == 1:
-                    outputs = (outputs, )
-                self._store_outputs_in_object_store(return_object_ids, outputs)
-        except Exception as e:
-            self._handle_process_task_failure(
-                function_descriptor, return_object_ids, e,
-                ray.utils.format_error_message(traceback.format_exc()))
-
    def _set_object_store_client_options(self, name, object_store_memory):
        try:
            logger.debug("Setting plasma memory limit to {} for {}".format(
@@ -838,133 +690,15 @@ class Worker(object):
                "object store memory status is:\n\n{}".format(
                    object_store_memory, name, e))

-    def _handle_process_task_failure(self, function_descriptor,
-                                     return_object_ids, error, backtrace):
-        function_name = function_descriptor.function_name
-        if isinstance(error, RayTaskError):
-            # avoid recursively nesting of RayTaskError
-            failure_object = RayTaskError(function_name, backtrace,
-                                          error.cause_cls)
-        else:
-            failure_object = RayTaskError(function_name, backtrace,
-                                          error.__class__)
-        failure_objects = [
-            failure_object for _ in range(len(return_object_ids))
-        ]
-        self._store_outputs_in_object_store(return_object_ids, failure_objects)
-        # Log the error message.
-        ray.utils.push_error_to_driver(
-            self,
-            ray_constants.TASK_PUSH_ERROR,
-            str(failure_object),
-            job_id=self.current_job_id)
-        # Mark the actor init as failed
-        if not self.actor_id.is_nil() and function_name == "__init__":
-            self.mark_actor_init_failed(error)
-        # Send signal with the error.
-        ray_signal.send(ray_signal.ErrorSignal(str(failure_object)))
-
-    def _wait_for_and_process_task(self, task):
-        """Wait for a task to be ready and process the task.
-
-        Args:
-            task: The task to execute.
-        """
-        function_descriptor = FunctionDescriptor.from_bytes_list(
-            task.function_descriptor_list())
-        job_id = task.job_id()
-
-        # TODO(rkn): It would be preferable for actor creation tasks to share
-        # more of the code path with regular task execution.
-        if task.is_actor_creation_task():
-            # TODO: Remove Worker.actor_id and just use CoreWorker.GetActorId.
-            self.actor_id = task.actor_creation_id()
-            self.core_worker.set_actor_id(task.actor_creation_id())
-            self.actor_creation_task_id = task.task_id()
-            actor_class = self.function_actor_manager.load_actor_class(
-                job_id, function_descriptor)
-            self.actors[self.actor_id] = actor_class.__new__(actor_class)
-            self.actor_checkpoint_info[self.actor_id] = ActorCheckpointInfo(
-                num_tasks_since_last_checkpoint=0,
-                last_checkpoint_timestamp=int(1000 * time.time()),
-                checkpoint_ids=[],
-            )
-
-        execution_info = self.function_actor_manager.get_execution_info(
-            job_id, function_descriptor)
-
-        # Execute the task.
-        function_name = execution_info.function_name
-        extra_data = {"name": function_name, "task_id": task.task_id().hex()}
-        if not task.is_actor_task():
-            if not task.is_actor_creation_task():
-                title = "ray_worker:{}()".format(function_name)
-                next_title = "ray_worker"
-            else:
-                actor = self.actors[task.actor_creation_id()]
-                title = "ray_{}:{}()".format(actor.__class__.__name__,
-                                             function_name)
-                next_title = "ray_{}".format(actor.__class__.__name__)
-        else:
-            actor = self.actors[task.actor_id()]
-            title = "ray_{}:{}()".format(actor.__class__.__name__,
-                                         function_name)
-            next_title = "ray_{}".format(actor.__class__.__name__)
-
-        with profiling.profile("task", extra_data=extra_data):
-            with _changeproctitle(title, next_title):
-                self._process_task(task, execution_info)
-            # Reset the state fields so the next task can run.
-            self.task_context.current_task_id = TaskID.nil()
-            self.core_worker.set_current_task_id(TaskID.nil())
-            self.task_context.task_index = 0
-            self.task_context.put_index = 1
-            if self.actor_id.is_nil():
-                # Don't need to reset `current_job_id` if the worker is an
-                # actor. Because the following tasks should all have the
-                # same driver id.
-                self.current_job_id = WorkerID.nil()
-                self.core_worker.set_current_job_id(JobID.nil())
-                # Reset signal counters so that the next task can get
-                # all past signals.
-                ray_signal.reset()
-
-        # Increase the task execution counter.
-        self.function_actor_manager.increase_task_counter(
-            job_id, function_descriptor)
-
-        reached_max_executions = (self.function_actor_manager.get_task_counter(
-            job_id, function_descriptor) == execution_info.max_calls)
-        if reached_max_executions:
-            self.core_worker.disconnect()
-            sys.exit(0)
-
-    def _get_next_task_from_raylet(self):
-        """Get the next task from the raylet.
-
-        Returns:
-            A task from the raylet.
-        """
-        with profiling.profile("worker_idle"):
-            task = self.raylet_client.get_task()
-
-        # Automatically restrict the GPUs available to this task.
-        ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
-
-        return task
-
    def main_loop(self):
        """The main loop a worker runs to receive and execute tasks."""

-        def exit(signum, frame):
-            shutdown()
-            sys.exit(0)
+        def sigterm_handler(signum, frame):
+            shutdown(True)
+            sys.exit(1)

-        signal.signal(signal.SIGTERM, exit)
-
-        while True:
-            task = self._get_next_task_from_raylet()
-            self._wait_for_and_process_task(task)
+        signal.signal(signal.SIGTERM, sigterm_handler)
+        self.core_worker.run_task_loop()


 def get_gpu_ids():
@@ -982,7 +716,7 @@ def get_gpu_ids():
        raise Exception("ray.get_gpu_ids() currently does not work in LOCAL "
                        "MODE.")

-    all_resource_ids = global_worker.raylet_client.resource_ids()
+    all_resource_ids = global_worker.core_worker.resource_ids()
    assigned_ids = [
        resource_id for resource_id, _ in all_resource_ids.get("GPU", [])
    ]
@@ -1010,7 +744,7 @@ def get_resource_ids():
            "ray.get_resource_ids() currently does not work in LOCAL "
            "MODE.")

-    return global_worker.raylet_client.resource_ids()
+    return global_worker.core_worker.resource_ids()


 def get_webui_url():
@@ -1437,7 +1171,7 @@ def shutdown(exiting_interpreter=False):
        # to make sure that log messages finish printing.
        time.sleep(0.5)

-    disconnect()
+    disconnect(exiting_interpreter)

    # Disconnect global state from GCS.
    ray.state.state.disconnect()
@@ -1456,6 +1190,13 @@ def shutdown(exiting_interpreter=False):

 atexit.register(shutdown, True)

+
+def sigterm_handler(signum, frame):
+    sys.exit(signal.SIGTERM)
+
+
+signal.signal(signal.SIGTERM, sigterm_handler)
+
 # Define a custom excepthook so that if the driver exits with an exception, we
 # can push that exception to Redis.
 normal_excepthook = sys.excepthook
@@ -1900,7 +1641,7 @@ def connect(node,
    worker.cached_functions_to_run = None


-def disconnect():
+def disconnect(exiting_interpreter=False):
    """Disconnect this worker from the raylet and object store."""
    # Reset the list of cached remote functions and actors so that if more
    # remote functions or actors are defined and then connect is called again,
@@ -1928,10 +1669,12 @@ def disconnect():
    worker.function_actor_manager.reset_cache()
    worker.serialization_context_map.clear()

-    if hasattr(worker, "raylet_client"):
-        del worker.raylet_client
-    if hasattr(worker, "core_worker"):
-        del worker.core_worker
+    if not exiting_interpreter:
+        if hasattr(worker, "raylet_client"):
+            del worker.raylet_client
+
+        if hasattr(worker, "core_worker"):
+            del worker.core_worker


@contextmanager
@@ -3,7 +3,6 @@ from __future__ import division
 from __future__ import print_function

 import argparse
-import traceback

 import ray
 import ray.actor
@@ -86,30 +85,5 @@ if __name__ == "__main__":
    node = ray.node.Node(
        ray_params, head=False, shutdown_at_exit=False, connect_only=True)
    ray.worker._global_node = node
-
    ray.worker.connect(node, mode=ray.WORKER_MODE)
-
-    error_explanation = """
-  This error is unexpected and should not have happened. Somehow a worker
-  crashed in an unanticipated way causing the main_loop to throw an exception,
-  which is being caught in "python/ray/workers/default_worker.py".
-  """
-
-    try:
-        # This call to main_loop should never return if things are working.
-        # Most exceptions that are thrown (e.g., inside the execution of a
-        # task) should be caught and handled inside of the call to
-        # main_loop. If an exception is thrown here, then that means that
-        # there is some error that we didn't anticipate.
-        ray.worker.global_worker.main_loop()
-    except Exception:
-        traceback_str = traceback.format_exc() + error_explanation
-        ray.utils.push_error_to_driver(
-            ray.worker.global_worker,
-            "worker_crash",
-            traceback_str,
-            job_id=None)
-        # TODO(rkn): Note that if the worker was in the middle of executing
-        # a task, then any worker or driver that is blocking in a get call
-        # and waiting for the output of that task will hang. We need to
-        # address this.
+    ray.worker.global_worker.main_loop()