[Core] Object spilling prototype (#9818)

2026-06-28 10:01:11 +08:00 · 2020-08-14 15:39:10 -07:00
parent 36e626e95d
commit 17ca1d8ff4
36 changed files with 1026 additions and 95 deletions
@@ -78,27 +78,11 @@ from ray.profiling import profile  # noqa: E402
 from ray.state import (jobs, nodes, actors, objects, timeline,
                       object_transfer_timeline, cluster_resources,
                       available_resources)  # noqa: E402
-from ray.worker import (
-    LOCAL_MODE,
-    SCRIPT_MODE,
-    WORKER_MODE,
-    cancel,
-    connect,
-    disconnect,
-    get,
-    get_actor,
-    get_gpu_ids,
-    get_resource_ids,
-    get_webui_url,
-    init,
-    is_initialized,
-    put,
-    kill,
-    register_custom_serializer,
-    remote,
-    shutdown,
-    show_in_webui,
-    wait,
+from ray.worker import (  # noqa: F401
+    LOCAL_MODE, SCRIPT_MODE, WORKER_MODE, IO_WORKER_MODE, cancel, connect,
+    disconnect, get, get_actor, get_gpu_ids, get_resource_ids, get_webui_url,
+    init, is_initialized, put, kill, register_custom_serializer, remote,
+    shutdown, show_in_webui, wait,
 )  # noqa: E402
 import ray.internal  # noqa: E402
 import ray.projects  # noqa: E402
@@ -66,6 +66,7 @@ from ray.includes.common cimport (
    TASK_TYPE_ACTOR_TASK,
    WORKER_TYPE_WORKER,
    WORKER_TYPE_DRIVER,
+    WORKER_TYPE_IO_WORKER,
    PLACEMENT_STRATEGY_PACK,
    PLACEMENT_STRATEGY_SPREAD,
 )
@@ -90,6 +91,7 @@ from ray.includes.ray_config cimport RayConfig
 from ray.includes.global_state_accessor cimport CGlobalStateAccessor

 import ray
+from ray import external_storage
 from ray.async_compat import (
    sync_to_async, get_new_event_loop)
 import ray.memory_monitor as memory_monitor
@@ -590,6 +592,49 @@ cdef void gc_collect() nogil:
                    num_freed, end - start))


+# Callback installed as `options.spill_objects` on the core worker. It takes
+# the IDs of the objects to spill, forwards them (as ObjectRefs) to
+# `external_storage.spill_objects` and returns the URLs that call produced.
+# If spilling raises, the error is logged and pushed to the driver, and the
+# URLs collected so far (possibly none) are returned instead of propagating
+# the exception.
+cdef c_vector[c_string] spill_objects_handler(
+        const c_vector[CObjectID]& object_ids_to_spill) nogil:
+    cdef c_vector[c_string] return_urls
+    # The function is declared `nogil`, so the GIL must be re-acquired before
+    # any Python object is touched.
+    with gil:
+        object_refs = VectorToObjectRefs(object_ids_to_spill)
+        try:
+            urls = external_storage.spill_objects(object_refs)
+            for url in urls:
+                return_urls.push_back(url)
+        except Exception:
+            exception_str = (
+                "An unexpected internal error occurred while the IO worker "
+                "was spilling objects.")
+            logger.exception(exception_str)
+            ray.utils.push_error_to_driver(
+                ray.worker.global_worker,
+                "io_worker_spill_objects_error",
+                traceback.format_exc() + exception_str,
+                job_id=None)
+        return return_urls
+
+
+# Callback installed as `options.restore_spilled_objects` on the core worker.
+# It copies the given URLs into a Python list and passes them to
+# `external_storage.restore_spilled_objects`. If restoring raises, the error
+# is logged and pushed to the driver rather than propagated.
+cdef void restore_spilled_objects_handler(
+        const c_vector[c_string]& object_urls) nogil:
+    # The function is declared `nogil`, so the GIL must be re-acquired before
+    # any Python object is touched.
+    with gil:
+        urls = []
+        size = object_urls.size()
+        for i in range(size):
+            urls.append(object_urls[i])
+        try:
+            external_storage.restore_spilled_objects(urls)
+        except Exception:
+            exception_str = (
+                "An unexpected internal error occurred while the IO worker "
+                "was restoring spilled objects.")
+            logger.exception(exception_str)
+            # The error type was previously misspelled as
+            # "io_worker_retore_spilled_objects_error".
+            ray.utils.push_error_to_driver(
+                ray.worker.global_worker,
+                "io_worker_restore_spilled_objects_error",
+                traceback.format_exc() + exception_str,
+                job_id=None)
+
+
 # This function introduces ~2-7us of overhead per call (i.e., it can be called
 # up to hundreds of thousands of times per second).
 cdef void get_py_stack(c_string* stack_out) nogil:
@@ -650,17 +695,25 @@ cdef void terminate_asyncio_thread() nogil:

 cdef class CoreWorker:

-    def __cinit__(self, is_driver, store_socket, raylet_socket,
+    def __cinit__(self, worker_type, store_socket, raylet_socket,
                  JobID job_id, GcsClientOptions gcs_options, log_dir,
                  node_ip_address, node_manager_port, raylet_ip_address,
                  local_mode, driver_name, stdout_file, stderr_file,
                  serialized_job_config, metrics_agent_port):
-        self.is_driver = is_driver
        self.is_local_mode = local_mode

        cdef CCoreWorkerOptions options = CCoreWorkerOptions()
-        options.worker_type = (
-            WORKER_TYPE_DRIVER if is_driver else WORKER_TYPE_WORKER)
+        if worker_type in (ray.LOCAL_MODE, ray.SCRIPT_MODE):
+            self.is_driver = True
+            options.worker_type = WORKER_TYPE_DRIVER
+        elif worker_type == ray.WORKER_MODE:
+            self.is_driver = False
+            options.worker_type = WORKER_TYPE_WORKER
+        elif worker_type == ray.IO_WORKER_MODE:
+            self.is_driver = False
+            options.worker_type = WORKER_TYPE_IO_WORKER
+        else:
+            raise ValueError(f"Unknown worker type: {worker_type}")
        options.language = LANGUAGE_PYTHON
        options.store_socket = store_socket.encode("ascii")
        options.raylet_socket = raylet_socket.encode("ascii")
@@ -678,6 +731,8 @@ cdef class CoreWorker:
        options.task_execution_callback = task_execution_handler
        options.check_signals = check_signals
        options.gc_collect = gc_collect
+        options.spill_objects = spill_objects_handler
+        options.restore_spilled_objects = restore_spilled_objects_handler
        options.get_lang_stack = get_py_stack
        options.ref_counting_enabled = True
        options.is_local_mode = local_mode
@@ -725,15 +780,15 @@ cdef class CoreWorker:
        return self.plasma_event_handler

    def get_objects(self, object_refs, TaskID current_task_id,
-                    int64_t timeout_ms=-1):
+                    int64_t timeout_ms=-1, plasma_objects_only=False):
        cdef:
            c_vector[shared_ptr[CRayObject]] results
            CTaskID c_task_id = current_task_id.native()
            c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs)
-
+            c_bool _plasma_objects_only = plasma_objects_only
        with nogil:
            check_status(CCoreWorkerProcess.GetCoreWorker().Get(
-                c_object_ids, timeout_ms, &results))
+                c_object_ids, timeout_ms, &results, _plasma_objects_only))

        return RayObjectsToDataMetadataPairs(results)

@@ -771,6 +826,48 @@ cdef class CoreWorker:
        # and deal with it here.
        return data.get() == NULL

+    def put_file_like_object(
+            self, metadata, data_size, file_like, ObjectRef object_ref=None):
+        """Directly create a new Plasma Store object from a file like
+        object. This avoids extra memory copy.
+
+        Args:
+            metadata (bytes): The metadata of the object.
+            data_size (int): The size of the data buffer.
+            file_like: A python file object that provides the `readinto`
+                interface.
+            object_ref: The new ObjectRef.
+
+        Raises:
+            EOFError: If `file_like` stops producing data before
+                `data_size` bytes have been read.
+        """
+        cdef:
+            CObjectID c_object_id
+            shared_ptr[CBuffer] data_buf
+            shared_ptr[CBuffer] metadata_buf
+        # TODO(suquark): This method does not support put objects to
+        # in memory store currently.
+        metadata_buf = string_to_buffer(metadata)
+        object_already_exists = self._create_put_buffer(
+            metadata_buf, data_size, object_ref,
+            ObjectRefsToVector([]),
+            &c_object_id, &data_buf)
+        if object_already_exists:
+            logger.debug("Object already exists in 'put_file_like_object'.")
+            return
+        data = Buffer.make(data_buf)
+        view = memoryview(data)
+        index = 0
+        while index < data_size:
+            bytes_read = file_like.readinto(view[index:])
+            if not bytes_read:
+                # `readinto` returns 0 at EOF (and None when a non-blocking
+                # source has no data available). Without this check a
+                # source shorter than `data_size` makes the loop spin
+                # forever.
+                # NOTE(review): the buffer created above is left unsealed
+                # on this path - confirm whether it must be released.
+                raise EOFError(
+                    f"Expected {data_size} bytes but the file-like object "
+                    f"ended after {index} bytes.")
+            index += bytes_read
+        with nogil:
+            # Using custom object refs is not supported because we
+            # can't track their lifecycle, so we don't pin the object
+            # in this case.
+            check_status(CCoreWorkerProcess.GetCoreWorker().Seal(
+                         c_object_id, pin_object=object_ref is None))
+
    def put_serialized_object(self, serialized_object,
                              ObjectRef object_ref=None,
                              c_bool pin_object=True):
@@ -1342,6 +1439,20 @@ cdef class CoreWorker:
            resource_name.encode("ascii"), capacity,
            CClientID.FromBinary(client_id.binary()))

+    def force_spill_objects(self, object_refs):
+        """Ask the C++ core worker to spill the given objects.
+
+        `object_refs` is converted to a vector of object IDs and passed to
+        `ForceSpillObjects` with the GIL released; the returned status is
+        handed to `check_status`.
+
+        Args:
+            object_refs: The refs of the objects to spill.
+        """
+        cdef c_vector[CObjectID] object_ids
+        object_ids = ObjectRefsToVector(object_refs)
+        with nogil:
+            check_status(CCoreWorkerProcess.GetCoreWorker()
+                         .ForceSpillObjects(object_ids))
+
+    def force_restore_spilled_objects(self, object_refs):
+        """Ask the C++ core worker to restore the given spilled objects.
+
+        `object_refs` is converted to a vector of object IDs and passed to
+        `ForceRestoreSpilledObjects` with the GIL released; the returned
+        status is handed to `check_status`.
+
+        Args:
+            object_refs: The refs of the objects to restore.
+        """
+        cdef c_vector[CObjectID] object_ids
+        object_ids = ObjectRefsToVector(object_refs)
+        with nogil:
+            check_status(CCoreWorkerProcess.GetCoreWorker()
+                         .ForceRestoreSpilledObjects(object_ids))
+
 cdef void async_set_result(shared_ptr[CRayObject] obj,
                           CObjectID object_ref,
                           void *future) with gil:
@@ -1,10 +1,13 @@
 from .api import get, wait
 from .dynamic_resources import set_resource
+from .object_spilling import force_spill_objects, force_restore_spilled_objects
 from .placement_group import (
    placement_group, )
 __all__ = [
    "get",
    "wait",
    "set_resource",
+    "force_spill_objects",
+    "force_restore_spilled_objects",
    "placement_group",
 ]
@@ -0,0 +1,35 @@
+import ray
+
+
+def force_spill_objects(object_refs):
+    """Force spilling objects to external storage.
+
+    Args:
+        object_refs: An iterable of `ray.ObjectRef`s identifying the
+            objects to be spilled. It is materialized into a list first
+            so that a one-shot iterable (e.g. a generator) is not
+            exhausted by validation before it reaches the core worker.
+
+    Returns:
+        The result of the core worker's `force_spill_objects` call.
+
+    Raises:
+        TypeError: If any element is not a `ray.ObjectRef`.
+    """
+    core_worker = ray.worker.global_worker.core_worker
+    object_refs = list(object_refs)
+    # Make sure that the values are object refs.
+    for object_ref in object_refs:
+        if not isinstance(object_ref, ray.ObjectRef):
+            raise TypeError(
+                f"Attempting to call `force_spill_objects` on the "
+                f"value {object_ref}, which is not an ray.ObjectRef.")
+    return core_worker.force_spill_objects(object_refs)
+
+
+def force_restore_spilled_objects(object_refs):
+    """Force restoring objects from external storage.
+
+    Args:
+        object_refs: An iterable of `ray.ObjectRef`s identifying the
+            objects to be restored. It is materialized into a list first
+            so that a one-shot iterable (e.g. a generator) is not
+            exhausted by validation before it reaches the core worker.
+
+    Returns:
+        The result of the core worker's `force_restore_spilled_objects`
+        call.
+
+    Raises:
+        TypeError: If any element is not a `ray.ObjectRef`.
+    """
+    core_worker = ray.worker.global_worker.core_worker
+    object_refs = list(object_refs)
+    # Make sure that the values are object refs.
+    for object_ref in object_refs:
+        if not isinstance(object_ref, ray.ObjectRef):
+            raise TypeError(
+                f"Attempting to call `force_restore_spilled_objects` on the "
+                f"value {object_ref}, which is not an ray.ObjectRef.")
+    return core_worker.force_restore_spilled_objects(object_refs)
@@ -0,0 +1,129 @@
+import abc
+import os
+from typing import List
+import ray
+
+
+class ExternalStorage(metaclass=abc.ABCMeta):
+    """The base class for external storage.
+
+    This class provides some useful functions for zero-copy object
+    put/get from plasma store. Also it specifies the interface for
+    object spilling.
+    """
+
+    def _get_objects_from_store(self, object_refs):
+        """Fetch the given objects through the global worker's core worker.
+
+        The lookup is made with `timeout_ms=0` and
+        `plasma_objects_only=True`.
+
+        Args:
+            object_refs: The refs of the objects to fetch.
+        Returns:
+            The pairs returned by `core_worker.get_objects`; callers in
+            this module unpack each pair as `(buf, metadata)`.
+        """
+        worker = ray.worker.global_worker
+        ray_object_pairs = worker.core_worker.get_objects(
+            object_refs,
+            worker.current_task_id,
+            timeout_ms=0,
+            plasma_objects_only=True)
+        return ray_object_pairs
+
+    def _put_object_to_store(self, metadata, data_size, file_like, object_ref):
+        """Create an object in the store from a file-like object.
+
+        Thin wrapper around `core_worker.put_file_like_object`.
+
+        Args:
+            metadata: The metadata of the object.
+            data_size: The number of data bytes to read from `file_like`.
+            file_like: The file-like object the data is read from.
+            object_ref: The ref under which the object is created.
+        """
+        worker = ray.worker.global_worker
+        worker.core_worker.put_file_like_object(metadata, data_size, file_like,
+                                                object_ref)
+
+    @abc.abstractmethod
+    def spill_objects(self, object_refs):
+        """Spill objects to the external storage. Objects are specified
+        by their object refs.
+
+        Args:
+            object_refs: The list of the refs of the objects to be spilled.
+        Returns:
+            A list of keys corresponding to the input object refs.
+        """
+
+    @abc.abstractmethod
+    def restore_spilled_objects(self, keys: List[bytes]):
+        """Restore objects from the external storage. Objects are
+        specified by the keys that `spill_objects` returned for them.
+
+        Args:
+            keys: A list of bytes corresponding to the spilled objects.
+        """
+
+
+class NullStorage(ExternalStorage):
+    """Placeholder storage used while no external storage is configured.
+
+    Both operations fail with `NotImplementedError`.
+    """
+
+    # Shared by both methods so the two messages cannot drift apart.
+    _NOT_INITIALIZED = "External storage is not initialized"
+
+    def spill_objects(self, object_refs):
+        raise NotImplementedError(self._NOT_INITIALIZED)
+
+    def restore_spilled_objects(self, keys):
+        raise NotImplementedError(self._NOT_INITIALIZED)
+
+
+class FileSystemStorage(ExternalStorage):
+    """The class for filesystem-like external storage.
+
+    Each object is written to its own file named `<prefix><object ref hex>`
+    inside `directory_path`. The file layout is:
+
+        8 bytes  little-endian metadata length
+        8 bytes  little-endian data length
+        metadata bytes
+        data bytes
+
+    Args:
+        directory_path: The directory that holds the spilled objects. It is
+            created if it does not exist yet.
+    """
+
+    def __init__(self, directory_path):
+        self.directory_path = directory_path
+        self.prefix = "ray_spilled_object_"
+        # Create the directory up front so that the first spill does not
+        # fail with FileNotFoundError on a path that does not exist yet.
+        os.makedirs(self.directory_path, exist_ok=True)
+
+    def spill_objects(self, object_refs):
+        """Write the given objects to files in `directory_path`.
+
+        Args:
+            object_refs: The list of the refs of the objects to be spilled.
+        Returns:
+            A list of keys (the encoded file names), one per object ref.
+        """
+        keys = []
+        ray_object_pairs = self._get_objects_from_store(object_refs)
+        for ref, (buf, metadata) in zip(object_refs, ray_object_pairs):
+            filename = self.prefix + ref.hex()
+            with open(os.path.join(self.directory_path, filename), "wb") as f:
+                metadata_len = len(metadata)
+                buf_len = len(buf)
+                f.write(metadata_len.to_bytes(8, byteorder="little"))
+                f.write(buf_len.to_bytes(8, byteorder="little"))
+                f.write(metadata)
+                f.write(memoryview(buf))
+            keys.append(filename.encode())
+        return keys
+
+    def restore_spilled_objects(self, keys):
+        """Read the spilled files back and put the objects into the store.
+
+        Args:
+            keys: A list of bytes keys as returned by `spill_objects`.
+        Raises:
+            ValueError: If a spilled file is too short to contain its
+                header or its metadata.
+        """
+        for k in keys:
+            filename = k.decode()
+            ref = ray.ObjectRef(bytes.fromhex(filename[len(self.prefix):]))
+            with open(os.path.join(self.directory_path, filename), "rb") as f:
+                header = f.read(16)
+                # A short read would otherwise be decoded as zero-length
+                # fields and restore an empty object without any error.
+                if len(header) != 16:
+                    raise ValueError(
+                        f"Spilled object file {filename} is truncated: "
+                        f"incomplete header.")
+                metadata_len = int.from_bytes(header[:8], byteorder="little")
+                buf_len = int.from_bytes(header[8:], byteorder="little")
+                metadata = f.read(metadata_len)
+                if len(metadata) != metadata_len:
+                    raise ValueError(
+                        f"Spilled object file {filename} is truncated: "
+                        f"incomplete metadata.")
+                # read remaining data to our buffer
+                self._put_object_to_store(metadata, buf_len, f, ref)
+
+
+_external_storage = NullStorage()
+
+
+def setup_external_storage(config):
+    """Install the module-level external storage described by `config`.
+
+    Args:
+        config: A mapping with a "type" entry and a "params" entry holding
+            the keyword arguments of the storage class. A falsy value
+            resets the storage to `NullStorage`.
+
+    Raises:
+        ValueError: If the storage type is not "filesystem".
+    """
+    global _external_storage
+    if not config:
+        _external_storage = NullStorage()
+        return
+    storage_type = config["type"]
+    if storage_type != "filesystem":
+        raise ValueError(f"Unknown external storage type: {storage_type}")
+    _external_storage = FileSystemStorage(**config["params"])
+
+
+def spill_objects(object_refs):
+    """Spill the referenced objects using the configured external storage.
+
+    Args:
+        object_refs: The list of the refs of the objects to be spilled.
+    Returns:
+        A list of keys corresponding to the input object refs.
+    """
+    storage = _external_storage
+    spilled_keys = storage.spill_objects(object_refs)
+    return spilled_keys
+
+
+def restore_spilled_objects(keys: List[bytes]):
+    """Restore objects from the external storage. Objects are specified
+    by the keys that `spill_objects` returned for them.
+
+    Args:
+        keys: A list of bytes corresponding to the spilled objects.
+    """
+    _external_storage.restore_spilled_objects(keys)
@@ -162,6 +162,7 @@ cdef extern from "src/ray/protobuf/common.pb.h" nogil:
 cdef extern from "src/ray/protobuf/common.pb.h" nogil:
    cdef CWorkerType WORKER_TYPE_WORKER "ray::WorkerType::WORKER"
    cdef CWorkerType WORKER_TYPE_DRIVER "ray::WorkerType::DRIVER"
+    cdef CWorkerType WORKER_TYPE_IO_WORKER "ray::WorkerType::IO_WORKER"

 cdef extern from "src/ray/protobuf/common.pb.h" nogil:
    cdef CTaskType TASK_TYPE_NORMAL_TASK "ray::TaskType::NORMAL_TASK"
@@ -165,7 +165,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
                          shared_ptr[CBuffer] *data)
        CRayStatus Seal(const CObjectID &object_id, c_bool pin_object)
        CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms,
-                       c_vector[shared_ptr[CRayObject]] *results)
+                       c_vector[shared_ptr[CRayObject]] *results,
+                       c_bool plasma_objects_only)
        CRayStatus Contains(const CObjectID &object_id, c_bool *has_object)
        CRayStatus Wait(const c_vector[CObjectID] &object_ids, int num_objects,
                        int64_t timeout_ms, c_vector[c_bool] *results)
@@ -192,6 +193,9 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
        CRayStatus SetResource(const c_string &resource_name,
                               const double capacity,
                               const CClientID &client_Id)
+        CRayStatus ForceSpillObjects(const c_vector[CObjectID] &object_ids)
+        CRayStatus ForceRestoreSpilledObjects(
+                const c_vector[CObjectID] &object_ids)

    cdef cppclass CCoreWorkerOptions "ray::CoreWorkerOptions":
        CWorkerType worker_type
@@ -220,6 +224,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
         ) task_execution_callback
        (CRayStatus() nogil) check_signals
        (void() nogil) gc_collect
+        (c_vector[c_string](const c_vector[CObjectID]&) nogil) spill_objects
+        (void(const c_vector[c_string]&) nogil) restore_spilled_objects
        (void(c_string *stack_out) nogil) get_lang_stack
        c_bool ref_counting_enabled
        c_bool is_local_mode
@@ -718,7 +718,8 @@ class Node:
            socket_to_use=self.socket,
            head_node=self.head,
            start_initial_python_workers_for_first_job=self._ray_params.
-            start_initial_python_workers_for_first_job)
+            start_initial_python_workers_for_first_job,
+            object_spilling_config=self._ray_params.object_spilling_config)
        assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes
        self.all_processes[ray_constants.PROCESS_TYPE_RAYLET] = [process_info]

@@ -145,7 +145,8 @@ class RayParams:
                 enable_object_reconstruction=False,
                 metrics_agent_port=None,
                 metrics_export_port=None,
-                 lru_evict=False):
+                 lru_evict=False,
+                 object_spilling_config=None):
        self.object_ref_seed = object_ref_seed
        self.redis_address = redis_address
        self.num_cpus = num_cpus
@@ -190,6 +191,7 @@ class RayParams:
        self._internal_config = _internal_config
        self._lru_evict = lru_evict
        self._enable_object_reconstruction = enable_object_reconstruction
+        self.object_spilling_config = object_spilling_config
        self._check_usage()

        # Set the internal config options for LRU eviction.
@@ -1289,7 +1289,8 @@ def start_raylet(redis_address,
                 fate_share=None,
                 socket_to_use=None,
                 head_node=False,
-                 start_initial_python_workers_for_first_job=False):
+                 start_initial_python_workers_for_first_job=False,
+                 object_spilling_config=None):
    """Start a raylet, which is a combined local scheduler and object manager.

    Args:
@@ -1398,6 +1399,10 @@ def start_raylet(redis_address,
    if load_code_from_local:
        start_worker_command += ["--load-code-from-local"]

+    if object_spilling_config:
+        start_worker_command.append(
+            f"--object-spilling-config={json.dumps(object_spilling_config)}")
+
    command = [
        RAYLET_EXECUTABLE,
        "--raylet_socket_name={}".format(raylet_name),
@@ -48,6 +48,7 @@ py_test_module_list(
    "test_stress_sharded.py",
    "test_unreconstructable_errors.py",
    "test_tensorflow.py",
+    "test_object_spilling.py",
  ],
  size = "medium",
  extra_srcs = SRCS,
@@ -0,0 +1,140 @@
+import json
+import random
+import time
+
+import numpy as np
+import pytest
+import ray
+
+
+def test_spill_objects_manually(shutdown_only):
+    """Spill objects by hand whenever the store is full, then read them back.
+    """
+    # Limit our object store to 75 MiB of memory.
+    ray.init(
+        object_store_memory=75 * 1024 * 1024,
+        object_spilling_config={
+            "type": "filesystem",
+            "params": {
+                "directory_path": "/tmp"
+            }
+        },
+        _internal_config=json.dumps({
+            "object_store_full_max_retries": 0,
+            "max_io_workers": 4,
+        }))
+    arr = np.random.rand(1024 * 1024)  # 8 MB data
+    replay_buffer = []
+    pinned_objects = set()
+    spilled_objects = set()
+
+    # Create 25 objects of 8 MiB each (200 MiB in total), which does not
+    # fit into the store configured above.
+    for _ in range(25):
+        ref = None
+        # Retry the put until it succeeds; every ObjectStoreFullError is
+        # answered by spilling one of the objects put so far.
+        while ref is None:
+            try:
+                ref = ray.put(arr)
+                replay_buffer.append(ref)
+                pinned_objects.add(ref)
+            except ray.exceptions.ObjectStoreFullError:
+                ref_to_spill = pinned_objects.pop()
+                ray.experimental.force_spill_objects([ref_to_spill])
+                spilled_objects.add(ref_to_spill)
+
+    # Spill 2 more objects so we will always have enough space for
+    # restoring objects back.
+    refs_to_spill = (pinned_objects.pop(), pinned_objects.pop())
+    ray.experimental.force_spill_objects(refs_to_spill)
+    spilled_objects.update(refs_to_spill)
+
+    # randomly sample objects
+    for _ in range(100):
+        ref = random.choice(replay_buffer)
+        # Objects that were spilled are restored explicitly before the get.
+        if ref in spilled_objects:
+            ray.experimental.force_restore_spilled_objects([ref])
+        sample = ray.get(ref)
+        assert np.array_equal(sample, arr)
+
+
+def test_spill_objects_manually_from_workers(shutdown_only):
+    """Spill and restore an object from inside remote tasks."""
+    # Limit our object store to 100 MiB of memory.
+    ray.init(
+        object_store_memory=100 * 1024 * 1024,
+        object_spilling_config={
+            "type": "filesystem",
+            "params": {
+                "directory_path": "/tmp"
+            }
+        },
+        _internal_config=json.dumps({
+            "object_store_full_max_retries": 0,
+            "max_io_workers": 4,
+        }))
+
+    @ray.remote
+    def _worker():
+        # Each task puts its own array, spills it, restores it and checks
+        # that the restored value equals the original.
+        arr = np.random.rand(100 * 1024)
+        ref = ray.put(arr)
+        ray.experimental.force_spill_objects([ref])
+        ray.experimental.force_restore_spilled_objects([ref])
+        assert np.array_equal(ray.get(ref), arr)
+
+    ray.get([_worker.remote() for _ in range(50)])
+
+
+def test_spill_objects_manually_with_workers(shutdown_only):
+    """Put objects from the driver and spill them from remote tasks."""
+    # Limit our object store to 100 MiB of memory.
+    ray.init(
+        object_store_memory=100 * 1024 * 1024,
+        object_spilling_config={
+            "type": "filesystem",
+            "params": {
+                "directory_path": "/tmp"
+            }
+        },
+        _internal_config=json.dumps({
+            "object_store_full_max_retries": 0,
+            "max_io_workers": 4,
+        }))
+    arrays = [np.random.rand(100 * 1024) for _ in range(50)]
+    objects = [ray.put(arr) for arr in arrays]
+
+    @ray.remote
+    def _worker(object_refs):
+        ray.experimental.force_spill_objects(object_refs)
+
+    # One task per object; each task spills the single ref it is given.
+    ray.get([_worker.remote([o]) for o in objects])
+
+    # The objects are fetched here without an explicit restore call.
+    for restored, arr in zip(ray.get(objects), arrays):
+        assert np.array_equal(restored, arr)
+
+
+@pytest.mark.skip(reason="have not been fully implemented")
+def test_spill_objects_automatically(shutdown_only):
+    """Put more data than the store holds with `auto_object_spilling` on.
+
+    Currently skipped.
+    """
+    # Limit our object store to 75 MiB of memory.
+    ray.init(
+        object_store_memory=75 * 1024 * 1024,
+        _internal_config=json.dumps({
+            "max_io_workers": 4,
+            "object_store_full_max_retries": 2,
+            "object_store_full_initial_delay_ms": 10,
+            "auto_object_spilling": True,
+        }))
+    arr = np.random.rand(1024 * 1024)  # 8 MB data
+    replay_buffer = []
+
+    # Wait raylet for starting an IO worker.
+    time.sleep(1)
+
+    # Create 100 objects of 8 MiB each (800 MiB in total).
+    for _ in range(100):
+        ref = None
+        # NOTE(review): nothing catches a failing put here, so this loop
+        # body runs at most once per object.
+        while ref is None:
+            ref = ray.put(arr)
+            replay_buffer.append(ref)
+
+    print("-----------------------------------")
+
+    # randomly sample objects
+    for _ in range(1000):
+        ref = random.choice(replay_buffer)
+        sample = ray.get(ref, timeout=0)
+        assert np.array_equal(sample, arr)
@@ -53,6 +53,7 @@ from ray.utils import (_random_string, check_oversized_pickle, is_cython,
 SCRIPT_MODE = 0
 WORKER_MODE = 1
 LOCAL_MODE = 2
+IO_WORKER_MODE = 3

 ERROR_KEY_PREFIX = b"Error:"

@@ -513,7 +514,8 @@ def init(address=None,
         _internal_config=None,
         lru_evict=False,
         enable_object_reconstruction=False,
-         _metrics_export_port=None):
+         _metrics_export_port=None,
+         object_spilling_config=None):
    """
    Connect to an existing Ray cluster or start one and connect to it.

@@ -642,6 +644,8 @@ def init(address=None,
        _metrics_export_port(int): Port number Ray exposes system metrics
            through a Prometheus endpoint. It is currently under active
            development, and the API is subject to change.
+        object_spilling_config (dict): The configuration for the object
+            spilling I/O workers. It is serialized to JSON when the workers
+            are started, so pass a dict rather than a JSON string.

    Returns:
        Address information about the started processes.
@@ -737,7 +741,8 @@ def init(address=None,
            _internal_config=_internal_config,
            lru_evict=lru_evict,
            enable_object_reconstruction=enable_object_reconstruction,
-            metrics_export_port=_metrics_export_port)
+            metrics_export_port=_metrics_export_port,
+            object_spilling_config=object_spilling_config)
        # Start the Ray processes. We set shutdown_at_exit=False because we
        # shutdown the node in the ray.shutdown call that happens in the atexit
        # handler. We still spawn a reaper process in case the atexit handler
@@ -1206,7 +1211,7 @@ def connect(node,
    worker.redis_client = node.create_redis_client()

    # Initialize some fields.
-    if mode is WORKER_MODE:
+    if mode in (WORKER_MODE, IO_WORKER_MODE):
        # We should not specify the job_id if it's `WORKER_MODE`.
        assert job_id is None
        job_id = JobID.nil()
@@ -1260,7 +1265,7 @@ def connect(node,
        import __main__ as main
        driver_name = (main.__file__
                       if hasattr(main, "__file__") else "INTERACTIVE MODE")
-    elif mode == WORKER_MODE:
+    elif mode == WORKER_MODE or mode == IO_WORKER_MODE:
        # Check the RedirectOutput key in Redis and based on its value redirect
        # worker output and error to their own files.
        # This key is set in services.py when Redis is started.
@@ -1295,8 +1300,7 @@ def connect(node,
        job_config = ray.job_config.JobConfig()
    serialized_job_config = job_config.serialize()
    worker.core_worker = ray._raylet.CoreWorker(
-        (mode == SCRIPT_MODE or mode == LOCAL_MODE),
-        node.plasma_store_socket_name, node.raylet_socket_name, job_id,
+        mode, node.plasma_store_socket_name, node.raylet_socket_name, job_id,
        gcs_options, node.get_logs_dir_path(), node.node_ip_address,
        node.node_manager_port, node.raylet_ip_address, (mode == LOCAL_MODE),
        driver_name, log_stdout_file_path, log_stderr_file_path,
@@ -1,5 +1,6 @@
 import argparse
 import json
+import time

 import ray
 import ray.actor
@@ -80,17 +81,47 @@ parser.add_argument(
    default=False,
    action="store_true",
    help="True if cloudpickle should be used for serialization.")
+parser.add_argument(
+    "--worker-type",
+    required=False,
+    type=str,
+    default="WORKER",
+    help="Specify the type of the worker process")
 parser.add_argument(
    "--metrics-agent-port",
    required=True,
    type=int,
    help="the port of the node's metric agent.")
+parser.add_argument(
+    "--object-spilling-config",
+    required=False,
+    type=str,
+    default="",
+    help="The configuration of object spilling. Only used by I/O workers.")

 if __name__ == "__main__":
    args = parser.parse_args()

    ray.utils.setup_logger(args.logging_level, args.logging_format)

+    if args.worker_type == "WORKER":
+        mode = ray.WORKER_MODE
+    elif args.worker_type == "IO_WORKER":
+        mode = ray.IO_WORKER_MODE
+    else:
+        raise ValueError("Unknown worker type: " + args.worker_type)
+
+    # NOTE(suquark): We must initialize the external storage before we
+    # connect to raylet. Otherwise we may receive requests before the
+    # external storage is initialized.
+    if mode == ray.IO_WORKER_MODE:
+        from ray import external_storage
+        if args.object_spilling_config:
+            object_spilling_config = json.loads(args.object_spilling_config)
+        else:
+            object_spilling_config = {}
+        external_storage.setup_external_storage(object_spilling_config)
+
    internal_config = {}
    if args.config_list is not None:
        config_list = args.config_list.split(",")
@@ -125,5 +156,14 @@ if __name__ == "__main__":
        spawn_reaper=False,
        connect_only=True)
    ray.worker._global_node = node
-    ray.worker.connect(node, mode=ray.WORKER_MODE)
-    ray.worker.global_worker.main_loop()
+
+    ray.worker.connect(node, mode=mode)
+    if mode == ray.WORKER_MODE:
+        ray.worker.global_worker.main_loop()
+    elif mode == ray.IO_WORKER_MODE:
+        # It is handled by another thread in the C++ core worker.
+        # We just need to keep the worker alive.
+        while True:
+            time.sleep(100000)
+    else:
+        raise ValueError(f"Unexcepted worker mode: {mode}")