[Object Spilling] Fusion small objects (#12087)

2026-06-27 23:54:34 +08:00 · 2020-11-25 10:13:32 -08:00
parent 09d5413f70
commit 2e4e285ef0
18 changed files with 413 additions and 117 deletions
@@ -611,17 +611,19 @@ cdef c_vector[c_string] spill_objects_handler(


 cdef void restore_spilled_objects_handler(
+        const c_vector[CObjectID]& object_ids_to_restore,
        const c_vector[c_string]& object_urls) nogil:
    with gil:
        urls = []
        size = object_urls.size()
        for i in range(size):
            urls.append(object_urls[i])
+        object_refs = VectorToObjectRefs(object_ids_to_restore)
        try:
            with ray.worker._changeproctitle(
                    ray_constants.WORKER_PROCESS_TYPE_RESTORE_WORKER,
                    ray_constants.WORKER_PROCESS_TYPE_RESTORE_WORKER_IDLE):
-                external_storage.restore_spilled_objects(urls)
+                external_storage.restore_spilled_objects(object_refs, urls)
        except Exception:
            exception_str = (
                "An unexpected internal error occurred while the IO worker "
@@ -1,8 +1,64 @@
 import abc
 import os
-from typing import List
+import urllib
+from collections import namedtuple
+from typing import List, IO, Tuple

 import ray
+from ray.ray_constants import DEFAULT_OBJECT_PREFIX
+from ray._raylet import ObjectRef
+
+ParsedURL = namedtuple("ParsedURL", "base_url, offset, size")
+
+
+def create_url_with_offset(*, url: str, offset: int, size: int) -> str:
+    """Methods to create a URL with offset.
+
+    When ray spills objects, it fuses multiple objects
+    into one file to optimize the performance. That says, each object
+    needs to keep tracking of its own special url to store metadata.
+
+    This method creates an url_with_offset, which is used internally
+    by Ray.
+
+    Created url_with_offset can be passed to the self._get_base_url method
+    to parse the filename used to store files.
+
+    Example) file://path/to/file?offset=""&size=""
+
+    Args:
+        url(str): url to the object stored in the external storage.
+        offset(int): Offset from the beginning of the file to
+            the first bytes of this object.
+        size(int): Size of the object that is stored in the url.
+            It is used to calculate the last offset.
+
+    Returns:
+        url_with_offset stored internally to find
+        objects from external storage.
+    """
+    return f"{url}?offset={offset}&size={size}"
+
+
+def parse_url_with_offset(url_with_offset: str) -> Tuple[str, int, int]:
+    """Parse url_with_offset to retrieve information.
+
+    base_url is the url where the object ref
+    is stored in the external storage.
+
+    Args:
+        url_with_offset(str): url created by create_url_with_offset.
+
+    Returns:
+        named tuple of base_url, offset, and size.
+    """
+    parsed_result = urllib.parse.urlparse(url_with_offset)
+    query_dict = urllib.parse.parse_qs(parsed_result.query)
+    # Split by ? to remove the query from the url.
+    base_url = parsed_result.geturl().split("?")[0]
+    offset = int(query_dict["offset"][0])
+    size = int(query_dict["size"][0])
+    return ParsedURL(base_url=base_url, offset=offset, size=size)


 class ExternalStorage(metaclass=abc.ABCMeta):
@@ -35,34 +91,88 @@ class ExternalStorage(metaclass=abc.ABCMeta):
        worker.core_worker.put_file_like_object(metadata, data_size, file_like,
                                                object_ref)

+    def _write_multiple_objects(self, f: IO, object_refs: List[ObjectRef],
+                                url: str) -> List[str]:
+        """Fuse all given objects into a given file handle.
+
+        Args:
+            f(IO): File handle to fusion all given object refs.
+            object_refs(list): Object references to fusion to a single file.
+            url(str): url where the object ref is stored
+                in the external storage.
+
+        Return:
+            List of urls_with_offset of fusioned objects.
+            The order of returned keys are equivalent to the one
+            with given object_refs.
+        """
+        keys = []
+        offset = 0
+        ray_object_pairs = self._get_objects_from_store(object_refs)
+        for ref, (buf, metadata) in zip(object_refs, ray_object_pairs):
+            metadata_len = len(metadata)
+            buf_len = len(buf)
+            # 16 bytes to store metadata and buffer length.
+            data_size_in_bytes = metadata_len + buf_len + 16
+            f.write(metadata_len.to_bytes(8, byteorder="little"))
+            f.write(buf_len.to_bytes(8, byteorder="little"))
+            f.write(metadata)
+            f.write(memoryview(buf))
+            url_with_offset = create_url_with_offset(
+                url=url, offset=offset, size=data_size_in_bytes)
+            keys.append(url_with_offset.encode())
+            offset += data_size_in_bytes
+        return keys
+
+    def _size_check(self, metadata_len, buffer_len, obtained_data_size):
+        """Check whether or not the obtained_data_size is as expected.
+
+        Args:
+             metadata_len(int): Actual metadata length of the object.
+             buffer_len(int): Actual buffer length of the object.
+             obtained_data_size(int): Data size specified in the
+                url_with_offset.
+
+        Raises:
+            ValueError if obtained_data_size is different from
+            metadata_len + buffer_len + 16(first 8 bytes to store length).
+        """
+        data_size_in_bytes = metadata_len + buffer_len + 16
+        if data_size_in_bytes != obtained_data_size:
+            raise ValueError(
+                f"Obtained data has a size of {data_size_in_bytes}, "
+                "although it is supposed to have the "
+                f"size of {obtained_data_size}.")
+
    @abc.abstractmethod
-    def spill_objects(self, object_refs):
+    def spill_objects(self, object_refs) -> List[str]:
        """Spill objects to the external storage. Objects are specified
        by their object refs.

        Args:
            object_refs: The list of the refs of the objects to be spilled.
        Returns:
-            A list of keys corresponding to the input object refs.
+            A list of internal URLs with object offset.
        """

    @abc.abstractmethod
-    def restore_spilled_objects(self, keys: List[bytes]):
-        """Spill objects to the external storage. Objects are specified
-        by their object refs.
+    def restore_spilled_objects(self, object_refs: List[ObjectRef],
+                                url_with_offset_list: List[str]):
+        """Restore objects from the external storage.

        Args:
-            keys: A list of bytes corresponding to the spilled objects.
+            object_refs: List of object IDs (note that it is not ref).
+            url_with_offset_list: List of url_with_offset.
        """


 class NullStorage(ExternalStorage):
    """The class that represents an uninitialized external storage."""

-    def spill_objects(self, object_refs):
+    def spill_objects(self, object_refs) -> List[str]:
        raise NotImplementedError("External storage is not initialized")

-    def restore_spilled_objects(self, keys):
+    def restore_spilled_objects(self, object_refs, url_with_offset_list):
        raise NotImplementedError("External storage is not initialized")


@@ -76,37 +186,40 @@ class FileSystemStorage(ExternalStorage):

    def __init__(self, directory_path):
        self.directory_path = directory_path
-        self.prefix = "ray_spilled_object_"
+        self.prefix = DEFAULT_OBJECT_PREFIX
        os.makedirs(self.directory_path, exist_ok=True)
        if not os.path.exists(self.directory_path):
            raise ValueError("The given directory path to store objects, "
                             f"{self.directory_path}, could not be created.")

-    def spill_objects(self, object_refs):
-        keys = []
-        ray_object_pairs = self._get_objects_from_store(object_refs)
-        for ref, (buf, metadata) in zip(object_refs, ray_object_pairs):
-            filename = self.prefix + ref.hex()
-            with open(os.path.join(self.directory_path, filename), "wb") as f:
-                metadata_len = len(metadata)
-                buf_len = len(buf)
-                f.write(metadata_len.to_bytes(8, byteorder="little"))
-                f.write(buf_len.to_bytes(8, byteorder="little"))
-                f.write(metadata)
-                f.write(memoryview(buf))
-            keys.append(filename.encode())
-        return keys
+    def spill_objects(self, object_refs) -> List[str]:
+        if len(object_refs) == 0:
+            return []
+        # Always use the first object ref as a key when fusioning objects.
+        first_ref = object_refs[0]
+        filename = f"{self.prefix}-{first_ref.hex()}-multi-{len(object_refs)}"
+        url = f"{os.path.join(self.directory_path, filename)}"
+        with open(url, "wb") as f:
+            return self._write_multiple_objects(f, object_refs, url)

-    def restore_spilled_objects(self, keys):
-        for k in keys:
-            filename = k.decode()
-            ref = ray.ObjectRef(bytes.fromhex(filename[len(self.prefix):]))
-            with open(os.path.join(self.directory_path, filename), "rb") as f:
+    def restore_spilled_objects(self, object_refs: List[ObjectRef],
+                                url_with_offset_list: List[str]):
+        for i in range(len(object_refs)):
+            object_ref = object_refs[i]
+            url_with_offset = url_with_offset_list[i].decode()
+            # Retrieve the information needed.
+            parsed_result = parse_url_with_offset(url_with_offset)
+            base_url = parsed_result.base_url
+            offset = parsed_result.offset
+            # Read a part of the file and recover the object.
+            with open(base_url, "rb") as f:
+                f.seek(offset)
                metadata_len = int.from_bytes(f.read(8), byteorder="little")
                buf_len = int.from_bytes(f.read(8), byteorder="little")
+                self._size_check(metadata_len, buf_len, parsed_result.size)
                metadata = f.read(metadata_len)
                # read remaining data to our buffer
-                self._put_object_to_store(metadata, buf_len, f, ref)
+                self._put_object_to_store(metadata, buf_len, f, object_ref)


 class ExternalStorageSmartOpenImpl(ExternalStorage):
@@ -115,6 +228,10 @@ class ExternalStorageSmartOpenImpl(ExternalStorage):

    Smart open supports multiple backend with the same APIs.

+    To use this implementation, you should pre-create the given uri.
+    For example, if your uri is a local file path, you should pre-create
+    the directory.
+
    Args:
        uri(str): Storage URI used for smart open.
        prefix(str): Prefix of objects that are stored.
@@ -129,7 +246,7 @@ class ExternalStorageSmartOpenImpl(ExternalStorage):

    def __init__(self,
                 uri: str,
-                 prefix: str = "ray_spilled_object_",
+                 prefix: str = DEFAULT_OBJECT_PREFIX,
                 override_transport_params: dict = None):
        try:
            from smart_open import open  # noqa
@@ -142,49 +259,49 @@ class ExternalStorageSmartOpenImpl(ExternalStorage):
        self.uri = uri.strip("/")
        self.prefix = prefix
        self.override_transport_params = override_transport_params or {}
-        self.transport_params = {}.update(self.override_transport_params)
+        # smart_open always seek to 0 if we don't set this argument.
+        # This will lead us to call a Object.get when it is not necessary,
+        # so defer seek and call seek before reading objects instead.
+        self.transport_params = {"defer_seek": True}
+        self.transport_params.update(self.override_transport_params)

-    def spill_objects(self, object_refs):
-        keys = []
-        ray_object_pairs = self._get_objects_from_store(object_refs)
-        for ref, (buf, metadata) in zip(object_refs, ray_object_pairs):
-            key = self.prefix + ref.hex()
-            self._spill_object(key, ref, buf, metadata)
-            keys.append(key.encode())
-        return keys
-
-    def restore_spilled_objects(self, keys):
-        for k in keys:
-            key = k.decode()
-            ref = ray.ObjectRef(bytes.fromhex(key[len(self.prefix):]))
-            self._restore_spilled_object(key, ref)
-
-    def _spill_object(self, key, ref, buf, metadata):
+    def spill_objects(self, object_refs) -> List[str]:
+        if len(object_refs) == 0:
+            return []
        from smart_open import open
+        # Always use the first object ref as a key when fusioning objects.
+        first_ref = object_refs[0]
+        key = f"{self.prefix}-{first_ref.hex()}-multi-{len(object_refs)}"
+        url = f"{self.uri}/{key}"
        with open(
-                self._build_uri(key), "wb",
+                url, "wb",
                transport_params=self.transport_params) as file_like:
-            metadata_len = len(metadata)
-            buf_len = len(buf)
-            file_like.write(metadata_len.to_bytes(8, byteorder="little"))
-            file_like.write(buf_len.to_bytes(8, byteorder="little"))
-            file_like.write(metadata)
-            file_like.write(memoryview(buf))
+            return self._write_multiple_objects(file_like, object_refs, url)

-    def _restore_spilled_object(self, key, ref):
+    def restore_spilled_objects(self, object_refs: List[ObjectRef],
+                                url_with_offset_list: List[str]):
        from smart_open import open
-        with open(
-                self._build_uri(key), "rb",
-                transport_params=self.transport_params) as file_like:
-            metadata_len = int.from_bytes(
-                file_like.read(8), byteorder="little")
-            buf_len = int.from_bytes(file_like.read(8), byteorder="little")
-            metadata = file_like.read(metadata_len)
-            # read remaining data to our buffer
-            self._put_object_to_store(metadata, buf_len, file_like, ref)
+        for i in range(len(object_refs)):
+            object_ref = object_refs[i]
+            url_with_offset = url_with_offset_list[i].decode()

-    def _build_uri(self, key):
-        return f"{self.uri}/{key}"
+            # Retrieve the information needed.
+            parsed_result = parse_url_with_offset(url_with_offset)
+            base_url = parsed_result.base_url
+            offset = parsed_result.offset
+
+            with open(
+                    base_url, "rb",
+                    transport_params=self.transport_params) as f:
+                # smart open seek reads the file from offset-end_of_the_file
+                # when the seek is called.
+                f.seek(offset)
+                metadata_len = int.from_bytes(f.read(8), byteorder="little")
+                buf_len = int.from_bytes(f.read(8), byteorder="little")
+                self._size_check(metadata_len, buf_len, parsed_result.size)
+                metadata = f.read(metadata_len)
+                # read remaining data to our buffer
+                self._put_object_to_store(metadata, buf_len, f, object_ref)


 _external_storage = NullStorage()
@@ -223,11 +340,13 @@ def spill_objects(object_refs):
    return _external_storage.spill_objects(object_refs)


-def restore_spilled_objects(keys: List[bytes]):
-    """Spill objects to the external storage. Objects are specified
-    by their object refs.
+def restore_spilled_objects(object_refs: List[ObjectRef],
+                            url_with_offset_list: List[str]):
+    """Restore objects from the external storage.

    Args:
-        keys: A list of bytes corresponding to the spilled objects.
+        object_refs: List of object IDs (note that it is not ref).
+        url_with_offset_list: List of url_with_offset.
    """
-    _external_storage.restore_spilled_objects(keys)
+    _external_storage.restore_spilled_objects(object_refs,
+                                              url_with_offset_list)
@@ -232,8 +232,10 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
        (void(const CWorkerID &) nogil) on_worker_shutdown
        (CRayStatus() nogil) check_signals
        (void() nogil) gc_collect
-        (c_vector[c_string](const c_vector[CObjectID]&) nogil) spill_objects
-        (void(const c_vector[c_string]&) nogil) restore_spilled_objects
+        (c_vector[c_string](const c_vector[CObjectID] &) nogil) spill_objects
+        (void(
+            const c_vector[CObjectID] &,
+            const c_vector[c_string] &) nogil) restore_spilled_objects
        (void(c_string *stack_out) nogil) get_lang_stack
        c_bool ref_counting_enabled
        c_bool is_local_mode
@@ -52,6 +52,9 @@ cdef class ObjectRef(BaseID):
    def task_id(self):
        return TaskID(self.data.TaskId().Binary())

+    def job_id(self):
+        return self.task_id().job_id()
+
    cdef size_t hash(self):
        return self.data.Hash()

@@ -124,6 +124,8 @@ cdef extern from "ray/common/id.h" namespace "ray" nogil:

        CActorID ActorId() const

+        CJobID JobId() const
+
    cdef cppclass CObjectID" ray::ObjectID"(CBaseID[CObjectID]):

        @staticmethod
@@ -152,6 +152,9 @@ cdef class TaskID(BaseID):
    def actor_id(self):
        return ActorID(self.data.ActorId().Binary())

+    def job_id(self):
+        return JobID(self.data.JobId().Binary())
+
    cdef size_t hash(self):
        return self.data.Hash()

@@ -207,3 +207,6 @@ MACH_PAGE_SIZE_BYTES = 4096
 # Max 64 bit integer value, which is needed to ensure against overflow
 # in C++ when passing integer values cross-language.
 MAX_INT64_VALUE = 9223372036854775807
+
+# Object Spilling related constants
+DEFAULT_OBJECT_PREFIX = "ray_spilled_object"
@@ -9,12 +9,15 @@ import numpy as np
 import pytest
 import psutil
 import ray
+from ray.external_storage import (create_url_with_offset,
+                                  parse_url_with_offset)

 bucket_name = "object-spilling-test"
+spill_local_path = "/tmp/spill"
 file_system_object_spilling_config = {
    "type": "filesystem",
    "params": {
-        "directory_path": "/tmp"
+        "directory_path": spill_local_path
    }
 }
 smart_open_object_spilling_config = {
@@ -108,6 +111,17 @@ def test_invalid_config_raises_exception(shutdown_only):
        })


+def test_url_generation_and_parse():
+    url = "s3://abc/def/ray_good"
+    offset = 10
+    size = 30
+    url_with_offset = create_url_with_offset(url=url, offset=offset, size=size)
+    parsed_result = parse_url_with_offset(url_with_offset)
+    assert parsed_result.base_url == url
+    assert parsed_result.offset == offset
+    assert parsed_result.size == size
+
+
@pytest.mark.skipif(
    platform.system() == "Windows", reason="Failing on Windows.")
 def test_spill_objects_manually(object_spilling_config, shutdown_only):
@@ -119,6 +133,7 @@ def test_spill_objects_manually(object_spilling_config, shutdown_only):
            "automatic_object_spilling_enabled": False,
            "max_io_workers": 4,
            "object_spilling_config": object_spilling_config,
+            "min_spilling_size": 0,
        })
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []
@@ -179,6 +194,7 @@ def test_spill_objects_manually_from_workers(object_spilling_config,
            "automatic_object_spilling_enabled": False,
            "max_io_workers": 4,
            "object_spilling_config": object_spilling_config,
+            "min_spilling_size": 0,
        })

    @ray.remote
@@ -209,6 +225,7 @@ def test_spill_objects_manually_with_workers(object_spilling_config,
            "automatic_object_spilling_enabled": False,
            "max_io_workers": 4,
            "object_spilling_config": object_spilling_config,
+            "min_spilling_size": 0,
        })
    arrays = [np.random.rand(100 * 1024) for _ in range(50)]
    objects = [ray.put(arr) for arr in arrays]
@@ -240,6 +257,7 @@ def test_spill_objects_manually_with_workers(object_spilling_config,
                    "directory_path": "/tmp"
                }
            }),
+            "min_spilling_size": 0,
        },
    }],
    indirect=True)
@@ -278,6 +296,7 @@ def test_spill_remote_object(ray_start_cluster_head):
 def test_spill_objects_automatically(object_spilling_config, shutdown_only):
    # Limit our object store to 75 MiB of memory.
    ray.init(
+        num_cpus=1,
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 4,
@@ -285,27 +304,30 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only):
            "object_store_full_max_retries": 4,
            "object_store_full_initial_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
+            "min_spilling_size": 0
        })
-    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []
-
-    # Wait raylet for starting an IO worker.
-    time.sleep(1)
+    solution_buffer = []
+    buffer_length = 100

    # Create objects of more than 800 MiB.
-    for _ in range(100):
+    for _ in range(buffer_length):
        ref = None
        while ref is None:
+            multiplier = random.choice([1, 2, 3])
+            arr = np.random.rand(multiplier * 1024 * 1024)
            ref = ray.put(arr)
            replay_buffer.append(ref)
+            solution_buffer.append(arr)

    print("-----------------------------------")
-
    # randomly sample objects
    for _ in range(1000):
-        ref = random.choice(replay_buffer)
+        index = random.choice(list(range(buffer_length)))
+        ref = replay_buffer[index]
+        solution = solution_buffer[index]
        sample = ray.get(ref, timeout=0)
-        assert np.array_equal(sample, arr)
+        assert np.array_equal(sample, solution)


@pytest.mark.skipif(
@@ -321,6 +343,7 @@ def test_spill_during_get(object_spilling_config, shutdown_only):
            "automatic_object_spilling_enabled": True,
            "max_io_workers": 2,
            "object_spilling_config": object_spilling_config,
+            "min_spilling_size": 0,
        },
    )

@@ -352,13 +375,11 @@ def test_spill_deadlock(object_spilling_config, shutdown_only):
            "object_store_full_max_retries": 4,
            "object_store_full_initial_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
+            "min_spilling_size": 0,
        })
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []

-    # Wait raylet for starting an IO worker.
-    time.sleep(1)
-
    # Create objects of more than 400 MiB.
    for _ in range(50):
        ref = None
@@ -373,5 +394,61 @@ def test_spill_deadlock(object_spilling_config, shutdown_only):
                assert np.array_equal(sample, arr)


+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="Failing on Windows.")
+def test_fusion_objects(tmp_path, shutdown_only):
+    # Limit our object store to 75 MiB of memory.
+    temp_folder = tmp_path / "spill"
+    temp_folder.mkdir()
+    min_spilling_size = 30 * 1024 * 1024
+    ray.init(
+        object_store_memory=75 * 1024 * 1024,
+        _system_config={
+            "max_io_workers": 4,
+            "automatic_object_spilling_enabled": True,
+            "object_store_full_max_retries": 4,
+            "object_store_full_initial_delay_ms": 100,
+            "object_spilling_config": json.dumps({
+                "type": "filesystem",
+                "params": {
+                    "directory_path": str(temp_folder)
+                }
+            }),
+            "min_spilling_size": min_spilling_size,
+        })
+    replay_buffer = []
+    solution_buffer = []
+    buffer_length = 100
+
+    # Create objects of more than 800 MiB.
+    for _ in range(buffer_length):
+        ref = None
+        while ref is None:
+            multiplier = random.choice([1, 2, 3])
+            arr = np.random.rand(multiplier * 1024 * 1024)
+            ref = ray.put(arr)
+            replay_buffer.append(ref)
+            solution_buffer.append(arr)
+
+    print("-----------------------------------")
+    # randomly sample objects
+    for _ in range(1000):
+        index = random.choice(list(range(buffer_length)))
+        ref = replay_buffer[index]
+        solution = solution_buffer[index]
+        sample = ray.get(ref, timeout=0)
+        assert np.array_equal(sample, solution)
+
+    is_test_passing = False
+    for path in temp_folder.iterdir():
+        file_size = path.stat().st_size
+        # Make sure there are at least one
+        # file_size that exceeds the min_spilling_size.
+        # If we don't fusion correctly, this cannot happen.
+        if file_size >= min_spilling_size:
+            is_test_passing = True
+    assert is_test_passing
+
+
 if __name__ == "__main__":
    sys.exit(pytest.main(["-sv", __file__]))