mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 10:01:11 +08:00
[Core] Object spilling prototype (#9818)
This commit is contained in:
committed by
GitHub
parent
36e626e95d
commit
17ca1d8ff4
+5
-21
@@ -78,27 +78,11 @@ from ray.profiling import profile # noqa: E402
|
||||
from ray.state import (jobs, nodes, actors, objects, timeline,
|
||||
object_transfer_timeline, cluster_resources,
|
||||
available_resources) # noqa: E402
|
||||
from ray.worker import (
|
||||
LOCAL_MODE,
|
||||
SCRIPT_MODE,
|
||||
WORKER_MODE,
|
||||
cancel,
|
||||
connect,
|
||||
disconnect,
|
||||
get,
|
||||
get_actor,
|
||||
get_gpu_ids,
|
||||
get_resource_ids,
|
||||
get_webui_url,
|
||||
init,
|
||||
is_initialized,
|
||||
put,
|
||||
kill,
|
||||
register_custom_serializer,
|
||||
remote,
|
||||
shutdown,
|
||||
show_in_webui,
|
||||
wait,
|
||||
from ray.worker import ( # noqa: F401
|
||||
LOCAL_MODE, SCRIPT_MODE, WORKER_MODE, IO_WORKER_MODE, cancel, connect,
|
||||
disconnect, get, get_actor, get_gpu_ids, get_resource_ids, get_webui_url,
|
||||
init, is_initialized, put, kill, register_custom_serializer, remote,
|
||||
shutdown, show_in_webui, wait,
|
||||
) # noqa: E402
|
||||
import ray.internal # noqa: E402
|
||||
import ray.projects # noqa: E402
|
||||
|
||||
+118
-7
@@ -66,6 +66,7 @@ from ray.includes.common cimport (
|
||||
TASK_TYPE_ACTOR_TASK,
|
||||
WORKER_TYPE_WORKER,
|
||||
WORKER_TYPE_DRIVER,
|
||||
WORKER_TYPE_IO_WORKER,
|
||||
PLACEMENT_STRATEGY_PACK,
|
||||
PLACEMENT_STRATEGY_SPREAD,
|
||||
)
|
||||
@@ -90,6 +91,7 @@ from ray.includes.ray_config cimport RayConfig
|
||||
from ray.includes.global_state_accessor cimport CGlobalStateAccessor
|
||||
|
||||
import ray
|
||||
from ray import external_storage
|
||||
from ray.async_compat import (
|
||||
sync_to_async, get_new_event_loop)
|
||||
import ray.memory_monitor as memory_monitor
|
||||
@@ -590,6 +592,49 @@ cdef void gc_collect() nogil:
|
||||
num_freed, end - start))
|
||||
|
||||
|
||||
# Callback installed on the core worker options as `spill_objects`.
# Converts the C++ object IDs into Python object refs, asks the Python-side
# `external_storage` module to spill them, and hands the resulting URLs back
# to C++. Any Python exception is logged and pushed to the driver instead of
# propagating; in that case the URLs collected so far (possibly none) are
# returned.
cdef c_vector[c_string] spill_objects_handler(
        const c_vector[CObjectID]& object_ids_to_spill) nogil:
    cdef c_vector[c_string] return_urls
    with gil:
        object_refs = VectorToObjectRefs(object_ids_to_spill)
        try:
            for spilled_url in external_storage.spill_objects(object_refs):
                return_urls.push_back(spilled_url)
        except Exception:
            error_message = (
                "An unexpected internal error occurred while the IO worker "
                "was spilling objects.")
            logger.exception(error_message)
            ray.utils.push_error_to_driver(
                ray.worker.global_worker,
                "io_worker_spill_objects_error",
                traceback.format_exc() + error_message,
                job_id=None)
        return return_urls
|
||||
|
||||
|
||||
# Callback installed on the core worker options as `restore_spilled_objects`.
# Copies the C++ URL vector into a Python list and asks the Python-side
# `external_storage` module to restore the corresponding objects. Any Python
# exception is logged and pushed to the driver instead of propagating.
cdef void restore_spilled_objects_handler(
        const c_vector[c_string]& object_urls) nogil:
    with gil:
        urls = []
        size = object_urls.size()
        for i in range(size):
            urls.append(object_urls[i])
        try:
            external_storage.restore_spilled_objects(urls)
        except Exception:
            exception_str = (
                "An unexpected internal error occurred while the IO worker "
                "was restoring spilled objects.")
            logger.exception(exception_str)
            ray.utils.push_error_to_driver(
                ray.worker.global_worker,
                # Error type; previously misspelled as "..._retore_...".
                "io_worker_restore_spilled_objects_error",
                traceback.format_exc() + exception_str,
                job_id=None)
|
||||
|
||||
|
||||
# This function introduces ~2-7us of overhead per call (i.e., it can be called
|
||||
# up to hundreds of thousands of times per second).
|
||||
cdef void get_py_stack(c_string* stack_out) nogil:
|
||||
@@ -650,17 +695,25 @@ cdef void terminate_asyncio_thread() nogil:
|
||||
|
||||
cdef class CoreWorker:
|
||||
|
||||
def __cinit__(self, is_driver, store_socket, raylet_socket,
|
||||
def __cinit__(self, worker_type, store_socket, raylet_socket,
|
||||
JobID job_id, GcsClientOptions gcs_options, log_dir,
|
||||
node_ip_address, node_manager_port, raylet_ip_address,
|
||||
local_mode, driver_name, stdout_file, stderr_file,
|
||||
serialized_job_config, metrics_agent_port):
|
||||
self.is_driver = is_driver
|
||||
self.is_local_mode = local_mode
|
||||
|
||||
cdef CCoreWorkerOptions options = CCoreWorkerOptions()
|
||||
options.worker_type = (
|
||||
WORKER_TYPE_DRIVER if is_driver else WORKER_TYPE_WORKER)
|
||||
if worker_type in (ray.LOCAL_MODE, ray.SCRIPT_MODE):
|
||||
self.is_driver = True
|
||||
options.worker_type = WORKER_TYPE_DRIVER
|
||||
elif worker_type == ray.WORKER_MODE:
|
||||
self.is_driver = False
|
||||
options.worker_type = WORKER_TYPE_WORKER
|
||||
elif worker_type == ray.IO_WORKER_MODE:
|
||||
self.is_driver = False
|
||||
options.worker_type = WORKER_TYPE_IO_WORKER
|
||||
else:
|
||||
raise ValueError(f"Unknown worker type: {worker_type}")
|
||||
options.language = LANGUAGE_PYTHON
|
||||
options.store_socket = store_socket.encode("ascii")
|
||||
options.raylet_socket = raylet_socket.encode("ascii")
|
||||
@@ -678,6 +731,8 @@ cdef class CoreWorker:
|
||||
options.task_execution_callback = task_execution_handler
|
||||
options.check_signals = check_signals
|
||||
options.gc_collect = gc_collect
|
||||
options.spill_objects = spill_objects_handler
|
||||
options.restore_spilled_objects = restore_spilled_objects_handler
|
||||
options.get_lang_stack = get_py_stack
|
||||
options.ref_counting_enabled = True
|
||||
options.is_local_mode = local_mode
|
||||
@@ -725,15 +780,15 @@ cdef class CoreWorker:
|
||||
return self.plasma_event_handler
|
||||
|
||||
def get_objects(self, object_refs, TaskID current_task_id,
|
||||
int64_t timeout_ms=-1):
|
||||
int64_t timeout_ms=-1, plasma_objects_only=False):
|
||||
cdef:
|
||||
c_vector[shared_ptr[CRayObject]] results
|
||||
CTaskID c_task_id = current_task_id.native()
|
||||
c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs)
|
||||
|
||||
c_bool _plasma_objects_only = plasma_objects_only
|
||||
with nogil:
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().Get(
|
||||
c_object_ids, timeout_ms, &results))
|
||||
c_object_ids, timeout_ms, &results, _plasma_objects_only))
|
||||
|
||||
return RayObjectsToDataMetadataPairs(results)
|
||||
|
||||
@@ -771,6 +826,48 @@ cdef class CoreWorker:
|
||||
# and deal with it here.
|
||||
return data.get() == NULL
|
||||
|
||||
def put_file_like_object(
        self, metadata, data_size, file_like, ObjectRef object_ref=None):
    """Directly create a new Plasma Store object from a file like
    object. This avoids extra memory copy.

    Args:
        metadata (bytes): The metadata of the object.
        data_size (int): The size of the data buffer.
        file_like: A python file object that provides the `readinto`
            interface.
        object_ref: The new ObjectRef.

    Raises:
        EOFError: If `file_like` stops yielding data before `data_size`
            bytes have been read.
    """
    cdef:
        CObjectID c_object_id
        shared_ptr[CBuffer] data_buf
        shared_ptr[CBuffer] metadata_buf
    # TODO(suquark): This method does not support put objects to
    # in memory store currently.
    metadata_buf = string_to_buffer(metadata)
    object_already_exists = self._create_put_buffer(
        metadata_buf, data_size, object_ref,
        ObjectRefsToVector([]),
        &c_object_id, &data_buf)
    if object_already_exists:
        logger.debug("Object already exists in 'put_file_like_object'.")
        return
    data = Buffer.make(data_buf)
    view = memoryview(data)
    index = 0
    while index < data_size:
        bytes_read = file_like.readinto(view[index:])
        if not bytes_read:
            # `readinto` returns 0 at EOF (or None when no data is
            # available on a non-blocking stream). Without this check a
            # truncated file would make this loop spin forever.
            # NOTE(review): the buffer created above is left unsealed on
            # this path; confirm whether it has to be released explicitly.
            raise EOFError(
                "Unexpected end of file in 'put_file_like_object': "
                f"read {index} of {data_size} bytes.")
        index += bytes_read
    with nogil:
        # Using custom object refs is not supported because we
        # can't track their lifecycle, so we don't pin the object
        # in this case.
        check_status(CCoreWorkerProcess.GetCoreWorker().Seal(
            c_object_id, pin_object=object_ref is None))
|
||||
|
||||
def put_serialized_object(self, serialized_object,
|
||||
ObjectRef object_ref=None,
|
||||
c_bool pin_object=True):
|
||||
@@ -1342,6 +1439,20 @@ cdef class CoreWorker:
|
||||
resource_name.encode("ascii"), capacity,
|
||||
CClientID.FromBinary(client_id.binary()))
|
||||
|
||||
def force_spill_objects(self, object_refs):
    """Ask the core worker to spill the given objects.

    Args:
        object_refs: The object refs to spill; converted to a vector of
            C++ object IDs before the call.
    """
    cdef c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs)
    with nogil:
        check_status(
            CCoreWorkerProcess.GetCoreWorker().ForceSpillObjects(
                c_object_ids))
|
||||
|
||||
def force_restore_spilled_objects(self, object_refs):
    """Ask the core worker to restore the given spilled objects.

    Args:
        object_refs: The object refs to restore; converted to a vector of
            C++ object IDs before the call.
    """
    cdef c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs)
    with nogil:
        check_status(
            CCoreWorkerProcess.GetCoreWorker().ForceRestoreSpilledObjects(
                c_object_ids))
|
||||
|
||||
cdef void async_set_result(shared_ptr[CRayObject] obj,
|
||||
CObjectID object_ref,
|
||||
void *future) with gil:
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
from .api import get, wait
|
||||
from .dynamic_resources import set_resource
|
||||
from .object_spilling import force_spill_objects, force_restore_spilled_objects
|
||||
from .placement_group import (
|
||||
placement_group, )
|
||||
__all__ = [
|
||||
"get",
|
||||
"wait",
|
||||
"set_resource",
|
||||
"force_spill_objects",
|
||||
"force_restore_spilled_objects",
|
||||
"placement_group",
|
||||
]
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
import ray
|
||||
|
||||
|
||||
def force_spill_objects(object_refs):
    """Force spilling objects to external storage.

    Args:
        object_refs: An iterable of object refs of the objects to be
            spilled.

    Raises:
        TypeError: If any element is not a ray.ObjectRef.
    """
    core_worker = ray.worker.global_worker.core_worker
    # Materialize the input once: the validation loop below iterates over it,
    # so a one-shot iterable (e.g. a generator) would otherwise be exhausted
    # before it reaches the core worker.
    object_refs = list(object_refs)
    # Make sure that the values are object refs.
    for object_ref in object_refs:
        if not isinstance(object_ref, ray.ObjectRef):
            raise TypeError(
                f"Attempting to call `force_spill_objects` on the "
                f"value {object_ref}, which is not an ray.ObjectRef.")
    return core_worker.force_spill_objects(object_refs)
|
||||
|
||||
|
||||
def force_restore_spilled_objects(object_refs):
    """Force restoring objects from external storage.

    Args:
        object_refs: An iterable of object refs of the objects to be
            restored.

    Raises:
        TypeError: If any element is not a ray.ObjectRef.
    """
    core_worker = ray.worker.global_worker.core_worker
    # Materialize the input once: the validation loop below iterates over it,
    # so a one-shot iterable (e.g. a generator) would otherwise be exhausted
    # before it reaches the core worker.
    object_refs = list(object_refs)
    # Make sure that the values are object refs.
    for object_ref in object_refs:
        if not isinstance(object_ref, ray.ObjectRef):
            raise TypeError(
                f"Attempting to call `force_restore_spilled_objects` on the "
                f"value {object_ref}, which is not an ray.ObjectRef.")
    return core_worker.force_restore_spilled_objects(object_refs)
|
||||
@@ -0,0 +1,129 @@
|
||||
import abc
|
||||
import os
|
||||
from typing import List
|
||||
import ray
|
||||
|
||||
|
||||
class ExternalStorage(metaclass=abc.ABCMeta):
    """Abstract interface for an object spilling backend.

    Subclasses implement `spill_objects` and `restore_spilled_objects`.
    Two helpers are provided for moving object data between the object
    store and the backend through the global worker's core worker.
    """

    def _get_objects_from_store(self, object_refs):
        """Fetch the given objects through the global worker's core worker.

        The lookup uses a zero timeout and `plasma_objects_only=True`.

        Args:
            object_refs: The refs of the objects to fetch.
        Returns:
            Whatever `core_worker.get_objects` returns for these refs.
        """
        worker = ray.worker.global_worker
        return worker.core_worker.get_objects(
            object_refs,
            worker.current_task_id,
            timeout_ms=0,
            plasma_objects_only=True)

    def _put_object_to_store(self, metadata, data_size, file_like, object_ref):
        """Create an object in the store from a file-like data source.

        Args:
            metadata: The metadata of the object.
            data_size: The number of data bytes to read from `file_like`.
            file_like: The file-like object supplying the data.
            object_ref: The ref under which the object is stored.
        """
        core_worker = ray.worker.global_worker.core_worker
        core_worker.put_file_like_object(metadata, data_size, file_like,
                                         object_ref)

    @abc.abstractmethod
    def spill_objects(self, object_refs):
        """Spill objects to the external storage. Objects are specified
        by their object refs.

        Args:
            object_refs: The list of the refs of the objects to be spilled.
        Returns:
            A list of keys corresponding to the input object refs.
        """

    @abc.abstractmethod
    def restore_spilled_objects(self, keys: List[bytes]):
        """Restore objects from the external storage. Objects are specified
        by the keys returned when they were spilled.

        Args:
            keys: A list of bytes corresponding to the spilled objects.
        """
|
||||
|
||||
|
||||
class NullStorage(ExternalStorage):
    """Placeholder backend used while no external storage is configured.

    Both operations fail with NotImplementedError.
    """

    def spill_objects(self, object_refs):
        """Always raise, since there is nowhere to spill to."""
        raise NotImplementedError("External storage is not initialized")

    def restore_spilled_objects(self, keys):
        """Always raise, since there is nothing to restore from."""
        raise NotImplementedError("External storage is not initialized")
|
||||
|
||||
|
||||
class FileSystemStorage(ExternalStorage):
    """External storage that keeps each spilled object in its own file.

    File layout: an 8-byte little-endian metadata length, an 8-byte
    little-endian data length, the metadata bytes, then the data bytes.
    The file name is `prefix` followed by the hex form of the object ref;
    the encoded file name doubles as the spill key.
    """

    def __init__(self, directory_path):
        """
        Args:
            directory_path: Directory in which spilled files are created.
        """
        self.directory_path = directory_path
        self.prefix = "ray_spilled_object_"

    def spill_objects(self, object_refs):
        """Write each object to a file and return the list of keys."""
        ray_object_pairs = self._get_objects_from_store(object_refs)
        keys = []
        for ref, (buf, metadata) in zip(object_refs, ray_object_pairs):
            filename = self.prefix + ref.hex()
            path = os.path.join(self.directory_path, filename)
            with open(path, "wb") as spill_file:
                metadata_len = len(metadata)
                buf_len = len(buf)
                # Fixed-size header so the reader knows where the metadata
                # ends and how large the data buffer has to be.
                spill_file.write(metadata_len.to_bytes(8, byteorder="little"))
                spill_file.write(buf_len.to_bytes(8, byteorder="little"))
                spill_file.write(metadata)
                spill_file.write(memoryview(buf))
            keys.append(filename.encode())
        return keys

    def restore_spilled_objects(self, keys):
        """Read each spilled file back and put its object into the store."""
        prefix_len = len(self.prefix)
        for key in keys:
            filename = key.decode()
            # The part after the prefix is the hex form of the object ref.
            ref = ray.ObjectRef(bytes.fromhex(filename[prefix_len:]))
            path = os.path.join(self.directory_path, filename)
            with open(path, "rb") as spill_file:
                metadata_len = int.from_bytes(
                    spill_file.read(8), byteorder="little")
                buf_len = int.from_bytes(
                    spill_file.read(8), byteorder="little")
                metadata = spill_file.read(metadata_len)
                # The file position is now at the data; the remaining bytes
                # are read straight into the new object's buffer.
                self._put_object_to_store(metadata, buf_len, spill_file, ref)
|
||||
|
||||
|
||||
_external_storage = NullStorage()
|
||||
|
||||
|
||||
def setup_external_storage(config):
    """Install the module-wide external storage described by `config`.

    Args:
        config: A mapping with a "type" entry and a "params" entry holding
            the keyword arguments for that storage type. A falsy value
            resets the storage to `NullStorage`.

    Raises:
        ValueError: If the storage type is not recognized.
    """
    global _external_storage
    if not config:
        _external_storage = NullStorage()
        return
    storage_type = config["type"]
    if storage_type != "filesystem":
        raise ValueError(f"Unknown external storage type: {storage_type}")
    _external_storage = FileSystemStorage(**config["params"])
|
||||
|
||||
|
||||
def spill_objects(object_refs):
    """Spill objects through the currently configured external storage.

    Args:
        object_refs: The list of the refs of the objects to be spilled.
    Returns:
        A list of keys corresponding to the input object refs.
    """
    storage = _external_storage
    return storage.spill_objects(object_refs)
|
||||
|
||||
|
||||
def restore_spilled_objects(keys: List[bytes]):
    """Restore objects through the currently configured external storage.

    Args:
        keys: A list of bytes corresponding to the spilled objects.
    """
    storage = _external_storage
    storage.restore_spilled_objects(keys)
|
||||
@@ -162,6 +162,7 @@ cdef extern from "src/ray/protobuf/common.pb.h" nogil:
|
||||
cdef extern from "src/ray/protobuf/common.pb.h" nogil:
|
||||
cdef CWorkerType WORKER_TYPE_WORKER "ray::WorkerType::WORKER"
|
||||
cdef CWorkerType WORKER_TYPE_DRIVER "ray::WorkerType::DRIVER"
|
||||
cdef CWorkerType WORKER_TYPE_IO_WORKER "ray::WorkerType::IO_WORKER"
|
||||
|
||||
cdef extern from "src/ray/protobuf/common.pb.h" nogil:
|
||||
cdef CTaskType TASK_TYPE_NORMAL_TASK "ray::TaskType::NORMAL_TASK"
|
||||
|
||||
@@ -165,7 +165,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
shared_ptr[CBuffer] *data)
|
||||
CRayStatus Seal(const CObjectID &object_id, c_bool pin_object)
|
||||
CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms,
|
||||
c_vector[shared_ptr[CRayObject]] *results)
|
||||
c_vector[shared_ptr[CRayObject]] *results,
|
||||
c_bool plasma_objects_only)
|
||||
CRayStatus Contains(const CObjectID &object_id, c_bool *has_object)
|
||||
CRayStatus Wait(const c_vector[CObjectID] &object_ids, int num_objects,
|
||||
int64_t timeout_ms, c_vector[c_bool] *results)
|
||||
@@ -192,6 +193,9 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
CRayStatus SetResource(const c_string &resource_name,
|
||||
const double capacity,
|
||||
const CClientID &client_Id)
|
||||
CRayStatus ForceSpillObjects(const c_vector[CObjectID] &object_ids)
|
||||
CRayStatus ForceRestoreSpilledObjects(
|
||||
const c_vector[CObjectID] &object_ids)
|
||||
|
||||
cdef cppclass CCoreWorkerOptions "ray::CoreWorkerOptions":
|
||||
CWorkerType worker_type
|
||||
@@ -220,6 +224,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
) task_execution_callback
|
||||
(CRayStatus() nogil) check_signals
|
||||
(void() nogil) gc_collect
|
||||
(c_vector[c_string](const c_vector[CObjectID]&) nogil) spill_objects
|
||||
(void(const c_vector[c_string]&) nogil) restore_spilled_objects
|
||||
(void(c_string *stack_out) nogil) get_lang_stack
|
||||
c_bool ref_counting_enabled
|
||||
c_bool is_local_mode
|
||||
|
||||
+2
-1
@@ -718,7 +718,8 @@ class Node:
|
||||
socket_to_use=self.socket,
|
||||
head_node=self.head,
|
||||
start_initial_python_workers_for_first_job=self._ray_params.
|
||||
start_initial_python_workers_for_first_job)
|
||||
start_initial_python_workers_for_first_job,
|
||||
object_spilling_config=self._ray_params.object_spilling_config)
|
||||
assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes
|
||||
self.all_processes[ray_constants.PROCESS_TYPE_RAYLET] = [process_info]
|
||||
|
||||
|
||||
@@ -145,7 +145,8 @@ class RayParams:
|
||||
enable_object_reconstruction=False,
|
||||
metrics_agent_port=None,
|
||||
metrics_export_port=None,
|
||||
lru_evict=False):
|
||||
lru_evict=False,
|
||||
object_spilling_config=None):
|
||||
self.object_ref_seed = object_ref_seed
|
||||
self.redis_address = redis_address
|
||||
self.num_cpus = num_cpus
|
||||
@@ -190,6 +191,7 @@ class RayParams:
|
||||
self._internal_config = _internal_config
|
||||
self._lru_evict = lru_evict
|
||||
self._enable_object_reconstruction = enable_object_reconstruction
|
||||
self.object_spilling_config = object_spilling_config
|
||||
self._check_usage()
|
||||
|
||||
# Set the internal config options for LRU eviction.
|
||||
|
||||
@@ -1289,7 +1289,8 @@ def start_raylet(redis_address,
|
||||
fate_share=None,
|
||||
socket_to_use=None,
|
||||
head_node=False,
|
||||
start_initial_python_workers_for_first_job=False):
|
||||
start_initial_python_workers_for_first_job=False,
|
||||
object_spilling_config=None):
|
||||
"""Start a raylet, which is a combined local scheduler and object manager.
|
||||
|
||||
Args:
|
||||
@@ -1398,6 +1399,10 @@ def start_raylet(redis_address,
|
||||
if load_code_from_local:
|
||||
start_worker_command += ["--load-code-from-local"]
|
||||
|
||||
if object_spilling_config:
|
||||
start_worker_command.append(
|
||||
f"--object-spilling-config={json.dumps(object_spilling_config)}")
|
||||
|
||||
command = [
|
||||
RAYLET_EXECUTABLE,
|
||||
"--raylet_socket_name={}".format(raylet_name),
|
||||
|
||||
@@ -48,6 +48,7 @@ py_test_module_list(
|
||||
"test_stress_sharded.py",
|
||||
"test_unreconstructable_errors.py",
|
||||
"test_tensorflow.py",
|
||||
"test_object_spilling.py",
|
||||
],
|
||||
size = "medium",
|
||||
extra_srcs = SRCS,
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import ray
|
||||
|
||||
|
||||
def test_spill_objects_manually(shutdown_only):
    """Fill the store from the driver, spilling by hand whenever it is full,
    then check that every object can still be read back."""
    spilling_config = {
        "type": "filesystem",
        "params": {
            "directory_path": "/tmp"
        }
    }
    internal_config = json.dumps({
        "object_store_full_max_retries": 0,
        "max_io_workers": 4,
    })
    # Limit our object store to 75 MiB of memory.
    ray.init(
        object_store_memory=75 * 1024 * 1024,
        object_spilling_config=spilling_config,
        _internal_config=internal_config)
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []
    pinned_objects = set()
    spilled_objects = set()

    # Create objects of more than 200 MiB.
    for _ in range(25):
        while True:
            try:
                ref = ray.put(arr)
            except ray.exceptions.ObjectStoreFullError:
                # Make room by spilling one of the pinned objects, then
                # retry the put.
                ref_to_spill = pinned_objects.pop()
                ray.experimental.force_spill_objects([ref_to_spill])
                spilled_objects.add(ref_to_spill)
            else:
                replay_buffer.append(ref)
                pinned_objects.add(ref)
                break

    # Spill 2 more objects so we will always have enough space for
    # restoring objects back.
    refs_to_spill = (pinned_objects.pop(), pinned_objects.pop())
    ray.experimental.force_spill_objects(refs_to_spill)
    spilled_objects.update(refs_to_spill)

    # randomly sample objects
    for _ in range(100):
        ref = random.choice(replay_buffer)
        if ref in spilled_objects:
            ray.experimental.force_restore_spilled_objects([ref])
        sample = ray.get(ref)
        assert np.array_equal(sample, arr)
|
||||
|
||||
|
||||
def test_spill_objects_manually_from_workers(shutdown_only):
    """Spill and restore an object from inside remote tasks."""
    spilling_config = {
        "type": "filesystem",
        "params": {
            "directory_path": "/tmp"
        }
    }
    internal_config = json.dumps({
        "object_store_full_max_retries": 0,
        "max_io_workers": 4,
    })
    # Limit our object store to 100 MiB of memory.
    ray.init(
        object_store_memory=100 * 1024 * 1024,
        object_spilling_config=spilling_config,
        _internal_config=internal_config)

    @ray.remote
    def _worker():
        # Each task round-trips its own object through external storage.
        arr = np.random.rand(100 * 1024)
        ref = ray.put(arr)
        ray.experimental.force_spill_objects([ref])
        ray.experimental.force_restore_spilled_objects([ref])
        assert np.array_equal(ray.get(ref), arr)

    pending = [_worker.remote() for _ in range(50)]
    ray.get(pending)
|
||||
|
||||
|
||||
def test_spill_objects_manually_with_workers(shutdown_only):
    """Put objects from the driver, spill them from remote tasks, and check
    that the driver can still read them."""
    spilling_config = {
        "type": "filesystem",
        "params": {
            "directory_path": "/tmp"
        }
    }
    internal_config = json.dumps({
        "object_store_full_max_retries": 0,
        "max_io_workers": 4,
    })
    # Limit our object store to 100 MiB of memory.
    ray.init(
        object_store_memory=100 * 1024 * 1024,
        object_spilling_config=spilling_config,
        _internal_config=internal_config)
    arrays = [np.random.rand(100 * 1024) for _ in range(50)]
    objects = [ray.put(arr) for arr in arrays]

    @ray.remote
    def _worker(object_refs):
        ray.experimental.force_spill_objects(object_refs)

    # One task per object; each task spills the single ref it was given.
    pending = [_worker.remote([o]) for o in objects]
    ray.get(pending)

    for restored, arr in zip(ray.get(objects), arrays):
        assert np.array_equal(restored, arr)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="have not been fully implemented")
def test_spill_objects_automatically(shutdown_only):
    """Put far more data than the store can hold with automatic spilling
    enabled, then read random objects back."""
    internal_config = json.dumps({
        "max_io_workers": 4,
        "object_store_full_max_retries": 2,
        "object_store_full_initial_delay_ms": 10,
        "auto_object_spilling": True,
    })
    # Limit our object store to 75 MiB of memory.
    ray.init(
        object_store_memory=75 * 1024 * 1024,
        _internal_config=internal_config)
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []

    # Give the raylet time to start an IO worker.
    time.sleep(1)

    # Create objects of more than 800 MiB.
    for _ in range(100):
        ref = None
        while ref is None:
            ref = ray.put(arr)
            replay_buffer.append(ref)

    print("-----------------------------------")

    # randomly sample objects
    for _ in range(1000):
        ref = random.choice(replay_buffer)
        sample = ray.get(ref, timeout=0)
        assert np.array_equal(sample, arr)
|
||||
+10
-6
@@ -53,6 +53,7 @@ from ray.utils import (_random_string, check_oversized_pickle, is_cython,
|
||||
SCRIPT_MODE = 0
|
||||
WORKER_MODE = 1
|
||||
LOCAL_MODE = 2
|
||||
IO_WORKER_MODE = 3
|
||||
|
||||
ERROR_KEY_PREFIX = b"Error:"
|
||||
|
||||
@@ -513,7 +514,8 @@ def init(address=None,
|
||||
_internal_config=None,
|
||||
lru_evict=False,
|
||||
enable_object_reconstruction=False,
|
||||
_metrics_export_port=None):
|
||||
_metrics_export_port=None,
|
||||
object_spilling_config=None):
|
||||
"""
|
||||
Connect to an existing Ray cluster or start one and connect to it.
|
||||
|
||||
@@ -642,6 +644,8 @@ def init(address=None,
|
||||
_metrics_export_port(int): Port number Ray exposes system metrics
|
||||
through a Prometheus endpoint. It is currently under active
|
||||
development, and the API is subject to change.
|
||||
object_spilling_config (dict): The configuration for the object
|
||||
spilling I/O workers. It is serialized to JSON when it is passed to them.
|
||||
|
||||
Returns:
|
||||
Address information about the started processes.
|
||||
@@ -737,7 +741,8 @@ def init(address=None,
|
||||
_internal_config=_internal_config,
|
||||
lru_evict=lru_evict,
|
||||
enable_object_reconstruction=enable_object_reconstruction,
|
||||
metrics_export_port=_metrics_export_port)
|
||||
metrics_export_port=_metrics_export_port,
|
||||
object_spilling_config=object_spilling_config)
|
||||
# Start the Ray processes. We set shutdown_at_exit=False because we
|
||||
# shutdown the node in the ray.shutdown call that happens in the atexit
|
||||
# handler. We still spawn a reaper process in case the atexit handler
|
||||
@@ -1206,7 +1211,7 @@ def connect(node,
|
||||
worker.redis_client = node.create_redis_client()
|
||||
|
||||
# Initialize some fields.
|
||||
if mode is WORKER_MODE:
|
||||
if mode in (WORKER_MODE, IO_WORKER_MODE):
|
||||
# We should not specify the job_id if it's `WORKER_MODE` or `IO_WORKER_MODE`.
|
||||
assert job_id is None
|
||||
job_id = JobID.nil()
|
||||
@@ -1260,7 +1265,7 @@ def connect(node,
|
||||
import __main__ as main
|
||||
driver_name = (main.__file__
|
||||
if hasattr(main, "__file__") else "INTERACTIVE MODE")
|
||||
elif mode == WORKER_MODE:
|
||||
elif mode == WORKER_MODE or mode == IO_WORKER_MODE:
|
||||
# Check the RedirectOutput key in Redis and based on its value redirect
|
||||
# worker output and error to their own files.
|
||||
# This key is set in services.py when Redis is started.
|
||||
@@ -1295,8 +1300,7 @@ def connect(node,
|
||||
job_config = ray.job_config.JobConfig()
|
||||
serialized_job_config = job_config.serialize()
|
||||
worker.core_worker = ray._raylet.CoreWorker(
|
||||
(mode == SCRIPT_MODE or mode == LOCAL_MODE),
|
||||
node.plasma_store_socket_name, node.raylet_socket_name, job_id,
|
||||
mode, node.plasma_store_socket_name, node.raylet_socket_name, job_id,
|
||||
gcs_options, node.get_logs_dir_path(), node.node_ip_address,
|
||||
node.node_manager_port, node.raylet_ip_address, (mode == LOCAL_MODE),
|
||||
driver_name, log_stdout_file_path, log_stderr_file_path,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
|
||||
import ray
|
||||
import ray.actor
|
||||
@@ -80,17 +81,47 @@ parser.add_argument(
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="True if cloudpickle should be used for serialization.")
|
||||
parser.add_argument(
|
||||
"--worker-type",
|
||||
required=False,
|
||||
type=str,
|
||||
default="WORKER",
|
||||
help="Specify the type of the worker process")
|
||||
parser.add_argument(
|
||||
"--metrics-agent-port",
|
||||
required=True,
|
||||
type=int,
|
||||
help="the port of the node's metric agent.")
|
||||
parser.add_argument(
|
||||
"--object-spilling-config",
|
||||
required=False,
|
||||
type=str,
|
||||
default="",
|
||||
help="The configuration of object spilling. Only used by I/O workers.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
ray.utils.setup_logger(args.logging_level, args.logging_format)
|
||||
|
||||
if args.worker_type == "WORKER":
|
||||
mode = ray.WORKER_MODE
|
||||
elif args.worker_type == "IO_WORKER":
|
||||
mode = ray.IO_WORKER_MODE
|
||||
else:
|
||||
raise ValueError("Unknown worker type: " + args.worker_type)
|
||||
|
||||
# NOTE(suquark): We must initialize the external storage before we
|
||||
# connect to raylet. Otherwise we may receive requests before the
|
||||
# external storage is initialized.
|
||||
if mode == ray.IO_WORKER_MODE:
|
||||
from ray import external_storage
|
||||
if args.object_spilling_config:
|
||||
object_spilling_config = json.loads(args.object_spilling_config)
|
||||
else:
|
||||
object_spilling_config = {}
|
||||
external_storage.setup_external_storage(object_spilling_config)
|
||||
|
||||
internal_config = {}
|
||||
if args.config_list is not None:
|
||||
config_list = args.config_list.split(",")
|
||||
@@ -125,5 +156,14 @@ if __name__ == "__main__":
|
||||
spawn_reaper=False,
|
||||
connect_only=True)
|
||||
ray.worker._global_node = node
|
||||
ray.worker.connect(node, mode=ray.WORKER_MODE)
|
||||
ray.worker.global_worker.main_loop()
|
||||
|
||||
ray.worker.connect(node, mode=mode)
|
||||
if mode == ray.WORKER_MODE:
|
||||
ray.worker.global_worker.main_loop()
|
||||
elif mode == ray.IO_WORKER_MODE:
|
||||
# It is handled by another thread in the C++ core worker.
|
||||
# We just need to keep the worker alive.
|
||||
while True:
|
||||
time.sleep(100000)
|
||||
else:
|
||||
raise ValueError(f"Unexcepted worker mode: {mode}")
|
||||
|
||||
Reference in New Issue
Block a user