[Core] Object spilling prototype (#9818)

This commit is contained in:
Siyuan (Ryans) Zhuang
2020-08-14 15:39:10 -07:00
committed by GitHub
parent 36e626e95d
commit 17ca1d8ff4
36 changed files with 1026 additions and 95 deletions
+5 -21
View File
@@ -78,27 +78,11 @@ from ray.profiling import profile # noqa: E402
from ray.state import (jobs, nodes, actors, objects, timeline,
object_transfer_timeline, cluster_resources,
available_resources) # noqa: E402
from ray.worker import (
LOCAL_MODE,
SCRIPT_MODE,
WORKER_MODE,
cancel,
connect,
disconnect,
get,
get_actor,
get_gpu_ids,
get_resource_ids,
get_webui_url,
init,
is_initialized,
put,
kill,
register_custom_serializer,
remote,
shutdown,
show_in_webui,
wait,
from ray.worker import ( # noqa: F401
LOCAL_MODE, SCRIPT_MODE, WORKER_MODE, IO_WORKER_MODE, cancel, connect,
disconnect, get, get_actor, get_gpu_ids, get_resource_ids, get_webui_url,
init, is_initialized, put, kill, register_custom_serializer, remote,
shutdown, show_in_webui, wait,
) # noqa: E402
import ray.internal # noqa: E402
import ray.projects # noqa: E402
+118 -7
View File
@@ -66,6 +66,7 @@ from ray.includes.common cimport (
TASK_TYPE_ACTOR_TASK,
WORKER_TYPE_WORKER,
WORKER_TYPE_DRIVER,
WORKER_TYPE_IO_WORKER,
PLACEMENT_STRATEGY_PACK,
PLACEMENT_STRATEGY_SPREAD,
)
@@ -90,6 +91,7 @@ from ray.includes.ray_config cimport RayConfig
from ray.includes.global_state_accessor cimport CGlobalStateAccessor
import ray
from ray import external_storage
from ray.async_compat import (
sync_to_async, get_new_event_loop)
import ray.memory_monitor as memory_monitor
@@ -590,6 +592,49 @@ cdef void gc_collect() nogil:
num_freed, end - start))
cdef c_vector[c_string] spill_objects_handler(
        const c_vector[CObjectID]& object_ids_to_spill) nogil:
    # Callback invoked without the GIL; it acquires the GIL, hands the
    # objects to `external_storage.spill_objects` and returns the URLs that
    # call produced. If spilling raises, the error is logged and pushed to
    # the driver, and whatever URLs were collected so far are returned.
    cdef c_vector[c_string] return_urls
    with gil:
        refs_to_spill = VectorToObjectRefs(object_ids_to_spill)
        try:
            for spilled_url in external_storage.spill_objects(refs_to_spill):
                return_urls.push_back(spilled_url)
        except Exception:
            error_message = (
                "An unexpected internal error occurred while the IO worker "
                "was spilling objects.")
            logger.exception(error_message)
            ray.utils.push_error_to_driver(
                ray.worker.global_worker,
                "io_worker_spill_objects_error",
                traceback.format_exc() + error_message,
                job_id=None)
    return return_urls
cdef void restore_spilled_objects_handler(
        const c_vector[c_string]& object_urls) nogil:
    # Callback invoked without the GIL; it acquires the GIL, copies the C++
    # URL vector into a Python list and passes that list to
    # `external_storage.restore_spilled_objects`. Failures are logged and
    # pushed to the driver instead of propagating to the caller.
    with gil:
        urls = []
        size = object_urls.size()
        for i in range(size):
            urls.append(object_urls[i])
        try:
            external_storage.restore_spilled_objects(urls)
        except Exception:
            exception_str = (
                "An unexpected internal error occurred while the IO worker "
                "was restoring spilled objects.")
            logger.exception(exception_str)
            # The error type used to be misspelled ("retore"); it now reads
            # "restore" so it can be matched reliably.
            ray.utils.push_error_to_driver(
                ray.worker.global_worker,
                "io_worker_restore_spilled_objects_error",
                traceback.format_exc() + exception_str,
                job_id=None)
# This function introduces ~2-7us of overhead per call (i.e., it can be called
# up to hundreds of thousands of times per second).
cdef void get_py_stack(c_string* stack_out) nogil:
@@ -650,17 +695,25 @@ cdef void terminate_asyncio_thread() nogil:
cdef class CoreWorker:
def __cinit__(self, is_driver, store_socket, raylet_socket,
def __cinit__(self, worker_type, store_socket, raylet_socket,
JobID job_id, GcsClientOptions gcs_options, log_dir,
node_ip_address, node_manager_port, raylet_ip_address,
local_mode, driver_name, stdout_file, stderr_file,
serialized_job_config, metrics_agent_port):
self.is_driver = is_driver
self.is_local_mode = local_mode
cdef CCoreWorkerOptions options = CCoreWorkerOptions()
options.worker_type = (
WORKER_TYPE_DRIVER if is_driver else WORKER_TYPE_WORKER)
if worker_type in (ray.LOCAL_MODE, ray.SCRIPT_MODE):
self.is_driver = True
options.worker_type = WORKER_TYPE_DRIVER
elif worker_type == ray.WORKER_MODE:
self.is_driver = False
options.worker_type = WORKER_TYPE_WORKER
elif worker_type == ray.IO_WORKER_MODE:
self.is_driver = False
options.worker_type = WORKER_TYPE_IO_WORKER
else:
raise ValueError(f"Unknown worker type: {worker_type}")
options.language = LANGUAGE_PYTHON
options.store_socket = store_socket.encode("ascii")
options.raylet_socket = raylet_socket.encode("ascii")
@@ -678,6 +731,8 @@ cdef class CoreWorker:
options.task_execution_callback = task_execution_handler
options.check_signals = check_signals
options.gc_collect = gc_collect
options.spill_objects = spill_objects_handler
options.restore_spilled_objects = restore_spilled_objects_handler
options.get_lang_stack = get_py_stack
options.ref_counting_enabled = True
options.is_local_mode = local_mode
@@ -725,15 +780,15 @@ cdef class CoreWorker:
return self.plasma_event_handler
def get_objects(self, object_refs, TaskID current_task_id,
int64_t timeout_ms=-1):
int64_t timeout_ms=-1, plasma_objects_only=False):
cdef:
c_vector[shared_ptr[CRayObject]] results
CTaskID c_task_id = current_task_id.native()
c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs)
c_bool _plasma_objects_only = plasma_objects_only
with nogil:
check_status(CCoreWorkerProcess.GetCoreWorker().Get(
c_object_ids, timeout_ms, &results))
c_object_ids, timeout_ms, &results, _plasma_objects_only))
return RayObjectsToDataMetadataPairs(results)
@@ -771,6 +826,48 @@ cdef class CoreWorker:
# and deal with it here.
return data.get() == NULL
def put_file_like_object(
        self, metadata, data_size, file_like, ObjectRef object_ref=None):
    """Directly create a new Plasma Store object from a file like
    object. This avoids extra memory copy.

    Args:
        metadata (bytes): The metadata of the object.
        data_size (int): The size of the data buffer.
        file_like: A python file object that provides the `readinto`
            interface.
        object_ref: The new ObjectRef.

    Raises:
        IOError: If `file_like` stops producing data before `data_size`
            bytes have been read.
    """
    cdef:
        CObjectID c_object_id
        shared_ptr[CBuffer] data_buf
        shared_ptr[CBuffer] metadata_buf

    # TODO(suquark): This method does not support put objects to
    # in memory store currently.
    metadata_buf = string_to_buffer(metadata)
    object_already_exists = self._create_put_buffer(
        metadata_buf, data_size, object_ref,
        ObjectRefsToVector([]),
        &c_object_id, &data_buf)
    if object_already_exists:
        logger.debug("Object already exists in 'put_file_like_object'.")
        return
    data = Buffer.make(data_buf)
    view = memoryview(data)
    index = 0
    while index < data_size:
        bytes_read = file_like.readinto(view[index:])
        # `readinto` returns 0 at end of file (and may return None for a
        # non-blocking stream). Without this check a truncated file would
        # make the loop spin forever.
        # NOTE(review): the buffer created above is left unsealed on this
        # path -- confirm whether it must be released explicitly.
        if not bytes_read:
            raise IOError(
                f"Expected {data_size} bytes of object data but the file "
                f"like object stopped after {index} bytes.")
        index += bytes_read
    with nogil:
        # Using custom object refs is not supported because we
        # can't track their lifecycle, so we don't pin the object
        # in this case.
        check_status(CCoreWorkerProcess.GetCoreWorker().Seal(
                     c_object_id, pin_object=object_ref is None))
def put_serialized_object(self, serialized_object,
ObjectRef object_ref=None,
c_bool pin_object=True):
@@ -1342,6 +1439,20 @@ cdef class CoreWorker:
resource_name.encode("ascii"), capacity,
CClientID.FromBinary(client_id.binary()))
def force_spill_objects(self, object_refs):
    """Request that the core worker spill the given objects.

    Args:
        object_refs: The object refs of the objects to spill.
    """
    cdef c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs)
    # The C++ call runs without the GIL; its status goes through
    # `check_status`.
    with nogil:
        check_status(CCoreWorkerProcess.GetCoreWorker()
                     .ForceSpillObjects(c_object_ids))
def force_restore_spilled_objects(self, object_refs):
    """Request that the core worker restore the given spilled objects.

    Args:
        object_refs: The object refs of the objects to restore.
    """
    cdef c_vector[CObjectID] c_object_ids = ObjectRefsToVector(object_refs)
    # The C++ call runs without the GIL; its status goes through
    # `check_status`.
    with nogil:
        check_status(CCoreWorkerProcess.GetCoreWorker()
                     .ForceRestoreSpilledObjects(c_object_ids))
cdef void async_set_result(shared_ptr[CRayObject] obj,
CObjectID object_ref,
void *future) with gil:
+3
View File
@@ -1,10 +1,13 @@
from .api import get, wait
from .dynamic_resources import set_resource
from .object_spilling import force_spill_objects, force_restore_spilled_objects
from .placement_group import (
placement_group, )
__all__ = [
"get",
"wait",
"set_resource",
"force_spill_objects",
"force_restore_spilled_objects",
"placement_group",
]
@@ -0,0 +1,35 @@
import ray
def force_spill_objects(object_refs):
    """Force the given objects to be spilled to external storage.

    Args:
        object_refs: Object refs of the objects to be spilled.

    Raises:
        TypeError: If any element of `object_refs` is not a
            `ray.ObjectRef`.
    """
    core_worker = ray.worker.global_worker.core_worker
    # Every value must be an object ref before the request is forwarded.
    for object_ref in object_refs:
        if isinstance(object_ref, ray.ObjectRef):
            continue
        raise TypeError(
            f"Attempting to call `force_spill_objects` on the "
            f"value {object_ref}, which is not an ray.ObjectRef.")
    return core_worker.force_spill_objects(object_refs)
def force_restore_spilled_objects(object_refs):
    """Force the given objects to be restored from external storage.

    Args:
        object_refs: Object refs of the objects to be restored.

    Raises:
        TypeError: If any element of `object_refs` is not a
            `ray.ObjectRef`.
    """
    core_worker = ray.worker.global_worker.core_worker
    # Every value must be an object ref before the request is forwarded.
    for object_ref in object_refs:
        if isinstance(object_ref, ray.ObjectRef):
            continue
        raise TypeError(
            f"Attempting to call `force_restore_spilled_objects` on the "
            f"value {object_ref}, which is not an ray.ObjectRef.")
    return core_worker.force_restore_spilled_objects(object_refs)
+129
View File
@@ -0,0 +1,129 @@
import abc
import os
from typing import List
import ray
class ExternalStorage(metaclass=abc.ABCMeta):
    """Abstract interface of an external storage used for object spilling.

    Concrete storages implement `spill_objects` and
    `restore_spilled_objects`. Two helpers are provided for moving object
    data between the plasma store and the storage through the global
    worker's core worker; they are intended for zero-copy put/get.
    """

    def _get_objects_from_store(self, object_refs):
        """Return the (data, metadata) pairs of the given object refs.

        The lookup is done with a zero timeout and is limited to plasma
        objects.
        """
        global_worker = ray.worker.global_worker
        return global_worker.core_worker.get_objects(
            object_refs,
            global_worker.current_task_id,
            timeout_ms=0,
            plasma_objects_only=True)

    def _put_object_to_store(self, metadata, data_size, file_like, object_ref):
        """Create the object `object_ref` in the store from `file_like`.

        `data_size` bytes of object data are read from `file_like`.
        """
        core_worker = ray.worker.global_worker.core_worker
        core_worker.put_file_like_object(metadata, data_size, file_like,
                                         object_ref)

    @abc.abstractmethod
    def spill_objects(self, object_refs):
        """Spill objects to the external storage.

        Args:
            object_refs: The list of the refs of the objects to be spilled.

        Returns:
            A list of keys corresponding to the input object refs.
        """

    @abc.abstractmethod
    def restore_spilled_objects(self, keys: List[bytes]):
        """Restore spilled objects from the external storage.

        Args:
            keys: A list of bytes corresponding to the spilled objects.
        """
class NullStorage(ExternalStorage):
    """Placeholder storage used while no external storage is configured.

    Both operations raise `NotImplementedError`.
    """

    # Shared error text for both unsupported operations.
    _NOT_INITIALIZED = "External storage is not initialized"

    def spill_objects(self, object_refs):
        """Always raise: there is no storage to spill to."""
        raise NotImplementedError(self._NOT_INITIALIZED)

    def restore_spilled_objects(self, keys):
        """Always raise: there is no storage to restore from."""
        raise NotImplementedError(self._NOT_INITIALIZED)
class FileSystemStorage(ExternalStorage):
    """External storage that keeps every spilled object in its own file.

    Files live in `directory_path` and are named `<prefix><object ref hex>`.
    Each file holds, in order: the metadata length and the data length
    (8 bytes each, little-endian), the metadata, and the object data.
    """

    def __init__(self, directory_path):
        self.directory_path = directory_path
        self.prefix = "ray_spilled_object_"

    def spill_objects(self, object_refs):
        """Write the given objects to files and return the file names.

        Returns:
            A list with one key (the encoded file name) per object ref.
        """
        stored_pairs = self._get_objects_from_store(object_refs)
        keys = []
        for ref, (buf, metadata) in zip(object_refs, stored_pairs):
            filename = self.prefix + ref.hex()
            path = os.path.join(self.directory_path, filename)
            with open(path, "wb") as f:
                # Header: both lengths first, then the two payloads.
                lengths = (len(metadata), len(buf))
                for length in lengths:
                    f.write(length.to_bytes(8, byteorder="little"))
                f.write(metadata)
                f.write(memoryview(buf))
            keys.append(filename.encode())
        return keys

    def restore_spilled_objects(self, keys):
        """Read the files named by `keys` back into the object store."""
        for key in keys:
            filename = key.decode()
            # The object ref is encoded in the file name after the prefix.
            hex_id = filename[len(self.prefix):]
            ref = ray.ObjectRef(bytes.fromhex(hex_id))
            path = os.path.join(self.directory_path, filename)
            with open(path, "rb") as f:
                metadata_len = int.from_bytes(f.read(8), byteorder="little")
                buf_len = int.from_bytes(f.read(8), byteorder="little")
                metadata = f.read(metadata_len)
                # The rest of the file is the object data; it is read
                # from the open file by the put helper.
                self._put_object_to_store(metadata, buf_len, f, ref)
# Module-level storage used by `spill_objects` and `restore_spilled_objects`;
# it is replaced by `setup_external_storage`.
_external_storage = NullStorage()
def setup_external_storage(config):
    """Select the module-level external storage from `config`.

    Args:
        config: A mapping with a "type" entry and, for the "filesystem"
            type, a "params" entry holding the keyword arguments of
            `FileSystemStorage`. A falsy value selects `NullStorage`.

    Raises:
        ValueError: If the storage type is unknown.
    """
    global _external_storage
    if not config:
        _external_storage = NullStorage()
        return
    storage_type = config["type"]
    if storage_type != "filesystem":
        raise ValueError(f"Unknown external storage type: {storage_type}")
    _external_storage = FileSystemStorage(**config["params"])
def spill_objects(object_refs):
    """Spill objects through the currently configured external storage.

    Args:
        object_refs: The list of the refs of the objects to be spilled.

    Returns:
        A list of keys corresponding to the input object refs.
    """
    storage = _external_storage
    return storage.spill_objects(object_refs)
def restore_spilled_objects(keys: List[bytes]):
    """Restore objects through the currently configured external storage.

    Args:
        keys: A list of bytes corresponding to the spilled objects.
    """
    storage = _external_storage
    storage.restore_spilled_objects(keys)
+1
View File
@@ -162,6 +162,7 @@ cdef extern from "src/ray/protobuf/common.pb.h" nogil:
cdef extern from "src/ray/protobuf/common.pb.h" nogil:
cdef CWorkerType WORKER_TYPE_WORKER "ray::WorkerType::WORKER"
cdef CWorkerType WORKER_TYPE_DRIVER "ray::WorkerType::DRIVER"
cdef CWorkerType WORKER_TYPE_IO_WORKER "ray::WorkerType::IO_WORKER"
cdef extern from "src/ray/protobuf/common.pb.h" nogil:
cdef CTaskType TASK_TYPE_NORMAL_TASK "ray::TaskType::NORMAL_TASK"
+7 -1
View File
@@ -165,7 +165,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
shared_ptr[CBuffer] *data)
CRayStatus Seal(const CObjectID &object_id, c_bool pin_object)
CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms,
c_vector[shared_ptr[CRayObject]] *results)
c_vector[shared_ptr[CRayObject]] *results,
c_bool plasma_objects_only)
CRayStatus Contains(const CObjectID &object_id, c_bool *has_object)
CRayStatus Wait(const c_vector[CObjectID] &object_ids, int num_objects,
int64_t timeout_ms, c_vector[c_bool] *results)
@@ -192,6 +193,9 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
CRayStatus SetResource(const c_string &resource_name,
const double capacity,
const CClientID &client_Id)
CRayStatus ForceSpillObjects(const c_vector[CObjectID] &object_ids)
CRayStatus ForceRestoreSpilledObjects(
const c_vector[CObjectID] &object_ids)
cdef cppclass CCoreWorkerOptions "ray::CoreWorkerOptions":
CWorkerType worker_type
@@ -220,6 +224,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
) task_execution_callback
(CRayStatus() nogil) check_signals
(void() nogil) gc_collect
(c_vector[c_string](const c_vector[CObjectID]&) nogil) spill_objects
(void(const c_vector[c_string]&) nogil) restore_spilled_objects
(void(c_string *stack_out) nogil) get_lang_stack
c_bool ref_counting_enabled
c_bool is_local_mode
+2 -1
View File
@@ -718,7 +718,8 @@ class Node:
socket_to_use=self.socket,
head_node=self.head,
start_initial_python_workers_for_first_job=self._ray_params.
start_initial_python_workers_for_first_job)
start_initial_python_workers_for_first_job,
object_spilling_config=self._ray_params.object_spilling_config)
assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes
self.all_processes[ray_constants.PROCESS_TYPE_RAYLET] = [process_info]
+3 -1
View File
@@ -145,7 +145,8 @@ class RayParams:
enable_object_reconstruction=False,
metrics_agent_port=None,
metrics_export_port=None,
lru_evict=False):
lru_evict=False,
object_spilling_config=None):
self.object_ref_seed = object_ref_seed
self.redis_address = redis_address
self.num_cpus = num_cpus
@@ -190,6 +191,7 @@ class RayParams:
self._internal_config = _internal_config
self._lru_evict = lru_evict
self._enable_object_reconstruction = enable_object_reconstruction
self.object_spilling_config = object_spilling_config
self._check_usage()
# Set the internal config options for LRU eviction.
+6 -1
View File
@@ -1289,7 +1289,8 @@ def start_raylet(redis_address,
fate_share=None,
socket_to_use=None,
head_node=False,
start_initial_python_workers_for_first_job=False):
start_initial_python_workers_for_first_job=False,
object_spilling_config=None):
"""Start a raylet, which is a combined local scheduler and object manager.
Args:
@@ -1398,6 +1399,10 @@ def start_raylet(redis_address,
if load_code_from_local:
start_worker_command += ["--load-code-from-local"]
if object_spilling_config:
start_worker_command.append(
f"--object-spilling-config={json.dumps(object_spilling_config)}")
command = [
RAYLET_EXECUTABLE,
"--raylet_socket_name={}".format(raylet_name),
+1
View File
@@ -48,6 +48,7 @@ py_test_module_list(
"test_stress_sharded.py",
"test_unreconstructable_errors.py",
"test_tensorflow.py",
"test_object_spilling.py",
],
size = "medium",
extra_srcs = SRCS,
+140
View File
@@ -0,0 +1,140 @@
import json
import random
import time
import numpy as np
import pytest
import ray
def test_spill_objects_manually(shutdown_only):
    """Spill by hand whenever the store is full, then restore on demand."""
    spilling_config = {
        "type": "filesystem",
        "params": {
            "directory_path": "/tmp"
        }
    }
    internal_config = json.dumps({
        "object_store_full_max_retries": 0,
        "max_io_workers": 4,
    })
    # Limit our object store to 75 MiB of memory.
    ray.init(
        object_store_memory=75 * 1024 * 1024,
        object_spilling_config=spilling_config,
        _internal_config=internal_config)
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []
    pinned_objects = set()
    spilled_objects = set()

    # 25 puts of 8 MB each do not fit into the store at once, so a pinned
    # object is spilled every time a put reports a full store.
    for _ in range(25):
        ref = None
        while ref is None:
            try:
                ref = ray.put(arr)
            except ray.exceptions.ObjectStoreFullError:
                victim = pinned_objects.pop()
                ray.experimental.force_spill_objects([victim])
                spilled_objects.add(victim)
            else:
                replay_buffer.append(ref)
                pinned_objects.add(ref)

    # Spill 2 more objects so we will always have enough space for
    # restoring objects back.
    extra_spilled = (pinned_objects.pop(), pinned_objects.pop())
    ray.experimental.force_spill_objects(extra_spilled)
    spilled_objects.update(extra_spilled)

    # Randomly sample objects, restoring the spilled ones first.
    for _ in range(100):
        ref = random.choice(replay_buffer)
        if ref in spilled_objects:
            ray.experimental.force_restore_spilled_objects([ref])
        sample = ray.get(ref)
        assert np.array_equal(sample, arr)
def test_spill_objects_manually_from_workers(shutdown_only):
    """Spill and restore an object from inside each of 50 remote tasks."""
    spilling_config = {
        "type": "filesystem",
        "params": {
            "directory_path": "/tmp"
        }
    }
    internal_config = json.dumps({
        "object_store_full_max_retries": 0,
        "max_io_workers": 4,
    })
    # Limit our object store to 100 MiB of memory.
    ray.init(
        object_store_memory=100 * 1024 * 1024,
        object_spilling_config=spilling_config,
        _internal_config=internal_config)

    @ray.remote
    def _worker():
        # Put, spill and restore one array, then check it is unchanged.
        arr = np.random.rand(100 * 1024)
        ref = ray.put(arr)
        ray.experimental.force_spill_objects([ref])
        ray.experimental.force_restore_spilled_objects([ref])
        assert np.array_equal(ray.get(ref), arr)

    pending = [_worker.remote() for _ in range(50)]
    ray.get(pending)
def test_spill_objects_manually_with_workers(shutdown_only):
    """Spill driver-created objects from remote tasks, then read them back."""
    spilling_config = {
        "type": "filesystem",
        "params": {
            "directory_path": "/tmp"
        }
    }
    internal_config = json.dumps({
        "object_store_full_max_retries": 0,
        "max_io_workers": 4,
    })
    # Limit our object store to 100 MiB of memory.
    ray.init(
        object_store_memory=100 * 1024 * 1024,
        object_spilling_config=spilling_config,
        _internal_config=internal_config)
    arrays = [np.random.rand(100 * 1024) for _ in range(50)]
    objects = [ray.put(arr) for arr in arrays]

    @ray.remote
    def _worker(object_refs):
        ray.experimental.force_spill_objects(object_refs)

    # One task per object; each task spills exactly one object.
    pending = [_worker.remote([o]) for o in objects]
    ray.get(pending)

    restored_arrays = ray.get(objects)
    for restored, arr in zip(restored_arrays, arrays):
        assert np.array_equal(restored, arr)
@pytest.mark.skip(reason="have not been fully implemented")
def test_spill_objects_automatically(shutdown_only):
    """Put far more data than the store holds with auto spilling enabled."""
    internal_config = json.dumps({
        "max_io_workers": 4,
        "object_store_full_max_retries": 2,
        "object_store_full_initial_delay_ms": 10,
        "auto_object_spilling": True,
    })
    # Limit our object store to 75 MiB of memory.
    ray.init(
        object_store_memory=75 * 1024 * 1024,
        _internal_config=internal_config)
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []

    # Wait raylet for starting an IO worker.
    time.sleep(1)

    # Create objects of more than 800 MiB.
    for _ in range(100):
        ref = None
        while ref is None:
            ref = ray.put(arr)
            replay_buffer.append(ref)

    print("-----------------------------------")

    # Randomly sample objects; a zero timeout means each one must already
    # be available when it is requested.
    for _ in range(1000):
        ref = random.choice(replay_buffer)
        sample = ray.get(ref, timeout=0)
        assert np.array_equal(sample, arr)
+10 -6
View File
@@ -53,6 +53,7 @@ from ray.utils import (_random_string, check_oversized_pickle, is_cython,
SCRIPT_MODE = 0
WORKER_MODE = 1
LOCAL_MODE = 2
IO_WORKER_MODE = 3
ERROR_KEY_PREFIX = b"Error:"
@@ -513,7 +514,8 @@ def init(address=None,
_internal_config=None,
lru_evict=False,
enable_object_reconstruction=False,
_metrics_export_port=None):
_metrics_export_port=None,
object_spilling_config=None):
"""
Connect to an existing Ray cluster or start one and connect to it.
@@ -642,6 +644,8 @@ def init(address=None,
_metrics_export_port(int): Port number Ray exposes system metrics
through a Prometheus endpoint. It is currently under active
development, and the API is subject to change.
object_spilling_config (dict): The object spilling configuration. It is
serialized to JSON and passed to the object spilling I/O workers.
Returns:
Address information about the started processes.
@@ -737,7 +741,8 @@ def init(address=None,
_internal_config=_internal_config,
lru_evict=lru_evict,
enable_object_reconstruction=enable_object_reconstruction,
metrics_export_port=_metrics_export_port)
metrics_export_port=_metrics_export_port,
object_spilling_config=object_spilling_config)
# Start the Ray processes. We set shutdown_at_exit=False because we
# shutdown the node in the ray.shutdown call that happens in the atexit
# handler. We still spawn a reaper process in case the atexit handler
@@ -1206,7 +1211,7 @@ def connect(node,
worker.redis_client = node.create_redis_client()
# Initialize some fields.
if mode is WORKER_MODE:
if mode in (WORKER_MODE, IO_WORKER_MODE):
# We should not specify the job_id if it's `WORKER_MODE`.
assert job_id is None
job_id = JobID.nil()
@@ -1260,7 +1265,7 @@ def connect(node,
import __main__ as main
driver_name = (main.__file__
if hasattr(main, "__file__") else "INTERACTIVE MODE")
elif mode == WORKER_MODE:
elif mode == WORKER_MODE or mode == IO_WORKER_MODE:
# Check the RedirectOutput key in Redis and based on its value redirect
# worker output and error to their own files.
# This key is set in services.py when Redis is started.
@@ -1295,8 +1300,7 @@ def connect(node,
job_config = ray.job_config.JobConfig()
serialized_job_config = job_config.serialize()
worker.core_worker = ray._raylet.CoreWorker(
(mode == SCRIPT_MODE or mode == LOCAL_MODE),
node.plasma_store_socket_name, node.raylet_socket_name, job_id,
mode, node.plasma_store_socket_name, node.raylet_socket_name, job_id,
gcs_options, node.get_logs_dir_path(), node.node_ip_address,
node.node_manager_port, node.raylet_ip_address, (mode == LOCAL_MODE),
driver_name, log_stdout_file_path, log_stderr_file_path,
+42 -2
View File
@@ -1,5 +1,6 @@
import argparse
import json
import time
import ray
import ray.actor
@@ -80,17 +81,47 @@ parser.add_argument(
default=False,
action="store_true",
help="True if cloudpickle should be used for serialization.")
parser.add_argument(
"--worker-type",
required=False,
type=str,
default="WORKER",
help="Specify the type of the worker process")
parser.add_argument(
"--metrics-agent-port",
required=True,
type=int,
help="the port of the node's metric agent.")
parser.add_argument(
"--object-spilling-config",
required=False,
type=str,
default="",
help="The configuration of object spilling. Only used by I/O workers.")
if __name__ == "__main__":
args = parser.parse_args()
ray.utils.setup_logger(args.logging_level, args.logging_format)
if args.worker_type == "WORKER":
mode = ray.WORKER_MODE
elif args.worker_type == "IO_WORKER":
mode = ray.IO_WORKER_MODE
else:
raise ValueError("Unknown worker type: " + args.worker_type)
# NOTE(suquark): We must initialize the external storage before we
# connect to raylet. Otherwise we may receive requests before the
# external storage is initialized.
if mode == ray.IO_WORKER_MODE:
from ray import external_storage
if args.object_spilling_config:
object_spilling_config = json.loads(args.object_spilling_config)
else:
object_spilling_config = {}
external_storage.setup_external_storage(object_spilling_config)
internal_config = {}
if args.config_list is not None:
config_list = args.config_list.split(",")
@@ -125,5 +156,14 @@ if __name__ == "__main__":
spawn_reaper=False,
connect_only=True)
ray.worker._global_node = node
ray.worker.connect(node, mode=ray.WORKER_MODE)
ray.worker.global_worker.main_loop()
ray.worker.connect(node, mode=mode)
if mode == ray.WORKER_MODE:
ray.worker.global_worker.main_loop()
elif mode == ray.IO_WORKER_MODE:
# It is handled by another thread in the C++ core worker.
# We just need to keep the worker alive.
while True:
time.sleep(100000)
else:
raise ValueError(f"Unexcepted worker mode: {mode}")