[api] API deprecations and cleanups for 1.0 (internal_config and Checkpointable actor) (#10333)

* remove

* internal config updates, remove Checkpointable

* Lower object timeout default

* remove json

* Fix flaky test

* Fix unit test
This commit is contained in:
Stephanie Wang
2020-08-27 10:19:53 -07:00
committed by GitHub
parent 0aec4cbccb
commit f75dfd60a3
56 changed files with 239 additions and 1267 deletions
-146
View File
@@ -2,8 +2,6 @@ import inspect
import logging
import weakref
from abc import ABCMeta, abstractmethod
from collections import namedtuple
import ray.ray_constants as ray_constants
import ray._raylet
import ray.signature as signature
@@ -854,11 +852,6 @@ def modify_class(cls):
"classes. In Python 2, you must declare the class with "
"'class ClassName(object):' instead of 'class ClassName:'.")
if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
raise TypeError(
"A checkpointable actor class should implement all abstract "
"methods in the `Checkpointable` interface.")
# Modify the class to have an additional method that will be used for
# terminating the worker.
class Class(cls):
@@ -869,20 +862,6 @@ def modify_class(cls):
if worker.mode != ray.LOCAL_MODE:
ray.actor.exit_actor()
def __ray_checkpoint__(self):
    """Ask the worker to save a checkpoint of this actor.

    The checkpoint covers the actor's current state, the raylet's view of
    the current task frontier, and the checkpoint index (the number of
    tasks executed so far).

    Raises:
        TypeError: If this actor does not implement
            `ray.actor.Checkpointable`.
    """
    current_worker = ray.worker.global_worker
    if isinstance(self, ray.actor.Checkpointable):
        return current_worker._save_actor_checkpoint()
    raise TypeError(
        "__ray_checkpoint__.remote() may only be called on actors "
        "that implement ray.actor.Checkpointable")
Class.__module__ = cls.__module__
Class.__name__ = cls.__name__
@@ -951,128 +930,3 @@ def exit_actor():
assert False, "This process should have terminated."
else:
raise TypeError("exit_actor called on a non-actor worker.")
CheckpointContext = namedtuple(
    typename="CheckpointContext",
    field_names=(
        "actor_id",  # Actor's ID.
        # How many tasks have run since the last checkpoint.
        "num_tasks_since_last_checkpoint",
        # Milliseconds that have passed since the last checkpoint.
        "time_elapsed_ms_since_last_checkpoint",
    ),
)
"""A namedtuple that contains information about actor's last checkpoint."""

Checkpoint = namedtuple(
    typename="Checkpoint",
    field_names=(
        "checkpoint_id",  # ID of this checkpoint.
        # When the checkpoint was saved, in milliseconds since the Unix
        # epoch.
        "timestamp",
    ),
)
"""A namedtuple that represents a checkpoint."""
class Checkpointable(metaclass=ABCMeta):
    """Interface an actor class implements to opt in to checkpointing.

    All four callbacks are abstract; a concrete actor class must override
    every one of them.
    """

    @abstractmethod
    def should_checkpoint(self, checkpoint_context):
        """Decide whether the actor should be checkpointed now.

        Called after every task. Implementations may base the decision on
        the supplied context or on any other factor.

        Args:
            checkpoint_context: A namedtuple describing the last
                checkpoint.

        Returns:
            True if a checkpoint should be taken now, False otherwise.
        """

    @abstractmethod
    def save_checkpoint(self, actor_id, checkpoint_id):
        """Persist a checkpoint of the actor.

        Called when `should_checkpoint` returns true. Implementations
        should write the actor's checkpoint data, together with the given
        checkpoint id, to persistent storage.

        Args:
            actor_id: Actor's ID.
            checkpoint_id: ID of this checkpoint. Store it alongside the
                checkpoint data; `load_checkpoint` relies on it.

        Returns:
            None.
        """

    @abstractmethod
    def load_checkpoint(self, actor_id, available_checkpoints):
        """Restore the actor's state from an earlier checkpoint.

        Called when an actor is restarted, after the actor's constructor
        has run. To resume from a checkpoint, restore the state and return
        that checkpoint's ID; otherwise do nothing and return None.

        The return value must be None or one of the checkpoint IDs in
        `available_checkpoints`; anything else causes an exception to be
        raised.

        Args:
            actor_id: Actor's ID.
            available_checkpoints: A list of `Checkpoint` namedtuples
                holding every available checkpoint ID and its timestamp,
                sorted by timestamp in descending order.

        Returns:
            The ID of the checkpoint the actor was resumed from, or None
            if the actor should restart from the beginning.
        """

    @abstractmethod
    def checkpoint_expired(self, actor_id, checkpoint_id):
        """Discard the data of a checkpoint that has expired.

        Called when a checkpoint expires. Implementations should delete
        the application-level checkpoint data. The number of checkpoints
        kept in the backend is configured through
        `RayConfig.num_actor_checkpoints_to_keep`.

        Args:
            actor_id: ID of the actor.
            checkpoint_id: ID of the checkpoint that has expired.

        Returns:
            None.
        """
def get_checkpoints_for_actor(actor_id):
    """Return the checkpoints known for ``actor_id``, newest first.

    Args:
        actor_id: ID of the actor whose checkpoints are looked up.

    Returns:
        A list of `Checkpoint` namedtuples sorted by timestamp in
        descending order, or an empty list when no checkpoint info is
        available for the actor.
    """
    info = ray.state.state.actor_checkpoint_info(actor_id)
    if info is None:
        return []
    # The IDs and timestamps arrive as two parallel sequences.
    id_timestamp_pairs = zip(info["CheckpointIds"], info["Timestamps"])
    found = [Checkpoint(cid, saved_at) for cid, saved_at in id_timestamp_pairs]
    found.sort(key=lambda entry: entry.timestamp, reverse=True)
    return found
-4
View File
@@ -1,4 +1,3 @@
import json
import logging
import time
@@ -80,9 +79,6 @@ class Cluster:
"min_worker_port": 0,
"max_worker_port": 0,
}
if "_internal_config" in node_args:
node_args["_internal_config"] = json.loads(
node_args["_internal_config"])
ray_params = ray.parameter.RayParams(**node_args)
ray_params.update_if_absent(**default_kwargs)
if self.head_node is None:
+6 -119
View File
@@ -544,43 +544,13 @@ class FunctionActorManager:
"""
def actor_method_executor(actor, *args, **kwargs):
# Update the actor's task counter to reflect the task we're about
# to execute.
self._worker.actor_task_counter += 1
# Execute the assigned method and save a checkpoint if necessary.
try:
is_bound = (is_class_method(method)
or is_static_method(type(actor), method_name))
if is_bound:
method_returns = method(*args, **kwargs)
else:
method_returns = method(actor, *args, **kwargs)
except Exception as e:
# Save the checkpoint before allowing the method exception
# to be thrown, but don't save the checkpoint for actor
# creation task.
if (isinstance(actor, ray.actor.Checkpointable)
and self._worker.actor_task_counter != 1):
self._save_and_log_checkpoint(actor)
raise e
# Execute the assigned method.
is_bound = (is_class_method(method)
or is_static_method(type(actor), method_name))
if is_bound:
return method(*args, **kwargs)
else:
# Handle any checkpointing operations before storing the
# method's return values.
# NOTE(swang): If method_returns is a pointer to the actor's
# state and the checkpointing operations can modify the return
# values if they mutate the actor's state. Is this okay?
if isinstance(actor, ray.actor.Checkpointable):
# If this is the first task to execute on the actor, try to
# resume from a checkpoint.
if self._worker.actor_task_counter == 1:
if actor_imported:
self._restore_and_log_checkpoint(actor)
else:
# Save the checkpoint before returning the method's
# return values.
self._save_and_log_checkpoint(actor)
return method_returns
return method(actor, *args, **kwargs)
# Set method_name and method as attributes on the executor closure
# so we can make decisions based on these attributes in task executor.
@@ -591,86 +561,3 @@ class FunctionActorManager:
actor_method_executor.method = method
return actor_method_executor
def _save_and_log_checkpoint(self, actor):
    """Checkpoint ``actor`` if it asks for one, reporting any failure.

    Increments the count of tasks run since the last checkpoint and asks
    ``actor.should_checkpoint`` whether to checkpoint now. If so, a
    checkpoint ID is obtained from the core worker, the actor's
    ``save_checkpoint`` is invoked, and the oldest tracked checkpoint is
    expired once more than ``num_actor_checkpoints_to_keep`` are held.
    Exceptions raised while saving are pushed to the driver rather than
    propagated.

    Args:
        actor: The actor to checkpoint.

    Returns:
        None.
    """
    worker = self._worker
    actor_id = worker.actor_id
    info = worker.actor_checkpoint_info[actor_id]
    info.num_tasks_since_last_checkpoint += 1
    elapsed_ms = int(1000 * time.time()) - info.last_checkpoint_timestamp
    context = ray.actor.CheckpointContext(
        actor_id, info.num_tasks_since_last_checkpoint, elapsed_ms)

    if not actor.should_checkpoint(context):
        return

    # The actor wants a checkpoint: have the raylet prepare one, then hand
    # the new ID to the user-defined `save_checkpoint`.
    try:
        started_ms = int(1000 * time.time())
        new_id = worker.core_worker.prepare_actor_checkpoint(actor_id)
        info.checkpoint_ids.append(new_id)
        actor.save_checkpoint(actor_id, new_id)
        keep = ray._config.num_actor_checkpoints_to_keep()
        if len(info.checkpoint_ids) > keep:
            # Over the limit: drop the oldest tracked checkpoint.
            actor.checkpoint_expired(actor_id, info.checkpoint_ids.pop(0))
        info.num_tasks_since_last_checkpoint = 0
        info.last_checkpoint_timestamp = started_ms
    except Exception:
        # Checkpoint save or reload failed. Notify the driver.
        ray.utils.push_error_to_driver(
            worker,
            ray_constants.CHECKPOINT_PUSH_ERROR,
            ray.utils.format_error_message(traceback.format_exc()),
            job_id=worker.current_job_id)
def _restore_and_log_checkpoint(self, actor):
    """Resume ``actor`` from a saved checkpoint when one exists.

    Intended only for workers that have just executed an actor creation
    task. Exceptions raised while restoring are pushed to the driver
    rather than propagated.

    Args:
        actor: The actor to restore from a checkpoint.
    """
    worker = self._worker
    actor_id = worker.actor_id
    try:
        candidates = ray.actor.get_checkpoints_for_actor(actor_id)
        if len(candidates) == 0:
            return
        # Earlier checkpoints exist for this actor, so give the
        # user-defined `load_checkpoint` callback a chance to pick one.
        chosen_id = actor.load_checkpoint(actor_id, candidates)
        if chosen_id is None:
            return
        # The callback must pick one of the checkpoints it was offered.
        msg = ("`load_checkpoint` must return a checkpoint id that "
               "exists in the `available_checkpoints` list, or None.")
        assert any(chosen_id == candidate.checkpoint_id
                   for candidate in candidates), msg
        # Tell the raylet which checkpoint the actor resumed from.
        worker.core_worker.notify_actor_resumed_from_checkpoint(
            actor_id, chosen_id)
    except Exception:
        # Checkpoint save or reload failed. Notify the driver.
        ray.utils.push_error_to_driver(
            worker,
            ray_constants.CHECKPOINT_PUSH_ERROR,
            ray.utils.format_error_message(traceback.format_exc()),
            job_id=worker.current_job_id)
+1 -29
View File
@@ -21,52 +21,28 @@ cdef extern from "ray/common/ray_config.h" nogil:
uint64_t num_heartbeats_warning() const
int64_t initial_reconstruction_timeout_milliseconds() const
int64_t object_timeout_milliseconds() const
int64_t get_timeout_milliseconds() const
uint64_t max_lineage_size() const
int64_t worker_get_request_size() const
int64_t worker_fetch_request_size() const
int64_t actor_max_dummy_objects() const
int64_t raylet_client_num_connect_attempts() const
int64_t raylet_client_connect_timeout_milliseconds() const
int64_t raylet_fetch_timeout_milliseconds() const
int64_t raylet_reconstruction_timeout_milliseconds() const
int64_t max_num_to_reconstruct() const
int64_t raylet_fetch_request_size() const
int64_t kill_worker_timeout_milliseconds() const
int64_t worker_register_timeout_seconds() const
int64_t max_time_for_handler_milliseconds() const
int64_t max_time_for_loop() const
int64_t redis_db_connect_retries()
int64_t redis_db_connect_wait_milliseconds() const
int64_t plasma_default_release_delay() const
int64_t L3_cache_size_bytes() const
int64_t max_tasks_to_spillback() const
int64_t actor_creation_num_spillbacks_warning() const
int node_manager_forward_task_retry_timeout_milliseconds() const
int object_manager_pull_timeout_ms() const
int object_manager_push_timeout_ms() const
@@ -79,10 +55,6 @@ cdef extern from "ray/common/ray_config.h" nogil:
int num_workers_per_process_java() const
int64_t max_task_lease_timeout_ms() const
uint32_t num_actor_checkpoints_to_keep() const
uint32_t maximum_gcs_deletion_batch_size() const
int64_t max_direct_call_object_size() const
+2 -60
View File
@@ -26,18 +26,14 @@ cdef class Config:
return RayConfig.instance().num_heartbeats_warning()
@staticmethod
def initial_reconstruction_timeout_milliseconds():
def object_timeout_milliseconds():
return (RayConfig.instance()
.initial_reconstruction_timeout_milliseconds())
.object_timeout_milliseconds())
@staticmethod
def get_timeout_milliseconds():
return RayConfig.instance().get_timeout_milliseconds()
@staticmethod
def max_lineage_size():
return RayConfig.instance().max_lineage_size()
@staticmethod
def worker_get_request_size():
return RayConfig.instance().worker_get_request_size()
@@ -46,10 +42,6 @@ cdef class Config:
def worker_fetch_request_size():
return RayConfig.instance().worker_fetch_request_size()
@staticmethod
def actor_max_dummy_objects():
return RayConfig.instance().actor_max_dummy_objects()
@staticmethod
def raylet_client_num_connect_attempts():
return RayConfig.instance().raylet_client_num_connect_attempts()
@@ -64,19 +56,6 @@ cdef class Config:
return (RayConfig.instance()
.raylet_fetch_timeout_milliseconds())
@staticmethod
def raylet_reconstruction_timeout_milliseconds():
return (RayConfig.instance()
.raylet_reconstruction_timeout_milliseconds())
@staticmethod
def max_num_to_reconstruct():
return RayConfig.instance().max_num_to_reconstruct()
@staticmethod
def raylet_fetch_request_size():
return RayConfig.instance().raylet_fetch_request_size()
@staticmethod
def kill_worker_timeout_milliseconds():
return RayConfig.instance().kill_worker_timeout_milliseconds()
@@ -85,14 +64,6 @@ cdef class Config:
def worker_register_timeout_seconds():
return RayConfig.instance().worker_register_timeout_seconds()
@staticmethod
def max_time_for_handler_milliseconds():
return RayConfig.instance().max_time_for_handler_milliseconds()
@staticmethod
def max_time_for_loop():
return RayConfig.instance().max_time_for_loop()
@staticmethod
def redis_db_connect_retries():
return RayConfig.instance().redis_db_connect_retries()
@@ -101,27 +72,6 @@ cdef class Config:
def redis_db_connect_wait_milliseconds():
return RayConfig.instance().redis_db_connect_wait_milliseconds()
@staticmethod
def plasma_default_release_delay():
return RayConfig.instance().plasma_default_release_delay()
@staticmethod
def L3_cache_size_bytes():
return RayConfig.instance().L3_cache_size_bytes()
@staticmethod
def max_tasks_to_spillback():
return RayConfig.instance().max_tasks_to_spillback()
@staticmethod
def actor_creation_num_spillbacks_warning():
return RayConfig.instance().actor_creation_num_spillbacks_warning()
@staticmethod
def node_manager_forward_task_retry_timeout_milliseconds():
return (RayConfig.instance()
.node_manager_forward_task_retry_timeout_milliseconds())
@staticmethod
def object_manager_pull_timeout_ms():
return RayConfig.instance().object_manager_pull_timeout_ms()
@@ -146,14 +96,6 @@ cdef class Config:
def num_workers_per_process_java():
return RayConfig.instance().num_workers_per_process_java()
@staticmethod
def max_task_lease_timeout_ms():
return RayConfig.instance().max_task_lease_timeout_ms()
@staticmethod
def num_actor_checkpoints_to_keep():
return RayConfig.instance().num_actor_checkpoints_to_keep()
@staticmethod
def maximum_gcs_deletion_batch_size():
return RayConfig.instance().maximum_gcs_deletion_batch_size()
+4 -4
View File
@@ -93,9 +93,9 @@ class Node:
"The raylet IP address should only be different than the node "
"IP address when connecting to an existing raylet; i.e., when "
"head=False and connect_only=True.")
if ray_params._internal_config and len(
ray_params._internal_config) > 0 and (not head
and not connect_only):
if ray_params._system_config and len(
ray_params._system_config) > 0 and (not head
and not connect_only):
raise ValueError(
"Internal config parameters can only be set on the head node.")
@@ -124,7 +124,7 @@ class Node:
self._localhost = socket.gethostbyname("localhost")
self._ray_params = ray_params
self._redis_address = ray_params.redis_address
self._config = ray_params._internal_config or {}
self._config = ray_params._system_config or {}
# Enable Plasma Store as a thread by default.
if "plasma_store_as_thread" not in self._config:
+16 -14
View File
@@ -91,8 +91,9 @@ class RayParams:
metrics_agent_port(int): The port to bind metrics agent.
metrics_export_port(int): The port at which metrics are exposed
through a Prometheus endpoint.
_internal_config (str): JSON configuration for overriding
RayConfig defaults. For testing purposes ONLY.
_system_config (dict): Configuration for overriding RayConfig
defaults. Used to set system configuration and for experimental Ray
core feature flags.
lru_evict (bool): Enable LRU eviction if space is needed.
enable_object_reconstruction (bool): Enable plasma reconstruction on
failure.
@@ -141,7 +142,7 @@ class RayParams:
java_worker_options=None,
load_code_from_local=False,
start_initial_python_workers_for_first_job=False,
_internal_config=None,
_system_config=None,
enable_object_reconstruction=False,
metrics_agent_port=None,
metrics_export_port=None,
@@ -188,7 +189,7 @@ class RayParams:
self.metrics_export_port = metrics_export_port
self.start_initial_python_workers_for_first_job = (
start_initial_python_workers_for_first_job)
self._internal_config = _internal_config
self._system_config = _system_config
self._lru_evict = lru_evict
self._enable_object_reconstruction = enable_object_reconstruction
self.object_spilling_config = object_spilling_config
@@ -197,26 +198,27 @@ class RayParams:
# Set the internal config options for LRU eviction.
if lru_evict:
# Turn off object pinning.
if self._internal_config is None:
self._internal_config = dict()
if self._internal_config.get("object_pinning_enabled", False):
if self._system_config is None:
self._system_config = dict()
if self._system_config.get("object_pinning_enabled", False):
raise Exception(
"Object pinning cannot be enabled if using LRU eviction.")
self._internal_config["object_pinning_enabled"] = False
self._internal_config["object_store_full_max_retries"] = -1
self._internal_config["free_objects_period_milliseconds"] = 1000
self._system_config["object_pinning_enabled"] = False
self._system_config["object_store_full_max_retries"] = -1
self._system_config["free_objects_period_milliseconds"] = 1000
# Set the internal config options for object reconstruction.
if enable_object_reconstruction:
# Turn off object pinning.
if self._internal_config is None:
self._internal_config = dict()
if self._system_config is None:
self._system_config = dict()
if lru_evict:
raise Exception(
"Object reconstruction cannot be enabled if using LRU "
"eviction.")
self._internal_config["lineage_pinning_enabled"] = True
self._internal_config["free_objects_period_milliseconds"] = -1
self._system_config["lineage_pinning_enabled"] = True
self._system_config["free_objects_period_milliseconds"] = -1
def update(self, **kwargs):
"""Update the settings according to the keyword arguments.
+2 -9
View File
@@ -1,7 +1,6 @@
"""This is the script for `ray microbenchmark`."""
import asyncio
import json
import logging
import os
import time
@@ -110,10 +109,7 @@ def main():
print("Tip: set TESTS_TO_RUN='pattern' to run a subset of benchmarks")
ray.init(
_internal_config=json.dumps({
"put_small_object_in_memory_store": True
}))
ray.init(_system_config={"put_small_object_in_memory_store": True})
value = ray.put(0)
@@ -138,10 +134,7 @@ def main():
timeit("multi client put calls", put_multi_small, 1000)
ray.shutdown()
ray.init(
_internal_config=json.dumps({
"put_small_object_in_memory_store": False
}))
ray.init(_system_config={"put_small_object_in_memory_store": False})
value = ray.put(0)
arr = np.zeros(100 * 1024 * 1024, dtype=np.int64)
+7 -6
View File
@@ -358,10 +358,10 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
type=str,
help="Overwrite the options to start Java workers.")
@click.option(
"--internal-config",
"--system-config",
default=None,
type=json.loads,
help="Do NOT use this. This is for debugging/development purposes ONLY.")
help="Override system configuration defaults.")
@click.option(
"--load-code-from-local",
is_flag=True,
@@ -394,9 +394,9 @@ def start(node_ip_address, redis_address, address, redis_port, port,
dashboard_port, block, plasma_directory, huge_pages,
autoscaling_config, no_redirect_worker_output, no_redirect_output,
plasma_store_socket_name, raylet_socket_name, temp_dir, include_java,
java_worker_options, load_code_from_local, internal_config,
lru_evict, enable_object_reconstruction, metrics_export_port,
log_new_style, log_color, verbose):
java_worker_options, load_code_from_local, system_config, lru_evict,
enable_object_reconstruction, metrics_export_port, log_new_style,
log_color, verbose):
"""Start Ray processes manually on the local machine."""
cli_logger.old_style = not log_new_style
cli_logger.color_mode = log_color
@@ -508,7 +508,8 @@ def start(node_ip_address, redis_address, address, redis_port, port,
dashboard_port=dashboard_port,
java_worker_options=java_worker_options,
load_code_from_local=load_code_from_local,
_internal_config=internal_config,
# --system-config is declared with type=json.loads, so click has already
# decoded it into a dict (or left it as None). Decoding it a second time
# would raise TypeError for any non-empty config.
_system_config=system_config,
lru_evict=lru_evict,
enable_object_reconstruction=enable_object_reconstruction,
metrics_export_port=metrics_export_port)
+2 -4
View File
@@ -1,7 +1,6 @@
import asyncio
import errno
import io
import json
import fnmatch
import os
import subprocess
@@ -282,10 +281,9 @@ def recursive_fnmatch(dirpath, pattern):
return matches
def generate_internal_config_map(**kwargs):
internal_config = json.dumps(kwargs)
def generate_system_config_map(**kwargs):
ray_kwargs = {
"_internal_config": internal_config,
"_system_config": kwargs,
}
return ray_kwargs
+15 -16
View File
@@ -3,7 +3,6 @@ This file defines the common pytest fixtures used in current directory.
"""
from contextlib import contextmanager
import json
import pytest
import subprocess
@@ -19,22 +18,22 @@ def shutdown_only():
ray.shutdown()
def get_default_fixure_internal_config():
internal_config = json.dumps({
"initial_reconstruction_timeout_milliseconds": 200,
def get_default_fixure_system_config():
system_config = {
"object_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
"object_store_full_max_retries": 3,
"object_store_full_initial_delay_ms": 100,
})
return internal_config
}
return system_config
def get_default_fixture_ray_kwargs():
internal_config = get_default_fixure_internal_config()
system_config = get_default_fixure_system_config()
ray_kwargs = {
"num_cpus": 1,
"object_store_memory": 150 * 1024 * 1024,
"_internal_config": internal_config,
"_system_config": system_config,
}
return ray_kwargs
@@ -125,8 +124,8 @@ def _ray_start_cluster(**kwargs):
cluster = Cluster()
remote_nodes = []
for i in range(num_nodes):
if i > 0 and "_internal_config" in init_kwargs:
del init_kwargs["_internal_config"]
if i > 0 and "_system_config" in init_kwargs:
del init_kwargs["_system_config"]
remote_nodes.append(cluster.add_node(**init_kwargs))
# We assume driver will connect to the head (first node),
# so ray init will be invoked if do_init is true
@@ -164,10 +163,10 @@ def ray_start_cluster_2_nodes(request):
def ray_start_object_store_memory(request):
# Start the Ray processes.
store_size = request.param
internal_config = get_default_fixure_internal_config()
system_config = get_default_fixure_system_config()
init_kwargs = {
"num_cpus": 1,
"_internal_config": internal_config,
"_system_config": system_config,
"object_store_memory": store_size,
}
ray.init(**init_kwargs)
@@ -208,12 +207,12 @@ def call_ray_stop_only():
@pytest.fixture()
def two_node_cluster():
internal_config = json.dumps({
"initial_reconstruction_timeout_milliseconds": 200,
system_config = {
"object_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
})
}
cluster = ray.cluster_utils.Cluster(
head_node_args={"_internal_config": internal_config})
head_node_args={"_system_config": system_config})
for _ in range(2):
remote_node = cluster.add_node(num_cpus=1)
ray.init(address=cluster.address)
+14 -369
View File
@@ -1,5 +1,4 @@
import collections
import json
import numpy as np
import os
import pytest
@@ -8,94 +7,22 @@ import sys
import time
import ray
import ray.ray_constants as ray_constants
import ray.test_utils
import ray.cluster_utils
from ray.test_utils import (
wait_for_condition,
wait_for_pid_to_exit,
generate_internal_config_map,
generate_system_config_map,
get_other_nodes,
SignalActor,
get_error_message,
)
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
@pytest.fixture
def ray_checkpointable_actor_cls(request):
    """Provide a `Checkpointable` actor class that checkpoints to a file.

    The class keeps a counter, checkpoints once the counter reaches 3, and
    appends one "<checkpoint id> <value>" line per checkpoint to a file
    named after the actor ID inside a temp directory.
    """
    storage_root = os.path.join(ray.utils.get_user_temp_dir(),
                                "ray_temp_checkpoint_dir") + os.sep
    if not os.path.isdir(storage_root):
        os.mkdir(storage_root)

    class CheckpointableActor(ray.actor.Checkpointable):
        def __init__(self):
            self.value = 0
            self.resumed_from_checkpoint = False
            self.checkpoint_dir = storage_root

        def node_id(self):
            return ray.worker.global_worker.node.unique_id

        def increase(self):
            self.value += 1
            return self.value

        def get(self):
            return self.value

        def was_resumed_from_checkpoint(self):
            return self.resumed_from_checkpoint

        def get_pid(self):
            return os.getpid()

        def should_checkpoint(self, checkpoint_context):
            # Checkpoint the actor when value is increased to 3.
            return self.value == 3

        def save_checkpoint(self, actor_id, checkpoint_id):
            actor_hex = actor_id.hex()
            checkpoint_hex = checkpoint_id.hex()
            # Append this checkpoint to the actor's file.
            with open(self.checkpoint_dir + actor_hex, "a+") as out:
                print(checkpoint_hex, self.value, file=out)

        def load_checkpoint(self, actor_id, available_checkpoints):
            path = self.checkpoint_dir + actor_id.hex()
            # Nothing was ever saved for this actor.
            if not os.path.isfile(path):
                return None

            known_ids = [entry.checkpoint_id for entry in available_checkpoints]
            with open(path, "r") as saved:
                for record in saved:
                    id_hex, stored_value = record.strip().split(" ")
                    candidate = ray.ActorCheckpointID(
                        ray.utils.hex_to_binary(id_hex))
                    if candidate in known_ids:
                        self.value = int(stored_value)
                        self.resumed_from_checkpoint = True
                        return candidate
            return None

        def checkpoint_expired(self, actor_id, checkpoint_id):
            pass

    return CheckpointableActor
@pytest.fixture
def ray_init_with_task_retry_delay():
address = ray.init(
_internal_config=json.dumps({
"task_retry_delay_ms": 100
}))
address = ray.init(_system_config={"task_retry_delay_ms": 100})
yield address
ray.shutdown()
@@ -284,15 +211,15 @@ def test_actor_restart_with_retry(ray_init_with_task_retry_delay):
def test_actor_restart_on_node_failure(ray_start_cluster):
config = json.dumps({
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 1000,
"object_timeout_milliseconds": 1000,
"task_retry_delay_ms": 100,
})
}
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(num_cpus=0, _internal_config=config)
cluster.add_node(num_cpus=0, _system_config=config)
cluster.wait_for_nodes()
ray.init(address=cluster.address)
@@ -441,15 +368,14 @@ def test_caller_task_reconstruction(ray_start_regular):
assert ray.get(RetryableTask.remote(remote_actor)) == 3
# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
# NOTE(hchen): we set object_timeout_milliseconds to 1s for
# this test. Because if this value is too small, spurious task reconstruction
# may happen and cause the test failure. If the value is too large, this test
# could be very slow. We can remove this once we support dynamic timeout.
@pytest.mark.parametrize(
"ray_start_cluster_head", [
generate_internal_config_map(
initial_reconstruction_timeout_milliseconds=1000,
num_heartbeats_timeout=10)
generate_system_config_map(
object_timeout_milliseconds=1000, num_heartbeats_timeout=10)
],
indirect=True)
def test_multiple_actor_restart(ray_start_cluster_head):
@@ -520,287 +446,6 @@ def kill_actor(actor):
wait_for_pid_to_exit(pid)
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
    """Test actor checkpointing and restoring from a checkpoint."""
    remote_cls = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls)
    actor = remote_cls.remote()

    def increase_three_times():
        # Run three `increase` tasks and report how many were run.
        for _ in range(3):
            ray.get(actor.increase.remote())
        return 3

    # Three increases bring the value to 3, which triggers a checkpoint.
    expected = increase_three_times()
    # Nothing has been restored yet.
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False

    # Kill the actor process; the restart should resume from the
    # checkpoint with the value intact.
    kill_actor(actor)
    assert ray.get(actor.get.remote()) == expected
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True

    # These tasks come after the checkpoint, so they should get replayed.
    expected += increase_three_times()
    # A second kill checks that restart still works once the actor has
    # already resumed from a checkpoint.
    kill_actor(actor)
    assert ray.get(actor.get.remote()) == expected
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
    """Test checkpointing of a remote actor through method invocation."""

    # Subclass that checkpoints only after `checkpoint` has been called.
    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
        def __init__(self):
            super().__init__()
            self._should_checkpoint = False

        def checkpoint(self):
            self._should_checkpoint = True

        def should_checkpoint(self, checkpoint_context):
            # Report the pending request and clear it in one step.
            requested, self._should_checkpoint = self._should_checkpoint, False
            return requested

    actor = ray.remote(max_restarts=2)(RemoteCheckpointableActor).remote()

    def increase_three_times():
        # Run three `increase` tasks and report how many were run.
        for _ in range(3):
            ray.get(actor.increase.remote())
        return 3

    expected = increase_three_times()
    # Request a checkpoint through a task.
    actor.checkpoint.remote()
    # Nothing has been restored yet.
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False

    # Kill the actor process; the restart should resume from the
    # checkpoint with the value intact.
    kill_actor(actor)
    assert ray.get(actor.get.remote()) == expected
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True

    # These tasks come after the checkpoint, so they should get replayed.
    expected += increase_three_times()
    # A second kill checks that restart still works once the actor has
    # already resumed from a checkpoint.
    kill_actor(actor)
    assert ray.get(actor.get.remote()) == expected
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
                                       ray_checkpointable_actor_cls):
    """Test actor checkpointing on a remote node."""
    cluster = ray_start_cluster_2_nodes
    worker_nodes = list(cluster.worker_nodes)
    remote_cls = ray.remote(max_restarts=1)(ray_checkpointable_actor_cls)

    # Keep creating actors until one lands on the remote node.
    actor = remote_cls.remote()
    while ray.get(actor.node_id.remote()) != worker_nodes[0].unique_id:
        actor = remote_cls.remote()

    # Call increase several times.
    expected = 0
    for _ in range(6):
        ray.get(actor.increase.remote())
        expected += 1
    # Nothing has been restored yet.
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False

    # Remove the node hosting the actor, taking the actor down with it.
    cluster.remove_node(worker_nodes[0])
    # The restarted actor should have resumed from the checkpoint with
    # the value intact.
    assert ray.get(actor.get.remote()) == expected
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing_save_exception(ray_start_regular, error_pubsub,
                                      ray_checkpointable_actor_cls):
    """Test actor can still be recovered if checkpoints fail to complete."""
    subscriber = error_pubsub

    @ray.remote(max_restarts=2)
    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
        def save_checkpoint(self, actor_id, checkpoint_context):
            raise Exception("Intentional error saving checkpoint.")

    actor = RemoteCheckpointableActor.remote()

    def increase_three_times():
        # Run three `increase` tasks and report how many were run.
        for _ in range(3):
            ray.get(actor.increase.remote())
        return 3

    # Three increases trigger a checkpoint, and that checkpoint fails.
    expected = increase_three_times()
    # Nothing has been restored yet.
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False

    # Kill the actor process. With no usable checkpoint, the actor must
    # come back with the right value without resuming from one.
    kill_actor(actor)
    assert ray.get(actor.get.remote()) == expected
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False

    # These tasks come after the checkpoint, so they should get replayed.
    expected += increase_three_times()
    # A second kill checks that restart still works and that the actor
    # still was not resumed from a checkpoint.
    kill_actor(actor)
    assert ray.get(actor.get.remote()) == expected
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False

    # The checkpoint failure must have been pushed to the driver.
    errors = get_error_message(subscriber, 1,
                               ray_constants.CHECKPOINT_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing_load_exception(ray_start_regular, error_pubsub,
                                      ray_checkpointable_actor_cls):
    """Test that an actor can still be recovered if loading checkpoints fails.

    `load_checkpoint` is overridden to always raise. The actor is killed
    twice; after each kill it must return the expected value and report
    that it was NOT resumed from a checkpoint. Finally exactly one
    CHECKPOINT_PUSH_ERROR message must have been published.
    """

    @ray.remote(max_restarts=2)
    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
        def load_checkpoint(self, actor_id, checkpoints):
            raise Exception("Intentional error loading checkpoint.")

    actor = RemoteCheckpointableActor.remote()

    def increase_n(times):
        """Call `increase` `times` times, waiting on each call."""
        for _ in range(times):
            ray.get(actor.increase.remote())
        return times

    def resumed_from_checkpoint():
        """Ask the actor whether it was resumed from a checkpoint."""
        return ray.get(actor.was_resumed_from_checkpoint.remote())

    # Three increases; saving is not overridden here, only loading is.
    expected = increase_n(3)
    assert resumed_from_checkpoint() is False

    # First kill: the load_checkpoint override raises, so the actor must
    # come back with the right value but without a checkpoint resume.
    kill_actor(actor)
    assert ray.get(actor.get.remote()) == expected
    assert resumed_from_checkpoint() is False

    # More work after the first restart.
    expected += increase_n(3)

    # Second kill: restart still works, still without a checkpoint.
    kill_actor(actor)
    assert ray.get(actor.get.remote()) == expected
    assert resumed_from_checkpoint() is False

    # The checkpoint failure must have been pushed to the driver.
    errors = get_error_message(error_pubsub, 1,
                               ray_constants.CHECKPOINT_PUSH_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
@pytest.mark.parametrize(
    "ray_start_regular",
    # This overwrite currently isn't effective,
    # see https://github.com/ray-project/ray/issues/3926.
    [generate_internal_config_map(num_actor_checkpoints_to_keep=20)],
    indirect=True,
)
def test_deleting_actor_checkpoint(ray_start_regular):
    """Test that old actor checkpoints are expired oldest-first.

    The actor records every checkpoint id it is asked to save and drops
    the oldest one whenever `checkpoint_expired` is called. The number of
    recorded ids is expected to grow by one per call up to 20 and then
    stay at 20.
    """

    @ray.remote
    class CheckpointableActor(ray.actor.Checkpointable):
        def __init__(self):
            # Ids of saved checkpoints, oldest first.
            self.checkpoint_ids = []

        def get_checkpoint_ids(self):
            return self.checkpoint_ids

        def should_checkpoint(self, checkpoint_context):
            # Save checkpoints after every task.
            return True

        def save_checkpoint(self, actor_id, checkpoint_id):
            self.checkpoint_ids.append(checkpoint_id)

        def load_checkpoint(self, actor_id, available_checkpoints):
            """Nothing to restore in this test."""

        def checkpoint_expired(self, actor_id, checkpoint_id):
            # Expiry must always target the oldest recorded checkpoint.
            oldest = self.checkpoint_ids[0]
            assert checkpoint_id == oldest
            self.checkpoint_ids.pop(0)

    actor = CheckpointableActor.remote()

    def num_recorded():
        """Return how many checkpoint ids the actor currently holds."""
        return len(ray.get(actor.get_checkpoint_ids.remote()))

    # Growth phase: one more recorded id after each of the first 19 calls.
    for expected_count in range(1, 20):
        assert num_recorded() == expected_count

    # Steady state: the count stays at 20 (presumably each new save now
    # triggers an expiry of the oldest checkpoint -- confirm against the
    # checkpointing implementation).
    for _ in range(20):
        assert num_recorded() == 20
def test_bad_checkpointable_actor_class():
    """Test that an incomplete Checkpointable actor class is rejected.

    A TypeError is expected when `ray.remote` is applied to a class that
    implements only part of the `Checkpointable` interface.
    """
    with pytest.raises(TypeError):

        class BadCheckpointableActor(ray.actor.Checkpointable):
            # Only one method of the interface is provided on purpose.
            def should_checkpoint(self, checkpoint_context):
                return True

        # Same as decorating the class with @ray.remote.
        ray.remote(BadCheckpointableActor)
def test_init_exception_in_checkpointable_actor(
        ray_start_regular, error_pubsub, ray_checkpointable_actor_cls):
    """Test error reporting when a checkpointable actor's constructor fails.

    This test is similar to test_failure.py::test_failed_actor_init and
    guards the same logic for checkpointable actors: one TASK_PUSH_ERROR
    is expected for the failed constructor and one for a later method
    call, both containing the constructor's error message.
    """
    constructor_error = "actor constructor failed"
    method_error = "actor method failed"

    @ray.remote
    class CheckpointableFailedActor(ray_checkpointable_actor_cls):
        def __init__(self):
            raise Exception(constructor_error)

        def fail_method(self):
            raise Exception(method_error)

        def should_checkpoint(self, checkpoint_context):
            return True

    actor = CheckpointableFailedActor.remote()

    # Make sure that we get errors from a failed constructor.
    init_errors = get_error_message(error_pubsub, 1,
                                    ray_constants.TASK_PUSH_ERROR)
    assert len(init_errors) == 1
    assert constructor_error in init_errors[0].error_message

    # Make sure that we get errors from a failed method. The constructor's
    # message is what is checked here (presumably because the actor never
    # finished initializing -- confirm against test_failed_actor_init).
    actor.fail_method.remote()
    method_errors = get_error_message(error_pubsub, 1,
                                      ray_constants.TASK_PUSH_ERROR)
    assert len(method_errors) == 1
    assert constructor_error in method_errors[0].error_message
def test_decorated_method(ray_start_regular):
def method_invocation_decorator(f):
def new_f_invocation(args, kwargs):
@@ -987,7 +632,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
return self.dependency
# Make sure it is scheduled in the second node.
@ray.remote(resources={"node": 1}, num_cpus=1)
@ray.remote(resources={"node": 1})
class Owner:
def get_pid(self):
return os.getpid()
@@ -1004,7 +649,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
# Wait until the `Caller` start executing the remote `call` method.
ray.get(signal_handle.wait.remote())
@ray.remote
@ray.remote(resources={"caller": 1})
class Caller:
def call(self, owner_pid, signal_handle, actor_handle):
# Notify the `Owner` that the `Caller` is executing the remote
@@ -1020,15 +665,15 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
return True
cluster = ray_start_cluster
node_to_be_broken = cluster.add_node(num_cpus=1, resources={"node": 1})
node_to_be_broken = cluster.add_node(resources={"node": 1})
cluster.add_node(resources={"caller": 1})
owner = Owner.remote()
owner_pid = ray.get(owner.get_pid.remote())
caller = Caller.remote()
owner.create_actor.remote(caller)
ray.get(owner.create_actor.remote(caller))
cluster.remove_node(node_to_be_broken)
# Wait for the `Owner` to exit.
wait_for_pid_to_exit(owner_pid)
# It will hang here if location is not properly resolved.
+1 -4
View File
@@ -1,5 +1,4 @@
import collections
import json
import os
import pytest
try:
@@ -241,9 +240,7 @@ def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
cluster.add_node(
num_cpus=10 * num_gpus_per_raylet,
num_gpus=num_gpus_per_raylet,
_internal_config=json.dumps({
"num_heartbeats_timeout": 1000
} if i == 0 else {}))
_system_config={"num_heartbeats_timeout": 1000} if i == 0 else {})
ray.init(address=cluster.address)
@ray.remote
+4 -11
View File
@@ -2,7 +2,6 @@
import glob
import logging
import os
import json
import sys
import socket
import time
@@ -69,9 +68,9 @@ def test_local_scheduling_first(ray_start_cluster):
# Disable worker caching.
cluster.add_node(
num_cpus=num_cpus,
_internal_config=json.dumps({
_system_config={
"worker_lease_timeout_milliseconds": 0,
}))
})
cluster.add_node(num_cpus=num_cpus)
ray.init(address=cluster.address)
@@ -332,9 +331,7 @@ def test_wait_reconstruction(shutdown_only):
ray.init(
num_cpus=1,
object_store_memory=int(10**8),
_internal_config=json.dumps({
"object_pinning_enabled": 0
}))
_system_config={"object_pinning_enabled": 0})
@ray.remote
def f():
@@ -607,11 +604,7 @@ def test_move_log_files_to_old(shutdown_only):
def test_lease_request_leak(shutdown_only):
ray.init(
num_cpus=1,
_internal_config=json.dumps({
"initial_reconstruction_timeout_milliseconds": 200
}))
ray.init(num_cpus=1, _system_config={"object_timeout_milliseconds": 200})
assert len(ray.objects()) == 0
@ray.remote
+3 -4
View File
@@ -3,7 +3,6 @@ import numpy as np
from numpy.testing import assert_equal, assert_almost_equal
import pytest
import sys
import json
import ray
import ray.experimental.array.remote as ra
@@ -59,13 +58,13 @@ def test_distributed_array_assemble(ray_start_2_cpus, reload_modules):
@pytest.mark.parametrize(
"ray_start_cluster_2_nodes",
[{
"_internal_config": json.dumps({
"_system_config": {
# NOTE(swang): If plasma store notifications to the raylet for new
# objects are delayed by long enough, then this causes concurrent
# fetch calls to timeout and mistakenly mark the object as lost.
# Set the timeout very high to prevent this.
"initial_reconstruction_timeout_milliseconds": 60000,
})
"object_timeout_milliseconds": 60000,
}
}],
indirect=True)
def test_distributed_array_methods(ray_start_cluster_2_nodes, reload_modules):
+1 -5
View File
@@ -1,6 +1,5 @@
# coding: utf-8
import io
import json
import logging
import os
import pickle
@@ -206,10 +205,7 @@ def test_background_tasks_with_max_calls(shutdown_only):
def test_fair_queueing(shutdown_only):
ray.init(
num_cpus=1, _internal_config=json.dumps({
"fair_queueing_enabled": 1
}))
ray.init(num_cpus=1, _system_config={"fair_queueing_enabled": 1})
@ray.remote
def h():
+5 -9
View File
@@ -1,5 +1,4 @@
# coding: utf-8
import json
import logging
import sys
import threading
@@ -333,19 +332,16 @@ def test_call_chain(ray_start_cluster):
assert ray.get(x) == 100
def test_internal_config_when_connecting(ray_start_cluster):
config = json.dumps({
"object_pinning_enabled": 0,
"initial_reconstruction_timeout_milliseconds": 200
})
def test_system_config_when_connecting(ray_start_cluster):
config = {"object_pinning_enabled": 0, "object_timeout_milliseconds": 200}
cluster = ray.cluster_utils.Cluster()
cluster.add_node(
_internal_config=config, object_store_memory=100 * 1024 * 1024)
_system_config=config, object_store_memory=100 * 1024 * 1024)
cluster.wait_for_nodes()
# Specifying _internal_config when connecting to a cluster is disallowed.
# Specifying _system_config when connecting to a cluster is disallowed.
with pytest.raises(ValueError):
ray.init(address=cluster.address, _internal_config=config)
ray.init(address=cluster.address, _system_config=config)
# Check that the config was picked up (object pinning is disabled).
ray.init(address=cluster.address)
@@ -1,4 +1,3 @@
import json
import os
import signal
import sys
@@ -138,9 +137,9 @@ def check_components_alive(cluster, component_type, check_component_alive):
"ray_start_cluster", [{
"num_cpus": 8,
"num_nodes": 4,
"_internal_config": json.dumps({
"_system_config": {
"num_heartbeats_timeout": 100
}),
},
}],
indirect=True)
def test_raylet_failed(ray_start_cluster):
+12 -15
View File
@@ -1,4 +1,3 @@
import json
import logging
import os
import sys
@@ -908,12 +907,12 @@ def test_raylet_crash_when_get(ray_start_regular):
def test_connect_with_disconnected_node(shutdown_only):
config = json.dumps({
config = {
"num_heartbeats_timeout": 50,
"raylet_heartbeat_timeout_milliseconds": 10,
})
}
cluster = Cluster()
cluster.add_node(num_cpus=0, _internal_config=config)
cluster.add_node(num_cpus=0, _system_config=config)
ray.init(address=cluster.address)
p = init_error_pubsub()
errors = get_error_message(p, 1, timeout=5)
@@ -943,9 +942,9 @@ def test_connect_with_disconnected_node(shutdown_only):
"ray_start_cluster_head", [{
"num_cpus": 5,
"object_store_memory": 10**8,
"_internal_config": json.dumps({
"_system_config": {
"object_store_full_max_retries": 0
})
}
}],
indirect=True)
def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head):
@@ -965,9 +964,7 @@ def test_fill_object_store_exception(shutdown_only):
ray.init(
num_cpus=2,
object_store_memory=10**8,
_internal_config=json.dumps({
"object_store_full_max_retries": 0
}))
_system_config={"object_store_full_max_retries": 0})
@ray.remote
def expensive_task():
@@ -997,14 +994,14 @@ def test_fill_object_store_exception(shutdown_only):
def test_fill_object_store_lru_fallback(shutdown_only):
config = json.dumps({
config = {
"free_objects_batch_size": 1,
})
}
ray.init(
num_cpus=2,
object_store_memory=10**8,
lru_evict=True,
_internal_config=config)
_system_config=config)
@ray.remote
def expensive_task():
@@ -1125,13 +1122,13 @@ def test_serialized_id(ray_start_cluster):
[(False, False), (False, True), (True, False),
(True, True)])
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
config = json.dumps({
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
})
}
cluster = Cluster()
# Head node with no resources.
cluster.add_node(num_cpus=0, _internal_config=config)
cluster.add_node(num_cpus=0, _system_config=config)
ray.init(address=cluster.address)
# Node to place the parent actor.
node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
+4 -4
View File
@@ -3,7 +3,7 @@ import sys
import ray
import pytest
from ray.test_utils import (
generate_internal_config_map,
generate_system_config_map,
wait_for_condition,
wait_for_pid_to_exit,
)
@@ -22,7 +22,7 @@ def increase(x):
@pytest.mark.parametrize(
"ray_start_regular",
[generate_internal_config_map(num_heartbeats_timeout=20)],
[generate_system_config_map(num_heartbeats_timeout=20)],
indirect=True)
def test_gcs_server_restart(ray_start_regular):
actor1 = Increase.remote()
@@ -45,7 +45,7 @@ def test_gcs_server_restart(ray_start_regular):
@pytest.mark.parametrize(
"ray_start_regular",
[generate_internal_config_map(num_heartbeats_timeout=20)],
[generate_system_config_map(num_heartbeats_timeout=20)],
indirect=True)
def test_gcs_server_restart_during_actor_creation(ray_start_regular):
ids = []
@@ -64,7 +64,7 @@ def test_gcs_server_restart_during_actor_creation(ray_start_regular):
@pytest.mark.parametrize(
"ray_start_cluster_head",
[generate_internal_config_map(num_heartbeats_timeout=20)],
[generate_system_config_map(num_heartbeats_timeout=20)],
indirect=True)
def test_node_failure_detector_when_gcs_server_restart(ray_start_cluster_head):
"""Checks that the node failure detector is correct when gcs server restart.
+2 -3
View File
@@ -1,4 +1,3 @@
import json
import pytest
try:
import pytest_timeout
@@ -140,9 +139,9 @@ def test_load_report(shutdown_only, max_shapes):
cluster = ray.init(
num_cpus=1,
resources={resource1: 1},
_internal_config=json.dumps({
_system_config={
"max_resource_shapes_per_load_report": max_shapes,
}))
})
redis = ray.services.create_redis_client(
cluster["redis_address"],
password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
+1 -4
View File
@@ -48,10 +48,7 @@ def _setup_cluster_for_test(ray_start_cluster):
NUM_NODES = 2
cluster = ray_start_cluster
# Add a head node.
cluster.add_node(
_internal_config=json.dumps({
"metrics_report_interval_ms": 1000
}))
cluster.add_node(_system_config={"metrics_report_interval_ms": 1000})
# Add worker nodes.
[cluster.add_node() for _ in range(NUM_NODES - 1)]
cluster.wait_for_nodes()
+5 -7
View File
@@ -6,7 +6,7 @@ import ray
import ray.ray_constants as ray_constants
from ray.monitor import Monitor
from ray.cluster_utils import Cluster
from ray.test_utils import generate_internal_config_map, SignalActor
from ray.test_utils import generate_system_config_map, SignalActor
logger = logging.getLogger(__name__)
@@ -33,12 +33,11 @@ def test_shutdown():
@pytest.mark.parametrize(
"ray_start_cluster_head", [
generate_internal_config_map(
num_heartbeats_timeout=20,
initial_reconstruction_timeout_milliseconds=12345)
generate_system_config_map(
num_heartbeats_timeout=20, object_timeout_milliseconds=12345)
],
indirect=True)
def test_internal_config(ray_start_cluster_head):
def test_system_config(ray_start_cluster_head):
"""Checks that the internal configuration setting works.
We set the cluster to timeout nodes after 2 seconds of no timeouts. We
@@ -52,8 +51,7 @@ def test_internal_config(ray_start_cluster_head):
@ray.remote
def f():
assert ray._config.initial_reconstruction_timeout_milliseconds(
) == 12345
assert ray._config.object_timeout_milliseconds() == 12345
assert ray._config.num_heartbeats_timeout() == 20
ray.get([f.remote() for _ in range(5)])
+3 -12
View File
@@ -1,5 +1,4 @@
# coding: utf-8
import json
import os
import sys
@@ -19,9 +18,7 @@ def test_initial_workers(shutdown_only):
ray.init(
num_cpus=1,
include_dashboard=True,
_internal_config=json.dumps({
"enable_multi_tenancy": True
}))
_system_config={"enable_multi_tenancy": True})
raylet = ray.nodes()[0]
raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
raylet["NodeManagerPort"])
@@ -43,11 +40,7 @@ def test_initial_workers(shutdown_only):
# different drivers were scheduled to the same worker process, that is, tasks
# of different jobs were not correctly isolated during execution.
def test_multi_drivers(shutdown_only):
info = ray.init(
num_cpus=10,
_internal_config=json.dumps({
"enable_multi_tenancy": True
}))
info = ray.init(num_cpus=10, _system_config={"enable_multi_tenancy": True})
driver_code = """
import os
@@ -120,9 +113,7 @@ def test_worker_env(shutdown_only):
"foo1": "bar1",
"foo2": "bar2"
}),
_internal_config=json.dumps({
"enable_multi_tenancy": True
}))
_system_config={"enable_multi_tenancy": True})
@ray.remote
def get_env(key):
+2 -3
View File
@@ -1,4 +1,3 @@
import json
import os
import signal
import sys
@@ -145,10 +144,10 @@ def check_components_alive(cluster, component_type, check_component_alive):
[{
"num_cpus": 8,
"num_nodes": 4,
"_internal_config": json.dumps({
"_system_config": {
# Raylet codepath is not stable with a shorter timeout.
"num_heartbeats_timeout": 10
}),
},
}],
indirect=True)
def test_raylet_failed(ray_start_cluster):
@@ -1,4 +1,3 @@
import json
import os
import sys
import time
@@ -19,13 +18,13 @@ import ray.ray_constants as ray_constants
"num_cpus": 1,
"num_nodes": 4,
"object_store_memory": 1000 * 1024 * 1024,
"_internal_config": json.dumps({
"_system_config": {
# Raylet codepath is not stable with a shorter timeout.
"num_heartbeats_timeout": 10,
"object_manager_pull_timeout_ms": 1000,
"object_manager_push_timeout_ms": 1000,
"object_manager_repeated_push_delay_ms": 1000,
}),
},
}],
indirect=True)
def test_object_reconstruction(ray_start_cluster):
+3 -4
View File
@@ -1,5 +1,4 @@
from collections import defaultdict
import json
import multiprocessing
import numpy as np
import pytest
@@ -207,14 +206,14 @@ def test_object_transfer_retry(ray_start_cluster):
# Also, force the receiving object manager to retry the pull sooner. We
# make the chunk size smaller in order to make it easier to test objects
# with multiple chunks.
config = json.dumps({
config = {
"object_manager_repeated_push_delay_ms": repeated_push_delay * 1000,
"object_manager_pull_timeout_ms": repeated_push_delay * 1000 / 4,
"object_manager_default_chunk_size": 1000
})
}
object_store_memory = 150 * 1024 * 1024
cluster.add_node(
object_store_memory=object_store_memory, _internal_config=config)
object_store_memory=object_store_memory, _system_config=config)
cluster.add_node(num_gpus=1, object_store_memory=object_store_memory)
ray.init(address=cluster.address)
+8 -8
View File
@@ -17,10 +17,10 @@ def test_spill_objects_manually(shutdown_only):
"directory_path": "/tmp"
}
},
_internal_config=json.dumps({
_system_config={
"object_store_full_max_retries": 0,
"max_io_workers": 4,
}))
})
arr = np.random.rand(1024 * 1024) # 8 MB data
replay_buffer = []
pinned_objects = set()
@@ -64,10 +64,10 @@ def test_spill_objects_manually_from_workers(shutdown_only):
"directory_path": "/tmp"
}
},
_internal_config=json.dumps({
_system_config={
"object_store_full_max_retries": 0,
"max_io_workers": 4,
}))
})
@ray.remote
def _worker():
@@ -90,10 +90,10 @@ def test_spill_objects_manually_with_workers(shutdown_only):
"directory_path": "/tmp"
}
},
_internal_config=json.dumps({
_system_config={
"object_store_full_max_retries": 0,
"max_io_workers": 4,
}))
})
arrays = [np.random.rand(100 * 1024) for _ in range(50)]
objects = [ray.put(arr) for arr in arrays]
@@ -117,7 +117,7 @@ def test_spill_objects_manually_with_workers(shutdown_only):
"directory_path": "/tmp"
}
},
"_internal_config": json.dumps({
"_system_config": json.dumps({
"object_store_full_max_retries": 0,
"max_io_workers": 4,
}),
@@ -159,7 +159,7 @@ def test_spill_objects_automatically(shutdown_only):
# Limit our object store to 75 MiB of memory.
ray.init(
object_store_memory=75 * 1024 * 1024,
_internal_config=json.dumps({
_system_config=json.dumps({
"max_io_workers": 4,
"object_store_full_max_retries": 2,
"object_store_full_initial_delay_ms": 10,
+22 -30
View File
@@ -1,4 +1,3 @@
import json
import os
import signal
import sys
@@ -16,14 +15,14 @@ SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
def test_cached_object(ray_start_cluster):
config = json.dumps({
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 200,
})
"object_timeout_milliseconds": 200,
}
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(num_cpus=0, _internal_config=config)
cluster.add_node(num_cpus=0, _system_config=config)
ray.init(address=cluster.address)
# Node to place the initial object.
node_to_kill = cluster.add_node(
@@ -61,18 +60,17 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 200,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
if not reconstruction_enabled:
config["lineage_pinning_enabled"] = 0
config = json.dumps(config)
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(
num_cpus=0,
_internal_config=config,
_system_config=config,
enable_object_reconstruction=reconstruction_enabled)
ray.init(address=cluster.address)
# Node to place the initial object.
@@ -121,18 +119,17 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 200,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
if not reconstruction_enabled:
config["lineage_pinning_enabled"] = 0
config = json.dumps(config)
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(
num_cpus=0,
_internal_config=config,
_system_config=config,
enable_object_reconstruction=reconstruction_enabled)
ray.init(address=cluster.address)
# Node to place the initial object.
@@ -171,18 +168,17 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 200,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
if not reconstruction_enabled:
config["lineage_pinning_enabled"] = 0
config = json.dumps(config)
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(
num_cpus=0,
_internal_config=config,
_system_config=config,
enable_object_reconstruction=reconstruction_enabled)
ray.init(address=cluster.address)
# Node to place the initial object.
@@ -229,18 +225,17 @@ def test_basic_reconstruction_actor_task(ray_start_cluster,
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 200,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
if not reconstruction_enabled:
config["lineage_pinning_enabled"] = 0
config = json.dumps(config)
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(
num_cpus=0,
_internal_config=config,
_system_config=config,
enable_object_reconstruction=reconstruction_enabled)
ray.init(address=cluster.address)
# Node to place the initial object.
@@ -303,18 +298,17 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster,
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 200,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
if not reconstruction_enabled:
config["lineage_pinning_enabled"] = 0
config = json.dumps(config)
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(
num_cpus=0,
_internal_config=config,
_system_config=config,
enable_object_reconstruction=reconstruction_enabled)
ray.init(address=cluster.address)
# Node to place the initial object.
@@ -384,18 +378,17 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 200,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
if not reconstruction_enabled:
config["lineage_pinning_enabled"] = 0
config = json.dumps(config)
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(
num_cpus=0,
_internal_config=config,
_system_config=config,
enable_object_reconstruction=reconstruction_enabled)
ray.init(address=cluster.address)
# Node to place the initial object.
@@ -445,18 +438,17 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 200,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
if not reconstruction_enabled:
config["lineage_pinning_enabled"] = 0
config = json.dumps(config)
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(
num_cpus=0,
_internal_config=config,
_system_config=config,
object_store_memory=10**8,
enable_object_reconstruction=reconstruction_enabled)
ray.init(address=cluster.address)
@@ -493,17 +485,17 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
def test_reconstruction_stress(ray_start_cluster):
config = json.dumps({
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"max_direct_call_object_size": 100,
"task_retry_delay_ms": 100,
"initial_reconstruction_timeout_milliseconds": 200,
})
"object_timeout_milliseconds": 200,
}
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(
num_cpus=0, _internal_config=config, enable_object_reconstruction=True)
num_cpus=0, _system_config=config, enable_object_reconstruction=True)
ray.init(address=cluster.address)
# Node to place the initial object.
node_to_kill = cluster.add_node(
+4 -7
View File
@@ -1,6 +1,5 @@
# coding: utf-8
import copy
import json
import logging
import os
import time
@@ -18,14 +17,14 @@ logger = logging.getLogger(__name__)
@pytest.fixture
def one_worker_100MiB(request):
config = json.dumps({
config = {
"object_store_full_max_retries": 2,
"task_retry_delay_ms": 0,
})
}
yield ray.init(
num_cpus=1,
object_store_memory=100 * 1024 * 1024,
_internal_config=config)
_system_config=config)
ray.shutdown()
@@ -245,9 +244,7 @@ def test_pending_task_dependency_pinning(one_worker_100MiB):
def test_feature_flag(shutdown_only):
ray.init(
object_store_memory=100 * 1024 * 1024,
_internal_config=json.dumps({
"object_pinning_enabled": 0
}))
_system_config={"object_pinning_enabled": 0})
@ray.remote
def f(array):
@@ -1,5 +1,4 @@
# coding: utf-8
import json
import logging
import os
import signal
@@ -20,15 +19,15 @@ logger = logging.getLogger(__name__)
@pytest.fixture
def one_worker_100MiB(request):
config = json.dumps({
config = {
"object_store_full_max_retries": 2,
"task_retry_delay_ms": 0,
"initial_reconstruction_timeout_milliseconds": 1000,
})
"object_timeout_milliseconds": 1000,
}
yield ray.init(
num_cpus=1,
object_store_memory=100 * 1024 * 1024,
_internal_config=config)
_system_config=config)
ray.shutdown()
+3 -4
View File
@@ -1,4 +1,3 @@
import json
import numpy as np
import os
import pytest
@@ -23,9 +22,9 @@ def ray_start_reconstruction(request):
"num_cpus": 1,
"object_store_memory": plasma_store_memory // num_nodes,
"redis_max_memory": 10**7,
"_internal_config": json.dumps({
"initial_reconstruction_timeout_milliseconds": 200
})
"_system_config": {
"object_timeout_milliseconds": 200
}
})
for i in range(num_nodes - 1):
cluster.add_node(
+4 -5
View File
@@ -1,5 +1,4 @@
import inspect
import json
import time
import os
import pytest
@@ -45,9 +44,9 @@ def _start_new_cluster():
connect=True,
head_node_args={
"num_cpus": 1,
"_internal_config": json.dumps({
"_system_config": {
"num_heartbeats_timeout": 10
})
}
})
# Pytest doesn't play nicely with imports
register_trainable("__fake_remote", MockRemoteTrainer)
@@ -74,9 +73,9 @@ def start_connected_emptyhead_cluster():
connect=True,
head_node_args={
"num_cpus": 0,
"_internal_config": json.dumps({
"_system_config": {
"num_heartbeats_timeout": 10
})
}
})
# Pytest doesn't play nicely with imports
_register_all()
@@ -1,5 +1,4 @@
# coding: utf-8
import json
import unittest
import ray
@@ -190,9 +189,9 @@ class RayExecutorQueueTest(unittest.TestCase):
connect=True,
head_node_args={
"num_cpus": 1,
"_internal_config": json.dumps({
"_system_config": {
"num_heartbeats_timeout": 10
})
}
})
# Pytest doesn't play nicely with imports
_register_all()
+11 -10
View File
@@ -107,7 +107,6 @@ class Worker:
self.actors = {}
# Information used to maintain actor checkpoints.
self.actor_checkpoint_info = {}
self.actor_task_counter = 0
# When the worker is constructed. Record the original value of the
# CUDA_VISIBLE_DEVICES environment variable.
self.original_gpu_ids = ray.utils.get_cuda_visible_devices()
@@ -515,7 +514,7 @@ def init(address=None,
load_code_from_local=False,
java_worker_options=None,
use_pickle=True,
_internal_config=None,
_system_config=None,
lru_evict=False,
enable_object_reconstruction=False,
_metrics_export_port=None,
@@ -631,8 +630,9 @@ def init(address=None,
module or from the GCS.
java_worker_options: Overwrite the options to start Java workers.
use_pickle: Deprecated.
_internal_config (str): JSON configuration for overriding
RayConfig defaults. For testing purposes ONLY.
_system_config (dict): Configuration for overriding RayConfig
defaults. Used to set system configuration and for experimental Ray
core feature flags.
lru_evict (bool): If True, when an object store is full, it will evict
objects in LRU order to make more space and when under memory
pressure, ray.UnreconstructableError may be thrown. If False, then
@@ -706,8 +706,9 @@ def init(address=None,
raylet_ip_address = node_ip_address
_internal_config = (json.loads(_internal_config)
if _internal_config else {})
_system_config = _system_config or {}
if not isinstance(_system_config, dict):
raise TypeError("The _system_config must be a dict.")
global _global_node
if redis_address is None:
@@ -742,7 +743,7 @@ def init(address=None,
load_code_from_local=load_code_from_local,
java_worker_options=java_worker_options,
start_initial_python_workers_for_first_job=True,
_internal_config=_internal_config,
_system_config=_system_config,
lru_evict=lru_evict,
enable_object_reconstruction=enable_object_reconstruction,
metrics_export_port=_metrics_export_port,
@@ -798,9 +799,9 @@ def init(address=None,
if java_worker_options is not None:
raise ValueError("When connecting to an existing cluster, "
"java_worker_options must not be provided.")
if _internal_config is not None and len(_internal_config) != 0:
if _system_config is not None and len(_system_config) != 0:
raise ValueError("When connecting to an existing cluster, "
"_internal_config must not be provided.")
"_system_config must not be provided.")
if lru_evict:
raise ValueError("When connecting to an existing cluster, "
"lru_evict must not be provided.")
@@ -818,7 +819,7 @@ def init(address=None,
object_ref_seed=object_ref_seed,
temp_dir=temp_dir,
load_code_from_local=load_code_from_local,
_internal_config=_internal_config,
_system_config=_system_config,
lru_evict=lru_evict,
enable_object_reconstruction=enable_object_reconstruction,
metrics_export_port=_metrics_export_port)
+3 -3
View File
@@ -122,13 +122,13 @@ if __name__ == "__main__":
object_spilling_config = {}
external_storage.setup_external_storage(object_spilling_config)
internal_config = {}
system_config = {}
if args.config_list is not None:
config_list = args.config_list.split(",")
if len(config_list) > 1:
i = 0
while i < len(config_list):
internal_config[config_list[i]] = config_list[i + 1]
system_config[config_list[i]] = config_list[i + 1]
i += 2
raylet_ip_address = args.raylet_ip_address
@@ -146,7 +146,7 @@ if __name__ == "__main__":
temp_dir=args.temp_dir,
load_code_from_local=args.load_code_from_local,
metrics_agent_port=args.metrics_agent_port,
_internal_config=json.dumps(internal_config),
_system_config=system_config,
)
node = ray.node.Node(