mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 06:46:40 +08:00
[api] API deprecations and cleanups for 1.0 (internal_config and Checkpointable actor) (#10333)
* remove * internal config updates, remove Checkpointable * Lower object timeout default * remove json * Fix flaky test * Fix unit test
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
import collections
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
@@ -8,94 +7,22 @@ import sys
|
||||
import time
|
||||
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray.test_utils
|
||||
import ray.cluster_utils
|
||||
from ray.test_utils import (
|
||||
wait_for_condition,
|
||||
wait_for_pid_to_exit,
|
||||
generate_internal_config_map,
|
||||
generate_system_config_map,
|
||||
get_other_nodes,
|
||||
SignalActor,
|
||||
get_error_message,
|
||||
)
|
||||
|
||||
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_checkpointable_actor_cls(request):
|
||||
checkpoint_dir = os.path.join(ray.utils.get_user_temp_dir(),
|
||||
"ray_temp_checkpoint_dir") + os.sep
|
||||
if not os.path.isdir(checkpoint_dir):
|
||||
os.mkdir(checkpoint_dir)
|
||||
|
||||
class CheckpointableActor(ray.actor.Checkpointable):
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
self.resumed_from_checkpoint = False
|
||||
self.checkpoint_dir = checkpoint_dir
|
||||
|
||||
def node_id(self):
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
def increase(self):
|
||||
self.value += 1
|
||||
return self.value
|
||||
|
||||
def get(self):
|
||||
return self.value
|
||||
|
||||
def was_resumed_from_checkpoint(self):
|
||||
return self.resumed_from_checkpoint
|
||||
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
# Checkpoint the actor when value is increased to 3.
|
||||
should_checkpoint = self.value == 3
|
||||
return should_checkpoint
|
||||
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
|
||||
actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex()
|
||||
# Save checkpoint into a file.
|
||||
with open(self.checkpoint_dir + actor_id, "a+") as f:
|
||||
print(checkpoint_id, self.value, file=f)
|
||||
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
actor_id = actor_id.hex()
|
||||
filename = self.checkpoint_dir + actor_id
|
||||
# Load checkpoint from the file.
|
||||
if not os.path.isfile(filename):
|
||||
return None
|
||||
|
||||
available_checkpoint_ids = [
|
||||
c.checkpoint_id for c in available_checkpoints
|
||||
]
|
||||
with open(filename, "r") as f:
|
||||
for line in f:
|
||||
checkpoint_id, value = line.strip().split(" ")
|
||||
checkpoint_id = ray.ActorCheckpointID(
|
||||
ray.utils.hex_to_binary(checkpoint_id))
|
||||
if checkpoint_id in available_checkpoint_ids:
|
||||
self.value = int(value)
|
||||
self.resumed_from_checkpoint = True
|
||||
return checkpoint_id
|
||||
return None
|
||||
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
|
||||
pass
|
||||
|
||||
return CheckpointableActor
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_init_with_task_retry_delay():
|
||||
address = ray.init(
|
||||
_internal_config=json.dumps({
|
||||
"task_retry_delay_ms": 100
|
||||
}))
|
||||
address = ray.init(_system_config={"task_retry_delay_ms": 100})
|
||||
yield address
|
||||
ray.shutdown()
|
||||
|
||||
@@ -284,15 +211,15 @@ def test_actor_restart_with_retry(ray_init_with_task_retry_delay):
|
||||
|
||||
|
||||
def test_actor_restart_on_node_failure(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 1000,
|
||||
"object_timeout_milliseconds": 1000,
|
||||
"task_retry_delay_ms": 100,
|
||||
})
|
||||
}
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
cluster.add_node(num_cpus=0, _system_config=config)
|
||||
cluster.wait_for_nodes()
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@@ -441,15 +368,14 @@ def test_caller_task_reconstruction(ray_start_regular):
|
||||
assert ray.get(RetryableTask.remote(remote_actor)) == 3
|
||||
|
||||
|
||||
# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
|
||||
# NOTE(hchen): we set object_timeout_milliseconds to 1s for
|
||||
# this test. Because if this value is too small, suprious task reconstruction
|
||||
# may happen and cause the test fauilure. If the value is too large, this test
|
||||
# could be very slow. We can remove this once we support dynamic timeout.
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_internal_config_map(
|
||||
initial_reconstruction_timeout_milliseconds=1000,
|
||||
num_heartbeats_timeout=10)
|
||||
generate_system_config_map(
|
||||
object_timeout_milliseconds=1000, num_heartbeats_timeout=10)
|
||||
],
|
||||
indirect=True)
|
||||
def test_multiple_actor_restart(ray_start_cluster_head):
|
||||
@@ -520,287 +446,6 @@ def kill_actor(actor):
|
||||
wait_for_pid_to_exit(pid)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing and restoring from a checkpoint."""
|
||||
actor = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls).remote()
|
||||
# Call increase 3 times, triggering a checkpoint.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again and check that restart still works after the
|
||||
# actor resuming from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test checkpointing of a remote actor through method invocation."""
|
||||
|
||||
# Define a class that exposes a method to save checkpoints.
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def __init__(self):
|
||||
super(RemoteCheckpointableActor, self).__init__()
|
||||
self._should_checkpoint = False
|
||||
|
||||
def checkpoint(self):
|
||||
self._should_checkpoint = True
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
should_checkpoint = self._should_checkpoint
|
||||
self._should_checkpoint = False
|
||||
return should_checkpoint
|
||||
|
||||
cls = ray.remote(max_restarts=2)(RemoteCheckpointableActor)
|
||||
actor = cls.remote()
|
||||
# Call increase 3 times.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Call a checkpoint task.
|
||||
actor.checkpoint.remote()
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again and check that restart still works after the
|
||||
# actor resuming from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing on a remote node."""
|
||||
# Place the actor on the remote node.
|
||||
cluster = ray_start_cluster_2_nodes
|
||||
remote_node = list(cluster.worker_nodes)
|
||||
actor_cls = ray.remote(max_restarts=1)(ray_checkpointable_actor_cls)
|
||||
actor = actor_cls.remote()
|
||||
while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
|
||||
actor = actor_cls.remote()
|
||||
|
||||
# Call increase several times.
|
||||
expected = 0
|
||||
for _ in range(6):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
cluster.remove_node(remote_node[0])
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_save_exception(ray_start_regular, error_pubsub,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to complete."""
|
||||
|
||||
p = error_pubsub
|
||||
|
||||
@ray.remote(max_restarts=2)
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def save_checkpoint(self, actor_id, checkpoint_context):
|
||||
raise Exception("Intentional error saving checkpoint.")
|
||||
|
||||
actor = RemoteCheckpointableActor.remote()
|
||||
# Call increase 3 times, triggering a checkpoint that will fail.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor still wasn't resumed from a checkpoint and its
|
||||
# value is still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again, and check that restart still works and the actor
|
||||
# wasn't resumed from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Check that the checkpoint error was pushed to the driver.
|
||||
errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_load_exception(ray_start_regular, error_pubsub,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to load."""
|
||||
|
||||
p = error_pubsub
|
||||
|
||||
@ray.remote(max_restarts=2)
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def load_checkpoint(self, actor_id, checkpoints):
|
||||
raise Exception("Intentional error loading checkpoint.")
|
||||
|
||||
actor = RemoteCheckpointableActor.remote()
|
||||
# Call increase 3 times, triggering a checkpoint that will succeed.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint because loading
|
||||
# it failed.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor still wasn't resumed from a checkpoint and its
|
||||
# value is still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again, and check that restart still works and the actor
|
||||
# wasn't resumed from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Check that the checkpoint error was pushed to the driver.
|
||||
errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_regular",
|
||||
# This overwrite currently isn't effective,
|
||||
# see https://github.com/ray-project/ray/issues/3926.
|
||||
[generate_internal_config_map(num_actor_checkpoints_to_keep=20)],
|
||||
indirect=True,
|
||||
)
|
||||
def test_deleting_actor_checkpoint(ray_start_regular):
|
||||
"""Test deleting old actor checkpoints."""
|
||||
|
||||
@ray.remote
|
||||
class CheckpointableActor(ray.actor.Checkpointable):
|
||||
def __init__(self):
|
||||
self.checkpoint_ids = []
|
||||
|
||||
def get_checkpoint_ids(self):
|
||||
return self.checkpoint_ids
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
# Save checkpoints after every task
|
||||
return True
|
||||
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
|
||||
self.checkpoint_ids.append(checkpoint_id)
|
||||
pass
|
||||
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
pass
|
||||
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
|
||||
assert checkpoint_id == self.checkpoint_ids[0]
|
||||
del self.checkpoint_ids[0]
|
||||
|
||||
actor = CheckpointableActor.remote()
|
||||
for i in range(19):
|
||||
assert len(ray.get(actor.get_checkpoint_ids.remote())) == i + 1
|
||||
for _ in range(20):
|
||||
assert len(ray.get(actor.get_checkpoint_ids.remote())) == 20
|
||||
|
||||
|
||||
def test_bad_checkpointable_actor_class():
|
||||
"""Test error raised if an actor class doesn't implement all abstract
|
||||
methods in the Checkpointable interface."""
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
|
||||
@ray.remote
|
||||
class BadCheckpointableActor(ray.actor.Checkpointable):
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
return True
|
||||
|
||||
|
||||
def test_init_exception_in_checkpointable_actor(
|
||||
ray_start_regular, error_pubsub, ray_checkpointable_actor_cls):
|
||||
# This test is similar to test_failure.py::test_failed_actor_init.
|
||||
# This test is used to guarantee that checkpointable actor does not
|
||||
# break the same logic.
|
||||
error_message1 = "actor constructor failed"
|
||||
error_message2 = "actor method failed"
|
||||
|
||||
p = error_pubsub
|
||||
|
||||
@ray.remote
|
||||
class CheckpointableFailedActor(ray_checkpointable_actor_cls):
|
||||
def __init__(self):
|
||||
raise Exception(error_message1)
|
||||
|
||||
def fail_method(self):
|
||||
raise Exception(error_message2)
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
return True
|
||||
|
||||
a = CheckpointableFailedActor.remote()
|
||||
|
||||
# Make sure that we get errors from a failed constructor.
|
||||
errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert error_message1 in errors[0].error_message
|
||||
|
||||
# Make sure that we get errors from a failed method.
|
||||
a.fail_method.remote()
|
||||
errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert error_message1 in errors[0].error_message
|
||||
|
||||
|
||||
def test_decorated_method(ray_start_regular):
|
||||
def method_invocation_decorator(f):
|
||||
def new_f_invocation(args, kwargs):
|
||||
@@ -987,7 +632,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
|
||||
return self.dependency
|
||||
|
||||
# Make sure it is scheduled in the second node.
|
||||
@ray.remote(resources={"node": 1}, num_cpus=1)
|
||||
@ray.remote(resources={"node": 1})
|
||||
class Owner:
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
@@ -1004,7 +649,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
|
||||
# Wait until the `Caller` start executing the remote `call` method.
|
||||
ray.get(signal_handle.wait.remote())
|
||||
|
||||
@ray.remote
|
||||
@ray.remote(resources={"caller": 1})
|
||||
class Caller:
|
||||
def call(self, owner_pid, signal_handle, actor_handle):
|
||||
# Notify the `Owner` that the `Caller` is executing the remote
|
||||
@@ -1020,15 +665,15 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
|
||||
return True
|
||||
|
||||
cluster = ray_start_cluster
|
||||
node_to_be_broken = cluster.add_node(num_cpus=1, resources={"node": 1})
|
||||
node_to_be_broken = cluster.add_node(resources={"node": 1})
|
||||
cluster.add_node(resources={"caller": 1})
|
||||
|
||||
owner = Owner.remote()
|
||||
owner_pid = ray.get(owner.get_pid.remote())
|
||||
|
||||
caller = Caller.remote()
|
||||
owner.create_actor.remote(caller)
|
||||
ray.get(owner.create_actor.remote(caller))
|
||||
cluster.remove_node(node_to_be_broken)
|
||||
# Wait for the `Owner` to exit.
|
||||
wait_for_pid_to_exit(owner_pid)
|
||||
|
||||
# It will hang here if location is not properly resolved.
|
||||
|
||||
Reference in New Issue
Block a user