[api] API deprecations and cleanups for 1.0 (internal_config and Checkpointable actor) (#10333)

* remove

* internal config updates, remove Checkpointable

* Lower object timeout default

* remove json

* Fix flaky test

* Fix unit test
This commit is contained in:
Stephanie Wang
2020-08-27 10:19:53 -07:00
committed by GitHub
parent 0aec4cbccb
commit f75dfd60a3
56 changed files with 239 additions and 1267 deletions
+14 -369
View File
@@ -1,5 +1,4 @@
import collections
import json
import numpy as np
import os
import pytest
@@ -8,94 +7,22 @@ import sys
import time
import ray
import ray.ray_constants as ray_constants
import ray.test_utils
import ray.cluster_utils
from ray.test_utils import (
wait_for_condition,
wait_for_pid_to_exit,
generate_internal_config_map,
generate_system_config_map,
get_other_nodes,
SignalActor,
get_error_message,
)
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
@pytest.fixture
def ray_checkpointable_actor_cls(request):
checkpoint_dir = os.path.join(ray.utils.get_user_temp_dir(),
"ray_temp_checkpoint_dir") + os.sep
if not os.path.isdir(checkpoint_dir):
os.mkdir(checkpoint_dir)
class CheckpointableActor(ray.actor.Checkpointable):
def __init__(self):
self.value = 0
self.resumed_from_checkpoint = False
self.checkpoint_dir = checkpoint_dir
def node_id(self):
return ray.worker.global_worker.node.unique_id
def increase(self):
self.value += 1
return self.value
def get(self):
return self.value
def was_resumed_from_checkpoint(self):
return self.resumed_from_checkpoint
def get_pid(self):
return os.getpid()
def should_checkpoint(self, checkpoint_context):
# Checkpoint the actor when value is increased to 3.
should_checkpoint = self.value == 3
return should_checkpoint
def save_checkpoint(self, actor_id, checkpoint_id):
actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex()
# Save checkpoint into a file.
with open(self.checkpoint_dir + actor_id, "a+") as f:
print(checkpoint_id, self.value, file=f)
def load_checkpoint(self, actor_id, available_checkpoints):
actor_id = actor_id.hex()
filename = self.checkpoint_dir + actor_id
# Load checkpoint from the file.
if not os.path.isfile(filename):
return None
available_checkpoint_ids = [
c.checkpoint_id for c in available_checkpoints
]
with open(filename, "r") as f:
for line in f:
checkpoint_id, value = line.strip().split(" ")
checkpoint_id = ray.ActorCheckpointID(
ray.utils.hex_to_binary(checkpoint_id))
if checkpoint_id in available_checkpoint_ids:
self.value = int(value)
self.resumed_from_checkpoint = True
return checkpoint_id
return None
def checkpoint_expired(self, actor_id, checkpoint_id):
pass
return CheckpointableActor
@pytest.fixture
def ray_init_with_task_retry_delay():
address = ray.init(
_internal_config=json.dumps({
"task_retry_delay_ms": 100
}))
address = ray.init(_system_config={"task_retry_delay_ms": 100})
yield address
ray.shutdown()
@@ -284,15 +211,15 @@ def test_actor_restart_with_retry(ray_init_with_task_retry_delay):
def test_actor_restart_on_node_failure(ray_start_cluster):
config = json.dumps({
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 1000,
"object_timeout_milliseconds": 1000,
"task_retry_delay_ms": 100,
})
}
cluster = ray_start_cluster
# Head node with no resources.
cluster.add_node(num_cpus=0, _internal_config=config)
cluster.add_node(num_cpus=0, _system_config=config)
cluster.wait_for_nodes()
ray.init(address=cluster.address)
@@ -441,15 +368,14 @@ def test_caller_task_reconstruction(ray_start_regular):
assert ray.get(RetryableTask.remote(remote_actor)) == 3
# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
# NOTE(hchen): we set object_timeout_milliseconds to 1s for
# this test. Because if this value is too small, suprious task reconstruction
# may happen and cause the test fauilure. If the value is too large, this test
# could be very slow. We can remove this once we support dynamic timeout.
@pytest.mark.parametrize(
"ray_start_cluster_head", [
generate_internal_config_map(
initial_reconstruction_timeout_milliseconds=1000,
num_heartbeats_timeout=10)
generate_system_config_map(
object_timeout_milliseconds=1000, num_heartbeats_timeout=10)
],
indirect=True)
def test_multiple_actor_restart(ray_start_cluster_head):
@@ -520,287 +446,6 @@ def kill_actor(actor):
wait_for_pid_to_exit(pid)
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
"""Test actor checkpointing and restoring from a checkpoint."""
actor = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls).remote()
# Call increase 3 times, triggering a checkpoint.
expected = 0
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Assert that the actor wasn't resumed from a checkpoint.
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Kill actor process.
kill_actor(actor)
# Assert that the actor was resumed from a checkpoint and its value is
# still correct.
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
# Submit some more tasks. These should get replayed since they happen after
# the checkpoint.
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again and check that restart still works after the
# actor resuming from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
"""Test checkpointing of a remote actor through method invocation."""
# Define a class that exposes a method to save checkpoints.
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
def __init__(self):
super(RemoteCheckpointableActor, self).__init__()
self._should_checkpoint = False
def checkpoint(self):
self._should_checkpoint = True
def should_checkpoint(self, checkpoint_context):
should_checkpoint = self._should_checkpoint
self._should_checkpoint = False
return should_checkpoint
cls = ray.remote(max_restarts=2)(RemoteCheckpointableActor)
actor = cls.remote()
# Call increase 3 times.
expected = 0
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Call a checkpoint task.
actor.checkpoint.remote()
# Assert that the actor wasn't resumed from a checkpoint.
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Kill actor process.
kill_actor(actor)
# Assert that the actor was resumed from a checkpoint and its value is
# still correct.
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
# Submit some more tasks. These should get replayed since they happen after
# the checkpoint.
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again and check that restart still works after the
# actor resuming from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
ray_checkpointable_actor_cls):
"""Test actor checkpointing on a remote node."""
# Place the actor on the remote node.
cluster = ray_start_cluster_2_nodes
remote_node = list(cluster.worker_nodes)
actor_cls = ray.remote(max_restarts=1)(ray_checkpointable_actor_cls)
actor = actor_cls.remote()
while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
actor = actor_cls.remote()
# Call increase several times.
expected = 0
for _ in range(6):
ray.get(actor.increase.remote())
expected += 1
# Assert that the actor wasn't resumed from a checkpoint.
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Kill actor process.
cluster.remove_node(remote_node[0])
# Assert that the actor was resumed from a checkpoint and its value is
# still correct.
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing_save_exception(ray_start_regular, error_pubsub,
ray_checkpointable_actor_cls):
"""Test actor can still be recovered if checkpoints fail to complete."""
p = error_pubsub
@ray.remote(max_restarts=2)
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
def save_checkpoint(self, actor_id, checkpoint_context):
raise Exception("Intentional error saving checkpoint.")
actor = RemoteCheckpointableActor.remote()
# Call increase 3 times, triggering a checkpoint that will fail.
expected = 0
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Assert that the actor wasn't resumed from a checkpoint.
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Kill actor process.
kill_actor(actor)
# Assert that the actor still wasn't resumed from a checkpoint and its
# value is still correct.
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Submit some more tasks. These should get replayed since they happen after
# the checkpoint.
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again, and check that restart still works and the actor
# wasn't resumed from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Check that the checkpoint error was pushed to the driver.
errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
assert len(errors) == 1
assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing_load_exception(ray_start_regular, error_pubsub,
ray_checkpointable_actor_cls):
"""Test actor can still be recovered if checkpoints fail to load."""
p = error_pubsub
@ray.remote(max_restarts=2)
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
def load_checkpoint(self, actor_id, checkpoints):
raise Exception("Intentional error loading checkpoint.")
actor = RemoteCheckpointableActor.remote()
# Call increase 3 times, triggering a checkpoint that will succeed.
expected = 0
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Assert that the actor wasn't resumed from a checkpoint because loading
# it failed.
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Kill actor process.
kill_actor(actor)
# Assert that the actor still wasn't resumed from a checkpoint and its
# value is still correct.
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Submit some more tasks. These should get replayed since they happen after
# the checkpoint.
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again, and check that restart still works and the actor
# wasn't resumed from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Check that the checkpoint error was pushed to the driver.
errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
assert len(errors) == 1
assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
@pytest.mark.parametrize(
"ray_start_regular",
# This overwrite currently isn't effective,
# see https://github.com/ray-project/ray/issues/3926.
[generate_internal_config_map(num_actor_checkpoints_to_keep=20)],
indirect=True,
)
def test_deleting_actor_checkpoint(ray_start_regular):
"""Test deleting old actor checkpoints."""
@ray.remote
class CheckpointableActor(ray.actor.Checkpointable):
def __init__(self):
self.checkpoint_ids = []
def get_checkpoint_ids(self):
return self.checkpoint_ids
def should_checkpoint(self, checkpoint_context):
# Save checkpoints after every task
return True
def save_checkpoint(self, actor_id, checkpoint_id):
self.checkpoint_ids.append(checkpoint_id)
pass
def load_checkpoint(self, actor_id, available_checkpoints):
pass
def checkpoint_expired(self, actor_id, checkpoint_id):
assert checkpoint_id == self.checkpoint_ids[0]
del self.checkpoint_ids[0]
actor = CheckpointableActor.remote()
for i in range(19):
assert len(ray.get(actor.get_checkpoint_ids.remote())) == i + 1
for _ in range(20):
assert len(ray.get(actor.get_checkpoint_ids.remote())) == 20
def test_bad_checkpointable_actor_class():
"""Test error raised if an actor class doesn't implement all abstract
methods in the Checkpointable interface."""
with pytest.raises(TypeError):
@ray.remote
class BadCheckpointableActor(ray.actor.Checkpointable):
def should_checkpoint(self, checkpoint_context):
return True
def test_init_exception_in_checkpointable_actor(
ray_start_regular, error_pubsub, ray_checkpointable_actor_cls):
# This test is similar to test_failure.py::test_failed_actor_init.
# This test is used to guarantee that checkpointable actor does not
# break the same logic.
error_message1 = "actor constructor failed"
error_message2 = "actor method failed"
p = error_pubsub
@ray.remote
class CheckpointableFailedActor(ray_checkpointable_actor_cls):
def __init__(self):
raise Exception(error_message1)
def fail_method(self):
raise Exception(error_message2)
def should_checkpoint(self, checkpoint_context):
return True
a = CheckpointableFailedActor.remote()
# Make sure that we get errors from a failed constructor.
errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
assert len(errors) == 1
assert error_message1 in errors[0].error_message
# Make sure that we get errors from a failed method.
a.fail_method.remote()
errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
assert len(errors) == 1
assert error_message1 in errors[0].error_message
def test_decorated_method(ray_start_regular):
def method_invocation_decorator(f):
def new_f_invocation(args, kwargs):
@@ -987,7 +632,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
return self.dependency
# Make sure it is scheduled in the second node.
@ray.remote(resources={"node": 1}, num_cpus=1)
@ray.remote(resources={"node": 1})
class Owner:
def get_pid(self):
return os.getpid()
@@ -1004,7 +649,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
# Wait until the `Caller` start executing the remote `call` method.
ray.get(signal_handle.wait.remote())
@ray.remote
@ray.remote(resources={"caller": 1})
class Caller:
def call(self, owner_pid, signal_handle, actor_handle):
# Notify the `Owner` that the `Caller` is executing the remote
@@ -1020,15 +665,15 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
return True
cluster = ray_start_cluster
node_to_be_broken = cluster.add_node(num_cpus=1, resources={"node": 1})
node_to_be_broken = cluster.add_node(resources={"node": 1})
cluster.add_node(resources={"caller": 1})
owner = Owner.remote()
owner_pid = ray.get(owner.get_pid.remote())
caller = Caller.remote()
owner.create_actor.remote(caller)
ray.get(owner.create_actor.remote(caller))
cluster.remove_node(node_to_be_broken)
# Wait for the `Owner` to exit.
wait_for_pid_to_exit(owner_pid)
# It will hang here if location is not properly resolved.