[api] API deprecations and cleanups for 1.0 (internal_config and Checkpointable actor) (#10333)

* remove * internal config updates, remove Checkpointable * Lower object timeout default * remove json * Fix flaky test * Fix unit test
2026-06-30 06:46:40 +08:00 · 2020-08-27 10:19:53 -07:00
parent 0aec4cbccb
commit f75dfd60a3
56 changed files with 239 additions and 1267 deletions
@@ -1,5 +1,4 @@
 import collections
-import json
 import numpy as np
 import os
 import pytest
@@ -8,94 +7,22 @@ import sys
 import time

 import ray
-import ray.ray_constants as ray_constants
 import ray.test_utils
 import ray.cluster_utils
 from ray.test_utils import (
    wait_for_condition,
    wait_for_pid_to_exit,
-    generate_internal_config_map,
+    generate_system_config_map,
    get_other_nodes,
    SignalActor,
-    get_error_message,
 )

 SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM


-@pytest.fixture
-def ray_checkpointable_actor_cls(request):
-    checkpoint_dir = os.path.join(ray.utils.get_user_temp_dir(),
-                                  "ray_temp_checkpoint_dir") + os.sep
-    if not os.path.isdir(checkpoint_dir):
-        os.mkdir(checkpoint_dir)
-
-    class CheckpointableActor(ray.actor.Checkpointable):
-        def __init__(self):
-            self.value = 0
-            self.resumed_from_checkpoint = False
-            self.checkpoint_dir = checkpoint_dir
-
-        def node_id(self):
-            return ray.worker.global_worker.node.unique_id
-
-        def increase(self):
-            self.value += 1
-            return self.value
-
-        def get(self):
-            return self.value
-
-        def was_resumed_from_checkpoint(self):
-            return self.resumed_from_checkpoint
-
-        def get_pid(self):
-            return os.getpid()
-
-        def should_checkpoint(self, checkpoint_context):
-            # Checkpoint the actor when value is increased to 3.
-            should_checkpoint = self.value == 3
-            return should_checkpoint
-
-        def save_checkpoint(self, actor_id, checkpoint_id):
-            actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex()
-            # Save checkpoint into a file.
-            with open(self.checkpoint_dir + actor_id, "a+") as f:
-                print(checkpoint_id, self.value, file=f)
-
-        def load_checkpoint(self, actor_id, available_checkpoints):
-            actor_id = actor_id.hex()
-            filename = self.checkpoint_dir + actor_id
-            # Load checkpoint from the file.
-            if not os.path.isfile(filename):
-                return None
-
-            available_checkpoint_ids = [
-                c.checkpoint_id for c in available_checkpoints
-            ]
-            with open(filename, "r") as f:
-                for line in f:
-                    checkpoint_id, value = line.strip().split(" ")
-                    checkpoint_id = ray.ActorCheckpointID(
-                        ray.utils.hex_to_binary(checkpoint_id))
-                    if checkpoint_id in available_checkpoint_ids:
-                        self.value = int(value)
-                        self.resumed_from_checkpoint = True
-                        return checkpoint_id
-                return None
-
-        def checkpoint_expired(self, actor_id, checkpoint_id):
-            pass
-
-    return CheckpointableActor
-
-
@pytest.fixture
 def ray_init_with_task_retry_delay():
-    address = ray.init(
-        _internal_config=json.dumps({
-            "task_retry_delay_ms": 100
-        }))
+    address = ray.init(_system_config={"task_retry_delay_ms": 100})
    yield address
    ray.shutdown()

@@ -284,15 +211,15 @@ def test_actor_restart_with_retry(ray_init_with_task_retry_delay):


 def test_actor_restart_on_node_failure(ray_start_cluster):
-    config = json.dumps({
+    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-        "initial_reconstruction_timeout_milliseconds": 1000,
+        "object_timeout_milliseconds": 1000,
        "task_retry_delay_ms": 100,
-    })
+    }
    cluster = ray_start_cluster
    # Head node with no resources.
-    cluster.add_node(num_cpus=0, _internal_config=config)
+    cluster.add_node(num_cpus=0, _system_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

@@ -441,15 +368,14 @@ def test_caller_task_reconstruction(ray_start_regular):
    assert ray.get(RetryableTask.remote(remote_actor)) == 3


-# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
+# NOTE(hchen): we set object_timeout_milliseconds to 1s for
 # this test. Because if this value is too small, suprious task reconstruction
 # may happen and cause the test fauilure. If the value is too large, this test
 # could be very slow. We can remove this once we support dynamic timeout.
@pytest.mark.parametrize(
    "ray_start_cluster_head", [
-        generate_internal_config_map(
-            initial_reconstruction_timeout_milliseconds=1000,
-            num_heartbeats_timeout=10)
+        generate_system_config_map(
+            object_timeout_milliseconds=1000, num_heartbeats_timeout=10)
    ],
    indirect=True)
 def test_multiple_actor_restart(ray_start_cluster_head):
@@ -520,287 +446,6 @@ def kill_actor(actor):
    wait_for_pid_to_exit(pid)


-@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
-    """Test actor checkpointing and restoring from a checkpoint."""
-    actor = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls).remote()
-    # Call increase 3 times, triggering a checkpoint.
-    expected = 0
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Assert that the actor wasn't resumed from a checkpoint.
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-    # Kill actor process.
-    kill_actor(actor)
-    # Assert that the actor was resumed from a checkpoint and its value is
-    # still correct.
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
-
-    # Submit some more tasks. These should get replayed since they happen after
-    # the checkpoint.
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Kill actor again and check that restart still works after the
-    # actor resuming from a checkpoint.
-    kill_actor(actor)
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
-
-
-@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
-    """Test checkpointing of a remote actor through method invocation."""
-
-    # Define a class that exposes a method to save checkpoints.
-    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
-        def __init__(self):
-            super(RemoteCheckpointableActor, self).__init__()
-            self._should_checkpoint = False
-
-        def checkpoint(self):
-            self._should_checkpoint = True
-
-        def should_checkpoint(self, checkpoint_context):
-            should_checkpoint = self._should_checkpoint
-            self._should_checkpoint = False
-            return should_checkpoint
-
-    cls = ray.remote(max_restarts=2)(RemoteCheckpointableActor)
-    actor = cls.remote()
-    # Call increase 3 times.
-    expected = 0
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Call a checkpoint task.
-    actor.checkpoint.remote()
-    # Assert that the actor wasn't resumed from a checkpoint.
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-    # Kill actor process.
-    kill_actor(actor)
-    # Assert that the actor was resumed from a checkpoint and its value is
-    # still correct.
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
-
-    # Submit some more tasks. These should get replayed since they happen after
-    # the checkpoint.
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Kill actor again and check that restart still works after the
-    # actor resuming from a checkpoint.
-    kill_actor(actor)
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
-
-
-@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
-                                       ray_checkpointable_actor_cls):
-    """Test actor checkpointing on a remote node."""
-    # Place the actor on the remote node.
-    cluster = ray_start_cluster_2_nodes
-    remote_node = list(cluster.worker_nodes)
-    actor_cls = ray.remote(max_restarts=1)(ray_checkpointable_actor_cls)
-    actor = actor_cls.remote()
-    while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
-        actor = actor_cls.remote()
-
-    # Call increase several times.
-    expected = 0
-    for _ in range(6):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Assert that the actor wasn't resumed from a checkpoint.
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-    # Kill actor process.
-    cluster.remove_node(remote_node[0])
-    # Assert that the actor was resumed from a checkpoint and its value is
-    # still correct.
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
-
-
-@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_checkpointing_save_exception(ray_start_regular, error_pubsub,
-                                      ray_checkpointable_actor_cls):
-    """Test actor can still be recovered if checkpoints fail to complete."""
-
-    p = error_pubsub
-
-    @ray.remote(max_restarts=2)
-    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
-        def save_checkpoint(self, actor_id, checkpoint_context):
-            raise Exception("Intentional error saving checkpoint.")
-
-    actor = RemoteCheckpointableActor.remote()
-    # Call increase 3 times, triggering a checkpoint that will fail.
-    expected = 0
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Assert that the actor wasn't resumed from a checkpoint.
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-    # Kill actor process.
-    kill_actor(actor)
-    # Assert that the actor still wasn't resumed from a checkpoint and its
-    # value is still correct.
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-
-    # Submit some more tasks. These should get replayed since they happen after
-    # the checkpoint.
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Kill actor again, and check that restart still works and the actor
-    # wasn't resumed from a checkpoint.
-    kill_actor(actor)
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-
-    # Check that the checkpoint error was pushed to the driver.
-    errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
-    assert len(errors) == 1
-    assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
-
-
-@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_checkpointing_load_exception(ray_start_regular, error_pubsub,
-                                      ray_checkpointable_actor_cls):
-    """Test actor can still be recovered if checkpoints fail to load."""
-
-    p = error_pubsub
-
-    @ray.remote(max_restarts=2)
-    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
-        def load_checkpoint(self, actor_id, checkpoints):
-            raise Exception("Intentional error loading checkpoint.")
-
-    actor = RemoteCheckpointableActor.remote()
-    # Call increase 3 times, triggering a checkpoint that will succeed.
-    expected = 0
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Assert that the actor wasn't resumed from a checkpoint because loading
-    # it failed.
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-    # Kill actor process.
-    kill_actor(actor)
-    # Assert that the actor still wasn't resumed from a checkpoint and its
-    # value is still correct.
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-
-    # Submit some more tasks. These should get replayed since they happen after
-    # the checkpoint.
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Kill actor again, and check that restart still works and the actor
-    # wasn't resumed from a checkpoint.
-    kill_actor(actor)
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-
-    # Check that the checkpoint error was pushed to the driver.
-    errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
-    assert len(errors) == 1
-    assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
-
-
-@pytest.mark.parametrize(
-    "ray_start_regular",
-    # This overwrite currently isn't effective,
-    # see https://github.com/ray-project/ray/issues/3926.
-    [generate_internal_config_map(num_actor_checkpoints_to_keep=20)],
-    indirect=True,
-)
-def test_deleting_actor_checkpoint(ray_start_regular):
-    """Test deleting old actor checkpoints."""
-
-    @ray.remote
-    class CheckpointableActor(ray.actor.Checkpointable):
-        def __init__(self):
-            self.checkpoint_ids = []
-
-        def get_checkpoint_ids(self):
-            return self.checkpoint_ids
-
-        def should_checkpoint(self, checkpoint_context):
-            # Save checkpoints after every task
-            return True
-
-        def save_checkpoint(self, actor_id, checkpoint_id):
-            self.checkpoint_ids.append(checkpoint_id)
-            pass
-
-        def load_checkpoint(self, actor_id, available_checkpoints):
-            pass
-
-        def checkpoint_expired(self, actor_id, checkpoint_id):
-            assert checkpoint_id == self.checkpoint_ids[0]
-            del self.checkpoint_ids[0]
-
-    actor = CheckpointableActor.remote()
-    for i in range(19):
-        assert len(ray.get(actor.get_checkpoint_ids.remote())) == i + 1
-    for _ in range(20):
-        assert len(ray.get(actor.get_checkpoint_ids.remote())) == 20
-
-
-def test_bad_checkpointable_actor_class():
-    """Test error raised if an actor class doesn't implement all abstract
-    methods in the Checkpointable interface."""
-
-    with pytest.raises(TypeError):
-
-        @ray.remote
-        class BadCheckpointableActor(ray.actor.Checkpointable):
-            def should_checkpoint(self, checkpoint_context):
-                return True
-
-
-def test_init_exception_in_checkpointable_actor(
-        ray_start_regular, error_pubsub, ray_checkpointable_actor_cls):
-    # This test is similar to test_failure.py::test_failed_actor_init.
-    # This test is used to guarantee that checkpointable actor does not
-    # break the same logic.
-    error_message1 = "actor constructor failed"
-    error_message2 = "actor method failed"
-
-    p = error_pubsub
-
-    @ray.remote
-    class CheckpointableFailedActor(ray_checkpointable_actor_cls):
-        def __init__(self):
-            raise Exception(error_message1)
-
-        def fail_method(self):
-            raise Exception(error_message2)
-
-        def should_checkpoint(self, checkpoint_context):
-            return True
-
-    a = CheckpointableFailedActor.remote()
-
-    # Make sure that we get errors from a failed constructor.
-    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
-    assert len(errors) == 1
-    assert error_message1 in errors[0].error_message
-
-    # Make sure that we get errors from a failed method.
-    a.fail_method.remote()
-    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
-    assert len(errors) == 1
-    assert error_message1 in errors[0].error_message
-
-
 def test_decorated_method(ray_start_regular):
    def method_invocation_decorator(f):
        def new_f_invocation(args, kwargs):
@@ -987,7 +632,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
            return self.dependency

    # Make sure it is scheduled in the second node.
-    @ray.remote(resources={"node": 1}, num_cpus=1)
+    @ray.remote(resources={"node": 1})
    class Owner:
        def get_pid(self):
            return os.getpid()
@@ -1004,7 +649,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
            # Wait until the `Caller` start executing the remote `call` method.
            ray.get(signal_handle.wait.remote())

-    @ray.remote
+    @ray.remote(resources={"caller": 1})
    class Caller:
        def call(self, owner_pid, signal_handle, actor_handle):
            # Notify the `Owner` that the `Caller` is executing the remote
@@ -1020,15 +665,15 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
            return True

    cluster = ray_start_cluster
-    node_to_be_broken = cluster.add_node(num_cpus=1, resources={"node": 1})
+    node_to_be_broken = cluster.add_node(resources={"node": 1})
+    cluster.add_node(resources={"caller": 1})

    owner = Owner.remote()
    owner_pid = ray.get(owner.get_pid.remote())

    caller = Caller.remote()
-    owner.create_actor.remote(caller)
+    ray.get(owner.create_actor.remote(caller))
    cluster.remove_node(node_to_be_broken)
-    # Wait for the `Owner` to exit.
    wait_for_pid_to_exit(owner_pid)

    # It will hang here if location is not properly resolved.