[core] Enable object reconstruction for retryable actor tasks (#9557)

* Test actor plasma reconstruction * Allow resubmission of actor tasks * doc * Test for actor constructor * Kill PID before removing node * Kill pid before node
2026-06-29 11:01:06 +08:00 · 2020-07-23 21:15:12 -07:00
parent 239196fffc
commit f2705e2c73
8 changed files with 224 additions and 24 deletions
@@ -1,4 +1,6 @@
 import json
+import os
+import signal
 import sys

 import numpy as np
@@ -6,7 +8,11 @@ import pytest

 import ray
 from ray.test_utils import (
-    wait_for_condition, )
+    wait_for_condition,
+    wait_for_pid_to_exit,
+)
+
+# Signal used by the tests below to force-kill an actor process.
+# signal.SIGKILL is not available on Windows, so SIGTERM is used there.
+SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM


 def test_cached_object(ray_start_cluster):
@@ -217,6 +223,161 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
            pass


+@pytest.mark.parametrize("reconstruction_enabled", [False, True])
+def test_basic_reconstruction_actor_task(ray_start_cluster,
+                                         reconstruction_enabled):
+    """Use an actor task's large return value after its node is killed.
+
+    An actor pinned to the "node1" resource returns a large numpy array.
+    The actor process and its node are then killed and a replacement node
+    with the same resources is added.
+
+    * reconstruction_enabled=True: the actor is created with
+      max_task_retries=-1 and a task depending on the lost object is
+      expected to succeed.
+    * reconstruction_enabled=False: max_task_retries=0 and the dependent
+      task is expected to raise RayTaskError.
+
+    In both cases the actor handle must still answer a call afterwards.
+    """
+    config = {
+        "num_heartbeats_timeout": 10,
+        "raylet_heartbeat_timeout_milliseconds": 100,
+        "initial_reconstruction_timeout_milliseconds": 200,
+    }
+    # Workaround to reset the config to the default value.
+    if not reconstruction_enabled:
+        config["lineage_pinning_enabled"] = 0
+    config = json.dumps(config)
+
+    cluster = ray_start_cluster
+    # Head node with no resources.
+    cluster.add_node(
+        num_cpus=0,
+        _internal_config=config,
+        enable_object_reconstruction=reconstruction_enabled)
+    ray.init(address=cluster.address)
+    # Node to place the initial object. It offers two units of "node1" so
+    # the actor and the first dependent task can each claim one.
+    node_to_kill = cluster.add_node(
+        num_cpus=1, resources={"node1": 2}, object_store_memory=10**8)
+    cluster.add_node(
+        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
+    cluster.wait_for_nodes()
+
+    @ray.remote(
+        max_restarts=-1,
+        max_task_retries=-1 if reconstruction_enabled else 0,
+        resources={"node1": 1},
+        num_cpus=0)
+    class Actor:
+        def __init__(self):
+            pass
+
+        def large_object(self):
+            return np.zeros(10**7, dtype=np.uint8)
+
+        def pid(self):
+            return os.getpid()
+
+    @ray.remote
+    def dependent_task(x):
+        return
+
+    a = Actor.remote()
+    pid = ray.get(a.pid.remote())
+    obj = a.large_object.remote()
+    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))
+
+    # Workaround to kill the actor process too since there is a bug where the
+    # actor's plasma client hangs after the plasma store has exited.
+    os.kill(pid, SIGKILL)
+
+    cluster.remove_node(node_to_kill, allow_graceful=False)
+    cluster.add_node(
+        num_cpus=1, resources={"node1": 2}, object_store_memory=10**8)
+
+    wait_for_pid_to_exit(pid)
+
+    if reconstruction_enabled:
+        ray.get(dependent_task.remote(obj))
+    else:
+        # NOTE(review): a nested pytest.raises(UnreconstructableError) used
+        # to follow ray.get() inside this block. It could never run because
+        # ray.get() raises first, so the cause type was never checked.
+        # Confirm the expected cause before adding a live assertion after
+        # this block (via the ExceptionInfo's `.value`).
+        with pytest.raises(ray.exceptions.RayTaskError):
+            ray.get(dependent_task.remote(obj))
+
+    # Make sure the actor handle is still usable.
+    ray.get(a.pid.remote())
+
+
+@pytest.mark.parametrize("reconstruction_enabled", [False, True])
+def test_basic_reconstruction_actor_constructor(ray_start_cluster,
+                                                reconstruction_enabled):
+    """Restart an actor whose constructor argument was lost with its node.
+
+    A large object produced by a normal task is passed both to the actor's
+    constructor and to one of its methods. The actor process and the node
+    holding the object are killed and a replacement node is added.
+
+    * reconstruction_enabled=True: the producing task has max_retries=1 and
+      calling the method with the object is expected to succeed.
+    * reconstruction_enabled=False: max_retries=0 and the method call is
+      expected to raise RayTaskError.
+    """
+    config = {
+        "num_heartbeats_timeout": 10,
+        "raylet_heartbeat_timeout_milliseconds": 100,
+        "initial_reconstruction_timeout_milliseconds": 200,
+    }
+    # Workaround to reset the config to the default value.
+    if not reconstruction_enabled:
+        config["lineage_pinning_enabled"] = 0
+    config = json.dumps(config)
+
+    cluster = ray_start_cluster
+    # Head node with no resources.
+    cluster.add_node(
+        num_cpus=0,
+        _internal_config=config,
+        enable_object_reconstruction=reconstruction_enabled)
+    ray.init(address=cluster.address)
+    # Node to place the initial object.
+    node_to_kill = cluster.add_node(
+        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
+    cluster.add_node(
+        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
+    cluster.wait_for_nodes()
+
+    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
+    def large_object():
+        return np.zeros(10**7, dtype=np.uint8)
+
+    # Both the constructor and a method depend on the large object.
+    @ray.remote(max_restarts=-1)
+    class Actor:
+        def __init__(self, x):
+            pass
+
+        def dependent_task(self, x):
+            return
+
+        def pid(self):
+            return os.getpid()
+
+    obj = large_object.options(resources={"node1": 1}).remote()
+    a = Actor.options(resources={"node1": 1}).remote(obj)
+    ray.get(a.dependent_task.remote(obj))
+    pid = ray.get(a.pid.remote())
+
+    # Workaround to kill the actor process too since there is a bug where the
+    # actor's plasma client hangs after the plasma store has exited.
+    os.kill(pid, SIGKILL)
+
+    cluster.remove_node(node_to_kill, allow_graceful=False)
+    cluster.add_node(
+        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
+
+    wait_for_pid_to_exit(pid)
+
+    # Wait for the actor to restart.
+    def probe():
+        # RayActorError means "keep polling". A successful call, or a
+        # task-level / unreconstructable-object error, ends the wait.
+        try:
+            ray.get(a.dependent_task.remote(obj))
+            return True
+        except ray.exceptions.RayActorError:
+            return False
+        except (ray.exceptions.RayTaskError,
+                ray.exceptions.UnreconstructableError):
+            return True
+
+    wait_for_condition(probe)
+
+    if reconstruction_enabled:
+        ray.get(a.dependent_task.remote(obj))
+    else:
+        # NOTE(review): a nested pytest.raises(UnreconstructableError) used
+        # to follow ray.get() inside this block. It could never run because
+        # ray.get() raises first, so the cause type was never checked.
+        # Confirm the expected cause before adding a live assertion after
+        # this block (via the ExceptionInfo's `.value`).
+        with pytest.raises(ray.exceptions.RayTaskError):
+            ray.get(a.dependent_task.remote(obj))
+
+
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
 def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
    config = {