Mirror of https://github.com/wassname/ray.git (synced 2026-06-28 01:46:10 +08:00)
By default, reconstruction should only be enabled for actor creation. (#6613)
* wip * fix * fix
This commit is contained in:
+1
-6
@@ -768,12 +768,7 @@ def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory, resources,
|
||||
"methods in the `Checkpointable` interface.")
|
||||
|
||||
if max_reconstructions is None:
|
||||
if ray_constants.direct_call_enabled():
|
||||
# Allow the actor creation task to be resubmitted automatically
|
||||
# by default.
|
||||
max_reconstructions = 3
|
||||
else:
|
||||
max_reconstructions = 0
|
||||
max_reconstructions = 0
|
||||
|
||||
if not (ray_constants.NO_RECONSTRUCTION <= max_reconstructions <=
|
||||
ray_constants.INFINITE_RECONSTRUCTION):
|
||||
|
||||
@@ -16,11 +16,8 @@ import time
|
||||
import ray
|
||||
import ray.test_utils
|
||||
import ray.cluster_utils
|
||||
from ray import ray_constants
|
||||
from ray.test_utils import run_string_as_driver
|
||||
|
||||
RAY_FORCE_DIRECT = ray_constants.direct_call_enabled()
|
||||
|
||||
|
||||
def test_actor_init_error_propagated(ray_start_regular):
|
||||
@ray.remote
|
||||
@@ -810,7 +807,6 @@ def test_exception_raised_when_actor_node_dies(ray_start_cluster_head):
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Hanging with new GCS API.")
|
||||
@pytest.mark.skipif(RAY_FORCE_DIRECT, reason="no ft yet")
|
||||
def test_actor_init_fails(ray_start_cluster_head):
|
||||
cluster = ray_start_cluster_head
|
||||
remote_node = cluster.add_node()
|
||||
|
||||
@@ -12,8 +12,6 @@ import pytest
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
|
||||
RAY_FORCE_DIRECT = ray_constants.direct_call_enabled()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
@@ -64,19 +62,16 @@ def test_actor_creation_node_failure(ray_start_cluster):
|
||||
except ray.exceptions.RayActorError:
|
||||
children[i] = Child.remote(death_probability)
|
||||
|
||||
if (RAY_FORCE_DIRECT):
|
||||
children_out = [
|
||||
child.get_probability.remote() for child in children
|
||||
]
|
||||
# Wait for new created actors to finish creation before
|
||||
# removing a node. This is needed because right now we don't
|
||||
# support reconstructing actors that died in the process of
|
||||
# being created.
|
||||
ready, _ = ray.wait(
|
||||
children_out,
|
||||
num_returns=len(children_out),
|
||||
timeout=5 * 60.0)
|
||||
assert len(ready) == len(children_out)
|
||||
children_out = [
|
||||
child.get_probability.remote() for child in children
|
||||
]
|
||||
# Wait for new created actors to finish creation before
|
||||
# removing a node. This is needed because right now we don't
|
||||
# support reconstructing actors that died in the process of
|
||||
# being created.
|
||||
ready, _ = ray.wait(
|
||||
children_out, num_returns=len(children_out), timeout=5 * 60.0)
|
||||
assert len(ready) == len(children_out)
|
||||
|
||||
# Remove a node. Any actor creation tasks that were forwarded to this
|
||||
# node must be reconstructed.
|
||||
|
||||
@@ -888,9 +888,6 @@ def test_fill_object_store_exception(ray_start_cluster_head):
|
||||
ray.put(np.zeros(10**8 + 2, dtype=np.uint8))
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not RAY_FORCE_DIRECT,
|
||||
reason="raylet path attempts reconstruction for evicted objects")
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
"num_nodes": 1,
|
||||
@@ -925,9 +922,6 @@ def test_direct_call_eviction(ray_start_cluster):
|
||||
ray.get(dependent_task.remote(obj))
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not RAY_FORCE_DIRECT,
|
||||
reason="raylet path attempts reconstruction for evicted objects")
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
"num_nodes": 1,
|
||||
|
||||
@@ -82,7 +82,6 @@ def test_object_reconstruction(ray_start_cluster):
|
||||
ray.get(xs)
|
||||
|
||||
|
||||
@pytest.mark.skipif(RAY_FORCE_DIRECT, reason="no actor restart yet")
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
"num_cpus": 4,
|
||||
@@ -105,7 +104,7 @@ def test_actor_creation_node_failure(ray_start_cluster):
|
||||
if exit_chance < self.death_probability:
|
||||
sys.exit(-1)
|
||||
|
||||
num_children = 50
|
||||
num_children = 25
|
||||
# Children actors will die about half the time.
|
||||
death_probability = 0.5
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import pytest
|
||||
import time
|
||||
|
||||
from ray import ray_constants
|
||||
import ray
|
||||
import ray.experimental.signal as signal
|
||||
|
||||
@@ -276,9 +275,6 @@ def test_forget(ray_start_regular):
|
||||
assert len(result_list) == count
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
ray_constants.direct_call_enabled(),
|
||||
reason="TODO(ekl): this requires reconstruction")
|
||||
def test_signal_on_node_failure(two_node_cluster):
|
||||
"""Test actor checkpointing on a remote node."""
|
||||
|
||||
@@ -395,6 +391,5 @@ def test_small_receive_timeout(ray_start_regular):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
Reference in New Issue
Block a user