[core] Reconstruction for lost plasma objects (#7733)

* Add a lineage_ref_count to References

* Refactor TaskManager to store TaskEntry as a struct

* Refactor to fix deadlock between TaskManager and ReferenceCounter
Add references to task specs

* Pin TaskEntries and References in the lineage of any ObjectIDs in scope

* Fix deadlock, convert num_plasma_returns to a set of object IDs

* fix unit tests

* Feature flag

* Do not release lineage for objects that were promoted to plasma

* fix build

* fix build

* Remove num executions

* Remove num executions

* Add pinned locations to ReferenceCounter, empty handler for node death

* Fix num returns for actor tasks, fix Put return value

* Add regression test

* Clear pinned locations and callbacks on node removal

* Clear pinned locations and callbacks on node removal

* Simplify num return values

* Remove unused

* doc

* tmp

* Set num returns

* Move lineage pinning flag to ReferenceCounter

* comments

* Recover from plasma failures by pinning a new copy

* Basic object reconstruction, no concurrent reqs yet

* reconstruction test suite and a few fixes:
- fix for disabling lineage
- fix for updating submitted task refs

* Handle concurrent attempts to recover the same object

* Fix deadlock in DrainAndShutdown

* Revert "[core] Revert lineage pinning (#7499) (#7692)"

This reverts commit ba86a02b37.

* debug rllib

* debug rllib

* turn on all rllib tests again

* debug rllib

* Fix drain bug, check number of pending tasks

* revert rllib debug

* remove todo

* Trigger rllib tests

* revert rllib debug commit

* Split out logic into ObjectRecoveryManager

* Fix python tests

* Refactor to remove dependency on gcs client

* Unit tests

* Move pinned at node ID to direct memory store

* Unit test fixes and lint

* simplify and more tests

* Add ResubmitTask test for TaskManager

* Doc

* fix build

* comments

* Fix

* debug

* Update

* fix

* Fix

* Fix bad status handling, unit test

* Fix build
This commit is contained in:
Stephanie Wang
2020-04-11 16:52:57 -07:00
committed by GitHub
parent 18e9a076e5
commit d7eef808b8
18 changed files with 1284 additions and 48 deletions
+8
View File
@@ -203,6 +203,14 @@ py_test(
deps = ["//:ray_lib"],
)
# Test target for the object reconstruction suite in test_reconstruction.py.
# NOTE(review): the "exclusive" tag presumably keeps this target from running
# concurrently with other tests -- confirm against the Bazel tag semantics.
py_test(
name = "test_reconstruction",
size = "medium",
srcs = ["test_reconstruction.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_reference_counting",
size = "medium",
+279
View File
@@ -0,0 +1,279 @@
import json
import sys
import numpy as np
import pytest
import ray
from ray.cluster_utils import Cluster
from ray.test_utils import (
wait_for_condition, )
def test_cached_object(ray_start_cluster):
    """Fetch an object again after the node that created it has died.

    A large object is created on "node1" and consumed once by a task on
    "node2". "node1" is then killed without a graceful shutdown and replaced,
    more large objects are produced on "node2", and finally a second task
    that depends on the original object must still complete.

    Args:
        ray_start_cluster: pytest fixture providing the cluster under test.
    """
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    # Use the cluster owned by the fixture instead of building a private
    # Cluster() that nobody shuts down once the test finishes.
    # NOTE(review): assumes the fixture yields a ray.cluster_utils.Cluster
    # and tears it down afterwards -- confirm against conftest.py.
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    # Running the consumer on node2 requires the object to be available
    # there as well as on node1.
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    # Wait until at least one node is reported as not alive, i.e. the
    # failure of the killed node has been detected.
    assert wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    # Produce more large objects on node2.
    # NOTE(review): presumably meant to put pressure on node2's object
    # store -- confirm the intent.
    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    ray.get(dependent_task.remote(obj))
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
def test_reconstruction_cached_dependency(ray_start_cluster,
                                          reconstruction_enabled):
    """Lose the output of a task whose own input lives on another node.

    ``large_object`` runs on "node2" and its result is passed through
    ``chain`` on "node1". "node1" is then killed and replaced. With lineage
    pinning enabled a task depending on the ``chain`` output must still
    succeed; with it disabled the task must fail with an unreconstructable
    error.

    Args:
        ray_start_cluster: pytest fixture providing the cluster under test.
        reconstruction_enabled: Whether lineage pinning is switched on in
            the internal config.
    """
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    # Use the cluster owned by the fixture instead of building a private
    # Cluster() that nobody shuts down once the test finishes.
    # NOTE(review): assumes the fixture yields a ray.cluster_utils.Cluster
    # and tears it down afterwards -- confirm against conftest.py.
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.add_node(
        num_cpus=1,
        resources={"node2": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    # Retries are disabled for the producer in both parametrizations.
    @ray.remote(max_retries=0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node2": 1}).remote()
    obj = chain.options(resources={"node1": 1}).remote(obj)
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    # Wait until at least one node is reported as not alive, i.e. the
    # failure of the killed node has been detected.
    assert wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    # Produce more large objects on node2.
    # NOTE(review): presumably meant to put pressure on node2's object
    # store -- confirm the intent.
    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
        # ``e`` is pytest's ExceptionInfo wrapper; the raised exception is
        # ``e.value``. The check also has to sit outside the block above,
        # because nothing after the raising call in that block is executed.
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise e.value.as_instanceof_cause()
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
    """Lose a single task output together with the node that produced it.

    ``large_object`` runs on "node1" and is consumed there once. "node1" is
    then killed and replaced. With lineage pinning enabled (and one retry
    allowed for the producer) a second dependent task must succeed; with it
    disabled the task must fail with an unreconstructable error.

    Args:
        ray_start_cluster: pytest fixture providing the cluster under test.
        reconstruction_enabled: Whether lineage pinning is switched on in
            the internal config and the producer task may be retried.
    """
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    # Use the cluster owned by the fixture instead of building a private
    # Cluster() that nobody shuts down once the test finishes.
    # NOTE(review): assumes the fixture yields a ray.cluster_utils.Cluster
    # and tears it down afterwards -- confirm against conftest.py.
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.add_node(
        num_cpus=1,
        resources={"node2": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
        # ``e`` is pytest's ExceptionInfo wrapper; the raised exception is
        # ``e.value``. The check also has to sit outside the block above,
        # because nothing after the raising call in that block is executed.
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise e.value.as_instanceof_cause()
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
    """Recover several objects that all derive from the same upstream object.

    ``large_object`` runs on "node2" and four ``chain`` tasks are submitted
    on its result. Each ``chain`` output is consumed once on "node1", which
    is then killed and replaced. With lineage pinning enabled every output
    must be usable again on "node1"; with it disabled consuming them must
    fail with an unreconstructable error.

    Args:
        ray_start_cluster: pytest fixture providing the cluster under test.
        reconstruction_enabled: Whether lineage pinning is switched on in
            the internal config and the producer task may be retried.
    """
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    # Use the cluster owned by the fixture instead of building a private
    # Cluster() that nobody shuts down once the test finishes.
    # NOTE(review): assumes the fixture yields a ray.cluster_utils.Cluster
    # and tears it down afterwards -- confirm against conftest.py.
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.add_node(
        num_cpus=1,
        resources={"node2": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node2": 1}).remote()
    downstream = [chain.remote(obj) for _ in range(4)]
    # The loop variable deliberately reuses the name ``obj``, exactly as the
    # test was written, so the name no longer refers to the upstream object
    # once the loop has started.
    for obj in downstream:
        ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)

    if reconstruction_enabled:
        for obj in downstream:
            ray.get(
                dependent_task.options(resources={
                    "node1": 1
                }).remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            for obj in downstream:
                ray.get(
                    dependent_task.options(resources={
                        "node1": 1
                    }).remote(obj))
        # ``e`` is pytest's ExceptionInfo wrapper; the raised exception is
        # ``e.value``. The check also has to sit outside the block above,
        # because nothing after the raising call in that block is executed.
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise e.value.as_instanceof_cause()
@pytest.mark.parametrize("reconstruction_enabled", [False, True])
def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
    """Recover the tail of a 20-task chain after its only worker node dies.

    The cluster has a head node without CPUs and one worker node. A large
    object is passed through twenty ``chain`` tasks and the final result is
    consumed once. The worker node is then killed and replaced. With lineage
    pinning enabled the final result must be usable again; with it disabled
    consuming it must fail with an unreconstructable error.

    Args:
        ray_start_cluster: pytest fixture providing the cluster under test.
        reconstruction_enabled: Whether lineage pinning is switched on in
            the internal config and the producer task may be retried.
    """
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    # Use the cluster owned by the fixture instead of building a private
    # Cluster() that nobody shuts down once the test finishes.
    # NOTE(review): assumes the fixture yields a ray.cluster_utils.Cluster
    # and tears it down afterwards -- confirm against conftest.py.
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0, _internal_config=config, object_store_memory=10**8)
    node_to_kill = cluster.add_node(
        num_cpus=1, object_store_memory=10**8, _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return x

    obj = large_object.remote()
    # Each iteration rebinds ``obj`` to the output of the next link, so only
    # the final link is still named once the loop ends.
    for _ in range(20):
        obj = chain.remote(obj)
    ray.get(dependent_task.remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, object_store_memory=10**8, _internal_config=config)

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
        # ``e`` is pytest's ExceptionInfo wrapper; the raised exception is
        # ``e.value``. The check also has to sit outside the block above,
        # because nothing after the raising call in that block is executed.
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise e.value.as_instanceof_cause()
if __name__ == "__main__":
    # Allow running this file directly as a script: run pytest on it in
    # verbose mode and exit with pytest's return code. pytest is already
    # imported at module level, so it is not imported again here.
    sys.exit(pytest.main(["-v", __file__]))