[core] Replace task resubmission in raylet with ownership protocol (#9394)

* Add intended worker ID to GetObjectStatus, tests

* Remove TaskID owner_id

* lint

* Add owner address to task args

* Make TaskArg a virtual class, remove multi args

* Set owner address for task args

* merge

* Fix tests

* Add ObjectRefs to task dependency manager, pass from task spec args

* tmp

* tmp

* Fix

* Add ownership info for task arguments

* Convert WaitForDirectActorCallArgs

* lint

* build

* update

* build

* java

* Move code

* build

* Revert "Fix Google log directory again (#9063)"

This reverts commit 275da2e400.

* Fix free

* Regression tests - shorten timeouts in reconstruction unit tests

* Remove timeout for non-actor tasks

* Modify tests using ray.internal.free

* Clean up future resolution code

* Raylet polls the owner

* todo

* comment

* Update src/ray/core_worker/core_worker.cc

Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>

* Drop stale actor table notifications

* Fix bug where actor restart hangs

* Revert buggy code for duplicate tasks

* build

* Fix errors for lru_evict and internal.free

* Revert "Drop stale actor table notifications"

This reverts commit 193c5d20e5577befd43f166e16c972e2f9247c91.

* Revert "build"

This reverts commit 5644edbac906ff6ef98feb40b6f62c9e63698c29.

* Fix free test

* Fixes for freed objects

Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
This commit is contained in:
Stephanie Wang
2020-07-15 14:55:51 -07:00
committed by GitHub
parent 5a40299d42
commit 4e81804cba
15 changed files with 239 additions and 107 deletions
+3 -3
View File
@@ -30,7 +30,7 @@ def memory_summary():
def free(object_refs, local_only=False, delete_creating_tasks=False):
"""Free a list of IDs from object stores.
"""Free a list of IDs from the in-process and plasma object stores.
This function is a low-level API which should be used in restricted
scenarios.
@@ -38,8 +38,8 @@ def free(object_refs, local_only=False, delete_creating_tasks=False):
If local_only is false, the request will be send to all object stores.
This method will not return any value to indicate whether the deletion is
successful or not. This function is an instruction to object store. If
the some of the objects are in use, object stores will delete them later
successful or not. This function is an instruction to the object store. If
some of the objects are in use, the object stores will delete them later
when the ref count is down to 0.
Examples:
+3 -2
View File
@@ -33,11 +33,12 @@ def test_internal_free(shutdown_only):
sampler = Sampler.remote()
# Free does not delete from in-memory store.
# Free deletes from in-memory store.
obj_ref = sampler.sample.remote()
ray.get(obj_ref)
ray.internal.free(obj_ref)
assert ray.get(obj_ref) == [1, 2, 3, 4, 5]
with pytest.raises(Exception):
ray.get(obj_ref)
# Free deletes big objects from plasma store.
big_id = sampler.sample_big.remote()
+4 -2
View File
@@ -6,13 +6,15 @@ import pytest
import ray
from ray.exceptions import RayCancellationError, RayTaskError, \
RayTimeoutError, RayWorkerError
RayTimeoutError, RayWorkerError, \
UnreconstructableError
from ray.test_utils import SignalActor
def valid_exceptions(use_force):
if use_force:
return (RayTaskError, RayCancellationError, RayWorkerError)
return (RayTaskError, RayCancellationError, RayWorkerError,
UnreconstructableError)
else:
return (RayTaskError, RayCancellationError)
-4
View File
@@ -821,8 +821,6 @@ def test_raylet_crash_when_get(ray_start_regular):
object_ref = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
ray.internal.free(object_ref)
while ray.worker.global_worker.core_worker.object_exists(object_ref):
time.sleep(1)
thread = threading.Thread(target=sleep_to_kill_raylet)
thread.start()
@@ -984,8 +982,6 @@ def test_eviction(ray_start_cluster):
assert (isinstance(ray.get(obj), np.ndarray))
# Evict the object.
ray.internal.free([obj])
while ray.worker.global_worker.core_worker.object_exists(obj):
time.sleep(1)
# ray.get throws an exception.
with pytest.raises(ray.exceptions.UnreconstructableError):
ray.get(obj)
+7
View File
@@ -13,6 +13,7 @@ def test_cached_object(ray_start_cluster):
config = json.dumps({
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"initial_reconstruction_timeout_milliseconds": 200,
})
cluster = ray_start_cluster
# Head node with no resources.
@@ -56,6 +57,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
"raylet_heartbeat_timeout_milliseconds": 100,
"lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
"free_objects_period_milliseconds": -1,
"initial_reconstruction_timeout_milliseconds": 200,
})
cluster = ray_start_cluster
# Head node with no resources.
@@ -109,6 +111,7 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
"raylet_heartbeat_timeout_milliseconds": 100,
"lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
"free_objects_period_milliseconds": -1,
"initial_reconstruction_timeout_milliseconds": 200,
})
cluster = ray_start_cluster
# Head node with no resources.
@@ -152,6 +155,7 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
"raylet_heartbeat_timeout_milliseconds": 100,
"lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
"free_objects_period_milliseconds": -1,
"initial_reconstruction_timeout_milliseconds": 200,
})
cluster = ray_start_cluster
# Head node with no resources.
@@ -198,6 +202,7 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
"raylet_heartbeat_timeout_milliseconds": 100,
"lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
"free_objects_period_milliseconds": -1,
"initial_reconstruction_timeout_milliseconds": 200,
})
cluster = ray_start_cluster
# Head node with no resources.
@@ -252,6 +257,7 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
"raylet_heartbeat_timeout_milliseconds": 100,
"lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
"free_objects_period_milliseconds": -1,
"initial_reconstruction_timeout_milliseconds": 200,
})
cluster = ray_start_cluster
# Head node with no resources.
@@ -298,6 +304,7 @@ def test_reconstruction_stress(ray_start_cluster):
"free_objects_period_milliseconds": -1,
"max_direct_call_object_size": 100,
"task_retry_delay_ms": 100,
"initial_reconstruction_timeout_milliseconds": 200,
})
cluster = ray_start_cluster
# Head node with no resources.