mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 12:45:44 +08:00
[core] Replace task resubmission in raylet with ownership protocol (#9394)
* Add intended worker ID to GetObjectStatus, tests
* Remove TaskID owner_id
* lint
* Add owner address to task args
* Make TaskArg a virtual class, remove multi args
* Set owner address for task args
* merge
* Fix tests
* Add ObjectRefs to task dependency manager, pass from task spec args
* tmp
* tmp
* Fix
* Add ownership info for task arguments
* Convert WaitForDirectActorCallArgs
* lint
* build
* update
* build
* java
* Move code
* build
* Revert "Fix Google log directory again (#9063)"
This reverts commit 275da2e400.
* Fix free
* Regression tests - shorten timeouts in reconstruction unit tests
* Remove timeout for non-actor tasks
* Modify tests using ray.internal.free
* Clean up future resolution code
* Raylet polls the owner
* todo
* comment
* Update src/ray/core_worker/core_worker.cc
Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
* Drop stale actor table notifications
* Fix bug where actor restart hangs
* Revert buggy code for duplicate tasks
* build
* Fix errors for lru_evict and internal.free
* Revert "Drop stale actor table notifications"
This reverts commit 193c5d20e5577befd43f166e16c972e2f9247c91.
* Revert "build"
This reverts commit 5644edbac906ff6ef98feb40b6f62c9e63698c29.
* Fix free test
* Fixes for freed objects
Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
This commit is contained in:
@@ -30,7 +30,7 @@ def memory_summary():
|
||||
|
||||
|
||||
def free(object_refs, local_only=False, delete_creating_tasks=False):
|
||||
"""Free a list of IDs from object stores.
|
||||
"""Free a list of IDs from the in-process and plasma object stores.
|
||||
|
||||
This function is a low-level API which should be used in restricted
|
||||
scenarios.
|
||||
@@ -38,8 +38,8 @@ def free(object_refs, local_only=False, delete_creating_tasks=False):
|
||||
If local_only is false, the request will be send to all object stores.
|
||||
|
||||
This method will not return any value to indicate whether the deletion is
|
||||
successful or not. This function is an instruction to object store. If
|
||||
the some of the objects are in use, object stores will delete them later
|
||||
successful or not. This function is an instruction to the object store. If
|
||||
some of the objects are in use, the object stores will delete them later
|
||||
when the ref count is down to 0.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -33,11 +33,12 @@ def test_internal_free(shutdown_only):
|
||||
|
||||
sampler = Sampler.remote()
|
||||
|
||||
# Free does not delete from in-memory store.
|
||||
# Free deletes from in-memory store.
|
||||
obj_ref = sampler.sample.remote()
|
||||
ray.get(obj_ref)
|
||||
ray.internal.free(obj_ref)
|
||||
assert ray.get(obj_ref) == [1, 2, 3, 4, 5]
|
||||
with pytest.raises(Exception):
|
||||
ray.get(obj_ref)
|
||||
|
||||
# Free deletes big objects from plasma store.
|
||||
big_id = sampler.sample_big.remote()
|
||||
|
||||
@@ -6,13 +6,15 @@ import pytest
|
||||
|
||||
import ray
|
||||
from ray.exceptions import RayCancellationError, RayTaskError, \
|
||||
RayTimeoutError, RayWorkerError
|
||||
RayTimeoutError, RayWorkerError, \
|
||||
UnreconstructableError
|
||||
from ray.test_utils import SignalActor
|
||||
|
||||
|
||||
def valid_exceptions(use_force):
|
||||
if use_force:
|
||||
return (RayTaskError, RayCancellationError, RayWorkerError)
|
||||
return (RayTaskError, RayCancellationError, RayWorkerError,
|
||||
UnreconstructableError)
|
||||
else:
|
||||
return (RayTaskError, RayCancellationError)
|
||||
|
||||
|
||||
@@ -821,8 +821,6 @@ def test_raylet_crash_when_get(ray_start_regular):
|
||||
|
||||
object_ref = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
|
||||
ray.internal.free(object_ref)
|
||||
while ray.worker.global_worker.core_worker.object_exists(object_ref):
|
||||
time.sleep(1)
|
||||
|
||||
thread = threading.Thread(target=sleep_to_kill_raylet)
|
||||
thread.start()
|
||||
@@ -984,8 +982,6 @@ def test_eviction(ray_start_cluster):
|
||||
assert (isinstance(ray.get(obj), np.ndarray))
|
||||
# Evict the object.
|
||||
ray.internal.free([obj])
|
||||
while ray.worker.global_worker.core_worker.object_exists(obj):
|
||||
time.sleep(1)
|
||||
# ray.get throws an exception.
|
||||
with pytest.raises(ray.exceptions.UnreconstructableError):
|
||||
ray.get(obj)
|
||||
|
||||
@@ -13,6 +13,7 @@ def test_cached_object(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
@@ -56,6 +57,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
|
||||
"free_objects_period_milliseconds": -1,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
@@ -109,6 +111,7 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
|
||||
"free_objects_period_milliseconds": -1,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
@@ -152,6 +155,7 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
|
||||
"free_objects_period_milliseconds": -1,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
@@ -198,6 +202,7 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
|
||||
"free_objects_period_milliseconds": -1,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
@@ -252,6 +257,7 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
|
||||
"free_objects_period_milliseconds": -1,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
@@ -298,6 +304,7 @@ def test_reconstruction_stress(ray_start_cluster):
|
||||
"free_objects_period_milliseconds": -1,
|
||||
"max_direct_call_object_size": 100,
|
||||
"task_retry_delay_ms": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
})
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
|
||||
Reference in New Issue
Block a user