mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 22:20:31 +08:00
Reference counting for direct call submitted tasks (#6514)
Co-authored-by: Zhijun Fu <37800433+zhijunfu@users.noreply.github.com>
This commit is contained in:
+20
-6
@@ -1073,16 +1073,12 @@ cdef class CoreWorker:
|
||||
return output
|
||||
|
||||
def add_object_id_reference(self, ObjectID object_id):
    """Add a local reference for ``object_id`` on the core worker.

    Passes the ID's native representation to the core worker's
    ``AddLocalReference`` call.

    Args:
        object_id: The ObjectID whose reference is being added.
    """
    # Note: faster to not release GIL for short-running op.
    self.core_worker.get().AddLocalReference(object_id.native())
|
||||
|
||||
def remove_object_id_reference(self, ObjectID object_id):
    """Remove a local reference for ``object_id`` on the core worker.

    Passes the ID's native representation to the core worker's
    ``RemoveLocalReference`` call.

    Args:
        object_id: The ObjectID whose reference is being removed.
    """
    # Note: faster to not release GIL for short-running op.
    self.core_worker.get().RemoveLocalReference(object_id.native())
|
||||
|
||||
def serialize_and_promote_object_id(self, ObjectID object_id):
|
||||
cdef:
|
||||
@@ -1174,6 +1170,24 @@ cdef class CoreWorker:
|
||||
def current_actor_is_asyncio(self):
|
||||
return self.core_worker.get().GetWorkerContext().CurrentActorIsAsync()
|
||||
|
||||
def get_all_reference_counts(self):
    """Return the core worker's reference counts as a Python dict.

    Returns:
        A dict keyed by ObjectID. Each value is itself a dict that holds
        the first element of the C++ count pair under "local" and the
        second element under "submitted".
    """
    cdef:
        unordered_map[CObjectID, pair[size_t, size_t]] c_counts
        unordered_map[CObjectID, pair[size_t, size_t]].iterator cursor

    # Take a snapshot of the C++ map, then walk it with an explicit iterator.
    c_counts = self.core_worker.get().GetAllReferenceCounts()
    counts_by_id = {}

    cursor = c_counts.begin()
    while cursor != c_counts.end():
        key = ObjectID(dereference(cursor).first.Binary())
        counts_by_id[key] = dict(
            local=dereference(cursor).second.first,
            submitted=dereference(cursor).second.second)
        postincrement(cursor)

    return counts_by_id
|
||||
|
||||
def in_memory_store_get_async(self, ObjectID object_id, future):
|
||||
self.core_worker.get().GetAsync(
|
||||
object_id.native(),
|
||||
|
||||
@@ -118,8 +118,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
CActorID DeserializeAndRegisterActorHandle(const c_string &bytes)
|
||||
CRayStatus SerializeActorHandle(const CActorID &actor_id, c_string
|
||||
*bytes)
|
||||
void AddObjectIDReference(const CObjectID &object_id)
|
||||
void RemoveObjectIDReference(const CObjectID &object_id)
|
||||
void AddLocalReference(const CObjectID &object_id)
|
||||
void RemoveLocalReference(const CObjectID &object_id)
|
||||
void PromoteObjectToPlasma(const CObjectID &object_id)
|
||||
void PromoteToPlasmaAndGetOwnershipInfo(const CObjectID &object_id,
|
||||
CTaskID *owner_id,
|
||||
@@ -149,6 +149,9 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
|
||||
CWorkerContext &GetWorkerContext()
|
||||
void YieldCurrentFiber(CFiberEvent &coroutine_done)
|
||||
|
||||
unordered_map[CObjectID, pair[size_t, size_t]] GetAllReferenceCounts()
|
||||
|
||||
void GetAsync(const CObjectID &object_id,
|
||||
ray_callback_function successs_callback,
|
||||
ray_callback_function fallback_callback,
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
# coding: utf-8
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import copy
|
||||
import tempfile
|
||||
import numpy as np
|
||||
import time
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
import ray
|
||||
import ray.cluster_utils
|
||||
import ray.test_utils
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _check_refcounts(expected):
    """Assert that the worker's reference counts equal ``expected``.

    Args:
        expected: Dict mapping an object ID to a ``(local, submitted)``
            pair of expected counts.

    Raises:
        AssertionError: If the number of tracked IDs differs, an expected
            ID is missing, or either count does not match.
    """
    core_worker = ray.worker.global_worker.core_worker
    observed = core_worker.get_all_reference_counts()

    # Equal sizes plus per-key membership means no extra IDs are tracked.
    assert len(expected) == len(observed)
    for oid, want in expected.items():
        want_local, want_submitted = want
        assert oid in observed
        counts = observed[oid]
        assert want_local == counts["local"]
        assert want_submitted == counts["submitted"]
|
||||
|
||||
|
||||
def check_refcounts(expected, timeout=1):
    """Poll until the worker's reference counts match ``expected``.

    Repeatedly runs ``_check_refcounts`` and sleeps 0.1 seconds between
    failed attempts, so that counts which change shortly after a call
    returns still have time to reach the expected values.

    Args:
        expected: Dict mapping an object ID to a ``(local, submitted)``
            pair of expected counts.
        timeout: Seconds to keep retrying after the first attempt.

    Raises:
        AssertionError: The failure from the last attempt, if the counts
            still do not match once ``timeout`` seconds have elapsed.
    """
    start = time.time()
    while True:
        try:
            _check_refcounts(expected)
            break
        except AssertionError:
            if time.time() - start > timeout:
                # A bare raise keeps the traceback of the failing assert
                # (``raise e`` would discard it on Python 2).
                raise
            time.sleep(0.1)
|
||||
|
||||
|
||||
def test_local_refcounts(ray_start_regular):
    """Local count follows creation, copying and deletion of an ObjectID."""
    original = ray.put(None)
    check_refcounts({original: (1, 0)})

    # A copy of the ID is expected to count as a second local reference.
    duplicate = copy.copy(original)
    check_refcounts({original: (2, 0)})

    # Dropping either handle releases exactly one local reference.
    del original
    check_refcounts({duplicate: (1, 0)})

    # Once the last handle is gone, no entry should remain at all.
    del duplicate
    check_refcounts({})
|
||||
|
||||
|
||||
def test_dependency_refcounts(ray_start_regular):
    """Check the "submitted" refcount of task dependencies over time.

    Remote tasks are held back by making them wait for a trigger file, so
    the counts can be inspected while a task is still pending and again
    after it has finished or failed. Every trigger file created through
    ``random_path`` is removed when the test ends, whether it passes or
    fails.
    """
    # Paths handed out by random_path(); cleaned up in the finally below.
    created_paths = []

    # Return a large object that will be spilled to plasma.
    def large_object():
        return np.zeros(10 * 1024 * 1024, dtype=np.uint8)

    def random_path():
        path = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex)
        created_paths.append(path)
        return path

    def touch(path):
        with open(path, "w"):
            pass

    def wait_for_file(path):
        # Blocks until the driver creates the file with touch().
        while True:
            if os.path.exists(path):
                break
            time.sleep(0.1)

    @ray.remote
    def one_dep(dep, path=None, fail=False):
        if path is not None:
            wait_for_file(path)
        if fail:
            raise Exception("failed on purpose")

    @ray.remote
    def one_dep_large(dep, path=None):
        if path is not None:
            wait_for_file(path)
        # This should be spilled to plasma.
        return large_object()

    try:
        # Test that regular plasma dependency refcounts are decremented once
        # the task finishes.
        f = random_path()
        large_dep = ray.put(large_object())
        result = one_dep.remote(large_dep, path=f)
        check_refcounts({large_dep: (1, 1), result: (1, 0)})
        touch(f)
        # Reference count should be removed once the task finishes.
        check_refcounts({large_dep: (1, 0), result: (1, 0)})
        del large_dep, result
        check_refcounts({})

        # Test that inlined dependency refcounts are decremented once they
        # are inlined.
        f = random_path()
        dep = one_dep.remote(None, path=f)
        check_refcounts({dep: (1, 0)})
        result = one_dep.remote(dep)
        check_refcounts({dep: (1, 1), result: (1, 0)})
        touch(f)
        # Reference count should be removed as soon as the dependency is
        # inlined.
        check_refcounts({dep: (1, 0), result: (1, 0)}, timeout=1)
        del dep, result
        check_refcounts({})

        # Test that spilled plasma dependency refcounts are decremented once
        # the task finishes.
        f1, f2 = random_path(), random_path()
        dep = one_dep_large.remote(None, path=f1)
        check_refcounts({dep: (1, 0)})
        result = one_dep.remote(dep, path=f2)
        check_refcounts({dep: (1, 1), result: (1, 0)})
        touch(f1)
        ray.get(dep, timeout=5.0)
        # Reference count should remain because the dependency is in plasma.
        check_refcounts({dep: (1, 1), result: (1, 0)})
        touch(f2)
        # Reference count should be removed because the task finished.
        check_refcounts({dep: (1, 0), result: (1, 0)})
        del dep, result
        check_refcounts({})

        # Test that regular plasma dependency refcounts are decremented if a
        # task fails.
        f = random_path()
        large_dep = ray.put(large_object())
        result = one_dep.remote(large_dep, path=f, fail=True)
        check_refcounts({large_dep: (1, 1), result: (1, 0)})
        touch(f)
        # Reference count should be removed once the task finishes.
        check_refcounts({large_dep: (1, 0), result: (1, 0)})
        del large_dep, result
        check_refcounts({})

        # Test that spilled plasma dependency refcounts are decremented if a
        # task fails.
        f1, f2 = random_path(), random_path()
        dep = one_dep_large.remote(None, path=f1)
        check_refcounts({dep: (1, 0)})
        result = one_dep.remote(dep, path=f2, fail=True)
        check_refcounts({dep: (1, 1), result: (1, 0)})
        touch(f1)
        ray.get(dep, timeout=5.0)
        # Reference count should remain because the dependency is in plasma.
        check_refcounts({dep: (1, 1), result: (1, 0)})
        touch(f2)
        # Reference count should be removed because the task finished.
        check_refcounts({dep: (1, 0), result: (1, 0)})
        del dep, result
        check_refcounts({})
    finally:
        for path in created_paths:
            try:
                os.remove(path)
            except OSError:
                # The file was never touched (e.g. an earlier check failed)
                # or is already gone; cleanup is best-effort.
                pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # When run as a script, execute this file's tests verbosely and use
    # pytest's result as the process exit status.
    import pytest
    import sys

    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)
|
||||
Reference in New Issue
Block a user