Reference counting for direct call submitted tasks (#6514)

Co-authored-by: Zhijun Fu <37800433+zhijunfu@users.noreply.github.com>
This commit is contained in:
Edward Oakes
2019-12-20 17:06:33 -08:00
committed by GitHub
parent b0b6b56bb7
commit e50aa99be1
17 changed files with 512 additions and 344 deletions
+20 -6
View File
@@ -1073,16 +1073,12 @@ cdef class CoreWorker:
return output
# Register a reference to `object_id` with the underlying core worker.
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped. Going by the "+20 -6" summary for this file, the `cdef` local and
# the AddObjectIDReference call look like the removed pre-change body, and the
# AddLocalReference call looks like its replacement -- confirm against the
# repository before treating both calls as live code.
def add_object_id_reference(self, ObjectID object_id):
cdef:
CObjectID c_object_id = object_id.native()
# Note: faster to not release GIL for short-running op.
self.core_worker.get().AddObjectIDReference(c_object_id)
self.core_worker.get().AddLocalReference(object_id.native())
# Drop a reference to `object_id` from the underlying core worker.
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped. Going by the "+20 -6" summary for this file, the `cdef` local and
# the RemoveObjectIDReference call look like the removed pre-change body, and
# the RemoveLocalReference call looks like its replacement -- confirm against
# the repository before treating both calls as live code.
def remove_object_id_reference(self, ObjectID object_id):
cdef:
CObjectID c_object_id = object_id.native()
# Note: faster to not release GIL for short-running op.
self.core_worker.get().RemoveObjectIDReference(c_object_id)
self.core_worker.get().RemoveLocalReference(object_id.native())
def serialize_and_promote_object_id(self, ObjectID object_id):
cdef:
@@ -1174,6 +1170,24 @@ cdef class CoreWorker:
# Return whatever the core worker's context reports from
# CurrentActorIsAsync() for the current actor.
def current_actor_is_asyncio(self):
return self.core_worker.get().GetWorkerContext().CurrentActorIsAsync()
def get_all_reference_counts(self):
    """Return the core worker's reference counts as a Python dict.

    Returns:
        A dict mapping each ObjectID to a dict with the keys "local" and
        "submitted", holding the first and second element of the pair the
        core worker reports for that object.
    """
    cdef:
        unordered_map[CObjectID, pair[size_t, size_t]] c_counts
        unordered_map[CObjectID, pair[size_t, size_t]].iterator entry

    c_counts = self.core_worker.get().GetAllReferenceCounts()
    counts_by_id = {}
    entry = c_counts.begin()
    while entry != c_counts.end():
        # Rebuild a Python ObjectID from the binary form of the C++ key.
        key = ObjectID(dereference(entry).first.Binary())
        counts_by_id[key] = {
            "local": dereference(entry).second.first,
            "submitted": dereference(entry).second.second,
        }
        postincrement(entry)

    return counts_by_id
def in_memory_store_get_async(self, ObjectID object_id, future):
self.core_worker.get().GetAsync(
object_id.native(),
+5 -2
View File
@@ -118,8 +118,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
CActorID DeserializeAndRegisterActorHandle(const c_string &bytes)
CRayStatus SerializeActorHandle(const CActorID &actor_id, c_string
*bytes)
# NOTE(review): rendered diff with the +/- markers stripped. Going by the
# "+5 -2" summary for this file, the two *ObjectIDReference declarations look
# like the removed lines and the two *LocalReference declarations look like
# their replacements -- confirm against the repository.
void AddObjectIDReference(const CObjectID &object_id)
void RemoveObjectIDReference(const CObjectID &object_id)
void AddLocalReference(const CObjectID &object_id)
void RemoveLocalReference(const CObjectID &object_id)
void PromoteObjectToPlasma(const CObjectID &object_id)
void PromoteToPlasmaAndGetOwnershipInfo(const CObjectID &object_id,
CTaskID *owner_id,
@@ -149,6 +149,9 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
CWorkerContext &GetWorkerContext()
void YieldCurrentFiber(CFiberEvent &coroutine_done)
# Maps each object ID to a pair of size_t counts; the Python wrapper
# get_all_reference_counts() exposes the pair's first element as "local" and
# its second element as "submitted".
unordered_map[CObjectID, pair[size_t, size_t]] GetAllReferenceCounts()
void GetAsync(const CObjectID &object_id,
ray_callback_function successs_callback,
ray_callback_function fallback_callback,
+162
View File
@@ -0,0 +1,162 @@
# coding: utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import copy
import tempfile
import numpy as np
import time
import logging
import uuid
import ray
import ray.cluster_utils
import ray.test_utils
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def _check_refcounts(expected):
    """Assert once that the worker's reference counts equal `expected`.

    Args:
        expected: dict mapping each object ID to a `(local, submitted)` tuple
            holding the counts that object should currently have.

    Raises:
        AssertionError: if the number of tracked objects differs, an expected
            object ID is missing, or either count does not match.
    """
    core_worker = ray.worker.global_worker.core_worker
    actual = core_worker.get_all_reference_counts()

    # Equal sizes plus per-key membership means no unexpected objects exist.
    assert len(expected) == len(actual)
    for object_id, (local, submitted) in expected.items():
        assert object_id in actual
        entry = actual[object_id]
        assert local == entry["local"]
        assert submitted == entry["submitted"]
def check_refcounts(expected, timeout=1):
    """Poll until the reference counts equal `expected` or time runs out.

    The check is retried every 0.1 seconds. Once more than `timeout` seconds
    have passed since the first attempt, the most recent failure is re-raised.

    Args:
        expected: dict mapping each object ID to a `(local, submitted)` tuple.
        timeout: number of seconds to keep retrying before giving up.

    Raises:
        AssertionError: if the counts still do not match after `timeout`.
    """
    began = time.time()
    while True:
        try:
            _check_refcounts(expected)
        except AssertionError as err:
            if time.time() - began > timeout:
                raise err
            time.sleep(0.1)
        else:
            return
def test_local_refcounts(ray_start_regular):
    """Local reference counts follow creation, copying and deletion of IDs."""
    original = ray.put(None)
    check_refcounts({original: (1, 0)})

    # A copy of the ID is expected to count as a second local reference.
    duplicate = copy.copy(original)
    check_refcounts({original: (2, 0)})

    # Dropping the first handle should leave only the copy's reference.
    del original
    check_refcounts({duplicate: (1, 0)})

    # Dropping the last handle should leave no tracked objects at all.
    del duplicate
    check_refcounts({})
def test_dependency_refcounts(ray_start_regular):
    """Check the submitted-task reference counts held on task dependencies.

    Each scenario submits a task that blocks until a marker file appears,
    checks the counts while the task is pending, creates the marker file and
    then checks the counts again. The scenarios cover a `ray.put` dependency,
    a small task-returned dependency, a large task-returned dependency, and
    the same `ray.put` / large-return cases where the dependent task raises.

    Every marker path handed out is recorded and removed when the test ends,
    so no files are left behind in the system temp directory.
    """
    # Paths handed out by random_path(); cleaned up in the `finally` below.
    marker_paths = []

    # Return a large object that will be spilled to plasma.
    def large_object():
        return np.zeros(10 * 1024 * 1024, dtype=np.uint8)

    def random_path():
        path = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex)
        marker_paths.append(path)
        return path

    def touch(path):
        with open(path, "w"):
            pass

    def wait_for_file(path):
        while True:
            if os.path.exists(path):
                break
            time.sleep(0.1)

    @ray.remote
    def one_dep(dep, path=None, fail=False):
        if path is not None:
            wait_for_file(path)
        if fail:
            raise Exception("failed on purpose")

    @ray.remote
    def one_dep_large(dep, path=None):
        if path is not None:
            wait_for_file(path)
        # This should be spilled to plasma.
        return large_object()

    try:
        # Test that regular plasma dependency refcounts are decremented once
        # the task finishes.
        f = random_path()
        large_dep = ray.put(large_object())
        result = one_dep.remote(large_dep, path=f)
        check_refcounts({large_dep: (1, 1), result: (1, 0)})
        touch(f)
        # Reference count should be removed once the task finishes.
        check_refcounts({large_dep: (1, 0), result: (1, 0)})
        del large_dep, result
        check_refcounts({})

        # Test that inlined dependency refcounts are decremented once they
        # are inlined.
        f = random_path()
        dep = one_dep.remote(None, path=f)
        check_refcounts({dep: (1, 0)})
        result = one_dep.remote(dep)
        check_refcounts({dep: (1, 1), result: (1, 0)})
        touch(f)
        # Reference count should be removed as soon as the dependency is
        # inlined.
        check_refcounts({dep: (1, 0), result: (1, 0)}, timeout=1)
        del dep, result
        check_refcounts({})

        # Test that spilled plasma dependency refcounts are decremented once
        # the task finishes.
        f1, f2 = random_path(), random_path()
        dep = one_dep_large.remote(None, path=f1)
        check_refcounts({dep: (1, 0)})
        result = one_dep.remote(dep, path=f2)
        check_refcounts({dep: (1, 1), result: (1, 0)})
        touch(f1)
        ray.get(dep, timeout=5.0)
        # Reference count should remain because the dependency is in plasma.
        check_refcounts({dep: (1, 1), result: (1, 0)})
        touch(f2)
        # Reference count should be removed because the task finished.
        check_refcounts({dep: (1, 0), result: (1, 0)})
        del dep, result
        check_refcounts({})

        # Test that regular plasma dependency refcounts are decremented if a
        # task fails.
        f = random_path()
        large_dep = ray.put(large_object())
        result = one_dep.remote(large_dep, path=f, fail=True)
        check_refcounts({large_dep: (1, 1), result: (1, 0)})
        touch(f)
        # Reference count should be removed once the task finishes.
        check_refcounts({large_dep: (1, 0), result: (1, 0)})
        del large_dep, result
        check_refcounts({})

        # Test that spilled plasma dependency refcounts are decremented if a
        # task fails.
        f1, f2 = random_path(), random_path()
        dep = one_dep_large.remote(None, path=f1)
        check_refcounts({dep: (1, 0)})
        result = one_dep.remote(dep, path=f2, fail=True)
        check_refcounts({dep: (1, 1), result: (1, 0)})
        touch(f1)
        ray.get(dep, timeout=5.0)
        # Reference count should remain because the dependency is in plasma.
        check_refcounts({dep: (1, 1), result: (1, 0)})
        touch(f2)
        # Reference count should be removed because the task finished.
        check_refcounts({dep: (1, 0), result: (1, 0)})
        del dep, result
        check_refcounts({})
    finally:
        # Remove the marker files so repeated runs do not litter the temp
        # directory.
        for marker in marker_paths:
            try:
                os.remove(marker)
            except OSError:
                # The path was never touched (or is already gone); there is
                # nothing to clean up for it.
                pass
if __name__ == "__main__":
    import sys

    import pytest

    # Run this file's tests verbosely and exit with pytest's status code.
    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)