mirror of
https://github.com/wassname/ray.git
synced 2026-07-01 02:00:46 +08:00
Distributed ref counting for serialized ObjectIDs (#6945)
* Skeleton plus a unit test for simple borrower case
* First unit test passes - forward an ID and task returns with 1 submitted task pending on the inner ID
* Invariant for contained_in
* Unit test passes for testing task return without creating a borrower
* Wrap ref count functionality in test case
* Fix bad delete
* Unit test and fix for borrowers creating more borrowers
* Unit test and fix for simple borrowing, but owner sends call after borrower's ref count goes to 0
* Refactor:
  - keep a sentinel ref count for task argument IDs
  - keep contained_in_borrowed in addition to contained_in_owned
* Unit test for nested IDs passes
* Refactor so that an object ID can only be contained in 1 borrowed ID at a time
* Add check
* Fix
* Unit test (passes) to test nesting object IDs but no borrowers created
* Unit test for nested objects from different owners passes, refactor to unset contained_in when popping refs
* Unit tests for borrowers receiving an ObjectID from multiple sources, skip adding ownership info if we already have it to handle duplicate refs
* Unit test for returning object ID passes
* More unit tests for returning object IDs pass
* Add serialized ID tests
* fix serialization issue
* remove swap
* It builds!
* debugging and some fixes:
  - register handler for WaitForRefRemoved
  - don't create a python reference for arg IDs
  - pass in client factory into ReferenceCounter
  - fix bad decrement in PopBorrowerRefs
* Fix accounting for serialized IDs:
  - don't decrement for IDs on dependency resolution, wait until task finished
  - add object IDs that were inlined when building the arguments to the task spec, pin these on the task executor until task finishes
* mu_ -> mutex_
* lint
* fix build
* clear outer_object_id
* add direct call type check
* Fix test for direct call IDs and return IDs for actor calls
* Fix CoreWorkerClient.Addr()
* Remove unneeded lock
* Remove unnecessary ObjectID refs
* Fix worker holding serialized refs test
* Fix hex IDs
* fix
* fix tests
* fix tests
* refactor and cleanups
* lint
* Put inlined Ids in task args and some cleanup
* Add back gc.collect() line for test case
* Refactor and fixes:
  - store inlined IDs in RayObject
  - allow storing objects with inlined IDs in memory store
  - pin objects that were promoted to plasma
* oops
* make sure worker ID is set in address, pass in rpc::Address to CoreWorkerClient
* todos
* cleanups and test builds
* Fix tests
* Add feature flag
* cleanups
* address comments and some cleanups
* cleanup
* fix recursive test
* Comments for tests
* Turn off ref counting by default
* Skip tests
* Fix some bugs for test_array.py, java build
* Don't include nested objects in the ref count when the feature flag is off
* C++ feature flag does not work...
* Remove
* Turn on python tests and add a warning when plasma objects are evicted before being pinned
* Fix build and remove irrelevant test
* Fix for java
* Revert "Fix build and remove irrelevant test"
  This reverts commit 056cca9b263ed05b0f9ab2250907338edcbca2d5.
* Fix ray.internal.free
* Fixes and skip some flaky tests
* fix java build
* fix windows build
* Add IDs contained in owned objects
* Update src/ray/protobuf/core_worker.proto
  Co-Authored-By: Edward Oakes <ed.nmi.oakes@gmail.com>
* Update src/ray/core_worker/reference_count.cc
  Co-Authored-By: Edward Oakes <ed.nmi.oakes@gmail.com>
* Update src/ray/protobuf/core_worker.proto
  Co-Authored-By: Edward Oakes <ed.nmi.oakes@gmail.com>
* Update src/ray/protobuf/core_worker.proto
  Co-Authored-By: Edward Oakes <ed.nmi.oakes@gmail.com>
* Update src/ray/core_worker/reference_count.h
  Co-Authored-By: Edward Oakes <ed.nmi.oakes@gmail.com>
* Update src/ray/core_worker/reference_count.h
  Co-Authored-By: Edward Oakes <ed.nmi.oakes@gmail.com>
* Update src/ray/core_worker/reference_count.cc
  Co-Authored-By: Edward Oakes <ed.nmi.oakes@gmail.com>
* Apply suggestions from code review
  Co-Authored-By: Edward Oakes <ed.nmi.oakes@gmail.com>
* update
* Try to fix ::test_direct_call_serialized_id_eviction

Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
This commit is contained in:
+17
-13
@@ -156,6 +156,8 @@ class SerializationContext:
|
||||
self.add_contained_object_id(obj)
|
||||
owner_id = ""
|
||||
owner_address = ""
|
||||
# TODO(swang): Remove this check. Otherwise, we will not be able to
|
||||
# handle serialized plasma IDs correctly.
|
||||
if obj.is_direct_call_type():
|
||||
worker = ray.worker.get_global_worker()
|
||||
worker.check_connected()
|
||||
@@ -176,14 +178,14 @@ class SerializationContext:
|
||||
# to 'self' here instead, but this function is itself pickled
|
||||
# somewhere, which causes an error.
|
||||
context = ray.worker.global_worker.get_serialization_context()
|
||||
context.add_contained_object_id(deserialized_object_id)
|
||||
if owner_id:
|
||||
worker = ray.worker.get_global_worker()
|
||||
worker.check_connected()
|
||||
# UniqueIDs are serialized as
|
||||
# (class name, (unique bytes,)).
|
||||
outer_id = context.get_outer_object_id()
|
||||
worker.core_worker.deserialize_and_register_object_id(
|
||||
obj_id[1][0], owner_id[1][0], owner_address)
|
||||
obj_id[1][0], outer_id, owner_id[1][0], owner_address)
|
||||
return deserialized_object_id
|
||||
|
||||
for id_type in ray._raylet._ID_TYPES:
|
||||
@@ -204,6 +206,12 @@ class SerializationContext:
|
||||
# construct a reducer
|
||||
pickle.CloudPickler.dispatch[cls] = _CloudPicklerReducer
|
||||
|
||||
def set_outer_object_id(self, outer_object_id):
|
||||
self._thread_local.outer_object_id = outer_object_id
|
||||
|
||||
def get_outer_object_id(self):
|
||||
return getattr(self._thread_local, "outer_object_id", None)
|
||||
|
||||
def get_and_clear_contained_object_ids(self):
|
||||
if not hasattr(self._thread_local, "object_ids"):
|
||||
self._thread_local.object_ids = set()
|
||||
@@ -235,18 +243,8 @@ class SerializationContext:
|
||||
# cloudpickle does not provide error types
|
||||
except pickle.pickle.PicklingError:
|
||||
raise DeserializationError()
|
||||
|
||||
# Check that there are no ObjectIDs serialized in arguments
|
||||
# that are inlined.
|
||||
if object_id.is_nil():
|
||||
assert len(self.get_and_clear_contained_object_ids()) == 0
|
||||
else:
|
||||
worker = ray.worker.global_worker
|
||||
worker.core_worker.add_contained_object_ids(
|
||||
object_id,
|
||||
self.get_and_clear_contained_object_ids(),
|
||||
)
|
||||
return obj
|
||||
|
||||
# Check if the object should be returned as raw bytes.
|
||||
if metadata == ray_constants.RAW_BUFFER_METADATA:
|
||||
if data is None:
|
||||
@@ -287,6 +285,8 @@ class SerializationContext:
|
||||
while i < len(object_ids):
|
||||
object_id = object_ids[i]
|
||||
data, metadata = data_metadata_pairs[i]
|
||||
assert self.get_outer_object_id() is None
|
||||
self.set_outer_object_id(object_id)
|
||||
try:
|
||||
results.append(
|
||||
self._deserialize_object(data, metadata, object_id))
|
||||
@@ -310,6 +310,9 @@ class SerializationContext:
|
||||
warning_message,
|
||||
job_id=self.worker.current_job_id)
|
||||
warning_sent = True
|
||||
finally:
|
||||
# Must clear ObjectID to not hold a reference.
|
||||
self.set_outer_object_id(None)
|
||||
|
||||
return results
|
||||
|
||||
@@ -328,6 +331,7 @@ class SerializationContext:
|
||||
assert self.worker.use_pickle
|
||||
assert ray.cloudpickle.FAST_CLOUDPICKLE_USED
|
||||
writer = Pickle5Writer()
|
||||
# TODO(swang): Check that contained_object_ids is empty.
|
||||
inband = pickle.dumps(
|
||||
value, protocol=5, buffer_callback=writer.buffer_callback)
|
||||
return Pickle5SerializedObject(
|
||||
|
||||
Reference in New Issue
Block a user