workaround for python3.5 fast numpy serialization (#6675)

This commit is contained in:
Siyuan (Ryans) Zhuang
2020-02-03 16:08:18 -05:00
committed by GitHub
parent 271de9b04d
commit 42cbf801e1
5 changed files with 110 additions and 281 deletions
+57 -273
View File
@@ -1,9 +1,7 @@
import hashlib
import io
import logging
import time
import pyarrow
import pyarrow.plasma as plasma
import ray.cloudpickle as pickle
@@ -15,7 +13,6 @@ from ray.exceptions import (
RayActorError,
RayWorkerError,
UnreconstructableError,
RAY_EXCEPTION_TYPES,
)
from ray._raylet import Pickle5Writer, unpack_pickle5_buffers
@@ -65,16 +62,6 @@ class Pickle5SerializedObject(SerializedObject):
return self._total_bytes
class ArrowSerializedObject(SerializedObject):
def __init__(self, serialized_object):
super(ArrowSerializedObject, self).__init__(b"")
self.serialized_object = serialized_object
@property
def total_bytes(self):
return self.serialized_object.total_bytes
class RawSerializedObject(SerializedObject):
def __init__(self, value):
super(RawSerializedObject,
@@ -137,6 +124,7 @@ class SerializationContext:
def __init__(self, worker):
self.worker = worker
assert worker.use_pickle
self.use_pickle = worker.use_pickle
def actor_handle_serializer(obj):
@@ -147,175 +135,62 @@ class SerializationContext:
new_handle._deserialization_helper(serialized_obj, True)
return new_handle
if not worker.use_pickle:
serialization_context = pyarrow.default_serialization_context()
# Tell the serialization context to use the cloudpickle version
# that we ship with Ray.
serialization_context.set_pickle(pickle.dumps, pickle.loads)
pyarrow.register_torch_serialization_handlers(
serialization_context)
self._register_cloudpickle_serializer(
ray.actor.ActorHandle,
custom_serializer=actor_handle_serializer,
custom_deserializer=actor_handle_deserializer)
def id_serializer(obj):
return pickle.dumps(obj)
def id_serializer(obj):
return obj.__reduce__()
def id_deserializer(serialized_obj):
return pickle.loads(serialized_obj)
def id_deserializer(serialized_obj):
return serialized_obj[0](*serialized_obj[1])
def object_id_serializer(obj):
owner_id = ""
owner_address = ""
if obj.is_direct_call_type():
worker = ray.worker.get_global_worker()
worker.check_connected()
obj, owner_id, owner_address = (
worker.core_worker.serialize_and_promote_object_id(obj)
)
obj = obj.__reduce__()
owner_id = owner_id.__reduce__() if owner_id else owner_id
return pickle.dumps((obj, owner_id, owner_address))
def object_id_serializer(obj):
owner_id = ""
owner_address = ""
if obj.is_direct_call_type():
worker = ray.worker.get_global_worker()
worker.check_connected()
obj, owner_id, owner_address = (
worker.core_worker.serialize_and_promote_object_id(obj))
obj = id_serializer(obj)
owner_id = id_serializer(owner_id) if owner_id else owner_id
return (obj, owner_id, owner_address)
def object_id_deserializer(serialized_obj):
obj_id, owner_id, owner_address = pickle.loads(serialized_obj)
# NOTE(swang): Must deserialize the object first before asking
# the core worker to resolve the value. This is to make sure
# that the ref count for the ObjectID is greater than 0 by the
# time the core worker resolves the value of the object.
deserialized_object_id = obj_id[0](obj_id[1][0])
if owner_id:
worker = ray.worker.get_global_worker()
worker.check_connected()
# UniqueIDs are serialized as
# (class name, (unique bytes,)).
worker.core_worker.deserialize_and_register_object_id(
obj_id[1][0], owner_id[1][0], owner_address)
return deserialized_object_id
def object_id_deserializer(serialized_obj):
obj_id, owner_id, owner_address = serialized_obj
# NOTE(swang): Must deserialize the object first before asking
# the core worker to resolve the value. This is to make sure
# that the ref count for the ObjectID is greater than 0 by the
# time the core worker resolves the value of the object.
deserialized_object_id = id_deserializer(obj_id)
if owner_id:
worker = ray.worker.get_global_worker()
worker.check_connected()
# UniqueIDs are serialized as
# (class name, (unique bytes,)).
worker.core_worker.deserialize_and_register_object_id(
obj_id[1][0], owner_id[1][0], owner_address)
return deserialized_object_id
for id_type in ray._raylet._ID_TYPES:
if id_type == ray._raylet.ObjectID:
serialization_context.register_type(
id_type,
"{}.{}".format(id_type.__module__, id_type.__name__),
custom_serializer=object_id_serializer,
custom_deserializer=object_id_deserializer)
else:
serialization_context.register_type(
id_type,
"{}.{}".format(id_type.__module__, id_type.__name__),
custom_serializer=id_serializer,
custom_deserializer=id_deserializer)
# We register this serializer on each worker instead of calling
# _register_custom_serializer from the driver so that isinstance
# still works.
serialization_context.register_type(
ray.actor.ActorHandle,
"ray.ActorHandle",
pickle=False,
custom_serializer=actor_handle_serializer,
custom_deserializer=actor_handle_deserializer)
self.pyarrow_context = serialization_context
else:
self._register_cloudpickle_serializer(
ray.actor.ActorHandle,
custom_serializer=actor_handle_serializer,
custom_deserializer=actor_handle_deserializer)
def id_serializer(obj):
return obj.__reduce__()
def id_deserializer(serialized_obj):
return serialized_obj[0](*serialized_obj[1])
def object_id_serializer(obj):
owner_id = ""
owner_address = ""
if obj.is_direct_call_type():
worker = ray.worker.get_global_worker()
worker.check_connected()
obj, owner_id, owner_address = (
worker.core_worker.serialize_and_promote_object_id(obj)
)
obj = id_serializer(obj)
owner_id = id_serializer(owner_id) if owner_id else owner_id
return (obj, owner_id, owner_address)
def object_id_deserializer(serialized_obj):
obj_id, owner_id, owner_address = serialized_obj
# NOTE(swang): Must deserialize the object first before asking
# the core worker to resolve the value. This is to make sure
# that the ref count for the ObjectID is greater than 0 by the
# time the core worker resolves the value of the object.
deserialized_object_id = id_deserializer(obj_id)
if owner_id:
worker = ray.worker.get_global_worker()
worker.check_connected()
# UniqueIDs are serialized as
# (class name, (unique bytes,)).
worker.core_worker.deserialize_and_register_object_id(
obj_id[1][0], owner_id[1][0], owner_address)
return deserialized_object_id
for id_type in ray._raylet._ID_TYPES:
if id_type == ray._raylet.ObjectID:
self._register_cloudpickle_serializer(
id_type, object_id_serializer, object_id_deserializer)
else:
self._register_cloudpickle_serializer(
id_type, id_serializer, id_deserializer)
def initialize(self):
""" Register custom serializers """
if not self.worker.use_pickle:
for error_cls in RAY_EXCEPTION_TYPES:
self.register_custom_serializer(
error_cls,
use_dict=True,
local=True,
class_id=error_cls.__module__ + ". " + error_cls.__name__,
)
# Tell Ray to serialize lambdas with pickle.
self.register_custom_serializer(
type(lambda: 0),
use_pickle=True,
local=True,
class_id="lambda")
# Tell Ray to serialize types with pickle.
self.register_custom_serializer(
type(int), use_pickle=True, local=True, class_id="type")
# Tell Ray to serialize RayParameters as dictionaries. This is
# used when passing around actor handles.
self.register_custom_serializer(
ray.signature.RayParameter,
use_dict=True,
local=True,
class_id="ray.signature.RayParameter")
# Tell Ray to serialize StringIO with pickle. We do this because
# Ray's default __dict__ serialization is incorrect for this type
# (the object's __dict__ is empty and therefore doesn't
# contain the full state of the object).
self.register_custom_serializer(
io.StringIO,
use_pickle=True,
local=True,
class_id="io.StringIO")
for id_type in ray._raylet._ID_TYPES:
if id_type == ray._raylet.ObjectID:
self._register_cloudpickle_serializer(
id_type, object_id_serializer, object_id_deserializer)
else:
self._register_cloudpickle_serializer(id_type, id_serializer,
id_deserializer)
def _register_cloudpickle_serializer(self, cls, custom_serializer,
custom_deserializer):
if pickle.FAST_CLOUDPICKLE_USED:
assert pickle.FAST_CLOUDPICKLE_USED
def _CloudPicklerReducer(obj):
return custom_deserializer, (custom_serializer(obj), )
def _CloudPicklerReducer(obj):
return custom_deserializer, (custom_serializer(obj), )
# construct a reducer
pickle.CloudPickler.dispatch[cls] = _CloudPicklerReducer
else:
def _CloudPicklerReducer(_self, obj):
_self.save_reduce(
custom_deserializer, (custom_serializer(obj), ), obj=obj)
# use a placeholder for 'self' argument
pickle.CloudPickler.dispatch[cls] = _CloudPicklerReducer
# construct a reducer
pickle.CloudPickler.dispatch[cls] = _CloudPicklerReducer
def _deserialize_object(self, data, metadata, object_id):
if metadata:
@@ -352,15 +227,7 @@ class SerializationContext:
"Tried to get object that has been promoted to plasma."
assert False, "Unrecognized error type " + str(error_type)
elif data:
if self.use_pickle:
raise ValueError("Receiving plasma serialized objects "
"while the serialization context is "
"using pickle5 as the backend.")
try:
# If data is not empty, deserialize the object.
return pyarrow.deserialize(data, self.pyarrow_context)
except pyarrow.DeserializationCallbackError:
raise DeserializationError()
raise ValueError("non-null object should always have metadata")
else:
# Object isn't available in plasma. This should never be returned
# to the user. We should only reach this line if this object was
@@ -368,64 +235,10 @@ class SerializationContext:
# throws an exception.
return plasma.ObjectNotAvailable
def _store_and_register_pyarrow(self, value, depth=100):
"""Store an object and attempt to register its class if needed.
Args:
value: The value to put in the object store.
depth: The maximum number of classes to recursively register.
Raises:
Exception: An exception is raised if the attempt to serialize the
object fails.
"""
counter = 0
while True:
if counter == depth:
raise Exception("Ray exceeded the maximum number of classes "
"that it will recursively serialize when "
"attempting to serialize an object of "
"type {}.".format(type(value)))
counter += 1
try:
return pyarrow.serialize(value, self.pyarrow_context)
except pyarrow.SerializationCallbackError as e:
cls_type = type(e.example_object)
try:
self.register_custom_serializer(cls_type, use_dict=True)
warning_message = (
"WARNING: Serializing objects of type "
"{} by expanding them as dictionaries "
"of their fields. This behavior may "
"be incorrect in some cases.".format(cls_type))
logger.debug(warning_message)
except (RayNotDictionarySerializable, CloudPickleError,
pickle.pickle.PicklingError, Exception):
# We also handle generic exceptions here because
# cloudpickle can fail with many different types of errors.
warning_message = (
"Falling back to serializing {} objects by using "
"pickle. Use `ray.register_custom_serializer({},...)` "
"to provide faster serialization.".format(
cls_type, cls_type))
try:
self.register_custom_serializer(
cls_type, use_pickle=True)
logger.warning(warning_message)
except (CloudPickleError, ValueError):
self.register_custom_serializer(
cls_type, use_pickle=True, local=True)
warning_message = ("WARNING: Pickling the class {} "
"failed, so we are using pickle "
"and only registering the class "
"locally.".format(cls_type))
logger.warning(warning_message)
def deserialize_objects(self,
data_metadata_pairs,
object_ids,
error_timeout=10):
pass
assert len(data_metadata_pairs) == len(object_ids)
start_time = time.time()
@@ -473,29 +286,12 @@ class SerializationContext:
# that this object can also be read by Java.
return RawSerializedObject(value)
if self.worker.use_pickle:
writer = Pickle5Writer()
if ray.cloudpickle.FAST_CLOUDPICKLE_USED:
inband = pickle.dumps(
value, protocol=5, buffer_callback=writer.buffer_callback)
else:
inband = pickle.dumps(value)
return Pickle5SerializedObject(inband, writer)
else:
try:
serialized_value = self._store_and_register_pyarrow(value)
except TypeError:
# TypeError can happen because one of the members of the object
# may not be serializable for cloudpickle. So we need
# these extra fallbacks here to start from the beginning.
# Hopefully the object could have a `__reduce__` method.
self.register_custom_serializer(type(value), use_pickle=True)
logger.warning("WARNING: Serializing the class {} failed, "
"falling back to cloudpickle.".format(
type(value)))
serialized_value = self._store_and_register_pyarrow(value)
return ArrowSerializedObject(serialized_value)
assert self.worker.use_pickle
assert ray.cloudpickle.FAST_CLOUDPICKLE_USED
writer = Pickle5Writer()
inband = pickle.dumps(
value, protocol=5, buffer_callback=writer.buffer_callback)
return Pickle5SerializedObject(inband, writer)
def register_custom_serializer(self,
cls,
@@ -583,22 +379,10 @@ class SerializationContext:
assert isinstance(job_id, JobID)
def register_class_for_serialization(worker_info):
assert worker_info["worker"].use_pickle
context = worker_info["worker"].get_serialization_context(job_id)
if worker_info["worker"].use_pickle:
context._register_cloudpickle_serializer(
cls, serializer, deserializer)
else:
# TODO(rkn): We need to be more thoughtful about what to do if
# custom serializers have already been registered for
# class_id. In some cases, we may want to use the last
# user-defined serializers and ignore subsequent calls to
# register_custom_serializer that were made by the system.
context.pyarrow_context.register_type(
cls,
class_id,
pickle=use_pickle,
custom_serializer=serializer,
custom_deserializer=deserializer)
context._register_cloudpickle_serializer(cls, serializer,
deserializer)
if not local:
self.worker.run_function_on_all_workers(