[Streaming] Streaming Cross-Lang API (#7464)

This commit is contained in:
chaokunyang
2020-04-29 13:42:08 +08:00
committed by GitHub
parent 101255f782
commit 91f630f709
72 changed files with 1612 additions and 408 deletions
@@ -55,6 +55,11 @@ class GatewayClient:
call = self._python_gateway_actor.callMethod.remote(java_params)
return deserialize(ray.get(call))
def new_instance(self, java_class_name):
    """Ask the gateway actor to create an instance of a Java class.

    Args:
        java_class_name: name of the Java class; it is serialized before
            being passed to the actor's `newInstance` remote method.

    Returns:
        The deserialized result of the remote `newInstance` call.
    """
    name_bytes = serialize(java_class_name)
    pending = self._python_gateway_actor.newInstance.remote(name_bytes)
    return deserialize(ray.get(pending))
def serialize(obj) -> bytes:
"""Serialize a python object which can be deserialized by `PythonGateway`
+3 -1
View File
@@ -53,7 +53,9 @@ class ExecutionEdge:
self.src_node_id = edge_pb.src_node_id
self.target_node_id = edge_pb.target_node_id
partition_bytes = edge_pb.partition
if language == Language.PYTHON:
# Sink node doesn't have partition function,
# so we only deserialize partition_bytes when it's not None or empty
if language == Language.PYTHON and partition_bytes:
self.partition = partition.load_partition(partition_bytes)
+57
View File
@@ -0,0 +1,57 @@
from abc import ABC, abstractmethod
import pickle
import msgpack
from ray.streaming import message
# Type ids written as the first element of the msgpack array produced by
# `CrossLangSerializer`; they tell the deserializer whether the payload is a
# `message.Record` or a `message.KeyRecord`.
_RECORD_TYPE_ID = 0
_KEY_RECORD_TYPE_ID = 1
# Single-byte tags. The stream task reader compares the first byte of a
# message body with `_PYTHON_TYPE_ID` to choose between the Python and the
# cross-language serializer.
# NOTE(review): presumably each tag names the format of the message body that
# follows it -- confirm against the writer side and the Java counterpart.
_CROSS_LANG_TYPE_ID = b"0"
_JAVA_TYPE_ID = b"1"
_PYTHON_TYPE_ID = b"2"
class Serializer(ABC):
    """Abstract interface for turning objects into bytes and back."""

    @abstractmethod
    def serialize(self, obj):
        """Return the serialized form of `obj`."""

    @abstractmethod
    def deserialize(self, serialized_bytes):
        """Rebuild the object encoded in `serialized_bytes`."""
class PythonSerializer(Serializer):
    """Serializer for Python-only data, backed by `pickle`."""

    def serialize(self, obj):
        """Return `obj` pickled to bytes."""
        pickled = pickle.dumps(obj)
        return pickled

    def deserialize(self, serialized_bytes):
        """Return the object unpickled from `serialized_bytes`.

        NOTE: unpickling can execute arbitrary code, so this must only be
        given bytes that come from a trusted producer.
        """
        restored = pickle.loads(serialized_bytes)
        return restored
class CrossLangSerializer(Serializer):
    """Serialize stream elements exchanged between Java and Python.

    Elements are encoded with msgpack as a flat array whose first entry is
    a type id (`_RECORD_TYPE_ID` or `_KEY_RECORD_TYPE_ID`), followed by the
    record's fields.
    """

    def serialize(self, obj):
        """Encode a `message.Record` or `message.KeyRecord` with msgpack.

        Raises:
            Exception: if `obj` is not exactly one of those two types.
        """
        # The exact type (not isinstance) selects the wire layout.
        obj_type = type(obj)
        if obj_type is not message.Record and \
                obj_type is not message.KeyRecord:
            raise Exception("Unsupported value {}".format(obj))
        if obj_type is message.Record:
            payload = [_RECORD_TYPE_ID, obj.stream, obj.value]
        else:
            payload = [_KEY_RECORD_TYPE_ID, obj.stream, obj.key, obj.value]
        return msgpack.packb(payload, use_bin_type=True)

    def deserialize(self, data):
        """Decode msgpack `data` back into a record object.

        Returns:
            A `message.Record` or `message.KeyRecord` with its `stream`
            attribute restored from the payload.

        Raises:
            Exception: if the leading type id is not a known record type.
        """
        unpacked = msgpack.unpackb(data, raw=False)
        type_id = unpacked[0]
        if type_id == _RECORD_TYPE_ID:
            stream, value = unpacked[1:]
            result = message.Record(value)
        elif type_id == _KEY_RECORD_TYPE_ID:
            stream, key, value = unpacked[1:]
            result = message.KeyRecord(key, value)
        else:
            raise Exception("Unsupported type id {}, type {}".format(
                type_id, type(type_id)))
        # The stream is not a constructor argument, so set it afterwards.
        result.stream = stream
        return result
+29 -17
View File
@@ -1,11 +1,13 @@
import logging
import pickle
import threading
from abc import ABC, abstractmethod
from ray.streaming.collector import OutputCollector
from ray.streaming.config import Config
from ray.streaming.context import RuntimeContextImpl
from ray.streaming.runtime import serialization
from ray.streaming.runtime.serialization import \
PythonSerializer, CrossLangSerializer
from ray.streaming.runtime.transfer import ChannelID, DataWriter, DataReader
logger = logging.getLogger(__name__)
@@ -38,36 +40,40 @@ class StreamTask(ABC):
# writers
collectors = []
for edge in execution_node.output_edges:
output_actor_ids = {}
output_actors_map = {}
task_id2_worker = execution_graph.get_task_id2_worker_by_node_id(
edge.target_node_id)
for target_task_id, target_actor in task_id2_worker.items():
channel_name = ChannelID.gen_id(self.task_id, target_task_id,
execution_graph.build_time())
output_actor_ids[channel_name] = target_actor
if len(output_actor_ids) > 0:
channel_ids = list(output_actor_ids.keys())
to_actor_ids = list(output_actor_ids.values())
writer = DataWriter(channel_ids, to_actor_ids, channel_conf)
logger.info("Create DataWriter succeed.")
output_actors_map[channel_name] = target_actor
if len(output_actors_map) > 0:
channel_ids = list(output_actors_map.keys())
target_actors = list(output_actors_map.values())
logger.info(
"Create DataWriter channel_ids {}, target_actors {}."
.format(channel_ids, target_actors))
writer = DataWriter(channel_ids, target_actors, channel_conf)
self.writers[edge] = writer
collectors.append(
OutputCollector(channel_ids, writer, edge.partition))
OutputCollector(writer, channel_ids, target_actors,
edge.partition))
# readers
input_actor_ids = {}
input_actor_map = {}
for edge in execution_node.input_edges:
task_id2_worker = execution_graph.get_task_id2_worker_by_node_id(
edge.src_node_id)
for src_task_id, src_actor in task_id2_worker.items():
channel_name = ChannelID.gen_id(src_task_id, self.task_id,
execution_graph.build_time())
input_actor_ids[channel_name] = src_actor
if len(input_actor_ids) > 0:
channel_ids = list(input_actor_ids.keys())
from_actor_ids = list(input_actor_ids.values())
logger.info("Create DataReader, channels {}.".format(channel_ids))
self.reader = DataReader(channel_ids, from_actor_ids, channel_conf)
input_actor_map[channel_name] = src_actor
if len(input_actor_map) > 0:
channel_ids = list(input_actor_map.keys())
from_actors = list(input_actor_map.values())
logger.info("Create DataReader, channels {}, input_actors {}."
.format(channel_ids, from_actors))
self.reader = DataReader(channel_ids, from_actors, channel_conf)
def exit_handler():
# Make DataReader stop reading data when MockQueue destructor
@@ -111,6 +117,8 @@ class InputStreamTask(StreamTask):
self.read_timeout_millis = \
int(worker.config.get(Config.READ_TIMEOUT_MS,
Config.DEFAULT_READ_TIMEOUT_MS))
self.python_serializer = PythonSerializer()
self.cross_lang_serializer = CrossLangSerializer()
def init(self):
    """Do nothing; this task has no initialization step here."""
    return None
@@ -120,7 +128,11 @@ class InputStreamTask(StreamTask):
item = self.reader.read(self.read_timeout_millis)
if item is not None:
msg_data = item.body()
msg = pickle.loads(msg_data)
type_id = msg_data[:1]
if (type_id == serialization._PYTHON_TYPE_ID):
msg = self.python_serializer.deserialize(msg_data[1:])
else:
msg = self.cross_lang_serializer.deserialize(msg_data[1:])
self.processor.process(msg)
self.stopped = True
+8 -4
View File
@@ -147,13 +147,17 @@ class ChannelCreationParametersBuilder:
wrap initial parameters needed by a streaming queue
"""
_java_reader_async_function_descriptor = JavaFunctionDescriptor(
"io.ray.streaming.runtime.worker", "onReaderMessage", "([B)V")
"io.ray.streaming.runtime.worker.JobWorker", "onReaderMessage",
"([B)V")
_java_reader_sync_function_descriptor = JavaFunctionDescriptor(
"io.ray.streaming.runtime.worker", "onReaderMessageSync", "([B)[B")
"io.ray.streaming.runtime.worker.JobWorker", "onReaderMessageSync",
"([B)[B")
_java_writer_async_function_descriptor = JavaFunctionDescriptor(
"io.ray.streaming.runtime.worker", "onWriterMessage", "([B)V")
"io.ray.streaming.runtime.worker.JobWorker", "onWriterMessage",
"([B)V")
_java_writer_sync_function_descriptor = JavaFunctionDescriptor(
"io.ray.streaming.runtime.worker", "onWriterMessageSync", "([B)[B")
"io.ray.streaming.runtime.worker.JobWorker", "onWriterMessageSync",
"([B)[B")
_python_reader_async_function_descriptor = PythonFunctionDescriptor(
"ray.streaming.runtime.worker", "on_reader_message", "JobWorker")
_python_reader_sync_function_descriptor = PythonFunctionDescriptor(
+17 -6
View File
@@ -10,6 +10,9 @@ from ray.streaming.runtime.task import SourceStreamTask, OneInputStreamTask
logger = logging.getLogger(__name__)
# Special reply (four space bytes) used by the synchronous message handlers
# to indicate that this actor is not ready yet.
_NOT_READY_FLAG_ = b" " * 4
@ray.remote
class JobWorker(object):
@@ -66,23 +69,31 @@ class JobWorker(object):
type(self.stream_processor))
def on_reader_message(self, buffer: bytes):
    """Deliver a data message to this worker's queue reader.

    Used in direct call mode: called by the upstream queue writer to send
    a data message to the downstream queue reader.

    Args:
        buffer: raw message bytes, handed unchanged to the reader client.
    """
    self.reader_client.on_reader_message(buffer)
def on_reader_message_sync(self, buffer: bytes):
    """Deliver a control message to this worker's queue reader and reply.

    Used in direct call mode: called by the upstream queue writer to send
    a control message to the downstream queue reader.

    Args:
        buffer: raw message bytes, handed unchanged to the reader client.

    Returns:
        `_NOT_READY_FLAG_` when the reader client has not been created yet;
        otherwise the reader client's reply converted with `to_pybytes()`.
    """
    if self.reader_client is None:
        return _NOT_READY_FLAG_
    result = self.reader_client.on_reader_message_sync(buffer)
    return result.to_pybytes()
def on_writer_message(self, buffer: bytes):
    """Deliver a notify message to this worker's queue writer.

    Used in direct call mode: called by the downstream queue reader to send
    a notify message to the upstream queue writer.

    Args:
        buffer: raw message bytes, handed unchanged to the writer client.
    """
    self.writer_client.on_writer_message(buffer)
def on_writer_message_sync(self, buffer: bytes):
    """Deliver a control message to this worker's queue writer and reply.

    Used in direct call mode: called by the downstream queue reader to send
    a control message to the upstream queue writer.

    Args:
        buffer: raw message bytes, handed unchanged to the writer client.

    Returns:
        `_NOT_READY_FLAG_` when the writer client has not been created yet;
        otherwise the writer client's reply converted with `to_pybytes()`.
    """
    if self.writer_client is None:
        return _NOT_READY_FLAG_
    result = self.writer_client.on_writer_message_sync(buffer)
    return result.to_pybytes()