[Streaming] Streaming data transfer and python integration (#6185)

This commit is contained in:
Chaokun Yang
2019-12-10 20:33:24 +08:00
committed by Hao Chen
parent c1d4ab8bb4
commit 6272907a57
93 changed files with 8434 additions and 1480 deletions
+16
View File
@@ -0,0 +1,16 @@
Streaming Library
=================
Dependencies:
Install NetworkX: ``pip install networkx``
Examples:
- simple.py: A simple example with stateless operators and different parallelism per stage.
Run ``python simple.py --input-file toy.txt``
- wordcount.py: A streaming wordcount example with a stateful operator (rolling sum).
Run ``python wordcount.py --titles-file articles.txt``
View File
+3
View File
@@ -0,0 +1,3 @@
# flake8: noqa
# Ray should be imported before streaming
import ray
+6
View File
@@ -0,0 +1,6 @@
# cython: profile=False
# distutils: language = c++
# cython: embedsignature = True
# cython: language_level = 3
include "includes/transfer.pxi"
+283
View File
@@ -0,0 +1,283 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import hashlib
import logging
import pickle
import sys
import time
import ray
import ray.streaming.runtime.transfer as transfer
from ray.streaming.config import Config
from ray.streaming.operator import PStrategy
from ray.streaming.runtime.transfer import ChannelID
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Forward and broadcast stream partitioning strategies
forward_broadcast_strategies = [PStrategy.Forward, PStrategy.Broadcast]
# Used to choose output channel in case of hash-based shuffling
def _hash(value):
if isinstance(value, int):
return value
try:
return int(hashlib.sha1(value.encode("utf-8")).hexdigest(), 16)
except AttributeError:
return int(hashlib.sha1(value).hexdigest(), 16)
class DataChannel(object):
"""A data channel for actor-to-actor communication.
Attributes:
env (Environment): The environment the channel belongs to.
src_operator_id (UUID): The id of the source operator of the channel.
src_instance_index (int): The id of the source instance.
dst_operator_id (UUID): The id of the destination operator of the
channel.
dst_instance_index (int): The id of the destination instance.
"""
def __init__(self, src_operator_id, src_instance_index, dst_operator_id,
dst_instance_index, str_qid):
self.src_operator_id = src_operator_id
self.src_instance_index = src_instance_index
self.dst_operator_id = dst_operator_id
self.dst_instance_index = dst_instance_index
self.str_qid = str_qid
self.qid = ChannelID(str_qid)
def __repr__(self):
return "(src({},{}),dst({},{}), qid({}))".format(
self.src_operator_id, self.src_instance_index,
self.dst_operator_id, self.dst_instance_index, self.str_qid)
_CLOSE_FLAG = b" "
# Pulls and merges data from multiple input channels
class DataInput(object):
"""An input gate of an operator instance.
The input gate pulls records from all input channels in a round-robin
fashion.
Attributes:
input_channels (list): The list of input channels.
channel_index (int): The index of the next channel to pull from.
max_index (int): The number of input channels.
closed (list): A list of flags indicating whether an input channel
has been marked as 'closed'.
all_closed (bool): Denotes whether all input channels have been
closed (True) or not (False).
"""
def __init__(self, env, channels):
assert len(channels) > 0
self.env = env
self.reader = None # created in `init` method
self.input_channels = channels
self.channel_index = 0
self.max_index = len(channels)
# Tracks the channels that have been closed. qid: close status
self.closed = {}
def init(self):
channels = [c.str_qid for c in self.input_channels]
input_actors = []
for c in self.input_channels:
actor = self.env.execution_graph.get_actor(c.src_operator_id,
c.src_instance_index)
input_actors.append(actor)
logger.info("DataInput input_actors %s", input_actors)
conf = {
Config.TASK_JOB_ID: ray.runtime_context._get_runtime_context()
.current_driver_id,
Config.CHANNEL_TYPE: self.env.config.channel_type
}
self.reader = transfer.DataReader(channels, input_actors, conf)
def pull(self):
# pull from channel
item = self.reader.read(100)
while item is None:
time.sleep(0.001)
item = self.reader.read(100)
msg_data = item.body()
if msg_data == _CLOSE_FLAG:
self.closed[item.channel_id] = True
if len(self.closed) == len(self.input_channels):
return None
else:
return self.pull()
else:
return pickle.loads(msg_data)
def close(self):
self.reader.stop()
# Selects output channel(s) and pushes data
class DataOutput(object):
"""An output gate of an operator instance.
The output gate pushes records to output channels according to the
user-defined partitioning scheme.
Attributes:
partitioning_schemes (dict): A mapping from destination operator ids
to partitioning schemes (see: PScheme in operator.py).
forward_channels (list): A list of channels to forward records.
shuffle_channels (list(list)): A list of output channels to shuffle
records grouped by destination operator.
shuffle_key_channels (list(list)): A list of output channels to
shuffle records by a key grouped by destination operator.
shuffle_exists (bool): A flag indicating that there exists at least
one shuffle_channel.
shuffle_key_exists (bool): A flag indicating that there exists at
least one shuffle_key_channel.
"""
def __init__(self, env, channels, partitioning_schemes):
assert len(channels) > 0
self.env = env
self.writer = None # created in `init` method
self.channels = channels
self.key_selector = None
self.round_robin_indexes = [0]
self.partitioning_schemes = partitioning_schemes
# Prepare output -- collect channels by type
self.forward_channels = [] # Forward and broadcast channels
slots = sum(1 for scheme in self.partitioning_schemes.values()
if scheme.strategy == PStrategy.RoundRobin)
self.round_robin_channels = [[]] * slots # RoundRobin channels
self.round_robin_indexes = [-1] * slots
slots = sum(1 for scheme in self.partitioning_schemes.values()
if scheme.strategy == PStrategy.Shuffle)
# Flag used to avoid hashing when there is no shuffling
self.shuffle_exists = slots > 0
self.shuffle_channels = [[]] * slots # Shuffle channels
slots = sum(1 for scheme in self.partitioning_schemes.values()
if scheme.strategy == PStrategy.ShuffleByKey)
# Flag used to avoid hashing when there is no shuffling by key
self.shuffle_key_exists = slots > 0
self.shuffle_key_channels = [[]] * slots # Shuffle by key channels
# Distinct shuffle destinations
shuffle_destinations = {}
# Distinct shuffle by key destinations
shuffle_by_key_destinations = {}
# Distinct round robin destinations
round_robin_destinations = {}
index_1 = 0
index_2 = 0
index_3 = 0
for channel in channels:
p_scheme = self.partitioning_schemes[channel.dst_operator_id]
strategy = p_scheme.strategy
if strategy in forward_broadcast_strategies:
self.forward_channels.append(channel)
elif strategy == PStrategy.Shuffle:
pos = shuffle_destinations.setdefault(channel.dst_operator_id,
index_1)
self.shuffle_channels[pos].append(channel)
if pos == index_1:
index_1 += 1
elif strategy == PStrategy.ShuffleByKey:
pos = shuffle_by_key_destinations.setdefault(
channel.dst_operator_id, index_2)
self.shuffle_key_channels[pos].append(channel)
if pos == index_2:
index_2 += 1
elif strategy == PStrategy.RoundRobin:
pos = round_robin_destinations.setdefault(
channel.dst_operator_id, index_3)
self.round_robin_channels[pos].append(channel)
if pos == index_3:
index_3 += 1
else: # TODO (john): Add support for other strategies
sys.exit("Unrecognized or unsupported partitioning strategy.")
# A KeyedDataStream can only be shuffled by key
assert not (self.shuffle_exists and self.shuffle_key_exists)
def init(self):
"""init DataOutput which creates DataWriter"""
channel_ids = [c.str_qid for c in self.channels]
to_actors = []
for c in self.channels:
actor = self.env.execution_graph.get_actor(c.dst_operator_id,
c.dst_instance_index)
to_actors.append(actor)
logger.info("DataOutput output_actors %s", to_actors)
conf = {
Config.TASK_JOB_ID: ray.runtime_context._get_runtime_context()
.current_driver_id,
Config.CHANNEL_TYPE: self.env.config.channel_type
}
self.writer = transfer.DataWriter(channel_ids, to_actors, conf)
def close(self):
"""Close the channel (True) by propagating _CLOSE_FLAG
_CLOSE_FLAG is used as special type of record that is propagated from
sources to sink to notify that the end of data in a stream.
"""
for c in self.channels:
self.writer.write(c.qid, _CLOSE_FLAG)
# must ensure DataWriter send None flag to peer actor
self.writer.stop()
def push(self, record):
target_channels = []
# Forward record
for c in self.forward_channels:
logger.debug("[writer] Push record '{}' to channel {}".format(
record, c))
target_channels.append(c)
# Forward record
index = 0
for channels in self.round_robin_channels:
self.round_robin_indexes[index] += 1
if self.round_robin_indexes[index] == len(channels):
self.round_robin_indexes[index] = 0 # Reset index
c = channels[self.round_robin_indexes[index]]
logger.debug("[writer] Push record '{}' to channel {}".format(
record, c))
target_channels.append(c)
index += 1
# Hash-based shuffling by key
if self.shuffle_key_exists:
key, _ = record
h = _hash(key)
for channels in self.shuffle_key_channels:
num_instances = len(channels) # Downstream instances
c = channels[h % num_instances]
logger.debug(
"[key_shuffle] Push record '{}' to channel {}".format(
record, c))
target_channels.append(c)
elif self.shuffle_exists: # Hash-based shuffling per destination
h = _hash(record)
for channels in self.shuffle_channels:
num_instances = len(channels) # Downstream instances
c = channels[h % num_instances]
logger.debug("[shuffle] Push record '{}' to channel {}".format(
record, c))
target_channels.append(c)
else: # TODO (john): Handle rescaling
pass
msg_data = pickle.dumps(record)
for c in target_channels:
# send data to channel
self.writer.write(c.qid, msg_data)
def push_all(self, records):
for record in records:
self.push(record)
+23
View File
@@ -0,0 +1,23 @@
class Config:
STREAMING_JOB_NAME = "streaming.job.name"
STREAMING_OP_NAME = "streaming.op_name"
TASK_JOB_ID = "streaming.task_job_id"
STREAMING_WORKER_NAME = "streaming.worker_name"
# channel
CHANNEL_TYPE = "channel_type"
MEMORY_CHANNEL = "memory_channel"
NATIVE_CHANNEL = "native_channel"
CHANNEL_SIZE = "channel_size"
CHANNEL_SIZE_DEFAULT = 10**8
IS_RECREATE = "streaming.is_recreate"
# return from StreamingReader.getBundle if only empty message read in this
# interval.
TIMER_INTERVAL_MS = "timer_interval_ms"
STREAMING_RING_BUFFER_CAPACITY = "streaming.ring_buffer_capacity"
# write an empty message if there is no data to be written in this
# interval.
STREAMING_EMPTY_MESSAGE_INTERVAL = "streaming.empty_message_interval"
# operator type
OPERATOR_TYPE = "operator_type"
+8
View File
@@ -0,0 +1,8 @@
New York City
Berlin
London
Paris
United States
Germany
France
United Kingdom
@@ -0,0 +1,71 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import logging
import time
import ray
from ray.streaming.streaming import Environment
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument("--input-file", required=True, help="the input text file")
# A class used to check attribute-based key selection
class Record(object):
def __init__(self, record):
k, _ = record
self.word = k
self.record = record
# Splits input line into words and outputs objects of type Record
# each one consisting of a key (word) and a tuple (word,1)
def splitter(line):
records = []
words = line.split()
for w in words:
records.append(Record((w, 1)))
return records
# Receives an object of type Record and returns the actual tuple
def as_tuple(record):
return record.record
if __name__ == "__main__":
# Get program parameters
args = parser.parse_args()
input_file = str(args.input_file)
ray.init()
ray.register_custom_serializer(Record, use_dict=True)
# A Ray streaming environment with the default configuration
env = Environment()
env.set_parallelism(2) # Each operator will be executed by two actors
# 'key_by("word")' physically partitions the stream of records
# based on the hash value of the 'word' attribute (see Record class above)
# 'map(as_tuple)' maps a record of type Record into a tuple
# 'sum(1)' sums the 2nd element of the tuple, i.e. the word count
stream = env.read_text_file(input_file) \
.round_robin() \
.flat_map(splitter) \
.key_by("word") \
.map(as_tuple) \
.sum(1) \
.inspect(print) # Prints the content of the
# stream to stdout
start = time.time()
env_handle = env.execute() # Deploys and executes the dataflow
ray.get(env_handle) # Stay alive until execution finishes
end = time.time()
logger.info("Elapsed time: {} secs".format(end - start))
logger.debug("Output stream id: {}".format(stream.id))
+56
View File
@@ -0,0 +1,56 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import logging
import time
import ray
from ray.streaming.config import Config
from ray.streaming.streaming import Environment, Conf
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument("--input-file", required=True, help="the input text file")
# Test functions
def splitter(line):
return line.split()
def filter_fn(word):
if "f" in word:
return True
return False
if __name__ == "__main__":
args = parser.parse_args()
ray.init(local_mode=False)
# A Ray streaming environment with the default configuration
env = Environment(config=Conf(channel_type=Config.NATIVE_CHANNEL))
# Stream represents the ouput of the filter and
# can be forked into other dataflows
stream = env.read_text_file(args.input_file) \
.shuffle() \
.flat_map(splitter) \
.set_parallelism(2) \
.filter(filter_fn) \
.set_parallelism(2) \
.inspect(lambda x: print("result", x)) # Prints the contents of the
# stream to stdout
start = time.time()
env_handle = env.execute()
ray.get(env_handle) # Stay alive until execution finishes
env.wait_finish()
end = time.time()
logger.info("Elapsed time: {} secs".format(end - start))
logger.debug("Output stream id: {}".format(stream.id))
+5
View File
@@ -0,0 +1,5 @@
This is
a test file
to test if example
works
fine
+109
View File
@@ -0,0 +1,109 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import logging
import time
import ray
import wikipedia
from ray.streaming.streaming import Environment
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument(
"--titles-file",
required=True,
help="the file containing the wikipedia titles to lookup")
# A custom data source that reads articles from wikipedia
# Custom data sources need to implement a get_next() method
# that returns the next data element, in this case sentences
class Wikipedia(object):
def __init__(self, title_file):
# Titles in this file will be as queries
self.title_file = title_file
# TODO (john): Handle possible exception here
self.title_reader = iter(list(open(self.title_file, "r").readlines()))
self.done = False
self.article_done = True
self.sentences = iter([])
# Returns next sentence from a wikipedia article
def get_next(self):
if self.done:
return None # Source exhausted
while True:
if self.article_done:
try: # Try next title
next_title = next(self.title_reader)
except StopIteration:
self.done = True # Source exhausted
return None
# Get next article
logger.debug("Next article: {}".format(next_title))
article = wikipedia.page(next_title).content
# Split article in sentences
self.sentences = iter(article.split("."))
self.article_done = False
try: # Try next sentence
sentence = next(self.sentences)
logger.debug("Next sentence: {}".format(sentence))
return sentence
except StopIteration:
self.article_done = True
# Splits input line into words and
# outputs records of the form (word,1)
def splitter(line):
records = []
words = line.split()
for w in words:
records.append((w, 1))
return records
# Returns the first attribute of a tuple
def key_selector(tuple):
return tuple[0]
# Returns the second attribute of a tuple
def attribute_selector(tuple):
return tuple[1]
if __name__ == "__main__":
# Get program parameters
args = parser.parse_args()
titles_file = str(args.titles_file)
ray.init()
# A Ray streaming environment with the default configuration
env = Environment()
env.set_parallelism(2) # Each operator will be executed by two actors
# The following dataflow is a simple streaming wordcount
# with a rolling sum operator.
# It reads articles from wikipedia, splits them in words,
# shuffles words, and counts the occurences of each word.
stream = env.source(Wikipedia(titles_file)) \
.round_robin() \
.flat_map(splitter) \
.key_by(key_selector) \
.sum(attribute_selector) \
.inspect(print) # Prints the contents of the
# stream to stdout
start = time.time()
env_handle = env.execute() # Deploys and executes the dataflow
ray.get(env_handle) # Stay alive until execution finishes
env.wait_finish()
end = time.time()
logger.info("Elapsed time: {} secs".format(end - start))
logger.debug("Output stream id: {}".format(stream.id))
+153
View File
@@ -0,0 +1,153 @@
# cython: profile=False
# distutils: language = c++
# cython: embedsignature = True
# cython: language_level = 3
# flake8: noqa
from libc.stdint cimport *
from libcpp cimport bool as c_bool
from libcpp.memory cimport shared_ptr
from libcpp.vector cimport vector as c_vector
from libcpp.list cimport list as c_list
from cpython cimport PyObject
cimport cpython
cdef inline object PyObject_to_object(PyObject* o):
# Cast to "object" increments reference count
cdef object result = <object> o
cpython.Py_DECREF(result)
return result
from ray.includes.common cimport (
CLanguage,
CRayObject,
CRayStatus,
CRayFunction
)
from ray.includes.unique_ids cimport (
CActorID,
CJobID,
CTaskID,
CObjectID,
)
from ray.includes.libcoreworker cimport CCoreWorker
cdef extern from "status.h" namespace "ray::streaming" nogil:
cdef cppclass CStreamingStatus "ray::streaming::StreamingStatus":
pass
cdef CStreamingStatus StatusOK "ray::streaming::StreamingStatus::OK"
cdef CStreamingStatus StatusReconstructTimeOut "ray::streaming::StreamingStatus::ReconstructTimeOut"
cdef CStreamingStatus StatusQueueIdNotFound "ray::streaming::StreamingStatus::QueueIdNotFound"
cdef CStreamingStatus StatusResubscribeFailed "ray::streaming::StreamingStatus::ResubscribeFailed"
cdef CStreamingStatus StatusEmptyRingBuffer "ray::streaming::StreamingStatus::EmptyRingBuffer"
cdef CStreamingStatus StatusFullChannel "ray::streaming::StreamingStatus::FullChannel"
cdef CStreamingStatus StatusNoSuchItem "ray::streaming::StreamingStatus::NoSuchItem"
cdef CStreamingStatus StatusInitQueueFailed "ray::streaming::StreamingStatus::InitQueueFailed"
cdef CStreamingStatus StatusGetBundleTimeOut "ray::streaming::StreamingStatus::GetBundleTimeOut"
cdef CStreamingStatus StatusSkipSendEmptyMessage "ray::streaming::StreamingStatus::SkipSendEmptyMessage"
cdef CStreamingStatus StatusInterrupted "ray::streaming::StreamingStatus::Interrupted"
cdef CStreamingStatus StatusWaitQueueTimeOut "ray::streaming::StreamingStatus::WaitQueueTimeOut"
cdef CStreamingStatus StatusOutOfMemory "ray::streaming::StreamingStatus::OutOfMemory"
cdef CStreamingStatus StatusInvalid "ray::streaming::StreamingStatus::Invalid"
cdef CStreamingStatus StatusUnknownError "ray::streaming::StreamingStatus::UnknownError"
cdef CStreamingStatus StatusTailStatus "ray::streaming::StreamingStatus::TailStatus"
cdef cppclass CStreamingCommon "ray::streaming::StreamingCommon":
void SetConfig(const uint8_t *, uint32_t size)
cdef extern from "runtime_context.h" namespace "ray::streaming" nogil:
cdef cppclass CRuntimeContext "ray::streaming::RuntimeContext":
CRuntimeContext()
void SetConfig(const uint8_t *data, uint32_t size)
inline void MarkMockTest()
inline c_bool IsMockTest()
cdef extern from "message/message.h" namespace "ray::streaming" nogil:
cdef cppclass CStreamingMessageType "ray::streaming::StreamingMessageType":
pass
cdef CStreamingMessageType MessageTypeBarrier "ray::streaming::StreamingMessageType::Barrier"
cdef CStreamingMessageType MessageTypeMessage "ray::streaming::StreamingMessageType::Message"
cdef cppclass CStreamingMessage "ray::streaming::StreamingMessage":
inline uint8_t *RawData() const
inline uint32_t GetDataSize() const
inline CStreamingMessageType GetMessageType() const
inline uint64_t GetMessageSeqId() const
cdef extern from "message/message_bundle.h" namespace "ray::streaming" nogil:
cdef cppclass CStreamingMessageBundleType "ray::streaming::StreamingMessageBundleType":
pass
cdef CStreamingMessageBundleType BundleTypeEmpty "ray::streaming::StreamingMessageBundleType::Empty"
cdef CStreamingMessageBundleType BundleTypeBarrier "ray::streaming::StreamingMessageBundleType::Barrier"
cdef CStreamingMessageBundleType BundleTypeBundle "ray::streaming::StreamingMessageBundleType::Bundle"
cdef cppclass CStreamingMessageBundleMeta "ray::streaming::StreamingMessageBundleMeta":
CStreamingMessageBundleMeta()
inline uint64_t GetMessageBundleTs() const
inline uint64_t GetLastMessageId() const
inline uint32_t GetMessageListSize() const
inline CStreamingMessageBundleType GetBundleType() const
inline c_bool IsBarrier()
inline c_bool IsBundle()
ctypedef shared_ptr[CStreamingMessageBundleMeta] CStreamingMessageBundleMetaPtr
uint32_t kMessageBundleHeaderSize "ray::streaming::kMessageBundleHeaderSize"
cdef cppclass CStreamingMessageBundle "ray::streaming::StreamingMessageBundle"(CStreamingMessageBundleMeta):
@staticmethod
void GetMessageListFromRawData(const uint8_t *data, uint32_t size, uint32_t msg_nums,
c_list[shared_ptr[CStreamingMessage]] &msg_list);
cdef extern from "queue/queue_client.h" namespace "ray::streaming" nogil:
cdef cppclass CReaderClient "ray::streaming::ReaderClient":
CReaderClient(CCoreWorker *core_worker,
CRayFunction &async_func,
CRayFunction &sync_func)
void OnReaderMessage(shared_ptr[CLocalMemoryBuffer] buffer);
shared_ptr[CLocalMemoryBuffer] OnReaderMessageSync(shared_ptr[CLocalMemoryBuffer] buffer);
cdef cppclass CWriterClient "ray::streaming::WriterClient":
CWriterClient(CCoreWorker *core_worker,
CRayFunction &async_func,
CRayFunction &sync_func)
void OnWriterMessage(shared_ptr[CLocalMemoryBuffer] buffer);
shared_ptr[CLocalMemoryBuffer] OnWriterMessageSync(shared_ptr[CLocalMemoryBuffer] buffer);
cdef extern from "data_reader.h" namespace "ray::streaming" nogil:
cdef cppclass CDataBundle "ray::streaming::DataBundle":
uint8_t *data
uint32_t data_size
CObjectID c_from "from"
uint64_t seq_id
CStreamingMessageBundleMetaPtr meta
cdef cppclass CDataReader "ray::streaming::DataReader"(CStreamingCommon):
CDataReader(shared_ptr[CRuntimeContext] &runtime_context)
void Init(const c_vector[CObjectID] &input_ids,
const c_vector[CActorID] &actor_ids,
const c_vector[uint64_t] &seq_ids,
const c_vector[uint64_t] &msg_ids,
int64_t timer_interval);
CStreamingStatus GetBundle(const uint32_t timeout_ms,
shared_ptr[CDataBundle] &message)
void Stop()
cdef extern from "data_writer.h" namespace "ray::streaming" nogil:
cdef cppclass CDataWriter "ray::streaming::DataWriter"(CStreamingCommon):
CDataWriter(shared_ptr[CRuntimeContext] &runtime_context)
CStreamingStatus Init(const c_vector[CObjectID] &channel_ids,
const c_vector[CActorID] &actor_ids,
const c_vector[uint64_t] &message_ids,
const c_vector[uint64_t] &queue_size_vec);
long WriteMessageToBufferRing(
const CObjectID &q_id, uint8_t *data, uint32_t data_size)
void Run()
void Stop()
cdef extern from "ray/common/buffer.h" nogil:
cdef cppclass CLocalMemoryBuffer "ray::LocalMemoryBuffer":
uint8_t *Data() const
size_t Size() const
+323
View File
@@ -0,0 +1,323 @@
# flake8: noqa
from libc.stdint cimport *
from libcpp cimport bool as c_bool
from libcpp.memory cimport shared_ptr, make_shared, dynamic_pointer_cast
from libcpp.string cimport string as c_string
from libcpp.vector cimport vector as c_vector
from libcpp.list cimport list as c_list
from ray.includes.common cimport (
CRayFunction,
LANGUAGE_PYTHON,
CBuffer
)
from ray.includes.unique_ids cimport (
CActorID,
CObjectID
)
from ray._raylet cimport (
Buffer,
CoreWorker,
ActorID,
ObjectID,
string_vector_from_list
)
from ray.includes.libcoreworker cimport CCoreWorker
cimport ray.streaming.includes.libstreaming as libstreaming
from ray.streaming.includes.libstreaming cimport (
CStreamingStatus,
CStreamingMessage,
CStreamingMessageBundle,
CRuntimeContext,
CDataBundle,
CDataWriter,
CDataReader,
CReaderClient,
CWriterClient,
CLocalMemoryBuffer,
)
import logging
from ray.function_manager import FunctionDescriptor
channel_logger = logging.getLogger(__name__)
cdef class ReaderClient:
cdef:
CReaderClient *client
def __cinit__(self,
CoreWorker worker,
async_func: FunctionDescriptor,
sync_func: FunctionDescriptor):
cdef:
CCoreWorker *core_worker = worker.core_worker.get()
CRayFunction async_native_func
CRayFunction sync_native_func
async_native_func = CRayFunction(
LANGUAGE_PYTHON, string_vector_from_list(async_func.get_function_descriptor_list()))
sync_native_func = CRayFunction(
LANGUAGE_PYTHON, string_vector_from_list(sync_func.get_function_descriptor_list()))
self.client = new CReaderClient(core_worker, async_native_func, sync_native_func)
def __dealloc__(self):
del self.client
self.client = NULL
def on_reader_message(self, const unsigned char[:] value):
cdef:
size_t size = value.nbytes
shared_ptr[CLocalMemoryBuffer] local_buf = \
make_shared[CLocalMemoryBuffer](<uint8_t *>(&value[0]), size, True)
with nogil:
self.client.OnReaderMessage(local_buf)
def on_reader_message_sync(self, const unsigned char[:] value):
cdef:
size_t size = value.nbytes
shared_ptr[CLocalMemoryBuffer] local_buf = \
make_shared[CLocalMemoryBuffer](<uint8_t *>(&value[0]), size, True)
shared_ptr[CLocalMemoryBuffer] result_buffer
with nogil:
result_buffer = self.client.OnReaderMessageSync(local_buf)
return Buffer.make(dynamic_pointer_cast[CBuffer, CLocalMemoryBuffer](result_buffer))
cdef class WriterClient:
cdef:
CWriterClient * client
def __cinit__(self,
CoreWorker worker,
async_func: FunctionDescriptor,
sync_func: FunctionDescriptor):
cdef:
CCoreWorker *core_worker = worker.core_worker.get()
CRayFunction async_native_func
CRayFunction sync_native_func
async_native_func = CRayFunction(
LANGUAGE_PYTHON, string_vector_from_list(async_func.get_function_descriptor_list()))
sync_native_func = CRayFunction(
LANGUAGE_PYTHON, string_vector_from_list(sync_func.get_function_descriptor_list()))
self.client = new CWriterClient(core_worker, async_native_func, sync_native_func)
def __dealloc__(self):
del self.client
self.client = NULL
def on_writer_message(self, const unsigned char[:] value):
cdef:
size_t size = value.nbytes
shared_ptr[CLocalMemoryBuffer] local_buf = \
make_shared[CLocalMemoryBuffer](<uint8_t *>(&value[0]), size, True)
with nogil:
self.client.OnWriterMessage(local_buf)
def on_writer_message_sync(self, const unsigned char[:] value):
cdef:
size_t size = value.nbytes
shared_ptr[CLocalMemoryBuffer] local_buf = \
make_shared[CLocalMemoryBuffer](<uint8_t *>(&value[0]), size, True)
shared_ptr[CLocalMemoryBuffer] result_buffer
with nogil:
result_buffer = self.client.OnWriterMessageSync(local_buf)
return Buffer.make(dynamic_pointer_cast[CBuffer, CLocalMemoryBuffer](result_buffer))
cdef class DataWriter:
cdef:
CDataWriter *writer
def __init__(self):
raise Exception("use create() to create DataWriter")
@staticmethod
def create(list py_output_channels,
list output_actor_ids: list[ActorID],
uint64_t queue_size,
list py_msg_ids,
bytes config_bytes,
c_bool is_mock):
cdef:
c_vector[CObjectID] channel_ids = bytes_list_to_qid_vec(py_output_channels)
c_vector[CActorID] actor_ids
c_vector[uint64_t] msg_ids
CDataWriter *c_writer
cdef const unsigned char[:] config_data
for actor_id in output_actor_ids:
actor_ids.push_back((<ActorID>actor_id).data)
for py_msg_id in py_msg_ids:
msg_ids.push_back(<uint64_t>py_msg_id)
cdef shared_ptr[CRuntimeContext] ctx = make_shared[CRuntimeContext]()
if is_mock:
ctx.get().MarkMockTest()
if config_bytes:
config_data = config_bytes
channel_logger.info("load config, config bytes size: %s", config_data.nbytes)
ctx.get().SetConfig(<uint8_t *>(&config_data[0]), config_data.nbytes)
c_writer = new CDataWriter(ctx)
cdef:
c_vector[CObjectID] remain_id_vec
c_vector[uint64_t] queue_size_vec
for i in range(channel_ids.size()):
queue_size_vec.push_back(queue_size)
cdef CStreamingStatus status = c_writer.Init(channel_ids, actor_ids, msg_ids, queue_size_vec)
if remain_id_vec.size() != 0:
channel_logger.warning("failed queue amounts => %s", remain_id_vec.size())
if <uint32_t>status != <uint32_t> libstreaming.StatusOK:
msg = "initialize writer failed, status={}".format(<uint32_t>status)
channel_logger.error(msg)
del c_writer
import ray.streaming.runtime.transfer as transfer
raise transfer.ChannelInitException(msg, qid_vector_to_list(remain_id_vec))
c_writer.Run()
channel_logger.info("create native writer succeed")
cdef DataWriter writer = DataWriter.__new__(DataWriter)
writer.writer = c_writer
return writer
def __dealloc__(self):
if self.writer != NULL:
del self.writer
channel_logger.info("deleted DataWriter")
self.writer = NULL
def write(self, ObjectID qid, const unsigned char[:] value):
"""support zero-copy bytes, bytearray, array of unsigned char"""
cdef:
CObjectID native_id = qid.data
uint64_t msg_id
uint8_t *data = <uint8_t *>(&value[0])
uint32_t size = value.nbytes
with nogil:
msg_id = self.writer.WriteMessageToBufferRing(native_id, data, size)
return msg_id
def stop(self):
self.writer.Stop()
channel_logger.info("stopped DataWriter")
cdef class DataReader:
cdef:
CDataReader *reader
readonly bytes meta
readonly bytes data
def __init__(self):
raise Exception("use create() to create DataReader")
@staticmethod
def create(list py_input_queues,
list input_actor_ids: list[ActorID],
list py_seq_ids,
list py_msg_ids,
int64_t timer_interval,
c_bool is_recreate,
bytes config_bytes,
c_bool is_mock):
cdef:
c_vector[CObjectID] queue_id_vec = bytes_list_to_qid_vec(py_input_queues)
c_vector[CActorID] actor_ids
c_vector[uint64_t] seq_ids
c_vector[uint64_t] msg_ids
CDataReader *c_reader
cdef const unsigned char[:] config_data
for actor_id in input_actor_ids:
actor_ids.push_back((<ActorID>actor_id).data)
for py_seq_id in py_seq_ids:
seq_ids.push_back(<uint64_t>py_seq_id)
for py_msg_id in py_msg_ids:
msg_ids.push_back(<uint64_t>py_msg_id)
cdef shared_ptr[CRuntimeContext] ctx = make_shared[CRuntimeContext]()
if config_bytes:
config_data = config_bytes
channel_logger.info("load config, config bytes size: %s", config_data.nbytes)
ctx.get().SetConfig(<uint8_t *>(&(config_data[0])), config_data.nbytes)
if is_mock:
ctx.get().MarkMockTest()
c_reader = new CDataReader(ctx)
c_reader.Init(queue_id_vec, actor_ids, seq_ids, msg_ids, timer_interval)
channel_logger.info("create native reader succeed")
cdef DataReader reader = DataReader.__new__(DataReader)
reader.reader = c_reader
return reader
def __dealloc__(self):
if self.reader != NULL:
del self.reader
channel_logger.info("deleted DataReader")
self.reader = NULL
def read(self, uint32_t timeout_millis):
cdef:
shared_ptr[CDataBundle] bundle
CStreamingStatus status
with nogil:
status = self.reader.GetBundle(timeout_millis, bundle)
cdef uint32_t bundle_type = <uint32_t>(bundle.get().meta.get().GetBundleType())
if <uint32_t> status != <uint32_t> libstreaming.StatusOK:
if <uint32_t> status == <uint32_t> libstreaming.StatusInterrupted:
# avoid cyclic import
import ray.streaming.runtime.transfer as transfer
raise transfer.ChannelInterruptException("reader interrupted")
elif <uint32_t> status == <uint32_t> libstreaming.StatusInitQueueFailed:
raise Exception("init channel failed")
elif <uint32_t> status == <uint32_t> libstreaming.StatusWaitQueueTimeOut:
raise Exception("wait channel object timeout")
cdef:
uint32_t msg_nums
CObjectID queue_id
c_list[shared_ptr[CStreamingMessage]] msg_list
list msgs = []
uint64_t timestamp
uint64_t msg_id
if bundle_type == <uint32_t> libstreaming.BundleTypeBundle:
msg_nums = bundle.get().meta.get().GetMessageListSize()
CStreamingMessageBundle.GetMessageListFromRawData(
bundle.get().data + libstreaming.kMessageBundleHeaderSize,
bundle.get().data_size - libstreaming.kMessageBundleHeaderSize,
msg_nums,
msg_list)
timestamp = bundle.get().meta.get().GetMessageBundleTs()
for msg in msg_list:
msg_bytes = msg.get().RawData()[:msg.get().GetDataSize()]
qid_bytes = queue_id.Binary()
msg_id = msg.get().GetMessageSeqId()
msgs.append((msg_bytes, msg_id, timestamp, qid_bytes))
return msgs
elif bundle_type == <uint32_t> libstreaming.BundleTypeEmpty:
return []
else:
raise Exception("Unsupported bundle type {}".format(bundle_type))
def stop(self):
self.reader.Stop()
channel_logger.info("stopped DataReader")
cdef c_vector[CObjectID] bytes_list_to_qid_vec(list py_queue_ids) except *:
assert len(py_queue_ids) > 0
cdef:
c_vector[CObjectID] queue_id_vec
c_string q_id_data
for q_id in py_queue_ids:
q_id_data = q_id
assert q_id_data.size() == CObjectID.Size()
obj_id = CObjectID.FromBinary(q_id_data)
queue_id_vec.push_back(obj_id)
return queue_id_vec
cdef c_vector[c_string] qid_vector_to_list(c_vector[CObjectID] queue_id_vec):
queues = []
for obj_id in queue_id_vec:
queues.append(obj_id.Binary())
return queues
+124
View File
@@ -0,0 +1,124 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import pickle
import threading
import ray
import ray.streaming._streaming as _streaming
from ray.streaming.config import Config
from ray.function_manager import FunctionDescriptor
from ray.streaming.communication import DataInput, DataOutput
logger = logging.getLogger(__name__)
@ray.remote
class JobWorker(object):
"""A streaming job worker.
Attributes:
worker_id: The id of the instance.
input_channels: The input gate that manages input channels of
the instance (see: DataInput in communication.py).
output_channels (DataOutput): The output gate that manages output
channels of the instance (see: DataOutput in communication.py).
the operator instance.
"""
def __init__(self, worker_id, operator, input_channels, output_channels):
self.env = None
self.worker_id = worker_id
self.operator = operator
processor_name = operator.processor_class.__name__
processor_instance = operator.processor_class(operator)
self.processor_name = processor_name
self.processor_instance = processor_instance
self.input_channels = input_channels
self.output_channels = output_channels
self.input_gate = None
self.output_gate = None
self.reader_client = None
self.writer_client = None
def init(self, env):
"""init streaming actor"""
env = pickle.loads(env)
self.env = env
logger.info("init operator instance %s", self.processor_name)
if env.config.channel_type == Config.NATIVE_CHANNEL:
core_worker = ray.worker.global_worker.core_worker
reader_async_func = FunctionDescriptor(
__name__, self.on_reader_message.__name__,
self.__class__.__name__)
reader_sync_func = FunctionDescriptor(
__name__, self.on_reader_message_sync.__name__,
self.__class__.__name__)
self.reader_client = _streaming.ReaderClient(
core_worker, reader_async_func, reader_sync_func)
writer_async_func = FunctionDescriptor(
__name__, self.on_writer_message.__name__,
self.__class__.__name__)
writer_sync_func = FunctionDescriptor(
__name__, self.on_writer_message_sync.__name__,
self.__class__.__name__)
self.writer_client = _streaming.WriterClient(
core_worker, writer_async_func, writer_sync_func)
if len(self.input_channels) > 0:
self.input_gate = DataInput(env, self.input_channels)
self.input_gate.init()
if len(self.output_channels) > 0:
self.output_gate = DataOutput(
env, self.output_channels,
self.operator.partitioning_strategies)
self.output_gate.init()
logger.info("init operator instance %s succeed", self.processor_name)
return True
# Starts the actor
def start(self):
self.t = threading.Thread(target=self.run, daemon=True)
self.t.start()
actor_id = ray.worker.global_worker.actor_id
logger.info("%s %s started, actor id %s", self.__class__.__name__,
self.processor_name, actor_id)
def run(self):
logger.info("%s start running", self.processor_name)
self.processor_instance.run(self.input_gate, self.output_gate)
logger.info("%s finished running", self.processor_name)
self.close()
def close(self):
if self.input_gate:
self.input_gate.close()
if self.output_gate:
self.output_gate.close()
def is_finished(self):
return not self.t.is_alive()
def on_reader_message(self, buffer: bytes):
"""used in direct call mode"""
self.reader_client.on_reader_message(buffer)
def on_reader_message_sync(self, buffer: bytes):
"""used in direct call mode"""
if self.reader_client is None:
return b" " * 4 # special flag to indicate this actor not ready
result = self.reader_client.on_reader_message_sync(buffer)
return result.to_pybytes()
def on_writer_message(self, buffer: bytes):
"""used in direct call mode"""
self.writer_client.on_writer_message(buffer)
def on_writer_message_sync(self, buffer: bytes):
"""used in direct call mode"""
if self.writer_client is None:
return b" " * 4 # special flag to indicate this actor not ready
result = self.writer_client.on_writer_message_sync(buffer)
return result.to_pybytes()
+113
View File
@@ -0,0 +1,113 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import enum
import logging
import cloudpickle
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")
# Stream partitioning schemes
class PScheme(object):
def __init__(self, strategy, partition_fn=None):
self.strategy = strategy
self.partition_fn = partition_fn
def __repr__(self):
return "({},{})".format(self.strategy, self.partition_fn)
# Partitioning strategies
class PStrategy(enum.Enum):
Forward = 0 # Default
Shuffle = 1
Rescale = 2
RoundRobin = 3
Broadcast = 4
Custom = 5
ShuffleByKey = 6
# ...
# Operator types
class OpType(enum.Enum):
Source = 0
Map = 1
FlatMap = 2
Filter = 3
TimeWindow = 4
KeyBy = 5
Sink = 6
WindowJoin = 7
Inspect = 8
ReadTextFile = 9
Reduce = 10
Sum = 11
# ...
# A logical dataflow operator
class Operator(object):
def __init__(self,
id,
op_type,
processor_class,
name="",
logic=None,
num_instances=1,
other=None,
state_actor=None):
self.id = id
self.type = op_type
self.processor_class = processor_class
self.name = name
self._logic = cloudpickle.dumps(logic) # The operator's logic
self.num_instances = num_instances
# One partitioning strategy per downstream operator (default: forward)
self.partitioning_strategies = {}
self.other_args = other # Depends on the type of the operator
self.state_actor = state_actor # Actor to query state
# Sets the partitioning scheme for an output stream of the operator
def _set_partition_strategy(self,
stream_id,
partitioning_scheme,
dest_operator=None):
self.partitioning_strategies[stream_id] = (partitioning_scheme,
dest_operator)
# Retrieves the partitioning scheme for the given
# output stream of the operator
# Returns None is no strategy has been defined for the particular stream
def _get_partition_strategy(self, stream_id):
return self.partitioning_strategies.get(stream_id)
# Cleans metatada from all partitioning strategies that lack a
# destination operator
# Valid entries are re-organized as
# 'destination operator id -> partitioning scheme'
# Should be called only after the logical dataflow has been constructed
def _clean(self):
strategies = {}
for _, v in self.partitioning_strategies.items():
strategy, destination_operator = v
if destination_operator is not None:
strategies.setdefault(destination_operator, strategy)
self.partitioning_strategies = strategies
def print(self):
log = "Operator<\nID = {}\nName = {}\nprocessor_class = {}\n"
log += "Logic = {}\nNumber_of_Instances = {}\n"
log += "Partitioning_Scheme = {}\nOther_Args = {}>\n"
logger.debug(
log.format(self.id, self.name, self.processor_class, self.logic,
self.num_instances, self.partitioning_strategies,
self.other_args))
@property
def logic(self):
return cloudpickle.loads(self._logic)
+226
View File
@@ -0,0 +1,226 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import sys
import time
import types
logger = logging.getLogger(__name__)
logger.setLevel("INFO")
def _identity(element):
return element
class ReadTextFile:
"""A source operator instance that reads a text file line by line.
Attributes:
filepath (string): The path to the input file.
"""
def __init__(self, operator):
self.filepath = operator.other_args
# TODO (john): Handle possible exception here
self.reader = open(self.filepath, "r")
# Read input file line by line
def run(self, input_gate, output_gate):
while True:
record = self.reader.readline()
# Reader returns empty string ('') on EOF
if not record:
self.reader.close()
return
output_gate.push(
record[:-1]) # Push after removing newline characters
class Map:
"""A map operator instance that applies a user-defined
stream transformation.
A map produces exactly one output record for each record in
the input stream.
"""
def __init__(self, operator):
self.map_fn = operator.logic
# Applies the mapper each record of the input stream(s)
# and pushes resulting records to the output stream(s)
def run(self, input_gate, output_gate):
elements = 0
while True:
record = input_gate.pull()
if record is None:
return
output_gate.push(self.map_fn(record))
elements += 1
class FlatMap:
"""A map operator instance that applies a user-defined
stream transformation.
A flatmap produces one or more output records for each record in
the input stream.
Attributes:
flatmap_fn (function): The user-defined function.
"""
def __init__(self, operator):
self.flatmap_fn = operator.logic
# Applies the splitter to the records of the input stream(s)
# and pushes resulting records to the output stream(s)
def run(self, input_gate, output_gate):
while True:
record = input_gate.pull()
if record is None:
return
output_gate.push_all(self.flatmap_fn(record))
class Filter:
"""A filter operator instance that applies a user-defined filter to
each record of the stream.
Output records are those that pass the filter, i.e. those for which
the filter function returns True.
Attributes:
filter_fn (function): The user-defined boolean function.
"""
def __init__(self, operator):
self.filter_fn = operator.logic
# Applies the filter to the records of the input stream(s)
# and pushes resulting records to the output stream(s)
def run(self, input_gate, output_gate):
while True:
record = input_gate.pull()
if record is None:
return
if self.filter_fn(record):
output_gate.push(record)
class Inspect:
"""A inspect operator instance that inspects the content of the stream.
Inspect is useful for printing the records in the stream.
"""
def __init__(self, operator):
self.inspect_fn = operator.logic
def run(self, input_gate, output_gate):
# Applies the inspect logic (e.g. print) to the records of
# the input stream(s)
# and leaves stream unaffected by simply pushing the records to
# the output stream(s)
while True:
record = input_gate.pull()
if record is None:
return
if output_gate:
output_gate.push(record)
self.inspect_fn(record)
class Reduce:
"""A reduce operator instance that combines a new value for a key
with the last reduced one according to a user-defined logic.
"""
def __init__(self, operator):
self.reduce_fn = operator.logic
# Set the attribute selector
self.attribute_selector = operator.other_args
if self.attribute_selector is None:
self.attribute_selector = _identity
elif isinstance(self.attribute_selector, int):
self.key_index = self.attribute_selector
self.attribute_selector =\
lambda record: record[self.attribute_selector]
elif isinstance(self.attribute_selector, str):
self.attribute_selector =\
lambda record: vars(record)[self.attribute_selector]
elif not isinstance(self.attribute_selector, types.FunctionType):
sys.exit("Unrecognized or unsupported key selector.")
self.state = {} # key -> value
# Combines the input value for a key with the last reduced
# value for that key to produce a new value.
# Outputs the result as (key,new value)
def run(self, input_gate, output_gate):
while True:
record = input_gate.pull()
if record is None:
return
key, rest = record
new_value = self.attribute_selector(rest)
# TODO (john): Is there a way to update state with
# a single dictionary lookup?
try:
old_value = self.state[key]
new_value = self.reduce_fn(old_value, new_value)
self.state[key] = new_value
except KeyError: # Key does not exist in state
self.state.setdefault(key, new_value)
output_gate.push((key, new_value))
# Returns the state of the actor
def get_state(self):
return self.state
class KeyBy:
"""A key_by operator instance that physically partitions the
stream based on a key.
"""
def __init__(self, operator):
# Set the key selector
self.key_selector = operator.other_args
if isinstance(self.key_selector, int):
self.key_selector = lambda r: r[self.key_selector]
elif isinstance(self.key_selector, str):
self.key_selector = lambda record: vars(record)[self.key_selector]
elif not isinstance(self.key_selector, types.FunctionType):
sys.exit("Unrecognized or unsupported key selector.")
# The actual partitioning is done by the output gate
def run(self, input_gate, output_gate):
while True:
record = input_gate.pull()
if record is None:
return
key = self.key_selector(record)
output_gate.push((key, record))
# A custom source actor
class Source:
def __init__(self, operator):
# The user-defined source with a get_next() method
self.source = operator.logic
# Starts the source by calling get_next() repeatedly
def run(self, input_gate, output_gate):
start = time.time()
elements = 0
while True:
record = self.source.get_next()
if not record:
logger.debug("[writer] puts per second: {}".format(
elements / (time.time() - start)))
return
output_gate.push(record)
elements += 1
+291
View File
@@ -0,0 +1,291 @@
import logging
import random
from queue import Queue
from typing import List
import ray
import ray.streaming._streaming as _streaming
import ray.streaming.generated.streaming_pb2 as streaming_pb
from ray.actor import ActorHandle, ActorID
from ray.streaming.config import Config
CHANNEL_ID_LEN = 20
class ChannelID:
"""
ChannelID is used to identify a transfer channel between
a upstream worker and downstream worker.
"""
def __init__(self, channel_id_str: str):
"""
Args:
channel_id_str: string representation of channel id
"""
self.channel_id_str = channel_id_str
self.object_qid = ray.ObjectID(channel_id_str_to_bytes(channel_id_str))
def __eq__(self, other):
if other is None:
return False
if type(other) is ChannelID:
return self.channel_id_str == other.channel_id_str
else:
return False
def __hash__(self):
return hash(self.channel_id_str)
def __repr__(self):
return self.channel_id_str
@staticmethod
def gen_random_id():
"""Generate a random channel id string
"""
res = ""
for i in range(CHANNEL_ID_LEN * 2):
res += str(chr(random.randint(0, 5) + ord("A")))
return res
@staticmethod
def gen_id(from_index, to_index, ts):
"""Generate channel id, which is 20 character"""
channel_id = bytearray(20)
for i in range(11, 7, -1):
channel_id[i] = ts & 0xff
ts >>= 8
channel_id[16] = (from_index & 0xffff) >> 8
channel_id[17] = (from_index & 0xff)
channel_id[18] = (to_index & 0xffff) >> 8
channel_id[19] = (to_index & 0xff)
return channel_bytes_to_str(bytes(channel_id))
def channel_id_str_to_bytes(channel_id_str):
"""
Args:
channel_id_str: string representation of channel id
Returns:
bytes representation of channel id
"""
assert type(channel_id_str) in [str, bytes]
if isinstance(channel_id_str, bytes):
return channel_id_str
qid_bytes = bytes.fromhex(channel_id_str)
assert len(qid_bytes) == CHANNEL_ID_LEN
return qid_bytes
def channel_bytes_to_str(id_bytes):
"""
Args:
id_bytes: bytes representation of channel id
Returns:
string representation of channel id
"""
assert type(id_bytes) in [str, bytes]
if isinstance(id_bytes, str):
return id_bytes
return bytes.hex(id_bytes)
class DataMessage:
"""
DataMessage represents data between upstream and downstream operator
"""
def __init__(self,
body,
timestamp,
channel_id,
message_id_,
is_empty_message=False):
self.__body = body
self.__timestamp = timestamp
self.__channel_id = channel_id
self.__message_id = message_id_
self.__is_empty_message = is_empty_message
def __len__(self):
return len(self.__body)
def body(self):
"""Message data"""
return self.__body
def timestamp(self):
"""Get timestamp when item is written by upstream DataWriter
"""
return self.__timestamp
def channel_id(self):
"""Get string id of channel where data is coming from
"""
return self.__channel_id
def is_empty_message(self):
"""Whether this message is an empty message.
Upstream DataWriter will send an empty message when this is no data
in specified interval.
"""
return self.__is_empty_message
@property
def message_id(self):
return self.__message_id
logger = logging.getLogger(__name__)
class DataWriter:
"""Data Writer is a wrapper of streaming c++ DataWriter, which sends data
to downstream workers
"""
def __init__(self, output_channels, to_actors: List[ActorHandle],
conf: dict):
"""Get DataWriter of output channels
Args:
output_channels: output channels ids
to_actors: downstream output actors
Returns:
DataWriter
"""
assert len(output_channels) > 0
py_output_channels = [
channel_id_str_to_bytes(qid_str) for qid_str in output_channels
]
output_actor_ids: List[ActorID] = [
handle._ray_actor_id for handle in to_actors
]
channel_size = conf.get(Config.CHANNEL_SIZE,
Config.CHANNEL_SIZE_DEFAULT)
py_msg_ids = [0 for _ in range(len(output_channels))]
config_bytes = _to_native_conf(conf)
is_mock = conf[Config.CHANNEL_TYPE] == Config.MEMORY_CHANNEL
self.writer = _streaming.DataWriter.create(
py_output_channels, output_actor_ids, channel_size, py_msg_ids,
config_bytes, is_mock)
logger.info("create DataWriter succeed")
def write(self, channel_id: ChannelID, item: bytes):
"""Write data into native channel
Args:
channel_id: channel id
item: bytes data
Returns:
msg_id
"""
assert type(item) == bytes
msg_id = self.writer.write(channel_id.object_qid, item)
return msg_id
def stop(self):
logger.info("stopping channel writer.")
self.writer.stop()
# destruct DataWriter
self.writer = None
def close(self):
logger.info("closing channel writer.")
class DataReader:
"""Data Reader is wrapper of streaming c++ DataReader, which read data
from channels of upstream workers
"""
def __init__(self, input_channels: List, from_actors: List[ActorHandle],
conf: dict):
"""Get DataReader of input channels
Args:
input_channels: input channels
from_actors: upstream input actors
Returns:
DataReader
"""
assert len(input_channels) > 0
py_input_channels = [
channel_id_str_to_bytes(qid_str) for qid_str in input_channels
]
input_actor_ids: List[ActorID] = [
handle._ray_actor_id for handle in from_actors
]
py_seq_ids = [0 for _ in range(len(input_channels))]
py_msg_ids = [0 for _ in range(len(input_channels))]
timer_interval = int(conf.get(Config.TIMER_INTERVAL_MS, -1))
is_recreate = bool(conf.get(Config.IS_RECREATE, False))
config_bytes = _to_native_conf(conf)
self.__queue = Queue(10000)
is_mock = conf[Config.CHANNEL_TYPE] == Config.MEMORY_CHANNEL
self.reader = _streaming.DataReader.create(
py_input_channels, input_actor_ids, py_seq_ids, py_msg_ids,
timer_interval, is_recreate, config_bytes, is_mock)
logger.info("create DataReader succeed")
def read(self, timeout_millis):
"""Read data from channel
Args:
timeout_millis: timeout millis when there is no data in channel
for this duration
Returns:
channel item
"""
if self.__queue.empty():
msgs = self.reader.read(timeout_millis)
for msg in msgs:
msg_bytes, msg_id, timestamp, qid_bytes = msg
data_msg = DataMessage(msg_bytes, timestamp,
channel_bytes_to_str(qid_bytes), msg_id)
self.__queue.put(data_msg)
if self.__queue.empty():
return None
return self.__queue.get()
def stop(self):
logger.info("stopping Data Reader.")
self.reader.stop()
# destruct DataReader
self.reader = None
def close(self):
logger.info("closing Data Reader.")
def _to_native_conf(conf):
config = streaming_pb.StreamingConfig()
if Config.STREAMING_JOB_NAME in conf:
config.job_name = conf[Config.STREAMING_JOB_NAME]
if Config.TASK_JOB_ID in conf:
job_id = conf[Config.TASK_JOB_ID]
config.task_job_id = job_id.hex()
if Config.STREAMING_WORKER_NAME in conf:
config.worker_name = conf[Config.STREAMING_WORKER_NAME]
if Config.STREAMING_OP_NAME in conf:
config.op_name = conf[Config.STREAMING_OP_NAME]
# TODO set operator type
if Config.STREAMING_RING_BUFFER_CAPACITY in conf:
config.ring_buffer_capacity = \
conf[Config.STREAMING_RING_BUFFER_CAPACITY]
if Config.STREAMING_EMPTY_MESSAGE_INTERVAL in conf:
config.empty_message_interval = \
conf[Config.STREAMING_EMPTY_MESSAGE_INTERVAL]
logger.info("conf: %s", str(config))
return config.SerializeToString()
class ChannelInitException(Exception):
def __init__(self, msg, abnormal_channels):
self.abnormal_channels = abnormal_channels
self.msg = msg
class ChannelInterruptException(Exception):
def __init__(self, msg=None):
self.msg = msg
+693
View File
@@ -0,0 +1,693 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import pickle
import sys
import time
import networkx as nx
import ray
import ray.streaming.processor as processor
import ray.streaming.runtime.transfer as transfer
from ray.streaming.communication import DataChannel
from ray.streaming.config import Config
from ray.streaming.jobworker import JobWorker
from ray.streaming.operator import Operator, OpType
from ray.streaming.operator import PScheme, PStrategy
logger = logging.getLogger(__name__)
logger.setLevel("INFO")
# Rolling sum's logic
def _sum(value_1, value_2):
return value_1 + value_2
# Partitioning strategies that require all-to-all instance communication
all_to_all_strategies = [
PStrategy.Shuffle, PStrategy.ShuffleByKey, PStrategy.Broadcast,
PStrategy.RoundRobin
]
# Environment configuration
class Conf(object):
"""Environment configuration.
This class includes all information about the configuration of the
streaming environment.
"""
def __init__(self, parallelism=1, channel_type=Config.MEMORY_CHANNEL):
self.parallelism = parallelism
self.channel_type = channel_type
# ...
class ExecutionGraph:
def __init__(self, env):
self.env = env
self.physical_topo = nx.DiGraph() # DAG
# Handles to all actors in the physical dataflow
self.actor_handles = []
# (op_id, op_instance_index) -> ActorID
self.actors_map = {}
# execution graph build time: milliseconds since epoch
self.build_time = 0
self.task_id_counter = 0
self.task_ids = {}
self.input_channels = {} # operator id -> input channels
self.output_channels = {} # operator id -> output channels
# Constructs and deploys a Ray actor of a specific type
# TODO (john): Actor placement information should be specified in
# the environment's configuration
def __generate_actor(self, instance_index, operator, input_channels,
output_channels):
"""Generates an actor that will execute a particular instance of
the logical operator
Attributes:
instance_index: The index of the instance the actor will execute.
operator: The metadata of the logical operator.
input_channels: The input channels of the instance.
output_channels The output channels of the instance.
"""
worker_id = (operator.id, instance_index)
# Record the physical dataflow graph (for debugging purposes)
self.__add_channel(worker_id, output_channels)
# Note direct_call only support pass by value
return JobWorker._remote(
args=[worker_id, operator, input_channels, output_channels],
is_direct_call=True)
# Constructs and deploys a Ray actor for each instance of
# the given operator
def __generate_actors(self, operator, upstream_channels,
downstream_channels):
"""Generates one actor for each instance of the given logical
operator.
Attributes:
operator (Operator): The logical operator metadata.
upstream_channels (list): A list of all upstream channels for
all instances of the operator.
downstream_channels (list): A list of all downstream channels
for all instances of the operator.
"""
num_instances = operator.num_instances
logger.info("Generating {} actors of type {}...".format(
num_instances, operator.type))
handles = []
for i in range(num_instances):
# Collect input and output channels for the particular instance
ip = [c for c in upstream_channels if c.dst_instance_index == i]
op = [c for c in downstream_channels if c.src_instance_index == i]
log = "Constructed {} input and {} output channels "
log += "for the {}-th instance of the {} operator."
logger.debug(log.format(len(ip), len(op), i, operator.type))
handle = self.__generate_actor(i, operator, ip, op)
if handle:
handles.append(handle)
self.actors_map[(operator.id, i)] = handle
return handles
# Adds a channel/edge to the physical dataflow graph
def __add_channel(self, actor_id, output_channels):
for c in output_channels:
dest_actor_id = (c.dst_operator_id, c.dst_instance_index)
self.physical_topo.add_edge(actor_id, dest_actor_id)
# Generates all required data channels between an operator
# and its downstream operators
def _generate_channels(self, operator):
"""Generates all output data channels
(see: DataChannel in communication.py) for all instances of
the given logical operator.
The function constructs one data channel for each pair of
communicating operator instances (instance_1,instance_2),
where instance_1 is an instance of the given operator and instance_2
is an instance of a direct downstream operator.
The number of total channels generated depends on the partitioning
strategy specified by the user.
"""
channels = {} # destination operator id -> channels
strategies = operator.partitioning_strategies
for dst_operator, p_scheme in strategies.items():
num_dest_instances = self.env.operators[dst_operator].num_instances
entry = channels.setdefault(dst_operator, [])
if p_scheme.strategy == PStrategy.Forward:
for i in range(operator.num_instances):
# ID of destination instance to connect
id = i % num_dest_instances
qid = self._gen_str_qid(operator.id, i, dst_operator, id)
c = DataChannel(operator.id, i, dst_operator, id, qid)
entry.append(c)
elif p_scheme.strategy in all_to_all_strategies:
for i in range(operator.num_instances):
for j in range(num_dest_instances):
qid = self._gen_str_qid(operator.id, i, dst_operator,
j)
c = DataChannel(operator.id, i, dst_operator, j, qid)
entry.append(c)
else:
# TODO (john): Add support for other partitioning strategies
sys.exit("Unrecognized or unsupported partitioning strategy.")
return channels
def _gen_str_qid(self, src_operator_id, src_instance_index,
dst_operator_id, dst_instance_index):
from_task_id = self.env.execution_graph.get_task_id(
src_operator_id, src_instance_index)
to_task_id = self.env.execution_graph.get_task_id(
dst_operator_id, dst_instance_index)
return transfer.ChannelID.gen_id(from_task_id, to_task_id,
self.build_time)
def _gen_task_id(self):
task_id = self.task_id_counter
self.task_id_counter += 1
return task_id
def get_task_id(self, op_id, op_instance_id):
return self.task_ids[(op_id, op_instance_id)]
def get_actor(self, op_id, op_instance_id):
return self.actors_map[(op_id, op_instance_id)]
# Prints the physical dataflow graph
def print_physical_graph(self):
logger.info("===================================")
logger.info("======Physical Dataflow Graph======")
logger.info("===================================")
# Print all data channels between operator instances
log = "(Source Operator ID,Source Operator Name,Source Instance ID)"
log += " --> "
log += "(Destination Operator ID,Destination Operator Name,"
log += "Destination Instance ID)"
logger.info(log)
for src_actor_id, dst_actor_id in self.physical_topo.edges:
src_operator_id, src_instance_index = src_actor_id
dst_operator_id, dst_instance_index = dst_actor_id
logger.info("({},{},{}) --> ({},{},{})".format(
src_operator_id, self.env.operators[src_operator_id].name,
src_instance_index, dst_operator_id,
self.env.operators[dst_operator_id].name, dst_instance_index))
def build_graph(self):
self.build_channels()
# to support cyclic reference serialization
try:
ray.register_custom_serializer(Environment, use_pickle=True)
ray.register_custom_serializer(ExecutionGraph, use_pickle=True)
ray.register_custom_serializer(OpType, use_pickle=True)
ray.register_custom_serializer(PStrategy, use_pickle=True)
except Exception:
# local mode can't use pickle
pass
# Each operator instance is implemented as a Ray actor
# Actors are deployed in topological order, as we traverse the
# logical dataflow from sources to sinks.
for node in nx.topological_sort(self.env.logical_topo):
operator = self.env.operators[node]
# Instantiate Ray actors
handles = self.__generate_actors(
operator, self.input_channels.get(node, []),
self.output_channels.get(node, []))
if handles:
self.actor_handles.extend(handles)
def build_channels(self):
self.build_time = int(time.time() * 1000)
# gen auto-incremented unique task id for every operator instance
for node in nx.topological_sort(self.env.logical_topo):
operator = self.env.operators[node]
for i in range(operator.num_instances):
operator_instance_id = (operator.id, i)
self.task_ids[operator_instance_id] = self._gen_task_id()
channels = {}
for node in nx.topological_sort(self.env.logical_topo):
operator = self.env.operators[node]
# Generate downstream data channels
downstream_channels = self._generate_channels(operator)
channels[node] = downstream_channels
# op_id -> channels
input_channels = {}
output_channels = {}
for op_id, all_downstream_channels in channels.items():
for dst_op_channels in all_downstream_channels.values():
for c in dst_op_channels:
dst = input_channels.setdefault(c.dst_operator_id, [])
dst.append(c)
src = output_channels.setdefault(c.src_operator_id, [])
src.append(c)
self.input_channels = input_channels
self.output_channels = output_channels
# The execution environment for a streaming job
class Environment(object):
"""A streaming environment.
This class is responsible for constructing the logical and the
physical dataflow.
Attributes:
logical_topo (DiGraph): The user-defined logical topology in
NetworkX DiGRaph format.
(See: https://networkx.github.io)
physical_topo (DiGraph): The physical topology in NetworkX
DiGRaph format. The physical dataflow is constructed by the
environment based on logical_topo.
operators (dict): A mapping from operator ids to operator metadata
(See: Operator in operator.py).
config (Config): The environment's configuration.
topo_cleaned (bool): A flag that indicates whether the logical
topology is garbage collected (True) or not (False).
actor_handles (list): A list of all Ray actor handles that execute
the streaming dataflow.
"""
def __init__(self, config=Conf()):
self.logical_topo = nx.DiGraph() # DAG
self.operators = {} # operator id --> operator object
self.config = config # Environment's configuration
self.topo_cleaned = False
self.operator_id_counter = 0
self.execution_graph = None # set when executed
def gen_operator_id(self):
op_id = self.operator_id_counter
self.operator_id_counter += 1
return op_id
# An edge denotes a flow of data between logical operators
# and may correspond to multiple data channels in the physical dataflow
def _add_edge(self, source, destination):
self.logical_topo.add_edge(source, destination)
# Cleans the logical dataflow graph to construct and
# deploy the physical dataflow
def _collect_garbage(self):
if self.topo_cleaned is True:
return
for node in self.logical_topo:
self.operators[node]._clean()
self.topo_cleaned = True
# Sets the level of parallelism for a registered operator
# Overwrites the environment parallelism (if set)
def _set_parallelism(self, operator_id, level_of_parallelism):
self.operators[operator_id].num_instances = level_of_parallelism
# Sets the same level of parallelism for all operators in the environment
def set_parallelism(self, parallelism):
self.config.parallelism = parallelism
# Creates and registers a user-defined data source
# TODO (john): There should be different types of sources, e.g. sources
# reading from Kafka, text files, etc.
# TODO (john): Handle case where environment parallelism is set
def source(self, source):
source_id = self.gen_operator_id()
source_stream = DataStream(self, source_id)
self.operators[source_id] = Operator(
source_id, OpType.Source, processor.Source, "Source", logic=source)
return source_stream
# Creates and registers a new data source that reads a
# text file line by line
# TODO (john): There should be different types of sources,
# e.g. sources reading from Kafka, text files, etc.
# TODO (john): Handle case where environment parallelism is set
def read_text_file(self, filepath):
source_id = self.gen_operator_id()
source_stream = DataStream(self, source_id)
self.operators[source_id] = Operator(
source_id,
OpType.ReadTextFile,
processor.ReadTextFile,
"Read Text File",
other=filepath)
return source_stream
# Constructs and deploys the physical dataflow
def execute(self):
"""Deploys and executes the physical dataflow."""
self._collect_garbage() # Make sure everything is clean
# TODO (john): Check if dataflow has any 'logical inconsistencies'
# For example, if there is a forward partitioning strategy but
# the number of downstream instances is larger than the number of
# upstream instances, some of the downstream instances will not be
# used at all
self.execution_graph = ExecutionGraph(self)
self.execution_graph.build_graph()
logger.info("init...")
# init
init_waits = []
for actor_handle in self.execution_graph.actor_handles:
init_waits.append(actor_handle.init.remote(pickle.dumps(self)))
for wait in init_waits:
assert ray.get(wait) is True
logger.info("running...")
# start
exec_handles = []
for actor_handle in self.execution_graph.actor_handles:
exec_handles.append(actor_handle.start.remote())
return exec_handles
def wait_finish(self):
for actor_handle in self.execution_graph.actor_handles:
while not ray.get(actor_handle.is_finished.remote()):
time.sleep(1)
# Prints the logical dataflow graph
def print_logical_graph(self):
self._collect_garbage()
logger.info("==================================")
logger.info("======Logical Dataflow Graph======")
logger.info("==================================")
# Print operators in topological order
for node in nx.topological_sort(self.logical_topo):
downstream_neighbors = list(self.logical_topo.neighbors(node))
logger.info("======Current Operator======")
operator = self.operators[node]
operator.print()
logger.info("======Downstream Operators======")
if len(downstream_neighbors) == 0:
logger.info("None\n")
for downstream_node in downstream_neighbors:
self.operators[downstream_node].print()
# TODO (john): We also need KeyedDataStream and WindowedDataStream as
# subclasses of DataStream to prevent ill-defined logical dataflows
# A DataStream corresponds to an edge in the logical dataflow
class DataStream(object):
"""A data stream.
This class contains all information about a logical stream, i.e. an edge
in the logical topology. It is the main class exposed to the user.
Attributes:
id (UUID): The id of the stream
env (Environment): The environment the stream belongs to.
src_operator_id (UUID): The id of the source operator of the stream.
dst_operator_id (UUID): The id of the destination operator of the
stream.
is_partitioned (bool): Denotes if there is a partitioning strategy
(e.g. shuffle) for the stream or not (default stategy: Forward).
"""
stream_id_counter = 0
def __init__(self,
environment,
source_id=None,
dest_id=None,
is_partitioned=False):
self.env = environment
self.id = DataStream.stream_id_counter
DataStream.stream_id_counter += 1
self.src_operator_id = source_id
self.dst_operator_id = dest_id
# True if a partitioning strategy for this stream exists,
# false otherwise
self.is_partitioned = is_partitioned
# Generates a new stream after a data transformation is applied
def __expand(self):
stream = DataStream(self.env)
assert (self.dst_operator_id is not None)
stream.src_operator_id = self.dst_operator_id
stream.dst_operator_id = None
return stream
# Assigns the partitioning strategy to a new 'open-ended' stream
# and returns the stream. At this point, the partitioning strategy
# is not associated with any destination operator. We expect this to
# be done later, as we continue assembling the dataflow graph
def __partition(self, strategy, partition_fn=None):
scheme = PScheme(strategy, partition_fn)
source_operator = self.env.operators[self.src_operator_id]
new_stream = DataStream(
self.env, source_id=source_operator.id, is_partitioned=True)
source_operator._set_partition_strategy(new_stream.id, scheme)
return new_stream
# Registers the operator to the environment and returns a new
# 'open-ended' stream. The registered operator serves as the destination
# of the previously 'open' stream
def __register(self, operator):
"""Registers the given logical operator to the environment and
connects it to its upstream operator (if any).
A call to this function adds a new edge to the logical topology.
Attributes:
operator (Operator): The metadata of the logical operator.
"""
self.env.operators[operator.id] = operator
self.dst_operator_id = operator.id
logger.debug("Adding new dataflow edge ({},{}) --> ({},{})".format(
self.src_operator_id,
self.env.operators[self.src_operator_id].name,
self.dst_operator_id,
self.env.operators[self.dst_operator_id].name))
# Update logical dataflow graphs
self.env._add_edge(self.src_operator_id, self.dst_operator_id)
# Keep track of the partitioning strategy and the destination operator
src_operator = self.env.operators[self.src_operator_id]
if self.is_partitioned is True:
partitioning, _ = src_operator._get_partition_strategy(self.id)
src_operator._set_partition_strategy(self.id, partitioning,
operator.id)
elif src_operator.type == OpType.KeyBy:
# Set the output partitioning strategy to shuffle by key
partitioning = PScheme(PStrategy.ShuffleByKey)
src_operator._set_partition_strategy(self.id, partitioning,
operator.id)
else: # No partitioning strategy has been defined - set default
partitioning = PScheme(PStrategy.Forward)
src_operator._set_partition_strategy(self.id, partitioning,
operator.id)
return self.__expand()
# Sets the level of parallelism for an operator, i.e. its total
# number of instances. Each operator instance corresponds to an actor
# in the physical dataflow
def set_parallelism(self, num_instances):
"""Sets the number of instances for the source operator of the stream.
Attributes:
num_instances (int): The level of parallelism for the source
operator of the stream.
"""
assert (num_instances > 0)
self.env._set_parallelism(self.src_operator_id, num_instances)
return self
# Stream Partitioning Strategies #
# TODO (john): Currently, only forward (default), shuffle,
# and broadcast are supported
# Hash-based record shuffling
def shuffle(self):
"""Registers a shuffling partitioning strategy for the stream."""
return self.__partition(PStrategy.Shuffle)
# Broadcasts each record to all downstream instances
def broadcast(self):
"""Registers a broadcast partitioning strategy for the stream."""
return self.__partition(PStrategy.Broadcast)
# Rescales load to downstream instances
def rescale(self):
"""Registers a rescale partitioning strategy for the stream.
Same as Flink's rescale (see: https://ci.apache.org/projects/flink/
flink-docs-stable/dev/stream/operators/#physical-partitioning).
"""
return self.__partition(PStrategy.Rescale)
# Round-robin partitioning
def round_robin(self):
"""Registers a round-robin partitioning strategy for the stream."""
return self.__partition(PStrategy.RoundRobin)
# User-defined partitioning
def partition(self, partition_fn):
"""Registers a user-defined partitioning strategy for the stream.
Attributes:
partition_fn (function): The user-defined partitioning function.
"""
return self.__partition(PStrategy.Custom, partition_fn)
# Data Trasnformations #
# TODO (john): Expand set of supported operators.
# TODO (john): To support event-time windows we need a mechanism for
# generating and processing watermarks
# Registers map operator to the environment
def map(self, map_fn, name="Map"):
"""Applies a map operator to the stream.
Attributes:
map_fn (function): The user-defined logic of the map.
"""
op = Operator(
self.env.gen_operator_id(),
OpType.Map,
processor.Map,
name,
map_fn,
num_instances=self.env.config.parallelism)
return self.__register(op)
# Registers flatmap operator to the environment
def flat_map(self, flatmap_fn):
"""Applies a flatmap operator to the stream.
Attributes:
flatmap_fn (function): The user-defined logic of the flatmap
(e.g. split()).
"""
op = Operator(
self.env.gen_operator_id(),
OpType.FlatMap,
processor.FlatMap,
"FlatMap",
flatmap_fn,
num_instances=self.env.config.parallelism)
return self.__register(op)
# Registers keyBy operator to the environment
# TODO (john): This should returned a KeyedDataStream
def key_by(self, key_selector):
"""Applies a key_by operator to the stream.
Attributes:
key_attribute_index (int): The index of the key attributed
(assuming tuple records).
"""
op = Operator(
self.env.gen_operator_id(),
OpType.KeyBy,
processor.KeyBy,
"KeyBy",
other=key_selector,
num_instances=self.env.config.parallelism)
return self.__register(op)
# Registers Reduce operator to the environment
def reduce(self, reduce_fn):
"""Applies a rolling sum operator to the stream.
Attributes:
sum_attribute_index (int): The index of the attribute to sum
(assuming tuple records).
"""
op = Operator(
self.env.gen_operator_id(),
OpType.Reduce,
processor.Reduce,
"Sum",
reduce_fn,
num_instances=self.env.config.parallelism)
return self.__register(op)
# Registers Sum operator to the environment
def sum(self, attribute_selector, state_keeper=None):
"""Applies a rolling sum operator to the stream.
Attributes:
sum_attribute_index (int): The index of the attribute to sum
(assuming tuple records).
"""
op = Operator(
self.env.gen_operator_id(),
OpType.Sum,
processor.Reduce,
"Sum",
_sum,
other=attribute_selector,
state_actor=state_keeper,
num_instances=self.env.config.parallelism)
return self.__register(op)
# Registers window operator to the environment.
# This is a system time window
# TODO (john): This should return a WindowedDataStream
def time_window(self, window_width_ms):
"""Applies a system time window to the stream.
Attributes:
window_width_ms (int): The length of the window in ms.
"""
raise Exception("time_window is unsupported")
# Registers filter operator to the environment
def filter(self, filter_fn):
"""Applies a filter to the stream.
Attributes:
filter_fn (function): The user-defined filter function.
"""
op = Operator(
self.env.gen_operator_id(),
OpType.Filter,
processor.Filter,
"Filter",
filter_fn,
num_instances=self.env.config.parallelism)
return self.__register(op)
# TODO (john): Registers window join operator to the environment
def window_join(self, other_stream, join_attribute, window_width):
op = Operator(
self.env.gen_operator_id(),
OpType.WindowJoin,
processor.WindowJoin,
"WindowJoin",
num_instances=self.env.config.parallelism)
return self.__register(op)
# Registers inspect operator to the environment
def inspect(self, inspect_logic):
"""Inspects the content of the stream.
Attributes:
inspect_logic (function): The user-defined inspect function.
"""
op = Operator(
self.env.gen_operator_id(),
OpType.Inspect,
processor.Inspect,
"Inspect",
inspect_logic,
num_instances=self.env.config.parallelism)
return self.__register(op)
# Registers sink operator to the environment
# TODO (john): A sink now just drops records but it should be able to
# export data to other systems
def sink(self):
"""Closes the stream with a sink operator."""
op = Operator(
self.env.gen_operator_id(),
OpType.Sink,
processor.Sink,
"Sink",
num_instances=self.env.config.parallelism)
return self.__register(op)
View File
@@ -0,0 +1,127 @@
import pickle
import threading
import time
import ray
import ray.streaming._streaming as _streaming
import ray.streaming.runtime.transfer as transfer
from ray.function_manager import FunctionDescriptor
from ray.streaming.config import Config
@ray.remote
class Worker:
def __init__(self):
core_worker = ray.worker.global_worker.core_worker
writer_async_func = FunctionDescriptor(
__name__, self.on_writer_message.__name__, self.__class__.__name__)
writer_sync_func = FunctionDescriptor(
__name__, self.on_writer_message_sync.__name__,
self.__class__.__name__)
self.writer_client = _streaming.WriterClient(
core_worker, writer_async_func, writer_sync_func)
reader_async_func = FunctionDescriptor(
__name__, self.on_reader_message.__name__, self.__class__.__name__)
reader_sync_func = FunctionDescriptor(
__name__, self.on_reader_message_sync.__name__,
self.__class__.__name__)
self.reader_client = _streaming.ReaderClient(
core_worker, reader_async_func, reader_sync_func)
self.writer = None
self.output_channel_id = None
self.reader = None
def init_writer(self, output_channel, reader_actor):
conf = {
Config.TASK_JOB_ID: ray.runtime_context._get_runtime_context()
.current_driver_id,
Config.CHANNEL_TYPE: Config.NATIVE_CHANNEL
}
self.writer = transfer.DataWriter([output_channel],
[pickle.loads(reader_actor)], conf)
self.output_channel_id = transfer.ChannelID(output_channel)
def init_reader(self, input_channel, writer_actor):
conf = {
Config.TASK_JOB_ID: ray.runtime_context._get_runtime_context()
.current_driver_id,
Config.CHANNEL_TYPE: Config.NATIVE_CHANNEL
}
self.reader = transfer.DataReader([input_channel],
[pickle.loads(writer_actor)], conf)
def start_write(self, msg_nums):
self.t = threading.Thread(
target=self.run_writer, args=[msg_nums], daemon=True)
self.t.start()
def run_writer(self, msg_nums):
for i in range(msg_nums):
self.writer.write(self.output_channel_id, pickle.dumps(i))
print("WriterWorker done.")
def start_read(self, msg_nums):
self.t = threading.Thread(
target=self.run_reader, args=[msg_nums], daemon=True)
self.t.start()
def run_reader(self, msg_nums):
count = 0
msg = None
while count != msg_nums:
item = self.reader.read(100)
if item is None:
time.sleep(0.01)
else:
msg = pickle.loads(item.body())
count += 1
assert msg == msg_nums - 1
print("ReaderWorker done.")
def is_finished(self):
return not self.t.is_alive()
def on_reader_message(self, buffer: bytes):
"""used in direct call mode"""
self.reader_client.on_reader_message(buffer)
def on_reader_message_sync(self, buffer: bytes):
"""used in direct call mode"""
if self.reader_client is None:
return b" " * 4 # special flag to indicate this actor not ready
result = self.reader_client.on_reader_message_sync(buffer)
return result.to_pybytes()
def on_writer_message(self, buffer: bytes):
"""used in direct call mode"""
self.writer_client.on_writer_message(buffer)
def on_writer_message_sync(self, buffer: bytes):
"""used in direct call mode"""
if self.writer_client is None:
return b" " * 4 # special flag to indicate this actor not ready
result = self.writer_client.on_writer_message_sync(buffer)
return result.to_pybytes()
def test_queue():
ray.init()
writer = Worker._remote(is_direct_call=True)
reader = Worker._remote(is_direct_call=True)
channel_id_str = transfer.ChannelID.gen_random_id()
inits = [
writer.init_writer.remote(channel_id_str, pickle.dumps(reader)),
reader.init_reader.remote(channel_id_str, pickle.dumps(writer))
]
ray.get(inits)
msg_nums = 1000
print("start read/write")
reader.start_read.remote(msg_nums)
writer.start_write.remote(msg_nums)
while not ray.get(reader.is_finished.remote()):
time.sleep(0.1)
ray.shutdown()
if __name__ == "__main__":
test_queue()
@@ -0,0 +1,210 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.streaming.streaming import Environment, ExecutionGraph
from ray.streaming.operator import OpType, PStrategy
def test_parallelism():
"""Tests operator parallelism."""
env = Environment()
# Try setting a common parallelism for all operators
env.set_parallelism(2)
stream = env.source(None).map(None).filter(None).flat_map(None)
env._collect_garbage()
for operator in env.operators.values():
if operator.type == OpType.Source:
# TODO (john): Currently each source has only one instance
assert operator.num_instances == 1, (operator.num_instances, 1)
else:
assert operator.num_instances == 2, (operator.num_instances, 2)
# Check again after adding an operator with different parallelism
stream.map(None, "Map1").shuffle().set_parallelism(3).map(
None, "Map2").set_parallelism(4)
env._collect_garbage()
for operator in env.operators.values():
if operator.type == OpType.Source:
assert operator.num_instances == 1, (operator.num_instances, 1)
elif operator.name != "Map1" and operator.name != "Map2":
assert operator.num_instances == 2, (operator.num_instances, 2)
elif operator.name != "Map2":
assert operator.num_instances == 3, (operator.num_instances, 3)
else:
assert operator.num_instances == 4, (operator.num_instances, 4)
def test_partitioning():
"""Tests stream partitioning."""
env = Environment()
# Try defining multiple partitioning strategies for the same stream
_ = env.source(None).shuffle().rescale().broadcast().map(
None).broadcast().shuffle()
env._collect_garbage()
for operator in env.operators.values():
p_schemes = operator.partitioning_strategies
for scheme in p_schemes.values():
# Only last defined strategy should be kept
if operator.type == OpType.Source:
assert scheme.strategy == PStrategy.Broadcast, (
scheme.strategy, PStrategy.Broadcast)
else:
assert scheme.strategy == PStrategy.Shuffle, (
scheme.strategy, PStrategy.Shuffle)
def test_forking():
"""Tests stream forking."""
env = Environment()
# Try forking a stream
stream = env.source(None).map(None).set_parallelism(2)
# First branch with a shuffle partitioning strategy
_ = stream.shuffle().key_by(0).sum(1)
# Second branch with the default partitioning strategy
_ = stream.key_by(1).sum(2)
env._collect_garbage()
# Operator ids
source_id = None
map_id = None
keyby1_id = None
keyby2_id = None
sum1_id = None
sum2_id = None
# Collect ids
for id, operator in env.operators.items():
if operator.type == OpType.Source:
source_id = id
elif operator.type == OpType.Map:
map_id = id
elif operator.type == OpType.KeyBy:
if operator.other_args == 0:
keyby1_id = id
else:
assert operator.other_args == 1, (operator.other_args, 1)
keyby2_id = id
elif operator.type == OpType.Sum:
if operator.other_args == 1:
sum1_id = id
else:
assert operator.other_args == 2, (operator.other_args, 2)
sum2_id = id
# Check generated streams and their partitioning
for source, destination in env.logical_topo.edges:
operator = env.operators[source]
if source == source_id:
assert destination == map_id, (destination, map_id)
elif source == map_id:
p_scheme = operator.partitioning_strategies[destination]
strategy = p_scheme.strategy
key_index = env.operators[destination].other_args
if key_index == 0: # This must be the first branch
assert strategy == PStrategy.Shuffle, (strategy,
PStrategy.Shuffle)
assert destination == keyby1_id, (destination, keyby1_id)
else: # This must be the second branch
assert key_index == 1, (key_index, 1)
assert strategy == PStrategy.Forward, (strategy,
PStrategy.Forward)
assert destination == keyby2_id, (destination, keyby2_id)
elif source == keyby1_id or source == keyby2_id:
p_scheme = operator.partitioning_strategies[destination]
strategy = p_scheme.strategy
key_index = env.operators[destination].other_args
if key_index == 1: # This must be the first branch
assert strategy == PStrategy.ShuffleByKey, (
strategy, PStrategy.ShuffleByKey)
assert destination == sum1_id, (destination, sum1_id)
else: # This must be the second branch
assert key_index == 2, (key_index, 2)
assert strategy == PStrategy.ShuffleByKey, (
strategy, PStrategy.ShuffleByKey)
assert destination == sum2_id, (destination, sum2_id)
else: # This must be a sum operator
assert operator.type == OpType.Sum, (operator.type, OpType.Sum)
def _test_shuffle_channels():
"""Tests shuffling connectivity."""
env = Environment()
# Try defining a shuffle
_ = env.source(None).shuffle().map(None).set_parallelism(4)
expected = [(0, 0), (0, 1), (0, 2), (0, 3)]
_test_channels(env, expected)
def _test_forward_channels():
"""Tests forward connectivity."""
env = Environment()
# Try the default partitioning strategy
_ = env.source(None).set_parallelism(4).map(None).set_parallelism(2)
expected = [(0, 0), (1, 1), (2, 0), (3, 1)]
_test_channels(env, expected)
def _test_broadcast_channels():
"""Tests broadcast connectivity."""
env = Environment()
# Try broadcasting
_ = env.source(None).set_parallelism(4).broadcast().map(
None).set_parallelism(2)
expected = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1), (3, 0), (3, 1)]
_test_channels(env, expected)
def _test_round_robin_channels():
"""Tests round-robin connectivity."""
env = Environment()
# Try broadcasting
_ = env.source(None).round_robin().map(None).set_parallelism(2)
expected = [(0, 0), (0, 1)]
_test_channels(env, expected)
def _test_channels(environment, expected_channels):
"""Tests operator connectivity."""
environment._collect_garbage()
map_id = None
# Get id
for id, operator in environment.operators.items():
if operator.type == OpType.Map:
map_id = id
# Collect channels
environment.execution_graph = ExecutionGraph(environment)
environment.execution_graph.build_channels()
channels_per_destination = []
for operator in environment.operators.values():
channels_per_destination.append(
environment.execution_graph._generate_channels(operator))
# Check actual connectivity
actual = []
for destination in channels_per_destination:
for channels in destination.values():
for channel in channels:
src_instance_index = channel.src_instance_index
dst_instance_index = channel.dst_instance_index
connection = (src_instance_index, dst_instance_index)
assert channel.dst_operator_id == map_id, (
channel.dst_operator_id, map_id)
actual.append(connection)
# Make sure connections are as expected
set_1 = set(expected_channels)
set_2 = set(actual)
assert set_1 == set_2, (set_1, set_2)
def test_channel_generation():
"""Tests data channel generation."""
_test_shuffle_channels()
_test_broadcast_channels()
_test_round_robin_channels()
_test_forward_channels()
# TODO (john): Add simple wordcount test
def test_wordcount():
"""Tests a simple streaming wordcount."""
pass
if __name__ == "__main__":
test_channel_generation()
+20
View File
@@ -0,0 +1,20 @@
import ray
from ray.streaming.config import Config
from ray.streaming.streaming import Environment, Conf
def test_word_count():
ray.init()
env = Environment(config=Conf(channel_type=Config.NATIVE_CHANNEL))
env.read_text_file(__file__) \
.set_parallelism(1) \
.filter(lambda x: "word" in x) \
.inspect(lambda x: print("result", x))
env_handle = env.execute()
ray.get(env_handle) # Stay alive until execution finishes
env.wait_finish()
ray.shutdown()
if __name__ == "__main__":
test_word_count()