mirror of
https://github.com/wassname/ray.git
synced 2026-07-04 17:05:44 +08:00
[Streaming] Streaming data transfer and python integration (#6185)
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
Streaming Library
|
||||
=================
|
||||
|
||||
Dependencies:
|
||||
|
||||
Install NetworkX: ``pip install networkx``
|
||||
|
||||
Examples:
|
||||
|
||||
- simple.py: A simple example with stateless operators and different parallelism per stage.
|
||||
|
||||
Run ``python simple.py --input-file toy.txt``
|
||||
|
||||
- wordcount.py: A streaming wordcount example with a stateful operator (rolling sum).
|
||||
|
||||
Run ``python wordcount.py --titles-file articles.txt``
|
||||
@@ -0,0 +1,3 @@
|
||||
# flake8: noqa
|
||||
# Ray should be imported before streaming
|
||||
import ray
|
||||
@@ -0,0 +1,6 @@
|
||||
# cython: profile=False
|
||||
# distutils: language = c++
|
||||
# cython: embedsignature = True
|
||||
# cython: language_level = 3
|
||||
|
||||
include "includes/transfer.pxi"
|
||||
@@ -0,0 +1,283 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import pickle
|
||||
import sys
|
||||
import time
|
||||
|
||||
import ray
|
||||
import ray.streaming.runtime.transfer as transfer
|
||||
from ray.streaming.config import Config
|
||||
from ray.streaming.operator import PStrategy
|
||||
from ray.streaming.runtime.transfer import ChannelID
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
# Forward and broadcast stream partitioning strategies
|
||||
forward_broadcast_strategies = [PStrategy.Forward, PStrategy.Broadcast]
|
||||
|
||||
|
||||
# Used to choose output channel in case of hash-based shuffling
|
||||
def _hash(value):
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
try:
|
||||
return int(hashlib.sha1(value.encode("utf-8")).hexdigest(), 16)
|
||||
except AttributeError:
|
||||
return int(hashlib.sha1(value).hexdigest(), 16)
|
||||
|
||||
|
||||
class DataChannel(object):
|
||||
"""A data channel for actor-to-actor communication.
|
||||
|
||||
Attributes:
|
||||
env (Environment): The environment the channel belongs to.
|
||||
src_operator_id (UUID): The id of the source operator of the channel.
|
||||
src_instance_index (int): The id of the source instance.
|
||||
dst_operator_id (UUID): The id of the destination operator of the
|
||||
channel.
|
||||
dst_instance_index (int): The id of the destination instance.
|
||||
"""
|
||||
|
||||
def __init__(self, src_operator_id, src_instance_index, dst_operator_id,
|
||||
dst_instance_index, str_qid):
|
||||
self.src_operator_id = src_operator_id
|
||||
self.src_instance_index = src_instance_index
|
||||
self.dst_operator_id = dst_operator_id
|
||||
self.dst_instance_index = dst_instance_index
|
||||
self.str_qid = str_qid
|
||||
self.qid = ChannelID(str_qid)
|
||||
|
||||
def __repr__(self):
|
||||
return "(src({},{}),dst({},{}), qid({}))".format(
|
||||
self.src_operator_id, self.src_instance_index,
|
||||
self.dst_operator_id, self.dst_instance_index, self.str_qid)
|
||||
|
||||
|
||||
_CLOSE_FLAG = b" "
|
||||
|
||||
|
||||
# Pulls and merges data from multiple input channels
|
||||
class DataInput(object):
|
||||
"""An input gate of an operator instance.
|
||||
|
||||
The input gate pulls records from all input channels in a round-robin
|
||||
fashion.
|
||||
|
||||
Attributes:
|
||||
input_channels (list): The list of input channels.
|
||||
channel_index (int): The index of the next channel to pull from.
|
||||
max_index (int): The number of input channels.
|
||||
closed (list): A list of flags indicating whether an input channel
|
||||
has been marked as 'closed'.
|
||||
all_closed (bool): Denotes whether all input channels have been
|
||||
closed (True) or not (False).
|
||||
"""
|
||||
|
||||
def __init__(self, env, channels):
|
||||
assert len(channels) > 0
|
||||
self.env = env
|
||||
self.reader = None # created in `init` method
|
||||
self.input_channels = channels
|
||||
self.channel_index = 0
|
||||
self.max_index = len(channels)
|
||||
# Tracks the channels that have been closed. qid: close status
|
||||
self.closed = {}
|
||||
|
||||
def init(self):
|
||||
channels = [c.str_qid for c in self.input_channels]
|
||||
input_actors = []
|
||||
for c in self.input_channels:
|
||||
actor = self.env.execution_graph.get_actor(c.src_operator_id,
|
||||
c.src_instance_index)
|
||||
input_actors.append(actor)
|
||||
logger.info("DataInput input_actors %s", input_actors)
|
||||
conf = {
|
||||
Config.TASK_JOB_ID: ray.runtime_context._get_runtime_context()
|
||||
.current_driver_id,
|
||||
Config.CHANNEL_TYPE: self.env.config.channel_type
|
||||
}
|
||||
self.reader = transfer.DataReader(channels, input_actors, conf)
|
||||
|
||||
def pull(self):
|
||||
# pull from channel
|
||||
item = self.reader.read(100)
|
||||
while item is None:
|
||||
time.sleep(0.001)
|
||||
item = self.reader.read(100)
|
||||
msg_data = item.body()
|
||||
if msg_data == _CLOSE_FLAG:
|
||||
self.closed[item.channel_id] = True
|
||||
if len(self.closed) == len(self.input_channels):
|
||||
return None
|
||||
else:
|
||||
return self.pull()
|
||||
else:
|
||||
return pickle.loads(msg_data)
|
||||
|
||||
def close(self):
|
||||
self.reader.stop()
|
||||
|
||||
|
||||
# Selects output channel(s) and pushes data
|
||||
class DataOutput(object):
|
||||
"""An output gate of an operator instance.
|
||||
|
||||
The output gate pushes records to output channels according to the
|
||||
user-defined partitioning scheme.
|
||||
|
||||
Attributes:
|
||||
partitioning_schemes (dict): A mapping from destination operator ids
|
||||
to partitioning schemes (see: PScheme in operator.py).
|
||||
forward_channels (list): A list of channels to forward records.
|
||||
shuffle_channels (list(list)): A list of output channels to shuffle
|
||||
records grouped by destination operator.
|
||||
shuffle_key_channels (list(list)): A list of output channels to
|
||||
shuffle records by a key grouped by destination operator.
|
||||
shuffle_exists (bool): A flag indicating that there exists at least
|
||||
one shuffle_channel.
|
||||
shuffle_key_exists (bool): A flag indicating that there exists at
|
||||
least one shuffle_key_channel.
|
||||
"""
|
||||
|
||||
def __init__(self, env, channels, partitioning_schemes):
|
||||
assert len(channels) > 0
|
||||
self.env = env
|
||||
self.writer = None # created in `init` method
|
||||
self.channels = channels
|
||||
self.key_selector = None
|
||||
self.round_robin_indexes = [0]
|
||||
self.partitioning_schemes = partitioning_schemes
|
||||
# Prepare output -- collect channels by type
|
||||
self.forward_channels = [] # Forward and broadcast channels
|
||||
slots = sum(1 for scheme in self.partitioning_schemes.values()
|
||||
if scheme.strategy == PStrategy.RoundRobin)
|
||||
self.round_robin_channels = [[]] * slots # RoundRobin channels
|
||||
self.round_robin_indexes = [-1] * slots
|
||||
slots = sum(1 for scheme in self.partitioning_schemes.values()
|
||||
if scheme.strategy == PStrategy.Shuffle)
|
||||
# Flag used to avoid hashing when there is no shuffling
|
||||
self.shuffle_exists = slots > 0
|
||||
self.shuffle_channels = [[]] * slots # Shuffle channels
|
||||
slots = sum(1 for scheme in self.partitioning_schemes.values()
|
||||
if scheme.strategy == PStrategy.ShuffleByKey)
|
||||
# Flag used to avoid hashing when there is no shuffling by key
|
||||
self.shuffle_key_exists = slots > 0
|
||||
self.shuffle_key_channels = [[]] * slots # Shuffle by key channels
|
||||
# Distinct shuffle destinations
|
||||
shuffle_destinations = {}
|
||||
# Distinct shuffle by key destinations
|
||||
shuffle_by_key_destinations = {}
|
||||
# Distinct round robin destinations
|
||||
round_robin_destinations = {}
|
||||
index_1 = 0
|
||||
index_2 = 0
|
||||
index_3 = 0
|
||||
for channel in channels:
|
||||
p_scheme = self.partitioning_schemes[channel.dst_operator_id]
|
||||
strategy = p_scheme.strategy
|
||||
if strategy in forward_broadcast_strategies:
|
||||
self.forward_channels.append(channel)
|
||||
elif strategy == PStrategy.Shuffle:
|
||||
pos = shuffle_destinations.setdefault(channel.dst_operator_id,
|
||||
index_1)
|
||||
self.shuffle_channels[pos].append(channel)
|
||||
if pos == index_1:
|
||||
index_1 += 1
|
||||
elif strategy == PStrategy.ShuffleByKey:
|
||||
pos = shuffle_by_key_destinations.setdefault(
|
||||
channel.dst_operator_id, index_2)
|
||||
self.shuffle_key_channels[pos].append(channel)
|
||||
if pos == index_2:
|
||||
index_2 += 1
|
||||
elif strategy == PStrategy.RoundRobin:
|
||||
pos = round_robin_destinations.setdefault(
|
||||
channel.dst_operator_id, index_3)
|
||||
self.round_robin_channels[pos].append(channel)
|
||||
if pos == index_3:
|
||||
index_3 += 1
|
||||
else: # TODO (john): Add support for other strategies
|
||||
sys.exit("Unrecognized or unsupported partitioning strategy.")
|
||||
# A KeyedDataStream can only be shuffled by key
|
||||
assert not (self.shuffle_exists and self.shuffle_key_exists)
|
||||
|
||||
def init(self):
|
||||
"""init DataOutput which creates DataWriter"""
|
||||
channel_ids = [c.str_qid for c in self.channels]
|
||||
to_actors = []
|
||||
for c in self.channels:
|
||||
actor = self.env.execution_graph.get_actor(c.dst_operator_id,
|
||||
c.dst_instance_index)
|
||||
to_actors.append(actor)
|
||||
logger.info("DataOutput output_actors %s", to_actors)
|
||||
|
||||
conf = {
|
||||
Config.TASK_JOB_ID: ray.runtime_context._get_runtime_context()
|
||||
.current_driver_id,
|
||||
Config.CHANNEL_TYPE: self.env.config.channel_type
|
||||
}
|
||||
self.writer = transfer.DataWriter(channel_ids, to_actors, conf)
|
||||
|
||||
def close(self):
|
||||
"""Close the channel (True) by propagating _CLOSE_FLAG
|
||||
|
||||
_CLOSE_FLAG is used as special type of record that is propagated from
|
||||
sources to sink to notify that the end of data in a stream.
|
||||
"""
|
||||
for c in self.channels:
|
||||
self.writer.write(c.qid, _CLOSE_FLAG)
|
||||
# must ensure DataWriter send None flag to peer actor
|
||||
self.writer.stop()
|
||||
|
||||
def push(self, record):
|
||||
target_channels = []
|
||||
# Forward record
|
||||
for c in self.forward_channels:
|
||||
logger.debug("[writer] Push record '{}' to channel {}".format(
|
||||
record, c))
|
||||
target_channels.append(c)
|
||||
# Forward record
|
||||
index = 0
|
||||
for channels in self.round_robin_channels:
|
||||
self.round_robin_indexes[index] += 1
|
||||
if self.round_robin_indexes[index] == len(channels):
|
||||
self.round_robin_indexes[index] = 0 # Reset index
|
||||
c = channels[self.round_robin_indexes[index]]
|
||||
logger.debug("[writer] Push record '{}' to channel {}".format(
|
||||
record, c))
|
||||
target_channels.append(c)
|
||||
index += 1
|
||||
# Hash-based shuffling by key
|
||||
if self.shuffle_key_exists:
|
||||
key, _ = record
|
||||
h = _hash(key)
|
||||
for channels in self.shuffle_key_channels:
|
||||
num_instances = len(channels) # Downstream instances
|
||||
c = channels[h % num_instances]
|
||||
logger.debug(
|
||||
"[key_shuffle] Push record '{}' to channel {}".format(
|
||||
record, c))
|
||||
target_channels.append(c)
|
||||
elif self.shuffle_exists: # Hash-based shuffling per destination
|
||||
h = _hash(record)
|
||||
for channels in self.shuffle_channels:
|
||||
num_instances = len(channels) # Downstream instances
|
||||
c = channels[h % num_instances]
|
||||
logger.debug("[shuffle] Push record '{}' to channel {}".format(
|
||||
record, c))
|
||||
target_channels.append(c)
|
||||
else: # TODO (john): Handle rescaling
|
||||
pass
|
||||
|
||||
msg_data = pickle.dumps(record)
|
||||
for c in target_channels:
|
||||
# send data to channel
|
||||
self.writer.write(c.qid, msg_data)
|
||||
|
||||
def push_all(self, records):
|
||||
for record in records:
|
||||
self.push(record)
|
||||
@@ -0,0 +1,23 @@
|
||||
class Config:
|
||||
STREAMING_JOB_NAME = "streaming.job.name"
|
||||
STREAMING_OP_NAME = "streaming.op_name"
|
||||
TASK_JOB_ID = "streaming.task_job_id"
|
||||
STREAMING_WORKER_NAME = "streaming.worker_name"
|
||||
# channel
|
||||
CHANNEL_TYPE = "channel_type"
|
||||
MEMORY_CHANNEL = "memory_channel"
|
||||
NATIVE_CHANNEL = "native_channel"
|
||||
CHANNEL_SIZE = "channel_size"
|
||||
CHANNEL_SIZE_DEFAULT = 10**8
|
||||
IS_RECREATE = "streaming.is_recreate"
|
||||
# return from StreamingReader.getBundle if only empty message read in this
|
||||
# interval.
|
||||
TIMER_INTERVAL_MS = "timer_interval_ms"
|
||||
|
||||
STREAMING_RING_BUFFER_CAPACITY = "streaming.ring_buffer_capacity"
|
||||
# write an empty message if there is no data to be written in this
|
||||
# interval.
|
||||
STREAMING_EMPTY_MESSAGE_INTERVAL = "streaming.empty_message_interval"
|
||||
|
||||
# operator type
|
||||
OPERATOR_TYPE = "operator_type"
|
||||
@@ -0,0 +1,8 @@
|
||||
New York City
|
||||
Berlin
|
||||
London
|
||||
Paris
|
||||
United States
|
||||
Germany
|
||||
France
|
||||
United Kingdom
|
||||
@@ -0,0 +1,71 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.streaming.streaming import Environment
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--input-file", required=True, help="the input text file")
|
||||
|
||||
|
||||
# A class used to check attribute-based key selection
|
||||
class Record(object):
|
||||
def __init__(self, record):
|
||||
k, _ = record
|
||||
self.word = k
|
||||
self.record = record
|
||||
|
||||
|
||||
# Splits input line into words and outputs objects of type Record
|
||||
# each one consisting of a key (word) and a tuple (word,1)
|
||||
def splitter(line):
|
||||
records = []
|
||||
words = line.split()
|
||||
for w in words:
|
||||
records.append(Record((w, 1)))
|
||||
return records
|
||||
|
||||
|
||||
# Receives an object of type Record and returns the actual tuple
|
||||
def as_tuple(record):
|
||||
return record.record
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Get program parameters
|
||||
args = parser.parse_args()
|
||||
input_file = str(args.input_file)
|
||||
|
||||
ray.init()
|
||||
ray.register_custom_serializer(Record, use_dict=True)
|
||||
|
||||
# A Ray streaming environment with the default configuration
|
||||
env = Environment()
|
||||
env.set_parallelism(2) # Each operator will be executed by two actors
|
||||
|
||||
# 'key_by("word")' physically partitions the stream of records
|
||||
# based on the hash value of the 'word' attribute (see Record class above)
|
||||
# 'map(as_tuple)' maps a record of type Record into a tuple
|
||||
# 'sum(1)' sums the 2nd element of the tuple, i.e. the word count
|
||||
stream = env.read_text_file(input_file) \
|
||||
.round_robin() \
|
||||
.flat_map(splitter) \
|
||||
.key_by("word") \
|
||||
.map(as_tuple) \
|
||||
.sum(1) \
|
||||
.inspect(print) # Prints the content of the
|
||||
# stream to stdout
|
||||
start = time.time()
|
||||
env_handle = env.execute() # Deploys and executes the dataflow
|
||||
ray.get(env_handle) # Stay alive until execution finishes
|
||||
end = time.time()
|
||||
logger.info("Elapsed time: {} secs".format(end - start))
|
||||
logger.debug("Output stream id: {}".format(stream.id))
|
||||
@@ -0,0 +1,56 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.streaming.config import Config
|
||||
from ray.streaming.streaming import Environment, Conf
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--input-file", required=True, help="the input text file")
|
||||
|
||||
|
||||
# Test functions
|
||||
def splitter(line):
|
||||
return line.split()
|
||||
|
||||
|
||||
def filter_fn(word):
|
||||
if "f" in word:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
ray.init(local_mode=False)
|
||||
|
||||
# A Ray streaming environment with the default configuration
|
||||
env = Environment(config=Conf(channel_type=Config.NATIVE_CHANNEL))
|
||||
|
||||
# Stream represents the ouput of the filter and
|
||||
# can be forked into other dataflows
|
||||
stream = env.read_text_file(args.input_file) \
|
||||
.shuffle() \
|
||||
.flat_map(splitter) \
|
||||
.set_parallelism(2) \
|
||||
.filter(filter_fn) \
|
||||
.set_parallelism(2) \
|
||||
.inspect(lambda x: print("result", x)) # Prints the contents of the
|
||||
# stream to stdout
|
||||
start = time.time()
|
||||
env_handle = env.execute()
|
||||
ray.get(env_handle) # Stay alive until execution finishes
|
||||
env.wait_finish()
|
||||
end = time.time()
|
||||
logger.info("Elapsed time: {} secs".format(end - start))
|
||||
logger.debug("Output stream id: {}".format(stream.id))
|
||||
@@ -0,0 +1,5 @@
|
||||
This is
|
||||
a test file
|
||||
to test if example
|
||||
works
|
||||
fine
|
||||
@@ -0,0 +1,109 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
|
||||
import ray
|
||||
import wikipedia
|
||||
from ray.streaming.streaming import Environment
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--titles-file",
|
||||
required=True,
|
||||
help="the file containing the wikipedia titles to lookup")
|
||||
|
||||
|
||||
# A custom data source that reads articles from wikipedia
|
||||
# Custom data sources need to implement a get_next() method
|
||||
# that returns the next data element, in this case sentences
|
||||
class Wikipedia(object):
|
||||
def __init__(self, title_file):
|
||||
# Titles in this file will be as queries
|
||||
self.title_file = title_file
|
||||
# TODO (john): Handle possible exception here
|
||||
self.title_reader = iter(list(open(self.title_file, "r").readlines()))
|
||||
self.done = False
|
||||
self.article_done = True
|
||||
self.sentences = iter([])
|
||||
|
||||
# Returns next sentence from a wikipedia article
|
||||
def get_next(self):
|
||||
if self.done:
|
||||
return None # Source exhausted
|
||||
while True:
|
||||
if self.article_done:
|
||||
try: # Try next title
|
||||
next_title = next(self.title_reader)
|
||||
except StopIteration:
|
||||
self.done = True # Source exhausted
|
||||
return None
|
||||
# Get next article
|
||||
logger.debug("Next article: {}".format(next_title))
|
||||
article = wikipedia.page(next_title).content
|
||||
# Split article in sentences
|
||||
self.sentences = iter(article.split("."))
|
||||
self.article_done = False
|
||||
try: # Try next sentence
|
||||
sentence = next(self.sentences)
|
||||
logger.debug("Next sentence: {}".format(sentence))
|
||||
return sentence
|
||||
except StopIteration:
|
||||
self.article_done = True
|
||||
|
||||
|
||||
# Splits input line into words and
|
||||
# outputs records of the form (word,1)
|
||||
def splitter(line):
|
||||
records = []
|
||||
words = line.split()
|
||||
for w in words:
|
||||
records.append((w, 1))
|
||||
return records
|
||||
|
||||
|
||||
# Returns the first attribute of a tuple
|
||||
def key_selector(tuple):
|
||||
return tuple[0]
|
||||
|
||||
|
||||
# Returns the second attribute of a tuple
|
||||
def attribute_selector(tuple):
|
||||
return tuple[1]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Get program parameters
|
||||
args = parser.parse_args()
|
||||
titles_file = str(args.titles_file)
|
||||
|
||||
ray.init()
|
||||
|
||||
# A Ray streaming environment with the default configuration
|
||||
env = Environment()
|
||||
env.set_parallelism(2) # Each operator will be executed by two actors
|
||||
|
||||
# The following dataflow is a simple streaming wordcount
|
||||
# with a rolling sum operator.
|
||||
# It reads articles from wikipedia, splits them in words,
|
||||
# shuffles words, and counts the occurences of each word.
|
||||
stream = env.source(Wikipedia(titles_file)) \
|
||||
.round_robin() \
|
||||
.flat_map(splitter) \
|
||||
.key_by(key_selector) \
|
||||
.sum(attribute_selector) \
|
||||
.inspect(print) # Prints the contents of the
|
||||
# stream to stdout
|
||||
start = time.time()
|
||||
env_handle = env.execute() # Deploys and executes the dataflow
|
||||
ray.get(env_handle) # Stay alive until execution finishes
|
||||
env.wait_finish()
|
||||
end = time.time()
|
||||
logger.info("Elapsed time: {} secs".format(end - start))
|
||||
logger.debug("Output stream id: {}".format(stream.id))
|
||||
@@ -0,0 +1,153 @@
|
||||
# cython: profile=False
|
||||
# distutils: language = c++
|
||||
# cython: embedsignature = True
|
||||
# cython: language_level = 3
|
||||
# flake8: noqa
|
||||
|
||||
from libc.stdint cimport *
|
||||
from libcpp cimport bool as c_bool
|
||||
from libcpp.memory cimport shared_ptr
|
||||
from libcpp.vector cimport vector as c_vector
|
||||
from libcpp.list cimport list as c_list
|
||||
from cpython cimport PyObject
|
||||
cimport cpython
|
||||
|
||||
cdef inline object PyObject_to_object(PyObject* o):
|
||||
# Cast to "object" increments reference count
|
||||
cdef object result = <object> o
|
||||
cpython.Py_DECREF(result)
|
||||
return result
|
||||
|
||||
from ray.includes.common cimport (
|
||||
CLanguage,
|
||||
CRayObject,
|
||||
CRayStatus,
|
||||
CRayFunction
|
||||
)
|
||||
|
||||
from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CJobID,
|
||||
CTaskID,
|
||||
CObjectID,
|
||||
)
|
||||
from ray.includes.libcoreworker cimport CCoreWorker
|
||||
|
||||
cdef extern from "status.h" namespace "ray::streaming" nogil:
|
||||
cdef cppclass CStreamingStatus "ray::streaming::StreamingStatus":
|
||||
pass
|
||||
cdef CStreamingStatus StatusOK "ray::streaming::StreamingStatus::OK"
|
||||
cdef CStreamingStatus StatusReconstructTimeOut "ray::streaming::StreamingStatus::ReconstructTimeOut"
|
||||
cdef CStreamingStatus StatusQueueIdNotFound "ray::streaming::StreamingStatus::QueueIdNotFound"
|
||||
cdef CStreamingStatus StatusResubscribeFailed "ray::streaming::StreamingStatus::ResubscribeFailed"
|
||||
cdef CStreamingStatus StatusEmptyRingBuffer "ray::streaming::StreamingStatus::EmptyRingBuffer"
|
||||
cdef CStreamingStatus StatusFullChannel "ray::streaming::StreamingStatus::FullChannel"
|
||||
cdef CStreamingStatus StatusNoSuchItem "ray::streaming::StreamingStatus::NoSuchItem"
|
||||
cdef CStreamingStatus StatusInitQueueFailed "ray::streaming::StreamingStatus::InitQueueFailed"
|
||||
cdef CStreamingStatus StatusGetBundleTimeOut "ray::streaming::StreamingStatus::GetBundleTimeOut"
|
||||
cdef CStreamingStatus StatusSkipSendEmptyMessage "ray::streaming::StreamingStatus::SkipSendEmptyMessage"
|
||||
cdef CStreamingStatus StatusInterrupted "ray::streaming::StreamingStatus::Interrupted"
|
||||
cdef CStreamingStatus StatusWaitQueueTimeOut "ray::streaming::StreamingStatus::WaitQueueTimeOut"
|
||||
cdef CStreamingStatus StatusOutOfMemory "ray::streaming::StreamingStatus::OutOfMemory"
|
||||
cdef CStreamingStatus StatusInvalid "ray::streaming::StreamingStatus::Invalid"
|
||||
cdef CStreamingStatus StatusUnknownError "ray::streaming::StreamingStatus::UnknownError"
|
||||
cdef CStreamingStatus StatusTailStatus "ray::streaming::StreamingStatus::TailStatus"
|
||||
|
||||
cdef cppclass CStreamingCommon "ray::streaming::StreamingCommon":
|
||||
void SetConfig(const uint8_t *, uint32_t size)
|
||||
|
||||
|
||||
cdef extern from "runtime_context.h" namespace "ray::streaming" nogil:
|
||||
cdef cppclass CRuntimeContext "ray::streaming::RuntimeContext":
|
||||
CRuntimeContext()
|
||||
void SetConfig(const uint8_t *data, uint32_t size)
|
||||
inline void MarkMockTest()
|
||||
inline c_bool IsMockTest()
|
||||
|
||||
cdef extern from "message/message.h" namespace "ray::streaming" nogil:
|
||||
cdef cppclass CStreamingMessageType "ray::streaming::StreamingMessageType":
|
||||
pass
|
||||
cdef CStreamingMessageType MessageTypeBarrier "ray::streaming::StreamingMessageType::Barrier"
|
||||
cdef CStreamingMessageType MessageTypeMessage "ray::streaming::StreamingMessageType::Message"
|
||||
cdef cppclass CStreamingMessage "ray::streaming::StreamingMessage":
|
||||
inline uint8_t *RawData() const
|
||||
inline uint32_t GetDataSize() const
|
||||
inline CStreamingMessageType GetMessageType() const
|
||||
inline uint64_t GetMessageSeqId() const
|
||||
|
||||
cdef extern from "message/message_bundle.h" namespace "ray::streaming" nogil:
|
||||
cdef cppclass CStreamingMessageBundleType "ray::streaming::StreamingMessageBundleType":
|
||||
pass
|
||||
cdef CStreamingMessageBundleType BundleTypeEmpty "ray::streaming::StreamingMessageBundleType::Empty"
|
||||
cdef CStreamingMessageBundleType BundleTypeBarrier "ray::streaming::StreamingMessageBundleType::Barrier"
|
||||
cdef CStreamingMessageBundleType BundleTypeBundle "ray::streaming::StreamingMessageBundleType::Bundle"
|
||||
|
||||
cdef cppclass CStreamingMessageBundleMeta "ray::streaming::StreamingMessageBundleMeta":
|
||||
CStreamingMessageBundleMeta()
|
||||
inline uint64_t GetMessageBundleTs() const
|
||||
inline uint64_t GetLastMessageId() const
|
||||
inline uint32_t GetMessageListSize() const
|
||||
inline CStreamingMessageBundleType GetBundleType() const
|
||||
inline c_bool IsBarrier()
|
||||
inline c_bool IsBundle()
|
||||
|
||||
ctypedef shared_ptr[CStreamingMessageBundleMeta] CStreamingMessageBundleMetaPtr
|
||||
uint32_t kMessageBundleHeaderSize "ray::streaming::kMessageBundleHeaderSize"
|
||||
cdef cppclass CStreamingMessageBundle "ray::streaming::StreamingMessageBundle"(CStreamingMessageBundleMeta):
|
||||
@staticmethod
|
||||
void GetMessageListFromRawData(const uint8_t *data, uint32_t size, uint32_t msg_nums,
|
||||
c_list[shared_ptr[CStreamingMessage]] &msg_list);
|
||||
|
||||
cdef extern from "queue/queue_client.h" namespace "ray::streaming" nogil:
|
||||
cdef cppclass CReaderClient "ray::streaming::ReaderClient":
|
||||
CReaderClient(CCoreWorker *core_worker,
|
||||
CRayFunction &async_func,
|
||||
CRayFunction &sync_func)
|
||||
void OnReaderMessage(shared_ptr[CLocalMemoryBuffer] buffer);
|
||||
shared_ptr[CLocalMemoryBuffer] OnReaderMessageSync(shared_ptr[CLocalMemoryBuffer] buffer);
|
||||
|
||||
cdef cppclass CWriterClient "ray::streaming::WriterClient":
|
||||
CWriterClient(CCoreWorker *core_worker,
|
||||
CRayFunction &async_func,
|
||||
CRayFunction &sync_func)
|
||||
void OnWriterMessage(shared_ptr[CLocalMemoryBuffer] buffer);
|
||||
shared_ptr[CLocalMemoryBuffer] OnWriterMessageSync(shared_ptr[CLocalMemoryBuffer] buffer);
|
||||
|
||||
|
||||
cdef extern from "data_reader.h" namespace "ray::streaming" nogil:
|
||||
cdef cppclass CDataBundle "ray::streaming::DataBundle":
|
||||
uint8_t *data
|
||||
uint32_t data_size
|
||||
CObjectID c_from "from"
|
||||
uint64_t seq_id
|
||||
CStreamingMessageBundleMetaPtr meta
|
||||
|
||||
cdef cppclass CDataReader "ray::streaming::DataReader"(CStreamingCommon):
|
||||
CDataReader(shared_ptr[CRuntimeContext] &runtime_context)
|
||||
void Init(const c_vector[CObjectID] &input_ids,
|
||||
const c_vector[CActorID] &actor_ids,
|
||||
const c_vector[uint64_t] &seq_ids,
|
||||
const c_vector[uint64_t] &msg_ids,
|
||||
int64_t timer_interval);
|
||||
CStreamingStatus GetBundle(const uint32_t timeout_ms,
|
||||
shared_ptr[CDataBundle] &message)
|
||||
void Stop()
|
||||
|
||||
|
||||
cdef extern from "data_writer.h" namespace "ray::streaming" nogil:
|
||||
cdef cppclass CDataWriter "ray::streaming::DataWriter"(CStreamingCommon):
|
||||
CDataWriter(shared_ptr[CRuntimeContext] &runtime_context)
|
||||
CStreamingStatus Init(const c_vector[CObjectID] &channel_ids,
|
||||
const c_vector[CActorID] &actor_ids,
|
||||
const c_vector[uint64_t] &message_ids,
|
||||
const c_vector[uint64_t] &queue_size_vec);
|
||||
long WriteMessageToBufferRing(
|
||||
const CObjectID &q_id, uint8_t *data, uint32_t data_size)
|
||||
void Run()
|
||||
void Stop()
|
||||
|
||||
|
||||
cdef extern from "ray/common/buffer.h" nogil:
|
||||
cdef cppclass CLocalMemoryBuffer "ray::LocalMemoryBuffer":
|
||||
uint8_t *Data() const
|
||||
size_t Size() const
|
||||
@@ -0,0 +1,323 @@
|
||||
# flake8: noqa
|
||||
|
||||
from libc.stdint cimport *
|
||||
from libcpp cimport bool as c_bool
|
||||
from libcpp.memory cimport shared_ptr, make_shared, dynamic_pointer_cast
|
||||
from libcpp.string cimport string as c_string
|
||||
from libcpp.vector cimport vector as c_vector
|
||||
from libcpp.list cimport list as c_list
|
||||
|
||||
from ray.includes.common cimport (
|
||||
CRayFunction,
|
||||
LANGUAGE_PYTHON,
|
||||
CBuffer
|
||||
)
|
||||
|
||||
from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CObjectID
|
||||
)
|
||||
from ray._raylet cimport (
|
||||
Buffer,
|
||||
CoreWorker,
|
||||
ActorID,
|
||||
ObjectID,
|
||||
string_vector_from_list
|
||||
)
|
||||
|
||||
from ray.includes.libcoreworker cimport CCoreWorker
|
||||
|
||||
cimport ray.streaming.includes.libstreaming as libstreaming
|
||||
from ray.streaming.includes.libstreaming cimport (
|
||||
CStreamingStatus,
|
||||
CStreamingMessage,
|
||||
CStreamingMessageBundle,
|
||||
CRuntimeContext,
|
||||
CDataBundle,
|
||||
CDataWriter,
|
||||
CDataReader,
|
||||
CReaderClient,
|
||||
CWriterClient,
|
||||
CLocalMemoryBuffer,
|
||||
)
|
||||
|
||||
import logging
|
||||
from ray.function_manager import FunctionDescriptor
|
||||
|
||||
|
||||
channel_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
cdef class ReaderClient:
|
||||
cdef:
|
||||
CReaderClient *client
|
||||
|
||||
def __cinit__(self,
|
||||
CoreWorker worker,
|
||||
async_func: FunctionDescriptor,
|
||||
sync_func: FunctionDescriptor):
|
||||
cdef:
|
||||
CCoreWorker *core_worker = worker.core_worker.get()
|
||||
CRayFunction async_native_func
|
||||
CRayFunction sync_native_func
|
||||
async_native_func = CRayFunction(
|
||||
LANGUAGE_PYTHON, string_vector_from_list(async_func.get_function_descriptor_list()))
|
||||
sync_native_func = CRayFunction(
|
||||
LANGUAGE_PYTHON, string_vector_from_list(sync_func.get_function_descriptor_list()))
|
||||
self.client = new CReaderClient(core_worker, async_native_func, sync_native_func)
|
||||
|
||||
def __dealloc__(self):
|
||||
del self.client
|
||||
self.client = NULL
|
||||
|
||||
def on_reader_message(self, const unsigned char[:] value):
|
||||
cdef:
|
||||
size_t size = value.nbytes
|
||||
shared_ptr[CLocalMemoryBuffer] local_buf = \
|
||||
make_shared[CLocalMemoryBuffer](<uint8_t *>(&value[0]), size, True)
|
||||
with nogil:
|
||||
self.client.OnReaderMessage(local_buf)
|
||||
|
||||
def on_reader_message_sync(self, const unsigned char[:] value):
|
||||
cdef:
|
||||
size_t size = value.nbytes
|
||||
shared_ptr[CLocalMemoryBuffer] local_buf = \
|
||||
make_shared[CLocalMemoryBuffer](<uint8_t *>(&value[0]), size, True)
|
||||
shared_ptr[CLocalMemoryBuffer] result_buffer
|
||||
with nogil:
|
||||
result_buffer = self.client.OnReaderMessageSync(local_buf)
|
||||
return Buffer.make(dynamic_pointer_cast[CBuffer, CLocalMemoryBuffer](result_buffer))
|
||||
|
||||
|
||||
cdef class WriterClient:
|
||||
cdef:
|
||||
CWriterClient * client
|
||||
|
||||
def __cinit__(self,
|
||||
CoreWorker worker,
|
||||
async_func: FunctionDescriptor,
|
||||
sync_func: FunctionDescriptor):
|
||||
cdef:
|
||||
CCoreWorker *core_worker = worker.core_worker.get()
|
||||
CRayFunction async_native_func
|
||||
CRayFunction sync_native_func
|
||||
async_native_func = CRayFunction(
|
||||
LANGUAGE_PYTHON, string_vector_from_list(async_func.get_function_descriptor_list()))
|
||||
sync_native_func = CRayFunction(
|
||||
LANGUAGE_PYTHON, string_vector_from_list(sync_func.get_function_descriptor_list()))
|
||||
self.client = new CWriterClient(core_worker, async_native_func, sync_native_func)
|
||||
|
||||
def __dealloc__(self):
|
||||
del self.client
|
||||
self.client = NULL
|
||||
|
||||
def on_writer_message(self, const unsigned char[:] value):
|
||||
cdef:
|
||||
size_t size = value.nbytes
|
||||
shared_ptr[CLocalMemoryBuffer] local_buf = \
|
||||
make_shared[CLocalMemoryBuffer](<uint8_t *>(&value[0]), size, True)
|
||||
with nogil:
|
||||
self.client.OnWriterMessage(local_buf)
|
||||
|
||||
def on_writer_message_sync(self, const unsigned char[:] value):
|
||||
cdef:
|
||||
size_t size = value.nbytes
|
||||
shared_ptr[CLocalMemoryBuffer] local_buf = \
|
||||
make_shared[CLocalMemoryBuffer](<uint8_t *>(&value[0]), size, True)
|
||||
shared_ptr[CLocalMemoryBuffer] result_buffer
|
||||
with nogil:
|
||||
result_buffer = self.client.OnWriterMessageSync(local_buf)
|
||||
return Buffer.make(dynamic_pointer_cast[CBuffer, CLocalMemoryBuffer](result_buffer))
|
||||
|
||||
|
||||
cdef class DataWriter:
|
||||
cdef:
|
||||
CDataWriter *writer
|
||||
|
||||
def __init__(self):
|
||||
raise Exception("use create() to create DataWriter")
|
||||
|
||||
@staticmethod
|
||||
def create(list py_output_channels,
|
||||
list output_actor_ids: list[ActorID],
|
||||
uint64_t queue_size,
|
||||
list py_msg_ids,
|
||||
bytes config_bytes,
|
||||
c_bool is_mock):
|
||||
cdef:
|
||||
c_vector[CObjectID] channel_ids = bytes_list_to_qid_vec(py_output_channels)
|
||||
c_vector[CActorID] actor_ids
|
||||
c_vector[uint64_t] msg_ids
|
||||
CDataWriter *c_writer
|
||||
cdef const unsigned char[:] config_data
|
||||
for actor_id in output_actor_ids:
|
||||
actor_ids.push_back((<ActorID>actor_id).data)
|
||||
for py_msg_id in py_msg_ids:
|
||||
msg_ids.push_back(<uint64_t>py_msg_id)
|
||||
|
||||
cdef shared_ptr[CRuntimeContext] ctx = make_shared[CRuntimeContext]()
|
||||
if is_mock:
|
||||
ctx.get().MarkMockTest()
|
||||
if config_bytes:
|
||||
config_data = config_bytes
|
||||
channel_logger.info("load config, config bytes size: %s", config_data.nbytes)
|
||||
ctx.get().SetConfig(<uint8_t *>(&config_data[0]), config_data.nbytes)
|
||||
c_writer = new CDataWriter(ctx)
|
||||
cdef:
|
||||
c_vector[CObjectID] remain_id_vec
|
||||
c_vector[uint64_t] queue_size_vec
|
||||
for i in range(channel_ids.size()):
|
||||
queue_size_vec.push_back(queue_size)
|
||||
cdef CStreamingStatus status = c_writer.Init(channel_ids, actor_ids, msg_ids, queue_size_vec)
|
||||
if remain_id_vec.size() != 0:
|
||||
channel_logger.warning("failed queue amounts => %s", remain_id_vec.size())
|
||||
if <uint32_t>status != <uint32_t> libstreaming.StatusOK:
|
||||
msg = "initialize writer failed, status={}".format(<uint32_t>status)
|
||||
channel_logger.error(msg)
|
||||
del c_writer
|
||||
import ray.streaming.runtime.transfer as transfer
|
||||
raise transfer.ChannelInitException(msg, qid_vector_to_list(remain_id_vec))
|
||||
|
||||
c_writer.Run()
|
||||
channel_logger.info("create native writer succeed")
|
||||
cdef DataWriter writer = DataWriter.__new__(DataWriter)
|
||||
writer.writer = c_writer
|
||||
return writer
|
||||
|
||||
def __dealloc__(self):
|
||||
if self.writer != NULL:
|
||||
del self.writer
|
||||
channel_logger.info("deleted DataWriter")
|
||||
self.writer = NULL
|
||||
|
||||
def write(self, ObjectID qid, const unsigned char[:] value):
|
||||
"""support zero-copy bytes, bytearray, array of unsigned char"""
|
||||
cdef:
|
||||
CObjectID native_id = qid.data
|
||||
uint64_t msg_id
|
||||
uint8_t *data = <uint8_t *>(&value[0])
|
||||
uint32_t size = value.nbytes
|
||||
with nogil:
|
||||
msg_id = self.writer.WriteMessageToBufferRing(native_id, data, size)
|
||||
return msg_id
|
||||
|
||||
def stop(self):
|
||||
self.writer.Stop()
|
||||
channel_logger.info("stopped DataWriter")
|
||||
|
||||
|
||||
cdef class DataReader:
|
||||
cdef:
|
||||
CDataReader *reader
|
||||
readonly bytes meta
|
||||
readonly bytes data
|
||||
|
||||
def __init__(self):
|
||||
raise Exception("use create() to create DataReader")
|
||||
|
||||
@staticmethod
|
||||
def create(list py_input_queues,
|
||||
list input_actor_ids: list[ActorID],
|
||||
list py_seq_ids,
|
||||
list py_msg_ids,
|
||||
int64_t timer_interval,
|
||||
c_bool is_recreate,
|
||||
bytes config_bytes,
|
||||
c_bool is_mock):
|
||||
cdef:
|
||||
c_vector[CObjectID] queue_id_vec = bytes_list_to_qid_vec(py_input_queues)
|
||||
c_vector[CActorID] actor_ids
|
||||
c_vector[uint64_t] seq_ids
|
||||
c_vector[uint64_t] msg_ids
|
||||
CDataReader *c_reader
|
||||
cdef const unsigned char[:] config_data
|
||||
for actor_id in input_actor_ids:
|
||||
actor_ids.push_back((<ActorID>actor_id).data)
|
||||
for py_seq_id in py_seq_ids:
|
||||
seq_ids.push_back(<uint64_t>py_seq_id)
|
||||
for py_msg_id in py_msg_ids:
|
||||
msg_ids.push_back(<uint64_t>py_msg_id)
|
||||
cdef shared_ptr[CRuntimeContext] ctx = make_shared[CRuntimeContext]()
|
||||
if config_bytes:
|
||||
config_data = config_bytes
|
||||
channel_logger.info("load config, config bytes size: %s", config_data.nbytes)
|
||||
ctx.get().SetConfig(<uint8_t *>(&(config_data[0])), config_data.nbytes)
|
||||
if is_mock:
|
||||
ctx.get().MarkMockTest()
|
||||
c_reader = new CDataReader(ctx)
|
||||
c_reader.Init(queue_id_vec, actor_ids, seq_ids, msg_ids, timer_interval)
|
||||
channel_logger.info("create native reader succeed")
|
||||
cdef DataReader reader = DataReader.__new__(DataReader)
|
||||
reader.reader = c_reader
|
||||
return reader
|
||||
|
||||
def __dealloc__(self):
|
||||
if self.reader != NULL:
|
||||
del self.reader
|
||||
channel_logger.info("deleted DataReader")
|
||||
self.reader = NULL
|
||||
|
||||
def read(self, uint32_t timeout_millis):
|
||||
cdef:
|
||||
shared_ptr[CDataBundle] bundle
|
||||
CStreamingStatus status
|
||||
with nogil:
|
||||
status = self.reader.GetBundle(timeout_millis, bundle)
|
||||
cdef uint32_t bundle_type = <uint32_t>(bundle.get().meta.get().GetBundleType())
|
||||
if <uint32_t> status != <uint32_t> libstreaming.StatusOK:
|
||||
if <uint32_t> status == <uint32_t> libstreaming.StatusInterrupted:
|
||||
# avoid cyclic import
|
||||
import ray.streaming.runtime.transfer as transfer
|
||||
raise transfer.ChannelInterruptException("reader interrupted")
|
||||
elif <uint32_t> status == <uint32_t> libstreaming.StatusInitQueueFailed:
|
||||
raise Exception("init channel failed")
|
||||
elif <uint32_t> status == <uint32_t> libstreaming.StatusWaitQueueTimeOut:
|
||||
raise Exception("wait channel object timeout")
|
||||
cdef:
|
||||
uint32_t msg_nums
|
||||
CObjectID queue_id
|
||||
c_list[shared_ptr[CStreamingMessage]] msg_list
|
||||
list msgs = []
|
||||
uint64_t timestamp
|
||||
uint64_t msg_id
|
||||
if bundle_type == <uint32_t> libstreaming.BundleTypeBundle:
|
||||
msg_nums = bundle.get().meta.get().GetMessageListSize()
|
||||
CStreamingMessageBundle.GetMessageListFromRawData(
|
||||
bundle.get().data + libstreaming.kMessageBundleHeaderSize,
|
||||
bundle.get().data_size - libstreaming.kMessageBundleHeaderSize,
|
||||
msg_nums,
|
||||
msg_list)
|
||||
timestamp = bundle.get().meta.get().GetMessageBundleTs()
|
||||
for msg in msg_list:
|
||||
msg_bytes = msg.get().RawData()[:msg.get().GetDataSize()]
|
||||
qid_bytes = queue_id.Binary()
|
||||
msg_id = msg.get().GetMessageSeqId()
|
||||
msgs.append((msg_bytes, msg_id, timestamp, qid_bytes))
|
||||
return msgs
|
||||
elif bundle_type == <uint32_t> libstreaming.BundleTypeEmpty:
|
||||
return []
|
||||
else:
|
||||
raise Exception("Unsupported bundle type {}".format(bundle_type))
|
||||
|
||||
def stop(self):
|
||||
self.reader.Stop()
|
||||
channel_logger.info("stopped DataReader")
|
||||
|
||||
|
||||
cdef c_vector[CObjectID] bytes_list_to_qid_vec(list py_queue_ids) except *:
|
||||
assert len(py_queue_ids) > 0
|
||||
cdef:
|
||||
c_vector[CObjectID] queue_id_vec
|
||||
c_string q_id_data
|
||||
for q_id in py_queue_ids:
|
||||
q_id_data = q_id
|
||||
assert q_id_data.size() == CObjectID.Size()
|
||||
obj_id = CObjectID.FromBinary(q_id_data)
|
||||
queue_id_vec.push_back(obj_id)
|
||||
return queue_id_vec
|
||||
|
||||
cdef c_vector[c_string] qid_vector_to_list(c_vector[CObjectID] queue_id_vec):
|
||||
queues = []
|
||||
for obj_id in queue_id_vec:
|
||||
queues.append(obj_id.Binary())
|
||||
return queues
|
||||
@@ -0,0 +1,124 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
import pickle
|
||||
import threading
|
||||
|
||||
import ray
|
||||
import ray.streaming._streaming as _streaming
|
||||
from ray.streaming.config import Config
|
||||
from ray.function_manager import FunctionDescriptor
|
||||
from ray.streaming.communication import DataInput, DataOutput
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ray.remote
|
||||
class JobWorker(object):
|
||||
"""A streaming job worker.
|
||||
|
||||
Attributes:
|
||||
worker_id: The id of the instance.
|
||||
input_channels: The input gate that manages input channels of
|
||||
the instance (see: DataInput in communication.py).
|
||||
output_channels (DataOutput): The output gate that manages output
|
||||
channels of the instance (see: DataOutput in communication.py).
|
||||
the operator instance.
|
||||
"""
|
||||
|
||||
def __init__(self, worker_id, operator, input_channels, output_channels):
|
||||
self.env = None
|
||||
self.worker_id = worker_id
|
||||
self.operator = operator
|
||||
processor_name = operator.processor_class.__name__
|
||||
processor_instance = operator.processor_class(operator)
|
||||
self.processor_name = processor_name
|
||||
self.processor_instance = processor_instance
|
||||
self.input_channels = input_channels
|
||||
self.output_channels = output_channels
|
||||
self.input_gate = None
|
||||
self.output_gate = None
|
||||
self.reader_client = None
|
||||
self.writer_client = None
|
||||
|
||||
def init(self, env):
|
||||
"""init streaming actor"""
|
||||
env = pickle.loads(env)
|
||||
self.env = env
|
||||
logger.info("init operator instance %s", self.processor_name)
|
||||
|
||||
if env.config.channel_type == Config.NATIVE_CHANNEL:
|
||||
core_worker = ray.worker.global_worker.core_worker
|
||||
reader_async_func = FunctionDescriptor(
|
||||
__name__, self.on_reader_message.__name__,
|
||||
self.__class__.__name__)
|
||||
reader_sync_func = FunctionDescriptor(
|
||||
__name__, self.on_reader_message_sync.__name__,
|
||||
self.__class__.__name__)
|
||||
self.reader_client = _streaming.ReaderClient(
|
||||
core_worker, reader_async_func, reader_sync_func)
|
||||
writer_async_func = FunctionDescriptor(
|
||||
__name__, self.on_writer_message.__name__,
|
||||
self.__class__.__name__)
|
||||
writer_sync_func = FunctionDescriptor(
|
||||
__name__, self.on_writer_message_sync.__name__,
|
||||
self.__class__.__name__)
|
||||
self.writer_client = _streaming.WriterClient(
|
||||
core_worker, writer_async_func, writer_sync_func)
|
||||
if len(self.input_channels) > 0:
|
||||
self.input_gate = DataInput(env, self.input_channels)
|
||||
self.input_gate.init()
|
||||
if len(self.output_channels) > 0:
|
||||
self.output_gate = DataOutput(
|
||||
env, self.output_channels,
|
||||
self.operator.partitioning_strategies)
|
||||
self.output_gate.init()
|
||||
logger.info("init operator instance %s succeed", self.processor_name)
|
||||
return True
|
||||
|
||||
# Starts the actor
|
||||
def start(self):
|
||||
self.t = threading.Thread(target=self.run, daemon=True)
|
||||
self.t.start()
|
||||
actor_id = ray.worker.global_worker.actor_id
|
||||
logger.info("%s %s started, actor id %s", self.__class__.__name__,
|
||||
self.processor_name, actor_id)
|
||||
|
||||
def run(self):
|
||||
logger.info("%s start running", self.processor_name)
|
||||
self.processor_instance.run(self.input_gate, self.output_gate)
|
||||
logger.info("%s finished running", self.processor_name)
|
||||
self.close()
|
||||
|
||||
def close(self):
|
||||
if self.input_gate:
|
||||
self.input_gate.close()
|
||||
if self.output_gate:
|
||||
self.output_gate.close()
|
||||
|
||||
def is_finished(self):
|
||||
return not self.t.is_alive()
|
||||
|
||||
def on_reader_message(self, buffer: bytes):
|
||||
"""used in direct call mode"""
|
||||
self.reader_client.on_reader_message(buffer)
|
||||
|
||||
def on_reader_message_sync(self, buffer: bytes):
|
||||
"""used in direct call mode"""
|
||||
if self.reader_client is None:
|
||||
return b" " * 4 # special flag to indicate this actor not ready
|
||||
result = self.reader_client.on_reader_message_sync(buffer)
|
||||
return result.to_pybytes()
|
||||
|
||||
def on_writer_message(self, buffer: bytes):
|
||||
"""used in direct call mode"""
|
||||
self.writer_client.on_writer_message(buffer)
|
||||
|
||||
def on_writer_message_sync(self, buffer: bytes):
|
||||
"""used in direct call mode"""
|
||||
if self.writer_client is None:
|
||||
return b" " * 4 # special flag to indicate this actor not ready
|
||||
result = self.writer_client.on_writer_message_sync(buffer)
|
||||
return result.to_pybytes()
|
||||
@@ -0,0 +1,113 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import enum
|
||||
import logging
|
||||
|
||||
import cloudpickle
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel("DEBUG")
|
||||
|
||||
|
||||
# Stream partitioning schemes
|
||||
class PScheme(object):
|
||||
def __init__(self, strategy, partition_fn=None):
|
||||
self.strategy = strategy
|
||||
self.partition_fn = partition_fn
|
||||
|
||||
def __repr__(self):
|
||||
return "({},{})".format(self.strategy, self.partition_fn)
|
||||
|
||||
|
||||
# Partitioning strategies
|
||||
class PStrategy(enum.Enum):
|
||||
Forward = 0 # Default
|
||||
Shuffle = 1
|
||||
Rescale = 2
|
||||
RoundRobin = 3
|
||||
Broadcast = 4
|
||||
Custom = 5
|
||||
ShuffleByKey = 6
|
||||
# ...
|
||||
|
||||
|
||||
# Operator types
|
||||
class OpType(enum.Enum):
|
||||
Source = 0
|
||||
Map = 1
|
||||
FlatMap = 2
|
||||
Filter = 3
|
||||
TimeWindow = 4
|
||||
KeyBy = 5
|
||||
Sink = 6
|
||||
WindowJoin = 7
|
||||
Inspect = 8
|
||||
ReadTextFile = 9
|
||||
Reduce = 10
|
||||
Sum = 11
|
||||
# ...
|
||||
|
||||
|
||||
# A logical dataflow operator
|
||||
class Operator(object):
|
||||
def __init__(self,
|
||||
id,
|
||||
op_type,
|
||||
processor_class,
|
||||
name="",
|
||||
logic=None,
|
||||
num_instances=1,
|
||||
other=None,
|
||||
state_actor=None):
|
||||
self.id = id
|
||||
self.type = op_type
|
||||
self.processor_class = processor_class
|
||||
self.name = name
|
||||
self._logic = cloudpickle.dumps(logic) # The operator's logic
|
||||
self.num_instances = num_instances
|
||||
# One partitioning strategy per downstream operator (default: forward)
|
||||
self.partitioning_strategies = {}
|
||||
self.other_args = other # Depends on the type of the operator
|
||||
self.state_actor = state_actor # Actor to query state
|
||||
|
||||
# Sets the partitioning scheme for an output stream of the operator
|
||||
def _set_partition_strategy(self,
|
||||
stream_id,
|
||||
partitioning_scheme,
|
||||
dest_operator=None):
|
||||
self.partitioning_strategies[stream_id] = (partitioning_scheme,
|
||||
dest_operator)
|
||||
|
||||
# Retrieves the partitioning scheme for the given
|
||||
# output stream of the operator
|
||||
# Returns None is no strategy has been defined for the particular stream
|
||||
def _get_partition_strategy(self, stream_id):
|
||||
return self.partitioning_strategies.get(stream_id)
|
||||
|
||||
# Cleans metatada from all partitioning strategies that lack a
|
||||
# destination operator
|
||||
# Valid entries are re-organized as
|
||||
# 'destination operator id -> partitioning scheme'
|
||||
# Should be called only after the logical dataflow has been constructed
|
||||
def _clean(self):
|
||||
strategies = {}
|
||||
for _, v in self.partitioning_strategies.items():
|
||||
strategy, destination_operator = v
|
||||
if destination_operator is not None:
|
||||
strategies.setdefault(destination_operator, strategy)
|
||||
self.partitioning_strategies = strategies
|
||||
|
||||
def print(self):
|
||||
log = "Operator<\nID = {}\nName = {}\nprocessor_class = {}\n"
|
||||
log += "Logic = {}\nNumber_of_Instances = {}\n"
|
||||
log += "Partitioning_Scheme = {}\nOther_Args = {}>\n"
|
||||
logger.debug(
|
||||
log.format(self.id, self.name, self.processor_class, self.logic,
|
||||
self.num_instances, self.partitioning_strategies,
|
||||
self.other_args))
|
||||
|
||||
@property
|
||||
def logic(self):
|
||||
return cloudpickle.loads(self._logic)
|
||||
@@ -0,0 +1,226 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
import types
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel("INFO")
|
||||
|
||||
|
||||
def _identity(element):
|
||||
return element
|
||||
|
||||
|
||||
class ReadTextFile:
|
||||
"""A source operator instance that reads a text file line by line.
|
||||
|
||||
Attributes:
|
||||
filepath (string): The path to the input file.
|
||||
"""
|
||||
|
||||
def __init__(self, operator):
|
||||
self.filepath = operator.other_args
|
||||
# TODO (john): Handle possible exception here
|
||||
self.reader = open(self.filepath, "r")
|
||||
|
||||
# Read input file line by line
|
||||
def run(self, input_gate, output_gate):
|
||||
while True:
|
||||
record = self.reader.readline()
|
||||
# Reader returns empty string ('') on EOF
|
||||
if not record:
|
||||
self.reader.close()
|
||||
return
|
||||
output_gate.push(
|
||||
record[:-1]) # Push after removing newline characters
|
||||
|
||||
|
||||
class Map:
|
||||
"""A map operator instance that applies a user-defined
|
||||
stream transformation.
|
||||
|
||||
A map produces exactly one output record for each record in
|
||||
the input stream.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, operator):
|
||||
self.map_fn = operator.logic
|
||||
|
||||
# Applies the mapper each record of the input stream(s)
|
||||
# and pushes resulting records to the output stream(s)
|
||||
def run(self, input_gate, output_gate):
|
||||
elements = 0
|
||||
while True:
|
||||
record = input_gate.pull()
|
||||
if record is None:
|
||||
return
|
||||
output_gate.push(self.map_fn(record))
|
||||
elements += 1
|
||||
|
||||
|
||||
class FlatMap:
|
||||
"""A map operator instance that applies a user-defined
|
||||
stream transformation.
|
||||
|
||||
A flatmap produces one or more output records for each record in
|
||||
the input stream.
|
||||
|
||||
Attributes:
|
||||
flatmap_fn (function): The user-defined function.
|
||||
"""
|
||||
|
||||
def __init__(self, operator):
|
||||
self.flatmap_fn = operator.logic
|
||||
|
||||
# Applies the splitter to the records of the input stream(s)
|
||||
# and pushes resulting records to the output stream(s)
|
||||
def run(self, input_gate, output_gate):
|
||||
while True:
|
||||
record = input_gate.pull()
|
||||
if record is None:
|
||||
return
|
||||
output_gate.push_all(self.flatmap_fn(record))
|
||||
|
||||
|
||||
class Filter:
|
||||
"""A filter operator instance that applies a user-defined filter to
|
||||
each record of the stream.
|
||||
|
||||
Output records are those that pass the filter, i.e. those for which
|
||||
the filter function returns True.
|
||||
|
||||
Attributes:
|
||||
filter_fn (function): The user-defined boolean function.
|
||||
"""
|
||||
|
||||
def __init__(self, operator):
|
||||
self.filter_fn = operator.logic
|
||||
|
||||
# Applies the filter to the records of the input stream(s)
|
||||
# and pushes resulting records to the output stream(s)
|
||||
def run(self, input_gate, output_gate):
|
||||
while True:
|
||||
record = input_gate.pull()
|
||||
if record is None:
|
||||
return
|
||||
if self.filter_fn(record):
|
||||
output_gate.push(record)
|
||||
|
||||
|
||||
class Inspect:
|
||||
"""A inspect operator instance that inspects the content of the stream.
|
||||
Inspect is useful for printing the records in the stream.
|
||||
"""
|
||||
|
||||
def __init__(self, operator):
|
||||
self.inspect_fn = operator.logic
|
||||
|
||||
def run(self, input_gate, output_gate):
|
||||
# Applies the inspect logic (e.g. print) to the records of
|
||||
# the input stream(s)
|
||||
# and leaves stream unaffected by simply pushing the records to
|
||||
# the output stream(s)
|
||||
while True:
|
||||
record = input_gate.pull()
|
||||
if record is None:
|
||||
return
|
||||
if output_gate:
|
||||
output_gate.push(record)
|
||||
self.inspect_fn(record)
|
||||
|
||||
|
||||
class Reduce:
|
||||
"""A reduce operator instance that combines a new value for a key
|
||||
with the last reduced one according to a user-defined logic.
|
||||
"""
|
||||
|
||||
def __init__(self, operator):
|
||||
self.reduce_fn = operator.logic
|
||||
# Set the attribute selector
|
||||
self.attribute_selector = operator.other_args
|
||||
if self.attribute_selector is None:
|
||||
self.attribute_selector = _identity
|
||||
elif isinstance(self.attribute_selector, int):
|
||||
self.key_index = self.attribute_selector
|
||||
self.attribute_selector =\
|
||||
lambda record: record[self.attribute_selector]
|
||||
elif isinstance(self.attribute_selector, str):
|
||||
self.attribute_selector =\
|
||||
lambda record: vars(record)[self.attribute_selector]
|
||||
elif not isinstance(self.attribute_selector, types.FunctionType):
|
||||
sys.exit("Unrecognized or unsupported key selector.")
|
||||
self.state = {} # key -> value
|
||||
|
||||
# Combines the input value for a key with the last reduced
|
||||
# value for that key to produce a new value.
|
||||
# Outputs the result as (key,new value)
|
||||
def run(self, input_gate, output_gate):
|
||||
while True:
|
||||
record = input_gate.pull()
|
||||
if record is None:
|
||||
return
|
||||
key, rest = record
|
||||
new_value = self.attribute_selector(rest)
|
||||
# TODO (john): Is there a way to update state with
|
||||
# a single dictionary lookup?
|
||||
try:
|
||||
old_value = self.state[key]
|
||||
new_value = self.reduce_fn(old_value, new_value)
|
||||
self.state[key] = new_value
|
||||
except KeyError: # Key does not exist in state
|
||||
self.state.setdefault(key, new_value)
|
||||
output_gate.push((key, new_value))
|
||||
|
||||
# Returns the state of the actor
|
||||
def get_state(self):
|
||||
return self.state
|
||||
|
||||
|
||||
class KeyBy:
|
||||
"""A key_by operator instance that physically partitions the
|
||||
stream based on a key.
|
||||
"""
|
||||
|
||||
def __init__(self, operator):
|
||||
# Set the key selector
|
||||
self.key_selector = operator.other_args
|
||||
if isinstance(self.key_selector, int):
|
||||
self.key_selector = lambda r: r[self.key_selector]
|
||||
elif isinstance(self.key_selector, str):
|
||||
self.key_selector = lambda record: vars(record)[self.key_selector]
|
||||
elif not isinstance(self.key_selector, types.FunctionType):
|
||||
sys.exit("Unrecognized or unsupported key selector.")
|
||||
|
||||
# The actual partitioning is done by the output gate
|
||||
def run(self, input_gate, output_gate):
|
||||
while True:
|
||||
record = input_gate.pull()
|
||||
if record is None:
|
||||
return
|
||||
key = self.key_selector(record)
|
||||
output_gate.push((key, record))
|
||||
|
||||
|
||||
# A custom source actor
|
||||
class Source:
|
||||
def __init__(self, operator):
|
||||
# The user-defined source with a get_next() method
|
||||
self.source = operator.logic
|
||||
|
||||
# Starts the source by calling get_next() repeatedly
|
||||
def run(self, input_gate, output_gate):
|
||||
start = time.time()
|
||||
elements = 0
|
||||
while True:
|
||||
record = self.source.get_next()
|
||||
if not record:
|
||||
logger.debug("[writer] puts per second: {}".format(
|
||||
elements / (time.time() - start)))
|
||||
return
|
||||
output_gate.push(record)
|
||||
elements += 1
|
||||
@@ -0,0 +1,291 @@
|
||||
import logging
|
||||
import random
|
||||
from queue import Queue
|
||||
from typing import List
|
||||
|
||||
import ray
|
||||
import ray.streaming._streaming as _streaming
|
||||
import ray.streaming.generated.streaming_pb2 as streaming_pb
|
||||
from ray.actor import ActorHandle, ActorID
|
||||
from ray.streaming.config import Config
|
||||
|
||||
CHANNEL_ID_LEN = 20
|
||||
|
||||
|
||||
class ChannelID:
|
||||
"""
|
||||
ChannelID is used to identify a transfer channel between
|
||||
a upstream worker and downstream worker.
|
||||
"""
|
||||
|
||||
def __init__(self, channel_id_str: str):
|
||||
"""
|
||||
Args:
|
||||
channel_id_str: string representation of channel id
|
||||
"""
|
||||
self.channel_id_str = channel_id_str
|
||||
self.object_qid = ray.ObjectID(channel_id_str_to_bytes(channel_id_str))
|
||||
|
||||
def __eq__(self, other):
|
||||
if other is None:
|
||||
return False
|
||||
if type(other) is ChannelID:
|
||||
return self.channel_id_str == other.channel_id_str
|
||||
else:
|
||||
return False
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self.channel_id_str)
|
||||
|
||||
def __repr__(self):
|
||||
return self.channel_id_str
|
||||
|
||||
@staticmethod
|
||||
def gen_random_id():
|
||||
"""Generate a random channel id string
|
||||
"""
|
||||
res = ""
|
||||
for i in range(CHANNEL_ID_LEN * 2):
|
||||
res += str(chr(random.randint(0, 5) + ord("A")))
|
||||
return res
|
||||
|
||||
@staticmethod
|
||||
def gen_id(from_index, to_index, ts):
|
||||
"""Generate channel id, which is 20 character"""
|
||||
channel_id = bytearray(20)
|
||||
for i in range(11, 7, -1):
|
||||
channel_id[i] = ts & 0xff
|
||||
ts >>= 8
|
||||
channel_id[16] = (from_index & 0xffff) >> 8
|
||||
channel_id[17] = (from_index & 0xff)
|
||||
channel_id[18] = (to_index & 0xffff) >> 8
|
||||
channel_id[19] = (to_index & 0xff)
|
||||
return channel_bytes_to_str(bytes(channel_id))
|
||||
|
||||
|
||||
def channel_id_str_to_bytes(channel_id_str):
|
||||
"""
|
||||
Args:
|
||||
channel_id_str: string representation of channel id
|
||||
|
||||
Returns:
|
||||
bytes representation of channel id
|
||||
"""
|
||||
assert type(channel_id_str) in [str, bytes]
|
||||
if isinstance(channel_id_str, bytes):
|
||||
return channel_id_str
|
||||
qid_bytes = bytes.fromhex(channel_id_str)
|
||||
assert len(qid_bytes) == CHANNEL_ID_LEN
|
||||
return qid_bytes
|
||||
|
||||
|
||||
def channel_bytes_to_str(id_bytes):
|
||||
"""
|
||||
Args:
|
||||
id_bytes: bytes representation of channel id
|
||||
|
||||
Returns:
|
||||
string representation of channel id
|
||||
"""
|
||||
assert type(id_bytes) in [str, bytes]
|
||||
if isinstance(id_bytes, str):
|
||||
return id_bytes
|
||||
return bytes.hex(id_bytes)
|
||||
|
||||
|
||||
class DataMessage:
|
||||
"""
|
||||
DataMessage represents data between upstream and downstream operator
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
body,
|
||||
timestamp,
|
||||
channel_id,
|
||||
message_id_,
|
||||
is_empty_message=False):
|
||||
self.__body = body
|
||||
self.__timestamp = timestamp
|
||||
self.__channel_id = channel_id
|
||||
self.__message_id = message_id_
|
||||
self.__is_empty_message = is_empty_message
|
||||
|
||||
def __len__(self):
|
||||
return len(self.__body)
|
||||
|
||||
def body(self):
|
||||
"""Message data"""
|
||||
return self.__body
|
||||
|
||||
def timestamp(self):
|
||||
"""Get timestamp when item is written by upstream DataWriter
|
||||
"""
|
||||
return self.__timestamp
|
||||
|
||||
def channel_id(self):
|
||||
"""Get string id of channel where data is coming from
|
||||
"""
|
||||
return self.__channel_id
|
||||
|
||||
def is_empty_message(self):
|
||||
"""Whether this message is an empty message.
|
||||
Upstream DataWriter will send an empty message when this is no data
|
||||
in specified interval.
|
||||
"""
|
||||
return self.__is_empty_message
|
||||
|
||||
@property
|
||||
def message_id(self):
|
||||
return self.__message_id
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DataWriter:
|
||||
"""Data Writer is a wrapper of streaming c++ DataWriter, which sends data
|
||||
to downstream workers
|
||||
"""
|
||||
|
||||
def __init__(self, output_channels, to_actors: List[ActorHandle],
|
||||
conf: dict):
|
||||
"""Get DataWriter of output channels
|
||||
Args:
|
||||
output_channels: output channels ids
|
||||
to_actors: downstream output actors
|
||||
Returns:
|
||||
DataWriter
|
||||
"""
|
||||
assert len(output_channels) > 0
|
||||
py_output_channels = [
|
||||
channel_id_str_to_bytes(qid_str) for qid_str in output_channels
|
||||
]
|
||||
output_actor_ids: List[ActorID] = [
|
||||
handle._ray_actor_id for handle in to_actors
|
||||
]
|
||||
channel_size = conf.get(Config.CHANNEL_SIZE,
|
||||
Config.CHANNEL_SIZE_DEFAULT)
|
||||
py_msg_ids = [0 for _ in range(len(output_channels))]
|
||||
config_bytes = _to_native_conf(conf)
|
||||
is_mock = conf[Config.CHANNEL_TYPE] == Config.MEMORY_CHANNEL
|
||||
self.writer = _streaming.DataWriter.create(
|
||||
py_output_channels, output_actor_ids, channel_size, py_msg_ids,
|
||||
config_bytes, is_mock)
|
||||
|
||||
logger.info("create DataWriter succeed")
|
||||
|
||||
def write(self, channel_id: ChannelID, item: bytes):
|
||||
"""Write data into native channel
|
||||
Args:
|
||||
channel_id: channel id
|
||||
item: bytes data
|
||||
Returns:
|
||||
msg_id
|
||||
"""
|
||||
assert type(item) == bytes
|
||||
msg_id = self.writer.write(channel_id.object_qid, item)
|
||||
return msg_id
|
||||
|
||||
def stop(self):
|
||||
logger.info("stopping channel writer.")
|
||||
self.writer.stop()
|
||||
# destruct DataWriter
|
||||
self.writer = None
|
||||
|
||||
def close(self):
|
||||
logger.info("closing channel writer.")
|
||||
|
||||
|
||||
class DataReader:
|
||||
"""Data Reader is wrapper of streaming c++ DataReader, which read data
|
||||
from channels of upstream workers
|
||||
"""
|
||||
|
||||
def __init__(self, input_channels: List, from_actors: List[ActorHandle],
|
||||
conf: dict):
|
||||
"""Get DataReader of input channels
|
||||
Args:
|
||||
input_channels: input channels
|
||||
from_actors: upstream input actors
|
||||
Returns:
|
||||
DataReader
|
||||
"""
|
||||
assert len(input_channels) > 0
|
||||
py_input_channels = [
|
||||
channel_id_str_to_bytes(qid_str) for qid_str in input_channels
|
||||
]
|
||||
input_actor_ids: List[ActorID] = [
|
||||
handle._ray_actor_id for handle in from_actors
|
||||
]
|
||||
py_seq_ids = [0 for _ in range(len(input_channels))]
|
||||
py_msg_ids = [0 for _ in range(len(input_channels))]
|
||||
timer_interval = int(conf.get(Config.TIMER_INTERVAL_MS, -1))
|
||||
is_recreate = bool(conf.get(Config.IS_RECREATE, False))
|
||||
config_bytes = _to_native_conf(conf)
|
||||
self.__queue = Queue(10000)
|
||||
is_mock = conf[Config.CHANNEL_TYPE] == Config.MEMORY_CHANNEL
|
||||
self.reader = _streaming.DataReader.create(
|
||||
py_input_channels, input_actor_ids, py_seq_ids, py_msg_ids,
|
||||
timer_interval, is_recreate, config_bytes, is_mock)
|
||||
logger.info("create DataReader succeed")
|
||||
|
||||
def read(self, timeout_millis):
|
||||
"""Read data from channel
|
||||
Args:
|
||||
timeout_millis: timeout millis when there is no data in channel
|
||||
for this duration
|
||||
Returns:
|
||||
channel item
|
||||
"""
|
||||
if self.__queue.empty():
|
||||
msgs = self.reader.read(timeout_millis)
|
||||
for msg in msgs:
|
||||
msg_bytes, msg_id, timestamp, qid_bytes = msg
|
||||
data_msg = DataMessage(msg_bytes, timestamp,
|
||||
channel_bytes_to_str(qid_bytes), msg_id)
|
||||
self.__queue.put(data_msg)
|
||||
if self.__queue.empty():
|
||||
return None
|
||||
return self.__queue.get()
|
||||
|
||||
def stop(self):
|
||||
logger.info("stopping Data Reader.")
|
||||
self.reader.stop()
|
||||
# destruct DataReader
|
||||
self.reader = None
|
||||
|
||||
def close(self):
|
||||
logger.info("closing Data Reader.")
|
||||
|
||||
|
||||
def _to_native_conf(conf):
|
||||
config = streaming_pb.StreamingConfig()
|
||||
if Config.STREAMING_JOB_NAME in conf:
|
||||
config.job_name = conf[Config.STREAMING_JOB_NAME]
|
||||
if Config.TASK_JOB_ID in conf:
|
||||
job_id = conf[Config.TASK_JOB_ID]
|
||||
config.task_job_id = job_id.hex()
|
||||
if Config.STREAMING_WORKER_NAME in conf:
|
||||
config.worker_name = conf[Config.STREAMING_WORKER_NAME]
|
||||
if Config.STREAMING_OP_NAME in conf:
|
||||
config.op_name = conf[Config.STREAMING_OP_NAME]
|
||||
# TODO set operator type
|
||||
if Config.STREAMING_RING_BUFFER_CAPACITY in conf:
|
||||
config.ring_buffer_capacity = \
|
||||
conf[Config.STREAMING_RING_BUFFER_CAPACITY]
|
||||
if Config.STREAMING_EMPTY_MESSAGE_INTERVAL in conf:
|
||||
config.empty_message_interval = \
|
||||
conf[Config.STREAMING_EMPTY_MESSAGE_INTERVAL]
|
||||
logger.info("conf: %s", str(config))
|
||||
return config.SerializeToString()
|
||||
|
||||
|
||||
class ChannelInitException(Exception):
|
||||
def __init__(self, msg, abnormal_channels):
|
||||
self.abnormal_channels = abnormal_channels
|
||||
self.msg = msg
|
||||
|
||||
|
||||
class ChannelInterruptException(Exception):
|
||||
def __init__(self, msg=None):
|
||||
self.msg = msg
|
||||
@@ -0,0 +1,693 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
import pickle
|
||||
import sys
|
||||
import time
|
||||
|
||||
import networkx as nx
|
||||
import ray
|
||||
import ray.streaming.processor as processor
|
||||
import ray.streaming.runtime.transfer as transfer
|
||||
from ray.streaming.communication import DataChannel
|
||||
from ray.streaming.config import Config
|
||||
from ray.streaming.jobworker import JobWorker
|
||||
from ray.streaming.operator import Operator, OpType
|
||||
from ray.streaming.operator import PScheme, PStrategy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel("INFO")
|
||||
|
||||
|
||||
# Rolling sum's logic
|
||||
def _sum(value_1, value_2):
|
||||
return value_1 + value_2
|
||||
|
||||
|
||||
# Partitioning strategies that require all-to-all instance communication
|
||||
all_to_all_strategies = [
|
||||
PStrategy.Shuffle, PStrategy.ShuffleByKey, PStrategy.Broadcast,
|
||||
PStrategy.RoundRobin
|
||||
]
|
||||
|
||||
|
||||
# Environment configuration
|
||||
class Conf(object):
|
||||
"""Environment configuration.
|
||||
|
||||
This class includes all information about the configuration of the
|
||||
streaming environment.
|
||||
"""
|
||||
|
||||
def __init__(self, parallelism=1, channel_type=Config.MEMORY_CHANNEL):
|
||||
self.parallelism = parallelism
|
||||
self.channel_type = channel_type
|
||||
# ...
|
||||
|
||||
|
||||
class ExecutionGraph:
|
||||
def __init__(self, env):
|
||||
self.env = env
|
||||
self.physical_topo = nx.DiGraph() # DAG
|
||||
# Handles to all actors in the physical dataflow
|
||||
self.actor_handles = []
|
||||
# (op_id, op_instance_index) -> ActorID
|
||||
self.actors_map = {}
|
||||
# execution graph build time: milliseconds since epoch
|
||||
self.build_time = 0
|
||||
self.task_id_counter = 0
|
||||
self.task_ids = {}
|
||||
self.input_channels = {} # operator id -> input channels
|
||||
self.output_channels = {} # operator id -> output channels
|
||||
|
||||
# Constructs and deploys a Ray actor of a specific type
|
||||
# TODO (john): Actor placement information should be specified in
|
||||
# the environment's configuration
|
||||
def __generate_actor(self, instance_index, operator, input_channels,
|
||||
output_channels):
|
||||
"""Generates an actor that will execute a particular instance of
|
||||
the logical operator
|
||||
|
||||
Attributes:
|
||||
instance_index: The index of the instance the actor will execute.
|
||||
operator: The metadata of the logical operator.
|
||||
input_channels: The input channels of the instance.
|
||||
output_channels The output channels of the instance.
|
||||
"""
|
||||
worker_id = (operator.id, instance_index)
|
||||
# Record the physical dataflow graph (for debugging purposes)
|
||||
self.__add_channel(worker_id, output_channels)
|
||||
# Note direct_call only support pass by value
|
||||
return JobWorker._remote(
|
||||
args=[worker_id, operator, input_channels, output_channels],
|
||||
is_direct_call=True)
|
||||
|
||||
# Constructs and deploys a Ray actor for each instance of
|
||||
# the given operator
|
||||
def __generate_actors(self, operator, upstream_channels,
|
||||
downstream_channels):
|
||||
"""Generates one actor for each instance of the given logical
|
||||
operator.
|
||||
|
||||
Attributes:
|
||||
operator (Operator): The logical operator metadata.
|
||||
upstream_channels (list): A list of all upstream channels for
|
||||
all instances of the operator.
|
||||
downstream_channels (list): A list of all downstream channels
|
||||
for all instances of the operator.
|
||||
"""
|
||||
num_instances = operator.num_instances
|
||||
logger.info("Generating {} actors of type {}...".format(
|
||||
num_instances, operator.type))
|
||||
handles = []
|
||||
for i in range(num_instances):
|
||||
# Collect input and output channels for the particular instance
|
||||
ip = [c for c in upstream_channels if c.dst_instance_index == i]
|
||||
op = [c for c in downstream_channels if c.src_instance_index == i]
|
||||
log = "Constructed {} input and {} output channels "
|
||||
log += "for the {}-th instance of the {} operator."
|
||||
logger.debug(log.format(len(ip), len(op), i, operator.type))
|
||||
handle = self.__generate_actor(i, operator, ip, op)
|
||||
if handle:
|
||||
handles.append(handle)
|
||||
self.actors_map[(operator.id, i)] = handle
|
||||
return handles
|
||||
|
||||
# Adds a channel/edge to the physical dataflow graph
|
||||
def __add_channel(self, actor_id, output_channels):
|
||||
for c in output_channels:
|
||||
dest_actor_id = (c.dst_operator_id, c.dst_instance_index)
|
||||
self.physical_topo.add_edge(actor_id, dest_actor_id)
|
||||
|
||||
# Generates all required data channels between an operator
|
||||
# and its downstream operators
|
||||
def _generate_channels(self, operator):
|
||||
"""Generates all output data channels
|
||||
(see: DataChannel in communication.py) for all instances of
|
||||
the given logical operator.
|
||||
|
||||
The function constructs one data channel for each pair of
|
||||
communicating operator instances (instance_1,instance_2),
|
||||
where instance_1 is an instance of the given operator and instance_2
|
||||
is an instance of a direct downstream operator.
|
||||
|
||||
The number of total channels generated depends on the partitioning
|
||||
strategy specified by the user.
|
||||
"""
|
||||
channels = {} # destination operator id -> channels
|
||||
strategies = operator.partitioning_strategies
|
||||
for dst_operator, p_scheme in strategies.items():
|
||||
num_dest_instances = self.env.operators[dst_operator].num_instances
|
||||
entry = channels.setdefault(dst_operator, [])
|
||||
if p_scheme.strategy == PStrategy.Forward:
|
||||
for i in range(operator.num_instances):
|
||||
# ID of destination instance to connect
|
||||
id = i % num_dest_instances
|
||||
qid = self._gen_str_qid(operator.id, i, dst_operator, id)
|
||||
c = DataChannel(operator.id, i, dst_operator, id, qid)
|
||||
entry.append(c)
|
||||
elif p_scheme.strategy in all_to_all_strategies:
|
||||
for i in range(operator.num_instances):
|
||||
for j in range(num_dest_instances):
|
||||
qid = self._gen_str_qid(operator.id, i, dst_operator,
|
||||
j)
|
||||
c = DataChannel(operator.id, i, dst_operator, j, qid)
|
||||
entry.append(c)
|
||||
else:
|
||||
# TODO (john): Add support for other partitioning strategies
|
||||
sys.exit("Unrecognized or unsupported partitioning strategy.")
|
||||
return channels
|
||||
|
||||
def _gen_str_qid(self, src_operator_id, src_instance_index,
|
||||
dst_operator_id, dst_instance_index):
|
||||
from_task_id = self.env.execution_graph.get_task_id(
|
||||
src_operator_id, src_instance_index)
|
||||
to_task_id = self.env.execution_graph.get_task_id(
|
||||
dst_operator_id, dst_instance_index)
|
||||
return transfer.ChannelID.gen_id(from_task_id, to_task_id,
|
||||
self.build_time)
|
||||
|
||||
def _gen_task_id(self):
|
||||
task_id = self.task_id_counter
|
||||
self.task_id_counter += 1
|
||||
return task_id
|
||||
|
||||
def get_task_id(self, op_id, op_instance_id):
|
||||
return self.task_ids[(op_id, op_instance_id)]
|
||||
|
||||
def get_actor(self, op_id, op_instance_id):
|
||||
return self.actors_map[(op_id, op_instance_id)]
|
||||
|
||||
# Prints the physical dataflow graph
|
||||
def print_physical_graph(self):
|
||||
logger.info("===================================")
|
||||
logger.info("======Physical Dataflow Graph======")
|
||||
logger.info("===================================")
|
||||
# Print all data channels between operator instances
|
||||
log = "(Source Operator ID,Source Operator Name,Source Instance ID)"
|
||||
log += " --> "
|
||||
log += "(Destination Operator ID,Destination Operator Name,"
|
||||
log += "Destination Instance ID)"
|
||||
logger.info(log)
|
||||
for src_actor_id, dst_actor_id in self.physical_topo.edges:
|
||||
src_operator_id, src_instance_index = src_actor_id
|
||||
dst_operator_id, dst_instance_index = dst_actor_id
|
||||
logger.info("({},{},{}) --> ({},{},{})".format(
|
||||
src_operator_id, self.env.operators[src_operator_id].name,
|
||||
src_instance_index, dst_operator_id,
|
||||
self.env.operators[dst_operator_id].name, dst_instance_index))
|
||||
|
||||
def build_graph(self):
|
||||
self.build_channels()
|
||||
|
||||
# to support cyclic reference serialization
|
||||
try:
|
||||
ray.register_custom_serializer(Environment, use_pickle=True)
|
||||
ray.register_custom_serializer(ExecutionGraph, use_pickle=True)
|
||||
ray.register_custom_serializer(OpType, use_pickle=True)
|
||||
ray.register_custom_serializer(PStrategy, use_pickle=True)
|
||||
except Exception:
|
||||
# local mode can't use pickle
|
||||
pass
|
||||
|
||||
# Each operator instance is implemented as a Ray actor
|
||||
# Actors are deployed in topological order, as we traverse the
|
||||
# logical dataflow from sources to sinks.
|
||||
for node in nx.topological_sort(self.env.logical_topo):
|
||||
operator = self.env.operators[node]
|
||||
# Instantiate Ray actors
|
||||
handles = self.__generate_actors(
|
||||
operator, self.input_channels.get(node, []),
|
||||
self.output_channels.get(node, []))
|
||||
if handles:
|
||||
self.actor_handles.extend(handles)
|
||||
|
||||
def build_channels(self):
|
||||
self.build_time = int(time.time() * 1000)
|
||||
# gen auto-incremented unique task id for every operator instance
|
||||
for node in nx.topological_sort(self.env.logical_topo):
|
||||
operator = self.env.operators[node]
|
||||
for i in range(operator.num_instances):
|
||||
operator_instance_id = (operator.id, i)
|
||||
self.task_ids[operator_instance_id] = self._gen_task_id()
|
||||
channels = {}
|
||||
for node in nx.topological_sort(self.env.logical_topo):
|
||||
operator = self.env.operators[node]
|
||||
# Generate downstream data channels
|
||||
downstream_channels = self._generate_channels(operator)
|
||||
channels[node] = downstream_channels
|
||||
# op_id -> channels
|
||||
input_channels = {}
|
||||
output_channels = {}
|
||||
for op_id, all_downstream_channels in channels.items():
|
||||
for dst_op_channels in all_downstream_channels.values():
|
||||
for c in dst_op_channels:
|
||||
dst = input_channels.setdefault(c.dst_operator_id, [])
|
||||
dst.append(c)
|
||||
src = output_channels.setdefault(c.src_operator_id, [])
|
||||
src.append(c)
|
||||
self.input_channels = input_channels
|
||||
self.output_channels = output_channels
|
||||
|
||||
|
||||
# The execution environment for a streaming job
|
||||
class Environment(object):
|
||||
"""A streaming environment.
|
||||
|
||||
This class is responsible for constructing the logical and the
|
||||
physical dataflow.
|
||||
|
||||
Attributes:
|
||||
logical_topo (DiGraph): The user-defined logical topology in
|
||||
NetworkX DiGRaph format.
|
||||
(See: https://networkx.github.io)
|
||||
physical_topo (DiGraph): The physical topology in NetworkX
|
||||
DiGRaph format. The physical dataflow is constructed by the
|
||||
environment based on logical_topo.
|
||||
operators (dict): A mapping from operator ids to operator metadata
|
||||
(See: Operator in operator.py).
|
||||
config (Config): The environment's configuration.
|
||||
topo_cleaned (bool): A flag that indicates whether the logical
|
||||
topology is garbage collected (True) or not (False).
|
||||
actor_handles (list): A list of all Ray actor handles that execute
|
||||
the streaming dataflow.
|
||||
"""
|
||||
|
||||
def __init__(self, config=Conf()):
|
||||
self.logical_topo = nx.DiGraph() # DAG
|
||||
self.operators = {} # operator id --> operator object
|
||||
self.config = config # Environment's configuration
|
||||
self.topo_cleaned = False
|
||||
self.operator_id_counter = 0
|
||||
self.execution_graph = None # set when executed
|
||||
|
||||
def gen_operator_id(self):
|
||||
op_id = self.operator_id_counter
|
||||
self.operator_id_counter += 1
|
||||
return op_id
|
||||
|
||||
# An edge denotes a flow of data between logical operators
|
||||
# and may correspond to multiple data channels in the physical dataflow
|
||||
def _add_edge(self, source, destination):
|
||||
self.logical_topo.add_edge(source, destination)
|
||||
|
||||
# Cleans the logical dataflow graph to construct and
|
||||
# deploy the physical dataflow
|
||||
def _collect_garbage(self):
|
||||
if self.topo_cleaned is True:
|
||||
return
|
||||
for node in self.logical_topo:
|
||||
self.operators[node]._clean()
|
||||
self.topo_cleaned = True
|
||||
|
||||
# Sets the level of parallelism for a registered operator
|
||||
# Overwrites the environment parallelism (if set)
|
||||
def _set_parallelism(self, operator_id, level_of_parallelism):
|
||||
self.operators[operator_id].num_instances = level_of_parallelism
|
||||
|
||||
# Sets the same level of parallelism for all operators in the environment
|
||||
def set_parallelism(self, parallelism):
|
||||
self.config.parallelism = parallelism
|
||||
|
||||
# Creates and registers a user-defined data source
|
||||
# TODO (john): There should be different types of sources, e.g. sources
|
||||
# reading from Kafka, text files, etc.
|
||||
# TODO (john): Handle case where environment parallelism is set
|
||||
def source(self, source):
|
||||
source_id = self.gen_operator_id()
|
||||
source_stream = DataStream(self, source_id)
|
||||
self.operators[source_id] = Operator(
|
||||
source_id, OpType.Source, processor.Source, "Source", logic=source)
|
||||
return source_stream
|
||||
|
||||
# Creates and registers a new data source that reads a
|
||||
# text file line by line
|
||||
# TODO (john): There should be different types of sources,
|
||||
# e.g. sources reading from Kafka, text files, etc.
|
||||
# TODO (john): Handle case where environment parallelism is set
|
||||
def read_text_file(self, filepath):
|
||||
source_id = self.gen_operator_id()
|
||||
source_stream = DataStream(self, source_id)
|
||||
self.operators[source_id] = Operator(
|
||||
source_id,
|
||||
OpType.ReadTextFile,
|
||||
processor.ReadTextFile,
|
||||
"Read Text File",
|
||||
other=filepath)
|
||||
return source_stream
|
||||
|
||||
# Constructs and deploys the physical dataflow
|
||||
def execute(self):
|
||||
"""Deploys and executes the physical dataflow."""
|
||||
self._collect_garbage() # Make sure everything is clean
|
||||
# TODO (john): Check if dataflow has any 'logical inconsistencies'
|
||||
# For example, if there is a forward partitioning strategy but
|
||||
# the number of downstream instances is larger than the number of
|
||||
# upstream instances, some of the downstream instances will not be
|
||||
# used at all
|
||||
|
||||
self.execution_graph = ExecutionGraph(self)
|
||||
self.execution_graph.build_graph()
|
||||
logger.info("init...")
|
||||
# init
|
||||
init_waits = []
|
||||
for actor_handle in self.execution_graph.actor_handles:
|
||||
init_waits.append(actor_handle.init.remote(pickle.dumps(self)))
|
||||
for wait in init_waits:
|
||||
assert ray.get(wait) is True
|
||||
logger.info("running...")
|
||||
# start
|
||||
exec_handles = []
|
||||
for actor_handle in self.execution_graph.actor_handles:
|
||||
exec_handles.append(actor_handle.start.remote())
|
||||
|
||||
return exec_handles
|
||||
|
||||
def wait_finish(self):
|
||||
for actor_handle in self.execution_graph.actor_handles:
|
||||
while not ray.get(actor_handle.is_finished.remote()):
|
||||
time.sleep(1)
|
||||
|
||||
# Prints the logical dataflow graph
|
||||
def print_logical_graph(self):
|
||||
self._collect_garbage()
|
||||
logger.info("==================================")
|
||||
logger.info("======Logical Dataflow Graph======")
|
||||
logger.info("==================================")
|
||||
# Print operators in topological order
|
||||
for node in nx.topological_sort(self.logical_topo):
|
||||
downstream_neighbors = list(self.logical_topo.neighbors(node))
|
||||
logger.info("======Current Operator======")
|
||||
operator = self.operators[node]
|
||||
operator.print()
|
||||
logger.info("======Downstream Operators======")
|
||||
if len(downstream_neighbors) == 0:
|
||||
logger.info("None\n")
|
||||
for downstream_node in downstream_neighbors:
|
||||
self.operators[downstream_node].print()
|
||||
|
||||
|
||||
# TODO (john): We also need KeyedDataStream and WindowedDataStream as
|
||||
# subclasses of DataStream to prevent ill-defined logical dataflows
|
||||
|
||||
|
||||
# A DataStream corresponds to an edge in the logical dataflow
|
||||
class DataStream(object):
|
||||
"""A data stream.
|
||||
|
||||
This class contains all information about a logical stream, i.e. an edge
|
||||
in the logical topology. It is the main class exposed to the user.
|
||||
|
||||
Attributes:
|
||||
id (UUID): The id of the stream
|
||||
env (Environment): The environment the stream belongs to.
|
||||
src_operator_id (UUID): The id of the source operator of the stream.
|
||||
dst_operator_id (UUID): The id of the destination operator of the
|
||||
stream.
|
||||
is_partitioned (bool): Denotes if there is a partitioning strategy
|
||||
(e.g. shuffle) for the stream or not (default stategy: Forward).
|
||||
"""
|
||||
stream_id_counter = 0
|
||||
|
||||
def __init__(self,
|
||||
environment,
|
||||
source_id=None,
|
||||
dest_id=None,
|
||||
is_partitioned=False):
|
||||
self.env = environment
|
||||
self.id = DataStream.stream_id_counter
|
||||
DataStream.stream_id_counter += 1
|
||||
self.src_operator_id = source_id
|
||||
self.dst_operator_id = dest_id
|
||||
# True if a partitioning strategy for this stream exists,
|
||||
# false otherwise
|
||||
self.is_partitioned = is_partitioned
|
||||
|
||||
# Generates a new stream after a data transformation is applied
|
||||
def __expand(self):
|
||||
stream = DataStream(self.env)
|
||||
assert (self.dst_operator_id is not None)
|
||||
stream.src_operator_id = self.dst_operator_id
|
||||
stream.dst_operator_id = None
|
||||
return stream
|
||||
|
||||
# Assigns the partitioning strategy to a new 'open-ended' stream
|
||||
# and returns the stream. At this point, the partitioning strategy
|
||||
# is not associated with any destination operator. We expect this to
|
||||
# be done later, as we continue assembling the dataflow graph
|
||||
def __partition(self, strategy, partition_fn=None):
|
||||
scheme = PScheme(strategy, partition_fn)
|
||||
source_operator = self.env.operators[self.src_operator_id]
|
||||
new_stream = DataStream(
|
||||
self.env, source_id=source_operator.id, is_partitioned=True)
|
||||
source_operator._set_partition_strategy(new_stream.id, scheme)
|
||||
return new_stream
|
||||
|
||||
# Registers the operator to the environment and returns a new
|
||||
# 'open-ended' stream. The registered operator serves as the destination
|
||||
# of the previously 'open' stream
|
||||
def __register(self, operator):
|
||||
"""Registers the given logical operator to the environment and
|
||||
connects it to its upstream operator (if any).
|
||||
|
||||
A call to this function adds a new edge to the logical topology.
|
||||
|
||||
Attributes:
|
||||
operator (Operator): The metadata of the logical operator.
|
||||
"""
|
||||
self.env.operators[operator.id] = operator
|
||||
self.dst_operator_id = operator.id
|
||||
logger.debug("Adding new dataflow edge ({},{}) --> ({},{})".format(
|
||||
self.src_operator_id,
|
||||
self.env.operators[self.src_operator_id].name,
|
||||
self.dst_operator_id,
|
||||
self.env.operators[self.dst_operator_id].name))
|
||||
# Update logical dataflow graphs
|
||||
self.env._add_edge(self.src_operator_id, self.dst_operator_id)
|
||||
# Keep track of the partitioning strategy and the destination operator
|
||||
src_operator = self.env.operators[self.src_operator_id]
|
||||
if self.is_partitioned is True:
|
||||
partitioning, _ = src_operator._get_partition_strategy(self.id)
|
||||
src_operator._set_partition_strategy(self.id, partitioning,
|
||||
operator.id)
|
||||
elif src_operator.type == OpType.KeyBy:
|
||||
# Set the output partitioning strategy to shuffle by key
|
||||
partitioning = PScheme(PStrategy.ShuffleByKey)
|
||||
src_operator._set_partition_strategy(self.id, partitioning,
|
||||
operator.id)
|
||||
else: # No partitioning strategy has been defined - set default
|
||||
partitioning = PScheme(PStrategy.Forward)
|
||||
src_operator._set_partition_strategy(self.id, partitioning,
|
||||
operator.id)
|
||||
return self.__expand()
|
||||
|
||||
# Sets the level of parallelism for an operator, i.e. its total
|
||||
# number of instances. Each operator instance corresponds to an actor
|
||||
# in the physical dataflow
|
||||
def set_parallelism(self, num_instances):
|
||||
"""Sets the number of instances for the source operator of the stream.
|
||||
|
||||
Attributes:
|
||||
num_instances (int): The level of parallelism for the source
|
||||
operator of the stream.
|
||||
"""
|
||||
assert (num_instances > 0)
|
||||
self.env._set_parallelism(self.src_operator_id, num_instances)
|
||||
return self
|
||||
|
||||
# Stream Partitioning Strategies #
|
||||
# TODO (john): Currently, only forward (default), shuffle,
|
||||
# and broadcast are supported
|
||||
|
||||
# Hash-based record shuffling
|
||||
def shuffle(self):
|
||||
"""Registers a shuffling partitioning strategy for the stream."""
|
||||
return self.__partition(PStrategy.Shuffle)
|
||||
|
||||
# Broadcasts each record to all downstream instances
|
||||
def broadcast(self):
|
||||
"""Registers a broadcast partitioning strategy for the stream."""
|
||||
return self.__partition(PStrategy.Broadcast)
|
||||
|
||||
# Rescales load to downstream instances
|
||||
def rescale(self):
|
||||
"""Registers a rescale partitioning strategy for the stream.
|
||||
|
||||
Same as Flink's rescale (see: https://ci.apache.org/projects/flink/
|
||||
flink-docs-stable/dev/stream/operators/#physical-partitioning).
|
||||
"""
|
||||
return self.__partition(PStrategy.Rescale)
|
||||
|
||||
# Round-robin partitioning
|
||||
def round_robin(self):
|
||||
"""Registers a round-robin partitioning strategy for the stream."""
|
||||
return self.__partition(PStrategy.RoundRobin)
|
||||
|
||||
# User-defined partitioning
|
||||
def partition(self, partition_fn):
|
||||
"""Registers a user-defined partitioning strategy for the stream.
|
||||
|
||||
Attributes:
|
||||
partition_fn (function): The user-defined partitioning function.
|
||||
"""
|
||||
return self.__partition(PStrategy.Custom, partition_fn)
|
||||
|
||||
# Data Trasnformations #
|
||||
# TODO (john): Expand set of supported operators.
|
||||
# TODO (john): To support event-time windows we need a mechanism for
|
||||
# generating and processing watermarks
|
||||
|
||||
# Registers map operator to the environment
|
||||
def map(self, map_fn, name="Map"):
|
||||
"""Applies a map operator to the stream.
|
||||
|
||||
Attributes:
|
||||
map_fn (function): The user-defined logic of the map.
|
||||
"""
|
||||
op = Operator(
|
||||
self.env.gen_operator_id(),
|
||||
OpType.Map,
|
||||
processor.Map,
|
||||
name,
|
||||
map_fn,
|
||||
num_instances=self.env.config.parallelism)
|
||||
return self.__register(op)
|
||||
|
||||
# Registers flatmap operator to the environment
|
||||
def flat_map(self, flatmap_fn):
|
||||
"""Applies a flatmap operator to the stream.
|
||||
|
||||
Attributes:
|
||||
flatmap_fn (function): The user-defined logic of the flatmap
|
||||
(e.g. split()).
|
||||
"""
|
||||
op = Operator(
|
||||
self.env.gen_operator_id(),
|
||||
OpType.FlatMap,
|
||||
processor.FlatMap,
|
||||
"FlatMap",
|
||||
flatmap_fn,
|
||||
num_instances=self.env.config.parallelism)
|
||||
return self.__register(op)
|
||||
|
||||
# Registers keyBy operator to the environment
|
||||
# TODO (john): This should returned a KeyedDataStream
|
||||
def key_by(self, key_selector):
|
||||
"""Applies a key_by operator to the stream.
|
||||
|
||||
Attributes:
|
||||
key_attribute_index (int): The index of the key attributed
|
||||
(assuming tuple records).
|
||||
"""
|
||||
op = Operator(
|
||||
self.env.gen_operator_id(),
|
||||
OpType.KeyBy,
|
||||
processor.KeyBy,
|
||||
"KeyBy",
|
||||
other=key_selector,
|
||||
num_instances=self.env.config.parallelism)
|
||||
return self.__register(op)
|
||||
|
||||
# Registers Reduce operator to the environment
|
||||
def reduce(self, reduce_fn):
|
||||
"""Applies a rolling sum operator to the stream.
|
||||
|
||||
Attributes:
|
||||
sum_attribute_index (int): The index of the attribute to sum
|
||||
(assuming tuple records).
|
||||
"""
|
||||
op = Operator(
|
||||
self.env.gen_operator_id(),
|
||||
OpType.Reduce,
|
||||
processor.Reduce,
|
||||
"Sum",
|
||||
reduce_fn,
|
||||
num_instances=self.env.config.parallelism)
|
||||
return self.__register(op)
|
||||
|
||||
# Registers Sum operator to the environment
|
||||
def sum(self, attribute_selector, state_keeper=None):
|
||||
"""Applies a rolling sum operator to the stream.
|
||||
|
||||
Attributes:
|
||||
sum_attribute_index (int): The index of the attribute to sum
|
||||
(assuming tuple records).
|
||||
"""
|
||||
op = Operator(
|
||||
self.env.gen_operator_id(),
|
||||
OpType.Sum,
|
||||
processor.Reduce,
|
||||
"Sum",
|
||||
_sum,
|
||||
other=attribute_selector,
|
||||
state_actor=state_keeper,
|
||||
num_instances=self.env.config.parallelism)
|
||||
return self.__register(op)
|
||||
|
||||
# Registers window operator to the environment.
|
||||
# This is a system time window
|
||||
# TODO (john): This should return a WindowedDataStream
|
||||
def time_window(self, window_width_ms):
|
||||
"""Applies a system time window to the stream.
|
||||
|
||||
Attributes:
|
||||
window_width_ms (int): The length of the window in ms.
|
||||
"""
|
||||
raise Exception("time_window is unsupported")
|
||||
|
||||
# Registers filter operator to the environment
|
||||
def filter(self, filter_fn):
|
||||
"""Applies a filter to the stream.
|
||||
|
||||
Attributes:
|
||||
filter_fn (function): The user-defined filter function.
|
||||
"""
|
||||
op = Operator(
|
||||
self.env.gen_operator_id(),
|
||||
OpType.Filter,
|
||||
processor.Filter,
|
||||
"Filter",
|
||||
filter_fn,
|
||||
num_instances=self.env.config.parallelism)
|
||||
return self.__register(op)
|
||||
|
||||
# TODO (john): Registers window join operator to the environment
|
||||
def window_join(self, other_stream, join_attribute, window_width):
|
||||
op = Operator(
|
||||
self.env.gen_operator_id(),
|
||||
OpType.WindowJoin,
|
||||
processor.WindowJoin,
|
||||
"WindowJoin",
|
||||
num_instances=self.env.config.parallelism)
|
||||
return self.__register(op)
|
||||
|
||||
# Registers inspect operator to the environment
|
||||
def inspect(self, inspect_logic):
|
||||
"""Inspects the content of the stream.
|
||||
|
||||
Attributes:
|
||||
inspect_logic (function): The user-defined inspect function.
|
||||
"""
|
||||
op = Operator(
|
||||
self.env.gen_operator_id(),
|
||||
OpType.Inspect,
|
||||
processor.Inspect,
|
||||
"Inspect",
|
||||
inspect_logic,
|
||||
num_instances=self.env.config.parallelism)
|
||||
return self.__register(op)
|
||||
|
||||
# Registers sink operator to the environment
|
||||
# TODO (john): A sink now just drops records but it should be able to
|
||||
# export data to other systems
|
||||
def sink(self):
|
||||
"""Closes the stream with a sink operator."""
|
||||
op = Operator(
|
||||
self.env.gen_operator_id(),
|
||||
OpType.Sink,
|
||||
processor.Sink,
|
||||
"Sink",
|
||||
num_instances=self.env.config.parallelism)
|
||||
return self.__register(op)
|
||||
@@ -0,0 +1,127 @@
|
||||
import pickle
|
||||
import threading
|
||||
import time
|
||||
|
||||
import ray
|
||||
import ray.streaming._streaming as _streaming
|
||||
import ray.streaming.runtime.transfer as transfer
|
||||
from ray.function_manager import FunctionDescriptor
|
||||
from ray.streaming.config import Config
|
||||
|
||||
|
||||
@ray.remote
|
||||
class Worker:
|
||||
def __init__(self):
|
||||
core_worker = ray.worker.global_worker.core_worker
|
||||
writer_async_func = FunctionDescriptor(
|
||||
__name__, self.on_writer_message.__name__, self.__class__.__name__)
|
||||
writer_sync_func = FunctionDescriptor(
|
||||
__name__, self.on_writer_message_sync.__name__,
|
||||
self.__class__.__name__)
|
||||
self.writer_client = _streaming.WriterClient(
|
||||
core_worker, writer_async_func, writer_sync_func)
|
||||
reader_async_func = FunctionDescriptor(
|
||||
__name__, self.on_reader_message.__name__, self.__class__.__name__)
|
||||
reader_sync_func = FunctionDescriptor(
|
||||
__name__, self.on_reader_message_sync.__name__,
|
||||
self.__class__.__name__)
|
||||
self.reader_client = _streaming.ReaderClient(
|
||||
core_worker, reader_async_func, reader_sync_func)
|
||||
self.writer = None
|
||||
self.output_channel_id = None
|
||||
self.reader = None
|
||||
|
||||
def init_writer(self, output_channel, reader_actor):
|
||||
conf = {
|
||||
Config.TASK_JOB_ID: ray.runtime_context._get_runtime_context()
|
||||
.current_driver_id,
|
||||
Config.CHANNEL_TYPE: Config.NATIVE_CHANNEL
|
||||
}
|
||||
self.writer = transfer.DataWriter([output_channel],
|
||||
[pickle.loads(reader_actor)], conf)
|
||||
self.output_channel_id = transfer.ChannelID(output_channel)
|
||||
|
||||
def init_reader(self, input_channel, writer_actor):
|
||||
conf = {
|
||||
Config.TASK_JOB_ID: ray.runtime_context._get_runtime_context()
|
||||
.current_driver_id,
|
||||
Config.CHANNEL_TYPE: Config.NATIVE_CHANNEL
|
||||
}
|
||||
self.reader = transfer.DataReader([input_channel],
|
||||
[pickle.loads(writer_actor)], conf)
|
||||
|
||||
def start_write(self, msg_nums):
|
||||
self.t = threading.Thread(
|
||||
target=self.run_writer, args=[msg_nums], daemon=True)
|
||||
self.t.start()
|
||||
|
||||
def run_writer(self, msg_nums):
|
||||
for i in range(msg_nums):
|
||||
self.writer.write(self.output_channel_id, pickle.dumps(i))
|
||||
print("WriterWorker done.")
|
||||
|
||||
def start_read(self, msg_nums):
|
||||
self.t = threading.Thread(
|
||||
target=self.run_reader, args=[msg_nums], daemon=True)
|
||||
self.t.start()
|
||||
|
||||
def run_reader(self, msg_nums):
|
||||
count = 0
|
||||
msg = None
|
||||
while count != msg_nums:
|
||||
item = self.reader.read(100)
|
||||
if item is None:
|
||||
time.sleep(0.01)
|
||||
else:
|
||||
msg = pickle.loads(item.body())
|
||||
count += 1
|
||||
assert msg == msg_nums - 1
|
||||
print("ReaderWorker done.")
|
||||
|
||||
def is_finished(self):
|
||||
return not self.t.is_alive()
|
||||
|
||||
def on_reader_message(self, buffer: bytes):
|
||||
"""used in direct call mode"""
|
||||
self.reader_client.on_reader_message(buffer)
|
||||
|
||||
def on_reader_message_sync(self, buffer: bytes):
|
||||
"""used in direct call mode"""
|
||||
if self.reader_client is None:
|
||||
return b" " * 4 # special flag to indicate this actor not ready
|
||||
result = self.reader_client.on_reader_message_sync(buffer)
|
||||
return result.to_pybytes()
|
||||
|
||||
def on_writer_message(self, buffer: bytes):
|
||||
"""used in direct call mode"""
|
||||
self.writer_client.on_writer_message(buffer)
|
||||
|
||||
def on_writer_message_sync(self, buffer: bytes):
|
||||
"""used in direct call mode"""
|
||||
if self.writer_client is None:
|
||||
return b" " * 4 # special flag to indicate this actor not ready
|
||||
result = self.writer_client.on_writer_message_sync(buffer)
|
||||
return result.to_pybytes()
|
||||
|
||||
|
||||
def test_queue():
|
||||
ray.init()
|
||||
writer = Worker._remote(is_direct_call=True)
|
||||
reader = Worker._remote(is_direct_call=True)
|
||||
channel_id_str = transfer.ChannelID.gen_random_id()
|
||||
inits = [
|
||||
writer.init_writer.remote(channel_id_str, pickle.dumps(reader)),
|
||||
reader.init_reader.remote(channel_id_str, pickle.dumps(writer))
|
||||
]
|
||||
ray.get(inits)
|
||||
msg_nums = 1000
|
||||
print("start read/write")
|
||||
reader.start_read.remote(msg_nums)
|
||||
writer.start_write.remote(msg_nums)
|
||||
while not ray.get(reader.is_finished.remote()):
|
||||
time.sleep(0.1)
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_queue()
|
||||
@@ -0,0 +1,210 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.streaming.streaming import Environment, ExecutionGraph
|
||||
from ray.streaming.operator import OpType, PStrategy
|
||||
|
||||
|
||||
def test_parallelism():
|
||||
"""Tests operator parallelism."""
|
||||
env = Environment()
|
||||
# Try setting a common parallelism for all operators
|
||||
env.set_parallelism(2)
|
||||
stream = env.source(None).map(None).filter(None).flat_map(None)
|
||||
env._collect_garbage()
|
||||
for operator in env.operators.values():
|
||||
if operator.type == OpType.Source:
|
||||
# TODO (john): Currently each source has only one instance
|
||||
assert operator.num_instances == 1, (operator.num_instances, 1)
|
||||
else:
|
||||
assert operator.num_instances == 2, (operator.num_instances, 2)
|
||||
# Check again after adding an operator with different parallelism
|
||||
stream.map(None, "Map1").shuffle().set_parallelism(3).map(
|
||||
None, "Map2").set_parallelism(4)
|
||||
env._collect_garbage()
|
||||
for operator in env.operators.values():
|
||||
if operator.type == OpType.Source:
|
||||
assert operator.num_instances == 1, (operator.num_instances, 1)
|
||||
elif operator.name != "Map1" and operator.name != "Map2":
|
||||
assert operator.num_instances == 2, (operator.num_instances, 2)
|
||||
elif operator.name != "Map2":
|
||||
assert operator.num_instances == 3, (operator.num_instances, 3)
|
||||
else:
|
||||
assert operator.num_instances == 4, (operator.num_instances, 4)
|
||||
|
||||
|
||||
def test_partitioning():
|
||||
"""Tests stream partitioning."""
|
||||
env = Environment()
|
||||
# Try defining multiple partitioning strategies for the same stream
|
||||
_ = env.source(None).shuffle().rescale().broadcast().map(
|
||||
None).broadcast().shuffle()
|
||||
env._collect_garbage()
|
||||
for operator in env.operators.values():
|
||||
p_schemes = operator.partitioning_strategies
|
||||
for scheme in p_schemes.values():
|
||||
# Only last defined strategy should be kept
|
||||
if operator.type == OpType.Source:
|
||||
assert scheme.strategy == PStrategy.Broadcast, (
|
||||
scheme.strategy, PStrategy.Broadcast)
|
||||
else:
|
||||
assert scheme.strategy == PStrategy.Shuffle, (
|
||||
scheme.strategy, PStrategy.Shuffle)
|
||||
|
||||
|
||||
def test_forking():
|
||||
"""Tests stream forking."""
|
||||
env = Environment()
|
||||
# Try forking a stream
|
||||
stream = env.source(None).map(None).set_parallelism(2)
|
||||
# First branch with a shuffle partitioning strategy
|
||||
_ = stream.shuffle().key_by(0).sum(1)
|
||||
# Second branch with the default partitioning strategy
|
||||
_ = stream.key_by(1).sum(2)
|
||||
env._collect_garbage()
|
||||
# Operator ids
|
||||
source_id = None
|
||||
map_id = None
|
||||
keyby1_id = None
|
||||
keyby2_id = None
|
||||
sum1_id = None
|
||||
sum2_id = None
|
||||
# Collect ids
|
||||
for id, operator in env.operators.items():
|
||||
if operator.type == OpType.Source:
|
||||
source_id = id
|
||||
elif operator.type == OpType.Map:
|
||||
map_id = id
|
||||
elif operator.type == OpType.KeyBy:
|
||||
if operator.other_args == 0:
|
||||
keyby1_id = id
|
||||
else:
|
||||
assert operator.other_args == 1, (operator.other_args, 1)
|
||||
keyby2_id = id
|
||||
elif operator.type == OpType.Sum:
|
||||
if operator.other_args == 1:
|
||||
sum1_id = id
|
||||
else:
|
||||
assert operator.other_args == 2, (operator.other_args, 2)
|
||||
sum2_id = id
|
||||
# Check generated streams and their partitioning
|
||||
for source, destination in env.logical_topo.edges:
|
||||
operator = env.operators[source]
|
||||
if source == source_id:
|
||||
assert destination == map_id, (destination, map_id)
|
||||
elif source == map_id:
|
||||
p_scheme = operator.partitioning_strategies[destination]
|
||||
strategy = p_scheme.strategy
|
||||
key_index = env.operators[destination].other_args
|
||||
if key_index == 0: # This must be the first branch
|
||||
assert strategy == PStrategy.Shuffle, (strategy,
|
||||
PStrategy.Shuffle)
|
||||
assert destination == keyby1_id, (destination, keyby1_id)
|
||||
else: # This must be the second branch
|
||||
assert key_index == 1, (key_index, 1)
|
||||
assert strategy == PStrategy.Forward, (strategy,
|
||||
PStrategy.Forward)
|
||||
assert destination == keyby2_id, (destination, keyby2_id)
|
||||
elif source == keyby1_id or source == keyby2_id:
|
||||
p_scheme = operator.partitioning_strategies[destination]
|
||||
strategy = p_scheme.strategy
|
||||
key_index = env.operators[destination].other_args
|
||||
if key_index == 1: # This must be the first branch
|
||||
assert strategy == PStrategy.ShuffleByKey, (
|
||||
strategy, PStrategy.ShuffleByKey)
|
||||
assert destination == sum1_id, (destination, sum1_id)
|
||||
else: # This must be the second branch
|
||||
assert key_index == 2, (key_index, 2)
|
||||
assert strategy == PStrategy.ShuffleByKey, (
|
||||
strategy, PStrategy.ShuffleByKey)
|
||||
assert destination == sum2_id, (destination, sum2_id)
|
||||
else: # This must be a sum operator
|
||||
assert operator.type == OpType.Sum, (operator.type, OpType.Sum)
|
||||
|
||||
|
||||
def _test_shuffle_channels():
|
||||
"""Tests shuffling connectivity."""
|
||||
env = Environment()
|
||||
# Try defining a shuffle
|
||||
_ = env.source(None).shuffle().map(None).set_parallelism(4)
|
||||
expected = [(0, 0), (0, 1), (0, 2), (0, 3)]
|
||||
_test_channels(env, expected)
|
||||
|
||||
|
||||
def _test_forward_channels():
|
||||
"""Tests forward connectivity."""
|
||||
env = Environment()
|
||||
# Try the default partitioning strategy
|
||||
_ = env.source(None).set_parallelism(4).map(None).set_parallelism(2)
|
||||
expected = [(0, 0), (1, 1), (2, 0), (3, 1)]
|
||||
_test_channels(env, expected)
|
||||
|
||||
|
||||
def _test_broadcast_channels():
|
||||
"""Tests broadcast connectivity."""
|
||||
env = Environment()
|
||||
# Try broadcasting
|
||||
_ = env.source(None).set_parallelism(4).broadcast().map(
|
||||
None).set_parallelism(2)
|
||||
expected = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1), (3, 0), (3, 1)]
|
||||
_test_channels(env, expected)
|
||||
|
||||
|
||||
def _test_round_robin_channels():
|
||||
"""Tests round-robin connectivity."""
|
||||
env = Environment()
|
||||
# Try broadcasting
|
||||
_ = env.source(None).round_robin().map(None).set_parallelism(2)
|
||||
expected = [(0, 0), (0, 1)]
|
||||
_test_channels(env, expected)
|
||||
|
||||
|
||||
def _test_channels(environment, expected_channels):
|
||||
"""Tests operator connectivity."""
|
||||
environment._collect_garbage()
|
||||
map_id = None
|
||||
# Get id
|
||||
for id, operator in environment.operators.items():
|
||||
if operator.type == OpType.Map:
|
||||
map_id = id
|
||||
# Collect channels
|
||||
environment.execution_graph = ExecutionGraph(environment)
|
||||
environment.execution_graph.build_channels()
|
||||
channels_per_destination = []
|
||||
for operator in environment.operators.values():
|
||||
channels_per_destination.append(
|
||||
environment.execution_graph._generate_channels(operator))
|
||||
# Check actual connectivity
|
||||
actual = []
|
||||
for destination in channels_per_destination:
|
||||
for channels in destination.values():
|
||||
for channel in channels:
|
||||
src_instance_index = channel.src_instance_index
|
||||
dst_instance_index = channel.dst_instance_index
|
||||
connection = (src_instance_index, dst_instance_index)
|
||||
assert channel.dst_operator_id == map_id, (
|
||||
channel.dst_operator_id, map_id)
|
||||
actual.append(connection)
|
||||
# Make sure connections are as expected
|
||||
set_1 = set(expected_channels)
|
||||
set_2 = set(actual)
|
||||
assert set_1 == set_2, (set_1, set_2)
|
||||
|
||||
|
||||
def test_channel_generation():
|
||||
"""Tests data channel generation."""
|
||||
_test_shuffle_channels()
|
||||
_test_broadcast_channels()
|
||||
_test_round_robin_channels()
|
||||
_test_forward_channels()
|
||||
|
||||
|
||||
# TODO (john): Add simple wordcount test
|
||||
def test_wordcount():
|
||||
"""Tests a simple streaming wordcount."""
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_channel_generation()
|
||||
@@ -0,0 +1,20 @@
|
||||
import ray
|
||||
from ray.streaming.config import Config
|
||||
from ray.streaming.streaming import Environment, Conf
|
||||
|
||||
|
||||
def test_word_count():
|
||||
ray.init()
|
||||
env = Environment(config=Conf(channel_type=Config.NATIVE_CHANNEL))
|
||||
env.read_text_file(__file__) \
|
||||
.set_parallelism(1) \
|
||||
.filter(lambda x: "word" in x) \
|
||||
.inspect(lambda x: print("result", x))
|
||||
env_handle = env.execute()
|
||||
ray.get(env_handle) # Stay alive until execution finishes
|
||||
env.wait_finish()
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_word_count()
|
||||
Reference in New Issue
Block a user