[Streaming] Streaming Python API (#6755)

This commit is contained in:
chaokunyang
2020-02-25 10:33:33 +08:00
committed by GitHub
parent 2c1f4fd82c
commit 8b6784de06
71 changed files with 2701 additions and 1928 deletions
+229 -95
View File
@@ -1,109 +1,243 @@
from abc import ABC, abstractmethod
import enum
import logging
import cloudpickle
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")
from ray import streaming
from ray.streaming import function
from ray.streaming import message
# Stream partitioning schemes
class PScheme:
def __init__(self, strategy, partition_fn=None):
self.strategy = strategy
self.partition_fn = partition_fn
def __repr__(self):
return "({},{})".format(self.strategy, self.partition_fn)
class OperatorType(enum.Enum):
SOURCE = 0 # Sources are where your program reads its input from
ONE_INPUT = 1 # This operator has one data stream as it's input stream.
TWO_INPUT = 2 # This operator has two data stream as it's input stream.
# Partitioning strategies
class PStrategy(enum.Enum):
Forward = 0 # Default
Shuffle = 1
Rescale = 2
RoundRobin = 3
Broadcast = 4
Custom = 5
ShuffleByKey = 6
# ...
class Operator(ABC):
"""
Abstract base class for all operators.
An operator is used to run a :class:`function.Function`.
"""
@abstractmethod
def open(self, collectors, runtime_context):
pass
@abstractmethod
def finish(self):
pass
@abstractmethod
def close(self):
pass
@abstractmethod
def operator_type(self) -> OperatorType:
pass
# Operator types
class OpType(enum.Enum):
Source = 0
Map = 1
FlatMap = 2
Filter = 3
TimeWindow = 4
KeyBy = 5
Sink = 6
WindowJoin = 7
Inspect = 8
ReadTextFile = 9
Reduce = 10
Sum = 11
# ...
class OneInputOperator(Operator, ABC):
"""Interface for stream operators with one input."""
@abstractmethod
def process_element(self, record):
pass
def operator_type(self):
return OperatorType.ONE_INPUT
# A logical dataflow operator
class Operator:
def __init__(self,
id,
op_type,
processor_class,
name="",
logic=None,
num_instances=1,
other=None,
state_actor=None):
self.id = id
self.type = op_type
self.processor_class = processor_class
self.name = name
self._logic = cloudpickle.dumps(logic) # The operator's logic
self.num_instances = num_instances
# One partitioning strategy per downstream operator (default: forward)
self.partitioning_strategies = {}
self.other_args = other # Depends on the type of the operator
self.state_actor = state_actor # Actor to query state
class TwoInputOperator(Operator, ABC):
"""Interface for stream operators with two input"""
# Sets the partitioning scheme for an output stream of the operator
def _set_partition_strategy(self,
stream_id,
partitioning_scheme,
dest_operator=None):
self.partitioning_strategies[stream_id] = (partitioning_scheme,
dest_operator)
@abstractmethod
def process_element(self, record1, record2):
pass
# Retrieves the partitioning scheme for the given
# output stream of the operator
# Returns None is no strategy has been defined for the particular stream
def _get_partition_strategy(self, stream_id):
return self.partitioning_strategies.get(stream_id)
def operator_type(self):
return OperatorType.TWO_INPUT
# Cleans metatada from all partitioning strategies that lack a
# destination operator
# Valid entries are re-organized as
# 'destination operator id -> partitioning scheme'
# Should be called only after the logical dataflow has been constructed
def _clean(self):
strategies = {}
for _, v in self.partitioning_strategies.items():
strategy, destination_operator = v
if destination_operator is not None:
strategies.setdefault(destination_operator, strategy)
self.partitioning_strategies = strategies
def print(self):
log = "Operator<\nID = {}\nName = {}\nprocessor_class = {}\n"
log += "Logic = {}\nNumber_of_Instances = {}\n"
log += "Partitioning_Scheme = {}\nOther_Args = {}>\n"
logger.debug(
log.format(self.id, self.name, self.processor_class, self.logic,
self.num_instances, self.partitioning_strategies,
self.other_args))
class StreamOperator(Operator, ABC):
"""
Basic interface for stream operators. Implementers would implement one of
:class:`OneInputOperator` or :class:`TwoInputOperator` to to create
operators that process elements.
"""
@property
def logic(self):
return cloudpickle.loads(self._logic)
def __init__(self, func):
self.func = func
self.collectors = None
self.runtime_context = None
def open(self, collectors, runtime_context):
self.collectors = collectors
self.runtime_context = runtime_context
def finish(self):
pass
def close(self):
pass
def collect(self, record):
for collector in self.collectors:
collector.collect(record)
class SourceOperator(StreamOperator):
"""
Operator to run a :class:`function.SourceFunction`
"""
class SourceContextImpl(function.SourceContext):
def __init__(self, collectors):
self.collectors = collectors
def collect(self, value):
for collector in self.collectors:
collector.collect(message.Record(value))
def __init__(self, func):
assert isinstance(func, function.SourceFunction)
super().__init__(func)
self.source_context = None
def open(self, collectors, runtime_context):
super().open(collectors, runtime_context)
self.source_context = SourceOperator.SourceContextImpl(collectors)
self.func.init(runtime_context.get_parallelism(),
runtime_context.get_task_index())
def run(self):
self.func.run(self.source_context)
def operator_type(self):
return OperatorType.SOURCE
class MapOperator(StreamOperator, OneInputOperator):
"""
Operator to run a :class:`function.MapFunction`
"""
def __init__(self, map_func: function.MapFunction):
assert isinstance(map_func, function.MapFunction)
super().__init__(map_func)
def process_element(self, record):
self.collect(message.Record(self.func.map(record.value)))
class FlatMapOperator(StreamOperator, OneInputOperator):
"""
Operator to run a :class:`function.FlatMapFunction`
"""
def __init__(self, flat_map_func: function.FlatMapFunction):
assert isinstance(flat_map_func, function.FlatMapFunction)
super().__init__(flat_map_func)
self.collection_collector = None
def open(self, collectors, runtime_context):
super().open(collectors, runtime_context)
self.collection_collector = streaming.collector.CollectionCollector(
collectors)
def process_element(self, record):
self.func.flat_map(record.value, self.collection_collector)
class FilterOperator(StreamOperator, OneInputOperator):
"""
Operator to run a :class:`function.FilterFunction`
"""
def __init__(self, filter_func: function.FilterFunction):
assert isinstance(filter_func, function.FilterFunction)
super().__init__(filter_func)
def process_element(self, record):
if self.func.filter(record.value):
self.collect(record)
class KeyByOperator(StreamOperator, OneInputOperator):
"""
Operator to run a :class:`function.KeyFunction`
"""
def __init__(self, key_func: function.KeyFunction):
assert isinstance(key_func, function.KeyFunction)
super().__init__(key_func)
def process_element(self, record):
key = self.func.key_by(record.value)
self.collect(message.KeyRecord(key, record.value))
class ReduceOperator(StreamOperator, OneInputOperator):
"""
Operator to run a :class:`function.ReduceFunction`
"""
def __init__(self, reduce_func: function.ReduceFunction):
assert isinstance(reduce_func, function.ReduceFunction)
super().__init__(reduce_func)
self.reduce_state = {}
def open(self, collectors, runtime_context):
super().open(collectors, runtime_context)
def process_element(self, record: message.KeyRecord):
key = record.key
value = record.value
if key in self.reduce_state:
old_value = self.reduce_state[key]
new_value = self.func.reduce(old_value, value)
self.reduce_state[key] = new_value
self.collect(message.Record(new_value))
else:
self.reduce_state[key] = value
self.collect(record)
class SinkOperator(StreamOperator, OneInputOperator):
"""
Operator to run a :class:`function.SinkFunction`
"""
def __init__(self, sink_func: function.SinkFunction):
assert isinstance(sink_func, function.SinkFunction)
super().__init__(sink_func)
def process_element(self, record):
self.func.sink(record.value)
_function_to_operator = {
function.SourceFunction: SourceOperator,
function.MapFunction: MapOperator,
function.FlatMapFunction: FlatMapOperator,
function.FilterFunction: FilterOperator,
function.KeyFunction: KeyByOperator,
function.ReduceFunction: ReduceOperator,
function.SinkFunction: SinkOperator,
}
def create_operator(func: function.Function):
"""Create an operator according to a :class:`function.Function`
Args:
func: a subclass of function.Function
Returns:
an operator
"""
operator_class = None
super_classes = func.__class__.mro()
for super_class in super_classes:
operator_class = _function_to_operator.get(super_class, None)
if operator_class is not None:
break
assert operator_class is not None
return operator_class(func)