[Streaming] Streaming data transfer java (#6474)

This commit is contained in:
Chaokun Yang
2019-12-22 10:56:05 +08:00
committed by Hao Chen
parent 1b14fbe179
commit 7bbfa85c66
146 changed files with 3923 additions and 786 deletions
@@ -0,0 +1,26 @@
package org.ray.streaming.api.collector;
import java.util.List;
import org.ray.streaming.api.collector.Collector;
import org.ray.streaming.message.Record;
/**
 * A collector that forwards every collected value to each collector of a list.
 *
 * @param <T> The type of output data.
 */
public class CollectionCollector<T> implements Collector<T> {

  /** The collectors that receive every value, in iteration order. */
  private List<Collector> collectorList;

  /**
   * @param collectorList The collectors to forward to; the list is stored by reference.
   */
  public CollectionCollector(List<Collector> collectorList) {
    this.collectorList = collectorList;
  }

  /**
   * Wraps {@code value} in a new {@code Record} for each collector and passes it on.
   *
   * @param value The value to forward.
   */
  @Override
  public void collect(T value) {
    for (Collector downstream : collectorList) {
      // A separate Record instance is created per downstream collector.
      Record wrapped = new Record(value);
      downstream.collect(wrapped);
    }
  }
}
@@ -0,0 +1,13 @@
package org.ray.streaming.api.collector;
/**
 * Receives the data produced by an upstream operator and emits it to downstream operators.
 *
 * @param <T> Type of the data to collect.
 */
public interface Collector<T> {

  /**
   * Collects a single value.
   *
   * @param value The value to collect.
   */
  void collect(T value);
}
@@ -0,0 +1,18 @@
package org.ray.streaming.api.context;
/**
 * Runtime information of a streaming task, exposed to operators when they are opened.
 */
public interface RuntimeContext {

  /** @return The id of the task. */
  int getTaskId();

  /** @return The index of the task. */
  int getTaskIndex();

  /** @return The parallelism. */
  int getParallelism();

  /** @return The batch id, as a boxed {@code Long}. */
  Long getBatchId();

  /** @return The max batch, as a boxed {@code Long}. */
  Long getMaxBatch();
}
@@ -0,0 +1,70 @@
package org.ray.streaming.api.context;
import com.google.common.base.Preconditions;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.ServiceLoader;
import java.util.concurrent.atomic.AtomicInteger;
import org.ray.streaming.api.stream.StreamSink;
import org.ray.streaming.plan.Plan;
import org.ray.streaming.plan.PlanBuilder;
import org.ray.streaming.schedule.JobScheduler;
/**
 * Job-level state of a streaming job: its sinks, its configuration and, once
 * {@link #execute()} has run, its logical plan.
 */
public class StreamingContext implements Serializable {

  /** Source of the ids returned by {@link #generateId()}; transient, so not serialized. */
  private transient AtomicInteger idGenerator;

  /**
   * The sinks of this streaming job.
   */
  private List<StreamSink> streamSinks;

  /** The job configuration handed to the scheduler. */
  private Map<String, Object> jobConfig;

  /**
   * The logic plan.
   */
  private Plan plan;

  private StreamingContext() {
    this.idGenerator = new AtomicInteger(0);
    this.streamSinks = new ArrayList<>();
    this.jobConfig = new HashMap<>();
  }

  /**
   * @return A new context with no sinks and an empty configuration.
   */
  public static StreamingContext buildContext() {
    return new StreamingContext();
  }

  /**
   * Builds the plan from the registered sinks, prints it, and hands it together with the job
   * configuration to the first {@code JobScheduler} found through {@link ServiceLoader}.
   *
   * @throws IllegalArgumentException if no JobScheduler implementation is available.
   */
  public void execute() {
    this.plan = new PlanBuilder(this.streamSinks).buildPlan();
    this.plan.printPlan();

    Iterator<JobScheduler> schedulers = ServiceLoader.load(JobScheduler.class).iterator();
    Preconditions.checkArgument(schedulers.hasNext(),
        "No JobScheduler implementation has been provided.");
    JobScheduler scheduler = schedulers.next();
    scheduler.schedule(this.plan, this.jobConfig);
  }

  /**
   * @return The next id; ids start at 1 and increase by one per call.
   */
  public int generateId() {
    return this.idGenerator.incrementAndGet();
  }

  /**
   * Registers a sink of this job.
   *
   * @param streamSink The sink to register.
   */
  public void addSink(StreamSink streamSink) {
    this.streamSinks.add(streamSink);
  }

  /**
   * Replaces the job configuration. The map is stored by reference.
   *
   * @param jobConfig The new job configuration.
   */
  public void withConfig(Map<String, Object> jobConfig) {
    this.jobConfig = jobConfig;
  }
}
@@ -0,0 +1,10 @@
package org.ray.streaming.api.function;
import java.io.Serializable;
/**
 * Marker interface of streaming functions. It declares no methods and only requires
 * implementations to be {@link Serializable}.
 */
public interface Function extends Serializable {

}
@@ -0,0 +1,23 @@
package org.ray.streaming.api.function.impl;
import org.ray.streaming.api.function.Function;
/**
 * Contract of aggregate functions, which fold input values into an accumulator and derive a
 * result from it.
 *
 * @param <I> Type of the input data.
 * @param <A> Type of the intermediate data.
 * @param <O> Type of the output data.
 */
public interface AggregateFunction<I, A, O> extends Function {

  /**
   * @return An accumulator of the intermediate type.
   */
  A createAccumulator();

  /**
   * Adds an input value to an accumulator.
   *
   * @param value The input value.
   * @param accumulator The accumulator to update.
   */
  void add(I value, A accumulator);

  /**
   * @param accumulator The accumulator to read.
   * @return The output derived from the accumulator.
   */
  O getResult(A accumulator);

  /**
   * @param a The first accumulator.
   * @param b The second accumulator.
   * @return The merged accumulator.
   */
  A merge(A a, A b);

  /**
   * Retracts an input value from an accumulator. Note the parameter order is the reverse of
   * {@link #add(Object, Object)}.
   *
   * @param acc The accumulator to update.
   * @param value The input value.
   */
  void retract(A acc, I value);
}
@@ -0,0 +1,16 @@
package org.ray.streaming.api.function.impl;
import org.ray.streaming.api.collector.Collector;
import org.ray.streaming.api.function.Function;
/**
 * Contract of flat-map functions, which emit their output through a collector instead of
 * returning it.
 *
 * @param <T> Type of the input data.
 * @param <R> Type of the output data.
 */
@FunctionalInterface
public interface FlatMapFunction<T, R> extends Function {

  /**
   * @param value The input value.
   * @param collector The collector that receives the output values.
   */
  void flatMap(T value, Collector<R> collector);
}
@@ -0,0 +1,17 @@
package org.ray.streaming.api.function.impl;
import org.ray.streaming.api.function.Function;
/**
 * Contract of join functions, which combine a left and a right value into one result.
 *
 * @param <T> Type of the left input data.
 * @param <O> Type of the right input data.
 * @param <R> Type of the output data.
 */
@FunctionalInterface
public interface JoinFunction<T, O, R> extends Function {

  /**
   * @param left The value from the left input.
   * @param right The value from the right input.
   * @return The joined value.
   */
  R join(T left, O right);
}
@@ -0,0 +1,15 @@
package org.ray.streaming.api.function.impl;
import org.ray.streaming.api.function.Function;
/**
 * Contract of key-by functions, which derive a key from an input value.
 *
 * @param <T> Type of the input data.
 * @param <K> Type of the key-by field.
 */
@FunctionalInterface
public interface KeyFunction<T, K> extends Function {

  /**
   * @param value The input value.
   * @return The key of the value.
   */
  K keyBy(T value);
}
@@ -0,0 +1,15 @@
package org.ray.streaming.api.function.impl;
import org.ray.streaming.api.function.Function;
/**
 * Contract of map functions, which turn one input value into one output value.
 *
 * @param <T> Type of the input data.
 * @param <R> Type of the output data.
 */
@FunctionalInterface
public interface MapFunction<T, R> extends Function {

  /**
   * @param value The input value.
   * @return The mapped value.
   */
  R map(T value);
}
@@ -0,0 +1,14 @@
package org.ray.streaming.api.function.impl;
import org.ray.streaming.api.function.Function;
/**
 * Contract of process functions, which consume a value without returning a result.
 *
 * @param <T> Type of the input data.
 */
@FunctionalInterface
public interface ProcessFunction<T> extends Function {

  /**
   * @param value The input value.
   */
  void process(T value);
}
@@ -0,0 +1,14 @@
package org.ray.streaming.api.function.impl;
import org.ray.streaming.api.function.Function;
/**
 * Contract of reduce functions, which combine two values of the same type into one.
 *
 * @param <T> Type of the input data.
 */
@FunctionalInterface
public interface ReduceFunction<T> extends Function {

  /**
   * @param oldValue The previously reduced value.
   * @param newValue The incoming value.
   * @return The reduced value.
   */
  T reduce(T oldValue, T newValue);
}
@@ -0,0 +1,14 @@
package org.ray.streaming.api.function.impl;
import org.ray.streaming.api.function.Function;
/**
 * Contract of sink functions, which consume the values that reach a sink.
 *
 * @param <T> Type of the sink data.
 */
@FunctionalInterface
public interface SinkFunction<T> extends Function {

  /**
   * @param value The value to sink.
   */
  void sink(T value);
}
@@ -0,0 +1,23 @@
package org.ray.streaming.api.function.impl;
import org.ray.streaming.api.function.Function;
/**
 * Contract of source functions, which produce data and emit it through a
 * {@link SourceContext}.
 *
 * @param <T> Type of the data output by the source.
 */
public interface SourceFunction<T> extends Function {

  /**
   * Initializes the source.
   *
   * @param parallel The parallelism.
   * @param index The task index.
   */
  void init(int parallel, int index);

  /**
   * Runs the source, emitting elements through {@code ctx}.
   *
   * @param ctx The context used to emit elements.
   * @throws Exception if producing or emitting an element fails.
   */
  void run(SourceContext<T> ctx) throws Exception;

  /** Closes the source. */
  void close();

  /**
   * The channel through which a source emits its elements.
   *
   * @param <T> Type of the emitted elements.
   */
  interface SourceContext<T> {

    /**
     * @param element The element to emit.
     * @throws Exception if the element cannot be emitted.
     */
    void collect(T element) throws Exception;
  }
}
@@ -0,0 +1,37 @@
package org.ray.streaming.api.function.internal;
import java.util.ArrayList;
import java.util.Collection;
import org.ray.streaming.api.function.impl.SourceFunction;
/**
 * A SourceFunction that emits the elements of a Java Collection.
 *
 * @param <T> Type of the data output by the source.
 */
public class CollectionSourceFunction<T> implements SourceFunction<T> {

  /** The elements still to be emitted; replaced by an empty list after a completed run. */
  private Collection<T> values;

  /**
   * @param values The elements to emit; the collection is stored by reference.
   */
  public CollectionSourceFunction(Collection<T> values) {
    this.values = values;
  }

  @Override
  public void init(int parallel, int index) {
    // Nothing to initialize.
  }

  /**
   * Emits every element in iteration order, then drops the elements so that a later call emits
   * nothing. If emitting fails, the elements are kept.
   */
  @Override
  public void run(SourceContext<T> ctx) throws Exception {
    final Collection<T> pending = this.values;
    for (T element : pending) {
      ctx.collect(element);
    }
    // All elements were emitted: swap in an empty collection.
    this.values = new ArrayList<>();
  }

  @Override
  public void close() {
    // Nothing to release.
  }
}
@@ -0,0 +1,23 @@
package org.ray.streaming.api.partition;
import org.ray.streaming.api.function.Function;
/**
 * Contract of partitioning strategies.
 *
 * @param <T> Type of the input data.
 */
@FunctionalInterface
public interface Partition<T> extends Function {

  /**
   * Decides which downstream partition(s) receive a record.
   *
   * @param record The record.
   * @param numPartition The number of downstream partitions.
   * @return IDs of the downstream partitions that should receive the record.
   */
  int[] partition(T record, int numPartition);
}
@@ -0,0 +1,24 @@
package org.ray.streaming.api.partition.impl;
import java.util.stream.IntStream;
import org.ray.streaming.api.partition.Partition;
/**
 * Partition strategy that sends every record to all downstream partitions.
 */
public class BroadcastPartition<T> implements Partition<T> {

  /** Cached result; rebuilt whenever its length differs from the requested partition count. */
  private int[] partitions = new int[0];

  public BroadcastPartition() {
  }

  /**
   * @param value The record (not inspected).
   * @param numPartition The number of downstream partitions.
   * @return The ids {@code 0 .. numPartition - 1}. The returned array is cached and shared
   *     between calls.
   */
  @Override
  public int[] partition(T value, int numPartition) {
    if (partitions.length == numPartition) {
      return partitions;
    }
    partitions = IntStream.rangeClosed(0, numPartition - 1).toArray();
    return partitions;
  }
}
@@ -0,0 +1,20 @@
package org.ray.streaming.api.partition.impl;
import org.ray.streaming.api.partition.Partition;
import org.ray.streaming.message.KeyRecord;
/**
 * Partition strategy that picks the downstream partition from the hash code of the record key.
 *
 * @param <K> Type of the partition key.
 * @param <T> Type of the input record.
 */
public class KeyPartition<K, T> implements Partition<KeyRecord<K, T>> {

  /** Single-element result array, reused for every call. */
  private int[] partitions = new int[1];

  /**
   * @param keyRecord The keyed record; its key must not be null.
   * @param numPartition The number of downstream partitions.
   * @return A one-element array holding {@code |hashCode(key) % numPartition|}. The array is
   *     reused between calls.
   */
  @Override
  public int[] partition(KeyRecord<K, T> keyRecord, int numPartition) {
    final int keyHash = keyRecord.getKey().hashCode();
    partitions[0] = Math.abs(keyHash % numPartition);
    return partitions;
  }
}
@@ -0,0 +1,24 @@
package org.ray.streaming.api.partition.impl;
import org.ray.streaming.api.partition.Partition;
/**
 * Partition strategy that cycles through the downstream partitions in a round-robin manner.
 *
 * @param <T> Type of the input record.
 */
public class RoundRobinPartition<T> implements Partition<T> {

  /** The partition id returned by the previous call (0 before the first call). */
  private int seq = 0;

  /** Single-element result array, reused for every call. */
  private int[] partitions = new int[1];

  public RoundRobinPartition() {
  }

  /**
   * @param value The record (not inspected).
   * @param numPartition The number of downstream partitions.
   * @return A one-element array holding the next partition id, computed as
   *     {@code (previous + 1) % numPartition}. The array is reused between calls.
   */
  @Override
  public int[] partition(T value, int numPartition) {
    final int next = (seq + 1) % numPartition;
    seq = next;
    partitions[0] = next;
    return partitions;
  }
}
@@ -0,0 +1,136 @@
package org.ray.streaming.api.stream;
import org.ray.streaming.api.context.StreamingContext;
import org.ray.streaming.api.function.impl.FlatMapFunction;
import org.ray.streaming.api.function.impl.KeyFunction;
import org.ray.streaming.api.function.impl.MapFunction;
import org.ray.streaming.api.function.impl.SinkFunction;
import org.ray.streaming.api.partition.Partition;
import org.ray.streaming.api.partition.impl.BroadcastPartition;
import org.ray.streaming.operator.StreamOperator;
import org.ray.streaming.operator.impl.FlatMapOperator;
import org.ray.streaming.operator.impl.KeyByOperator;
import org.ray.streaming.operator.impl.MapOperator;
import org.ray.streaming.operator.impl.SinkOperator;
/**
 * A stream of data on which the streaming transformations are defined.
 *
 * <p>Transformations such as {@code map} or {@code keyBy} create a new stream that has this
 * stream as input, whereas {@code broadcast}, {@code partitionBy} and {@code setParallelism}
 * modify this stream and return it.
 *
 * @param <T> Type of data in the stream.
 */
public class DataStream<T> extends Stream<T> {

  /**
   * Creates a stream without an input stream.
   *
   * @param streamingContext The context of the job.
   * @param streamOperator The operator of this stream.
   */
  public DataStream(StreamingContext streamingContext, StreamOperator streamOperator) {
    super(streamingContext, streamOperator);
  }

  /**
   * Creates a stream that is fed by {@code input}.
   *
   * @param input The input stream.
   * @param streamOperator The operator of this stream.
   */
  public DataStream(DataStream input, StreamOperator streamOperator) {
    super(input, streamOperator);
  }

  /**
   * Applies a map function to this stream.
   *
   * @param mapFunction The map function.
   * @param <R> Type of data returned by the map function.
   * @return A new DataStream.
   */
  public <R> DataStream<R> map(MapFunction<T, R> mapFunction) {
    MapOperator mapOperator = new MapOperator(mapFunction);
    return new DataStream<>(this, mapOperator);
  }

  /**
   * Applies a flat-map function to this stream.
   *
   * @param flatMapFunction The flat-map function.
   * @param <R> Type of data returned by the flat-map function.
   * @return A new DataStream.
   */
  public <R> DataStream<R> flatMap(FlatMapFunction<T, R> flatMapFunction) {
    FlatMapOperator flatMapOperator = new FlatMapOperator(flatMapFunction);
    return new DataStream(this, flatMapOperator);
  }

  /**
   * Unions this stream with another stream. The resulting stream has no operator.
   *
   * @param other Another stream.
   * @return A new UnionStream.
   */
  public UnionStream<T> union(DataStream<T> other) {
    return new UnionStream(this, null, other);
  }

  /**
   * Starts a join of this stream with another stream.
   *
   * @param other Another stream.
   * @param <O> The type of the other stream data.
   * @param <R> The type of the data in the joined stream.
   * @return A new JoinStream.
   */
  public <O, R> JoinStream<T, O, R> join(DataStream<O> other) {
    return new JoinStream<>(this, other);
  }

  /**
   * Placeholder for a process transformation: the returned stream has a {@code null} operator.
   *
   * @param <R> Type of data in the returned stream.
   * @return A new DataStream.
   */
  public <R> DataStream<R> process() {
    // TODO(zhenxuanpan): Need to add processFunction.
    return new DataStream(this, null);
  }

  /**
   * Applies a sink function and gets a StreamSink.
   *
   * @param sinkFunction The sink function.
   * @return A new StreamSink.
   */
  public StreamSink<T> sink(SinkFunction<T> sinkFunction) {
    SinkOperator sinkOperator = new SinkOperator(sinkFunction);
    return new StreamSink<>(this, sinkOperator);
  }

  /**
   * Applies a key-by function to this stream.
   *
   * @param keyFunction The key function.
   * @param <K> The type of the key.
   * @return A new KeyDataStream.
   */
  public <K> KeyDataStream<K, T> keyBy(KeyFunction<T, K> keyFunction) {
    KeyByOperator keyByOperator = new KeyByOperator(keyFunction);
    return new KeyDataStream<>(this, keyByOperator);
  }

  /**
   * Switches the partition strategy of this stream to broadcast.
   *
   * @return This stream.
   */
  public DataStream<T> broadcast() {
    this.partition = new BroadcastPartition<>();
    return this;
  }

  /**
   * Sets the partition strategy of this stream.
   *
   * @param partition The partitioning strategy.
   * @return This stream.
   */
  public DataStream<T> partitionBy(Partition<T> partition) {
    this.partition = partition;
    return this;
  }

  /**
   * Sets the parallelism of the current transformation.
   *
   * @param parallelism The parallelism to set.
   * @return This stream.
   */
  public DataStream<T> setParallelism(int parallelism) {
    this.parallelism = parallelism;
    return this;
  }
}
@@ -0,0 +1,82 @@
package org.ray.streaming.api.stream;
import java.io.Serializable;
import org.ray.streaming.api.context.StreamingContext;
import org.ray.streaming.api.function.impl.JoinFunction;
import org.ray.streaming.api.function.impl.KeyFunction;
import org.ray.streaming.operator.StreamOperator;
/**
 * A DataStream that represents the join of two DataStreams.
 *
 * @param <L> Type of the data in the left stream.
 * @param <R> Type of the data in the right stream.
 * @param <J> Type of the data in the joined stream.
 */
public class JoinStream<L, R, J> extends DataStream<L> {

  public JoinStream(StreamingContext streamingContext, StreamOperator streamOperator) {
    super(streamingContext, streamOperator);
  }

  /**
   * Creates a join stream whose input is the left stream and whose operator is {@code null}.
   * The right stream is currently not stored.
   *
   * @param leftStream The left stream.
   * @param rightStream The right stream.
   */
  public JoinStream(DataStream<L> leftStream, DataStream<R> rightStream) {
    super(leftStream, null);
  }

  /**
   * Applies key-by to the left join stream.
   *
   * @param keyFunction The key function of the left stream.
   * @param <K> Type of the join key.
   * @return The where clause of the join.
   */
  public <K> Where<L, R, J, K> where(KeyFunction<L, K> keyFunction) {
    return new Where<>(this, keyFunction);
  }

  /**
   * Where clause of the join transformation.
   *
   * @param <L> Type of the data in the left stream.
   * @param <R> Type of the data in the right stream.
   * @param <J> Type of the data in the joined stream.
   * @param <K> Type of the join key.
   */
  class Where<L, R, J, K> implements Serializable {

    private JoinStream<L, R, J> joinStream;
    private KeyFunction<L, K> leftKeyByFunction;

    public Where(JoinStream<L, R, J> joinStream, KeyFunction<L, K> leftKeyByFunction) {
      this.joinStream = joinStream;
      this.leftKeyByFunction = leftKeyByFunction;
    }

    /**
     * Supplies the key function of the right stream.
     *
     * @param rightKeyFunction The key function of the right stream.
     * @return The equal clause of the join.
     */
    public Equal<L, R, J, K> equalLo(KeyFunction<R, K> rightKeyFunction) {
      return new Equal<>(this.joinStream, this.leftKeyByFunction, rightKeyFunction);
    }
  }

  /**
   * Equal clause of the join transformation.
   *
   * @param <L> Type of the data in the left stream.
   * @param <R> Type of the data in the right stream.
   * @param <J> Type of the data in the joined stream.
   * @param <K> Type of the join key.
   */
  class Equal<L, R, J, K> implements Serializable {

    private JoinStream<L, R, J> joinStream;
    private KeyFunction<L, K> leftKeyByFunction;
    private KeyFunction<R, K> rightKeyByFunction;

    public Equal(JoinStream<L, R, J> joinStream, KeyFunction<L, K> leftKeyByFunction,
        KeyFunction<R, K> rightKeyByFunction) {
      this.joinStream = joinStream;
      this.leftKeyByFunction = leftKeyByFunction;
      this.rightKeyByFunction = rightKeyByFunction;
    }

    /**
     * Returns the join stream viewed as a stream of joined data. The join function is
     * currently not used.
     *
     * @param joinFunction The join function.
     * @return The join stream, cast to {@code DataStream<J>}.
     */
    public DataStream<J> with(JoinFunction<L, R, J> joinFunction) {
      DataStream<J> joined = (DataStream<J>) this.joinStream;
      return joined;
    }
  }
}
@@ -0,0 +1,53 @@
package org.ray.streaming.api.stream;
import org.ray.streaming.api.context.StreamingContext;
import org.ray.streaming.api.function.impl.AggregateFunction;
import org.ray.streaming.api.function.impl.ReduceFunction;
import org.ray.streaming.api.partition.impl.KeyPartition;
import org.ray.streaming.operator.StreamOperator;
import org.ray.streaming.operator.impl.ReduceOperator;
/**
 * A DataStream produced by a key-by operation.
 *
 * @param <K> Type of the key.
 * @param <T> Type of the data.
 */
public class KeyDataStream<K, T> extends DataStream<T> {

  public KeyDataStream(StreamingContext streamingContext, StreamOperator streamOperator) {
    super(streamingContext, streamOperator);
  }

  /**
   * Creates a keyed stream fed by {@code input} and switches its partition strategy to
   * {@code KeyPartition}.
   *
   * @param input The input stream.
   * @param streamOperator The operator of this stream.
   */
  public KeyDataStream(DataStream<T> input, StreamOperator streamOperator) {
    super(input, streamOperator);
    this.partition = new KeyPartition();
  }

  /**
   * Applies a reduce function to this stream.
   *
   * @param reduceFunction The reduce function.
   * @return A new DataStream.
   */
  public DataStream<T> reduce(ReduceFunction reduceFunction) {
    ReduceOperator reduceOperator = new ReduceOperator(reduceFunction);
    return new DataStream<>(this, reduceOperator);
  }

  /**
   * Applies an aggregate function to this stream. The function is currently not used: the
   * returned stream has a {@code null} operator.
   *
   * @param aggregateFunction The aggregate function.
   * @param <A> The type of aggregated intermediate data.
   * @param <O> The type of result data.
   * @return A new DataStream.
   */
  public <A, O> DataStream<O> aggregate(AggregateFunction<T, A, O> aggregateFunction) {
    return new DataStream<>(this, null);
  }

  /**
   * Sets the parallelism of this stream.
   *
   * @param parallelism The parallelism to set.
   * @return This stream.
   */
  public KeyDataStream<K, T> setParallelism(int parallelism) {
    this.parallelism = parallelism;
    return this;
  }
}
@@ -0,0 +1,71 @@
package org.ray.streaming.api.stream;
import java.io.Serializable;
import org.ray.streaming.api.context.StreamingContext;
import org.ray.streaming.api.partition.Partition;
import org.ray.streaming.api.partition.impl.RoundRobinPartition;
import org.ray.streaming.operator.StreamOperator;
/**
 * Abstract base class of all stream types. A stream has an id obtained from the streaming
 * context, an operator, an optional input stream, a parallelism (1 unless inherited or set) and
 * a partition strategy (round-robin unless changed).
 *
 * @param <T> Type of the data in the stream.
 */
public abstract class Stream<T> implements Serializable {

  protected int id;
  protected int parallelism = 1;
  protected StreamOperator operator;
  protected Stream<T> inputStream;
  protected StreamingContext streamingContext;
  protected Partition<T> partition;

  /**
   * Creates a stream without an input stream.
   *
   * @param streamingContext The context of the job; also supplies the id of this stream.
   * @param streamOperator The operator of this stream.
   */
  public Stream(StreamingContext streamingContext, StreamOperator streamOperator) {
    this.streamingContext = streamingContext;
    this.operator = streamOperator;
    this.id = streamingContext.generateId();
    this.partition = new RoundRobinPartition<>();
  }

  /**
   * Creates a stream fed by {@code inputStream}, inheriting its parallelism and its context.
   *
   * @param inputStream The input stream.
   * @param streamOperator The operator of this stream.
   */
  public Stream(Stream<T> inputStream, StreamOperator streamOperator) {
    this.inputStream = inputStream;
    this.parallelism = inputStream.getParallelism();
    this.streamingContext = this.inputStream.getStreamingContext();
    this.operator = streamOperator;
    this.id = this.streamingContext.generateId();
    this.partition = new RoundRobinPartition<>();
  }

  /** @return The input stream, or {@code null} when the stream was created without one. */
  public Stream<T> getInputStream() {
    return this.inputStream;
  }

  /** @return The operator of this stream. */
  public StreamOperator getOperator() {
    return this.operator;
  }

  /** @return The context of the job this stream belongs to. */
  public StreamingContext getStreamingContext() {
    return this.streamingContext;
  }

  /** @return The parallelism of this stream. */
  public int getParallelism() {
    return this.parallelism;
  }

  /**
   * @param parallelism The parallelism to set.
   * @return This stream.
   */
  public Stream<T> setParallelism(int parallelism) {
    this.parallelism = parallelism;
    return this;
  }

  /** @return The id of this stream. */
  public int getId() {
    return this.id;
  }

  /** @return The partition strategy of this stream. */
  public Partition<T> getPartition() {
    return this.partition;
  }

  /** @param partition The partition strategy to use. */
  public void setPartition(Partition<T> partition) {
    this.partition = partition;
  }
}
@@ -0,0 +1,21 @@
package org.ray.streaming.api.stream;
import org.ray.streaming.operator.impl.SinkOperator;
/**
 * A sink of a DataStream.
 *
 * @param <T> Type of the input data of this sink.
 */
public class StreamSink<T> extends Stream<T> {

  /**
   * Creates the sink and registers it with the streaming context of the input stream.
   *
   * @param input The stream that feeds this sink.
   * @param sinkOperator The sink operator.
   */
  public StreamSink(DataStream<T> input, SinkOperator sinkOperator) {
    super(input, sinkOperator);
    this.streamingContext.addSink(this);
  }

  /**
   * @param parallelism The parallelism to set.
   * @return This sink.
   */
  public StreamSink<T> setParallelism(int parallelism) {
    this.parallelism = parallelism;
    return this;
  }
}
@@ -0,0 +1,36 @@
package org.ray.streaming.api.stream;
import java.util.Collection;
import org.ray.streaming.api.context.StreamingContext;
import org.ray.streaming.api.function.impl.SourceFunction;
import org.ray.streaming.api.function.internal.CollectionSourceFunction;
import org.ray.streaming.operator.impl.SourceOperator;
/**
 * A source of a DataStream.
 *
 * @param <T> The type of StreamSource data.
 */
public class StreamSource<T> extends DataStream<T> {

  /**
   * Creates a source stream that wraps {@code sourceFunction} in a {@code SourceOperator}.
   *
   * @param streamingContext The context of the job.
   * @param sourceFunction The function that produces the data.
   */
  public StreamSource(StreamingContext streamingContext, SourceFunction<T> sourceFunction) {
    super(streamingContext, new SourceOperator<>(sourceFunction));
  }

  /**
   * Builds a StreamSource from a collection.
   *
   * @param context Stream context.
   * @param values A collection of values.
   * @param <T> The type of source data.
   * @return A StreamSource.
   */
  public static <T> StreamSource<T> buildSource(StreamingContext context, Collection<T> values) {
    CollectionSourceFunction collectionSource = new CollectionSourceFunction(values);
    return new StreamSource(context, collectionSource);
  }

  /**
   * @param parallelism The parallelism to set.
   * @return This source.
   */
  public StreamSource<T> setParallelism(int parallelism) {
    this.parallelism = parallelism;
    return this;
  }
}
@@ -0,0 +1,25 @@
package org.ray.streaming.api.stream;
import java.util.ArrayList;
import java.util.List;
import org.ray.streaming.operator.StreamOperator;
/**
 * A DataStream that represents a union of streams.
 *
 * @param <T> The type of union data.
 */
public class UnionStream<T> extends DataStream<T> {

  /** The streams unioned with the input stream. */
  private List<DataStream> unionStreams;

  /**
   * @param input The input stream.
   * @param streamOperator The operator of this stream.
   * @param other The stream to union with the input stream.
   */
  public UnionStream(DataStream input, StreamOperator streamOperator, DataStream<T> other) {
    super(input, streamOperator);
    List<DataStream> streams = new ArrayList<>();
    streams.add(other);
    this.unionStreams = streams;
  }

  /** @return The internal, mutable list of unioned streams. */
  public List<DataStream> getUnionStreams() {
    return this.unionStreams;
  }
}
@@ -0,0 +1,20 @@
package org.ray.streaming.message;
/**
 * A record that carries a key in addition to its value.
 *
 * @param <K> Type of the key.
 * @param <T> Type of the value.
 */
public class KeyRecord<K, T> extends Record<T> {

  private K key;

  /**
   * @param key The key of the record.
   * @param value The value of the record.
   */
  public KeyRecord(K key, T value) {
    super(value);
    this.key = key;
  }

  /** @return The key of the record. */
  public K getKey() {
    return this.key;
  }

  /** @param key The new key of the record. */
  public void setKey(K key) {
    this.key = key;
  }
}
@@ -0,0 +1,64 @@
package org.ray.streaming.message;
import com.google.common.collect.Lists;
import java.io.Serializable;
import java.util.List;
/**
 * A serializable batch of records together with the task id, batch id and stream label it
 * belongs to.
 */
public class Message implements Serializable {

  private int taskId;
  private long batchId;
  private String stream;
  private List<Record> recordList;

  /**
   * Creates a message from a list of records. The list is stored by reference.
   *
   * @param taskId The task id.
   * @param batchId The batch id.
   * @param stream The stream label.
   * @param recordList The records of the message.
   */
  public Message(int taskId, long batchId, String stream, List<Record> recordList) {
    this.taskId = taskId;
    this.batchId = batchId;
    this.stream = stream;
    this.recordList = recordList;
  }

  /**
   * Creates a message that holds a single record in a new mutable list.
   *
   * @param taskId The task id.
   * @param batchId The batch id.
   * @param stream The stream label.
   * @param record The only record of the message.
   */
  public Message(int taskId, long batchId, String stream, Record record) {
    this.taskId = taskId;
    this.batchId = batchId;
    this.stream = stream;
    this.recordList = Lists.newArrayList(record);
  }

  public int getTaskId() {
    return taskId;
  }

  public void setTaskId(int taskId) {
    this.taskId = taskId;
  }

  public long getBatchId() {
    return batchId;
  }

  public void setBatchId(long batchId) {
    this.batchId = batchId;
  }

  public String getStream() {
    return stream;
  }

  public void setStream(String stream) {
    this.stream = stream;
  }

  /** @return The internal list of records (not a copy). */
  public List<Record> getRecordList() {
    return recordList;
  }

  public void setRecordList(List<Record> recordList) {
    this.recordList = recordList;
  }

  /**
   * Returns the record at the given position.
   *
   * @param index Zero-based position of the record in the record list.
   * @return The record at {@code index}.
   * @throws IndexOutOfBoundsException if {@code index} is out of range.
   */
  public Record getRecord(int index) {
    // Honor the requested index; the previous code always returned the first record.
    return recordList.get(index);
  }
}
@@ -0,0 +1,35 @@
package org.ray.streaming.message;
import java.io.Serializable;
/**
 * A serializable wrapper around a single value flowing through the stream.
 *
 * @param <T> Type of the wrapped value.
 */
public class Record<T> implements Serializable {

  /** Stream label of the record; transient, so it is not part of the serialized form. */
  protected transient String stream;

  /** The wrapped value; may be {@code null}. */
  protected T value;

  /** @param value The value to wrap. */
  public Record(T value) {
    this.value = value;
  }

  /** @return The wrapped value. */
  public T getValue() {
    return value;
  }

  /** @param value The new wrapped value. */
  public void setValue(T value) {
    this.value = value;
  }

  /** @return The stream label, or {@code null} if none was set. */
  public String getStream() {
    return stream;
  }

  /** @param stream The stream label. */
  public void setStream(String stream) {
    this.stream = stream;
  }

  /**
   * @return The string form of the wrapped value, or {@code "null"} when the value is null.
   */
  @Override
  public String toString() {
    // String.valueOf avoids a NullPointerException for records that wrap a null value.
    return String.valueOf(value);
  }
}
@@ -0,0 +1,13 @@
package org.ray.streaming.operator;
import org.ray.streaming.message.Record;
/**
 * An operator that processes the records of a single input.
 *
 * @param <T> Type of the input data.
 */
public interface OneInputOperator<T> extends Operator {

  /**
   * Processes one input record.
   *
   * @param record The record to process.
   * @throws Exception if processing fails.
   */
  void processElement(Record<T> record) throws Exception;

  /** @return {@code OperatorType.ONE_INPUT}, unless overridden. */
  default OperatorType getOpType() {
    return OperatorType.ONE_INPUT;
  }
}
@@ -0,0 +1,17 @@
package org.ray.streaming.operator;
import java.io.Serializable;
import java.util.List;
import org.ray.streaming.api.collector.Collector;
import org.ray.streaming.api.context.RuntimeContext;
/**
 * Base contract of all operators: lifecycle callbacks plus the operator type.
 */
public interface Operator extends Serializable {

  /**
   * Opens the operator.
   *
   * @param collectors The collectors that receive the output of the operator.
   * @param runtimeContext The runtime information of the task.
   */
  void open(List<Collector> collectors, RuntimeContext runtimeContext);

  /** Lifecycle callback named {@code finish}. */
  void finish();

  /** Lifecycle callback named {@code close}. */
  void close();

  /** @return The type of this operator. */
  OperatorType getOpType();
}
@@ -0,0 +1,8 @@
package org.ray.streaming.operator;
/**
 * The kinds of operators, distinguished by the number of inputs they consume.
 */
public enum OperatorType {

  /** A source operator. */
  SOURCE,

  /** An operator with one input. */
  ONE_INPUT,

  /** An operator with two inputs. */
  TWO_INPUT,
}
@@ -0,0 +1,47 @@
package org.ray.streaming.operator;
import java.util.List;
import org.ray.streaming.api.collector.Collector;
import org.ray.streaming.api.context.RuntimeContext;
import org.ray.streaming.api.function.Function;
import org.ray.streaming.message.KeyRecord;
import org.ray.streaming.message.Record;
public abstract class StreamOperator<F extends Function> implements Operator {
protected F function;
protected List<Collector> collectorList;
protected RuntimeContext runtimeContext;
public StreamOperator(F function) {
this.function = function;
}
public void open(List<Collector> collectorList, RuntimeContext runtimeContext) {
this.collectorList = collectorList;
this.runtimeContext = runtimeContext;
}
public void finish() {
}
public void close() {
}
protected void collect(Record record) {
for (Collector collector : this.collectorList) {
collector.collect(record);
}
}
protected void collect(KeyRecord keyRecord) {
for (Collector collector : this.collectorList) {
collector.collect(keyRecord);
}
}
}
@@ -0,0 +1,13 @@
package org.ray.streaming.operator;
import org.ray.streaming.message.Record;
/**
 * An operator that processes records of two inputs.
 *
 * @param <T> Type of the data of the first input.
 * @param <O> Type of the data of the second input.
 */
public interface TwoInputOperator<T, O> extends Operator {

  /**
   * Processes a pair of records.
   *
   * @param record1 The record of the first input.
   * @param record2 The record of the second input.
   */
  void processElement(Record<T> record1, Record<O> record2);

  /** @return {@code OperatorType.TWO_INPUT}, unless overridden. */
  default OperatorType getOpType() {
    return OperatorType.TWO_INPUT;
  }
}
@@ -0,0 +1,31 @@
package org.ray.streaming.operator.impl;
import java.util.List;
import org.ray.streaming.api.collector.CollectionCollector;
import org.ray.streaming.api.collector.Collector;
import org.ray.streaming.api.context.RuntimeContext;
import org.ray.streaming.api.function.impl.FlatMapFunction;
import org.ray.streaming.message.Record;
import org.ray.streaming.operator.OneInputOperator;
import org.ray.streaming.operator.StreamOperator;
/**
 * Operator that applies a flat-map function to every input record. The function emits its
 * output through a {@code CollectionCollector} built over the collectors of this operator.
 *
 * @param <T> Type of the input data.
 * @param <R> Type of the output data.
 */
public class FlatMapOperator<T, R> extends StreamOperator<FlatMapFunction<T, R>> implements
    OneInputOperator<T> {

  /** Collector handed to the function; created in {@code open}. */
  private CollectionCollector collectionCollector;

  public FlatMapOperator(FlatMapFunction<T, R> flatMapFunction) {
    super(flatMapFunction);
  }

  @Override
  public void open(List<Collector> collectorList, RuntimeContext runtimeContext) {
    super.open(collectorList, runtimeContext);
    this.collectionCollector = new CollectionCollector(collectorList);
  }

  @Override
  public void processElement(Record<T> record) throws Exception {
    final Collector<R> output = (Collector<R>) this.collectionCollector;
    this.function.flatMap(record.getValue(), output);
  }
}
@@ -0,0 +1,22 @@
package org.ray.streaming.operator.impl;
import org.ray.streaming.api.function.impl.KeyFunction;
import org.ray.streaming.message.KeyRecord;
import org.ray.streaming.message.Record;
import org.ray.streaming.operator.OneInputOperator;
import org.ray.streaming.operator.StreamOperator;
/**
 * Operator that computes the key of every input record and emits the value as a
 * {@code KeyRecord}.
 *
 * @param <T> Type of the input data.
 * @param <K> Type of the key.
 */
public class KeyByOperator<T, K> extends StreamOperator<KeyFunction<T, K>> implements
    OneInputOperator<T> {

  public KeyByOperator(KeyFunction<T, K> keyFunction) {
    super(keyFunction);
  }

  @Override
  public void processElement(Record<T> record) throws Exception {
    final K key = this.function.keyBy(record.getValue());
    final KeyRecord<K, T> keyed = new KeyRecord<>(key, record.getValue());
    collect(keyed);
  }
}
@@ -0,0 +1,20 @@
package org.ray.streaming.operator.impl;
import org.ray.streaming.api.function.impl.MapFunction;
import org.ray.streaming.message.Record;
import org.ray.streaming.operator.OneInputOperator;
import org.ray.streaming.operator.StreamOperator;
/**
 * Operator that applies a map function to every input record and emits the result as a new
 * record.
 *
 * @param <T> Type of the input data.
 * @param <R> Type of the output data.
 */
public class MapOperator<T, R> extends StreamOperator<MapFunction<T, R>> implements
    OneInputOperator<T> {

  public MapOperator(MapFunction<T, R> mapFunction) {
    super(mapFunction);
  }

  @Override
  public void processElement(Record<T> record) throws Exception {
    final R mapped = this.function.map(record.getValue());
    this.collect(new Record<R>(mapped));
  }
}
@@ -0,0 +1,44 @@
package org.ray.streaming.operator.impl;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.ray.streaming.api.collector.Collector;
import org.ray.streaming.api.context.RuntimeContext;
import org.ray.streaming.api.function.impl.ReduceFunction;
import org.ray.streaming.message.KeyRecord;
import org.ray.streaming.message.Record;
import org.ray.streaming.operator.OneInputOperator;
import org.ray.streaming.operator.StreamOperator;
/**
 * Operator that keeps one reduced value per key and emits the updated value for every input
 * record. Input records are expected to be {@code KeyRecord}s.
 *
 * @param <K> Type of the key.
 * @param <T> Type of the data.
 */
public class ReduceOperator<K, T> extends StreamOperator<ReduceFunction<T>> implements
    OneInputOperator<T> {

  /** The current reduced value of every key seen so far; created in {@code open}. */
  private Map<K, T> reduceState;

  public ReduceOperator(ReduceFunction<T> reduceFunction) {
    super(reduceFunction);
  }

  @Override
  public void open(List<Collector> collectorList, RuntimeContext runtimeContext) {
    super.open(collectorList, runtimeContext);
    this.reduceState = new HashMap<>();
  }

  /**
   * For the first record of a key, stores its value and forwards the record unchanged. For
   * later records, reduces the stored value with the new one, stores the result and emits it
   * as a plain {@code Record}.
   */
  @Override
  public void processElement(Record<T> record) throws Exception {
    final KeyRecord<K, T> keyRecord = (KeyRecord<K, T>) record;
    final K key = keyRecord.getKey();
    final T value = keyRecord.getValue();

    if (!reduceState.containsKey(key)) {
      reduceState.put(key, value);
      collect(record);
      return;
    }

    final T reduced = this.function.reduce(reduceState.get(key), value);
    reduceState.put(key, reduced);
    collect(new Record(reduced));
  }
}
@@ -0,0 +1,20 @@
package org.ray.streaming.operator.impl;
import org.ray.streaming.api.function.impl.SinkFunction;
import org.ray.streaming.message.Record;
import org.ray.streaming.operator.OneInputOperator;
import org.ray.streaming.operator.StreamOperator;
/**
 * Operator that hands the value of every input record to a sink function.
 *
 * @param <T> Type of the sink data.
 */
public class SinkOperator<T> extends StreamOperator<SinkFunction<T>> implements
    OneInputOperator<T> {

  public SinkOperator(SinkFunction<T> sinkFunction) {
    super(sinkFunction);
  }

  @Override
  public void processElement(Record<T> record) throws Exception {
    final T value = record.getValue();
    this.function.sink(value);
  }
}
@@ -0,0 +1,55 @@
package org.ray.streaming.operator.impl;
import java.util.List;
import org.ray.streaming.api.collector.Collector;
import org.ray.streaming.api.context.RuntimeContext;
import org.ray.streaming.api.function.impl.SourceFunction;
import org.ray.streaming.api.function.impl.SourceFunction.SourceContext;
import org.ray.streaming.message.Record;
import org.ray.streaming.operator.OperatorType;
import org.ray.streaming.operator.StreamOperator;
/**
 * Operator that drives a source function. Elements emitted by the function are wrapped in
 * records and passed to every collector of the operator.
 *
 * @param <T> Type of the data output by the source.
 */
public class SourceOperator<T> extends StreamOperator<SourceFunction<T>> {

  /** Context handed to the source function; created in {@code open}. */
  private SourceContextImpl sourceContext;

  public SourceOperator(SourceFunction<T> function) {
    super(function);
  }

  /**
   * Creates the source context and initializes the function with the parallelism and the task
   * index of the runtime context.
   */
  @Override
  public void open(List<Collector> collectorList, RuntimeContext runtimeContext) {
    super.open(collectorList, runtimeContext);
    this.sourceContext = new SourceContextImpl(collectorList);
    final int parallelism = runtimeContext.getParallelism();
    final int taskIndex = runtimeContext.getTaskIndex();
    this.function.init(parallelism, taskIndex);
  }

  /**
   * Runs the source function.
   *
   * @throws RuntimeException wrapping any exception thrown by the function.
   */
  public void run() {
    try {
      this.function.run(this.sourceContext);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public OperatorType getOpType() {
    return OperatorType.SOURCE;
  }

  /**
   * Source context that wraps each element in a new record per collector.
   */
  class SourceContextImpl implements SourceContext<T> {

    private List<Collector> collectors;

    public SourceContextImpl(List<Collector> collectors) {
      this.collectors = collectors;
    }

    @Override
    public void collect(T t) throws Exception {
      for (Collector downstream : collectors) {
        Record wrapped = new Record(t);
        downstream.collect(wrapped);
      }
    }
  }
}
@@ -0,0 +1,58 @@
package org.ray.streaming.plan;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * The logical execution plan: a list of vertices and a list of edges.
 */
public class Plan implements Serializable {

  private static final Logger LOGGER = LoggerFactory.getLogger(Plan.class);

  private List<PlanVertex> planVertexList;
  private List<PlanEdge> planEdgeList;

  /** Creates an empty plan. */
  public Plan() {
    this.planVertexList = new ArrayList<>();
    this.planEdgeList = new ArrayList<>();
  }

  /** @param vertex The vertex to append. */
  public void addVertex(PlanVertex vertex) {
    this.planVertexList.add(vertex);
  }

  /** @param planEdge The edge to append. */
  public void addEdge(PlanEdge planEdge) {
    this.planEdgeList.add(planEdge);
  }

  /** @return The internal list of vertices (not a copy). */
  public List<PlanVertex> getPlanVertexList() {
    return this.planVertexList;
  }

  /** @return The internal list of edges (not a copy). */
  public List<PlanEdge> getPlanEdgeList() {
    return this.planEdgeList;
  }

  /** @return Currently always the empty string. */
  public String getGraphVizPlan() {
    return "";
  }

  /**
   * Logs all vertices and then all edges at INFO level; does nothing when INFO is disabled.
   */
  public void printPlan() {
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Printing logic plan:");
      for (PlanVertex vertex : this.planVertexList) {
        LOGGER.info(vertex.toString());
      }
      for (PlanEdge edge : this.planEdgeList) {
        LOGGER.info(edge.toString());
      }
    }
  }
}
@@ -0,0 +1,62 @@
package org.ray.streaming.plan;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.ray.streaming.api.stream.DataStream;
import org.ray.streaming.api.stream.Stream;
import org.ray.streaming.api.stream.StreamSink;
import org.ray.streaming.api.stream.StreamSource;
import org.ray.streaming.operator.StreamOperator;
/**
 * Builds a {@link Plan} by walking upstream from each sink through
 * {@code getInputStream()}, adding one vertex per stream and one edge per
 * (input stream, stream) pair.
 */
public class PlanBuilder {

  private Plan plan;
  private AtomicInteger edgeIdGenerator;
  private List<StreamSink> streamSinkList;

  /**
   * Ids of the streams that have already been processed. A stream that is reachable from
   * more than one sink must contribute its vertex and its incoming edge only once.
   */
  private Set<Integer> visitedVertexIds;

  /**
   * @param streamSinkList The sinks from which the plan is built.
   */
  public PlanBuilder(List<StreamSink> streamSinkList) {
    this.plan = new Plan();
    this.streamSinkList = streamSinkList;
    this.edgeIdGenerator = new AtomicInteger(0);
    this.visitedVertexIds = new HashSet<>();
  }

  /**
   * Processes every sink and returns the resulting plan.
   *
   * @return the plan owned by this builder.
   */
  public Plan buildPlan() {
    for (StreamSink streamSink : streamSinkList) {
      processStream(streamSink);
    }
    return this.plan;
  }

  /**
   * Adds the vertex for {@code stream} and, unless it is a source, the edge from its input
   * stream, then recurses into the input stream. The edge is added before recursing and the
   * vertex after, so upstream vertices precede downstream ones in the vertex list.
   *
   * @throws UnsupportedOperationException if the stream is not a sink, source or data stream.
   */
  private void processStream(Stream stream) {
    int vertexId = stream.getId();
    // Set.add returns false when the id is already present: this stream was handled earlier.
    if (!visitedVertexIds.add(vertexId)) {
      return;
    }

    VertexType vertexType = resolveVertexType(stream);
    int parallelism = stream.getParallelism();
    StreamOperator streamOperator = stream.getOperator();
    PlanVertex planVertex = new PlanVertex(vertexId, parallelism, vertexType, streamOperator);

    if (vertexType != VertexType.SOURCE) {
      Stream parentStream = stream.getInputStream();
      int inputVertexId = parentStream.getId();
      PlanEdge planEdge = new PlanEdge(inputVertexId, vertexId, parentStream.getPartition());
      this.plan.addEdge(planEdge);
      processStream(parentStream);
    }
    this.plan.addVertex(planVertex);
  }

  /**
   * Maps a stream to the vertex type it produces. StreamSource is tested before DataStream
   * so that a stream matching both is classified as a source.
   */
  private static VertexType resolveVertexType(Stream stream) {
    if (stream instanceof StreamSink) {
      return VertexType.SINK;
    }
    if (stream instanceof StreamSource) {
      return VertexType.SOURCE;
    }
    if (stream instanceof DataStream) {
      return VertexType.PROCESS;
    }
    // Fail here instead of adding a null vertex to the plan.
    throw new UnsupportedOperationException(
        "Unsupported stream type: " + stream.getClass().getName());
  }

  private int getEdgeId() {
    return this.edgeIdGenerator.incrementAndGet();
  }
}
@@ -0,0 +1,50 @@
package org.ray.streaming.plan;
import java.io.Serializable;
import org.ray.streaming.api.partition.Partition;
/**
 * PlanEdge describes the connection between an upstream and a downstream vertex, together
 * with the partition rule attached to that connection.
 */
public class PlanEdge implements Serializable {

  private int srcVertexId;
  private int targetVertexId;
  private Partition partition;

  /**
   * @param srcVertexId Id of the upstream vertex.
   * @param targetVertexId Id of the downstream vertex.
   * @param partition Partition rule attached to this edge.
   */
  public PlanEdge(int srcVertexId, int targetVertexId, Partition partition) {
    this.srcVertexId = srcVertexId;
    this.targetVertexId = targetVertexId;
    this.partition = partition;
  }

  public int getSrcVertexId() {
    return srcVertexId;
  }

  public void setSrcVertexId(int srcVertexId) {
    this.srcVertexId = srcVertexId;
  }

  public int getTargetVertexId() {
    return targetVertexId;
  }

  public void setTargetVertexId(int targetVertexId) {
    this.targetVertexId = targetVertexId;
  }

  public Partition getPartition() {
    return partition;
  }

  public void setPartition(Partition partition) {
    this.partition = partition;
  }

  /**
   * Renders the edge as {@code Edge(from:<src>-<target>-<partition class>)}.
   * A missing partition is rendered as {@code null} instead of throwing, so that an edge
   * can always be logged.
   */
  @Override
  public String toString() {
    // Class.toString() yields the same "class x.y.Z" text that string concatenation produces.
    String partitionType = partition == null ? "null" : partition.getClass().toString();
    return "Edge(" + "from:" + srcVertexId + "-" + targetVertexId + "-" + partitionType + ")";
  }
}
@@ -0,0 +1,49 @@
package org.ray.streaming.plan;
import java.io.Serializable;
import org.ray.streaming.operator.StreamOperator;
/**
 * PlanVertex is a node of the plan: it carries an id, a parallelism, a {@link VertexType}
 * and the operator that holds the logic to execute.
 */
public class PlanVertex implements Serializable {

  private int vertexId;
  private int parallelism;
  private VertexType vertexType;
  private StreamOperator streamOperator;

  /**
   * @param vertexId Id of this vertex.
   * @param parallelism Parallelism of this vertex.
   * @param vertexType Role of this vertex.
   * @param streamOperator Operator attached to this vertex.
   */
  public PlanVertex(int vertexId, int parallelism, VertexType vertexType,
      StreamOperator streamOperator) {
    this.vertexId = vertexId;
    this.parallelism = parallelism;
    this.vertexType = vertexType;
    this.streamOperator = streamOperator;
  }

  /** @return the id of this vertex. */
  public int getVertexId() {
    return this.vertexId;
  }

  /** @return the parallelism of this vertex. */
  public int getParallelism() {
    return this.parallelism;
  }

  /** @return the operator attached to this vertex. */
  public StreamOperator getStreamOperator() {
    return this.streamOperator;
  }

  /** @return the role of this vertex. */
  public VertexType getVertexType() {
    return this.vertexType;
  }

  /**
   * @return a single-line description listing all four fields.
   */
  @Override
  public String toString() {
    StringBuilder text = new StringBuilder("PlanVertex{");
    text.append("vertexId=").append(vertexId);
    text.append(", parallelism=").append(parallelism);
    text.append(", vertexType=").append(vertexType);
    text.append(", streamOperator=").append(streamOperator);
    text.append('}');
    return text.toString();
  }
}
@@ -0,0 +1,11 @@
package org.ray.streaming.plan;
/**
 * Different roles for a node.
 */
public enum VertexType {
// NOTE(review): PlanBuilder never assigns this type; presumably reserved for a
// coordinating node outside the logical plan - confirm.
MASTER,
// Assigned by PlanBuilder to vertices created from a StreamSource.
SOURCE,
// Assigned by PlanBuilder to vertices created from a DataStream that is not a StreamSource.
PROCESS,
// Assigned by PlanBuilder to vertices created from a StreamSink.
SINK,
}
@@ -0,0 +1,19 @@
package org.ray.streaming.schedule;
import java.util.Map;
import org.ray.streaming.plan.Plan;
/**
 * Interface of the job scheduler.
 */
public interface JobScheduler {
/**
* Assign logical plan to physical execution graph, and schedule job to run.
*
* @param plan The logical plan.
* @param conf The job configuration as key/value pairs.
*     NOTE(review): presumably keyed by the constants in org.ray.streaming.util.Config -
*     confirm against the implementations.
*/
void schedule(Plan plan, Map<String, Object> conf);
}
@@ -0,0 +1,44 @@
package org.ray.streaming.util;
/**
 * Names of streaming configuration keys, together with some of their default values.
 * All members are compile-time or class-initialization constants.
 */
public class Config {
/**
* Maximum number of batches to run in a streaming job.
*/
public static final String STREAMING_BATCH_MAX_COUNT = "streaming.batch.max.count";
/**
* Batch frequency in milliseconds.
*/
public static final String STREAMING_BATCH_FREQUENCY = "streaming.batch.frequency";
// Default batch frequency; 1000 in the milliseconds used by STREAMING_BATCH_FREQUENCY.
public static final long STREAMING_BATCH_FREQUENCY_DEFAULT = 1000;
public static final String STREAMING_JOB_NAME = "streaming.job.name";
public static final String STREAMING_OP_NAME = "streaming.op_name";
public static final String TASK_JOB_ID = "streaming.task_job_id";
public static final String STREAMING_WORKER_NAME = "streaming.worker_name";
// Channel settings. MEMORY_CHANNEL and NATIVE_CHANNEL are presumably the values accepted
// for the CHANNEL_TYPE key - TODO confirm against the code that reads it.
public static final String CHANNEL_TYPE = "channel_type";
public static final String MEMORY_CHANNEL = "memory_channel";
public static final String NATIVE_CHANNEL = "native_channel";
// The default channel type is the native channel.
public static final String DEFAULT_CHANNEL_TYPE = NATIVE_CHANNEL;
public static final String CHANNEL_SIZE = "channel_size";
// 10^8 rendered as the decimal string "100000000". The unit is not stated here - TODO confirm.
public static final String CHANNEL_SIZE_DEFAULT = String.valueOf((long)Math.pow(10, 8));
public static final String IS_RECREATE = "streaming.is_recreate";
// Return from DataReader.getBundle if only empty messages were read in this interval.
public static final String TIMER_INTERVAL_MS = "timer_interval_ms";
public static final String READ_TIMEOUT_MS = "read_timeout_ms";
// Default read timeout, kept as a string; "10" in the milliseconds the key name indicates.
public static final String DEFAULT_READ_TIMEOUT_MS = "10";
public static final String STREAMING_RING_BUFFER_CAPACITY = "streaming.ring_buffer_capacity";
// Write an empty message if there is no data to be written in this
// interval.
public static final String STREAMING_EMPTY_MESSAGE_INTERVAL = "streaming.empty_message_interval";
// Key under which the operator type is stored.
public static final String OPERATOR_TYPE = "operator_type";
}
@@ -0,0 +1,6 @@
log4j.rootLogger=INFO, stdout
# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
@@ -0,0 +1,5 @@
ray {
run-mode = SINGLE_PROCESS
resources = "CPU:4"
redis.address = ""
}
@@ -0,0 +1,87 @@
package org.ray.streaming.plan;
import com.google.common.collect.Lists;
import org.ray.streaming.api.context.StreamingContext;
import org.ray.streaming.api.partition.impl.KeyPartition;
import org.ray.streaming.api.partition.impl.RoundRobinPartition;
import org.ray.streaming.api.stream.DataStream;
import org.ray.streaming.api.stream.StreamSink;
import org.ray.streaming.api.stream.StreamSource;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.Test;
/**
 * Checks the vertices and edges that {@link PlanBuilder} produces for two small jobs:
 * source to sink, and source to keyBy to sink.
 */
public class PlanBuilderTest {

  private static final Logger LOGGER = LoggerFactory.getLogger(PlanBuilderTest.class);

  /** A source connected directly to a sink yields two vertices and one round-robin edge. */
  @Test
  public void testDataSync() {
    Plan plan = buildDataSyncPlan();
    List<PlanVertex> vertices = plan.getPlanVertexList();
    List<PlanEdge> edges = plan.getPlanEdgeList();

    Assert.assertEquals(vertices.size(), 2);
    Assert.assertEquals(edges.size(), 1);

    PlanEdge onlyEdge = edges.get(0);
    Assert.assertEquals(onlyEdge.getPartition().getClass(), RoundRobinPartition.class);

    // Upstream vertices come first in the vertex list.
    PlanVertex sinkVertex = vertices.get(1);
    PlanVertex sourceVertex = vertices.get(0);
    Assert.assertEquals(sinkVertex.getVertexType(), VertexType.SINK);
    Assert.assertEquals(sourceVertex.getVertexType(), VertexType.SOURCE);
  }

  /**
   * Builds the plan of a job that sends three strings from a source straight to a
   * logging sink.
   */
  public Plan buildDataSyncPlan() {
    StreamingContext context = StreamingContext.buildContext();
    DataStream<String> letters = StreamSource.buildSource(context,
        Lists.newArrayList("a", "b", "c"));
    StreamSink streamSink = letters.sink(x -> LOGGER.info(x));
    return new PlanBuilder(Lists.newArrayList(streamSink)).buildPlan();
  }

  /** Source, keyBy and sink yield three vertices and two edges. */
  @Test
  public void testKeyByPlan() {
    Plan plan = buildKeyByPlan();
    List<PlanVertex> vertices = plan.getPlanVertexList();
    List<PlanEdge> edges = plan.getPlanEdgeList();

    Assert.assertEquals(vertices.size(), 3);
    Assert.assertEquals(edges.size(), 2);

    PlanVertex sourceVertex = vertices.get(0);
    PlanVertex keyByVertex = vertices.get(1);
    PlanVertex sinkVertex = vertices.get(2);
    Assert.assertEquals(sourceVertex.getVertexType(), VertexType.SOURCE);
    Assert.assertEquals(keyByVertex.getVertexType(), VertexType.PROCESS);
    Assert.assertEquals(sinkVertex.getVertexType(), VertexType.SINK);

    // Edges are listed from the sink backwards: keyBy -> sink first, then source -> keyBy.
    PlanEdge keyByToSink = edges.get(0);
    PlanEdge sourceToKeyBy = edges.get(1);
    Assert.assertEquals(keyByToSink.getPartition().getClass(), KeyPartition.class);
    Assert.assertEquals(sourceToKeyBy.getPartition().getClass(), RoundRobinPartition.class);
  }

  /**
   * Builds the plan of a job that keys four strings by their own value and sends them to a
   * logging sink.
   */
  public Plan buildKeyByPlan() {
    StreamingContext context = StreamingContext.buildContext();
    DataStream<String> numbers = StreamSource.buildSource(context,
        Lists.newArrayList("1", "2", "3", "4"));
    StreamSink streamSink = numbers.keyBy(x -> x)
        .sink(x -> LOGGER.info(x));
    return new PlanBuilder(Lists.newArrayList(streamSink)).buildPlan();
  }
}
@@ -0,0 +1,6 @@
log4j.rootLogger=INFO, stdout
# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
@@ -0,0 +1,3 @@
ray {
run-mode = SINGLE_PROCESS
}