[Java] Format ray java code (#13056)

This commit is contained in:
chaokunyang
2020-12-29 10:36:16 +08:00
committed by GitHub
parent cc1c2c3dc9
commit d1dd3410c8
422 changed files with 4384 additions and 5035 deletions
@@ -12,9 +12,7 @@ import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Job client: to submit job from api to runtime.
*/
/** Job client: to submit job from api to runtime. */
public class JobClientImpl implements JobClient {
public static final Logger LOG = LoggerFactory.getLogger(JobClientImpl.class);
@@ -23,8 +21,11 @@ public class JobClientImpl implements JobClient {
@Override
public void submit(JobGraph jobGraph, Map<String, String> jobConfig) {
LOG.info("Submitting job [{}] with job graph [{}] and job config [{}].",
jobGraph.getJobName(), jobGraph, jobConfig);
LOG.info(
"Submitting job [{}] with job graph [{}] and job config [{}].",
jobGraph.getJobName(),
jobGraph,
jobConfig);
Map<String, Double> resources = new HashMap<>();
// set job name and id at start
@@ -34,14 +35,12 @@ public class JobClientImpl implements JobClient {
jobGraph.getJobConfig().putAll(jobConfig);
// create job master actor
this.jobMasterActor = Ray.actor(JobMaster::new, jobConfig)
.setResources(resources)
.setMaxRestarts(-1)
.remote();
this.jobMasterActor =
Ray.actor(JobMaster::new, jobConfig).setResources(resources).setMaxRestarts(-1).remote();
try {
ObjectRef<Boolean> submitResult = jobMasterActor.task(JobMaster::submitJob,
jobMasterActor, jobGraph).remote();
ObjectRef<Boolean> submitResult =
jobMasterActor.task(JobMaster::submitJob, jobMasterActor, jobGraph).remote();
if (submitResult.get()) {
LOG.info("Finish submitting job: {}.", jobGraph.getJobName());
@@ -2,9 +2,5 @@ package io.ray.streaming.runtime.config;
import org.aeonbits.owner.Accessible;
/**
* Basic config interface.
*/
public interface Config extends org.aeonbits.owner.Config, Accessible {
}
/** Basic config interface. */
public interface Config extends org.aeonbits.owner.Config, Accessible {}
@@ -3,9 +3,7 @@ package io.ray.streaming.runtime.config;
import java.io.Serializable;
import java.util.Map;
/**
* Streaming config including general, master and worker part.
*/
/** Streaming config including general, master and worker part. */
public class StreamingConfig implements Serializable {
public StreamingMasterConfig masterConfig;
@@ -21,5 +19,4 @@ public class StreamingConfig implements Serializable {
wholeConfigMap.putAll(workerConfigTemplate.configMap);
return wholeConfigMap;
}
}
@@ -15,9 +15,7 @@ import org.aeonbits.owner.ConfigFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Streaming general config. May be used by both JobMaster and JobWorker.
*/
/** Streaming general config. May be used by both JobMaster and JobWorker. */
public class StreamingGlobalConfig implements Serializable {
private static final Logger LOG = LoggerFactory.getLogger(StreamingGlobalConfig.class);
@@ -65,8 +63,7 @@ public class StreamingGlobalConfig implements Serializable {
break;
}
}
Preconditions.checkArgument(configInterface != null,
"Can not get config interface.");
Preconditions.checkArgument(configInterface != null, "Can not get config interface.");
Method[] methods = configInterface.getMethods();
for (Method method : methods) {
@@ -78,8 +75,10 @@ public class StreamingGlobalConfig implements Serializable {
try {
value = method.invoke(config);
} catch (Exception e) {
LOG.warn("Can not get value by method invoking for config key: {}. "
+ "So use default value instead.", ownerKeyAnnotationValue);
LOG.warn(
"Can not get value by method invoking for config key: {}. "
+ "So use default value instead.",
ownerKeyAnnotationValue);
String defaultValue = method.getAnnotation(DefaultValue.class).value();
value = defaultValue;
}
@@ -7,9 +7,7 @@ import org.aeonbits.owner.ConfigFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Streaming job master config.
*/
/** Streaming job master config. */
public class StreamingMasterConfig extends StreamingGlobalConfig {
private static final Logger LOG = LoggerFactory.getLogger(StreamingMasterConfig.class);
@@ -7,9 +7,7 @@ import org.aeonbits.owner.ConfigFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Streaming job worker specified config.
*/
/** Streaming job worker specified config. */
public class StreamingWorkerConfig extends StreamingGlobalConfig {
private static final Logger LOG = LoggerFactory.getLogger(StreamingWorkerConfig.class);
@@ -33,5 +31,4 @@ public class StreamingWorkerConfig extends StreamingGlobalConfig {
}
return result;
}
}
@@ -3,9 +3,7 @@ package io.ray.streaming.runtime.config.global;
import io.ray.streaming.runtime.config.Config;
import org.aeonbits.owner.Mutable;
/**
* Configurations for checkpointing.
*/
/** Configurations for checkpointing. */
public interface CheckpointConfig extends Config, Mutable {
String CP_INTERVAL_SECS = "streaming.checkpoint.interval.secs";
@@ -2,9 +2,7 @@ package io.ray.streaming.runtime.config.global;
import io.ray.streaming.runtime.config.Config;
/**
* Job common config.
*/
/** Job common config. */
public interface CommonConfig extends Config {
String JOB_ID = "streaming.job.id";
@@ -13,7 +11,7 @@ public interface CommonConfig extends Config {
/**
* Ray streaming job id. Non-custom.
*
* @return Job id with string type.
* <p>Returns Job id with string type.
*/
@DefaultValue(value = "default-job-id")
@Key(value = JOB_ID)
@@ -22,7 +20,7 @@ public interface CommonConfig extends Config {
/**
* Ray streaming job name. Non-custom.
*
* @return Job name with string type.
* <p>Returns Job name with string type.
*/
@DefaultValue(value = "default-job-name")
@Key(value = JOB_NAME)
@@ -3,42 +3,30 @@ package io.ray.streaming.runtime.config.global;
import io.ray.streaming.runtime.config.Config;
import io.ray.streaming.runtime.config.types.TransferChannelType;
/**
* Job data transfer config.
*/
/** Job data transfer config. */
public interface TransferConfig extends Config {
/**
* Data transfer channel type, support memory queue and native queue.
*/
/** Data transfer channel type, support memory queue and native queue. */
@DefaultValue(value = "NATIVE_CHANNEL")
@Key(value = io.ray.streaming.util.Config.CHANNEL_TYPE)
TransferChannelType channelType();
/**
* Queue size.
*/
/** Queue size. */
@DefaultValue(value = "100000000")
@Key(value = io.ray.streaming.util.Config.CHANNEL_SIZE)
long channelSize();
/**
* Return from DataReader.getBundle if only empty message read in this interval.
*/
/** Return from DataReader.getBundle if only empty message read in this interval. */
@DefaultValue(value = "-1")
@Key(value = io.ray.streaming.util.Config.TIMER_INTERVAL_MS)
long readerTimerIntervalMs();
/**
* Ring capacity.
*/
/** Ring capacity. */
@DefaultValue(value = "-1")
@Key(value = io.ray.streaming.util.Config.STREAMING_RING_BUFFER_CAPACITY)
int ringBufferCapacity();
/**
* Write an empty message if there is no data to be written in this interval.
*/
/** Write an empty message if there is no data to be written in this interval. */
@DefaultValue(value = "-1")
@Key(value = io.ray.streaming.util.Config.STREAMING_EMPTY_MESSAGE_INTERVAL)
int emptyMsgInterval();
@@ -2,81 +2,54 @@ package io.ray.streaming.runtime.config.master;
import io.ray.streaming.runtime.config.Config;
/**
* Job resource management config.
*/
/** Job resource management config. */
public interface ResourceConfig extends Config {
/**
* Number of actors per container.
*/
/** Number of actors per container. */
String MAX_ACTOR_NUM_PER_CONTAINER = "streaming.container.per.max.actor";
/**
* The interval between detecting ray cluster nodes.
*/
/** The interval between detecting ray cluster nodes. */
String CONTAINER_RESOURCE_CHECk_INTERVAL_SECOND = "streaming.resource.check.interval.second";
/**
* CPU used per task.
*/
/** CPU used per task. */
String TASK_RESOURCE_CPU = "streaming.task.resource.cpu";
/**
* Memory use by each task
*/
/** Memory use by each task */
String TASK_RESOURCE_MEM = "streaming.task.resource.mem";
/**
* Whether to enable CPU limit in resource control.
*/
/** Whether to enable CPU limit in resource control. */
String TASK_RESOURCE_CPU_LIMIT_ENABLE = "streaming.task.resource.cpu.limitation.enable";
/**
* Whether to enable memory limit in resource control.
*/
/** Whether to enable memory limit in resource control. */
String TASK_RESOURCE_MEM_LIMIT_ENABLE = "streaming.task.resource.mem.limitation.enable";
/**
* Number of cpu per task.
*/
/** Number of cpu per task. */
@DefaultValue(value = "1.0")
@Key(value = TASK_RESOURCE_CPU)
double taskCpuResource();
/**
* Memory size used by each task.
*/
/** Memory size used by each task. */
@DefaultValue(value = "2.0")
@Key(value = TASK_RESOURCE_MEM)
double taskMemResource();
/**
* Whether to enable CPU limit in resource control.
*/
/** Whether to enable CPU limit in resource control. */
@DefaultValue(value = "false")
@Key(value = TASK_RESOURCE_CPU_LIMIT_ENABLE)
boolean isTaskCpuResourceLimit();
/**
* Whether to enable memory limit in resource control.
*/
/** Whether to enable memory limit in resource control. */
@DefaultValue(value = "false")
@Key(value = TASK_RESOURCE_MEM_LIMIT_ENABLE)
boolean isTaskMemResourceLimit();
/**
* Number of actors per container.
*/
/** Number of actors per container. */
@DefaultValue(value = "500")
@Key(MAX_ACTOR_NUM_PER_CONTAINER)
int actorNumPerContainer();
/**
* The interval between detecting ray cluster nodes.
*/
/** The interval between detecting ray cluster nodes. */
@DefaultValue(value = "1")
@Key(value = CONTAINER_RESOURCE_CHECk_INTERVAL_SECOND)
long resourceCheckIntervalSecond();
}
@@ -2,9 +2,7 @@ package io.ray.streaming.runtime.config.master;
import io.ray.streaming.runtime.config.Config;
/**
* Configuration for job scheduler.
*/
/** Configuration for job scheduler. */
public interface SchedulerConfig extends Config {
String WORKER_INITIATION_WAIT_TIMEOUT_MS = "streaming.scheduler.worker.initiation.timeout.ms";
@@ -13,7 +11,7 @@ public interface SchedulerConfig extends Config {
/**
* The timeout ms of worker initiation. Default is: 10000ms(10s).
*
* @return timeout ms
* <p>Returns timeout ms
*/
@Key(WORKER_INITIATION_WAIT_TIMEOUT_MS)
@DefaultValue(value = "10000")
@@ -22,10 +20,9 @@ public interface SchedulerConfig extends Config {
/**
* The timeout ms of worker starting. Default is: 10000ms(10s).
*
* @return timeout ms
* <p>Returns timeout ms
*/
@Key(WORKER_STARTING_WAIT_TIMEOUT_MS)
@DefaultValue(value = "10000")
int workerStartingWaitTimeoutMs();
}
@@ -2,14 +2,10 @@ package io.ray.streaming.runtime.config.types;
public enum ContextBackendType {
/**
* Memory type
*/
/** Memory type */
MEMORY("memory", 0),
/**
* Local File
*/
/** Local File */
LOCAL_FILE("local_file", 1);
private String name;
@@ -2,9 +2,7 @@ package io.ray.streaming.runtime.config.types;
public enum ResourceAssignStrategyType {
/**
* Resource scheduling strategy based on FF(First Fit) algorithm and pipeline.
*/
/** Resource scheduling strategy based on FF(First Fit) algorithm and pipeline. */
PIPELINE_FIRST_STRATEGY("pipeline_first_strategy", 0);
private String name;
@@ -1,18 +1,12 @@
package io.ray.streaming.runtime.config.types;
/**
* Data transfer channel type.
*/
/** Data transfer channel type. */
public enum TransferChannelType {
/**
* Memory queue.
*/
/** Memory queue. */
MEMORY_CHANNEL("memory_channel", 0),
/**
* Native queue.
*/
/** Native queue. */
NATIVE_CHANNEL("native_channel", 1);
private String value;
@@ -3,24 +3,18 @@ package io.ray.streaming.runtime.config.worker;
import io.ray.streaming.runtime.config.Config;
import org.aeonbits.owner.Mutable;
/**
* This worker config is used by JobMaster to define the internal configuration of JobWorker.
*/
/** This worker config is used by JobMaster to define the internal configuration of JobWorker. */
public interface WorkerInternalConfig extends Config, Mutable {
String WORKER_NAME_INTERNAL = io.ray.streaming.util.Config.STREAMING_WORKER_NAME;
String OP_NAME_INTERNAL = io.ray.streaming.util.Config.STREAMING_OP_NAME;
/**
* The name of the worker inside the system.
*/
/** The name of the worker inside the system. */
@DefaultValue(value = "default-worker-name")
@Key(value = WORKER_NAME_INTERNAL)
String workerName();
/**
* Operator name corresponding to worker.
*/
/** Operator name corresponding to worker. */
@DefaultValue(value = "default-worker-op-name")
@Key(value = OP_NAME_INTERNAL)
String workerOperatorName();
@@ -4,23 +4,22 @@ import io.ray.streaming.runtime.master.JobMaster;
import io.ray.streaming.runtime.worker.JobWorker;
/**
* This interface is used for storing context of {@link JobWorker} and {@link JobMaster}.
* The checkpoint returned by user function is also saved using this interface.
* This interface is used for storing context of {@link JobWorker} and {@link JobMaster}. The
* checkpoint returned by user function is also saved using this interface.
*/
public interface ContextBackend {
/**
* check if key exists in state
*
* @return true if exists
* <p>Returns true if exists
*/
boolean exists(final String key) throws Exception;
/**
* get content by key
*
* @param key key
* @return the StateBackend
* @param key key Returns the StateBackend
*/
byte[] get(final String key) throws Exception;
@@ -38,5 +37,4 @@ public interface ContextBackend {
* @param key key
*/
void remove(final String key) throws Exception;
}
@@ -9,8 +9,8 @@ public class ContextBackendFactory {
public static ContextBackend getContextBackend(final StreamingGlobalConfig config) {
ContextBackend contextBackend;
ContextBackendType type = ContextBackendType.valueOf(
config.contextBackendConfig.stateBackendType().toUpperCase());
ContextBackendType type =
ContextBackendType.valueOf(config.contextBackendConfig.stateBackendType().toUpperCase());
switch (type) {
case MEMORY:
@@ -24,4 +24,4 @@ public class ContextBackendFactory {
}
return contextBackend;
}
}
}
@@ -6,21 +6,17 @@ import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
/**
* This data structure contains state information of a task.
*/
/** This data structure contains state information of a task. */
public class OperatorCheckpointInfo implements Serializable {
/**
* key: channel ID, value: offset
*/
/** key: channel ID, value: offset */
public Map<String, OffsetInfo> inputPoints;
public Map<String, OffsetInfo> outputPoints;
/**
* a serializable checkpoint returned by processor
*/
/** a serializable checkpoint returned by processor */
public Serializable processorCheckpoint;
public long checkpointId;
public OperatorCheckpointInfo() {
@@ -5,8 +5,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Achieves an atomic `put` method.
* Known issue: if a crash happens while writing a key for the first time, this code will not work.
* Achieves an atomic `put` method. Known issue: if a crash happens while writing a key for the
* first time, this code will not work.
*/
public class AtomicFsBackend extends LocalFileContextBackend {
@@ -6,16 +6,15 @@ import java.io.File;
import org.apache.commons.io.FileUtils;
/**
* This context backend uses local file system and doesn't support failover in cluster.
* But it supports failover in single node.
* This is a pure file system backend which doesn't support atomic writing, please don't use this
* class, instead, use {@link AtomicFsBackend} which extends this class.
* This context backend uses local file system and doesn't support failover in cluster. But it
* supports failover in single node. This is a pure file system backend which doesn't support atomic
* writing, please don't use this class, instead, use {@link AtomicFsBackend} which extends this
* class.
*/
public class LocalFileContextBackend implements ContextBackend {
private final String rootPath;
public LocalFileContextBackend(ContextBackendConfig config) {
rootPath = config.fileStateRootPath();
}
@@ -8,8 +8,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This context backend uses memory and doesn't support failover.
* Data will be lost after worker died.
* This context backend uses memory and doesn't support failover. Data will be lost after worker
* died.
*/
public class MemoryContextBackend implements ContextBackend {
@@ -36,13 +36,15 @@ public class OutputCollector implements Collector<Record> {
this.writer = writer;
this.outputQueues = outputChannelIds.stream().map(ChannelId::from).toArray(ChannelId[]::new);
this.targetActors = targetActors;
this.targetLanguages = targetActors.stream()
.map(actor -> actor instanceof PyActorHandle ? Language.PYTHON :
Language.JAVA)
.toArray(Language[]::new);
this.targetLanguages =
targetActors.stream()
.map(actor -> actor instanceof PyActorHandle ? Language.PYTHON : Language.JAVA)
.toArray(Language[]::new);
this.partition = partition;
LOGGER.debug("OutputCollector constructed, outputChannelIds:{}, partition:{}.",
outputChannelIds, this.partition);
LOGGER.debug(
"OutputCollector constructed, outputChannelIds:{}, partition:{}.",
outputChannelIds,
this.partition);
}
@Override
@@ -76,5 +78,4 @@ public class OutputCollector implements Collector<Record> {
}
}
}
}
@@ -5,9 +5,7 @@ import io.ray.streaming.runtime.core.resource.ContainerId;
import java.io.Serializable;
import java.util.UUID;
/**
* Streaming system unique identity base class. For example, {@link ContainerId}.
*/
/** Streaming system unique identity base class. For example, {@link ContainerId}. */
public class AbstractId implements Serializable {
private UUID id;
@@ -27,8 +25,6 @@ public class AbstractId implements Serializable {
@Override
public String toString() {
return MoreObjects.toStringHelper(this)
.add("id", id)
.toString();
return MoreObjects.toStringHelper(this).add("id", id).toString();
}
}
@@ -4,29 +4,19 @@ import com.google.common.base.MoreObjects;
import io.ray.streaming.api.partition.Partition;
import java.io.Serializable;
/**
* An edge that connects two execution vertices.
*/
/** An edge that connects two execution vertices. */
public class ExecutionEdge implements Serializable {
/**
* The source(upstream) execution vertex.
*/
/** The source(upstream) execution vertex. */
private final ExecutionVertex sourceExecutionVertex;
/**
* The target(downstream) execution vertex.
*/
/** The target(downstream) execution vertex. */
private final ExecutionVertex targetExecutionVertex;
/**
* The partition of current execution edge's execution job edge.
*/
/** The partition of current execution edge's execution job edge. */
private final Partition partition;
/**
* A unique id for execution edge.
*/
/** A unique id for execution edge. */
private final String executionEdgeIndex;
public ExecutionEdge(
@@ -40,7 +30,8 @@ public class ExecutionEdge implements Serializable {
}
private String generateExecutionEdgeIndex() {
return sourceExecutionVertex.getExecutionVertexId() + ""
return sourceExecutionVertex.getExecutionVertexId()
+ ""
+ targetExecutionVertex.getExecutionVertexId();
}
@@ -17,62 +17,36 @@ import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Physical plan.
*/
/** Physical plan. */
public class ExecutionGraph implements Serializable {
private static final Logger LOG = LoggerFactory.getLogger(ExecutionGraph.class);
/**
* Name of the job.
*/
/** Name of the job. */
private final String jobName;
/**
* Configuration of the job.
*/
/** Configuration of the job. */
private Map<String, String> jobConfig;
/**
* Data map for execution job vertex. key: job vertex id. value: execution job vertex.
*/
/** Data map for execution job vertex. key: job vertex id. value: execution job vertex. */
private Map<Integer, ExecutionJobVertex> executionJobVertexMap;
/**
* Data map for execution vertex.
* key: execution vertex id.
* value: execution vertex.
*/
/** Data map for execution vertex. key: execution vertex id. value: execution vertex. */
private Map<Integer, ExecutionVertex> executionVertexMap;
/**
* Data map for execution vertex.
* key: actor id.
* value: execution vertex.
*/
/** Data map for execution vertex. key: actor id. value: execution vertex. */
private Map<ActorId, ExecutionVertex> actorIdExecutionVertexMap;
/**
* key: channel ID
* value: actors in both sides of this channel
*/
/** key: channel ID. value: actors in both sides of this channel. */
private Map<String, Set<BaseActorHandle>> channelGroupedActors;
/**
* The max parallelism of the whole graph.
*/
/** The max parallelism of the whole graph. */
private int maxParallelism;
/**
* Build time.
*/
/** Build time. */
private long buildTime;
/**
* A monotonic increasing number, used for vertex's id(immutable).
*/
/** A monotonic increasing number, used for vertex's id(immutable). */
private AtomicInteger executionVertexIdGenerator = new AtomicInteger(0);
public ExecutionGraph(String jobName) {
@@ -96,10 +70,9 @@ public class ExecutionGraph implements Serializable {
this.executionJobVertexMap = executionJobVertexMap;
}
/**
* generate relation mappings between actors, execution vertices and channels
* this method must be called after worker actor is set.
* Generate relation mappings between actors, execution vertices and channels. This method must
* be called after worker actor is set.
*/
public void generateActorMappings() {
LOG.info("Setup queue actors relation.");
@@ -107,29 +80,33 @@ public class ExecutionGraph implements Serializable {
channelGroupedActors = new HashMap<>();
actorIdExecutionVertexMap = new HashMap<>();
getAllExecutionVertices().forEach(curVertex -> {
getAllExecutionVertices()
.forEach(
curVertex -> {
// current
actorIdExecutionVertexMap.put(curVertex.getActorId(), curVertex);
// current
actorIdExecutionVertexMap.put(curVertex.getActorId(), curVertex);
// input
List<ExecutionEdge> inputEdges = curVertex.getInputEdges();
inputEdges.forEach(inputEdge -> {
ExecutionVertex inputVertex = inputEdge.getSourceExecutionVertex();
String channelId = curVertex.getChannelIdByPeerVertex(inputVertex);
addActorToChannelGroupedActors(channelGroupedActors, channelId,
inputVertex.getWorkerActor());
});
// input
List<ExecutionEdge> inputEdges = curVertex.getInputEdges();
inputEdges.forEach(
inputEdge -> {
ExecutionVertex inputVertex = inputEdge.getSourceExecutionVertex();
String channelId = curVertex.getChannelIdByPeerVertex(inputVertex);
addActorToChannelGroupedActors(
channelGroupedActors, channelId, inputVertex.getWorkerActor());
});
// output
List<ExecutionEdge> outputEdges = curVertex.getOutputEdges();
outputEdges.forEach(outputEdge -> {
ExecutionVertex outputVertex = outputEdge.getTargetExecutionVertex();
String channelId = curVertex.getChannelIdByPeerVertex(outputVertex);
addActorToChannelGroupedActors(channelGroupedActors, channelId,
outputVertex.getWorkerActor());
});
});
// output
List<ExecutionEdge> outputEdges = curVertex.getOutputEdges();
outputEdges.forEach(
outputEdge -> {
ExecutionVertex outputVertex = outputEdge.getTargetExecutionVertex();
String channelId = curVertex.getChannelIdByPeerVertex(outputVertex);
addActorToChannelGroupedActors(
channelGroupedActors, channelId, outputVertex.getWorkerActor());
});
});
LOG.debug("Channel grouped actors is: {}.", channelGroupedActors);
}
@@ -179,7 +156,7 @@ public class ExecutionGraph implements Serializable {
/**
* Get all execution vertices from current execution graph.
*
* @return all execution vertices.
* <p>Returns all execution vertices.
*/
public List<ExecutionVertex> getAllExecutionVertices() {
return executionJobVertexMap.values().stream()
@@ -191,7 +168,7 @@ public class ExecutionGraph implements Serializable {
/**
* Get all execution vertices whose status is 'TO_ADD' from current execution graph.
*
* @return all added execution vertices.
* <p>Returns all added execution vertices.
*/
public List<ExecutionVertex> getAllAddedExecutionVertices() {
return executionJobVertexMap.values().stream()
@@ -204,8 +181,7 @@ public class ExecutionGraph implements Serializable {
/**
* Get specified execution vertex from current execution graph by execution vertex id.
*
* @param executionVertexId execution vertex id.
* @return the specified execution vertex.
* @param executionVertexId execution vertex id. Returns the specified execution vertex.
*/
public ExecutionVertex getExecutionVertexByExecutionVertexId(int executionVertexId) {
if (executionVertexMap.containsKey(executionVertexId)) {
@@ -214,53 +190,46 @@ public class ExecutionGraph implements Serializable {
throw new RuntimeException("Vertex " + executionVertexId + " does not exist!");
}
/**
* Get specified execution vertex from current execution graph by actor id.
*
* @param actorId the actor id of execution vertex.
* @return the specified execution vertex.
* @param actorId the actor id of execution vertex. Returns the specified execution vertex.
*/
public ExecutionVertex getExecutionVertexByActorId(ActorId actorId) {
return actorIdExecutionVertexMap.get(actorId);
}
/**
* Get specified actor by actor id.
*
* @param actorId the actor id of execution vertex.
* @return the specified actor handle.
* @param actorId the actor id of execution vertex. Returns the specified actor handle.
*/
public Optional<BaseActorHandle> getActorById(ActorId actorId) {
return getAllActors().stream()
.filter(actor -> actor.getId().equals(actorId))
.findFirst();
return getAllActors().stream().filter(actor -> actor.getId().equals(actorId)).findFirst();
}
/**
* Get the peer actor in the other side of channelName of a given actor
*
* @param actor actor in this side
* @param channelName the channel name
* @return the peer actor in the other side
* @param channelName the channel name Returns the peer actor in the other side
*/
public BaseActorHandle getPeerActor(BaseActorHandle actor, String channelName) {
Set<BaseActorHandle> set = getActorsByChannelId(channelName);
final BaseActorHandle[] res = new BaseActorHandle[1];
set.forEach(anActor -> {
if (!anActor.equals(actor)) {
res[0] = anActor;
}
});
set.forEach(
anActor -> {
if (!anActor.equals(actor)) {
res[0] = anActor;
}
});
return res[0];
}
/**
* Get actors in both sides of a channelId
*
* @param channelId the channelId
* @return actors in both sides
* @param channelId the channelId Returns actors in both sides
*/
public Set<BaseActorHandle> getActorsByChannelId(String channelId) {
return channelGroupedActors.getOrDefault(channelId, Sets.newHashSet());
@@ -269,7 +238,7 @@ public class ExecutionGraph implements Serializable {
/**
* Get all actors by graph.
*
* @return actor list
* <p>Returns actor list
*/
public List<BaseActorHandle> getAllActors() {
return getActorsFromJobVertices(getExecutionJobVertexList());
@@ -278,12 +247,13 @@ public class ExecutionGraph implements Serializable {
/**
* Get source actors by graph.
*
* @return actor list
* <p>Returns actor list
*/
public List<BaseActorHandle> getSourceActors() {
List<ExecutionJobVertex> executionJobVertices = getExecutionJobVertexList().stream()
.filter(ExecutionJobVertex::isSourceVertex)
.collect(Collectors.toList());
List<ExecutionJobVertex> executionJobVertices =
getExecutionJobVertexList().stream()
.filter(ExecutionJobVertex::isSourceVertex)
.collect(Collectors.toList());
return getActorsFromJobVertices(executionJobVertices);
}
@@ -291,16 +261,16 @@ public class ExecutionGraph implements Serializable {
/**
* Get transformation and sink actors by graph.
*
* @return actor list
* <p>Returns actor list
*/
public List<BaseActorHandle> getNonSourceActors() {
List<ExecutionJobVertex> executionJobVertices = getExecutionJobVertexList().stream()
.filter(executionJobVertex ->
executionJobVertex
.isTransformationVertex()
|| executionJobVertex
.isSinkVertex())
.collect(Collectors.toList());
List<ExecutionJobVertex> executionJobVertices =
getExecutionJobVertexList().stream()
.filter(
executionJobVertex ->
executionJobVertex.isTransformationVertex()
|| executionJobVertex.isSinkVertex())
.collect(Collectors.toList());
return getActorsFromJobVertices(executionJobVertices);
}
@@ -308,12 +278,13 @@ public class ExecutionGraph implements Serializable {
/**
* Get sink actors by graph.
*
* @return actor list
* <p>Returns actor list
*/
public List<BaseActorHandle> getSinkActors() {
List<ExecutionJobVertex> executionJobVertices = getExecutionJobVertexList().stream()
.filter(ExecutionJobVertex::isSinkVertex)
.collect(Collectors.toList());
List<ExecutionJobVertex> executionJobVertices =
getExecutionJobVertexList().stream()
.filter(ExecutionJobVertex::isSinkVertex)
.collect(Collectors.toList());
return getActorsFromJobVertices(executionJobVertices);
}
@@ -321,8 +292,7 @@ public class ExecutionGraph implements Serializable {
/**
* Get actors according to job vertices.
*
* @param executionJobVertices specified job vertices
* @return actor list
* @param executionJobVertices specified job vertices Returns actor list
*/
public List<BaseActorHandle> getActorsFromJobVertices(
List<ExecutionJobVertex> executionJobVertices) {
@@ -351,9 +321,6 @@ public class ExecutionGraph implements Serializable {
}
public List<ActorId> getAllActorsId() {
return getAllActors().stream()
.map(BaseActorHandle::getId)
.collect(Collectors.toList());
return getAllActors().stream().map(BaseActorHandle::getId).collect(Collectors.toList());
}
}
@@ -5,29 +5,19 @@ import io.ray.streaming.api.partition.Partition;
import io.ray.streaming.jobgraph.JobEdge;
import java.io.Serializable;
/**
* An edge that connects two execution job vertices.
*/
/** An edge that connects two execution job vertices. */
public class ExecutionJobEdge implements Serializable {
/**
* The source(upstream) execution job vertex.
*/
/** The source(upstream) execution job vertex. */
private final ExecutionJobVertex sourceExecutionJobVertex;
/**
* The target(downstream) execution job vertex.
*/
/** The target(downstream) execution job vertex. */
private final ExecutionJobVertex targetExecutionJobVertex;
/**
* The partition of the execution job edge.
*/
/** The partition of the execution job edge. */
private final Partition partition;
/**
* A unique id for execution job edge.
*/
/** A unique id for execution job edge. */
private final String executionJobEdgeIndex;
public ExecutionJobEdge(
@@ -41,7 +31,8 @@ public class ExecutionJobEdge implements Serializable {
}
private String generateExecutionJobEdgeIndex() {
return sourceExecutionJobVertex.getExecutionJobVertexId() + ""
return sourceExecutionJobVertex.getExecutionJobVertexId()
+ ""
+ targetExecutionJobVertex.getExecutionJobVertexId();
}
@@ -18,41 +18,35 @@ import org.aeonbits.owner.ConfigFactory;
/**
* Physical job vertex.
* <p>Execution job vertex is the physical form of {@link JobVertex} and
* every execution job vertex is corresponding to a group of {@link ExecutionVertex}.
*
* <p>Execution job vertex is the physical form of {@link JobVertex} and every execution job vertex
* is corresponding to a group of {@link ExecutionVertex}.
*/
public class ExecutionJobVertex implements Serializable {
/**
* Unique id. Use {@link JobVertex}'s id directly.
*/
/** Unique id. Use {@link JobVertex}'s id directly. */
private final int executionJobVertexId;
/**
* Use jobVertex id and operator(use {@link StreamOperator}'s name) as name. e.g.
* 1-SourceOperator
* Use jobVertex id and operator(use {@link StreamOperator}'s name) as name. e.g. 1-SourceOperator
*/
private final String executionJobVertexName;
private final StreamOperator streamOperator;
private final VertexType vertexType;
private final Language language;
private final Map<String, String> jobConfig;
private final long buildTime;
/**
* Parallelism of current execution job vertex(operator).
*/
/** Parallelism of current execution job vertex(operator). */
private int parallelism;
/**
* Sub execution vertices of current execution job vertex(operator).
*/
/** Sub execution vertices of current execution job vertex(operator). */
private List<ExecutionVertex> executionVertices;
/**
* Input and output edges of current execution job vertex.
*/
/** Input and output edges of current execution job vertex. */
private List<ExecutionJobEdge> inputEdges = new ArrayList<>();
private List<ExecutionJobEdge> outputEdges = new ArrayList<>();
public ExecutionJobVertex(
@@ -61,8 +55,9 @@ public class ExecutionJobVertex implements Serializable {
AtomicInteger idGenerator,
long buildTime) {
this.executionJobVertexId = jobVertex.getVertexId();
this.executionJobVertexName = generateExecutionJobVertexName(
executionJobVertexId, jobVertex.getStreamOperator().getName());
this.executionJobVertexName =
generateExecutionJobVertexName(
executionJobVertexId, jobVertex.getStreamOperator().getName());
this.streamOperator = jobVertex.getStreamOperator();
this.vertexType = jobVertex.getVertexType();
this.language = jobVertex.getLanguage();
@@ -77,8 +72,8 @@ public class ExecutionJobVertex implements Serializable {
ResourceConfig resourceConfig = ConfigFactory.create(ResourceConfig.class, jobConfig);
for (int subIndex = 0; subIndex < parallelism; subIndex++) {
executionVertices.add(new ExecutionVertex(
idGenerator.getAndIncrement(), subIndex, this, resourceConfig));
executionVertices.add(
new ExecutionVertex(idGenerator.getAndIncrement(), subIndex, this, resourceConfig));
}
return executionVertices;
}
@@ -91,14 +86,14 @@ public class ExecutionJobVertex implements Serializable {
Map<Integer, BaseActorHandle> executionVertexWorkersMap = new HashMap<>();
Preconditions.checkArgument(
executionVertices != null && !executionVertices.isEmpty(),
"Empty execution vertex.");
executionVertices.stream().forEach(vertex -> {
Preconditions.checkArgument(
vertex.getWorkerActor() != null,
"Empty execution vertex worker actor.");
executionVertexWorkersMap.put(vertex.getExecutionVertexId(), vertex.getWorkerActor());
});
executionVertices != null && !executionVertices.isEmpty(), "Empty execution vertex.");
executionVertices.stream()
.forEach(
vertex -> {
Preconditions.checkArgument(
vertex.getWorkerActor() != null, "Empty execution vertex worker actor.");
executionVertexWorkersMap.put(vertex.getExecutionVertexId(), vertex.getWorkerActor());
});
return executionVertexWorkersMap;
}
@@ -114,7 +109,7 @@ public class ExecutionJobVertex implements Serializable {
/**
* e.g. 1-SourceOperator
*
* @return operator name with index
* <p>Returns operator name with index
*/
public String getExecutionJobVertexNameWithIndex() {
return executionJobVertexId + "-" + executionJobVertexName;
@@ -128,8 +123,7 @@ public class ExecutionJobVertex implements Serializable {
return executionVertices;
}
public void setExecutionVertices(
List<ExecutionVertex> executionVertex) {
public void setExecutionVertices(List<ExecutionVertex> executionVertex) {
this.executionVertices = executionVertex;
}
@@ -137,8 +131,7 @@ public class ExecutionJobVertex implements Serializable {
return outputEdges;
}
public void setOutputEdges(
List<ExecutionJobEdge> outputEdges) {
public void setOutputEdges(List<ExecutionJobEdge> outputEdges) {
this.outputEdges = outputEdges;
}
@@ -146,8 +139,7 @@ public class ExecutionJobVertex implements Serializable {
return inputEdges;
}
public void setInputEdges(
List<ExecutionJobEdge> inputEdges) {
public void setInputEdges(List<ExecutionJobEdge> inputEdges) {
this.inputEdges = inputEdges;
}
@@ -18,34 +18,25 @@ import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
/** Physical vertex, corresponds to {@link ExecutionJobVertex}. */
public class ExecutionVertex implements Serializable {
/**
* Unique id for execution vertex.
*/
/** Unique id for execution vertex. */
private final int executionVertexId;
/**
* Immutable field inherited from {@link ExecutionJobVertex}.
*/
/** Immutable field inherited from {@link ExecutionJobVertex}. */
private final int executionJobVertexId;
private final String executionJobVertexName;
private final StreamOperator streamOperator;
private final VertexType vertexType;
private final Language language;
private final long buildTime;
/**
* Resource used by ExecutionVertex.
*/
/** Resource used by ExecutionVertex. */
private final Map<String, Double> resource;
/**
* Parallelism of current vertex's operator.
*/
/** Parallelism of current vertex's operator. */
private int parallelism;
/**
@@ -56,21 +47,15 @@ public class ExecutionVertex implements Serializable {
private ExecutionVertexState state = ExecutionVertexState.TO_ADD;
/**
* The id of the container which this vertex's worker actor belongs to.
*/
/** The id of the container which this vertex's worker actor belongs to. */
private ContainerId containerId;
private String pid;
/**
* Worker actor handle.
*/
/** Worker actor handle. */
private BaseActorHandle workerActor;
/**
* Op config + job config.
*/
/** Op config + job config. */
private Map<String, String> workerConfig;
private List<ExecutionEdge> inputEdges = new ArrayList<>();
@@ -83,7 +68,6 @@ public class ExecutionVertex implements Serializable {
private transient List<BaseActorHandle> inputActorList;
private Map<Integer, String> exeVertexChannelMap;
public ExecutionVertex(
int globalIndex,
int index,
@@ -182,8 +166,7 @@ public class ExecutionVertex implements Serializable {
return inputEdges;
}
public void setInputEdges(
List<ExecutionEdge> inputEdges) {
public void setInputEdges(List<ExecutionEdge> inputEdges) {
this.inputEdges = inputEdges;
}
@@ -191,8 +174,7 @@ public class ExecutionVertex implements Serializable {
return outputEdges;
}
public void setOutputEdges(
List<ExecutionEdge> outputEdges) {
public void setOutputEdges(List<ExecutionEdge> outputEdges) {
this.outputEdges = outputEdges;
}
@@ -279,7 +261,6 @@ public class ExecutionVertex implements Serializable {
return inputActorList;
}
public String getChannelIdByPeerVertex(ExecutionVertex peerVertex) {
if (exeVertexChannelMap == null) {
generateActorChannelInfo();
@@ -287,7 +268,6 @@ public class ExecutionVertex implements Serializable {
return exeVertexChannelMap.get(peerVertex.getExecutionVertexId());
}
private void generateActorChannelInfo() {
inputChannelIdList = new ArrayList<>();
inputActorList = new ArrayList<>();
@@ -297,10 +277,11 @@ public class ExecutionVertex implements Serializable {
List<ExecutionEdge> inputEdges = getInputEdges();
for (ExecutionEdge edge : inputEdges) {
String channelId = ChannelId.genIdStr(
edge.getSourceExecutionVertex().getExecutionVertexId(),
getExecutionVertexId(),
getBuildTime());
String channelId =
ChannelId.genIdStr(
edge.getSourceExecutionVertex().getExecutionVertexId(),
getExecutionVertexId(),
getBuildTime());
inputChannelIdList.add(channelId);
inputActorList.add(edge.getSourceExecutionVertex().getWorkerActor());
exeVertexChannelMap.put(edge.getSourceExecutionVertex().getExecutionVertexId(), channelId);
@@ -308,17 +289,17 @@ public class ExecutionVertex implements Serializable {
List<ExecutionEdge> outputEdges = getOutputEdges();
for (ExecutionEdge edge : outputEdges) {
String channelId = ChannelId.genIdStr(
getExecutionVertexId(),
edge.getTargetExecutionVertex().getExecutionVertexId(),
getBuildTime());
String channelId =
ChannelId.genIdStr(
getExecutionVertexId(),
edge.getTargetExecutionVertex().getExecutionVertexId(),
getBuildTime());
outputChannelIdList.add(channelId);
outputActorList.add(edge.getTargetExecutionVertex().getWorkerActor());
exeVertexChannelMap.put(edge.getTargetExecutionVertex().getExecutionVertexId(), channelId);
}
}
private Map<String, Double> generateResources(ResourceConfig resourceConfig) {
Map<String, Double> resourceMap = new HashMap<>();
if (resourceConfig.isTaskCpuResourceLimit()) {
@@ -2,29 +2,19 @@ package io.ray.streaming.runtime.core.graph.executiongraph;
import java.io.Serializable;
/**
* Vertex state.
*/
/** Vertex state. */
public enum ExecutionVertexState implements Serializable {
/**
* Vertex(Worker) to be added.
*/
/** Vertex(Worker) to be added. */
TO_ADD(1, "TO_ADD"),
/**
* Vertex(Worker) to be deleted.
*/
/** Vertex(Worker) to be deleted. */
TO_DEL(2, "TO_DEL"),
/**
* Vertex(Worker) is running.
*/
/** Vertex(Worker) is running. */
RUNNING(3, "RUNNING"),
/** Unknown status. */
UNKNOWN(-1, "UNKNOWN");
public final int code;
@@ -34,5 +24,4 @@ public enum ExecutionVertexState implements Serializable {
this.code = code;
this.msg = msg;
}
}
@@ -14,7 +14,9 @@ public class ProcessBuilder {
public static StreamProcessor buildProcessor(StreamOperator streamOperator) {
OperatorType type = streamOperator.getOpType();
LOGGER.info("Building StreamProcessor, operator type = {}, operator = {}.", type,
LOGGER.info(
"Building StreamProcessor, operator type = {}, operator = {}.",
type,
streamOperator.getClass().getSimpleName());
switch (type) {
case SOURCE:
@@ -12,14 +12,10 @@ public interface Processor<T> extends Serializable {
void process(T t);
/**
* See {@link Function#saveCheckpoint()}.
*/
/** See {@link Function#saveCheckpoint()}. */
Serializable saveCheckpoint();
/**
* See {@link Function#loadCheckpoint(Serializable)}.
*/
/** See {@link Function#loadCheckpoint(Serializable)}. */
void loadCheckpoint(Serializable checkpointObject);
void close();
@@ -24,7 +24,5 @@ public class SourceProcessor<T> extends StreamProcessor<Record, SourceOperator<T
}
/** Intentionally empty: nothing is done when this processor is closed. */
@Override
public void close() {}
}
@@ -21,48 +21,31 @@ public class Container implements Serializable {
private static final Logger LOG = LoggerFactory.getLogger(Container.class);
/**
* container id
*/
/** container id */
private ContainerId id;
/**
* Container address
*/
/** Container address */
private String address;
/**
* Container hostname
*/
/** Container hostname */
private String hostname;
/**
* Container unique id fetched from raylet
*/
/** Container unique id fetched from raylet */
private UniqueId nodeId;
/**
* Container available resources
*/
/** Container available resources */
private Map<String, Double> availableResources = new HashMap<>();
/** List of ids of the {@link ExecutionVertex} instances that belong to the container. */
private List<Integer> executionVertexIds = new ArrayList<>();
/** Capacity is the maximum number of actors that can be allocated in the container. */
private int capacity = 0;
public Container() {
}
public Container() {}
public Container(
String address,
UniqueId nodeId, String hostname,
Map<String, Double> availableResources) {
String address, UniqueId nodeId, String hostname, Map<String, Double> availableResources) {
this.id = new ContainerId();
this.address = address;
@@ -73,11 +56,7 @@ public class Container implements Serializable {
public static Container from(NodeInfo nodeInfo) {
return new Container(
nodeInfo.nodeAddress,
nodeInfo.nodeId,
nodeInfo.nodeHostname,
nodeInfo.resources
);
nodeInfo.nodeAddress, nodeInfo.nodeId, nodeInfo.nodeHostname, nodeInfo.resources);
}
public ContainerId getId() {
@@ -112,7 +91,6 @@ public class Container implements Serializable {
return capacity;
}
public void updateCapacity(int capacity) {
LOG.info("Update container capacity, old value: {}, new value: {}.", this.capacity, capacity);
this.capacity = capacity;
@@ -150,8 +128,10 @@ public class Container implements Serializable {
executionVertexIds.removeIf(id -> id == vertex.getExecutionVertexId());
reclaimResource(vertex.getResource());
} else {
throw new RuntimeException(String.format("Current container [%s] not found vertex [%s].",
this, vertex.getExecutionJobVertexName()));
throw new RuntimeException(
String.format(
"Current container [%s] not found vertex [%s].",
this, vertex.getExecutionJobVertexName()));
}
}
@@ -160,24 +140,36 @@ public class Container implements Serializable {
}
/**
 * Subtracts each entry of the given resources from this container's available resources.
 *
 * @param allocatedResource resource key to the amount being taken from the container
 * @throws IllegalArgumentException if the available amount of a resource is smaller than the
 *     amount to subtract
 */
private void decreaseResource(Map<String, Double> allocatedResource) {
  allocatedResource.forEach(
      (k, v) -> {
        // Read the current amount once; the check, the log line and the subtraction all refer
        // to this pre-update value.
        Double available = this.availableResources.get(k);
        // Template form: the message is only built when the check fails.
        // NOTE(review): assumes Preconditions is Guava's, as the template overload requires.
        Preconditions.checkArgument(
            available >= v, "Available resource %s not >= decreased resource %s", available, v);
        Double newValue = available - v;
        LOG.info(
            "Decrease container {} resource [{}], from {} to {}.",
            this.address,
            k,
            available,
            newValue);
        this.availableResources.put(k, newValue);
      });
}
/**
 * Adds each entry of the given resources back to this container's available resources.
 *
 * @param allocatedResource resource key to the amount being returned to the container
 */
private void reclaimResource(Map<String, Double> allocatedResource) {
  allocatedResource.forEach(
      (k, v) -> {
        // Read the current amount once so the log line reports the pre-update value.
        Double available = this.availableResources.get(k);
        Double newValue = available + v;
        LOG.info(
            "Reclaim container {} resource [{}], from {} to {}.",
            this.address,
            k,
            available,
            newValue);
        this.availableResources.put(k, newValue);
      });
}
@Override
@@ -192,4 +184,4 @@ public class Container implements Serializable {
.add("capacity", capacity)
.toString();
}
}
}
@@ -2,9 +2,5 @@ package io.ray.streaming.runtime.core.resource;
import io.ray.streaming.runtime.core.common.AbstractId;
/**
* Container unique identifier.
*/
public class ContainerId extends AbstractId {
}
/** Container unique identifier. */
public class ContainerId extends AbstractId {}
@@ -1,23 +1,15 @@
package io.ray.streaming.runtime.core.resource;
/**
* Key for different type of resources.
*/
/** Key for different type of resources. */
public enum ResourceType {
/**
* Cpu resource key.
*/
/** Cpu resource key. */
CPU("CPU"),
/**
* Gpu resource key.
*/
/** Gpu resource key. */
GPU("GPU"),
/**
* Memory resource key.
*/
/** Memory resource key. */
MEM("MEM");
private String value;
@@ -29,5 +21,4 @@ public enum ResourceType {
public String getValue() {
  // Hand back the string key stored for this resource type.
  return this.value;
}
}
@@ -11,25 +11,20 @@ import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Resource description of ResourceManager.
*/
/** Resource description of ResourceManager. */
public class Resources implements Serializable {
private static final Logger LOG = LoggerFactory.getLogger(Resources.class);
/**
* Available containers registered to ResourceManager.
*/
/** Available containers registered to ResourceManager. */
private List<Container> registerContainers = new ArrayList<>();
public Resources() {
}
public Resources() {}
/**
* Get registered containers, the container list is read-only.
*
* @return container list.
* <p>Returns container list.
*/
public ImmutableList<Container> getRegisteredContainers() {
return ImmutableList.copyOf(registerContainers);
@@ -52,9 +47,9 @@ public class Resources implements Serializable {
}
/**
 * Get registered containers keyed by node id, the returned map is read-only.
 *
 * <p>Returns an immutable copy that maps each container's node id to the container.
 */
public ImmutableMap<UniqueId, Container> getRegisteredContainerMap() {
  return ImmutableMap.copyOf(
      registerContainers.stream()
          .collect(java.util.stream.Collectors.toMap(Container::getNodeId, c -> c)));
}
@Override
@@ -67,8 +67,8 @@ public class JobMaster {
runtimeContext = new JobMasterRuntimeContext(streamingConfig);
// load checkpoint if is recover
if (!Ray.getRuntimeContext().isSingleProcess() && Ray.getRuntimeContext()
.wasCurrentActorRestarted()) {
if (!Ray.getRuntimeContext().isSingleProcess()
&& Ray.getRuntimeContext().wasCurrentActorRestarted()) {
loadMasterCheckpoint();
}
@@ -101,7 +101,7 @@ public class JobMaster {
/**
* Init JobMaster. To initiate or recover other components(like metrics and extra coordinators).
*
* @return init result
* <p>Returns init result
*/
public Boolean init(boolean isRecover) {
LOG.info("Initializing job master, isRecover={}.", isRecover);
@@ -128,15 +128,15 @@ public class JobMaster {
/**
* Submit job to run:
*
* <ol>
* <li> Using GraphManager to build physical plan according to the logical plan.</li>
* <li> Using ResourceManager to manage and allocate the resources.</li>
* <li> Using JobScheduler to schedule the job to run.</li>
* <li>Using GraphManager to build physical plan according to the logical plan.
* <li>Using ResourceManager to manage and allocate the resources.
* <li>Using JobScheduler to schedule the job to run.
* </ol>
*
* @param jobMasterActor JobMaster actor
 * @param jobGraph logical plan
 * @return submit result
*/
public boolean submitJob(ActorHandle<JobMaster> jobMasterActor, JobGraph jobGraph) {
LOG.info("Begin submitting job using logical plan: {}.", jobGraph);
@@ -168,8 +168,8 @@ public class JobMaster {
LOG.debug("Save JobMaster context.");
byte[] contextBytes = Serializer.encode(runtimeContext);
CheckpointStateUtil
.put(contextBackend, getJobMasterRuntimeContextKey(getConf()), contextBytes);
CheckpointStateUtil.put(
contextBackend, getJobMasterRuntimeContextKey(getConf()), contextBytes);
}
}
@@ -180,8 +180,11 @@ public class JobMaster {
reportPb = RemoteCall.BaseWorkerCmd.parseFrom(reportBytes);
ActorId actorId = ActorId.fromBytes(reportPb.getActorId().toByteArray());
long remoteCallCost = System.currentTimeMillis() - reportPb.getTimestamp();
LOG.info("Vertex {}, request job worker commit cost {}ms, actorId={}.",
getExecutionVertex(actorId), remoteCallCost, actorId);
LOG.info(
"Vertex {}, request job worker commit cost {}ms, actorId={}.",
getExecutionVertex(actorId),
remoteCallCost,
actorId);
RemoteCall.WorkerCommitReport commit =
reportPb.getDetail().unpack(RemoteCall.WorkerCommitReport.class);
WorkerCommitReport report = new WorkerCommitReport(actorId, commit.getCommitCheckpointId());
@@ -206,27 +209,31 @@ public class JobMaster {
return RemoteCall.BoolResult.newBuilder().setBoolRes(false).build().toByteArray();
}
ExecutionVertex exeVertex = getExecutionVertex(actorId);
LOG.info("Vertex {}, request job worker rollback cost {}ms, actorId={}.",
exeVertex, remoteCallCost, actorId);
RemoteCall.WorkerRollbackRequest rollbackPb
= RemoteCall.WorkerRollbackRequest.parseFrom(requestPb.getDetail().getValue());
LOG.info(
"Vertex {}, request job worker rollback cost {}ms, actorId={}.",
exeVertex,
remoteCallCost,
actorId);
RemoteCall.WorkerRollbackRequest rollbackPb =
RemoteCall.WorkerRollbackRequest.parseFrom(requestPb.getDetail().getValue());
exeVertex.setPid(rollbackPb.getWorkerPid());
// To find old container where slot is located in.
String hostname = "";
Optional<Container> container = ResourceUtil.getContainerById(
resourceManager.getRegisteredContainers(),
exeVertex.getContainerId()
);
Optional<Container> container =
ResourceUtil.getContainerById(
resourceManager.getRegisteredContainers(), exeVertex.getContainerId());
if (container.isPresent()) {
hostname = container.get().getHostname();
}
WorkerRollbackRequest request = new WorkerRollbackRequest(
actorId, rollbackPb.getExceptionMsg(), hostname, exeVertex.getPid()
);
WorkerRollbackRequest request =
new WorkerRollbackRequest(
actorId, rollbackPb.getExceptionMsg(), hostname, exeVertex.getPid());
ret = failoverCoordinator.requestJobWorkerRollback(request);
LOG.info("Vertex {} request rollback, exception msg : {}.",
exeVertex, rollbackPb.getExceptionMsg());
LOG.info(
"Vertex {} request rollback, exception msg : {}.",
exeVertex,
rollbackPb.getExceptionMsg());
} catch (Throwable e) {
LOG.error("Parse job worker rollback has exception.", e);
@@ -257,5 +264,4 @@ public class JobMaster {
public StreamingMasterConfig getConf() {
  // Expose the master config held in the conf field.
  return this.conf;
}
}
@@ -8,6 +8,7 @@ import java.io.Serializable;
/**
* Runtime context for job master.
*
* <p>Including: graph, resource, checkpoint info, etc.
*/
public class JobRuntimeContext implements Serializable {
@@ -52,5 +53,4 @@ public class JobRuntimeContext implements Serializable {
.add("conf", conf.getMap())
.toString();
}
}
@@ -77,5 +77,4 @@ public class JobMasterRuntimeContext implements Serializable {
.add("conf", conf.getMap())
.toString();
}
}
@@ -25,8 +25,9 @@ public abstract class BaseCoordinator implements Runnable {
}
/**
 * Starts this coordinator on a new thread. The thread is named with the concrete class name
 * followed by the current time in milliseconds.
 */
public void start() {
  thread =
      new Thread(
          Ray.wrapRunnable(this), this.getClass().getName() + "-" + System.currentTimeMillis());
  thread.start();
}
@@ -20,8 +20,8 @@ import org.slf4j.LoggerFactory;
/**
* CheckpointCoordinator is the controller of checkpoint, responsible for triggering checkpoint,
* collecting {@link JobWorker}'s reports and calling {@link JobWorker} to clear expired
* checkpoints when new checkpoint finished.
* collecting {@link JobWorker}'s reports and calling {@link JobWorker} to clear expired checkpoints
* when new checkpoint finished.
*/
public class CheckpointCoordinator extends BaseCoordinator {
@@ -58,7 +58,8 @@ public class CheckpointCoordinator extends BaseCoordinator {
if (!pendingCheckpointActors.isEmpty()) {
// if wait commit report timeout, this cp fail, and restart next cp
if (timeoutOnWaitCheckpoint()) {
LOG.warn("Waiting for checkpoint {} timeout, pending cp actors is {}.",
LOG.warn(
"Waiting for checkpoint {} timeout, pending cp actors is {}.",
runtimeContext.lastCheckpointId,
graphManager.getExecutionGraph().getActorName(pendingCheckpointActors));
@@ -90,14 +91,17 @@ public class CheckpointCoordinator extends BaseCoordinator {
}
private void processCommitReport(WorkerCommitReport commitReport) {
LOG.info("Start process commit report {}, from actor name={}.", commitReport,
LOG.info(
"Start process commit report {}, from actor name={}.",
commitReport,
graphManager.getExecutionGraph().getActorName(commitReport.fromActorId));
try {
Preconditions.checkArgument(
commitReport.commitCheckpointId == runtimeContext.lastCheckpointId,
"expect checkpointId %s, but got %s",
runtimeContext.lastCheckpointId, commitReport);
runtimeContext.lastCheckpointId,
commitReport);
if (!pendingCheckpointActors.contains(commitReport.fromActorId)) {
LOG.warn("Invalid commit report, skipped.");
@@ -105,7 +109,8 @@ public class CheckpointCoordinator extends BaseCoordinator {
}
pendingCheckpointActors.remove(commitReport.fromActorId);
LOG.info("Pending actors after this commit: {}.",
LOG.info(
"Pending actors after this commit: {}.",
graphManager.getExecutionGraph().getActorName(pendingCheckpointActors));
// checkpoint finish
@@ -144,10 +149,14 @@ public class CheckpointCoordinator extends BaseCoordinator {
final List<ObjectRef> sourcesRet = new ArrayList<>();
graphManager.getExecutionGraph().getSourceActors().forEach(actor -> {
sourcesRet.add(RemoteCallWorker.triggerCheckpoint(
actor, runtimeContext.lastCheckpointId));
});
graphManager
.getExecutionGraph()
.getSourceActors()
.forEach(
actor -> {
sourcesRet.add(
RemoteCallWorker.triggerCheckpoint(actor, runtimeContext.lastCheckpointId));
});
for (ObjectRef rayObject : sourcesRet) {
if (rayObject.get() instanceof RayException) {
@@ -171,8 +180,7 @@ public class CheckpointCoordinator extends BaseCoordinator {
List<BaseActorHandle> allActor = graphManager.getExecutionGraph().getAllActors();
if (runtimeContext.lastCheckpointId > runtimeContext.getLastValidCheckpointId()) {
RemoteCallWorker
.notifyCheckpointTimeoutParallel(allActor, runtimeContext.lastCheckpointId);
RemoteCallWorker.notifyCheckpointTimeoutParallel(allActor, runtimeContext.lastCheckpointId);
}
if (!pendingCheckpointActors.isEmpty()) {
@@ -198,15 +206,14 @@ public class CheckpointCoordinator extends BaseCoordinator {
if (runtimeContext.checkpointIds.size() > 1) {
Long stateExpiredCpId = runtimeContext.checkpointIds.remove(0);
Long msgExpiredCheckpointId = runtimeContext.checkpointIds.get(0);
RemoteCallWorker
.clearExpiredCheckpointParallel(allActor, stateExpiredCpId, msgExpiredCheckpointId);
RemoteCallWorker.clearExpiredCheckpointParallel(
allActor, stateExpiredCpId, msgExpiredCheckpointId);
}
return true;
}
/**
 * Checks whether the checkpoint interval has elapsed since the last checkpoint timestamp.
 *
 * <p>Returns true if at least {@code cpIntervalSecs} seconds have passed.
 */
private boolean readyToTrigger() {
  // Multiply by a long literal so the seconds-to-milliseconds conversion cannot overflow int.
  return (System.currentTimeMillis() - runtimeContext.lastCpTimestamp) >= cpIntervalSecs * 1000L;
}
private boolean timeoutOnWaitCheckpoint() {
@@ -39,8 +39,7 @@ public class FailoverCoordinator extends BaseCoordinator {
}
public FailoverCoordinator(
JobMaster jobMaster, AsyncRemoteCaller asyncRemoteCaller,
boolean isRecover) {
JobMaster jobMaster, AsyncRemoteCaller asyncRemoteCaller, boolean isRecover) {
super(jobMaster);
this.asyncRemoteCaller = asyncRemoteCaller;
@@ -111,8 +110,8 @@ public class FailoverCoordinator extends BaseCoordinator {
ExecutionVertex exeVertex = getExeVertexFromRequest(rollbackRequest);
// Reset pid for new-rollback actor.
if (null != rollbackRequest.getPid() &&
!rollbackRequest.getPid().equals(WorkerRollbackRequest.DEFAULT_PID)) {
if (null != rollbackRequest.getPid()
&& !rollbackRequest.getPid().equals(WorkerRollbackRequest.DEFAULT_PID)) {
exeVertex.setPid(rollbackRequest.getPid());
}
@@ -122,10 +121,9 @@ public class FailoverCoordinator extends BaseCoordinator {
}
String hostname = "";
Optional<Container> container = ResourceUtil.getContainerById(
jobMaster.getResourceManager().getRegisteredContainers(),
exeVertex.getContainerId()
);
Optional<Container> container =
ResourceUtil.getContainerById(
jobMaster.getResourceManager().getRegisteredContainers(), exeVertex.getContainerId());
if (container.isPresent()) {
hostname = container.get().getHostname();
}
@@ -133,16 +131,22 @@ public class FailoverCoordinator extends BaseCoordinator {
if (rollbackRequest.isForcedRollback) {
interruptCheckpointAndRollback(rollbackRequest);
} else {
asyncRemoteCaller.checkIfNeedRollbackAsync(exeVertex.getWorkerActor(), res -> {
if (!res) {
LOG.info("Vertex {} doesn't need to rollback, skip it.", exeVertex);
return;
}
interruptCheckpointAndRollback(rollbackRequest);
}, throwable -> {
LOG.error("Exception when calling checkIfNeedRollbackAsync, maybe vertex is dead" +
", ignore this request, vertex={}.", exeVertex, throwable);
});
asyncRemoteCaller.checkIfNeedRollbackAsync(
exeVertex.getWorkerActor(),
res -> {
if (!res) {
LOG.info("Vertex {} doesn't need to rollback, skip it.", exeVertex);
return;
}
interruptCheckpointAndRollback(rollbackRequest);
},
throwable -> {
LOG.error(
"Exception when calling checkIfNeedRollbackAsync, maybe vertex is dead"
+ ", ignore this request, vertex={}.",
exeVertex,
throwable);
});
}
LOG.info("Deal with rollback request {} success.", rollbackRequest);
@@ -154,7 +158,9 @@ public class FailoverCoordinator extends BaseCoordinator {
rollbackRequest.cascadingGroupId = currentCascadingGroupId++;
}
// get last valid checkpoint id then call worker rollback
rollback(jobMaster.getRuntimeContext().getLastValidCheckpointId(), rollbackRequest,
rollback(
jobMaster.getRuntimeContext().getLastValidCheckpointId(),
rollbackRequest,
currentCascadingGroupId);
// we interrupt current checkpoint for 2 considerations:
// 1. current checkpoint might be timeout, because barrier might be lost after failover. so we
@@ -165,66 +171,83 @@ public class FailoverCoordinator extends BaseCoordinator {
}
/**
* call worker rollback, and deal with it's reports. callback won't be finished until
* the entire DAG back to normal.
* call worker rollback, and deal with it's reports. callback won't be finished until the entire
* DAG back to normal.
*
* @param checkpointId checkpointId to be rollback
* @param rollbackRequest worker rollback request
* @param cascadingGroupId all rollback of a cascading group should have same ID
*/
private void rollback(
long checkpointId, WorkerRollbackRequest rollbackRequest,
long cascadingGroupId) {
long checkpointId, WorkerRollbackRequest rollbackRequest, long cascadingGroupId) {
ExecutionVertex exeVertex = getExeVertexFromRequest(rollbackRequest);
LOG.info("Call vertex {} to rollback, checkpoint id is {}, cascadingGroupId={}.",
exeVertex, checkpointId, cascadingGroupId);
LOG.info(
"Call vertex {} to rollback, checkpoint id is {}, cascadingGroupId={}.",
exeVertex,
checkpointId,
cascadingGroupId);
isRollbacking.put(exeVertex, true);
asyncRemoteCaller.rollback(exeVertex.getWorkerActor(), checkpointId, result -> {
List<WorkerRollbackRequest> newRollbackRequests = new ArrayList<>();
switch (result.getResultEnum()) {
case SUCCESS:
ChannelRecoverInfo recoverInfo = result.getResultObj();
LOG.info("Vertex {} rollback done, dataLostQueues={}, msg={}, cascadingGroupId={}.",
exeVertex, recoverInfo.getDataLostQueues(), result.getResultMsg(), cascadingGroupId);
// rollback upstream if vertex reports abnormal input queues
newRollbackRequests =
cascadeUpstreamActors(recoverInfo.getDataLostQueues(), exeVertex, cascadingGroupId);
break;
case SKIPPED:
LOG.info("Vertex skip rollback, result = {}, cascadingGroupId={}.", result,
cascadingGroupId);
break;
default:
LOG.error(
"Rollback vertex {} failed, result={}, cascadingGroupId={}," +
" rollback this worker again after {} ms.",
exeVertex, result, cascadingGroupId, ROLLBACK_RETRY_TIME_MS);
Thread.sleep(ROLLBACK_RETRY_TIME_MS);
LOG.info("Add rollback request for {} again, cascadingGroupId={}.", exeVertex,
cascadingGroupId);
newRollbackRequests.add(
new WorkerRollbackRequest(exeVertex, "", "Rollback failed, try again.", false)
);
break;
}
asyncRemoteCaller.rollback(
exeVertex.getWorkerActor(),
checkpointId,
result -> {
List<WorkerRollbackRequest> newRollbackRequests = new ArrayList<>();
switch (result.getResultEnum()) {
case SUCCESS:
ChannelRecoverInfo recoverInfo = result.getResultObj();
LOG.info(
"Vertex {} rollback done, dataLostQueues={}, msg={}, cascadingGroupId={}.",
exeVertex,
recoverInfo.getDataLostQueues(),
result.getResultMsg(),
cascadingGroupId);
// rollback upstream if vertex reports abnormal input queues
newRollbackRequests =
cascadeUpstreamActors(
recoverInfo.getDataLostQueues(), exeVertex, cascadingGroupId);
break;
case SKIPPED:
LOG.info(
"Vertex skip rollback, result = {}, cascadingGroupId={}.",
result,
cascadingGroupId);
break;
default:
LOG.error(
"Rollback vertex {} failed, result={}, cascadingGroupId={},"
+ " rollback this worker again after {} ms.",
exeVertex,
result,
cascadingGroupId,
ROLLBACK_RETRY_TIME_MS);
Thread.sleep(ROLLBACK_RETRY_TIME_MS);
LOG.info(
"Add rollback request for {} again, cascadingGroupId={}.",
exeVertex,
cascadingGroupId);
newRollbackRequests.add(
new WorkerRollbackRequest(exeVertex, "", "Rollback failed, try again.", false));
break;
}
// lock to avoid executing new rollback requests added.
// consider such a case: A->B->C, C cascade B, and B cascade A
// if B is rollback before B's rollback request is saved, and then JobMaster crashed,
// then A will never be rollback.
synchronized (cmdLock) {
jobMaster.getRuntimeContext().foCmds.addAll(newRollbackRequests);
// this rollback request is finished, remove it.
jobMaster.getRuntimeContext().unfinishedFoCmds.remove(rollbackRequest);
jobMaster.saveContext();
}
isRollbacking.put(exeVertex, false);
}, throwable -> {
LOG.error("Exception when calling vertex to rollback, vertex={}.", exeVertex, throwable);
isRollbacking.put(exeVertex, false);
});
// lock to avoid executing new rollback requests added.
// consider such a case: A->B->C, C cascade B, and B cascade A
// if B is rollback before B's rollback request is saved, and then JobMaster crashed,
// then A will never be rollback.
synchronized (cmdLock) {
jobMaster.getRuntimeContext().foCmds.addAll(newRollbackRequests);
// this rollback request is finished, remove it.
jobMaster.getRuntimeContext().unfinishedFoCmds.remove(rollbackRequest);
jobMaster.saveContext();
}
isRollbacking.put(exeVertex, false);
},
throwable -> {
LOG.error("Exception when calling vertex to rollback, vertex={}.", exeVertex, throwable);
isRollbacking.put(exeVertex, false);
});
LOG.info("Finish rollback vertex {}, checkpoint id is {}.", exeVertex, checkpointId);
}
@@ -233,32 +256,39 @@ public class FailoverCoordinator extends BaseCoordinator {
Set<String> dataLostQueues, ExecutionVertex fromVertex, long cascadingGroupId) {
List<WorkerRollbackRequest> cascadedRollbackRequest = new ArrayList<>();
// rollback upstream if vertex reports abnormal input queues
dataLostQueues.forEach(q -> {
BaseActorHandle upstreamActor =
graphManager.getExecutionGraph().getPeerActor(fromVertex.getWorkerActor(), q);
ExecutionVertex upstreamExeVertex = getExecutionVertex(upstreamActor);
// vertexes that has already cascaded by other vertex in the same level
// of graph should be ignored.
if (isRollbacking.get(upstreamExeVertex)) {
return;
}
LOG.info("Call upstream vertex {} of vertex {} to rollback, cascadingGroupId={}.",
upstreamExeVertex, fromVertex, cascadingGroupId);
String hostname = "";
Optional<Container> container = ResourceUtil.getContainerById(
jobMaster.getResourceManager().getRegisteredContainers(),
upstreamExeVertex.getContainerId()
);
if (container.isPresent()) {
hostname = container.get().getHostname();
}
// force upstream vertexes to rollback
WorkerRollbackRequest upstreamRequest = new WorkerRollbackRequest(
upstreamExeVertex, hostname, String.format("Cascading rollback from %s", fromVertex), true
);
upstreamRequest.cascadingGroupId = cascadingGroupId;
cascadedRollbackRequest.add(upstreamRequest);
});
dataLostQueues.forEach(
q -> {
BaseActorHandle upstreamActor =
graphManager.getExecutionGraph().getPeerActor(fromVertex.getWorkerActor(), q);
ExecutionVertex upstreamExeVertex = getExecutionVertex(upstreamActor);
// vertexes that has already cascaded by other vertex in the same level
// of graph should be ignored.
if (isRollbacking.get(upstreamExeVertex)) {
return;
}
LOG.info(
"Call upstream vertex {} of vertex {} to rollback, cascadingGroupId={}.",
upstreamExeVertex,
fromVertex,
cascadingGroupId);
String hostname = "";
Optional<Container> container =
ResourceUtil.getContainerById(
jobMaster.getResourceManager().getRegisteredContainers(),
upstreamExeVertex.getContainerId());
if (container.isPresent()) {
hostname = container.get().getHostname();
}
// force upstream vertexes to rollback
WorkerRollbackRequest upstreamRequest =
new WorkerRollbackRequest(
upstreamExeVertex,
hostname,
String.format("Cascading rollback from %s", fromVertex),
true);
upstreamRequest.cascadingGroupId = cascadingGroupId;
cascadedRollbackRequest.add(upstreamRequest);
});
return cascadedRollbackRequest;
}
@@ -7,11 +7,9 @@ public abstract class BaseWorkerCmd implements Serializable {
public ActorId fromActorId;
public BaseWorkerCmd() {
}
public BaseWorkerCmd() {}
/**
 * Creates a command and records the given actor id in {@code fromActorId}.
 *
 * @param actorId the id stored as this command's {@code fromActorId}
 */
protected BaseWorkerCmd(ActorId actorId) {
this.fromActorId = actorId;
}
}
@@ -1,5 +1,3 @@
package io.ray.streaming.runtime.master.coordinator.command;
public final class InterruptCheckpointRequest extends BaseWorkerCmd {
}
public final class InterruptCheckpointRequest extends BaseWorkerCmd {}
@@ -23,10 +23,7 @@ public final class WorkerRollbackRequest extends BaseWorkerCmd {
}
public WorkerRollbackRequest(
ExecutionVertex executionVertex,
String hostname,
String msg,
boolean isForcedRollback) {
ExecutionVertex executionVertex, String hostname, String msg, boolean isForcedRollback) {
super(executionVertex.getWorkerActorId());
@@ -56,8 +53,6 @@ public final class WorkerRollbackRequest extends BaseWorkerCmd {
@Override
public String toString() {
return MoreObjects.toStringHelper(this)
.add("fromActorId", fromActorId)
.toString();
return MoreObjects.toStringHelper(this).add("fromActorId", fromActorId).toString();
}
}
@@ -5,37 +5,35 @@ import io.ray.streaming.runtime.core.graph.executiongraph.ExecutionGraph;
/**
* Graph manager is one of the important roles of JobMaster. It mainly focuses on graph management.
* <p>
* Such as:
*
* <p>Such as:
*
* <ol>
* <li>Build execution graph from job graph.</li>
* <li>Do modifications or operations on graph.</li>
* <li>Query vertex info from graph.</li>
* <li>Build execution graph from job graph.
* <li>Do modifications or operations on graph.
* <li>Query vertex info from graph.
* </ol>
* </p>
*/
public interface GraphManager {
/**
* Build execution graph from job graph.
*
* @param jobGraph logical plan of streaming job.
* @return physical plan of streaming job.
* @param jobGraph logical plan of streaming job. Returns physical plan of streaming job.
*/
ExecutionGraph buildExecutionGraph(JobGraph jobGraph);
/**
* Get job graph.
*
* @return the job graph.
* <p>Returns the job graph.
*/
JobGraph getJobGraph();
/**
* Get execution graph.
*
* @return the execution graph.
* <p>Returns the execution graph.
*/
ExecutionGraph getExecutionGraph();
}
@@ -35,9 +35,11 @@ public class GraphManagerImpl implements GraphManager {
ExecutionGraph executionGraph = setupStructure(jobGraph);
// set max parallelism
int maxParallelism = jobGraph.getJobVertices().stream()
.map(JobVertex::getParallelism)
.max(Integer::compareTo).get();
int maxParallelism =
jobGraph.getJobVertices().stream()
.map(JobVertex::getParallelism)
.max(Integer::compareTo)
.get();
executionGraph.setMaxParallelism(maxParallelism);
// set job config
@@ -57,37 +59,47 @@ public class GraphManagerImpl implements GraphManager {
long buildTime = executionGraph.getBuildTime();
for (JobVertex jobVertex : jobGraph.getJobVertices()) {
int jobVertexId = jobVertex.getVertexId();
exeJobVertexMap.put(jobVertexId,
exeJobVertexMap.put(
jobVertexId,
new ExecutionJobVertex(
jobVertex,
jobConfig,
executionGraph.getExecutionVertexIdGenerator(),
buildTime));
jobVertex, jobConfig, executionGraph.getExecutionVertexIdGenerator(), buildTime));
}
// for each job edge, connect all source exeVertices and target exeVertices
jobGraph.getJobEdges().forEach(jobEdge -> {
ExecutionJobVertex source = exeJobVertexMap.get(jobEdge.getSrcVertexId());
ExecutionJobVertex target = exeJobVertexMap.get(jobEdge.getTargetVertexId());
jobGraph
.getJobEdges()
.forEach(
jobEdge -> {
ExecutionJobVertex source = exeJobVertexMap.get(jobEdge.getSrcVertexId());
ExecutionJobVertex target = exeJobVertexMap.get(jobEdge.getTargetVertexId());
ExecutionJobEdge executionJobEdge = new ExecutionJobEdge(source, target, jobEdge);
ExecutionJobEdge executionJobEdge = new ExecutionJobEdge(source, target, jobEdge);
source.getOutputEdges().add(executionJobEdge);
target.getInputEdges().add(executionJobEdge);
source.getOutputEdges().add(executionJobEdge);
target.getInputEdges().add(executionJobEdge);
source.getExecutionVertices().forEach(sourceExeVertex -> {
target.getExecutionVertices().forEach(targetExeVertex -> {
// pre-process some mappings
executionVertexMap.put(targetExeVertex.getExecutionVertexId(), targetExeVertex);
executionVertexMap.put(sourceExeVertex.getExecutionVertexId(), sourceExeVertex);
// build execution edge
ExecutionEdge executionEdge =
new ExecutionEdge(sourceExeVertex, targetExeVertex, executionJobEdge);
sourceExeVertex.getOutputEdges().add(executionEdge);
targetExeVertex.getInputEdges().add(executionEdge);
});
});
});
source
.getExecutionVertices()
.forEach(
sourceExeVertex -> {
target
.getExecutionVertices()
.forEach(
targetExeVertex -> {
// pre-process some mappings
executionVertexMap.put(
targetExeVertex.getExecutionVertexId(), targetExeVertex);
executionVertexMap.put(
sourceExeVertex.getExecutionVertexId(), sourceExeVertex);
// build execution edge
ExecutionEdge executionEdge =
new ExecutionEdge(
sourceExeVertex, targetExeVertex, executionJobEdge);
sourceExeVertex.getOutputEdges().add(executionEdge);
targetExeVertex.getInputEdges().add(executionEdge);
});
});
});
// set execution job vertex into execution graph
executionGraph.setExecutionJobVertexMap(exeJobVertexMap);
@@ -115,5 +127,4 @@ public class GraphManagerImpl implements GraphManager {
public ExecutionGraph getExecutionGraph() {
// The graph is not stored on this manager; it is read from the runtime context on each call.
return runtimeContext.getExecutionGraph();
}
}
@@ -5,9 +5,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Cluster resource allocation view, used to statically view cluster resource information.
*/
/** Cluster resource allocation view, used to statically view cluster resource information. */
public class ResourceAssignmentView extends HashMap<ContainerId, List<Integer>> {
public static ResourceAssignmentView of(Map<ContainerId, List<Integer>> assignmentView) {
@@ -4,16 +4,13 @@ import com.google.common.collect.ImmutableList;
import io.ray.streaming.runtime.core.resource.Container;
import io.ray.streaming.runtime.master.resourcemanager.strategy.ResourceAssignStrategy;
/**
* ResourceManager(RM) is responsible for resource de-/allocation and monitoring ray cluster.
*/
/** ResourceManager(RM) is responsible for resource de-/allocation and monitoring ray cluster. */
public interface ResourceManager extends ResourceAssignStrategy {
/**
* Get registered containers, the container list is read-only.
*
* @return the registered container list
* <p>Returns the registered container list
*/
ImmutableList<Container> getRegisteredContainers();
}
}
@@ -28,32 +28,21 @@ public class ResourceManagerImpl implements ResourceManager {
private static final Logger LOG = LoggerFactory.getLogger(ResourceManagerImpl.class);
//Container used tag
// Container used tag
private static final String CONTAINER_ENGAGED_KEY = "CONTAINER_ENGAGED_KEY";
/**
* Resource description information.
*/
/** Resource description information. */
private final Resources resources;
/**
* Timing resource updating thread
*/
private final ScheduledExecutorService resourceUpdater = new ScheduledThreadPoolExecutor(1,
new ThreadFactoryBuilder().setNameFormat("resource-update-thread").build());
/**
* Job runtime context.
*/
/** Timing resource updating thread */
private final ScheduledExecutorService resourceUpdater =
new ScheduledThreadPoolExecutor(
1, new ThreadFactoryBuilder().setNameFormat("resource-update-thread").build());
/** Job runtime context. */
private JobMasterRuntimeContext runtimeContext;
/**
* Resource related configuration.
*/
/** Resource related configuration. */
private ResourceConfig resourceConfig;
/**
* Slot assign strategy.
*/
/** Slot assign strategy. */
private ResourceAssignStrategy resourceAssignStrategy;
/**
* Customized actor number for each container
*/
/** Customized actor number for each container */
private int actorNumPerContainer;
public ResourceManagerImpl(JobMasterRuntimeContext runtimeContext) {
@@ -62,19 +51,19 @@ public class ResourceManagerImpl implements ResourceManager {
this.resourceConfig = masterConfig.resourceConfig;
this.resources = new Resources();
LOG.info("ResourceManagerImpl begin init, conf is {}, resources are {}.",
resourceConfig, resources);
LOG.info(
"ResourceManagerImpl begin init, conf is {}, resources are {}.", resourceConfig, resources);
// Init custom resource configurations
this.actorNumPerContainer = resourceConfig.actorNumPerContainer();
ResourceAssignStrategyType resourceAssignStrategyType =
ResourceAssignStrategyType.PIPELINE_FIRST_STRATEGY;
this.resourceAssignStrategy = ResourceAssignStrategyFactory.getStrategy(
resourceAssignStrategyType);
this.resourceAssignStrategy =
ResourceAssignStrategyFactory.getStrategy(resourceAssignStrategyType);
LOG.info("Slot assign strategy: {}.", resourceAssignStrategy.getName());
//Init resource
// Init resource
initResource();
checkAndUpdateResourcePeriodically();
@@ -84,8 +73,7 @@ public class ResourceManagerImpl implements ResourceManager {
@Override
public ResourceAssignmentView assignResource(
List<Container> containers,
ExecutionGraph executionGraph) {
List<Container> containers, ExecutionGraph executionGraph) {
return resourceAssignStrategy.assignResource(containers, executionGraph);
}
@@ -105,17 +93,22 @@ public class ResourceManagerImpl implements ResourceManager {
* system.
*/
private void checkAndUpdateResource() {
//Get add&del nodes(node -> container)
// Get add&del nodes(node -> container)
Map<UniqueId, NodeInfo> latestNodeInfos = RayUtils.getAliveNodeInfoMap();
List<UniqueId> addNodes = latestNodeInfos.keySet().stream()
.filter(this::isAddedNode).collect(Collectors.toList());
List<UniqueId> addNodes =
latestNodeInfos.keySet().stream().filter(this::isAddedNode).collect(Collectors.toList());
List<UniqueId> deleteNodes = resources.getRegisteredContainerMap().keySet().stream()
.filter(nodeId -> !latestNodeInfos.containsKey(nodeId))
.collect(Collectors.toList());
LOG.info("Latest node infos: {}, current containers: {}, add nodes: {}, delete nodes: {}.",
latestNodeInfos, resources.getRegisteredContainers(), addNodes, deleteNodes);
List<UniqueId> deleteNodes =
resources.getRegisteredContainerMap().keySet().stream()
.filter(nodeId -> !latestNodeInfos.containsKey(nodeId))
.collect(Collectors.toList());
LOG.info(
"Latest node infos: {}, current containers: {}, add nodes: {}, delete nodes: {}.",
latestNodeInfos,
resources.getRegisteredContainers(),
addNodes,
deleteNodes);
if (!addNodes.isEmpty() || !deleteNodes.isEmpty()) {
LOG.info("Latest node infos from GCS: {}", latestNodeInfos);
@@ -126,8 +119,8 @@ public class ResourceManagerImpl implements ResourceManager {
unregisterDeletedContainer(deleteNodes);
// register containers
registerNewContainers(addNodes.stream().map(latestNodeInfos::get)
.collect(Collectors.toList()));
registerNewContainers(
addNodes.stream().map(latestNodeInfos::get).collect(Collectors.toList()));
}
}
@@ -152,14 +145,13 @@ public class ResourceManagerImpl implements ResourceManager {
// failover case: container has already allocated actors
double availableCapacity = actorNumPerContainer - container.getAllocatedActorNum();
//Create ray resource.
// Create ray resource.
Ray.setResource(container.getNodeId(), container.getName(), availableCapacity);
//Mark container is already registered.
// Mark container is already registered.
Ray.setResource(container.getNodeId(), CONTAINER_ENGAGED_KEY, 1);
// update container's available dynamic resources
container.getAvailableResources()
.put(container.getName(), availableCapacity);
container.getAvailableResources().put(container.getName(), availableCapacity);
// update register container list
resources.registerContainer(container);
@@ -187,5 +179,4 @@ public class ResourceManagerImpl implements ResourceManager {
/**
 * Checks whether the given node id is absent from the registered container map.
 *
 * @param uniqueId id looked up among the keys of {@code resources.getRegisteredContainerMap()}
 * @return true when the map has no entry for that key, false otherwise
 */
private boolean isAddedNode(UniqueId uniqueId) {
return !resources.getRegisteredContainerMap().containsKey(uniqueId);
}
}
@@ -5,19 +5,18 @@ import io.ray.streaming.runtime.core.resource.ContainerId;
import java.util.List;
import java.util.Map;
/**
* ViewBuilder describes current cluster's resource allocation detail information
*/
/** ViewBuilder describes current cluster's resource allocation detail information */
public class ViewBuilder {
// Default constructor for serialization.
public ViewBuilder() {
}
public ViewBuilder() {}
public static ResourceAssignmentView buildResourceAssignmentView(List<Container> containers) {
Map<ContainerId, List<Integer>> assignmentView =
containers.stream().collect(java.util.stream.Collectors.toMap(Container::getId,
Container::getExecutionVertexIds));
containers.stream()
.collect(
java.util.stream.Collectors.toMap(
Container::getId, Container::getExecutionVertexIds));
return ResourceAssignmentView.of(assignmentView);
}
@@ -6,23 +6,17 @@ import io.ray.streaming.runtime.core.resource.Container;
import io.ray.streaming.runtime.master.resourcemanager.ResourceAssignmentView;
import java.util.List;
/**
* The ResourceAssignStrategy responsible assign {@link Container} to {@link ExecutionVertex}.
*/
/** The ResourceAssignStrategy responsible assign {@link Container} to {@link ExecutionVertex}. */
public interface ResourceAssignStrategy {
/**
* Assign {@link Container} for {@link ExecutionVertex}
*
* @param containers registered container
* @param executionGraph execution graph
* @return allocating view
* @param executionGraph execution graph Returns allocating view
*/
ResourceAssignmentView assignResource(List<Container> containers, ExecutionGraph executionGraph);
/**
* Get container assign strategy name
*/
/** Get container assign strategy name */
String getName();
}
@@ -22,6 +22,7 @@ import org.slf4j.LoggerFactory;
* balanced and controllable scheduling. Assume that we have 2 containers and have a DAG graph
* composed of a source node with parallelism of 2 and a sink node with parallelism of 2, the
* structure will be like:
*
* <pre>
* container_0
* |- source_1
@@ -41,24 +42,23 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy {
* Assign resource to each execution vertex in the given execution graph.
*
* @param containers registered containers
* @param executionGraph execution graph
* @return allocating map, key is container ID, value is list of vertextId, and contains vertices
* @param executionGraph execution graph Returns allocating map, key is container ID, value is
* list of vertextId, and contains vertices
*/
@Override
public ResourceAssignmentView assignResource(
List<Container> containers,
ExecutionGraph executionGraph) {
List<Container> containers, ExecutionGraph executionGraph) {
Map<Integer, ExecutionJobVertex> vertices = executionGraph.getExecutionJobVertexMap();
Map<Integer, Integer> vertexRemainingNum = new HashMap<>();
vertices.forEach((k, v) -> {
int size = v.getExecutionVertices().size();
vertexRemainingNum.put(k, size);
});
int totalExecutionVerticesNum = vertexRemainingNum.values().stream()
.mapToInt(Integer::intValue)
.sum();
vertices.forEach(
(k, v) -> {
int size = v.getExecutionVertices().size();
vertexRemainingNum.put(k, size);
});
int totalExecutionVerticesNum =
vertexRemainingNum.values().stream().mapToInt(Integer::intValue).sum();
int containerNum = containers.size();
int capacityPerContainer = Math.max(totalExecutionVerticesNum / containerNum, 1);
@@ -70,8 +70,11 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy {
enlargeCapacityThreshold = capacityPerContainer * containerNum;
LOG.info("Need to enlarge capacity per container, threshold: {}.", enlargeCapacityThreshold);
}
LOG.info("Total execution vertices num: {}, container num: {}, capacity per container: {}.",
totalExecutionVerticesNum, containerNum, capacityPerContainer);
LOG.info(
"Total execution vertices num: {}, container num: {}, capacity per container: {}.",
totalExecutionVerticesNum,
containerNum,
capacityPerContainer);
int maxParallelism = executionGraph.getMaxParallelism();
@@ -86,8 +89,10 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy {
ExecutionVertex executionVertex = exeVertices.get(i);
Map<String, Double> requiredResource = executionVertex.getResource();
if (requiredResource.containsKey(ResourceType.CPU.getValue())) {
LOG.info("Required resource contain {} value : {}, no limitation by default.",
ResourceType.CPU, requiredResource.get(ResourceType.CPU.getValue()));
LOG.info(
"Required resource contain {} value : {}, no limitation by default.",
ResourceType.CPU,
requiredResource.get(ResourceType.CPU.getValue()));
requiredResource.remove(ResourceType.CPU.getValue());
}
@@ -96,7 +101,8 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy {
targetContainer.allocateActor(executionVertex);
allocatedVertexCount++;
// Once allocatedVertexCount reaches threshold, we should enlarge capacity
if (!enlarged && enlargeCapacityThreshold > 0
if (!enlarged
&& enlargeCapacityThreshold > 0
&& allocatedVertexCount >= enlargeCapacityThreshold) {
updateContainerCapacity(containers, capacityPerContainer + 1);
enlarged = true;
@@ -127,12 +133,10 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy {
* Find a container which matches required resource
*
* @param requiredResource required resource
* @param containers registered containers
* @return container that matches the required resource
* @param containers registered containers Returns container that matches the required resource
*/
private Container findMatchedContainer(
Map<String, Double> requiredResource,
List<Container> containers) {
Map<String, Double> requiredResource, List<Container> containers) {
LOG.info("Check resource, required: {}.", requiredResource);
@@ -143,7 +147,8 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy {
forwardToNextContainer(containers);
if (checkedNum >= containers.size()) {
throw new ScheduleException(
String.format("No enough resource left, required resource: %s, available resource: %s.",
String.format(
"No enough resource left, required resource: %s, available resource: %s.",
requiredResource, containers));
}
}
@@ -154,8 +159,7 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy {
* Check if current container has enough resource
*
* @param requiredResource required resource
* @param container container
* @return true if matches, false else
* @param container container Returns true if matches, false else
*/
private boolean hasEnoughResource(Map<String, Double> requiredResource, Container container) {
LOG.info("Check resource for index: {}, container: {}", currentContainerIndex, container);
@@ -173,13 +177,19 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy {
for (Map.Entry<String, Double> entry : requiredResource.entrySet()) {
if (availableResource.containsKey(entry.getKey())) {
if (availableResource.get(entry.getKey()) < entry.getValue()) {
LOG.warn("No enough resource for container {}. required: {}, available: {}.",
container.getAddress(), requiredResource, availableResource);
LOG.warn(
"No enough resource for container {}. required: {}, available: {}.",
container.getAddress(),
requiredResource,
availableResource);
return false;
}
} else {
LOG.warn("No enough resource for container {}. required: {}, available: {}.",
container.getAddress(), requiredResource, availableResource);
LOG.warn(
"No enough resource for container {}. required: {}, available: {}.",
container.getAddress(),
requiredResource,
availableResource);
return false;
}
}
@@ -190,8 +200,7 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy {
/**
* Forward to next container
*
* @param containers registered container list
* @return next container in the list
* @param containers registered container list Returns next container in the list
*/
private Container forwardToNextContainer(List<Container> containers) {
this.currentContainerIndex = (this.currentContainerIndex + 1) % containers.size();
@@ -201,8 +210,7 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy {
/**
* Get current container
*
* @param containers registered container
* @return current container to allocate actor
* @param containers registered container Returns current container to allocate actor
*/
private Container getCurrentContainer(List<Container> containers) {
return containers.get(currentContainerIndex);
@@ -2,16 +2,13 @@ package io.ray.streaming.runtime.master.scheduler;
import io.ray.streaming.runtime.core.graph.executiongraph.ExecutionGraph;
/**
* Job scheduler is used to do the scheduling in JobMaster.
*/
/** Job scheduler is used to do the scheduling in JobMaster. */
public interface JobScheduler {
/**
* Schedule streaming job using the physical plan.
*
* @param executionGraph physical plan
* @return scheduling result
* @param executionGraph physical plan Returns scheduling result
*/
boolean scheduleJob(ExecutionGraph executionGraph);
}
@@ -17,9 +17,7 @@ import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Job scheduler implementation.
*/
/** Job scheduler implementation. */
public class JobSchedulerImpl implements JobScheduler {
private static final Logger LOG = LoggerFactory.getLogger(JobSchedulerImpl.class);
@@ -96,16 +94,15 @@ public class JobSchedulerImpl implements JobScheduler {
/**
* Create JobWorker actors according to the physical plan.
*
* @param executionGraph physical plan
* @return actor creation result
* @param executionGraph physical plan Returns actor creation result
*/
public boolean createWorkers(ExecutionGraph executionGraph) {
LOG.info("Begin creating workers.");
long startTs = System.currentTimeMillis();
// create JobWorker actors
boolean createResult = workerLifecycleController
.createWorkers(executionGraph.getAllAddedExecutionVertices());
boolean createResult =
workerLifecycleController.createWorkers(executionGraph.getAllAddedExecutionVertices());
if (createResult) {
LOG.info("Finished creating workers. Cost {} ms.", System.currentTimeMillis() - startTs);
@@ -124,8 +121,10 @@ public class JobSchedulerImpl implements JobScheduler {
protected boolean initWorkers(Map<ExecutionVertex, JobWorkerContext> vertexToContextMap) {
boolean result;
try {
result = workerLifecycleController.initWorkers(vertexToContextMap,
jobConfig.masterConfig.schedulerConfig.workerInitiationWaitTimeoutMs());
result =
workerLifecycleController.initWorkers(
vertexToContextMap,
jobConfig.masterConfig.schedulerConfig.workerInitiationWaitTimeoutMs());
} catch (Exception e) {
LOG.error("Failed to initiate workers.", e);
return false;
@@ -133,15 +132,15 @@ public class JobSchedulerImpl implements JobScheduler {
return result;
}
/**
* Start JobWorkers according to the physical plan.
*/
/** Start JobWorkers according to the physical plan. */
public boolean startWorkers(ExecutionGraph executionGraph, long checkpointId) {
boolean result;
try {
result = workerLifecycleController.startWorkers(
executionGraph, checkpointId,
jobConfig.masterConfig.schedulerConfig.workerStartingWaitTimeoutMs());
result =
workerLifecycleController.startWorkers(
executionGraph,
checkpointId,
jobConfig.masterConfig.schedulerConfig.workerStartingWaitTimeoutMs());
} catch (Exception e) {
LOG.error("Failed to start workers.", e);
return false;
@@ -152,8 +151,7 @@ public class JobSchedulerImpl implements JobScheduler {
/**
* Build workers context.
*
* @param executionGraph execution graph
* @return vertex to worker context map
* @param executionGraph execution graph Returns vertex to worker context map
*/
protected Map<ExecutionVertex, JobWorkerContext> buildWorkersContext(
ExecutionGraph executionGraph) {
@@ -161,22 +159,21 @@ public class JobSchedulerImpl implements JobScheduler {
// build workers' context
Map<ExecutionVertex, JobWorkerContext> vertexToContextMap = new HashMap<>();
executionGraph.getAllExecutionVertices().forEach(vertex -> {
JobWorkerContext context = buildJobWorkerContext(vertex, masterActor);
vertexToContextMap.put(vertex, context);
});
executionGraph
.getAllExecutionVertices()
.forEach(
vertex -> {
JobWorkerContext context = buildJobWorkerContext(vertex, masterActor);
vertexToContextMap.put(vertex, context);
});
return vertexToContextMap;
}
private JobWorkerContext buildJobWorkerContext(
ExecutionVertex executionVertex,
ActorHandle<JobMaster> masterActor) {
ExecutionVertex executionVertex, ActorHandle<JobMaster> masterActor) {
// create java worker context
JobWorkerContext context = new JobWorkerContext(
masterActor,
executionVertex
);
JobWorkerContext context = new JobWorkerContext(masterActor, executionVertex);
return context;
}
@@ -200,5 +197,4 @@ public class JobSchedulerImpl implements JobScheduler {
private void initMaster() {
// Delegates master initialization to JobMaster#init.
// NOTE(review): the meaning of the `false` argument is defined by JobMaster#init, which is
// not visible here — confirm before changing it.
jobMaster.init(false);
}
}
@@ -19,8 +19,7 @@ public class ScheduleException extends RuntimeException {
}
protected ScheduleException(
String message, Throwable cause, boolean enableSuppression,
boolean writableStackTrace) {
String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) {
super(message, cause, enableSuppression, writableStackTrace);
}
}
@@ -24,9 +24,7 @@ import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Worker lifecycle controller is used to control JobWorker's creation, initiation and so on.
*/
/** Worker lifecycle controller is used to control JobWorker's creation, initiation and so on. */
public class WorkerLifecycleController {
private static final Logger LOG = LoggerFactory.getLogger(WorkerLifecycleController.class);
@@ -38,30 +36,34 @@ public class WorkerLifecycleController {
/**
* Create JobWorker actor according to the execution vertex.
*
* @param executionVertex target execution vertex
* @return creation result
* @param executionVertex target execution vertex Returns creation result
*/
private boolean createWorker(ExecutionVertex executionVertex) {
LOG.info("Start to create worker actor for vertex: {} with resource: {}, workeConfig: {}.",
executionVertex.getExecutionVertexName(), executionVertex.getResource(),
LOG.info(
"Start to create worker actor for vertex: {} with resource: {}, workeConfig: {}.",
executionVertex.getExecutionVertexName(),
executionVertex.getResource(),
executionVertex.getWorkerConfig());
Language language = executionVertex.getLanguage();
BaseActorHandle actor;
if (Language.JAVA == language) {
actor = Ray.actor(JobWorker::new, executionVertex)
.setResources(executionVertex.getResource())
.setMaxRestarts(-1)
.remote();
actor =
Ray.actor(JobWorker::new, executionVertex)
.setResources(executionVertex.getResource())
.setMaxRestarts(-1)
.remote();
} else {
RemoteCall.ExecutionVertexContext.ExecutionVertex vertexPb
= new GraphPbBuilder().buildVertex(executionVertex);
actor = Ray.actor(
PyActorClass.of("ray.streaming.runtime.worker", "JobWorker"), vertexPb.toByteArray())
.setResources(executionVertex.getResource())
.setMaxRestarts(-1)
.remote();
RemoteCall.ExecutionVertexContext.ExecutionVertex vertexPb =
new GraphPbBuilder().buildVertex(executionVertex);
actor =
Ray.actor(
PyActorClass.of("ray.streaming.runtime.worker", "JobWorker"),
vertexPb.toByteArray())
.setResources(executionVertex.getResource())
.setMaxRestarts(-1)
.remote();
}
if (null == actor) {
@@ -71,8 +73,10 @@ public class WorkerLifecycleController {
executionVertex.setWorkerActor(actor);
LOG.info("Worker actor created, actor: {}, vertex: {}.",
executionVertex.getWorkerActorId(), executionVertex.getExecutionVertexName());
LOG.info(
"Worker actor created, actor: {}, vertex: {}.",
executionVertex.getWorkerActorId(),
executionVertex.getExecutionVertexName());
return true;
}
@@ -80,8 +84,7 @@ public class WorkerLifecycleController {
* Using context to init JobWorker.
*
* @param vertexToContextMap target JobWorker actor
* @param timeout timeout for waiting, unit: ms
* @return initiation result
* @param timeout timeout for waiting, unit: ms Returns initiation result
*/
public boolean initWorkers(
Map<ExecutionVertex, JobWorkerContext> vertexToContextMap, int timeout) {
@@ -89,11 +92,15 @@ public class WorkerLifecycleController {
long startTime = System.currentTimeMillis();
Map<ObjectRef<Boolean>, ActorId> rayObjects = new HashMap<>();
vertexToContextMap.entrySet().forEach((entry -> {
ExecutionVertex vertex = entry.getKey();
rayObjects.put(RemoteCallWorker.initWorker(vertex.getWorkerActor(), entry.getValue()),
vertex.getWorkerActorId());
}));
vertexToContextMap
.entrySet()
.forEach(
(entry -> {
ExecutionVertex vertex = entry.getKey();
rayObjects.put(
RemoteCallWorker.initWorker(vertex.getWorkerActor(), entry.getValue()),
vertex.getWorkerActorId());
}));
List<ObjectRef<Boolean>> objectRefList = new ArrayList<>(rayObjects.keySet());
@@ -113,8 +120,7 @@ public class WorkerLifecycleController {
* Start JobWorkers to run task.
*
* @param executionGraph physical plan
* @param timeout timeout for waiting, unit: ms
* @return starting result
* @param timeout timeout for waiting, unit: ms Returns starting result
*/
public boolean startWorkers(ExecutionGraph executionGraph, long lastCheckpointId, int timeout) {
LOG.info("Begin starting workers.");
@@ -122,11 +128,13 @@ public class WorkerLifecycleController {
List<ObjectRef<Object>> objectRefs = new ArrayList<>();
// start source actors 1st
executionGraph.getSourceActors()
executionGraph
.getSourceActors()
.forEach(actor -> objectRefs.add(RemoteCallWorker.rollback(actor, lastCheckpointId)));
// then start non-source actors
executionGraph.getNonSourceActors()
executionGraph
.getNonSourceActors()
.forEach(actor -> objectRefs.add(RemoteCallWorker.rollback(actor, lastCheckpointId)));
WaitResult<Object> result = Ray.wait(objectRefs, objectRefs.size(), timeout);
@@ -142,8 +150,7 @@ public class WorkerLifecycleController {
/**
* Stop and destroy JobWorkers' actor.
*
* @param executionVertices target vertices
* @return destroy result
* @param executionVertices target vertices Returns destroy result
*/
public boolean destroyWorkers(List<ExecutionVertex> executionVertices) {
return asyncBatchExecute(this::destroyWorker, executionVertices);
@@ -151,14 +158,18 @@ public class WorkerLifecycleController {
private boolean destroyWorker(ExecutionVertex executionVertex) {
BaseActorHandle rayActor = executionVertex.getWorkerActor();
LOG.info("Begin destroying worker[vertex={}, actor={}].",
executionVertex.getExecutionVertexName(), rayActor.getId());
LOG.info(
"Begin destroying worker[vertex={}, actor={}].",
executionVertex.getExecutionVertexName(),
rayActor.getId());
boolean destroyResult = RemoteCallWorker.shutdownWithoutReconstruction(rayActor);
if (!destroyResult) {
LOG.error("Failed to destroy JobWorker[{}]'s actor: {}.",
executionVertex.getExecutionVertexName(), rayActor);
LOG.error(
"Failed to destroy JobWorker[{}]'s actor: {}.",
executionVertex.getExecutionVertexName(),
rayActor);
return false;
}
@@ -172,18 +183,22 @@ public class WorkerLifecycleController {
* @param operation the function to be executed
*/
private boolean asyncBatchExecute(
Function<ExecutionVertex, Boolean> operation,
List<ExecutionVertex> executionVertices) {
Function<ExecutionVertex, Boolean> operation, List<ExecutionVertex> executionVertices) {
final Object asyncContext = Ray.getAsyncContext();
List<CompletableFuture<Boolean>> futureResults =
executionVertices.stream().map(vertex -> CompletableFuture.supplyAsync(() -> {
Ray.setAsyncContext(asyncContext);
return operation.apply(vertex);
})).collect(Collectors.toList());
executionVertices.stream()
.map(
vertex ->
CompletableFuture.supplyAsync(
() -> {
Ray.setAsyncContext(asyncContext);
return operation.apply(vertex);
}))
.collect(Collectors.toList());
List<Boolean> succeeded = futureResults.stream().map(CompletableFuture::join)
.collect(Collectors.toList());
List<Boolean> succeeded =
futureResults.stream().map(CompletableFuture::join).collect(Collectors.toList());
if (succeeded.stream().anyMatch(x -> !x)) {
LOG.error("Not all futures return true, check ResourceManager'log the detail.");
@@ -191,5 +206,4 @@ public class WorkerLifecycleController {
}
return true;
}
}
@@ -10,8 +10,7 @@ public class CallResult<T> implements Serializable {
private int resultCode;
private String resultMsg;
public CallResult() {
}
public CallResult() {}
public CallResult(boolean success, int resultCode, String resultMsg, T resultObj) {
this.success = success;
@@ -95,9 +94,7 @@ public class CallResult<T> implements Serializable {
}
public enum CallResultEnum implements Serializable {
/**
* call result enum
*/
/** call result enum */
SUCCESS(0, "SUCCESS"),
FAILED(1, "FAILED"),
SKIPPED(2, "SKIPPED");
@@ -33,33 +33,25 @@ public class GraphPbBuilder {
// build upstream vertices
List<ExecutionVertex> upstreamVertices = executionVertex.getInputVertices();
List<RemoteCall.ExecutionVertexContext.ExecutionVertex> upstreamVertexPbs =
upstreamVertices.stream()
.map(this::buildVertex)
.collect(Collectors.toList());
upstreamVertices.stream().map(this::buildVertex).collect(Collectors.toList());
builder.addAllUpstreamExecutionVertices(upstreamVertexPbs);
// build downstream vertices
List<ExecutionVertex> downstreamVertices = executionVertex.getOutputVertices();
List<RemoteCall.ExecutionVertexContext.ExecutionVertex> downstreamVertexPbs =
downstreamVertices.stream()
.map(this::buildVertex)
.collect(Collectors.toList());
downstreamVertices.stream().map(this::buildVertex).collect(Collectors.toList());
builder.addAllDownstreamExecutionVertices(downstreamVertexPbs);
// build input edges
List<ExecutionEdge> inputEdges = executionVertex.getInputEdges();
List<RemoteCall.ExecutionVertexContext.ExecutionEdge> inputEdgesPbs =
inputEdges.stream()
.map(this::buildEdge)
.collect(Collectors.toList());
inputEdges.stream().map(this::buildEdge).collect(Collectors.toList());
builder.addAllInputExecutionEdges(inputEdgesPbs);
// build output edges
List<ExecutionEdge> outputEdges = executionVertex.getOutputEdges();
List<RemoteCall.ExecutionVertexContext.ExecutionEdge> outputEdgesPbs =
outputEdges.stream()
.map(this::buildEdge)
.collect(Collectors.toList());
outputEdges.stream().map(this::buildEdge).collect(Collectors.toList());
builder.addAllOutputExecutionEdges(outputEdgesPbs);
return builder.build();
@@ -76,13 +68,11 @@ public class GraphPbBuilder {
executionVertexBuilder.setExecutionVertexIndex(executionVertex.getExecutionVertexIndex());
executionVertexBuilder.setParallelism(executionVertex.getParallelism());
executionVertexBuilder.setOperator(
ByteString.copyFrom(
serializeOperator(executionVertex.getStreamOperator())));
ByteString.copyFrom(serializeOperator(executionVertex.getStreamOperator())));
executionVertexBuilder.setChained(isPythonChainedOperator(executionVertex.getStreamOperator()));
if (executionVertex.getWorkerActor() != null) {
executionVertexBuilder.setWorkerActor(
ByteString.copyFrom(
((NativeActorHandle) (executionVertex.getWorkerActor())).toBytes()));
ByteString.copyFrom(((NativeActorHandle) (executionVertex.getWorkerActor())).toBytes()));
}
executionVertexBuilder.setContainerId(executionVertex.getContainerId().toString());
executionVertexBuilder.setBuildTime(executionVertex.getBuildTime());
@@ -112,11 +102,11 @@ public class GraphPbBuilder {
return serializePythonChainedOperator((ChainedPythonOperator) operator);
} else {
PythonOperator pythonOperator = (PythonOperator) operator;
return serializer.serialize(Arrays.asList(
serializeFunction(pythonOperator.getFunction()),
pythonOperator.getModuleName(),
pythonOperator.getClassName()
));
return serializer.serialize(
Arrays.asList(
serializeFunction(pythonOperator.getFunction()),
pythonOperator.getModuleName(),
pythonOperator.getClassName()));
}
} else {
return new byte[0];
@@ -128,24 +118,19 @@ public class GraphPbBuilder {
}
private byte[] serializePythonChainedOperator(ChainedPythonOperator operator) {
List<byte[]> serializedOperators = operator.getOperators().stream()
.map(this::serializeOperator)
.collect(Collectors.toList());
return serializer.serialize(Arrays.asList(
serializedOperators,
operator.getConfigs()
));
List<byte[]> serializedOperators =
operator.getOperators().stream().map(this::serializeOperator).collect(Collectors.toList());
return serializer.serialize(Arrays.asList(serializedOperators, operator.getConfigs()));
}
private byte[] serializeFunction(Function function) {
if (function instanceof PythonFunction) {
PythonFunction pyFunc = (PythonFunction) function;
// function_bytes, module_name, function_name, function_interface
return serializer.serialize(Arrays.asList(
pyFunc.getFunction(), pyFunc.getModuleName(),
pyFunc.getFunctionName(), pyFunc.getFunctionInterface()
));
return serializer.serialize(
Arrays.asList(
pyFunc.getFunction(), pyFunc.getModuleName(),
pyFunc.getFunctionName(), pyFunc.getFunctionInterface()));
} else {
return new byte[0];
}
@@ -155,13 +140,13 @@ public class GraphPbBuilder {
if (partition instanceof PythonPartition) {
PythonPartition pythonPartition = (PythonPartition) partition;
// partition_bytes, module_name, function_name
return serializer.serialize(Arrays.asList(
pythonPartition.getPartition(), pythonPartition.getModuleName(),
pythonPartition.getFunctionName()
));
return serializer.serialize(
Arrays.asList(
pythonPartition.getPartition(),
pythonPartition.getModuleName(),
pythonPartition.getFunctionName()));
} else {
return new byte[0];
}
}
}
@@ -72,8 +72,8 @@ public class PythonGateway {
public byte[] createPythonStreamSource(byte[] pySourceFunc) {
Preconditions.checkNotNull(streamingContext);
try {
PythonStreamSource pythonStreamSource = PythonStreamSource.from(
streamingContext, new PythonFunction(pySourceFunc));
PythonStreamSource pythonStreamSource =
PythonStreamSource.from(streamingContext, new PythonFunction(pySourceFunc));
referenceMap.put(getReferenceId(pythonStreamSource), pythonStreamSource);
return serializer.serialize(getReferenceId(pythonStreamSource));
} catch (Exception e) {
@@ -104,8 +104,7 @@ public class PythonGateway {
List<Object> streams = (List<Object>) serializer.deserialize(paramsBytes);
streams = processParameters(streams);
LOG.info("Call union with streams {}", streams);
Preconditions.checkArgument(streams.size() >= 2,
"Union needs at least two streams");
Preconditions.checkArgument(streams.size() >= 2, "Union needs at least two streams");
Stream unionStream;
Stream stream1 = (Stream) streams.get(0);
List otherStreams = streams.subList(1, streams.size());
@@ -128,8 +127,8 @@ public class PythonGateway {
String className = (String) params.get(0);
String funcName = (String) params.get(1);
Class<?> clz = Class.forName(className, true, this.getClass().getClassLoader());
Class[] paramsTypes = params.subList(2, params.size()).stream()
.map(Object::getClass).toArray(Class[]::new);
Class[] paramsTypes =
params.subList(2, params.size()).stream().map(Object::getClass).toArray(Class[]::new);
Method method = findMethod(clz, funcName, paramsTypes);
Object result = method.invoke(null, params.subList(2, params.size()).toArray());
return serialize(result);
@@ -146,8 +145,8 @@ public class PythonGateway {
Object obj = params.get(0);
String methodName = (String) params.get(1);
Class<?> clz = obj.getClass();
Class[] paramsTypes = params.subList(2, params.size()).stream()
.map(Object::getClass).toArray(Class[]::new);
Class[] paramsTypes =
params.subList(2, params.size()).stream().map(Object::getClass).toArray(Class[]::new);
Method method = findMethod(clz, methodName, paramsTypes);
Object result = method.invoke(obj, params.subList(2, params.size()).toArray());
return serialize(result);
@@ -162,31 +161,36 @@ public class PythonGateway {
return methods.get(0);
}
// Convert all params types to primitive types if it's boxed type
Class[] unwrappedTypes = Arrays.stream(paramsTypes)
.map((Function<Class, Class>) Primitives::unwrap)
.toArray(Class[]::new);
Optional<Method> any = methods.stream()
.filter(m -> {
boolean exactMatch =
Arrays.equals(m.getParameterTypes(), paramsTypes) ||
Arrays.equals(m.getParameterTypes(), unwrappedTypes);
if (exactMatch) {
return true;
} else if (paramsTypes.length == m.getParameterTypes().length) {
for (int i = 0; i < m.getParameterTypes().length; i++) {
Class<?> parameterType = m.getParameterTypes()[i];
if (!parameterType.isAssignableFrom(paramsTypes[i])) {
return false;
}
}
return true;
} else {
return false;
}
})
.findAny();
Preconditions.checkArgument(any.isPresent(),
String.format("Method %s with type %s doesn't exist on class %s",
Class[] unwrappedTypes =
Arrays.stream(paramsTypes)
.map((Function<Class, Class>) Primitives::unwrap)
.toArray(Class[]::new);
Optional<Method> any =
methods.stream()
.filter(
m -> {
boolean exactMatch =
Arrays.equals(m.getParameterTypes(), paramsTypes)
|| Arrays.equals(m.getParameterTypes(), unwrappedTypes);
if (exactMatch) {
return true;
} else if (paramsTypes.length == m.getParameterTypes().length) {
for (int i = 0; i < m.getParameterTypes().length; i++) {
Class<?> parameterType = m.getParameterTypes()[i];
if (!parameterType.isAssignableFrom(paramsTypes[i])) {
return false;
}
}
return true;
} else {
return false;
}
})
.findAny();
Preconditions.checkArgument(
any.isPresent(),
String.format(
"Method %s with type %s doesn't exist on class %s",
methodName, Arrays.toString(paramsTypes), cls));
return any.get();
}
@@ -214,8 +218,11 @@ public class PythonGateway {
}
private static boolean isBasic(Object value) {
return value == null || (value instanceof Boolean) || (value instanceof Number) ||
(value instanceof String) || (value instanceof byte[]);
return value == null
|| (value instanceof Boolean)
|| (value instanceof Number)
|| (value instanceof String)
|| (value instanceof byte[]);
}
public byte[] newInstance(byte[] classNameBytes) {
@@ -232,8 +239,7 @@ public class PythonGateway {
}
private List<Object> processParameters(List<Object> params) {
return params.stream().map(this::processParameter)
.collect(Collectors.toList());
return params.stream().map(this::processParameter).collect(Collectors.toList());
}
private Object processParameter(Object o) {
@@ -253,5 +259,4 @@ public class PythonGateway {
/**
 * Builds the reference id for an object: {@code REFERENCE_ID_PREFIX} followed by the object's
 * identity hash code.
 *
 * @param o the object to build an id for
 * @return the prefixed identity-hash string
 */
private String getReferenceId(Object o) {
  final int identityHash = System.identityHashCode(o);
  return REFERENCE_ID_PREFIX + identityHash;
}
}
@@ -45,9 +45,13 @@ public class PbResultParser {
callResult.setResultMsg(callResultPb.getResultMsg());
RemoteCall.QueueRecoverInfo recoverInfo = callResultPb.getResultObj();
Map<String, ChannelRecoverInfo.ChannelCreationStatus> creationStatusMap = new HashMap<>();
recoverInfo.getCreationStatusMap().forEach((k, v) -> {
creationStatusMap.put(k, ChannelRecoverInfo.ChannelCreationStatus.fromInt(v.getNumber()));
});
recoverInfo
.getCreationStatusMap()
.forEach(
(k, v) -> {
creationStatusMap.put(
k, ChannelRecoverInfo.ChannelCreationStatus.fromInt(v.getNumber()));
});
callResult.setResultObj(new ChannelRecoverInfo(creationStatusMap));
return callResult;
}
@@ -12,34 +12,39 @@ import io.ray.streaming.runtime.master.coordinator.command.WorkerRollbackRequest
public class RemoteCallMaster {
public static ObjectRef<byte[]> reportJobWorkerCommitAsync(
ActorHandle<JobMaster> actor,
WorkerCommitReport commitReport) {
RemoteCall.WorkerCommitReport commit = RemoteCall.WorkerCommitReport.newBuilder()
.setCommitCheckpointId(commitReport.commitCheckpointId)
.build();
ActorHandle<JobMaster> actor, WorkerCommitReport commitReport) {
RemoteCall.WorkerCommitReport commit =
RemoteCall.WorkerCommitReport.newBuilder()
.setCommitCheckpointId(commitReport.commitCheckpointId)
.build();
Any detail = Any.pack(commit);
RemoteCall.BaseWorkerCmd cmd = RemoteCall.BaseWorkerCmd.newBuilder()
.setActorId(ByteString.copyFrom(commitReport.fromActorId.getBytes()))
.setTimestamp(System.currentTimeMillis())
.setDetail(detail).build();
RemoteCall.BaseWorkerCmd cmd =
RemoteCall.BaseWorkerCmd.newBuilder()
.setActorId(ByteString.copyFrom(commitReport.fromActorId.getBytes()))
.setTimestamp(System.currentTimeMillis())
.setDetail(detail)
.build();
return actor.task(JobMaster::reportJobWorkerCommit, cmd.toByteArray()).remote();
}
public static Boolean requestJobWorkerRollback(
ActorHandle<JobMaster> actor,
WorkerRollbackRequest rollbackRequest) {
RemoteCall.WorkerRollbackRequest request = RemoteCall.WorkerRollbackRequest.newBuilder()
.setExceptionMsg(rollbackRequest.getRollbackExceptionMsg())
.setWorkerHostname(rollbackRequest.getHostname())
.setWorkerPid(rollbackRequest.getPid()).build();
ActorHandle<JobMaster> actor, WorkerRollbackRequest rollbackRequest) {
RemoteCall.WorkerRollbackRequest request =
RemoteCall.WorkerRollbackRequest.newBuilder()
.setExceptionMsg(rollbackRequest.getRollbackExceptionMsg())
.setWorkerHostname(rollbackRequest.getHostname())
.setWorkerPid(rollbackRequest.getPid())
.build();
Any detail = Any.pack(request);
RemoteCall.BaseWorkerCmd cmd = RemoteCall.BaseWorkerCmd.newBuilder()
.setActorId(ByteString.copyFrom(rollbackRequest.fromActorId.getBytes()))
.setTimestamp(System.currentTimeMillis())
.setDetail(detail).build();
ObjectRef<byte[]> ret = actor.task(
JobMaster::requestJobWorkerRollback, cmd.toByteArray()).remote();
RemoteCall.BaseWorkerCmd cmd =
RemoteCall.BaseWorkerCmd.newBuilder()
.setActorId(ByteString.copyFrom(rollbackRequest.fromActorId.getBytes()))
.setTimestamp(System.currentTimeMillis())
.setDetail(detail)
.build();
ObjectRef<byte[]> ret =
actor.task(JobMaster::requestJobWorkerRollback, cmd.toByteArray()).remote();
byte[] res = ret.get();
return PbResultParser.parseBoolResult(res);
}
@@ -16,9 +16,7 @@ import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Ray call worker. It takes the communication job from {@link JobMaster} to {@link JobWorker}.
*/
/** Ray call worker. It takes the communication job from {@link JobMaster} to {@link JobWorker}. */
public class RemoteCallWorker {
private static final Logger LOG = LoggerFactory.getLogger(RemoteCallWorker.class);
@@ -27,8 +25,7 @@ public class RemoteCallWorker {
* Call JobWorker actor to init.
*
* @param actor target JobWorker actor
* @param context JobWorker's context
* @return init result
*/
public static ObjectRef<Boolean> initWorker(BaseActorHandle actor, JobWorkerContext context) {
LOG.info("Call worker to initiate, actor: {}, context: {}.", actor.getId(), context);
@@ -36,8 +33,10 @@ public class RemoteCallWorker {
// python
if (actor instanceof PyActorHandle) {
result = ((PyActorHandle) actor).task(PyActorMethod.of("init", Boolean.class),
context.getPythonWorkerContextBytes()).remote();
result =
((PyActorHandle) actor)
.task(PyActorMethod.of("init", Boolean.class), context.getPythonWorkerContextBytes())
.remote();
} else {
// java
result = ((ActorHandle<JobWorker>) actor).task(JobWorker::init, context).remote();
@@ -51,8 +50,7 @@ public class RemoteCallWorker {
* Call JobWorker actor to start.
*
* @param actor target JobWorker actor
* @param checkpointId checkpoint ID to roll back to
* @return start result
*/
public static ObjectRef rollback(BaseActorHandle actor, final Long checkpointId) {
LOG.info("Call worker to start, actor: {}.", actor.getId());
@@ -60,17 +58,18 @@ public class RemoteCallWorker {
// python
if (actor instanceof PyActorHandle) {
RemoteCall.CheckpointId checkpointIdPb = RemoteCall.CheckpointId.newBuilder()
.setCheckpointId(checkpointId)
.build();
result = ((PyActorHandle) actor)
.task(PyActorMethod.of("rollback"),
checkpointIdPb.toByteArray()
).remote();
RemoteCall.CheckpointId checkpointIdPb =
RemoteCall.CheckpointId.newBuilder().setCheckpointId(checkpointId).build();
result =
((PyActorHandle) actor)
.task(PyActorMethod.of("rollback"), checkpointIdPb.toByteArray())
.remote();
} else {
// java
result = ((ActorHandle<JobWorker>) actor)
.task(JobWorker::rollback, checkpointId, System.currentTimeMillis()).remote();
result =
((ActorHandle<JobWorker>) actor)
.task(JobWorker::rollback, checkpointId, System.currentTimeMillis())
.remote();
}
LOG.info("Finished calling worker to start.");
@@ -80,12 +79,10 @@ public class RemoteCallWorker {
/**
* Call JobWorker actor to destroy without reconstruction.
*
* @param actor target JobWorker actor
* @return destroy result
*/
public static Boolean shutdownWithoutReconstruction(BaseActorHandle actor) {
LOG.info("Call worker to shutdown without reconstruction, actor is {}.",
actor.getId());
LOG.info("Call worker to shutdown without reconstruction, actor is {}.", actor.getId());
Boolean result = false;
// TODO (datayjz): ray call worker to destroy
@@ -98,26 +95,34 @@ public class RemoteCallWorker {
// python
if (actor instanceof PyActorHandle) {
RemoteCall.Barrier barrierPb = RemoteCall.Barrier.newBuilder().setId(barrierId).build();
return ((PyActorHandle) actor).task(
PyActorMethod.of("commit"), barrierPb.toByteArray()).remote();
return ((PyActorHandle) actor)
.task(PyActorMethod.of("commit"), barrierPb.toByteArray())
.remote();
} else {
// java
return ((ActorHandle<JobWorker>) actor).task(JobWorker::triggerCheckpoint, barrierId)
return ((ActorHandle<JobWorker>) actor)
.task(JobWorker::triggerCheckpoint, barrierId)
.remote();
}
}
public static void clearExpiredCheckpointParallel(
List<BaseActorHandle> actors, Long stateCheckpointId,
Long queueCheckpointId) {
List<BaseActorHandle> actors, Long stateCheckpointId, Long queueCheckpointId) {
if (LOG.isInfoEnabled()) {
LOG.info("Call worker clearExpiredCheckpoint, state checkpoint id is {}," +
" queue checkpoint id is {}.", stateCheckpointId, queueCheckpointId);
LOG.info(
"Call worker clearExpiredCheckpoint, state checkpoint id is {},"
+ " queue checkpoint id is {}.",
stateCheckpointId,
queueCheckpointId);
}
List<Object> result =
checkpointCompleteCommonCallTwoWay(actors, stateCheckpointId, queueCheckpointId,
"clear_expired_cp", JobWorker::clearExpiredCheckpoint);
checkpointCompleteCommonCallTwoWay(
actors,
stateCheckpointId,
queueCheckpointId,
"clear_expired_cp",
JobWorker::clearExpiredCheckpoint);
if (LOG.isInfoEnabled()) {
result.forEach(
@@ -126,60 +131,68 @@ public class RemoteCallWorker {
}
public static void notifyCheckpointTimeoutParallel(
List<BaseActorHandle> actors,
Long checkpointId) {
List<BaseActorHandle> actors, Long checkpointId) {
LOG.info("Call worker notifyCheckpointTimeoutParallel, checkpoint id is {}", checkpointId);
actors.forEach(actor -> {
if (actor instanceof PyActorHandle) {
RemoteCall.CheckpointId checkpointIdPb = RemoteCall.CheckpointId.newBuilder()
.setCheckpointId(checkpointId)
.build();
((PyActorHandle) actor).task(PyActorMethod.of("notify_checkpoint_timeout"),
checkpointIdPb.toByteArray()).remote();
} else {
((ActorHandle<JobWorker>) actor).task(JobWorker::notifyCheckpointTimeout, checkpointId)
.remote();
}
});
actors.forEach(
actor -> {
if (actor instanceof PyActorHandle) {
RemoteCall.CheckpointId checkpointIdPb =
RemoteCall.CheckpointId.newBuilder().setCheckpointId(checkpointId).build();
((PyActorHandle) actor)
.task(PyActorMethod.of("notify_checkpoint_timeout"), checkpointIdPb.toByteArray())
.remote();
} else {
((ActorHandle<JobWorker>) actor)
.task(JobWorker::notifyCheckpointTimeout, checkpointId)
.remote();
}
});
LOG.info("Finish call worker notifyCheckpointTimeoutParallel.");
}
private static List<Object> checkpointCompleteCommonCallTwoWay(
List<BaseActorHandle> actors, Long stateCheckpointId, Long queueCheckpointId,
String pyFuncName, RayFunc3<JobWorker, Long, Long, Boolean> rayFunc) {
List<BaseActorHandle> actors,
Long stateCheckpointId,
Long queueCheckpointId,
String pyFuncName,
RayFunc3<JobWorker, Long, Long, Boolean> rayFunc) {
List<ObjectRef<Object>> waitFor =
checkpointCompleteCommonCall(actors, stateCheckpointId, queueCheckpointId,
pyFuncName, rayFunc);
checkpointCompleteCommonCall(
actors, stateCheckpointId, queueCheckpointId, pyFuncName, rayFunc);
return Ray.get(waitFor);
}
private static List<ObjectRef<Object>> checkpointCompleteCommonCall(
List<BaseActorHandle> actors,
Long stateCheckpointId, Long queueCheckpointId,
Long stateCheckpointId,
Long queueCheckpointId,
String pyFuncName,
RayFunc3<JobWorker, Long, Long, Boolean> rayFunc) {
List<ObjectRef<Object>> waitFor = new ArrayList<>();
actors.forEach(actor -> {
// python
if (actor instanceof PyActorHandle) {
RemoteCall.CheckpointId stateCheckpointIdPb = RemoteCall.CheckpointId.newBuilder()
.setCheckpointId(stateCheckpointId)
.build();
actors.forEach(
actor -> {
// python
if (actor instanceof PyActorHandle) {
RemoteCall.CheckpointId stateCheckpointIdPb =
RemoteCall.CheckpointId.newBuilder().setCheckpointId(stateCheckpointId).build();
RemoteCall.CheckpointId queueCheckpointIdPb = RemoteCall.CheckpointId.newBuilder()
.setCheckpointId(queueCheckpointId)
.build();
waitFor.add(((PyActorHandle) actor).task(PyActorMethod.of(pyFuncName),
stateCheckpointIdPb.toByteArray(), queueCheckpointIdPb.toByteArray()).remote());
} else {
// java
waitFor.add(((ActorHandle) actor).task(rayFunc, stateCheckpointId, queueCheckpointId)
.remote());
}
});
RemoteCall.CheckpointId queueCheckpointIdPb =
RemoteCall.CheckpointId.newBuilder().setCheckpointId(queueCheckpointId).build();
waitFor.add(
((PyActorHandle) actor)
.task(
PyActorMethod.of(pyFuncName),
stateCheckpointIdPb.toByteArray(),
queueCheckpointIdPb.toByteArray())
.remote());
} else {
// java
waitFor.add(
((ActorHandle) actor).task(rayFunc, stateCheckpointId, queueCheckpointId).remote());
}
});
return waitFor;
}
}
@@ -34,8 +34,7 @@ public class AsyncRemoteCaller {
* @param onException callback function on exception
*/
public void checkIfNeedRollbackAsync(
BaseActorHandle actor, Callback<Boolean> callback,
ExceptionHandler<Throwable> onException) {
BaseActorHandle actor, Callback<Boolean> callback, ExceptionHandler<Throwable> onException) {
if (actor instanceof PyActorHandle) {
// python
remoteCallPool.bindCallback(
@@ -43,12 +42,16 @@ public class AsyncRemoteCaller {
(obj) -> {
byte[] res = (byte[]) obj;
callback.handle(PbResultParser.parseBoolResult(res));
}, onException);
},
onException);
} else {
// java
remoteCallPool.bindCallback(
((ActorHandle<JobWorker>) actor).task(JobWorker::checkIfNeedRollback,
System.currentTimeMillis()).remote(), callback, onException);
((ActorHandle<JobWorker>) actor)
.task(JobWorker::checkIfNeedRollback, System.currentTimeMillis())
.remote(),
callback,
onException);
}
}
@@ -66,21 +69,29 @@ public class AsyncRemoteCaller {
ExceptionHandler<Throwable> onException) {
// python
if (actor instanceof PyActorHandle) {
RemoteCall.CheckpointId checkpointIdPb = RemoteCall.CheckpointId.newBuilder()
.setCheckpointId(checkpointId)
.build();
ObjectRef call = ((PyActorHandle) actor).task(PyActorMethod.of("rollback"),
checkpointIdPb.toByteArray()).remote();
remoteCallPool.bindCallback(call, obj ->
callback.handle(PbResultParser.parseRollbackResult((byte[]) obj)), onException);
RemoteCall.CheckpointId checkpointIdPb =
RemoteCall.CheckpointId.newBuilder().setCheckpointId(checkpointId).build();
ObjectRef call =
((PyActorHandle) actor)
.task(PyActorMethod.of("rollback"), checkpointIdPb.toByteArray())
.remote();
remoteCallPool.bindCallback(
call,
obj -> callback.handle(PbResultParser.parseRollbackResult((byte[]) obj)),
onException);
} else {
// java
ObjectRef call = ((ActorHandle<JobWorker>) actor).task(
JobWorker::rollback, checkpointId, System.currentTimeMillis()).remote();
remoteCallPool.bindCallback(call, obj -> {
CallResult<ChannelRecoverInfo> res = (CallResult<ChannelRecoverInfo>) obj;
callback.handle(res);
}, onException);
ObjectRef call =
((ActorHandle<JobWorker>) actor)
.task(JobWorker::rollback, checkpointId, System.currentTimeMillis())
.remote();
remoteCallPool.bindCallback(
call,
obj -> {
CallResult<ChannelRecoverInfo> res = (CallResult<ChannelRecoverInfo>) obj;
callback.handle(res);
},
onException);
}
}
@@ -92,7 +103,8 @@ public class AsyncRemoteCaller {
* @param onException callback function on exception
*/
public void batchRollback(
List<BaseActorHandle> actors, final Long checkpointId,
List<BaseActorHandle> actors,
final Long checkpointId,
Collection<String> abnormalQueues,
Callback<List<CallResult<ChannelRecoverInfo>>> callback,
ExceptionHandler<Throwable> onException) {
@@ -103,29 +115,35 @@ public class AsyncRemoteCaller {
ObjectRef call;
if (actor instanceof PyActorHandle) {
isPyActor.put(i, true);
RemoteCall.CheckpointId checkpointIdPb = RemoteCall.CheckpointId.newBuilder()
.setCheckpointId(checkpointId)
.build();
call = ((PyActorHandle) actor).task(PyActorMethod.of("rollback"),
checkpointIdPb.toByteArray()).remote();
RemoteCall.CheckpointId checkpointIdPb =
RemoteCall.CheckpointId.newBuilder().setCheckpointId(checkpointId).build();
call =
((PyActorHandle) actor)
.task(PyActorMethod.of("rollback"), checkpointIdPb.toByteArray())
.remote();
} else {
// java
call = ((ActorHandle<JobWorker>) actor).task(JobWorker::rollback, checkpointId,
System.currentTimeMillis()).remote();
call =
((ActorHandle<JobWorker>) actor)
.task(JobWorker::rollback, checkpointId, System.currentTimeMillis())
.remote();
}
rayCallList.add(call);
}
remoteCallPool.bindCallback(rayCallList, objList -> {
List<CallResult<ChannelRecoverInfo>> results = new ArrayList<>();
for (int i = 0; i < objList.size(); ++i) {
Object obj = objList.get(i);
if (isPyActor.getOrDefault(i, false)) {
results.add(PbResultParser.parseRollbackResult((byte[]) obj));
} else {
results.add((CallResult<ChannelRecoverInfo>) obj);
}
}
callback.handle(results);
}, onException);
remoteCallPool.bindCallback(
rayCallList,
objList -> {
List<CallResult<ChannelRecoverInfo>> results = new ArrayList<>();
for (int i = 0; i < objList.size(); ++i) {
Object obj = objList.get(i);
if (isPyActor.getOrDefault(i, false)) {
results.add(PbResultParser.parseRollbackResult((byte[]) obj));
} else {
results.add((CallResult<ChannelRecoverInfo>) obj);
}
}
callback.handle(results);
},
onException);
}
}
@@ -18,7 +18,6 @@ import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class RemoteCallPool implements Runnable {
private static final Logger LOG = LoggerFactory.getLogger(RemoteCallPool.class);
@@ -30,27 +29,28 @@ public class RemoteCallPool implements Runnable {
new ConcurrentHashMap<>();
private Map<RemoteCallBundle, ExceptionHandler<Throwable>> bundleExceptionHandlerMap =
new ConcurrentHashMap<>();
private ThreadPoolExecutor callBackPool = new ThreadPoolExecutor(
2, Runtime.getRuntime().availableProcessors(),
1, TimeUnit.MINUTES, new LinkedBlockingQueue<>(),
new CallbackThreadFactory());
private ThreadPoolExecutor callBackPool =
new ThreadPoolExecutor(
2,
Runtime.getRuntime().availableProcessors(),
1,
TimeUnit.MINUTES,
new LinkedBlockingQueue<>(),
new CallbackThreadFactory());
private volatile boolean stop = false;
public RemoteCallPool() {
Thread t = new Thread(Ray.wrapRunnable(this), "remote-pool-loop");
t.setUncaughtExceptionHandler((thread, throwable) ->
LOG.error("Error in remote call pool thread.", throwable)
);
t.setUncaughtExceptionHandler(
(thread, throwable) -> LOG.error("Error in remote call pool thread.", throwable));
t.start();
}
@SuppressWarnings("unchecked")
public <T> void bindCallback(
ObjectRef<T> obj, Callback<T> callback,
ExceptionHandler<Throwable> onException) {
ObjectRef<T> obj, Callback<T> callback, ExceptionHandler<Throwable> onException) {
List objectRefList = Collections.singletonList(obj);
RemoteCallBundle bundle = new RemoteCallBundle(objectRefList,
true);
RemoteCallBundle bundle = new RemoteCallBundle(objectRefList, true);
singletonHandlerMap.put(bundle, (Callback<Object>) callback);
bundleExceptionHandlerMap.put(bundle, onException);
synchronized (pendingObjectBundles) {
@@ -59,7 +59,8 @@ public class RemoteCallPool implements Runnable {
}
public void bindCallback(
List<ObjectRef<Object>> objectBundle, Callback<List<Object>> callback,
List<ObjectRef<Object>> objectBundle,
Callback<List<Object>> callback,
ExceptionHandler<Throwable> onException) {
RemoteCallBundle bundle = new RemoteCallBundle(objectBundle, false);
bundleHandlerMap.put(bundle, callback);
@@ -99,34 +100,40 @@ public class RemoteCallPool implements Runnable {
ExceptionHandler<Throwable> exceptionHandler = bundleExceptionHandlerMap.get(bundle);
if (bundle.isSingletonBundle) {
callBackPool.execute(Ray.wrapRunnable(() -> {
try {
singletonHandlerMap.get(bundle).handle(readyObjs.get(0).get());
singletonHandlerMap.remove(bundle);
} catch (Throwable th) {
LOG.error("Error when get object, objectId = {}.", readyObjs.get(0).toString(),
th);
if (exceptionHandler != null) {
exceptionHandler.handle(th);
}
}
}));
callBackPool.execute(
Ray.wrapRunnable(
() -> {
try {
singletonHandlerMap.get(bundle).handle(readyObjs.get(0).get());
singletonHandlerMap.remove(bundle);
} catch (Throwable th) {
LOG.error(
"Error when get object, objectId = {}.",
readyObjs.get(0).toString(),
th);
if (exceptionHandler != null) {
exceptionHandler.handle(th);
}
}
}));
} else {
List<Object> results =
readyObjs.stream().map(ObjectRef::get).collect(Collectors.toList());
List<String> resultIds =
readyObjs.stream().map(ObjectRef::toString).collect(Collectors.toList());
callBackPool.execute(Ray.wrapRunnable(() -> {
try {
bundleHandlerMap.get(bundle).handle(results);
bundleHandlerMap.remove(bundle);
} catch (Throwable th) {
LOG.error("Error when get object, objectIds = {}.", resultIds, th);
if (exceptionHandler != null) {
exceptionHandler.handle(th);
}
}
}));
callBackPool.execute(
Ray.wrapRunnable(
() -> {
try {
bundleHandlerMap.get(bundle).handle(results);
bundleHandlerMap.remove(bundle);
} catch (Throwable th) {
LOG.error("Error when get object, objectIds = {}.", resultIds, th);
if (exceptionHandler != null) {
exceptionHandler.handle(th);
}
}
}));
}
itr.remove();
}
@@ -185,5 +192,4 @@ public class RemoteCallPool implements Runnable {
return t;
}
}
}
@@ -21,13 +21,12 @@ public class CrossLangSerializer implements Serializer {
Object value = record.getValue();
Class<? extends Record> clz = record.getClass();
if (clz == Record.class) {
return msgPackSerializer.serialize(Arrays.asList(
RECORD_TYPE_ID, record.getStream(), value));
return msgPackSerializer.serialize(Arrays.asList(RECORD_TYPE_ID, record.getStream(), value));
} else if (clz == KeyRecord.class) {
KeyRecord keyRecord = (KeyRecord) record;
Object key = keyRecord.getKey();
return msgPackSerializer.serialize(Arrays.asList(
KEY_RECORD_TYPE_ID, keyRecord.getStream(), key, value));
return msgPackSerializer.serialize(
Arrays.asList(KEY_RECORD_TYPE_ID, keyRecord.getStream(), key, value));
} else {
throw new UnsupportedOperationException(
String.format("Serialize %s is unsupported.", record));
@@ -39,25 +38,25 @@ public class CrossLangSerializer implements Serializer {
List list = (List) msgPackSerializer.deserialize(bytes);
Byte typeId = (Byte) list.get(0);
switch (typeId) {
case RECORD_TYPE_ID: {
String stream = (String) list.get(1);
Object value = list.get(2);
Record record = new Record(value);
record.setStream(stream);
return record;
}
case KEY_RECORD_TYPE_ID: {
String stream = (String) list.get(1);
Object key = list.get(2);
Object value = list.get(3);
KeyRecord keyRecord = new KeyRecord(key, value);
keyRecord.setStream(stream);
return keyRecord;
}
case RECORD_TYPE_ID:
{
String stream = (String) list.get(1);
Object value = list.get(2);
Record record = new Record(value);
record.setStream(stream);
return record;
}
case KEY_RECORD_TYPE_ID:
{
String stream = (String) list.get(1);
Object key = list.get(2);
Object value = list.get(3);
KeyRecord keyRecord = new KeyRecord(key, value);
keyRecord.setStream(stream);
return keyRecord;
}
default:
throw new UnsupportedOperationException("Unsupported type " + typeId);
}
}
}
@@ -9,5 +9,4 @@ public interface Serializer {
byte[] serialize(Object object);
<T> T deserialize(byte[] bytes);
}
@@ -13,9 +13,7 @@ import io.ray.streaming.runtime.worker.JobWorker;
import java.util.ArrayList;
import java.util.List;
/**
* Save channel initial parameters needed by DataWriter/DataReader.
*/
/** Save channel initial parameters needed by DataWriter/DataReader. */
public class ChannelCreationParametersBuilder {
public static class Parameter {
@@ -28,20 +26,22 @@ public class ChannelCreationParametersBuilder {
this.actorId = actorId;
}
public void setAsyncFunctionDescriptor(
FunctionDescriptor asyncFunctionDescriptor) {
public void setAsyncFunctionDescriptor(FunctionDescriptor asyncFunctionDescriptor) {
this.asyncFunctionDescriptor = asyncFunctionDescriptor;
}
public void setSyncFunctionDescriptor(
FunctionDescriptor syncFunctionDescriptor) {
public void setSyncFunctionDescriptor(FunctionDescriptor syncFunctionDescriptor) {
this.syncFunctionDescriptor = syncFunctionDescriptor;
}
public String toString() {
String language =
asyncFunctionDescriptor instanceof JavaFunctionDescriptor ? "Java" : "Python";
return "Language: " + language + " Desc: " + asyncFunctionDescriptor.toList() + " "
return "Language: "
+ language
+ " Desc: "
+ asyncFunctionDescriptor.toList()
+ " "
+ syncFunctionDescriptor.toList();
}
@@ -64,61 +64,60 @@ public class ChannelCreationParametersBuilder {
private List<Parameter> parameters;
// function descriptors of direct call entry point for Java workers
private static JavaFunctionDescriptor javaReaderAsyncFuncDesc = new JavaFunctionDescriptor(
JobWorker.class.getName(),
"onReaderMessage", "([B)V");
private static JavaFunctionDescriptor javaReaderSyncFuncDesc = new JavaFunctionDescriptor(
JobWorker.class.getName(),
"onReaderMessageSync", "([B)[B");
private static JavaFunctionDescriptor javaWriterAsyncFuncDesc = new JavaFunctionDescriptor(
JobWorker.class.getName(),
"onWriterMessage", "([B)V");
private static JavaFunctionDescriptor javaWriterSyncFuncDesc = new JavaFunctionDescriptor(
JobWorker.class.getName(),
"onWriterMessageSync", "([B)[B");
private static JavaFunctionDescriptor javaReaderAsyncFuncDesc =
new JavaFunctionDescriptor(JobWorker.class.getName(), "onReaderMessage", "([B)V");
private static JavaFunctionDescriptor javaReaderSyncFuncDesc =
new JavaFunctionDescriptor(JobWorker.class.getName(), "onReaderMessageSync", "([B)[B");
private static JavaFunctionDescriptor javaWriterAsyncFuncDesc =
new JavaFunctionDescriptor(JobWorker.class.getName(), "onWriterMessage", "([B)V");
private static JavaFunctionDescriptor javaWriterSyncFuncDesc =
new JavaFunctionDescriptor(JobWorker.class.getName(), "onWriterMessageSync", "([B)[B");
// function descriptors of direct call entry point for Python workers
private static PyFunctionDescriptor pyReaderAsyncFunctionDesc = new PyFunctionDescriptor(
"ray.streaming.runtime.worker",
"JobWorker", "on_reader_message");
private static PyFunctionDescriptor pyReaderSyncFunctionDesc = new PyFunctionDescriptor(
"ray.streaming.runtime.worker",
"JobWorker", "on_reader_message_sync");
private static PyFunctionDescriptor pyWriterAsyncFunctionDesc = new PyFunctionDescriptor(
"ray.streaming.runtime.worker",
"JobWorker", "on_writer_message");
private static PyFunctionDescriptor pyWriterSyncFunctionDesc = new PyFunctionDescriptor(
"ray.streaming.runtime.worker",
"JobWorker", "on_writer_message_sync");
private static PyFunctionDescriptor pyReaderAsyncFunctionDesc =
new PyFunctionDescriptor("ray.streaming.runtime.worker", "JobWorker", "on_reader_message");
private static PyFunctionDescriptor pyReaderSyncFunctionDesc =
new PyFunctionDescriptor(
"ray.streaming.runtime.worker", "JobWorker", "on_reader_message_sync");
private static PyFunctionDescriptor pyWriterAsyncFunctionDesc =
new PyFunctionDescriptor("ray.streaming.runtime.worker", "JobWorker", "on_writer_message");
private static PyFunctionDescriptor pyWriterSyncFunctionDesc =
new PyFunctionDescriptor(
"ray.streaming.runtime.worker", "JobWorker", "on_writer_message_sync");
public ChannelCreationParametersBuilder() {
}
public ChannelCreationParametersBuilder() {}
public static void setJavaReaderFunctionDesc(
JavaFunctionDescriptor asyncFunc,
JavaFunctionDescriptor syncFunc) {
JavaFunctionDescriptor asyncFunc, JavaFunctionDescriptor syncFunc) {
javaReaderAsyncFuncDesc = asyncFunc;
javaReaderSyncFuncDesc = syncFunc;
}
public static void setJavaWriterFunctionDesc(
JavaFunctionDescriptor asyncFunc,
JavaFunctionDescriptor syncFunc) {
JavaFunctionDescriptor asyncFunc, JavaFunctionDescriptor syncFunc) {
javaWriterAsyncFuncDesc = asyncFunc;
javaWriterSyncFuncDesc = syncFunc;
}
public ChannelCreationParametersBuilder buildInputQueueParameters(
List<String> queues,
List<BaseActorHandle> actors) {
return buildParameters(queues, actors, javaWriterAsyncFuncDesc, javaWriterSyncFuncDesc,
pyWriterAsyncFunctionDesc, pyWriterSyncFunctionDesc);
List<String> queues, List<BaseActorHandle> actors) {
return buildParameters(
queues,
actors,
javaWriterAsyncFuncDesc,
javaWriterSyncFuncDesc,
pyWriterAsyncFunctionDesc,
pyWriterSyncFunctionDesc);
}
public ChannelCreationParametersBuilder buildOutputQueueParameters(
List<String> queues,
List<BaseActorHandle> actors) {
return buildParameters(queues, actors, javaReaderAsyncFuncDesc, javaReaderSyncFuncDesc,
pyReaderAsyncFunctionDesc, pyReaderSyncFunctionDesc);
List<String> queues, List<BaseActorHandle> actors) {
return buildParameters(
queues,
actors,
javaReaderAsyncFuncDesc,
javaReaderSyncFuncDesc,
pyReaderAsyncFunctionDesc,
pyReaderSyncFunctionDesc);
}
private ChannelCreationParametersBuilder buildParameters(
@@ -127,8 +126,7 @@ public class ChannelCreationParametersBuilder {
JavaFunctionDescriptor javaAsyncFunctionDesc,
JavaFunctionDescriptor javaSyncFunctionDesc,
PyFunctionDescriptor pyAsyncFunctionDesc,
PyFunctionDescriptor pySyncFunctionDesc
) {
PyFunctionDescriptor pySyncFunctionDesc) {
parameters = new ArrayList<>(queues.size());
for (int i = 0; i < queues.size(); ++i) {
@@ -62,8 +62,8 @@ public class DataReader {
Preconditions.checkArgument(inputChannels.size() == fromActors.size());
ChannelCreationParametersBuilder initialParameters =
new ChannelCreationParametersBuilder().buildInputQueueParameters(inputChannels, fromActors);
byte[][] inputChannelsBytes = inputChannels.stream()
.map(ChannelId::idStrToBytes).toArray(byte[][]::new);
byte[][] inputChannelsBytes =
inputChannels.stream().map(ChannelId::idStrToBytes).toArray(byte[][]::new);
// get sequence ID and message ID from OffsetInfo
long[] msgIds = new long[inputChannels.size()];
@@ -84,21 +84,23 @@ public class DataReader {
// create native reader
List<Integer> creationStatus = new ArrayList<>();
this.nativeReaderPtr = createDataReaderNative(
initialParameters,
inputChannelsBytes,
msgIds,
timerInterval,
creationStatus,
ChannelUtils.toNativeConf(workerConfig),
isMock
);
this.nativeReaderPtr =
createDataReaderNative(
initialParameters,
inputChannelsBytes,
msgIds,
timerInterval,
creationStatus,
ChannelUtils.toNativeConf(workerConfig),
isMock);
for (int i = 0; i < inputChannels.size(); ++i) {
queueCreationStatusMap
.put(inputChannels.get(i), ChannelCreationStatus.fromInt(creationStatus.get(i)));
queueCreationStatusMap.put(
inputChannels.get(i), ChannelCreationStatus.fromInt(creationStatus.get(i)));
}
LOG.info("Create DataReader succeed for worker: {}, creation status={}.",
workerConfig.workerInternalConfig.workerName(), queueCreationStatusMap);
LOG.info(
"Create DataReader succeed for worker: {}, creation status={}.",
workerConfig.workerInternalConfig.workerName(),
queueCreationStatusMap);
}
private static native long createDataReaderNative(
@@ -113,8 +115,7 @@ public class DataReader {
/**
* Read message from input channels, if timeout, return null.
*
* @param timeoutMillis timeout
* @return message or null
* @param timeoutMillis timeout
* @return message or null
*/
public ChannelMessage read(long timeoutMillis) {
if (buf.isEmpty()) {
@@ -183,8 +184,11 @@ public class DataReader {
}
private void getBundle(long timeoutMillis) {
getBundleNative(nativeReaderPtr, timeoutMillis,
Platform.getAddress(getBundleParams), Platform.getAddress(bundleMeta));
getBundleNative(
nativeReaderPtr,
timeoutMillis,
Platform.getAddress(getBundleParams),
Platform.getAddress(bundleMeta));
bundleMeta.rewind();
long bundleAddress = getBundleParams.getLong(0);
int bundleSize = getBundleParams.getInt(8);
@@ -192,16 +196,12 @@ public class DataReader {
Platform.wrapDirectBuffer(bundleData, bundleAddress, bundleSize);
}
/**
* Stop reader
*/
/** Stop reader */
public void stop() {
stopReaderNative(nativeReaderPtr);
}
/**
* Close reader to release resource
*/
/** Close reader to release resource */
public void close() {
if (nativeReaderPtr == 0) {
return;
@@ -213,10 +213,7 @@ public class DataReader {
}
private native void getBundleNative(
long nativeReaderPtr,
long timeoutMillis,
long params,
long metaAddress);
long nativeReaderPtr, long timeoutMillis, long params, long metaAddress);
private native byte[] getOffsetsInfoNative(long nativeQueueConsumerPtr);
@@ -378,5 +375,4 @@ public class DataReader {
return barrierOffsetInfo;
}
}
}
@@ -17,9 +17,7 @@ import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* DataWriter is a wrapper of streaming c++ DataWriter, which sends data to downstream workers
*/
/** DataWriter is a wrapper of streaming c++ DataWriter, which sends data to downstream workers */
public class DataWriter {
private static final Logger LOG = LoggerFactory.getLogger(DataWriter.class);
@@ -51,8 +49,8 @@ public class DataWriter {
ChannelCreationParametersBuilder initialParameters =
new ChannelCreationParametersBuilder().buildOutputQueueParameters(outputChannels, toActors);
byte[][] outputChannelsBytes = outputChannels.stream()
.map(ChannelId::idStrToBytes).toArray(byte[][]::new);
byte[][] outputChannelsBytes =
outputChannels.stream().map(ChannelId::idStrToBytes).toArray(byte[][]::new);
long channelSize = workerConfig.transferConfig.channelSize();
// load message id from checkpoints
@@ -70,15 +68,16 @@ public class DataWriter {
if (TransferChannelType.MEMORY_CHANNEL == channelType) {
isMock = true;
}
this.nativeWriterPtr = createWriterNative(
initialParameters,
outputChannelsBytes,
msgIds,
channelSize,
ChannelUtils.toNativeConf(workerConfig),
isMock
);
LOG.info("Create DataWriter succeed for worker: {}.",
this.nativeWriterPtr =
createWriterNative(
initialParameters,
outputChannelsBytes,
msgIds,
channelSize,
ChannelUtils.toNativeConf(workerConfig),
isMock);
LOG.info(
"Create DataWriter succeed for worker: {}.",
workerConfig.workerInternalConfig.workerName());
}
@@ -108,8 +107,8 @@ public class DataWriter {
* Write msg into the specified channels
*
* @param ids channel ids
* @param item message item data section is specified by [position, limit).
* item doesn't have to be a direct buffer.
* @param item message item data section is specified by [position, limit). item doesn't have to
* be a direct buffer.
*/
public void write(Set<ChannelId> ids, ByteBuffer item) {
int size = item.remaining();
@@ -150,16 +149,12 @@ public class DataWriter {
clearCheckpointNative(nativeWriterPtr, checkpointId);
}
/**
* stop writer
*/
/** stop writer */
public void stop() {
stopWriterNative(nativeWriterPtr);
}
/**
* close writer to release resources
*/
/** close writer to release resources */
public void close() {
if (nativeWriterPtr == 0) {
return;
@@ -180,12 +175,7 @@ public class DataWriter {
private native long[] getOutputMsgIdNative(long nativeQueueProducerPtr);
private native void broadcastBarrierNative(
long nativeQueueProducerPtr, long checkpointId,
byte[] data);
private native void clearCheckpointNative(
long nativeQueueProducerPtr,
long checkpointId
);
long nativeQueueProducerPtr, long checkpointId, byte[] data);
private native void clearCheckpointNative(long nativeQueueProducerPtr, long checkpointId);
}
@@ -42,7 +42,6 @@ public class TransferHandler {
private native long createReaderClientNative();
private native void handleWriterMessageNative(long handler, byte[] buffer);
private native byte[] handleWriterMessageSyncNative(long handler, byte[] buffer);
@@ -12,8 +12,7 @@ import java.util.Set;
import sun.nio.ch.DirectBuffer;
/**
* ChannelID is used to identify a transfer channel between a upstream worker and downstream
* worker.
* ChannelID is used to identify a transfer channel between a upstream worker and downstream worker.
*/
public class ChannelId {
@@ -45,16 +44,12 @@ public class ChannelId {
private static native void destroyNativeId(long nativeIdPtr);
/**
* @param id hex string representation of channel id
*/
/** @param id hex string representation of channel id */
public static ChannelId from(String id) {
  // Decode the hex string once, then hand both representations to the two-argument factory.
  final byte[] decoded = ChannelId.idStrToBytes(id);
  return from(id, decoded);
}
/**
* @param idBytes bytes representation of channel id
*/
/** @param idBytes bytes representation of channel id */
public static ChannelId from(byte[] idBytes) {
  // Derive the string form from the bytes, then delegate to the two-argument factory.
  final String idStr = idBytesToStr(idBytes);
  return from(idStr, idBytes);
}
@@ -76,9 +71,7 @@ public class ChannelId {
return id;
}
/**
* @return a random channel id string
*/
/** Returns a random channel id string */
public static String genRandomIdStr() {
StringBuilder sb = new StringBuilder();
Random random = new Random();
@@ -92,18 +85,20 @@ public class ChannelId {
* Generate channel name, which will be 20 character
*
* @param fromTaskId upstream task id
* @param toTaskId downstream task id
* @return channel name
* @param toTaskId downstream task id
* @return channel name
*/
public static String genIdStr(int fromTaskId, int toTaskId, long ts) {
/*
| Head | Timestamp | Empty | From | To |
| 8 bytes | 4bytes | 4bytes| 2bytes| 2bytes |
*/
Preconditions.checkArgument(fromTaskId < Short.MAX_VALUE,
"fromTaskId %s is larger than %s", fromTaskId, Short.MAX_VALUE);
Preconditions.checkArgument(toTaskId < Short.MAX_VALUE,
"toTaskId %s is larger than %s", fromTaskId, Short.MAX_VALUE);
// Both task ids are written into the 2-byte From/To slots of the channel name, so each must
// stay below Short.MAX_VALUE.
Preconditions.checkArgument(
    fromTaskId < Short.MAX_VALUE,
    "fromTaskId %s is larger than %s",
    fromTaskId,
    Short.MAX_VALUE);
// The failure message must report toTaskId itself (not fromTaskId), otherwise the error
// shows a value that did not violate the check.
Preconditions.checkArgument(
    toTaskId < Short.MAX_VALUE, "toTaskId %s is larger than %s", toTaskId, Short.MAX_VALUE);
byte[] channelName = new byte[20];
for (int i = 11; i >= 8; i--) {
@@ -120,8 +115,7 @@ public class ChannelId {
}
/**
* @param id hex string representation of channel id
* @return bytes representation of channel id
* @param id hex string representation of channel id
* @return bytes representation of channel id
*/
public static byte[] idStrToBytes(String id) {
byte[] idBytes = BaseEncoding.base16().decode(id.toUpperCase());
@@ -130,8 +124,7 @@ public class ChannelId {
}
/**
* @param id bytes representation of channel id
* @return hex string representation of channel id
* @param id bytes representation of channel id
* @return hex string representation of channel id
*/
public static String idBytesToStr(byte[] id) {
assert id.length == ChannelId.ID_LENGTH;
@@ -178,6 +171,4 @@ public class ChannelId {
public int hashCode() {
return strId.hashCode();
}
}
@@ -8,32 +8,29 @@ import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class ChannelRecoverInfo implements Serializable {
private static final Logger LOG = LoggerFactory.getLogger(ChannelRecoverInfo.class);
public Map<String, ChannelCreationStatus> queueCreationStatusMap;
public ChannelRecoverInfo(Map<String, ChannelCreationStatus> queueCreationStatusMap) {
this.queueCreationStatusMap = queueCreationStatusMap;
}
public Set<String> getDataLostQueues() {
Set<String> dataLostQueues = new HashSet<>();
queueCreationStatusMap.forEach((q, status) -> {
if (status.equals(ChannelCreationStatus.DataLost)) {
dataLostQueues.add(q);
}
});
queueCreationStatusMap.forEach(
(q, status) -> {
if (status.equals(ChannelCreationStatus.DataLost)) {
dataLostQueues.add(q);
}
});
return dataLostQueues;
}
@Override
public String toString() {
return MoreObjects.toStringHelper(this)
.add("dataLostQueues", getDataLostQueues())
.toString();
return MoreObjects.toStringHelper(this).add("dataLostQueues", getDataLostQueues()).toString();
}
public enum ChannelCreationStatus {
@@ -43,7 +43,7 @@ public class ChannelUtils {
builder.setEmptyMessageInterval(emptyMsgInterval);
}
//flow control type
// flow control type
int flowControlType = workerConfig.transferConfig.flowControlType();
if (flowControlType != -1) {
builder.setFlowControlType(Streaming.FlowControlType.forNumber(flowControlType));
@@ -55,7 +55,7 @@ public class ChannelUtils {
builder.setWriterConsumedStep(writerConsumedStep);
}
//reader consumed step
// reader consumed step
int readerConsumedStep = workerConfig.transferConfig.readerConsumedStep();
if (readerConsumedStep != -1) {
builder.setReaderConsumedStep(readerConsumedStep);
@@ -65,5 +65,4 @@ public class ChannelUtils {
LOGGER.info("Streaming native conf {}", streamingConf.toString());
return streamingConf.toByteArray();
}
}
@@ -3,9 +3,7 @@ package io.ray.streaming.runtime.transfer.channel;
import com.google.common.base.MoreObjects;
import java.io.Serializable;
/**
* This data structure contains offset used by streaming queue.
*/
/** This data structure contains offset used by streaming queue. */
public class OffsetInfo implements Serializable {
private long streamingMsgId;
@@ -24,8 +22,6 @@ public class OffsetInfo implements Serializable {
@Override
public String toString() {
return MoreObjects.toStringHelper(this)
.add("streamingMsgId", streamingMsgId)
.toString();
return MoreObjects.toStringHelper(this).add("streamingMsgId", streamingMsgId).toString();
}
}
@@ -4,7 +4,6 @@ import io.ray.streaming.runtime.transfer.channel.OffsetInfo;
import java.nio.ByteBuffer;
import java.util.Map;
public class BarrierMessage extends ChannelMessage {
private final ByteBuffer data;
@@ -12,8 +11,12 @@ public class BarrierMessage extends ChannelMessage {
private final Map<String, OffsetInfo> inputOffsets;
public BarrierMessage(
long msgId, long timestamp, String channelId,
ByteBuffer data, long checkpointId, Map<String, OffsetInfo> inputOffsets) {
long msgId,
long timestamp,
String channelId,
ByteBuffer data,
long checkpointId,
Map<String, OffsetInfo> inputOffsets) {
super(msgId, timestamp, channelId);
this.data = data;
this.checkpointId = checkpointId;
@@ -2,10 +2,7 @@ package io.ray.streaming.runtime.transfer.message;
import java.nio.ByteBuffer;
/**
* DataMessage represents data between upstream and downstream operators.
*/
/** DataMessage represents data between upstream and downstream operators. */
public class DataMessage extends ChannelMessage {
private final ByteBuffer body;
@@ -4,9 +4,7 @@ import io.ray.streaming.runtime.context.ContextBackend;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Handle exception for checkpoint state
*/
/** Handle exception for checkpoint state */
public class CheckpointStateUtil {
private static final Logger LOG = LoggerFactory.getLogger(CheckpointStateUtil.class);
@@ -45,8 +43,7 @@ public class CheckpointStateUtil {
public static class CheckpointStateRuntimeException extends RuntimeException {
public CheckpointStateRuntimeException() {
}
public CheckpointStateRuntimeException() {}
public CheckpointStateRuntimeException(String message) {
super(message);
@@ -2,9 +2,7 @@ package io.ray.streaming.runtime.util;
import java.util.Map;
/**
* Common tools.
*/
/** Common tools. */
public class CommonUtils {
public static Map<String, Object> strMapToObjectMap(Map<String, String> srcMap) {
@@ -36,13 +36,14 @@ public class EnvUtil {
/**
* Execute an external command.
*
* @return Whether the command succeeded.
* <p>Returns Whether the command succeeded.
*/
public static boolean executeCommand(List<String> command, int waitTimeoutSeconds) {
try {
ProcessBuilder processBuilder = new ProcessBuilder(command)
.redirectOutput(ProcessBuilder.Redirect.INHERIT)
.redirectError(ProcessBuilder.Redirect.INHERIT);
ProcessBuilder processBuilder =
new ProcessBuilder(command)
.redirectOutput(ProcessBuilder.Redirect.INHERIT)
.redirectError(ProcessBuilder.Redirect.INHERIT);
Process process = processBuilder.start();
boolean exit = process.waitFor(waitTimeoutSeconds, TimeUnit.SECONDS);
if (!exit) {
@@ -53,5 +54,4 @@ public class EnvUtil {
throw new RuntimeException("Error executing command " + String.join(" ", command), e);
}
}
}
@@ -9,9 +9,7 @@ import java.nio.ByteBuffer;
import sun.misc.Unsafe;
import sun.nio.ch.DirectBuffer;
/**
* Based on org.apache.spark.unsafe.Platform
*/
/** Based on org.apache.spark.unsafe.Platform */
public final class Platform {
public static final Unsafe UNSAFE;
@@ -51,18 +49,19 @@ public final class Platform {
}
private static final ThreadLocal<ByteBuffer> localEmptyBuffer =
ThreadLocal.withInitial(() -> {
try {
return (ByteBuffer) DBB_CONSTRUCTOR.newInstance(0, 0);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
UNSAFE.throwException(e);
}
throw new IllegalStateException("unreachable");
});
ThreadLocal.withInitial(
() -> {
try {
return (ByteBuffer) DBB_CONSTRUCTOR.newInstance(0, 0);
} catch (InstantiationException
| IllegalAccessException
| InvocationTargetException e) {
UNSAFE.throwException(e);
}
throw new IllegalStateException("unreachable");
});
/**
* Wrap a buffer [address, address + size) as a DirectByteBuffer.
*/
/** Wrap a buffer [address, address + size) as a DirectByteBuffer. */
public static ByteBuffer wrapDirectBuffer(long address, int size) {
ByteBuffer buffer = localEmptyBuffer.get().duplicate();
UNSAFE.putLong(buffer, BUFFER_ADDRESS_FIELD_OFFSET, address);
@@ -71,21 +70,15 @@ public final class Platform {
return buffer;
}
/**
* Wrap a buffer [address, address + size) into provided <code>buffer</code>.
*/
/** Wrap a buffer [address, address + size) into provided <code>buffer</code>. */
public static void wrapDirectBuffer(ByteBuffer buffer, long address, int size) {
// Re-point the caller's existing buffer object instead of allocating a new one: the address
// and capacity values are written straight into the object at the precomputed field offsets.
UNSAFE.putLong(buffer, BUFFER_ADDRESS_FIELD_OFFSET, address);
UNSAFE.putInt(buffer, BUFFER_CAPACITY_FIELD_OFFSET, size);
// clear() resets position to 0 and limit to capacity; it must run after the writes above so
// that it picks up the freshly written capacity.
buffer.clear();
}
/**
* @param buffer a DirectBuffer backed by off-heap memory
* @return address of off-heap memory
*/
/**
* @param buffer a DirectBuffer backed by off-heap memory
* @return address of off-heap memory
*/
public static long getAddress(ByteBuffer buffer) {
  // The cast throws ClassCastException when the buffer is not a DirectBuffer.
  final DirectBuffer direct = (DirectBuffer) buffer;
  return direct.address();
}
}
@@ -9,15 +9,13 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
* RayUtils is the utility class to access ray runtime api.
*/
/** RayUtils is the utility class to access ray runtime api. */
public class RayUtils {
/**
* Get all node info from GCS
*
* @return node info list
* <p>Returns node info list
*/
public static List<NodeInfo> getAllNodeInfo() {
if (Ray.getRuntimeContext().isSingleProcess()) {
@@ -30,7 +28,7 @@ public class RayUtils {
/**
* Get all alive node info map
*
* @return node info map, key is unique node id , value is node info
* <p>Returns node info map, key is unique node id , value is node info
*/
public static Map<UniqueId, NodeInfo> getAliveNodeInfoMap() {
return getAllNodeInfo().stream()
@@ -50,13 +48,18 @@ public class RayUtils {
for (int byteIndex = 0; byteIndex < UniqueId.LENGTH; ++byteIndex) {
nodeIdBytes[byteIndex] = String.valueOf(i).getBytes()[0];
}
NodeInfo nodeInfo = new NodeInfo(new UniqueId(nodeIdBytes),
"localhost" + i, "localhost" + i, -1,
"", "",
true, resources);
NodeInfo nodeInfo =
new NodeInfo(
new UniqueId(nodeIdBytes),
"localhost" + i,
"localhost" + i,
-1,
"",
"",
true,
resources);
nodeInfos.add(nodeInfo);
}
return nodeInfos;
}
}
@@ -20,7 +20,7 @@ public class ReflectionUtils {
/**
* For covariant return type, return the most specific method.
*
* @return all methods named by {@code methodName},
* <p>Returns all methods named by {@code methodName},
*/
public static List<Method> findMethods(Class<?> cls, String methodName) {
List<Class<?>> classes = new ArrayList<>();
@@ -55,10 +55,10 @@ public class ReflectionUtils {
}
/**
* <p>Gets a <code>List</code> of all interfaces implemented by the given
* class and its superclasses.</p>
* <p>The order is determined by looking through each interface in turn as
* declared in the source file and following its hierarchy up.</p>
* Gets a <code>List</code> of all interfaces implemented by the given class and its superclasses.
*
* <p>The order is determined by looking through each interface in turn as declared in the source
* file and following its hierarchy up.
*/
public static List<Class<?>> getAllInterfaces(Class<?> cls) {
if (cls == null) {
@@ -83,5 +83,4 @@ public class ReflectionUtils {
cls = cls.getSuperclass();
}
}
}
@@ -16,35 +16,35 @@ import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Resource Utility collects current OS and JVM resource usage information
*/
/** Resource Utility collects current OS and JVM resource usage information */
public class ResourceUtil {
public static final Logger LOG = LoggerFactory.getLogger(ResourceUtil.class);
/**
* Refer to: https://docs.oracle.com/javase/8/docs/jre/api/management/extension/com/sun/management/OperatingSystemMXBean.html
* Refer to:
* https://docs.oracle.com/javase/8/docs/jre/api/management/extension/com/sun/management/OperatingSystemMXBean.html
*/
private static OperatingSystemMXBean osmxb =
(OperatingSystemMXBean) ManagementFactory.getOperatingSystemMXBean();
/**
* Log current jvm process's memory detail
*/
/** Log current jvm process's memory detail */
public static void logProcessMemoryDetail() {
int mb = 1024 * 1024;
//Getting the runtime reference from system
// Getting the runtime reference from system
Runtime runtime = Runtime.getRuntime();
StringBuilder sb = new StringBuilder(32);
sb.append("used memory: ").append((runtime.totalMemory() - runtime.freeMemory()) / mb)
.append(", free memory: ").append(runtime.freeMemory() / mb)
.append(", total memory: ").append(runtime.totalMemory() / mb)
.append(", max memory: ").append(runtime.maxMemory() / mb);
sb.append("used memory: ")
.append((runtime.totalMemory() - runtime.freeMemory()) / mb)
.append(", free memory: ")
.append(runtime.freeMemory() / mb)
.append(", total memory: ")
.append(runtime.totalMemory() / mb)
.append(", max memory: ")
.append(runtime.maxMemory() / mb);
if (LOG.isInfoEnabled()) {
LOG.info(sb.toString());
@@ -52,8 +52,8 @@ public class ResourceUtil {
}
/**
* @return jvm heap usage ratio. note that one of the survivor space is not include in total
* memory while calculating this ratio.
* Returns jvm heap usage ratio. note that one of the survivor space is not include in total
* memory while calculating this ratio.
*/
public static double getJvmHeapUsageRatio() {
Runtime runtime = Runtime.getRuntime();
@@ -61,33 +61,27 @@ public class ResourceUtil {
}
/**
* @return jvm heap usage(in bytes).
* note that this value doesn't include one of the survivor space.
* Returns jvm heap usage(in bytes). note that this value doesn't include one of the survivor
* space.
*/
public static long getJvmHeapUsageInBytes() {
  final Runtime jvm = Runtime.getRuntime();
  final long committed = jvm.totalMemory();
  final long unused = jvm.freeMemory();
  // Used heap = memory currently held by the JVM minus the part of it that is still free.
  return committed - unused;
}
/**
* @return the total amount of physical memory in bytes.
*/
/** Returns the total amount of physical memory in bytes. */
public static long getSystemTotalMemory() {
  // Value comes straight from the operating-system MXBean.
  final long totalPhysicalBytes = osmxb.getTotalPhysicalMemorySize();
  return totalPhysicalBytes;
}
/**
* @return the used system physical memory in bytes
*/
/** Returns the used system physical memory in bytes */
public static long getSystemMemoryUsage() {
  // Used physical memory = total minus free, both read from the operating-system MXBean.
  return osmxb.getTotalPhysicalMemorySize() - osmxb.getFreePhysicalMemorySize();
}
/**
* @return the ratio of used system physical memory. This value is a double in the [0.0,1.0]
*/
/** Returns the ratio of used system physical memory. This value is a double in the [0.0,1.0] */
public static double getSystemMemoryUsageRatio() {
double totalMemory = osmxb.getTotalPhysicalMemorySize();
double freeMemory = osmxb.getFreePhysicalMemorySize();
@@ -95,18 +89,14 @@ public class ResourceUtil {
return 1 - ratio;
}
/**
* @return the cpu load for current jvm process. This value is a double in the [0.0,1.0]
*/
/** Returns the cpu load for current jvm process. This value is a double in the [0.0,1.0] */
public static double getProcessCpuUsage() {
  // CPU load of this JVM process as reported by the operating-system MXBean.
  final double processLoad = osmxb.getProcessCpuLoad();
  return processLoad;
}
/**
* @return the system cpu usage.
* This value is a double in the [0.0,1.0]
* We will try to use `vsar` to get cpu usage by default,
* and use MXBean if any exception raised.
* Returns the system cpu usage. This value is a double in the [0.0,1.0] We will try to use `vsar`
* to get cpu usage by default, and use MXBean if any exception raised.
*/
public static double getSystemCpuUsage() {
double cpuUsage = 0.0;
@@ -128,9 +118,7 @@ public class ResourceUtil {
return osmxb.getSystemCpuLoad();
}
/**
* Get system cpu util by vsar
*/
/** Get system cpu util by vsar */
public static double getSystemCpuUtilByVsar() throws Exception {
double cpuUsageFromVsar = 0.0;
String[] vsarCpuCommand = {"/bin/sh", "-c", "vsar --check --cpu -s util"};
@@ -156,16 +144,12 @@ public class ResourceUtil {
return cpuUsageFromVsar;
}
/**
* @returns the system load average for the last minute
*/
/** Returns the system load average for the last minute */
public static double getSystemLoadAverage() {
  // Load average as reported by the operating-system MXBean.
  final double loadAverage = osmxb.getSystemLoadAverage();
  return loadAverage;
}
/**
* @return system cpu cores num
*/
/** Returns system cpu cores num */
public static int getCpuCores() {
  // Number of processors available to the JVM, as reported by the operating-system MXBean.
  final int availableCores = osmxb.getAvailableProcessors();
  return availableCores;
}
@@ -174,44 +158,40 @@ public class ResourceUtil {
* Get containers by hostname of address
*
* @param containers container list
* @param containerHosts container hostname or address set
* @return matched containers
* @param containerHosts container hostname or address set
* @return matched containers
*/
public static List<Container> getContainersByHostname(
List<Container> containers,
Collection<String> containerHosts) {
List<Container> containers, Collection<String> containerHosts) {
return containers.stream()
.filter(container ->
containerHosts.contains(container.getHostname()) ||
containerHosts.contains(container.getAddress()))
.filter(
container ->
containerHosts.contains(container.getHostname())
|| containerHosts.contains(container.getAddress()))
.collect(Collectors.toList());
}
/**
* Get container by hostname
*
* @param hostName container hostname
* @return container
* @param hostName container hostname
* @return container
*/
public static Optional<Container> getContainerByHostname(
List<Container> containers,
String hostName) {
List<Container> containers, String hostName) {
return containers.stream()
.filter(container -> container.getHostname().equals(hostName) ||
container.getAddress().equals(hostName))
.filter(
container ->
container.getHostname().equals(hostName) || container.getAddress().equals(hostName))
.findFirst();
}
/**
* Get container by id
*
* @param containerID container id
* @return container
* @param containerID container id
* @return container
*/
public static Optional<Container> getContainerById(
List<Container> containers,
ContainerId containerID) {
List<Container> containers, ContainerId containerID) {
return containers.stream()
.filter(container -> container.getId().equals(containerID))
.findFirst();
@@ -11,5 +11,4 @@ public class Serializer {
public static <T> T decode(byte[] bytes) {
return FstSerializer.decode(bytes);
}
}
@@ -31,8 +31,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The streaming worker implementation class, it is ray actor. JobWorker is created by
* {@link JobMaster} through ray api, and JobMaster communicates with JobWorker through Ray.call().
* The streaming worker implementation class, it is ray actor. JobWorker is created by {@link
* JobMaster} through ray api, and JobMaster communicates with JobWorker through Ray.call().
*
* <p>The JobWorker is responsible for creating tasks and defines the methods of communication
* between workers.
@@ -49,27 +49,23 @@ public class JobWorker implements Serializable {
}
public final Object initialStateChangeLock = new Object();
/**
* isRecreate=true means this worker is initialized more than once after actor created.
*/
/** isRecreate=true means this worker is initialized more than once after actor created. */
public AtomicBoolean isRecreate = new AtomicBoolean(false);
public ContextBackend contextBackend;
private JobWorkerContext workerContext;
private ExecutionVertex executionVertex;
private StreamingWorkerConfig workerConfig;
/**
* The while-loop thread to read message, process message, and write results
*/
/** The while-loop thread to read message, process message, and write results */
private StreamTask task;
/**
* transferHandler handles messages by ray direct call
*/
/** transferHandler handles messages by ray direct call */
private TransferHandler transferHandler;
/**
* A flag to avoid duplicated rollback. Becomes true after requesting
* rollback, set to false when finish rollback.
* A flag to avoid duplicated rollback. Becomes true after requesting rollback, set to false when
* finish rollback.
*/
private boolean isNeedRollback = false;
private int rollbackCount = 0;
public JobWorker(ExecutionVertex executionVertex) {
@@ -80,7 +76,8 @@ public class JobWorker implements Serializable {
this.workerConfig = new StreamingWorkerConfig(executionVertex.getWorkerConfig());
this.contextBackend = ContextBackendFactory.getContextBackend(this.workerConfig);
LOG.info("Ray.getRuntimeContext().wasCurrentActorRestarted()={}",
LOG.info(
"Ray.getRuntimeContext().wasCurrentActorRestarted()={}",
Ray.getRuntimeContext().wasCurrentActorRestarted());
if (!Ray.getRuntimeContext().wasCurrentActorRestarted()) {
saveContext();
@@ -93,14 +90,14 @@ public class JobWorker implements Serializable {
byte[] bytes = CheckpointStateUtil.get(contextBackend, getJobWorkerContextKey());
if (bytes != null) {
JobWorkerContext context = Serializer.decode(bytes);
LOG.info("Worker recover from checkpoint state, byte len={}, context={}.", bytes.length,
context);
LOG.info(
"Worker recover from checkpoint state, byte len={}, context={}.", bytes.length, context);
init(context);
requestRollback("LoadCheckpoint request rollback in new actor.");
} else {
LOG.error(
"Worker is reconstructed, but can't load checkpoint. " +
"Check whether you checkpoint state is reliable. Current checkpoint state is {}.",
"Worker is reconstructed, but can't load checkpoint. "
+ "Check whether you checkpoint state is reliable. Current checkpoint state is {}.",
contextBackend.getClass().getName());
}
}
@@ -108,19 +105,23 @@ public class JobWorker implements Serializable {
public synchronized void saveContext() {
byte[] contextBytes = Serializer.encode(workerContext);
String key = getJobWorkerContextKey();
LOG.info("Saving context, worker context={}, serialized byte length={}, key={}.", workerContext,
contextBytes.length, key);
LOG.info(
"Saving context, worker context={}, serialized byte length={}, key={}.",
workerContext,
contextBytes.length,
key);
CheckpointStateUtil.put(contextBackend, key, contextBytes);
}
/**
* Initialize JobWorker and data communication pipeline.
*/
/** Initialize JobWorker and data communication pipeline. */
public Boolean init(JobWorkerContext workerContext) {
// IMPORTANT: some test cases depends on this log to find workers' pid,
// be careful when changing this log.
LOG.info("Initiating job worker: {}. Worker context is: {}, pid={}.",
workerContext.getWorkerName(), workerContext, EnvUtil.getJvmPid());
LOG.info(
"Initiating job worker: {}. Worker context is: {}, pid={}.",
workerContext.getWorkerName(),
workerContext,
EnvUtil.getJvmPid());
this.workerContext = workerContext;
this.executionVertex = workerContext.getExecutionVertex();
@@ -136,20 +137,25 @@ public class JobWorker implements Serializable {
/**
* Start worker's stream tasks with specific checkpoint ID.
*
* @return a {@link CallResult} with {@link ChannelRecoverInfo},
* contains {@link ChannelCreationStatus} of each input queue.
* <p>Returns a {@link CallResult} with {@link ChannelRecoverInfo}, contains {@link
* ChannelCreationStatus} of each input queue.
*/
public CallResult<ChannelRecoverInfo> rollback(Long checkpointId, Long startRollbackTs) {
synchronized (initialStateChangeLock) {
if (task != null && task.isAlive() && checkpointId == task.lastCheckpointId &&
task.isInitialState) {
if (task != null
&& task.isAlive()
&& checkpointId == task.lastCheckpointId
&& task.isInitialState) {
return CallResult.skipped("Task is already in initial state, skip this rollback.");
}
}
long remoteCallCost = System.currentTimeMillis() - startRollbackTs;
LOG.info("Start rollback[{}], checkpoint is {}, remote call cost {}ms.",
executionVertex.getExecutionJobVertexName(), checkpointId, remoteCallCost);
LOG.info(
"Start rollback[{}], checkpoint is {}, remote call cost {}ms.",
executionVertex.getExecutionJobVertexName(),
checkpointId,
remoteCallCost);
rollbackCount++;
if (rollbackCount > 1) {
@@ -157,7 +163,7 @@ public class JobWorker implements Serializable {
}
try {
//Init transfer
// Init transfer
TransferChannelType channelType = workerConfig.transferConfig.channelType();
if (TransferChannelType.NATIVE_CHANNEL == channelType) {
transferHandler = new TransferHandler();
@@ -174,8 +180,10 @@ public class JobWorker implements Serializable {
ChannelRecoverInfo channelRecoverInfo = task.recover(isRecreate.get());
isNeedRollback = false;
LOG.info("Rollback job worker success, checkpoint is {}, channelRecoverInfo is {}.",
checkpointId, channelRecoverInfo);
LOG.info(
"Rollback job worker success, checkpoint is {}, channelRecoverInfo is {}.",
checkpointId,
channelRecoverInfo);
return CallResult.success(channelRecoverInfo);
} catch (Exception e) {
@@ -184,13 +192,11 @@ public class JobWorker implements Serializable {
}
}
/**
* Create tasks based on the processor corresponding of the operator.
*/
/** Create tasks based on the processor corresponding to the operator. */
private StreamTask createStreamTask(long checkpointId) {
StreamTask task;
StreamProcessor streamProcessor = ProcessBuilder
.buildProcessor(executionVertex.getStreamOperator());
StreamProcessor streamProcessor =
ProcessBuilder.buildProcessor(executionVertex.getStreamOperator());
LOG.debug("Stream processor created: {}.", streamProcessor);
if (streamProcessor instanceof SourceProcessor) {
@@ -208,9 +214,7 @@ public class JobWorker implements Serializable {
// Checkpoint
// ----------------------------------------------------------------------
/**
* Trigger source job worker checkpoint
*/
/** Trigger source job worker checkpoint */
public Boolean triggerCheckpoint(Long barrierId) {
LOG.info("Receive trigger, barrierId is {}.", barrierId);
if (task != null) {
@@ -228,9 +232,11 @@ public class JobWorker implements Serializable {
}
public Boolean clearExpiredCheckpoint(Long expiredStateCpId, Long expiredQueueCpId) {
LOG.info("Clear expired checkpoint state, checkpoint id is {}; " +
"Clear expired queue msg, checkpoint id is {}",
expiredStateCpId, expiredQueueCpId);
LOG.info(
"Clear expired checkpoint state, checkpoint id is {}; "
+ "Clear expired queue msg, checkpoint id is {}",
expiredStateCpId,
expiredQueueCpId);
if (task != null) {
if (expiredStateCpId > 0) {
task.clearExpiredCpState(expiredStateCpId);
@@ -247,13 +253,14 @@ public class JobWorker implements Serializable {
LOG.info("Request rollback.");
isNeedRollback = true;
isRecreate.set(true);
boolean requestRet = RemoteCallMaster.requestJobWorkerRollback(
workerContext.getMaster(), new WorkerRollbackRequest(
workerContext.getWorkerActorId(),
exceptionMsg,
EnvUtil.getHostName(),
EnvUtil.getJvmPid()
));
boolean requestRet =
RemoteCallMaster.requestJobWorkerRollback(
workerContext.getMaster(),
new WorkerRollbackRequest(
workerContext.getWorkerActorId(),
exceptionMsg,
EnvUtil.getHostName(),
EnvUtil.getJvmPid()));
if (!requestRet) {
LOG.warn("Job worker request rollback failed! exceptionMsg={}.", exceptionMsg);
}
@@ -262,8 +269,10 @@ public class JobWorker implements Serializable {
public Boolean checkIfNeedRollback(Long startCallTs) {
// No save checkpoint in this query.
long remoteCallCost = System.currentTimeMillis() - startCallTs;
LOG.info("Finished checking if need to rollback with result: {}, rpc delay={}ms.",
isNeedRollback, remoteCallCost);
LOG.info(
"Finished checking if need to rollback with result: {}, rpc delay={}ms.",
isNeedRollback,
remoteCallCost);
return isNeedRollback;
}
@@ -286,12 +295,11 @@ public class JobWorker implements Serializable {
private String getJobWorkerContextKey() {
return workerConfig.checkpointConfig.jobWorkerContextCpPrefixKey()
+ workerConfig.commonConfig.jobName()
+ "_" + executionVertex.getExecutionVertexId();
+ "_"
+ executionVertex.getExecutionVertexId();
}
/**
* Used by upstream streaming queue to send data to this actor
*/
/** Used by upstream streaming queue to send data to this actor */
public void onReaderMessage(byte[] buffer) {
if (transferHandler != null) {
transferHandler.onReaderMessage(buffer);
@@ -308,9 +316,7 @@ public class JobWorker implements Serializable {
return transferHandler.onReaderMessageSync(buffer);
}
/**
* Used by downstream streaming queue to send data to this actor
*/
/** Used by downstream streaming queue to send data to this actor */
public void onWriterMessage(byte[] buffer) {
if (transferHandler != null) {
transferHandler.onWriterMessage(buffer);
@@ -327,5 +333,4 @@ public class JobWorker implements Serializable {
}
return transferHandler.onWriterMessageSync(buffer);
}
}
@@ -13,24 +13,16 @@ import io.ray.streaming.runtime.python.GraphPbBuilder;
import java.io.Serializable;
import java.util.Map;
/**
* Job worker context of java type.
*/
/** Job worker context of java type. */
public class JobWorkerContext implements Serializable {
/**
* JobMaster actor.
*/
/** JobMaster actor. */
private ActorHandle<JobMaster> master;
/**
* Worker's vertex info.
*/
/** Worker's vertex info. */
private ExecutionVertex executionVertex;
public JobWorkerContext(
ActorHandle<JobMaster> master,
ExecutionVertex executionVertex) {
public JobWorkerContext(ActorHandle<JobMaster> master, ExecutionVertex executionVertex) {
this.master = master;
this.executionVertex = executionVertex;
}
@@ -81,14 +73,13 @@ public class JobWorkerContext implements Serializable {
RemoteCall.ExecutionVertexContext executionVertexContext =
new GraphPbBuilder().buildExecutionVertexContext(executionVertex);
byte[] contextBytes = RemoteCall.PythonJobWorkerContext.newBuilder()
.setMasterActor(
ByteString.copyFrom((((NativeActorHandle) (master)).toBytes())))
.setExecutionVertexContext(executionVertexContext)
.build()
.toByteArray();
byte[] contextBytes =
RemoteCall.PythonJobWorkerContext.newBuilder()
.setMasterActor(ByteString.copyFrom((((NativeActorHandle) (master)).toBytes())))
.setExecutionVertexContext(executionVertexContext)
.build()
.toByteArray();
return contextBytes;
}
}
@@ -15,19 +15,14 @@ import io.ray.streaming.state.keystate.state.MapState;
import io.ray.streaming.state.keystate.state.ValueState;
import java.util.Map;
/**
* Use Ray to implement RuntimeContext.
*/
/** Use Ray to implement RuntimeContext. */
public class StreamingRuntimeContext implements RuntimeContext {
/**
* Backend for keyed state. This might be empty if we're not on a keyed stream.
*/
/** Backend for keyed state. This might be empty if we're not on a keyed stream. */
protected transient KeyStateBackend keyStateBackend;
/**
* Backend for operator state. This might be empty
*/
/** Backend for operator state. This might be empty */
protected transient OperatorStateBackend operatorStateBackend;
private int taskId;
private int taskIndex;
private int parallelism;
@@ -35,8 +30,7 @@ public class StreamingRuntimeContext implements RuntimeContext {
private Map<String, String> config;
public StreamingRuntimeContext(
ExecutionVertex executionVertex, Map<String, String> config,
int parallelism) {
ExecutionVertex executionVertex, Map<String, String> config, int parallelism) {
this.taskId = executionVertex.getExecutionVertexId();
this.config = config;
this.taskIndex = executionVertex.getExecutionVertexIndex();
@@ -118,8 +112,7 @@ public class StreamingRuntimeContext implements RuntimeContext {
}
protected void stateSanityCheck(
AbstractStateDescriptor stateDescriptor,
AbstractKeyStateBackend backend) {
AbstractStateDescriptor stateDescriptor, AbstractKeyStateBackend backend) {
Preconditions.checkNotNull(stateDescriptor, "The state properties must not be null");
Preconditions.checkNotNull(backend, "backend must not be null");
}
@@ -33,8 +33,7 @@ public abstract class InputStreamTask extends StreamTask {
}
@Override
protected void init() {
}
protected void init() {}
@Override
public void run() {
@@ -71,7 +70,9 @@ public abstract class InputStreamTask extends StreamTask {
queueBarrier.getData().get(barrierData);
RemoteCall.Barrier barrierPb = RemoteCall.Barrier.parseFrom(barrierData);
final long checkpointId = barrierPb.getId();
LOG.info("Start to do checkpoint {}, worker name is {}.", checkpointId,
LOG.info(
"Start to do checkpoint {}, worker name is {}.",
checkpointId,
jobWorker.getWorkerContext().getWorkerName());
final Map<String, OffsetInfo> inputPoints = queueBarrier.getInputOffsets();
@@ -80,8 +81,8 @@ public abstract class InputStreamTask extends StreamTask {
}
}
} catch (Throwable throwable) {
if (throwable instanceof ChannelInterruptException ||
ExceptionUtils.getRootCause(throwable) instanceof ChannelInterruptException) {
if (throwable instanceof ChannelInterruptException
|| ExceptionUtils.getRootCause(throwable) instanceof ChannelInterruptException) {
LOG.info("queue has stopped.");
} else {
// error occurred, need to rollback
@@ -95,8 +96,6 @@ public abstract class InputStreamTask extends StreamTask {
@Override
public String toString() {
return MoreObjects.toStringHelper(this)
.add("processor", processor)
.toString();
return MoreObjects.toStringHelper(this).add("processor", processor).toString();
}
}
@@ -3,9 +3,7 @@ package io.ray.streaming.runtime.worker.tasks;
import io.ray.streaming.runtime.core.processor.Processor;
import io.ray.streaming.runtime.worker.JobWorker;
/**
* Input stream task with 1 input. Such as: map operator.
*/
/** Input stream task with 1 input. Such as: map operator. */
public class OneInputStreamTask extends InputStreamTask {
public OneInputStreamTask(Processor inputProcessor, JobWorker jobWorker, long lastCheckpointId) {
@@ -16,11 +16,9 @@ public class SourceStreamTask extends StreamTask {
private final SourceProcessor sourceProcessor;
/**
* The pending barrier ID to be triggered.
*/
/** The pending barrier ID to be triggered. */
private final AtomicReference<Long> pendingBarrier = new AtomicReference<>();
private long lastCheckpointId = 0;
/**
@@ -33,8 +31,7 @@ public class SourceStreamTask extends StreamTask {
}
@Override
protected void init() {
}
protected void init() {}
@Override
public void run() {
@@ -50,15 +47,19 @@ public class SourceStreamTask extends StreamTask {
// Important: because cp maybe timeout, master will use the old checkpoint id again
if (pendingBarrier.compareAndSet(barrierId, null)) {
// source fetcher only have outputPoints
LOG.info("Start to do checkpoint {}, worker name is {}.",
barrierId, jobWorker.getWorkerContext().getWorkerName());
LOG.info(
"Start to do checkpoint {}, worker name is {}.",
barrierId,
jobWorker.getWorkerContext().getWorkerName());
doCheckpoint(barrierId, null);
LOG.info("Finish to do checkpoint {}.", barrierId);
} else {
// pendingCheckpointId has modify, should not happen
LOG.warn("Pending checkpointId modify unexpected, expect={}, now={}.", barrierId,
LOG.warn(
"Pending checkpointId modify unexpected, expect={}, now={}.",
barrierId,
pendingBarrier.get());
}
}
@@ -66,8 +67,8 @@ public class SourceStreamTask extends StreamTask {
sourceProcessor.fetch();
}
} catch (Throwable e) {
if (e instanceof ChannelInterruptException ||
ExceptionUtils.getRootCause(e) instanceof ChannelInterruptException) {
if (e instanceof ChannelInterruptException
|| ExceptionUtils.getRootCause(e) instanceof ChannelInterruptException) {
LOG.info("queue has stopped.");
} else {
// occur error, need to rollback
@@ -63,8 +63,9 @@ public abstract class StreamTask implements Runnable {
this.checkpointState = jobWorker.contextBackend;
this.lastCheckpointId = lastCheckpointId;
this.thread = new Thread(Ray.wrapRunnable(this),
this.getClass().getName() + "-" + System.currentTimeMillis());
this.thread =
new Thread(
Ray.wrapRunnable(this), this.getClass().getName() + "-" + System.currentTimeMillis());
this.thread.setDaemon(true);
}
@@ -98,18 +99,24 @@ public abstract class StreamTask implements Runnable {
}
/**
* Load checkpoint and build upstream and downstream data transmission
* channels according to {@link ExecutionVertex}.
* Load checkpoint and build upstream and downstream data transmission channels according to
* {@link ExecutionVertex}.
*/
private void prepareTask(boolean isRecreate) {
LOG.info("Preparing stream task, isRecreate={}.", isRecreate);
ExecutionVertex executionVertex = jobWorker.getExecutionVertex();
// set vertex info into config for native using
jobWorker.getWorkerConfig().workerInternalConfig.setProperty(
WorkerInternalConfig.WORKER_NAME_INTERNAL, executionVertex.getExecutionVertexName());
jobWorker.getWorkerConfig().workerInternalConfig.setProperty(
WorkerInternalConfig.OP_NAME_INTERNAL, executionVertex.getExecutionJobVertexName());
jobWorker
.getWorkerConfig()
.workerInternalConfig
.setProperty(
WorkerInternalConfig.WORKER_NAME_INTERNAL, executionVertex.getExecutionVertexName());
jobWorker
.getWorkerConfig()
.workerInternalConfig
.setProperty(
WorkerInternalConfig.OP_NAME_INTERNAL, executionVertex.getExecutionJobVertexName());
OperatorCheckpointInfo operatorCheckpointInfo = new OperatorCheckpointInfo();
byte[] bytes = null;
@@ -118,7 +125,9 @@ public abstract class StreamTask implements Runnable {
// in rescaling or something like that.
if (isRecreate) {
String cpKey = genOpCheckpointKey(lastCheckpointId);
LOG.info("Getting task checkpoints from state, cpKey={}, checkpointId={}.", cpKey,
LOG.info(
"Getting task checkpoints from state, cpKey={}, checkpointId={}.",
cpKey,
lastCheckpointId);
bytes = CheckpointStateUtil.get(checkpointState, cpKey);
if (bytes == null) {
@@ -133,31 +142,36 @@ public abstract class StreamTask implements Runnable {
processor.loadCheckpoint(operatorCheckpointInfo.processorCheckpoint);
LOG.info(
"Stream task recover from checkpoint state, checkpoint bytes len={}, checkpointInfo={}.",
bytes.length, operatorCheckpointInfo);
bytes.length,
operatorCheckpointInfo);
}
// writer
if (!executionVertex.getOutputEdges().isEmpty()) {
LOG.info("Register queue writer, channels={}, outputCheckpoints={}.",
executionVertex.getOutputChannelIdList(), operatorCheckpointInfo.outputPoints);
writer = new DataWriter(
LOG.info(
"Register queue writer, channels={}, outputCheckpoints={}.",
executionVertex.getOutputChannelIdList(),
executionVertex.getOutputActorList(),
operatorCheckpointInfo.outputPoints,
jobWorker.getWorkerConfig()
);
operatorCheckpointInfo.outputPoints);
writer =
new DataWriter(
executionVertex.getOutputChannelIdList(),
executionVertex.getOutputActorList(),
operatorCheckpointInfo.outputPoints,
jobWorker.getWorkerConfig());
}
// reader
if (!executionVertex.getInputEdges().isEmpty()) {
LOG.info("Register queue reader, channels={}, inputCheckpoints={}.",
executionVertex.getInputChannelIdList(), operatorCheckpointInfo.inputPoints);
reader = new DataReader(
LOG.info(
"Register queue reader, channels={}, inputCheckpoints={}.",
executionVertex.getInputChannelIdList(),
executionVertex.getInputActorList(),
operatorCheckpointInfo.inputPoints,
jobWorker.getWorkerConfig()
);
operatorCheckpointInfo.inputPoints);
reader =
new DataReader(
executionVertex.getInputChannelIdList(),
executionVertex.getInputActorList(),
operatorCheckpointInfo.inputPoints,
jobWorker.getWorkerConfig());
}
openProcessor();
@@ -186,27 +200,31 @@ public abstract class StreamTask implements Runnable {
opGroupedActor.get(opName).add(executionVertex.getOutputActorList().get(i));
opPartitionMap.put(opName, edge.getPartition());
}
opPartitionMap.keySet().forEach(opName -> {
collectors.add(new OutputCollector(
writer, opGroupedChannelId.get(opName),
opGroupedActor.get(opName), opPartitionMap.get(opName)
));
});
opPartitionMap
.keySet()
.forEach(
opName -> {
collectors.add(
new OutputCollector(
writer,
opGroupedChannelId.get(opName),
opGroupedActor.get(opName),
opPartitionMap.get(opName)));
});
RuntimeContext runtimeContext = new StreamingRuntimeContext(executionVertex,
jobWorker.getWorkerConfig().configMap, executionVertex.getParallelism());
RuntimeContext runtimeContext =
new StreamingRuntimeContext(
executionVertex,
jobWorker.getWorkerConfig().configMap,
executionVertex.getParallelism());
processor.open(collectors, runtimeContext);
}
/**
* Task initialization related work.
*/
/** Task initialization related work. */
protected abstract void init() throws Exception;
/**
* Close running tasks.
*/
/** Close running tasks. */
public void close() {
this.running = false;
if (thread.isAlive() && !Ray.getRuntimeContext().isSingleProcess()) {
@@ -230,23 +248,24 @@ public abstract class StreamTask implements Runnable {
Map<String, OffsetInfo> outputPoints = null;
if (writer != null) {
outputPoints = writer.getOutputCheckpoints();
RemoteCall.Barrier barrierPb =
RemoteCall.Barrier.newBuilder().setId(checkpointId).build();
RemoteCall.Barrier barrierPb = RemoteCall.Barrier.newBuilder().setId(checkpointId).build();
ByteBuffer byteBuffer = ByteBuffer.wrap(barrierPb.toByteArray());
byteBuffer.order(ByteOrder.nativeOrder());
writer.broadcastBarrier(checkpointId, byteBuffer);
}
LOG.info("Start do checkpoint, cp id={}, inputPoints={}, outputPoints={}.", checkpointId,
inputPoints, outputPoints);
LOG.info(
"Start do checkpoint, cp id={}, inputPoints={}, outputPoints={}.",
checkpointId,
inputPoints,
outputPoints);
this.lastCheckpointId = checkpointId;
Serializable processorCheckpoint = processor.saveCheckpoint();
try {
OperatorCheckpointInfo opCpInfo =
new OperatorCheckpointInfo(inputPoints, outputPoints, processorCheckpoint,
checkpointId);
new OperatorCheckpointInfo(inputPoints, outputPoints, processorCheckpoint, checkpointId);
saveCpStateAndReport(opCpInfo, checkpointId);
} catch (Exception e) {
// there will be exceptions when flush state to backend.
@@ -258,8 +277,7 @@ public abstract class StreamTask implements Runnable {
}
private void saveCpStateAndReport(
OperatorCheckpointInfo operatorCheckpointInfo,
long checkpointId) {
OperatorCheckpointInfo operatorCheckpointInfo, long checkpointId) {
saveCp(operatorCheckpointInfo, checkpointId);
reportCommit(checkpointId);
@@ -269,8 +287,11 @@ public abstract class StreamTask implements Runnable {
private void saveCp(OperatorCheckpointInfo operatorCheckpointInfo, long checkpointId) {
byte[] bytes = Serializer.encode(operatorCheckpointInfo);
String cpKey = genOpCheckpointKey(checkpointId);
LOG.info("Saving task checkpoint, cpKey={}, byte len={}, checkpointInfo={}.", cpKey,
bytes.length, operatorCheckpointInfo);
LOG.info(
"Saving task checkpoint, cpKey={}, byte len={}, checkpointInfo={}.",
cpKey,
bytes.length,
operatorCheckpointInfo);
synchronized (checkpointState) {
if (outdatedCheckpoints.contains(checkpointId)) {
LOG.info("Outdated checkpoint, skip save checkpoint.");
@@ -284,8 +305,8 @@ public abstract class StreamTask implements Runnable {
private void reportCommit(long checkpointId) {
final JobWorkerContext context = jobWorker.getWorkerContext();
LOG.info("Report commit async, checkpoint id {}.", checkpointId);
RemoteCallMaster.reportJobWorkerCommitAsync(context.getMaster(),
new WorkerCommitReport(context.getWorkerActorId(), checkpointId));
RemoteCallMaster.reportJobWorkerCommitAsync(
context.getMaster(), new WorkerCommitReport(context.getWorkerActorId(), checkpointId));
}
public void notifyCheckpointTimeout(long checkpointId) {
@@ -335,7 +356,11 @@ public abstract class StreamTask implements Runnable {
// TODO: need to support job restart and actorId changed
final JobWorkerContext context = jobWorker.getWorkerContext();
return jobWorker.getWorkerConfig().checkpointConfig.jobWorkerOpCpPrefixKey()
+ context.getJobName() + "_" + context.getWorkerName() + "_" + checkpointId;
+ context.getJobName()
+ "_"
+ context.getWorkerName()
+ "_"
+ checkpointId;
}
// ----------------------------------------------------------------------
@@ -4,9 +4,7 @@ import io.ray.streaming.runtime.core.processor.Processor;
import io.ray.streaming.runtime.core.processor.TwoInputProcessor;
import io.ray.streaming.runtime.worker.JobWorker;
/**
* Input stream task with 2 inputs. Such as: join operator.
*/
/** Input stream task with 2 inputs. Such as: join operator. */
public class TwoInputStreamTask extends InputStreamTask {
public TwoInputStreamTask(
@@ -19,5 +17,4 @@ public class TwoInputStreamTask extends InputStreamTask {
((TwoInputProcessor) (super.processor)).setLeftStream(leftStream);
((TwoInputProcessor) (super.processor)).setRightStream(rightStream);
}
}
@@ -24,13 +24,17 @@ public abstract class BaseUnitTest {
@BeforeMethod
public void testBegin(Method method) {
LOG.info(">>>>>>>>>>>>>>>>>>>> Test case: {}.{} began >>>>>>>>>>>>>>>>>>>>",
method.getDeclaringClass(), method.getName());
LOG.info(
">>>>>>>>>>>>>>>>>>>> Test case: {}.{} began >>>>>>>>>>>>>>>>>>>>",
method.getDeclaringClass(),
method.getName());
}
@AfterMethod
public void testEnd(Method method) {
LOG.info(">>>>>>>>>>>>>>>>>>>> Test case: {}.{} end >>>>>>>>>>>>>>>>>>>>",
method.getDeclaringClass(), method.getName());
LOG.info(
">>>>>>>>>>>>>>>>>>>> Test case: {}.{} end >>>>>>>>>>>>>>>>>>>>",
method.getDeclaringClass(),
method.getName());
}
}
@@ -15,4 +15,4 @@ public class TestHelper {
public static boolean isUT() {
return UT_FLAG;
}
}
}
@@ -43,20 +43,25 @@ public class ExecutionGraphTest extends BaseUnitTest {
Assert.assertEquals(executionJobVertices.size(), jobGraph.getJobVertices().size());
int totalVertexNum = jobGraph.getJobVertices().stream()
.mapToInt(JobVertex::getParallelism).sum();
int totalVertexNum =
jobGraph.getJobVertices().stream().mapToInt(JobVertex::getParallelism).sum();
Assert.assertEquals(executionGraph.getAllExecutionVertices().size(), totalVertexNum);
Assert.assertEquals(executionGraph.getAllExecutionVertices().size(),
Assert.assertEquals(
executionGraph.getAllExecutionVertices().size(),
executionGraph.getExecutionVertexIdGenerator().get());
executionGraph.getAllExecutionVertices().forEach(vertex -> {
Assert.assertNotNull(vertex.getStreamOperator());
Assert.assertNotNull(vertex.getExecutionJobVertexName());
Assert.assertNotNull(vertex.getVertexType());
Assert.assertNotNull(vertex.getLanguage());
Assert.assertEquals(vertex.getExecutionVertexName(),
vertex.getExecutionJobVertexName() + "-" + vertex.getExecutionVertexIndex());
});
executionGraph
.getAllExecutionVertices()
.forEach(
vertex -> {
Assert.assertNotNull(vertex.getStreamOperator());
Assert.assertNotNull(vertex.getExecutionJobVertexName());
Assert.assertNotNull(vertex.getVertexType());
Assert.assertNotNull(vertex.getLanguage());
Assert.assertEquals(
vertex.getExecutionVertexName(),
vertex.getExecutionJobVertexName() + "-" + vertex.getExecutionVertexIndex());
});
int startIndex = 0;
ExecutionJobVertex upStream = executionJobVertices.get(startIndex);
@@ -65,13 +70,17 @@ public class ExecutionGraphTest extends BaseUnitTest {
List<ExecutionVertex> upStreamVertices = upStream.getExecutionVertices();
List<ExecutionVertex> downStreamVertices = downStream.getExecutionVertices();
upStreamVertices.forEach(vertex -> {
Assert.assertEquals((double) vertex.getResource().get(ResourceType.CPU.name()), 2.0);
vertex.getOutputEdges().forEach(upStreamOutPutEdge -> {
Assert
.assertTrue(downStreamVertices.contains(upStreamOutPutEdge.getTargetExecutionVertex()));
});
});
upStreamVertices.forEach(
vertex -> {
Assert.assertEquals((double) vertex.getResource().get(ResourceType.CPU.name()), 2.0);
vertex
.getOutputEdges()
.forEach(
upStreamOutPutEdge -> {
Assert.assertTrue(
downStreamVertices.contains(upStreamOutPutEdge.getTargetExecutionVertex()));
});
});
}
public static ExecutionGraph buildExecutionGraph(GraphManager graphManager) {
@@ -84,8 +93,8 @@ public class ExecutionGraphTest extends BaseUnitTest {
public static JobGraph buildJobGraph() {
StreamingContext streamingContext = StreamingContext.buildContext();
DataStream<String> dataStream = DataStreamSource.fromCollection(streamingContext,
Lists.newArrayList("a", "b", "c"));
DataStream<String> dataStream =
DataStreamSource.fromCollection(streamingContext, Lists.newArrayList("a", "b", "c"));
StreamSink streamSink = dataStream.sink(x -> LOG.info(x));
Map<String, String> jobConfig = new HashMap<>();
@@ -94,10 +103,9 @@ public class ExecutionGraphTest extends BaseUnitTest {
jobConfig.put(ResourceConfig.TASK_RESOURCE_CPU, "2.0");
jobConfig.put(ResourceConfig.TASK_RESOURCE_MEM, "2.0");
JobGraphBuilder jobGraphBuilder = new JobGraphBuilder(
Lists.newArrayList(streamSink), "test", jobConfig);
JobGraphBuilder jobGraphBuilder =
new JobGraphBuilder(Lists.newArrayList(streamSink), "test", jobConfig);
return jobGraphBuilder.build();
}
}
@@ -45,8 +45,7 @@ public class HybridStreamTest {
@Test(timeOut = 60000)
public void testHybridDataStream() throws Exception {
Ray.shutdown();
Preconditions.checkArgument(
EnvUtil.executeCommand(ImmutableList.of("ray", "stop"), 5));
Preconditions.checkArgument(EnvUtil.executeCommand(ImmutableList.of("ray", "stop"), 5));
String sinkFileName = "/tmp/testHybridDataStream.txt";
Files.deleteIfExists(Paths.get(sinkFileName));
@@ -59,18 +58,22 @@ public class HybridStreamTest {
.map("ray.streaming.tests.test_hybrid_stream", "map_func1")
.filter("ray.streaming.tests.test_hybrid_stream", "filter_func1")
.asJavaStream()
.sink((SinkFunction<Object>) value -> {
LOG.info("HybridStreamTest: {}", value);
try {
if (!Files.exists(Paths.get(sinkFileName))) {
Files.createFile(Paths.get(sinkFileName));
}
Files.write(Paths.get(sinkFileName), value.toString().getBytes(),
StandardOpenOption.APPEND);
} catch (IOException e) {
throw new RuntimeException(e);
}
});
.sink(
(SinkFunction<Object>)
value -> {
LOG.info("HybridStreamTest: {}", value);
try {
if (!Files.exists(Paths.get(sinkFileName))) {
Files.createFile(Paths.get(sinkFileName));
}
Files.write(
Paths.get(sinkFileName),
value.toString().getBytes(),
StandardOpenOption.APPEND);
} catch (IOException e) {
throw new RuntimeException(e);
}
});
context.execute("HybridStreamTestJob");
int sleptTime = 0;
TimeUnit.SECONDS.sleep(3);
@@ -94,5 +97,4 @@ public class HybridStreamTest {
context.stop();
LOG.info("HybridStreamTest succeed");
}
}
@@ -35,18 +35,22 @@ public class UnionStreamTest {
DataStreamSource.fromCollection(context, Arrays.asList(1, 1));
streamSource1
.union(streamSource2, streamSource3)
.sink((SinkFunction<Integer>) value -> {
LOG.info("UnionStreamTest, sink: {}", value);
try {
if (!Files.exists(Paths.get(sinkFileName))) {
Files.createFile(Paths.get(sinkFileName));
}
Files.write(Paths.get(sinkFileName), value.toString().getBytes(),
StandardOpenOption.APPEND);
} catch (IOException e) {
throw new RuntimeException(e);
}
});
.sink(
(SinkFunction<Integer>)
value -> {
LOG.info("UnionStreamTest, sink: {}", value);
try {
if (!Files.exists(Paths.get(sinkFileName))) {
Files.createFile(Paths.get(sinkFileName));
}
Files.write(
Paths.get(sinkFileName),
value.toString().getBytes(),
StandardOpenOption.APPEND);
} catch (IOException e) {
throw new RuntimeException(e);
}
});
context.execute("UnionStreamTest");
int sleptTime = 0;
TimeUnit.SECONDS.sleep(3);
@@ -68,5 +72,4 @@ public class UnionStreamTest {
context.stop();
LOG.info("HybridStreamTest succeed");
}
}
@@ -37,19 +37,21 @@ public class WordCountTest extends BaseUnitTest implements Serializable {
text.add("hello world eagle eagle eagle");
DataStreamSource<String> streamSource = DataStreamSource.fromCollection(streamingContext, text);
streamSource
.flatMap((FlatMapFunction<String, WordAndCount>) (value, collector) -> {
String[] records = value.split(" ");
for (String record : records) {
collector.collect(new WordAndCount(record, 1));
}
})
.flatMap(
(FlatMapFunction<String, WordAndCount>)
(value, collector) -> {
String[] records = value.split(" ");
for (String record : records) {
collector.collect(new WordAndCount(record, 1));
}
})
.filter(pair -> !pair.word.contains("world"))
.keyBy(pair -> pair.word)
.reduce((ReduceFunction<WordAndCount>) (oldValue, newValue) ->
new WordAndCount(oldValue.word,
oldValue.count + newValue.count))
.sink((SinkFunction<WordAndCount>)
result -> wordCount.put(result.word, result.count));
.reduce(
(ReduceFunction<WordAndCount>)
(oldValue, newValue) ->
new WordAndCount(oldValue.word, oldValue.count + newValue.count))
.sink((SinkFunction<WordAndCount>) result -> wordCount.put(result.word, result.count));
streamingContext.execute("testWordCount");
@@ -74,5 +76,4 @@ public class WordCountTest extends BaseUnitTest implements Serializable {
this.count = count;
}
}
}
@@ -31,5 +31,4 @@ public class JobMasterTest extends BaseUnitTest {
Assert.assertNull(jobMaster.getJobMasterActor());
Assert.assertFalse(jobMaster.init(false));
}
}

Some files were not shown because too many files have changed in this diff Show More