[Streaming] Fault Tolerance Implementation (#10595)

This commit is contained in:
Lixin Wei
2020-09-05 16:40:47 +08:00
committed by GitHub
parent 31f8ce4768
commit f31ee84bfd
161 changed files with 7071 additions and 1239 deletions
+70 -42
View File
@@ -5,6 +5,9 @@ import ray
import ray.streaming.generated.remote_call_pb2 as remote_call_pb
import ray.streaming.operator as operator
import ray.streaming.partition as partition
from ray._raylet import ActorID
from ray.actor import ActorHandle
from ray.streaming.config import Config
from ray.streaming.generated.streaming_pb2 import Language
logger = logging.getLogger(__name__)
@@ -27,10 +30,12 @@ class NodeType(enum.Enum):
class ExecutionEdge:
def __init__(self, edge_pb, language):
self.source_execution_vertex_id = edge_pb.source_execution_vertex_id
self.target_execution_vertex_id = edge_pb.target_execution_vertex_id
partition_bytes = edge_pb.partition
def __init__(self, execution_edge_pb, language):
self.source_execution_vertex_id = execution_edge_pb \
.source_execution_vertex_id
self.target_execution_vertex_id = execution_edge_pb \
.target_execution_vertex_id
partition_bytes = execution_edge_pb.partition
# Sink node doesn't have partition function,
# so we only deserialize partition_bytes when it's not None or empty
if language == Language.PYTHON and partition_bytes:
@@ -38,50 +43,73 @@ class ExecutionEdge:
class ExecutionVertex:
def __init__(self, vertex_pb):
self.execution_vertex_id = vertex_pb.execution_vertex_id
self.execution_job_vertex_Id = vertex_pb.execution_job_vertex_Id
self.execution_job_vertex_name = vertex_pb.execution_job_vertex_name
self.execution_vertex_index = vertex_pb.execution_vertex_index
self.parallelism = vertex_pb.parallelism
if vertex_pb.language == Language.PYTHON:
operator_bytes = vertex_pb.operator # python operator descriptor
if vertex_pb.chained:
worker_actor: ActorHandle
def __init__(self, execution_vertex_pb):
self.execution_vertex_id = execution_vertex_pb.execution_vertex_id
self.execution_job_vertex_id = execution_vertex_pb \
.execution_job_vertex_id
self.execution_job_vertex_name = execution_vertex_pb \
.execution_job_vertex_name
self.execution_vertex_index = execution_vertex_pb\
.execution_vertex_index
self.parallelism = execution_vertex_pb.parallelism
if execution_vertex_pb\
.language == Language.PYTHON:
# python operator descriptor
operator_bytes = execution_vertex_pb.operator
if execution_vertex_pb.chained:
logger.info("Load chained operator")
self.stream_operator = operator.load_chained_operator(
operator_bytes)
else:
logger.info("Load operator")
self.stream_operator = operator.load_operator(operator_bytes)
self.worker_actor = ray.actor.ActorHandle. \
_deserialization_helper(vertex_pb.worker_actor)
self.container_id = vertex_pb.container_id
self.build_time = vertex_pb.build_time
self.language = vertex_pb.language
self.config = vertex_pb.config
self.resource = vertex_pb.resource
self.worker_actor = None
if execution_vertex_pb.worker_actor:
self.worker_actor = ray.actor.ActorHandle. \
_deserialization_helper(execution_vertex_pb.worker_actor)
self.container_id = execution_vertex_pb.container_id
self.build_time = execution_vertex_pb.build_time
self.language = execution_vertex_pb.language
self.config = execution_vertex_pb.config
self.resource = execution_vertex_pb.resource
@property
def execution_vertex_name(self):
return "{}_{}_{}".format(self.execution_job_vertex_id,
self.execution_job_vertex_name,
self.execution_vertex_id)
class ExecutionVertexContext:
def __init__(self,
vertex_context_pb: remote_call_pb.ExecutionVertexContext):
self.execution_vertex = \
ExecutionVertex(vertex_context_pb.current_execution_vertex)
actor_id: ActorID
execution_vertex: ExecutionVertex
def __init__(
self,
execution_vertex_context_pb: remote_call_pb.ExecutionVertexContext
):
self.execution_vertex = ExecutionVertex(
execution_vertex_context_pb.current_execution_vertex)
self.job_name = self.execution_vertex.config[Config.STREAMING_JOB_NAME]
self.exe_vertex_name = self.execution_vertex.execution_vertex_name
self.actor_id = self.execution_vertex.worker_actor._ray_actor_id
self.upstream_execution_vertices = [
ExecutionVertex(vertex)
for vertex in vertex_context_pb.upstream_execution_vertices
ExecutionVertex(vertex) for vertex in
execution_vertex_context_pb.upstream_execution_vertices
]
self.downstream_execution_vertices = [
ExecutionVertex(vertex)
for vertex in vertex_context_pb.downstream_execution_vertices
ExecutionVertex(vertex) for vertex in
execution_vertex_context_pb.downstream_execution_vertices
]
self.input_execution_edges = [
ExecutionEdge(edge, self.execution_vertex.language)
for edge in vertex_context_pb.input_execution_edges
for edge in execution_vertex_context_pb.input_execution_edges
]
self.output_execution_edges = [
ExecutionEdge(edge, self.execution_vertex.language)
for edge in vertex_context_pb.output_execution_edges
for edge in execution_vertex_context_pb.output_execution_edges
]
def get_parallelism(self):
@@ -112,16 +140,16 @@ class ExecutionVertexContext:
def get_task_id(self):
return self.execution_vertex.execution_vertex_id
def get_source_actor_by_vertex_id(self, execution_vertex_id):
for vertex in self.upstream_execution_vertices:
if vertex.execution_vertex_id == execution_vertex_id:
return vertex.worker_actor
raise Exception("ExecutionVertex %s does not exist!"
.format(execution_vertex_id))
def get_source_actor_by_execution_vertex_id(self, execution_vertex_id):
for execution_vertex in self.upstream_execution_vertices:
if execution_vertex.execution_vertex_id == execution_vertex_id:
return execution_vertex.worker_actor
raise Exception(
"Vertex %s does not exist!".format(execution_vertex_id))
def get_target_actor_by_vertex_id(self, execution_vertex_id):
for vertex in self.downstream_execution_vertices:
if vertex.execution_vertex_id == execution_vertex_id:
return vertex.worker_actor
raise Exception("ExecutionVertex %s does not exist!"
.format(execution_vertex_id))
def get_target_actor_by_execution_vertex_id(self, execution_vertex_id):
for execution_vertex in self.downstream_execution_vertices:
if execution_vertex.execution_vertex_id == execution_vertex_id:
return execution_vertex.worker_actor
raise Exception(
"Vertex %s does not exist!".format(execution_vertex_id))