From 3900643948d8bfec95a6dca63c29b016923460de Mon Sep 17 00:00:00 2001 From: fangfengbin <869218239a@zju.edu.cn> Date: Fri, 31 Jul 2020 15:35:25 +0800 Subject: [PATCH] Add actor states definitions & transition diagram doc (#9754) --- src/ray/design_docs/actor_states.rst | 60 ++++++++++++++++++++++ src/ray/gcs/gcs_server/gcs_actor_manager.h | 42 +++++++++++++++ src/ray/protobuf/gcs.proto | 2 +- 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 src/ray/design_docs/actor_states.rst diff --git a/src/ray/design_docs/actor_states.rst b/src/ray/design_docs/actor_states.rst new file mode 100644 index 000000000..456c0d03c --- /dev/null +++ b/src/ray/design_docs/actor_states.rst @@ -0,0 +1,60 @@ +Actor State: Definitions & Transition Diagram +============================================ + +An actor can be in one of the following states: + +- **DEPENDENCIES_UNREADY**: The actor info is registered in GCS. But its dependencies are not ready. + +- **PENDING_CREATION**: The actor local dependencies are ready. This actor is being created. + +- **ALIVE**: The actor is created successfully. + +- **RESTARTING**: The actor is dead, now being restarted. After reconstruction finishes, + the state will become alive again. + +- **DEAD**: The actor is already dead and won't be restarted. + +:: + + 3 + 0 1 2 -------------> + ---->DEPENDENCIES_UNREADY-------->PENDING_CREATION-------->ALIVE RESTARTING + | | | <------------- | + | | | 4 | + | | | | + 8 | 7 | 6 | | 5 + | | | | + | | | | + | | | | + | v | | + -------------------------->DEAD<--------------------------------------- + +- **0**: When GCS receives a `RegisterActor` request from core worker, GCS will persist the actor in database with state `DEPENDENCIES_UNREADY`. + +- **1**: When core worker has finished resolving the actor dependencies, it will send a `CreateActor` request to GCS and GCS will update actor state to `PENDING_CREATION` in memory. 
+ +- **2**: When core worker has finished the actor creation task, it will send a `PushTask` reply to GCS and GCS will update actor state to `ALIVE` in the database. + +- **3**: When GCS detects that the worker/node of an actor is dead and the actor's remaining restarts number is greater than 0, it will update actor state to `RESTARTING` in the database. + +- **4**: When the actor is successfully reconstructed, GCS will update its state to `ALIVE` in the database. + +- **5**: + + 1) If the actor is restarting and GCS detects that its worker or node is dead and its remaining restarts number is 0, it will update its state to `DEAD` in the database. + + 2) If an actor is non-detached, when GCS detects that its owner is dead, it will update its state to `DEAD` in the database. + +- **6**: + + 1) When GCS detects that an actor is dead and its remaining restarts number is 0, it will update its state to `DEAD` in the database. + + 2) If the actor is non-detached, when GCS detects that its owner is dead, it will update its state to `DEAD` in the database. + +- **7**: If the actor is non-detached, when GCS detects that its owner is dead, it will update its state to `DEAD` in the database. + +- **8**: + + 1) For both detached and non-detached actors, when GCS detects that an actor's creator is dead, it will update its state to `DEAD`, because in this case the actor can never be created. + + 2) If the actor is non-detached, when GCS detects that its owner is dead, it will update its state to `DEAD` in the database. diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.h b/src/ray/gcs/gcs_server/gcs_actor_manager.h index b00332190..4b9d21825 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.h @@ -110,8 +110,50 @@ class GcsActor { using RegisterActorCallback = std::function<void(std::shared_ptr<GcsActor>)>; using CreateActorCallback = std::function<void(std::shared_ptr<GcsActor>)>; + /// GcsActorManager is responsible for managing the lifecycle of all actors. /// This class is not thread-safe. 
+/// Actor State Transition Diagram: +/// 3 +/// 0 1 2 ---> +/// --->DEPENDENCIES_UNREADY--->PENDING_CREATION--->ALIVE RESTARTING +/// | | | <--- | +/// 8 | 7 | 6 | 4 | 5 +/// | v | | +/// ------------------> DEAD <------------------------- +/// +/// 0: When GCS receives a `RegisterActor` request from core worker, it will add an actor +/// to `registered_actors_` and `unresolved_actors_`. +/// 1: When GCS receives a `CreateActor` request from core worker, it will remove the +/// actor from `unresolved_actors_` and schedule the actor. +/// 2: GCS selects a node from which to lease a worker. If the worker is successfully leased, +/// GCS will push the actor creation task to the core worker, otherwise GCS will select another +/// node from which to lease a worker. If the actor is created successfully, GCS will add the actor to +/// `created_actors_`. +/// 3: When GCS detects that the worker/node of an actor is dead, it +/// will get the actor from `registered_actors_` by actor id. If the actor's remaining +/// restarts number is greater than 0, it will reconstruct the actor. +/// 4: When the actor is successfully reconstructed, GCS will update its state to `ALIVE`. +/// 5: If the actor is restarting and GCS detects that its worker or node is dead and its +/// remaining restarts number is 0, it will update its state to `DEAD`. If the actor is +/// detached, GCS will remove it from `registered_actors_` and `created_actors_`. If the +/// actor is non-detached, when GCS detects that its owner is dead, GCS will remove it +/// from `registered_actors_`. +/// 6: When GCS detects that an actor is dead, GCS will +/// reconstruct it. If its remaining restarts number is 0, it will update its state to +/// `DEAD`. If the actor is detached, GCS will remove it from `registered_actors_` and +/// `created_actors_`. If the actor is non-detached, when GCS detects that its owner is +/// dead, it will destroy the actor and remove it from `registered_actors_` and +/// `created_actors_`. 
+/// 7: If the actor is non-detached, when GCS detects that its owner is +/// dead, it will destroy the actor and remove it from `registered_actors_` and +/// `created_actors_`. +/// 8: For both detached and non-detached actors, when GCS detects that +/// an actor's creator is dead, it will update its state to `DEAD` and remove it from +/// `registered_actors_` and `created_actors_`, because in this case the actor can never +/// be created. If the actor is non-detached, when GCS detects that its owner is dead, it +/// will update its state to `DEAD` and remove it from `registered_actors_` and +/// `created_actors_`. class GcsActorManager : public rpc::ActorInfoHandler { public: /// Create a GcsActorManager diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index a7c691828..d96cd3dc0 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -103,7 +103,7 @@ message TaskTableData { message ActorTableData { // State of an actor. enum ActorState { - // Actor info ins registered in GCS. But its dependencies are not ready. + // Actor info is registered in GCS. But its dependencies are not ready. DEPENDENCIES_UNREADY = 0; // Actor local dependencies are ready. This actor is being created. PENDING_CREATION = 1;