[Core]Add runtime context for python worker (#10309)

* add runtime context for python

* fixed

* code fixed

* test added

* lint

* lint
This commit is contained in:
Lixin Wei
2020-08-27 11:11:42 +08:00
committed by GitHub
parent 2526c06b5e
commit fe6daef85e
4 changed files with 136 additions and 1 deletions
+2 -1
View File
@@ -91,6 +91,7 @@ import ray.projects # noqa: E402
import ray.actor # noqa: F401
from ray.actor import method # noqa: E402
from ray.cross_language import java_function, java_actor_class # noqa: E402
from ray.runtime_context import get_runtime_context # noqa: E402
from ray import util # noqa: E402
# Replaced with the current commit when building the wheels.
@@ -100,7 +101,7 @@ __version__ = "0.9.0.dev0"
__all__ = [
"__version__",
"_config",
"_get_runtime_context",
"get_runtime_context",
"actor",
"actors",
"available_resources",
+57
View File
@@ -0,0 +1,57 @@
import ray.worker
import logging
logger = logging.getLogger(__name__)
class RuntimeContext(object):
"""A class used for getting runtime context."""
def __init__(self, worker):
assert worker is not None
self.worker = worker
@property
def current_job_id(self):
"""Get current job ID for this worker or driver.
Returns:
If called by a driver, this returns the job ID. If called in
a task, return the job ID of the associated driver.
"""
return self.worker.current_job_id
@property
def current_actor_id(self):
"""Get the current actor ID in this worker.
Returns:
The current driver id in this worker.
"""
# only worker mode has actor_id
assert self.worker.mode == ray.worker.WORKER_MODE, (
f"This method is only available when the process is a\
worker. Current mode: {self.worker.mode}")
return self.worker.actor_id
@property
def was_current_actor_reconstructed(self):
"""Check whether this actor has been restarted
Returns:
Whether this actor has been ever restarted.
"""
# TODO: this method should not be called in a normal task.
actor_info = ray.state.actors(self.current_actor_id.hex())
return actor_info and actor_info["NumRestarts"] != 0
_runtime_context = None
def get_runtime_context():
global _runtime_context
if _runtime_context is None:
_runtime_context = RuntimeContext(ray.worker.global_worker)
return _runtime_context
+1
View File
@@ -251,6 +251,7 @@ class GlobalState:
actor_table_data.owner_address.raylet_id),
},
"State": actor_table_data.state,
"NumRestarts": actor_table_data.num_restarts,
"Timestamp": actor_table_data.timestamp,
}
return actor_info
+76
View File
@@ -0,0 +1,76 @@
import ray
import os
import signal
import time
import sys
def test_was_current_actor_reconstructed():
ray.init()
@ray.remote(max_restarts=10)
class A(object):
def __init__(self):
self._was_reconstructed = ray.get_runtime_context(
).was_current_actor_reconstructed
def get_was_reconstructed(self):
return self._was_reconstructed
def update_was_reconstructed(self):
return ray.get_runtime_context().was_current_actor_reconstructed
def get_pid(self):
return os.getpid()
# The following methods is to apply the checkpointable interface.
def should_checkpoint(self, checkpoint_context):
return False
def save_checkpoint(self, actor_id, checkpoint_id):
pass
def load_checkpoint(self, actor_id, available_checkpoints):
pass
def checkpoint_expired(self, actor_id, checkpoint_id):
pass
a = A.remote()
# `was_reconstructed` should be False when it's called in actor.
assert ray.get(a.get_was_reconstructed.remote()) is False
# `was_reconstructed` should be False when it's called in a remote method
# and the actor never fails.
assert ray.get(a.update_was_reconstructed.remote()) is False
pid = ray.get(a.get_pid.remote())
os.kill(pid, signal.SIGKILL)
time.sleep(2)
# These 2 methods should be return True because
# this actor failed and restored.
assert ray.get(a.get_was_reconstructed.remote()) is True
assert ray.get(a.update_was_reconstructed.remote()) is True
ray.shutdown()
def test_runtime_context_interface():
ray.init()
@ray.remote(max_restarts=10)
class A(object):
def current_job_id(self):
return ray.get_runtime_context().current_job_id
def current_actor_id(self):
return ray.get_runtime_context().current_actor_id
a = A.remote()
assert ray.get(a.current_job_id.remote()) is not None
assert ray.get(a.current_actor_id.remote()) is not None
ray.shutdown()
if __name__ == "__main__":
import pytest
sys.exit(pytest.main(["-v", __file__]))