[ID Refactor] Shorten the length of JobID to 4 bytes (#5110)

* WIP

* Fix

* Add jobid test

* Fix

* Add python part

* Fix

* Fix tes

* Remove TODOs

* Fix C++ tests

* Lint

* Fix

* Fix exporting functions in multiple ray.init

* Fix java test

* Fix lint

* Fix linting

* Address comments.

* FIx

* Address and fix linting

* Refine and fix

* Fix

* address

* Address comments.

* Fix linting

* Fix

* Address

* Address comments.

* Address

* Address

* Fix

* Fix

* Fix

* Fix lint

* Fix

* Fix linting

* Address comments.

* Fix linting

* Address comments.

* Fix linting

* address comments.

* Fix
This commit is contained in:
Qing Wang
2019-07-11 14:25:16 +08:00
committed by GitHub
parent 88365d4112
commit f2293243cc
37 changed files with 385 additions and 132 deletions
+20 -13
View File
@@ -185,11 +185,11 @@ class ActorClass(object):
task.
_resources: The default resources required by the actor creation task.
_actor_method_cpus: The number of CPUs required by actor method tasks.
_last_job_id_exported_for: The ID of the job of the last Ray
session during which this actor class definition was exported. This
is an imperfect mechanism used to determine if we need to export
the remote function again. It is imperfect in the sense that the
actor class definition could be exported multiple times by
_last_export_session_and_job: A pair of the last exported session
and job to help us to know whether this function was exported.
This is an imperfect mechanism used to determine if we need to
export the remote function again. It is imperfect in the sense that
the actor class definition could be exported multiple times by
different workers.
_actor_methods: The actor methods.
_method_decorators: Optional decorators that should be applied to the
@@ -211,7 +211,7 @@ class ActorClass(object):
self._num_cpus = num_cpus
self._num_gpus = num_gpus
self._resources = resources
self._last_job_id_exported_for = None
self._last_export_session_and_job = None
self._actor_methods = inspect.getmembers(
self._modified_class, ray.utils.is_function_or_method)
@@ -344,12 +344,13 @@ class ActorClass(object):
*copy.deepcopy(args), **copy.deepcopy(kwargs))
else:
# Export the actor.
if (self._last_job_id_exported_for is None or
self._last_job_id_exported_for != worker.current_job_id):
# If this actor class was exported in a previous session, we
# need to export this function again, because current GCS
if (self._last_export_session_and_job !=
worker.current_session_and_job):
# If this actor class was not exported in this session and job,
# we need to export this function again, because current GCS
# doesn't have it.
self._last_job_id_exported_for = worker.current_job_id
self._last_export_session_and_job = (
worker.current_session_and_job)
worker.function_actor_manager.export_actor_class(
self._modified_class, self._actor_method_names)
@@ -387,7 +388,8 @@ class ActorClass(object):
actor_id, self._modified_class.__module__, self._class_name,
actor_cursor, self._actor_method_names, self._method_decorators,
self._method_signatures, self._actor_method_num_return_vals,
actor_cursor, actor_method_cpu, worker.current_job_id)
actor_cursor, actor_method_cpu, worker.current_job_id,
worker.current_session_and_job)
# We increment the actor counter by 1 to account for the actor creation
# task.
actor_handle._ray_actor_counter += 1
@@ -465,6 +467,7 @@ class ActorHandle(object):
actor_creation_dummy_object_id,
actor_method_cpus,
actor_job_id,
session_and_job,
actor_handle_id=None):
assert isinstance(actor_id, ActorID)
assert isinstance(actor_job_id, ray.JobID)
@@ -490,6 +493,7 @@ class ActorHandle(object):
actor_creation_dummy_object_id)
self._ray_actor_method_cpus = actor_method_cpus
self._ray_actor_job_id = actor_job_id
self._ray_session_and_job = session_and_job
self._ray_new_actor_handles = []
self._ray_actor_lock = threading.Lock()
@@ -610,8 +614,10 @@ class ActorHandle(object):
# there are ANY handles in scope in the process that created the actor,
# not just the first one.
worker = ray.worker.get_global_worker()
exported_in_current_session_and_job = (
self._ray_session_and_job == worker.current_session_and_job)
if (worker.mode == ray.worker.SCRIPT_MODE
and self._ray_actor_job_id.binary() != worker.worker_id):
and not exported_in_current_session_and_job):
# If the worker is a driver and driver id has changed because
# Ray was shut down re-initialized, the actor is already cleaned up
# and we don't need to send `__ray_terminate__` again.
@@ -729,6 +735,7 @@ class ActorHandle(object):
# This is the ID of the job that owns the actor, not
# necessarily the job that owns this actor handle.
state["actor_job_id"],
worker.current_session_and_job,
actor_handle_id=actor_handle_id)
def __getstate__(self):
+11 -2
View File
@@ -1,6 +1,6 @@
from libcpp cimport bool as c_bool
from libcpp.string cimport string as c_string
from libc.stdint cimport uint8_t, int64_t
from libc.stdint cimport uint8_t, uint32_t, int64_t
cdef extern from "ray/common/id.h" namespace "ray" nogil:
cdef cppclass CBaseID[T]:
@@ -78,11 +78,20 @@ cdef extern from "ray/common/id.h" namespace "ray" nogil:
@staticmethod
CFunctionID FromBinary(const c_string &binary)
cdef cppclass CJobID "ray::JobID"(CUniqueID):
cdef cppclass CJobID "ray::JobID"(CBaseID[CJobID]):
@staticmethod
CJobID FromBinary(const c_string &binary)
@staticmethod
const CJobID Nil()
@staticmethod
size_t Size()
@staticmethod
CJobID FromInt(uint32_t value)
cdef cppclass CTaskID "ray::TaskID"(CBaseID[CTaskID]):
@staticmethod
+31 -4
View File
@@ -109,7 +109,6 @@ cdef class UniqueID(BaseID):
def nil(cls):
return cls(CUniqueID.Nil().Binary())
@classmethod
def from_random(cls):
return cls(os.urandom(CUniqueID.Size()))
@@ -194,7 +193,7 @@ cdef class TaskID(BaseID):
return cls(CTaskID.Nil().Binary())
@classmethod
def size(cla):
def size(cls):
return CTaskID.Size()
@classmethod
@@ -212,15 +211,43 @@ cdef class ClientID(UniqueID):
return <CClientID>self.data
cdef class JobID(UniqueID):
cdef class JobID(BaseID):
cdef CJobID data
def __init__(self, id):
check_id(id)
check_id(id, CJobID.Size())
self.data = CJobID.FromBinary(<c_string>id)
cdef CJobID native(self):
return <CJobID>self.data
@classmethod
def from_int(cls, value):
return cls(CJobID.FromInt(value).Binary())
@classmethod
def nil(cls):
return cls(CJobID.Nil().Binary())
@classmethod
def size(cls):
return CJobID.Size()
def binary(self):
return self.data.Binary()
def hex(self):
return decode(self.data.Hex())
def size(self):
return CJobID.Size()
def is_nil(self):
return self.data.IsNil()
cdef size_t hash(self):
return self.data.Hash()
cdef class WorkerID(UniqueID):
def __init__(self, id):
+8 -10
View File
@@ -43,8 +43,8 @@ class RemoteFunction(object):
return the resulting ObjectIDs. For an example, see
"test_decorated_function" in "python/ray/tests/test_basic.py".
_function_signature: The function signature.
_last_job_id_exported_for: The ID of the job ID of the last Ray
session during which this remote function definition was exported.
_last_export_session_and_job: A pair of the last exported session
and job to help us to know whether this function was exported.
This is an imperfect mechanism used to determine if we need to
export the remote function again. It is imperfect in the sense that
the actor class definition could be exported multiple times by
@@ -71,9 +71,7 @@ class RemoteFunction(object):
ray.signature.check_signature_supported(self._function)
self._function_signature = ray.signature.extract_signature(
self._function)
self._last_job_id_exported_for = None
self._last_export_session_and_job = None
# Override task.remote's signature and docstring
@wraps(function)
def _remote_proxy(*args, **kwargs):
@@ -114,11 +112,11 @@ class RemoteFunction(object):
worker = ray.worker.get_global_worker()
worker.check_connected()
if (self._last_job_id_exported_for is None
or self._last_job_id_exported_for != worker.current_job_id):
# If this function was exported in a previous session, we need to
# export this function again, because current GCS doesn't have it.
self._last_job_id_exported_for = worker.current_job_id
if self._last_export_session_and_job != worker.current_session_and_job:
# If this function was not exported in this session and job,
# we need to export this function again, because current GCS
# doesn't have it.
self._last_export_session_and_job = worker.current_session_and_job
worker.function_actor_manager.export(self)
kwargs = {} if kwargs is None else kwargs
+3
View File
@@ -615,6 +615,9 @@ def start_redis(node_ip_address,
# can access it and know whether or not to enable cross-languages.
primary_redis_client.set("INCLUDE_JAVA", 1 if include_java else 0)
# Init job counter to GCS.
primary_redis_client.set("JobCounter", 0)
# Store version information in the primary Redis shard.
_put_version_info_in_redis(primary_redis_client)
+6 -5
View File
@@ -2510,7 +2510,8 @@ def test_global_state_api(shutdown_only):
assert ray.objects() == {}
job_id = ray.utils.binary_to_hex(ray.worker.global_worker.worker_id)
job_id = ray.utils.compute_job_id_from_driver(
ray.WorkerID(ray.worker.global_worker.worker_id))
driver_task_id = ray.worker.global_worker.current_task_id.hex()
# One task is put in the task table which corresponds to this driver.
@@ -2524,7 +2525,7 @@ def test_global_state_api(shutdown_only):
assert task_spec["TaskID"] == driver_task_id
assert task_spec["ActorID"] == nil_id_hex
assert task_spec["Args"] == []
assert task_spec["JobID"] == job_id
assert task_spec["JobID"] == job_id.hex()
assert task_spec["FunctionID"] == nil_id_hex
assert task_spec["ReturnObjectIDs"] == []
@@ -2552,7 +2553,7 @@ def test_global_state_api(shutdown_only):
task_spec = task_table[task_id]["TaskSpec"]
assert task_spec["ActorID"] == nil_id_hex
assert task_spec["Args"] == [1, "hi", x_id]
assert task_spec["JobID"] == job_id
assert task_spec["JobID"] == job_id.hex()
assert task_spec["ReturnObjectIDs"] == [result_id]
assert task_table[task_id] == ray.tasks(task_id)
@@ -2583,7 +2584,7 @@ def test_global_state_api(shutdown_only):
job_table = ray.jobs()
assert len(job_table) == 1
assert job_table[0]["JobID"] == job_id
assert job_table[0]["JobID"] == job_id.hex()
assert job_table[0]["NodeManagerAddress"] == node_ip_address
@@ -2691,7 +2692,7 @@ def test_workers(shutdown_only):
def test_specific_job_id():
dummy_driver_id = ray.JobID(b"00112233445566778899")
dummy_driver_id = ray.JobID.from_int(1)
ray.init(num_cpus=1, job_id=dummy_driver_id)
# in driver
+14
View File
@@ -232,6 +232,20 @@ def hex_to_binary(hex_identifier):
return binascii.unhexlify(hex_identifier)
# TODO(qwang): Remove these hepler functions
# once we separate `WorkerID` from `UniqueID`.
def compute_job_id_from_driver(driver_id):
assert isinstance(driver_id, ray.WorkerID)
return ray.JobID(driver_id.binary()[0:ray.JobID.size()])
def compute_driver_id_from_job(job_id):
assert isinstance(job_id, ray.JobID)
rest_length = ray_constants.ID_SIZE - job_id.size()
driver_id_str = job_id.binary() + (rest_length * b"\xff")
return ray.WorkerID(driver_id_str)
def get_cuda_visible_devices():
"""Get the device IDs in the CUDA_VISIBLE_DEVICES environment variable.
+39 -21
View File
@@ -18,6 +18,7 @@ import sys
import threading
import time
import traceback
import random
# Ray modules
import pyarrow
@@ -217,6 +218,13 @@ class Worker(object):
def current_task_id(self):
return self.task_context.current_task_id
@property
def current_session_and_job(self):
"""Get the current session index and job id as pair."""
assert isinstance(self._session_index, int)
assert isinstance(self.current_job_id, ray.JobID)
return self._session_index, self.current_job_id
def mark_actor_init_failed(self, error):
"""Called to mark this actor as failed during initialization."""
@@ -1718,27 +1726,44 @@ def connect(node,
worker.profiler = profiling.Profiler(worker, worker.threads_stopped)
if mode is not LOCAL_MODE:
# Create a Redis client to primary.
# The Redis client can safely be shared between threads. However,
# that is not true of Redis pubsub clients. See the documentation at
# https://github.com/andymccurdy/redis-py#thread-safety.
worker.redis_client = node.create_redis_client()
# Initialize some fields.
if mode is WORKER_MODE:
# We should not specify the job_id if it's `WORKER_MODE`.
assert job_id is None
job_id = JobID.nil()
# TODO(qwang): Rename this to `worker_id_str` or type to `WorkerID`
worker.worker_id = _random_string()
if setproctitle:
setproctitle.setproctitle("ray_worker")
elif mode is LOCAL_MODE:
# Code path of local mode
if job_id is None:
job_id = JobID.from_int(random.randint(1, 100000))
worker.worker_id = ray.utils.compute_driver_id_from_job(
job_id).binary()
else:
# This is the code path of driver mode.
if job_id is None:
job_id = JobID.from_random()
# TODO(qwang): use `GcsClient::GenerateJobId()` here.
job_id = JobID.from_int(
int(worker.redis_client.incr("JobCounter")))
# When tasks are executed on remote workers in the context of multiple
# drivers, the current job ID is used to keep track of which job is
# responsible for the task so that error messages will be propagated to
# the correct driver.
worker.worker_id = ray.utils.compute_driver_id_from_job(
job_id).binary()
if not isinstance(job_id, JobID):
raise TypeError("The type of given job id must be JobID.")
worker.worker_id = job_id.binary()
# When tasks are executed on remote workers in the context of multiple
# drivers, the current job ID is used to keep track of which driver is
# responsible for the task so that error messages will be propagated to
# the correct driver.
if mode != WORKER_MODE:
worker.current_job_id = JobID(worker.worker_id)
if not isinstance(job_id, JobID):
raise TypeError("The type of given job id must be JobID.")
worker.current_job_id = job_id
# All workers start out as non-actors. A worker can be turned into an actor
# after it is created.
@@ -1752,12 +1777,6 @@ def connect(node,
worker.local_mode_manager = LocalModeManager()
return
# Create a Redis client.
# The Redis client can safely be shared between threads. However, that is
# not true of Redis pubsub clients. See the documentation at
# https://github.com/andymccurdy/redis-py#thread-safety.
worker.redis_client = node.create_redis_client()
# For driver's check that the version information matches the version
# information that the Ray cluster was started with.
try:
@@ -1836,7 +1855,6 @@ def connect(node,
# Create an object store client.
worker.plasma_client = thread_safe_client(
plasma.connect(node.plasma_store_socket_name, None, 0, 300))
job_id_str = _random_string()
# If this is a driver, set the current task ID, the task driver ID, and set
# the task index to 0.
@@ -1868,7 +1886,7 @@ def connect(node,
function_descriptor.get_function_descriptor_list(),
[], # arguments.
0, # num_returns.
TaskID(job_id_str[:TaskID.size()]), # parent_task_id.
TaskID(worker.worker_id[:TaskID.size()]), # parent_task_id.
0, # parent_counter.
ActorID.nil(), # actor_creation_id.
ObjectID.nil(), # actor_creation_dummy_object_id.
@@ -1901,7 +1919,7 @@ def connect(node,
node.raylet_socket_name,
ClientID(worker.worker_id),
(mode == WORKER_MODE),
JobID(job_id_str),
worker.current_job_id,
)
# Start the import thread