mirror of
https://github.com/wassname/ray.git
synced 2026-07-03 03:10:54 +08:00
[ID Refactor] Shorten the length of JobID to 4 bytes (#5110)
* WIP * Fix * Add jobid test * Fix * Add python part * Fix * Fix tes * Remove TODOs * Fix C++ tests * Lint * Fix * Fix exporting functions in multiple ray.init * Fix java test * Fix lint * Fix linting * Address comments. * FIx * Address and fix linting * Refine and fix * Fix * address * Address comments. * Fix linting * Fix * Address * Address comments. * Address * Address * Fix * Fix * Fix * Fix lint * Fix * Fix linting * Address comments. * Fix linting * Address comments. * Fix linting * address comments. * Fix
This commit is contained in:
+20
-13
@@ -185,11 +185,11 @@ class ActorClass(object):
|
||||
task.
|
||||
_resources: The default resources required by the actor creation task.
|
||||
_actor_method_cpus: The number of CPUs required by actor method tasks.
|
||||
_last_job_id_exported_for: The ID of the job of the last Ray
|
||||
session during which this actor class definition was exported. This
|
||||
is an imperfect mechanism used to determine if we need to export
|
||||
the remote function again. It is imperfect in the sense that the
|
||||
actor class definition could be exported multiple times by
|
||||
_last_export_session_and_job: A pair of the last exported session
|
||||
and job to help us to know whether this function was exported.
|
||||
This is an imperfect mechanism used to determine if we need to
|
||||
export the remote function again. It is imperfect in the sense that
|
||||
the actor class definition could be exported multiple times by
|
||||
different workers.
|
||||
_actor_methods: The actor methods.
|
||||
_method_decorators: Optional decorators that should be applied to the
|
||||
@@ -211,7 +211,7 @@ class ActorClass(object):
|
||||
self._num_cpus = num_cpus
|
||||
self._num_gpus = num_gpus
|
||||
self._resources = resources
|
||||
self._last_job_id_exported_for = None
|
||||
self._last_export_session_and_job = None
|
||||
|
||||
self._actor_methods = inspect.getmembers(
|
||||
self._modified_class, ray.utils.is_function_or_method)
|
||||
@@ -344,12 +344,13 @@ class ActorClass(object):
|
||||
*copy.deepcopy(args), **copy.deepcopy(kwargs))
|
||||
else:
|
||||
# Export the actor.
|
||||
if (self._last_job_id_exported_for is None or
|
||||
self._last_job_id_exported_for != worker.current_job_id):
|
||||
# If this actor class was exported in a previous session, we
|
||||
# need to export this function again, because current GCS
|
||||
if (self._last_export_session_and_job !=
|
||||
worker.current_session_and_job):
|
||||
# If this actor class was not exported in this session and job,
|
||||
# we need to export this function again, because current GCS
|
||||
# doesn't have it.
|
||||
self._last_job_id_exported_for = worker.current_job_id
|
||||
self._last_export_session_and_job = (
|
||||
worker.current_session_and_job)
|
||||
worker.function_actor_manager.export_actor_class(
|
||||
self._modified_class, self._actor_method_names)
|
||||
|
||||
@@ -387,7 +388,8 @@ class ActorClass(object):
|
||||
actor_id, self._modified_class.__module__, self._class_name,
|
||||
actor_cursor, self._actor_method_names, self._method_decorators,
|
||||
self._method_signatures, self._actor_method_num_return_vals,
|
||||
actor_cursor, actor_method_cpu, worker.current_job_id)
|
||||
actor_cursor, actor_method_cpu, worker.current_job_id,
|
||||
worker.current_session_and_job)
|
||||
# We increment the actor counter by 1 to account for the actor creation
|
||||
# task.
|
||||
actor_handle._ray_actor_counter += 1
|
||||
@@ -465,6 +467,7 @@ class ActorHandle(object):
|
||||
actor_creation_dummy_object_id,
|
||||
actor_method_cpus,
|
||||
actor_job_id,
|
||||
session_and_job,
|
||||
actor_handle_id=None):
|
||||
assert isinstance(actor_id, ActorID)
|
||||
assert isinstance(actor_job_id, ray.JobID)
|
||||
@@ -490,6 +493,7 @@ class ActorHandle(object):
|
||||
actor_creation_dummy_object_id)
|
||||
self._ray_actor_method_cpus = actor_method_cpus
|
||||
self._ray_actor_job_id = actor_job_id
|
||||
self._ray_session_and_job = session_and_job
|
||||
self._ray_new_actor_handles = []
|
||||
self._ray_actor_lock = threading.Lock()
|
||||
|
||||
@@ -610,8 +614,10 @@ class ActorHandle(object):
|
||||
# there are ANY handles in scope in the process that created the actor,
|
||||
# not just the first one.
|
||||
worker = ray.worker.get_global_worker()
|
||||
exported_in_current_session_and_job = (
|
||||
self._ray_session_and_job == worker.current_session_and_job)
|
||||
if (worker.mode == ray.worker.SCRIPT_MODE
|
||||
and self._ray_actor_job_id.binary() != worker.worker_id):
|
||||
and not exported_in_current_session_and_job):
|
||||
# If the worker is a driver and driver id has changed because
|
||||
# Ray was shut down re-initialized, the actor is already cleaned up
|
||||
# and we don't need to send `__ray_terminate__` again.
|
||||
@@ -729,6 +735,7 @@ class ActorHandle(object):
|
||||
# This is the ID of the job that owns the actor, not
|
||||
# necessarily the job that owns this actor handle.
|
||||
state["actor_job_id"],
|
||||
worker.current_session_and_job,
|
||||
actor_handle_id=actor_handle_id)
|
||||
|
||||
def __getstate__(self):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from libcpp cimport bool as c_bool
|
||||
from libcpp.string cimport string as c_string
|
||||
from libc.stdint cimport uint8_t, int64_t
|
||||
from libc.stdint cimport uint8_t, uint32_t, int64_t
|
||||
|
||||
cdef extern from "ray/common/id.h" namespace "ray" nogil:
|
||||
cdef cppclass CBaseID[T]:
|
||||
@@ -78,11 +78,20 @@ cdef extern from "ray/common/id.h" namespace "ray" nogil:
|
||||
@staticmethod
|
||||
CFunctionID FromBinary(const c_string &binary)
|
||||
|
||||
cdef cppclass CJobID "ray::JobID"(CUniqueID):
|
||||
cdef cppclass CJobID "ray::JobID"(CBaseID[CJobID]):
|
||||
|
||||
@staticmethod
|
||||
CJobID FromBinary(const c_string &binary)
|
||||
|
||||
@staticmethod
|
||||
const CJobID Nil()
|
||||
|
||||
@staticmethod
|
||||
size_t Size()
|
||||
|
||||
@staticmethod
|
||||
CJobID FromInt(uint32_t value)
|
||||
|
||||
cdef cppclass CTaskID "ray::TaskID"(CBaseID[CTaskID]):
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -109,7 +109,6 @@ cdef class UniqueID(BaseID):
|
||||
def nil(cls):
|
||||
return cls(CUniqueID.Nil().Binary())
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_random(cls):
|
||||
return cls(os.urandom(CUniqueID.Size()))
|
||||
@@ -194,7 +193,7 @@ cdef class TaskID(BaseID):
|
||||
return cls(CTaskID.Nil().Binary())
|
||||
|
||||
@classmethod
|
||||
def size(cla):
|
||||
def size(cls):
|
||||
return CTaskID.Size()
|
||||
|
||||
@classmethod
|
||||
@@ -212,15 +211,43 @@ cdef class ClientID(UniqueID):
|
||||
return <CClientID>self.data
|
||||
|
||||
|
||||
cdef class JobID(UniqueID):
|
||||
cdef class JobID(BaseID):
|
||||
cdef CJobID data
|
||||
|
||||
def __init__(self, id):
|
||||
check_id(id)
|
||||
check_id(id, CJobID.Size())
|
||||
self.data = CJobID.FromBinary(<c_string>id)
|
||||
|
||||
cdef CJobID native(self):
|
||||
return <CJobID>self.data
|
||||
|
||||
@classmethod
|
||||
def from_int(cls, value):
|
||||
return cls(CJobID.FromInt(value).Binary())
|
||||
|
||||
@classmethod
|
||||
def nil(cls):
|
||||
return cls(CJobID.Nil().Binary())
|
||||
|
||||
@classmethod
|
||||
def size(cls):
|
||||
return CJobID.Size()
|
||||
|
||||
def binary(self):
|
||||
return self.data.Binary()
|
||||
|
||||
def hex(self):
|
||||
return decode(self.data.Hex())
|
||||
|
||||
def size(self):
|
||||
return CJobID.Size()
|
||||
|
||||
def is_nil(self):
|
||||
return self.data.IsNil()
|
||||
|
||||
cdef size_t hash(self):
|
||||
return self.data.Hash()
|
||||
|
||||
cdef class WorkerID(UniqueID):
|
||||
|
||||
def __init__(self, id):
|
||||
|
||||
@@ -43,8 +43,8 @@ class RemoteFunction(object):
|
||||
return the resulting ObjectIDs. For an example, see
|
||||
"test_decorated_function" in "python/ray/tests/test_basic.py".
|
||||
_function_signature: The function signature.
|
||||
_last_job_id_exported_for: The ID of the job ID of the last Ray
|
||||
session during which this remote function definition was exported.
|
||||
_last_export_session_and_job: A pair of the last exported session
|
||||
and job to help us to know whether this function was exported.
|
||||
This is an imperfect mechanism used to determine if we need to
|
||||
export the remote function again. It is imperfect in the sense that
|
||||
the actor class definition could be exported multiple times by
|
||||
@@ -71,9 +71,7 @@ class RemoteFunction(object):
|
||||
ray.signature.check_signature_supported(self._function)
|
||||
self._function_signature = ray.signature.extract_signature(
|
||||
self._function)
|
||||
|
||||
self._last_job_id_exported_for = None
|
||||
|
||||
self._last_export_session_and_job = None
|
||||
# Override task.remote's signature and docstring
|
||||
@wraps(function)
|
||||
def _remote_proxy(*args, **kwargs):
|
||||
@@ -114,11 +112,11 @@ class RemoteFunction(object):
|
||||
worker = ray.worker.get_global_worker()
|
||||
worker.check_connected()
|
||||
|
||||
if (self._last_job_id_exported_for is None
|
||||
or self._last_job_id_exported_for != worker.current_job_id):
|
||||
# If this function was exported in a previous session, we need to
|
||||
# export this function again, because current GCS doesn't have it.
|
||||
self._last_job_id_exported_for = worker.current_job_id
|
||||
if self._last_export_session_and_job != worker.current_session_and_job:
|
||||
# If this function was not exported in this session and job,
|
||||
# we need to export this function again, because current GCS
|
||||
# doesn't have it.
|
||||
self._last_export_session_and_job = worker.current_session_and_job
|
||||
worker.function_actor_manager.export(self)
|
||||
|
||||
kwargs = {} if kwargs is None else kwargs
|
||||
|
||||
@@ -615,6 +615,9 @@ def start_redis(node_ip_address,
|
||||
# can access it and know whether or not to enable cross-languages.
|
||||
primary_redis_client.set("INCLUDE_JAVA", 1 if include_java else 0)
|
||||
|
||||
# Init job counter to GCS.
|
||||
primary_redis_client.set("JobCounter", 0)
|
||||
|
||||
# Store version information in the primary Redis shard.
|
||||
_put_version_info_in_redis(primary_redis_client)
|
||||
|
||||
|
||||
@@ -2510,7 +2510,8 @@ def test_global_state_api(shutdown_only):
|
||||
|
||||
assert ray.objects() == {}
|
||||
|
||||
job_id = ray.utils.binary_to_hex(ray.worker.global_worker.worker_id)
|
||||
job_id = ray.utils.compute_job_id_from_driver(
|
||||
ray.WorkerID(ray.worker.global_worker.worker_id))
|
||||
driver_task_id = ray.worker.global_worker.current_task_id.hex()
|
||||
|
||||
# One task is put in the task table which corresponds to this driver.
|
||||
@@ -2524,7 +2525,7 @@ def test_global_state_api(shutdown_only):
|
||||
assert task_spec["TaskID"] == driver_task_id
|
||||
assert task_spec["ActorID"] == nil_id_hex
|
||||
assert task_spec["Args"] == []
|
||||
assert task_spec["JobID"] == job_id
|
||||
assert task_spec["JobID"] == job_id.hex()
|
||||
assert task_spec["FunctionID"] == nil_id_hex
|
||||
assert task_spec["ReturnObjectIDs"] == []
|
||||
|
||||
@@ -2552,7 +2553,7 @@ def test_global_state_api(shutdown_only):
|
||||
task_spec = task_table[task_id]["TaskSpec"]
|
||||
assert task_spec["ActorID"] == nil_id_hex
|
||||
assert task_spec["Args"] == [1, "hi", x_id]
|
||||
assert task_spec["JobID"] == job_id
|
||||
assert task_spec["JobID"] == job_id.hex()
|
||||
assert task_spec["ReturnObjectIDs"] == [result_id]
|
||||
|
||||
assert task_table[task_id] == ray.tasks(task_id)
|
||||
@@ -2583,7 +2584,7 @@ def test_global_state_api(shutdown_only):
|
||||
job_table = ray.jobs()
|
||||
|
||||
assert len(job_table) == 1
|
||||
assert job_table[0]["JobID"] == job_id
|
||||
assert job_table[0]["JobID"] == job_id.hex()
|
||||
assert job_table[0]["NodeManagerAddress"] == node_ip_address
|
||||
|
||||
|
||||
@@ -2691,7 +2692,7 @@ def test_workers(shutdown_only):
|
||||
|
||||
|
||||
def test_specific_job_id():
|
||||
dummy_driver_id = ray.JobID(b"00112233445566778899")
|
||||
dummy_driver_id = ray.JobID.from_int(1)
|
||||
ray.init(num_cpus=1, job_id=dummy_driver_id)
|
||||
|
||||
# in driver
|
||||
|
||||
@@ -232,6 +232,20 @@ def hex_to_binary(hex_identifier):
|
||||
return binascii.unhexlify(hex_identifier)
|
||||
|
||||
|
||||
# TODO(qwang): Remove these hepler functions
|
||||
# once we separate `WorkerID` from `UniqueID`.
|
||||
def compute_job_id_from_driver(driver_id):
|
||||
assert isinstance(driver_id, ray.WorkerID)
|
||||
return ray.JobID(driver_id.binary()[0:ray.JobID.size()])
|
||||
|
||||
|
||||
def compute_driver_id_from_job(job_id):
|
||||
assert isinstance(job_id, ray.JobID)
|
||||
rest_length = ray_constants.ID_SIZE - job_id.size()
|
||||
driver_id_str = job_id.binary() + (rest_length * b"\xff")
|
||||
return ray.WorkerID(driver_id_str)
|
||||
|
||||
|
||||
def get_cuda_visible_devices():
|
||||
"""Get the device IDs in the CUDA_VISIBLE_DEVICES environment variable.
|
||||
|
||||
|
||||
+39
-21
@@ -18,6 +18,7 @@ import sys
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
import random
|
||||
|
||||
# Ray modules
|
||||
import pyarrow
|
||||
@@ -217,6 +218,13 @@ class Worker(object):
|
||||
def current_task_id(self):
|
||||
return self.task_context.current_task_id
|
||||
|
||||
@property
|
||||
def current_session_and_job(self):
|
||||
"""Get the current session index and job id as pair."""
|
||||
assert isinstance(self._session_index, int)
|
||||
assert isinstance(self.current_job_id, ray.JobID)
|
||||
return self._session_index, self.current_job_id
|
||||
|
||||
def mark_actor_init_failed(self, error):
|
||||
"""Called to mark this actor as failed during initialization."""
|
||||
|
||||
@@ -1718,27 +1726,44 @@ def connect(node,
|
||||
|
||||
worker.profiler = profiling.Profiler(worker, worker.threads_stopped)
|
||||
|
||||
if mode is not LOCAL_MODE:
|
||||
# Create a Redis client to primary.
|
||||
# The Redis client can safely be shared between threads. However,
|
||||
# that is not true of Redis pubsub clients. See the documentation at
|
||||
# https://github.com/andymccurdy/redis-py#thread-safety.
|
||||
worker.redis_client = node.create_redis_client()
|
||||
|
||||
# Initialize some fields.
|
||||
if mode is WORKER_MODE:
|
||||
# We should not specify the job_id if it's `WORKER_MODE`.
|
||||
assert job_id is None
|
||||
job_id = JobID.nil()
|
||||
# TODO(qwang): Rename this to `worker_id_str` or type to `WorkerID`
|
||||
worker.worker_id = _random_string()
|
||||
if setproctitle:
|
||||
setproctitle.setproctitle("ray_worker")
|
||||
elif mode is LOCAL_MODE:
|
||||
# Code path of local mode
|
||||
if job_id is None:
|
||||
job_id = JobID.from_int(random.randint(1, 100000))
|
||||
worker.worker_id = ray.utils.compute_driver_id_from_job(
|
||||
job_id).binary()
|
||||
else:
|
||||
# This is the code path of driver mode.
|
||||
if job_id is None:
|
||||
job_id = JobID.from_random()
|
||||
# TODO(qwang): use `GcsClient::GenerateJobId()` here.
|
||||
job_id = JobID.from_int(
|
||||
int(worker.redis_client.incr("JobCounter")))
|
||||
# When tasks are executed on remote workers in the context of multiple
|
||||
# drivers, the current job ID is used to keep track of which job is
|
||||
# responsible for the task so that error messages will be propagated to
|
||||
# the correct driver.
|
||||
worker.worker_id = ray.utils.compute_driver_id_from_job(
|
||||
job_id).binary()
|
||||
|
||||
if not isinstance(job_id, JobID):
|
||||
raise TypeError("The type of given job id must be JobID.")
|
||||
|
||||
worker.worker_id = job_id.binary()
|
||||
|
||||
# When tasks are executed on remote workers in the context of multiple
|
||||
# drivers, the current job ID is used to keep track of which driver is
|
||||
# responsible for the task so that error messages will be propagated to
|
||||
# the correct driver.
|
||||
if mode != WORKER_MODE:
|
||||
worker.current_job_id = JobID(worker.worker_id)
|
||||
if not isinstance(job_id, JobID):
|
||||
raise TypeError("The type of given job id must be JobID.")
|
||||
worker.current_job_id = job_id
|
||||
|
||||
# All workers start out as non-actors. A worker can be turned into an actor
|
||||
# after it is created.
|
||||
@@ -1752,12 +1777,6 @@ def connect(node,
|
||||
worker.local_mode_manager = LocalModeManager()
|
||||
return
|
||||
|
||||
# Create a Redis client.
|
||||
# The Redis client can safely be shared between threads. However, that is
|
||||
# not true of Redis pubsub clients. See the documentation at
|
||||
# https://github.com/andymccurdy/redis-py#thread-safety.
|
||||
worker.redis_client = node.create_redis_client()
|
||||
|
||||
# For driver's check that the version information matches the version
|
||||
# information that the Ray cluster was started with.
|
||||
try:
|
||||
@@ -1836,7 +1855,6 @@ def connect(node,
|
||||
# Create an object store client.
|
||||
worker.plasma_client = thread_safe_client(
|
||||
plasma.connect(node.plasma_store_socket_name, None, 0, 300))
|
||||
job_id_str = _random_string()
|
||||
|
||||
# If this is a driver, set the current task ID, the task driver ID, and set
|
||||
# the task index to 0.
|
||||
@@ -1868,7 +1886,7 @@ def connect(node,
|
||||
function_descriptor.get_function_descriptor_list(),
|
||||
[], # arguments.
|
||||
0, # num_returns.
|
||||
TaskID(job_id_str[:TaskID.size()]), # parent_task_id.
|
||||
TaskID(worker.worker_id[:TaskID.size()]), # parent_task_id.
|
||||
0, # parent_counter.
|
||||
ActorID.nil(), # actor_creation_id.
|
||||
ObjectID.nil(), # actor_creation_dummy_object_id.
|
||||
@@ -1901,7 +1919,7 @@ def connect(node,
|
||||
node.raylet_socket_name,
|
||||
ClientID(worker.worker_id),
|
||||
(mode == WORKER_MODE),
|
||||
JobID(job_id_str),
|
||||
worker.current_job_id,
|
||||
)
|
||||
|
||||
# Start the import thread
|
||||
|
||||
Reference in New Issue
Block a user