mirror of
https://github.com/wassname/ray.git
synced 2026-07-03 10:19:18 +08:00
Convert the raylet client (the code in local_scheduler_client.cc) to proper C++. (#3511)
* refactoring
* fix bugs
* create client class
* create client class for java; bug fix
* remove legacy code
* improve code by using std::string, std::unique_ptr rename private fields and removing legacy code
* rename class
* improve naming
* fix
* rename files
* fix names
* change name
* change return types
* make a mutex private field
* fix comments
* fix bugs
* lint
* bug fix
* bug fix
* move too short functions into the header file
* Loosen crash conditions for some APIs.
* Apply suggestions from code review
  Co-Authored-By: suquark <suquark@gmail.com>
* format
* update
* rename python APIs
* fix java
* more fixes
* change types of cpython interface
* more fixes
* improve error processing
* improve error processing for java wrapper
* lint
* fix java
* make fields const
* use pointers for [out] parameters
* fix java & error msg
* fix resource leak, etc.
This commit is contained in:
+3
-4
@@ -791,7 +791,7 @@ def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
|
||||
# Disconnect the worker from the local scheduler. The point of
|
||||
# this is so that when the worker kills itself below, the local
|
||||
# scheduler won't push an error message to the driver.
|
||||
worker.local_scheduler_client.disconnect()
|
||||
worker.raylet_client.disconnect()
|
||||
sys.exit(0)
|
||||
assert False, "This process should have terminated."
|
||||
|
||||
@@ -832,8 +832,7 @@ def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
|
||||
# the local scheduler will not be included, and may not be runnable
|
||||
# on checkpoint resumption.
|
||||
actor_id = ray.ObjectID(worker.actor_id)
|
||||
frontier = worker.local_scheduler_client.get_actor_frontier(
|
||||
actor_id)
|
||||
frontier = worker.raylet_client.get_actor_frontier(actor_id)
|
||||
# Save the checkpoint in Redis. TODO(rkn): Checkpoints
|
||||
# should not be stored in Redis. Fix this.
|
||||
set_actor_checkpoint(worker, worker.actor_id, checkpoint_index,
|
||||
@@ -863,7 +862,7 @@ def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
|
||||
# Set the number of tasks executed so far.
|
||||
worker.actor_task_counter = checkpoint_index
|
||||
# Set the actor frontier in the local scheduler.
|
||||
worker.local_scheduler_client.set_actor_frontier(frontier)
|
||||
worker.raylet_client.set_actor_frontier(frontier)
|
||||
checkpoint_resumed = True
|
||||
|
||||
return checkpoint_resumed
|
||||
|
||||
@@ -36,10 +36,10 @@ def warmup():
|
||||
|
||||
|
||||
def fetch(oids):
|
||||
local_sched_client = ray.worker.global_worker.local_scheduler_client
|
||||
raylet_client = ray.worker.global_worker.raylet_client
|
||||
for o in oids:
|
||||
ray_obj_id = ray.ObjectID(o)
|
||||
local_sched_client.fetch_or_reconstruct([ray_obj_id], True)
|
||||
raylet_client.fetch_or_reconstruct([ray_obj_id], True)
|
||||
|
||||
|
||||
def run_timeline(sess, ops, feed_dict=None, write_timeline=False, name=""):
|
||||
|
||||
@@ -42,4 +42,4 @@ def free(object_ids, local_only=False, worker=None):
|
||||
if len(object_ids) == 0:
|
||||
return
|
||||
|
||||
worker.local_scheduler_client.free(object_ids, local_only)
|
||||
worker.raylet_client.free_objects(object_ids, local_only)
|
||||
|
||||
@@ -119,7 +119,7 @@ class Profiler(object):
|
||||
else:
|
||||
component_type = "driver"
|
||||
|
||||
self.worker.local_scheduler_client.push_profile_events(
|
||||
self.worker.raylet_client.push_profile_events(
|
||||
component_type, ray.ObjectID(self.worker.worker_id),
|
||||
self.worker.node_ip_address, events)
|
||||
|
||||
|
||||
@@ -2,12 +2,12 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.core.src.ray.raylet.liblocal_scheduler_library_python import (
|
||||
Task, LocalSchedulerClient, ObjectID, check_simple_value, compute_task_id,
|
||||
from ray.core.src.ray.raylet.libraylet_library_python import (
|
||||
Task, RayletClient, ObjectID, check_simple_value, compute_task_id,
|
||||
task_from_string, task_to_string, _config, common_error)
|
||||
|
||||
__all__ = [
|
||||
"Task", "LocalSchedulerClient", "ObjectID", "check_simple_value",
|
||||
"Task", "RayletClient", "ObjectID", "check_simple_value",
|
||||
"compute_task_id", "task_from_string", "task_to_string",
|
||||
"start_local_scheduler", "_config", "common_error"
|
||||
]
|
||||
|
||||
@@ -39,8 +39,8 @@ class TaskPool(object):
|
||||
|
||||
for worker, obj_id in self.completed():
|
||||
plasma_id = ray.pyarrow.plasma.ObjectID(obj_id.id())
|
||||
(ray.worker.global_worker.local_scheduler_client.
|
||||
fetch_or_reconstruct([obj_id], True))
|
||||
(ray.worker.global_worker.raylet_client.fetch_or_reconstruct(
|
||||
[obj_id], True))
|
||||
self._fetching.append((worker, obj_id))
|
||||
|
||||
remaining = []
|
||||
|
||||
+1
-1
@@ -69,7 +69,7 @@ def push_error_to_driver(worker,
|
||||
if driver_id is None:
|
||||
driver_id = ray_constants.NIL_JOB_ID.id()
|
||||
data = {} if data is None else data
|
||||
worker.local_scheduler_client.push_error(
|
||||
worker.raylet_client.push_error(
|
||||
ray.ObjectID(driver_id), error_type, message, time.time())
|
||||
|
||||
|
||||
|
||||
+13
-13
@@ -455,7 +455,7 @@ class Worker(object):
|
||||
]
|
||||
for i in range(0, len(object_ids),
|
||||
ray._config.worker_fetch_request_size()):
|
||||
self.local_scheduler_client.fetch_or_reconstruct(
|
||||
self.raylet_client.fetch_or_reconstruct(
|
||||
object_ids[i:(i + ray._config.worker_fetch_request_size())],
|
||||
True)
|
||||
|
||||
@@ -490,7 +490,7 @@ class Worker(object):
|
||||
ray._config.worker_fetch_request_size())
|
||||
for i in range(0, len(object_ids_to_fetch),
|
||||
fetch_request_size):
|
||||
self.local_scheduler_client.fetch_or_reconstruct(
|
||||
self.raylet_client.fetch_or_reconstruct(
|
||||
ray_object_ids_to_fetch[i:(
|
||||
i + fetch_request_size)], False,
|
||||
current_task_id)
|
||||
@@ -511,7 +511,7 @@ class Worker(object):
|
||||
|
||||
# If there were objects that we weren't able to get locally,
|
||||
# let the local scheduler know that we're now unblocked.
|
||||
self.local_scheduler_client.notify_unblocked(current_task_id)
|
||||
self.raylet_client.notify_unblocked(current_task_id)
|
||||
|
||||
assert len(final_results) == len(object_ids)
|
||||
return final_results
|
||||
@@ -628,7 +628,7 @@ class Worker(object):
|
||||
actor_creation_id, actor_creation_dummy_object_id, actor_id,
|
||||
actor_handle_id, actor_counter, execution_dependencies,
|
||||
resources, placement_resources)
|
||||
self.local_scheduler_client.submit(task)
|
||||
self.raylet_client.submit_task(task)
|
||||
|
||||
return task.returns()
|
||||
|
||||
@@ -936,7 +936,7 @@ class Worker(object):
|
||||
reached_max_executions = (self.function_actor_manager.get_task_counter(
|
||||
driver_id, function_id.id()) == execution_info.max_calls)
|
||||
if reached_max_executions:
|
||||
self.local_scheduler_client.disconnect()
|
||||
self.raylet_client.disconnect()
|
||||
sys.exit(0)
|
||||
|
||||
def _get_next_task_from_local_scheduler(self):
|
||||
@@ -946,7 +946,7 @@ class Worker(object):
|
||||
A task from the local scheduler.
|
||||
"""
|
||||
with profiling.profile("worker_idle", worker=self):
|
||||
task = self.local_scheduler_client.get_task()
|
||||
task = self.raylet_client.get_task()
|
||||
|
||||
# Automatically restrict the GPUs available to this task.
|
||||
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
|
||||
@@ -982,7 +982,7 @@ def get_gpu_ids():
|
||||
raise Exception("ray.get_gpu_ids() currently does not work in PYTHON "
|
||||
"MODE.")
|
||||
|
||||
all_resource_ids = global_worker.local_scheduler_client.resource_ids()
|
||||
all_resource_ids = global_worker.raylet_client.resource_ids()
|
||||
assigned_ids = [
|
||||
resource_id for resource_id, _ in all_resource_ids.get("GPU", [])
|
||||
]
|
||||
@@ -1010,7 +1010,7 @@ def get_resource_ids():
|
||||
"ray.get_resource_ids() currently does not work in PYTHON "
|
||||
"MODE.")
|
||||
|
||||
return global_worker.local_scheduler_client.resource_ids()
|
||||
return global_worker.raylet_client.resource_ids()
|
||||
|
||||
|
||||
def _webui_url_helper(client):
|
||||
@@ -1733,8 +1733,8 @@ def shutdown(worker=global_worker):
|
||||
will need to reload the module.
|
||||
"""
|
||||
disconnect(worker)
|
||||
if hasattr(worker, "local_scheduler_client"):
|
||||
del worker.local_scheduler_client
|
||||
if hasattr(worker, "raylet_client"):
|
||||
del worker.raylet_client
|
||||
if hasattr(worker, "plasma_client"):
|
||||
worker.plasma_client.disconnect()
|
||||
|
||||
@@ -2120,7 +2120,7 @@ def connect(info,
|
||||
# multithreading per worker.
|
||||
worker.multithreading_warned = False
|
||||
|
||||
worker.local_scheduler_client = ray.raylet.LocalSchedulerClient(
|
||||
worker.raylet_client = ray.raylet.RayletClient(
|
||||
raylet_socket, worker.worker_id, is_worker, worker.current_task_id)
|
||||
|
||||
# Start the import thread
|
||||
@@ -2406,7 +2406,7 @@ def put(value, worker=global_worker):
|
||||
if worker.mode == LOCAL_MODE:
|
||||
# In LOCAL_MODE, ray.put is the identity operation.
|
||||
return value
|
||||
object_id = worker.local_scheduler_client.compute_put_id(
|
||||
object_id = worker.raylet_client.compute_put_id(
|
||||
worker.current_task_id, worker.put_index)
|
||||
worker.put_object(object_id, value)
|
||||
worker.put_index += 1
|
||||
@@ -2486,7 +2486,7 @@ def wait(object_ids, num_returns=1, timeout=None, worker=global_worker):
|
||||
current_task_id = worker.get_current_thread_task_id()
|
||||
|
||||
timeout = timeout if timeout is not None else 2**30
|
||||
ready_ids, remaining_ids = worker.local_scheduler_client.wait(
|
||||
ready_ids, remaining_ids = worker.raylet_client.wait(
|
||||
object_ids, num_returns, timeout, False, current_task_id)
|
||||
return ready_ids, remaining_ids
|
||||
|
||||
|
||||
+1
-1
@@ -23,7 +23,7 @@ ray_files = [
|
||||
"ray/core/src/ray/thirdparty/redis/src/redis-server",
|
||||
"ray/core/src/ray/gcs/redis_module/libray_redis_module.so",
|
||||
"ray/core/src/plasma/plasma_store_server",
|
||||
"ray/core/src/ray/raylet/liblocal_scheduler_library_python.so",
|
||||
"ray/core/src/ray/raylet/libraylet_library_python.so",
|
||||
"ray/core/src/ray/raylet/raylet_monitor", "ray/core/src/ray/raylet/raylet",
|
||||
"ray/WebUI.ipynb"
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user