Convert the raylet client (the code in local_scheduler_client.cc) to proper C++. (#3511)

* refactoring

* fix bugs

* create client class

* create client class for java; bug fix

* remove legacy code

* improve code by using std::string and std::unique_ptr, renaming private fields, and removing legacy code

* rename class

* improve naming

* fix

* rename files

* fix names

* change name

* change return types

* make a mutex private field

* fix comments

* fix bugs

* lint

* bug fix

* bug fix

* move very short functions into the header file

* Loosen crash conditions for some APIs.

* Apply suggestions from code review

Co-Authored-By: suquark <suquark@gmail.com>

* format

* update

* rename python APIs

* fix java

* more fixes

* change types of cpython interface

* more fixes

* improve error processing

* improve error processing for java wrapper

* lint

* fix java

* make fields const

* use pointers for [out] parameters

* fix java & error msg

* fix resource leak, etc.
This commit is contained in:
Si-Yuan
2018-12-13 13:39:10 -08:00
committed by Philipp Moritz
parent 5dcc333199
commit 84fae57ab5
22 changed files with 739 additions and 828 deletions
+3 -4
View File
@@ -791,7 +791,7 @@ def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
# Disconnect the worker from the local scheduler. The point of
# this is so that when the worker kills itself below, the local
# scheduler won't push an error message to the driver.
worker.local_scheduler_client.disconnect()
worker.raylet_client.disconnect()
sys.exit(0)
assert False, "This process should have terminated."
@@ -832,8 +832,7 @@ def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
# the local scheduler will not be included, and may not be runnable
# on checkpoint resumption.
actor_id = ray.ObjectID(worker.actor_id)
frontier = worker.local_scheduler_client.get_actor_frontier(
actor_id)
frontier = worker.raylet_client.get_actor_frontier(actor_id)
# Save the checkpoint in Redis. TODO(rkn): Checkpoints
# should not be stored in Redis. Fix this.
set_actor_checkpoint(worker, worker.actor_id, checkpoint_index,
@@ -863,7 +862,7 @@ def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
# Set the number of tasks executed so far.
worker.actor_task_counter = checkpoint_index
# Set the actor frontier in the local scheduler.
worker.local_scheduler_client.set_actor_frontier(frontier)
worker.raylet_client.set_actor_frontier(frontier)
checkpoint_resumed = True
return checkpoint_resumed
+2 -2
View File
@@ -36,10 +36,10 @@ def warmup():
def fetch(oids):
local_sched_client = ray.worker.global_worker.local_scheduler_client
raylet_client = ray.worker.global_worker.raylet_client
for o in oids:
ray_obj_id = ray.ObjectID(o)
local_sched_client.fetch_or_reconstruct([ray_obj_id], True)
raylet_client.fetch_or_reconstruct([ray_obj_id], True)
def run_timeline(sess, ops, feed_dict=None, write_timeline=False, name=""):
+1 -1
View File
@@ -42,4 +42,4 @@ def free(object_ids, local_only=False, worker=None):
if len(object_ids) == 0:
return
worker.local_scheduler_client.free(object_ids, local_only)
worker.raylet_client.free_objects(object_ids, local_only)
+1 -1
View File
@@ -119,7 +119,7 @@ class Profiler(object):
else:
component_type = "driver"
self.worker.local_scheduler_client.push_profile_events(
self.worker.raylet_client.push_profile_events(
component_type, ray.ObjectID(self.worker.worker_id),
self.worker.node_ip_address, events)
+3 -3
View File
@@ -2,12 +2,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.core.src.ray.raylet.liblocal_scheduler_library_python import (
Task, LocalSchedulerClient, ObjectID, check_simple_value, compute_task_id,
from ray.core.src.ray.raylet.libraylet_library_python import (
Task, RayletClient, ObjectID, check_simple_value, compute_task_id,
task_from_string, task_to_string, _config, common_error)
__all__ = [
"Task", "LocalSchedulerClient", "ObjectID", "check_simple_value",
"Task", "RayletClient", "ObjectID", "check_simple_value",
"compute_task_id", "task_from_string", "task_to_string",
"start_local_scheduler", "_config", "common_error"
]
+2 -2
View File
@@ -39,8 +39,8 @@ class TaskPool(object):
for worker, obj_id in self.completed():
plasma_id = ray.pyarrow.plasma.ObjectID(obj_id.id())
(ray.worker.global_worker.local_scheduler_client.
fetch_or_reconstruct([obj_id], True))
(ray.worker.global_worker.raylet_client.fetch_or_reconstruct(
[obj_id], True))
self._fetching.append((worker, obj_id))
remaining = []
+1 -1
View File
@@ -69,7 +69,7 @@ def push_error_to_driver(worker,
if driver_id is None:
driver_id = ray_constants.NIL_JOB_ID.id()
data = {} if data is None else data
worker.local_scheduler_client.push_error(
worker.raylet_client.push_error(
ray.ObjectID(driver_id), error_type, message, time.time())
+13 -13
View File
@@ -455,7 +455,7 @@ class Worker(object):
]
for i in range(0, len(object_ids),
ray._config.worker_fetch_request_size()):
self.local_scheduler_client.fetch_or_reconstruct(
self.raylet_client.fetch_or_reconstruct(
object_ids[i:(i + ray._config.worker_fetch_request_size())],
True)
@@ -490,7 +490,7 @@ class Worker(object):
ray._config.worker_fetch_request_size())
for i in range(0, len(object_ids_to_fetch),
fetch_request_size):
self.local_scheduler_client.fetch_or_reconstruct(
self.raylet_client.fetch_or_reconstruct(
ray_object_ids_to_fetch[i:(
i + fetch_request_size)], False,
current_task_id)
@@ -511,7 +511,7 @@ class Worker(object):
# If there were objects that we weren't able to get locally,
# let the local scheduler know that we're now unblocked.
self.local_scheduler_client.notify_unblocked(current_task_id)
self.raylet_client.notify_unblocked(current_task_id)
assert len(final_results) == len(object_ids)
return final_results
@@ -628,7 +628,7 @@ class Worker(object):
actor_creation_id, actor_creation_dummy_object_id, actor_id,
actor_handle_id, actor_counter, execution_dependencies,
resources, placement_resources)
self.local_scheduler_client.submit(task)
self.raylet_client.submit_task(task)
return task.returns()
@@ -936,7 +936,7 @@ class Worker(object):
reached_max_executions = (self.function_actor_manager.get_task_counter(
driver_id, function_id.id()) == execution_info.max_calls)
if reached_max_executions:
self.local_scheduler_client.disconnect()
self.raylet_client.disconnect()
sys.exit(0)
def _get_next_task_from_local_scheduler(self):
@@ -946,7 +946,7 @@ class Worker(object):
A task from the local scheduler.
"""
with profiling.profile("worker_idle", worker=self):
task = self.local_scheduler_client.get_task()
task = self.raylet_client.get_task()
# Automatically restrict the GPUs available to this task.
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
@@ -982,7 +982,7 @@ def get_gpu_ids():
raise Exception("ray.get_gpu_ids() currently does not work in PYTHON "
"MODE.")
all_resource_ids = global_worker.local_scheduler_client.resource_ids()
all_resource_ids = global_worker.raylet_client.resource_ids()
assigned_ids = [
resource_id for resource_id, _ in all_resource_ids.get("GPU", [])
]
@@ -1010,7 +1010,7 @@ def get_resource_ids():
"ray.get_resource_ids() currently does not work in PYTHON "
"MODE.")
return global_worker.local_scheduler_client.resource_ids()
return global_worker.raylet_client.resource_ids()
def _webui_url_helper(client):
@@ -1733,8 +1733,8 @@ def shutdown(worker=global_worker):
will need to reload the module.
"""
disconnect(worker)
if hasattr(worker, "local_scheduler_client"):
del worker.local_scheduler_client
if hasattr(worker, "raylet_client"):
del worker.raylet_client
if hasattr(worker, "plasma_client"):
worker.plasma_client.disconnect()
@@ -2120,7 +2120,7 @@ def connect(info,
# multithreading per worker.
worker.multithreading_warned = False
worker.local_scheduler_client = ray.raylet.LocalSchedulerClient(
worker.raylet_client = ray.raylet.RayletClient(
raylet_socket, worker.worker_id, is_worker, worker.current_task_id)
# Start the import thread
@@ -2406,7 +2406,7 @@ def put(value, worker=global_worker):
if worker.mode == LOCAL_MODE:
# In LOCAL_MODE, ray.put is the identity operation.
return value
object_id = worker.local_scheduler_client.compute_put_id(
object_id = worker.raylet_client.compute_put_id(
worker.current_task_id, worker.put_index)
worker.put_object(object_id, value)
worker.put_index += 1
@@ -2486,7 +2486,7 @@ def wait(object_ids, num_returns=1, timeout=None, worker=global_worker):
current_task_id = worker.get_current_thread_task_id()
timeout = timeout if timeout is not None else 2**30
ready_ids, remaining_ids = worker.local_scheduler_client.wait(
ready_ids, remaining_ids = worker.raylet_client.wait(
object_ids, num_returns, timeout, False, current_task_id)
return ready_ids, remaining_ids
+1 -1
View File
@@ -23,7 +23,7 @@ ray_files = [
"ray/core/src/ray/thirdparty/redis/src/redis-server",
"ray/core/src/ray/gcs/redis_module/libray_redis_module.so",
"ray/core/src/plasma/plasma_store_server",
"ray/core/src/ray/raylet/liblocal_scheduler_library_python.so",
"ray/core/src/ray/raylet/libraylet_library_python.so",
"ray/core/src/ray/raylet/raylet_monitor", "ray/core/src/ray/raylet/raylet",
"ray/WebUI.ipynb"
]