[Core] Added support for submission-time task names. (#10449)

* Added support for submission-time task names.

* Suggestions from code review: add missing consts

Co-authored-by: SangBin Cho <rkooo567@gmail.com>

* Add num_returns arg to actor method options docstring example.

* Add process name line and proctitle assertion to submission-time task name section of advanced docs.

* Add submission-time task name --> proctitle test for Python worker.

* Added Python actor options tests for num_returns and name.

* Added Java test for submission-time task names.

* Add dashboard image to task name docs section.

* Move to f-strings.

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
This commit is contained in:
Clark Zinzow
2020-09-03 12:45:24 -06:00
committed by GitHub
parent 71274954d1
commit 0c0b0d0a73
37 changed files with 361 additions and 135 deletions
+16 -10
View File
@@ -343,6 +343,7 @@ def switch_worker_log_if_needed(worker, next_job_id):
cdef execute_task(
CTaskType task_type,
const c_string name,
const CRayFunction &ray_function,
const unordered_map[c_string, double] &c_resources,
const c_vector[shared_ptr[CRayObject]] &c_args,
@@ -386,16 +387,18 @@ cdef execute_task(
extra_data = (b'{"name": ' + function_name.encode("ascii") +
b' "task_id": ' + task_id.hex().encode("ascii") + b'}')
task_name = name.decode("utf-8")
title = f"ray::{task_name}"
if <int>task_type == <int>TASK_TYPE_NORMAL_TASK:
title = "ray::{}()".format(function_name)
next_title = "ray::IDLE"
function_executor = execution_info.function
else:
actor = worker.actors[core_worker.get_actor_id()]
class_name = actor.__class__.__name__
title = "ray::{}.{}()".format(class_name, function_name)
next_title = "ray::{}".format(class_name)
worker_name = "ray_{}_{}".format(class_name, os.getpid())
next_title = f"ray::{class_name}"
pid = os.getpid()
worker_name = f"ray_{class_name}_{pid}"
if c_resources.find(b"memory") != c_resources.end():
worker.memory_monitor.set_heap_limit(
worker_name,
@@ -470,8 +473,7 @@ cdef execute_task(
if (<int>task_type == <int>TASK_TYPE_ACTOR_CREATION_TASK):
actor = worker.actors[core_worker.get_actor_id()]
class_name = actor.__class__.__name__
actor_title = "{}({}, {})".format(
class_name, repr(args), repr(kwargs))
actor_title = f"{class_name}({args!r}, {kwargs!r})"
core_worker.set_actor_title(actor_title.encode("utf-8"))
# Execute the task.
with core_worker.profile_event(b"task:execute"):
@@ -535,6 +537,7 @@ cdef execute_task(
cdef CRayStatus task_execution_handler(
CTaskType task_type,
const c_string task_name,
const CRayFunction &ray_function,
const unordered_map[c_string, double] &c_resources,
const c_vector[shared_ptr[CRayObject]] &c_args,
@@ -547,8 +550,9 @@ cdef CRayStatus task_execution_handler(
try:
# The call to execute_task should never raise an exception. If
# it does, that indicates that there was an internal error.
execute_task(task_type, ray_function, c_resources, c_args,
c_arg_reference_ids, c_return_ids, returns)
execute_task(task_type, task_name, ray_function, c_resources,
c_args, c_arg_reference_ids, c_return_ids,
returns)
except Exception:
traceback_str = traceback.format_exc() + (
"An unexpected internal error occurred while the worker "
@@ -985,6 +989,7 @@ cdef class CoreWorker:
Language language,
FunctionDescriptor function_descriptor,
args,
c_string name,
int num_returns,
resources,
int max_retries,
@@ -1002,7 +1007,7 @@ cdef class CoreWorker:
with self.profile_event(b"submit_task"):
prepare_resources(resources, &c_resources)
task_options = CTaskOptions(
num_returns, c_resources)
name, num_returns, c_resources)
ray_function = CRayFunction(
language.lang, function_descriptor.descriptor)
prepare_args(self, language, args, &args_vector)
@@ -1112,6 +1117,7 @@ cdef class CoreWorker:
ActorID actor_id,
FunctionDescriptor function_descriptor,
args,
c_string name,
int num_returns,
double num_method_cpus):
@@ -1126,7 +1132,7 @@ cdef class CoreWorker:
with self.profile_event(b"submit_task"):
if num_method_cpus > 0:
c_resources[b"CPU"] = num_method_cpus
task_options = CTaskOptions(num_returns, c_resources)
task_options = CTaskOptions(name, num_returns, c_resources)
ray_function = CRayFunction(
language.lang, function_descriptor.descriptor)
prepare_args(self, language, args, &args_vector)
+29 -4
View File
@@ -100,7 +100,27 @@ class ActorMethod:
def remote(self, *args, **kwargs):
return self._remote(args, kwargs)
def _remote(self, args=None, kwargs=None, num_returns=None):
def options(self, **options):
    """Bind call-time options to this actor method.

    Accepts the same keyword options as ``_remote()`` and returns a small
    wrapper object. Calling the wrapper's plain ``.remote(*args, **kwargs)``
    forwards to ``self._remote(args=args, kwargs=kwargs, **options)``.

    Examples:
        # The following two calls are equivalent.
        >>> actor.my_method._remote(args=[x, y], name="foo", num_returns=2)
        >>> actor.my_method.options(name="foo", num_returns=2).remote(x, y)
    """
    # Capture the actor method under a distinct name: inside the wrapper
    # class below, ``self`` refers to the wrapper instance instead.
    actor_method = self

    class FuncWrapper:
        def remote(self, *args, **kwargs):
            # Positional and keyword call arguments are packed up and
            # passed along together with the options bound above.
            return actor_method._remote(
                args=args, kwargs=kwargs, **options)

    return FuncWrapper()
def _remote(self, args=None, kwargs=None, name="", num_returns=None):
if num_returns is None:
num_returns = self._num_returns
@@ -112,6 +132,7 @@ class ActorMethod:
self._method_name,
args=args,
kwargs=kwargs,
name=name,
num_returns=num_returns)
# Apply the decorator if there is one.
@@ -317,8 +338,10 @@ class ActorClass:
max_task_retries, num_cpus, num_gpus, memory,
object_store_memory, resources):
for attribute in [
"remote", "_remote", "_ray_from_modified_class",
"_ray_from_function_descriptor"
"remote",
"_remote",
"_ray_from_modified_class",
"_ray_from_function_descriptor",
]:
if hasattr(modified_class, attribute):
logger.warning("Creating an actor from class "
@@ -679,6 +702,7 @@ class ActorHandle:
method_name,
args=None,
kwargs=None,
name="",
num_returns=None):
"""Method execution stub for an actor handle.
@@ -691,6 +715,7 @@ class ActorHandle:
method_name: The name of the actor method to execute.
args: A list of arguments for the actor method.
kwargs: A dictionary of keyword arguments for the actor method.
name (str): The name to give the actor method call task.
num_returns (int): The number of return values for the method.
Returns:
@@ -724,7 +749,7 @@ class ActorHandle:
object_refs = worker.core_worker.submit_actor_task(
self._ray_actor_language, self._ray_actor_id, function_descriptor,
list_args, num_returns, self._ray_actor_method_cpus)
list_args, name, num_returns, self._ray_actor_method_cpus)
if len(object_refs) == 1:
object_refs = object_refs[0]
+1 -1
View File
@@ -241,7 +241,7 @@ cdef extern from "ray/core_worker/common.h" nogil:
cdef cppclass CTaskOptions "ray::TaskOptions":
CTaskOptions()
CTaskOptions(int num_returns,
CTaskOptions(c_string name, int num_returns,
unordered_map[c_string, double] &resources)
cdef cppclass CActorCreationOptions "ray::ActorCreationOptions":
+1
View File
@@ -217,6 +217,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
c_string stderr_file
(CRayStatus(
CTaskType task_type,
const c_string name,
const CRayFunction &ray_function,
const unordered_map[c_string, double] &resources,
const c_vector[shared_ptr[CRayObject]] &args,
+3 -2
View File
@@ -152,7 +152,8 @@ class RemoteFunction:
resources=None,
max_retries=None,
placement_group=None,
placement_group_bundle_index=-1):
placement_group_bundle_index=-1,
name=""):
"""Submit the remote function for execution."""
worker = ray.worker.global_worker
worker.check_connected()
@@ -212,7 +213,7 @@ class RemoteFunction:
"Cross language remote function " \
"cannot be executed locally."
object_refs = worker.core_worker.submit_task(
self._language, self._function_descriptor, list_args,
self._language, self._function_descriptor, list_args, name,
num_returns, resources, max_retries, placement_group.id,
placement_group_bundle_index)
+31
View File
@@ -14,6 +14,10 @@ import ray
import ray.test_utils
import ray.cluster_utils
# NOTE: We have to import setproctitle after ray because we bundle setproctitle
# with ray.
import setproctitle
def test_caching_actors(shutdown_only):
# Test defining actors before ray.init() has been called.
@@ -673,6 +677,33 @@ def test_multiple_return_values(ray_start_regular_shared):
assert ray.get([id3a, id3b, id3c]) == [1, 2, 3]
def test_options_num_returns(ray_start_regular_shared):
    """Check ``num_returns`` passed through actor method ``.options()``."""

    @ray.remote
    class Foo:
        def method(self):
            return 1, 2

    handle = Foo.remote()

    # Plain call: the returned pair is fetched as a single tuple.
    whole_ref = handle.method.remote()
    assert ray.get(whole_ref) == (1, 2)

    # With num_returns=2 the call yields two refs, one per element.
    first_ref, second_ref = handle.method.options(num_returns=2).remote()
    assert ray.get([first_ref, second_ref]) == [1, 2]
def test_options_name(ray_start_regular_shared):
    """Check that ``.options(name=...)`` on an actor method sets proctitle."""

    @ray.remote
    class Foo:
        def method(self, name):
            # Runs inside the actor process; the title is expected to be
            # the submitted task name prefixed with "ray::".
            assert setproctitle.getproctitle() == f"ray::{name}"

    handle = Foo.remote()

    # Submit the same method under two different task names, waiting for
    # each call to finish before issuing the next.
    for submitted_name in ("foo", "bar"):
        named_method = handle.method.options(name=submitted_name)
        ray.get(named_method.remote(submitted_name))
def test_define_actor(ray_start_regular_shared):
@ray.remote
class Test:
+28 -6
View File
@@ -35,7 +35,7 @@ def attempt_to_load_balance(remote_function,
[remote_function.remote(*args) for _ in range(total_tasks)])
names = set(locations)
counts = [locations.count(name) for name in names]
logger.info("Counts are {}.".format(counts))
logger.info(f"Counts are {counts}.")
if (len(names) == num_nodes
and all(count >= minimum_count for count in counts)):
break
@@ -346,6 +346,28 @@ def test_ray_setproctitle(ray_start_2_cpus):
ray.get(unique_1.remote())
def test_ray_task_name_setproctitle(ray_start_2_cpus):
    """Check that submission-time task names show up in the proctitle.

    Covers both an actor method call and a plain remote function, each
    submitted through ``.options(name=...)``.
    """
    actor_call_name = "foo"
    function_call_name = "bar"

    @ray.remote
    class UniqueName:
        def __init__(self):
            assert setproctitle.getproctitle() == "ray::UniqueName.__init__()"

        def f(self):
            assert setproctitle.getproctitle() == f"ray::{actor_call_name}"

    @ray.remote
    def unique_1():
        assert function_call_name in setproctitle.getproctitle()

    actor = UniqueName.remote()

    # Actor method submitted under an explicit task name.
    actor_call_ref = actor.f.options(name=actor_call_name).remote()
    ray.get(actor_call_ref)

    # Plain remote function submitted under an explicit task name.
    function_call_ref = unique_1.options(name=function_call_name).remote()
    ray.get(function_call_ref)
@pytest.mark.skipif(
os.getenv("TRAVIS") is None,
reason="This test should only be run on Travis.")
@@ -508,7 +530,7 @@ def test_invalid_unicode_in_worker_log(shutdown_only):
# Wait till first worker log file is created.
while True:
log_file_paths = glob.glob("{}/worker*.out".format(logs_dir))
log_file_paths = glob.glob(f"{logs_dir}/worker*.out")
if len(log_file_paths) == 0:
time.sleep(0.2)
else:
@@ -546,13 +568,13 @@ def test_move_log_files_to_old(shutdown_only):
# Make sure no log files are in the "old" directory before the actors
# are killed.
assert len(glob.glob("{}/old/worker*.out".format(logs_dir))) == 0
assert len(glob.glob(f"{logs_dir}/old/worker*.out")) == 0
# Now kill the actors so the files get moved to logs/old/.
[a.__ray_terminate__.remote() for a in actors]
while True:
log_file_paths = glob.glob("{}/old/worker*.out".format(logs_dir))
log_file_paths = glob.glob(f"{logs_dir}/old/worker*.out")
if len(log_file_paths) > 0:
with open(log_file_paths[0], "r") as f:
assert "function f finished\n" in f.readlines()
@@ -641,7 +663,7 @@ Blacklisted: No
"""
constraints_dict = resource_spec._constraints_from_gpu_info(info_string)
expected_dict = {
"{}V100".format(ray_constants.RESOURCE_CONSTRAINT_PREFIX): 1
f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}V100": 1,
}
assert constraints_dict == expected_dict
@@ -658,7 +680,7 @@ Blacklisted: No
"""
constraints_dict = resource_spec._constraints_from_gpu_info(info_string)
expected_dict = {
"{}T4".format(ray_constants.RESOURCE_CONSTRAINT_PREFIX): 1
f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}T4": 1,
}
assert constraints_dict == expected_dict