mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 12:10:40 +08:00
[Core] Added support for submission-time task names. (#10449)
* Added support for submission-time task names.
* Suggestions from code review: add missing consts.
  Co-authored-by: SangBin Cho <rkooo567@gmail.com>
* Add num_returns arg to actor method options docstring example.
* Add process name line and proctitle assertion to submission-time task name section of advanced docs.
* Add submission-time task name --> proctitle test for Python worker.
* Added Python actor options tests for num_returns and name.
* Added Java test for submission-time task names.
* Add dashboard image to task name docs section.
* Move to f-strings.

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
This commit is contained in:
+16
-10
@@ -343,6 +343,7 @@ def switch_worker_log_if_needed(worker, next_job_id):
|
||||
|
||||
cdef execute_task(
|
||||
CTaskType task_type,
|
||||
const c_string name,
|
||||
const CRayFunction &ray_function,
|
||||
const unordered_map[c_string, double] &c_resources,
|
||||
const c_vector[shared_ptr[CRayObject]] &c_args,
|
||||
@@ -386,16 +387,18 @@ cdef execute_task(
|
||||
extra_data = (b'{"name": ' + function_name.encode("ascii") +
|
||||
b' "task_id": ' + task_id.hex().encode("ascii") + b'}')
|
||||
|
||||
task_name = name.decode("utf-8")
|
||||
title = f"ray::{task_name}"
|
||||
|
||||
if <int>task_type == <int>TASK_TYPE_NORMAL_TASK:
|
||||
title = "ray::{}()".format(function_name)
|
||||
next_title = "ray::IDLE"
|
||||
function_executor = execution_info.function
|
||||
else:
|
||||
actor = worker.actors[core_worker.get_actor_id()]
|
||||
class_name = actor.__class__.__name__
|
||||
title = "ray::{}.{}()".format(class_name, function_name)
|
||||
next_title = "ray::{}".format(class_name)
|
||||
worker_name = "ray_{}_{}".format(class_name, os.getpid())
|
||||
next_title = f"ray::{class_name}"
|
||||
pid = os.getpid()
|
||||
worker_name = f"ray_{class_name}_{pid}"
|
||||
if c_resources.find(b"memory") != c_resources.end():
|
||||
worker.memory_monitor.set_heap_limit(
|
||||
worker_name,
|
||||
@@ -470,8 +473,7 @@ cdef execute_task(
|
||||
if (<int>task_type == <int>TASK_TYPE_ACTOR_CREATION_TASK):
|
||||
actor = worker.actors[core_worker.get_actor_id()]
|
||||
class_name = actor.__class__.__name__
|
||||
actor_title = "{}({}, {})".format(
|
||||
class_name, repr(args), repr(kwargs))
|
||||
actor_title = f"{class_name}({args!r}, {kwargs!r})"
|
||||
core_worker.set_actor_title(actor_title.encode("utf-8"))
|
||||
# Execute the task.
|
||||
with core_worker.profile_event(b"task:execute"):
|
||||
@@ -535,6 +537,7 @@ cdef execute_task(
|
||||
|
||||
cdef CRayStatus task_execution_handler(
|
||||
CTaskType task_type,
|
||||
const c_string task_name,
|
||||
const CRayFunction &ray_function,
|
||||
const unordered_map[c_string, double] &c_resources,
|
||||
const c_vector[shared_ptr[CRayObject]] &c_args,
|
||||
@@ -547,8 +550,9 @@ cdef CRayStatus task_execution_handler(
|
||||
try:
|
||||
# The call to execute_task should never raise an exception. If
|
||||
# it does, that indicates that there was an internal error.
|
||||
execute_task(task_type, ray_function, c_resources, c_args,
|
||||
c_arg_reference_ids, c_return_ids, returns)
|
||||
execute_task(task_type, task_name, ray_function, c_resources,
|
||||
c_args, c_arg_reference_ids, c_return_ids,
|
||||
returns)
|
||||
except Exception:
|
||||
traceback_str = traceback.format_exc() + (
|
||||
"An unexpected internal error occurred while the worker "
|
||||
@@ -985,6 +989,7 @@ cdef class CoreWorker:
|
||||
Language language,
|
||||
FunctionDescriptor function_descriptor,
|
||||
args,
|
||||
c_string name,
|
||||
int num_returns,
|
||||
resources,
|
||||
int max_retries,
|
||||
@@ -1002,7 +1007,7 @@ cdef class CoreWorker:
|
||||
with self.profile_event(b"submit_task"):
|
||||
prepare_resources(resources, &c_resources)
|
||||
task_options = CTaskOptions(
|
||||
num_returns, c_resources)
|
||||
name, num_returns, c_resources)
|
||||
ray_function = CRayFunction(
|
||||
language.lang, function_descriptor.descriptor)
|
||||
prepare_args(self, language, args, &args_vector)
|
||||
@@ -1112,6 +1117,7 @@ cdef class CoreWorker:
|
||||
ActorID actor_id,
|
||||
FunctionDescriptor function_descriptor,
|
||||
args,
|
||||
c_string name,
|
||||
int num_returns,
|
||||
double num_method_cpus):
|
||||
|
||||
@@ -1126,7 +1132,7 @@ cdef class CoreWorker:
|
||||
with self.profile_event(b"submit_task"):
|
||||
if num_method_cpus > 0:
|
||||
c_resources[b"CPU"] = num_method_cpus
|
||||
task_options = CTaskOptions(num_returns, c_resources)
|
||||
task_options = CTaskOptions(name, num_returns, c_resources)
|
||||
ray_function = CRayFunction(
|
||||
language.lang, function_descriptor.descriptor)
|
||||
prepare_args(self, language, args, &args_vector)
|
||||
|
||||
+29
-4
@@ -100,7 +100,27 @@ class ActorMethod:
|
||||
def remote(self, *args, **kwargs):
|
||||
return self._remote(args, kwargs)
|
||||
|
||||
def _remote(self, args=None, kwargs=None, num_returns=None):
|
||||
def options(self, **options):
|
||||
"""Convenience method for executing an actor method call with options.
|
||||
|
||||
Same arguments as func._remote(), but returns a wrapped function
|
||||
that a non-underscore .remote() can be called on.
|
||||
|
||||
Examples:
|
||||
# The following two calls are equivalent.
|
||||
>>> actor.my_method._remote(args=[x, y], name="foo", num_returns=2)
|
||||
>>> actor.my_method.options(name="foo", num_returns=2).remote(x, y)
|
||||
"""
|
||||
|
||||
func_cls = self
|
||||
|
||||
class FuncWrapper:
|
||||
def remote(self, *args, **kwargs):
|
||||
return func_cls._remote(args=args, kwargs=kwargs, **options)
|
||||
|
||||
return FuncWrapper()
|
||||
|
||||
def _remote(self, args=None, kwargs=None, name="", num_returns=None):
|
||||
if num_returns is None:
|
||||
num_returns = self._num_returns
|
||||
|
||||
@@ -112,6 +132,7 @@ class ActorMethod:
|
||||
self._method_name,
|
||||
args=args,
|
||||
kwargs=kwargs,
|
||||
name=name,
|
||||
num_returns=num_returns)
|
||||
|
||||
# Apply the decorator if there is one.
|
||||
@@ -317,8 +338,10 @@ class ActorClass:
|
||||
max_task_retries, num_cpus, num_gpus, memory,
|
||||
object_store_memory, resources):
|
||||
for attribute in [
|
||||
"remote", "_remote", "_ray_from_modified_class",
|
||||
"_ray_from_function_descriptor"
|
||||
"remote",
|
||||
"_remote",
|
||||
"_ray_from_modified_class",
|
||||
"_ray_from_function_descriptor",
|
||||
]:
|
||||
if hasattr(modified_class, attribute):
|
||||
logger.warning("Creating an actor from class "
|
||||
@@ -679,6 +702,7 @@ class ActorHandle:
|
||||
method_name,
|
||||
args=None,
|
||||
kwargs=None,
|
||||
name="",
|
||||
num_returns=None):
|
||||
"""Method execution stub for an actor handle.
|
||||
|
||||
@@ -691,6 +715,7 @@ class ActorHandle:
|
||||
method_name: The name of the actor method to execute.
|
||||
args: A list of arguments for the actor method.
|
||||
kwargs: A dictionary of keyword arguments for the actor method.
|
||||
name (str): The name to give the actor method call task.
|
||||
num_returns (int): The number of return values for the method.
|
||||
|
||||
Returns:
|
||||
@@ -724,7 +749,7 @@ class ActorHandle:
|
||||
|
||||
object_refs = worker.core_worker.submit_actor_task(
|
||||
self._ray_actor_language, self._ray_actor_id, function_descriptor,
|
||||
list_args, num_returns, self._ray_actor_method_cpus)
|
||||
list_args, name, num_returns, self._ray_actor_method_cpus)
|
||||
|
||||
if len(object_refs) == 1:
|
||||
object_refs = object_refs[0]
|
||||
|
||||
@@ -241,7 +241,7 @@ cdef extern from "ray/core_worker/common.h" nogil:
|
||||
|
||||
cdef cppclass CTaskOptions "ray::TaskOptions":
|
||||
CTaskOptions()
|
||||
CTaskOptions(int num_returns,
|
||||
CTaskOptions(c_string name, int num_returns,
|
||||
unordered_map[c_string, double] &resources)
|
||||
|
||||
cdef cppclass CActorCreationOptions "ray::ActorCreationOptions":
|
||||
|
||||
@@ -217,6 +217,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
c_string stderr_file
|
||||
(CRayStatus(
|
||||
CTaskType task_type,
|
||||
const c_string name,
|
||||
const CRayFunction &ray_function,
|
||||
const unordered_map[c_string, double] &resources,
|
||||
const c_vector[shared_ptr[CRayObject]] &args,
|
||||
|
||||
@@ -152,7 +152,8 @@ class RemoteFunction:
|
||||
resources=None,
|
||||
max_retries=None,
|
||||
placement_group=None,
|
||||
placement_group_bundle_index=-1):
|
||||
placement_group_bundle_index=-1,
|
||||
name=""):
|
||||
"""Submit the remote function for execution."""
|
||||
worker = ray.worker.global_worker
|
||||
worker.check_connected()
|
||||
@@ -212,7 +213,7 @@ class RemoteFunction:
|
||||
"Cross language remote function " \
|
||||
"cannot be executed locally."
|
||||
object_refs = worker.core_worker.submit_task(
|
||||
self._language, self._function_descriptor, list_args,
|
||||
self._language, self._function_descriptor, list_args, name,
|
||||
num_returns, resources, max_retries, placement_group.id,
|
||||
placement_group_bundle_index)
|
||||
|
||||
|
||||
@@ -14,6 +14,10 @@ import ray
|
||||
import ray.test_utils
|
||||
import ray.cluster_utils
|
||||
|
||||
# NOTE: We have to import setproctitle after ray because we bundle setproctitle
|
||||
# with ray.
|
||||
import setproctitle
|
||||
|
||||
|
||||
def test_caching_actors(shutdown_only):
|
||||
# Test defining actors before ray.init() has been called.
|
||||
@@ -673,6 +677,33 @@ def test_multiple_return_values(ray_start_regular_shared):
|
||||
assert ray.get([id3a, id3b, id3c]) == [1, 2, 3]
|
||||
|
||||
|
||||
def test_options_num_returns(ray_start_regular_shared):
|
||||
@ray.remote
|
||||
class Foo:
|
||||
def method(self):
|
||||
return 1, 2
|
||||
|
||||
f = Foo.remote()
|
||||
|
||||
obj = f.method.remote()
|
||||
assert ray.get(obj) == (1, 2)
|
||||
|
||||
obj1, obj2 = f.method.options(num_returns=2).remote()
|
||||
assert ray.get([obj1, obj2]) == [1, 2]
|
||||
|
||||
|
||||
def test_options_name(ray_start_regular_shared):
|
||||
@ray.remote
|
||||
class Foo:
|
||||
def method(self, name):
|
||||
assert setproctitle.getproctitle() == f"ray::{name}"
|
||||
|
||||
f = Foo.remote()
|
||||
|
||||
ray.get(f.method.options(name="foo").remote("foo"))
|
||||
ray.get(f.method.options(name="bar").remote("bar"))
|
||||
|
||||
|
||||
def test_define_actor(ray_start_regular_shared):
|
||||
@ray.remote
|
||||
class Test:
|
||||
|
||||
@@ -35,7 +35,7 @@ def attempt_to_load_balance(remote_function,
|
||||
[remote_function.remote(*args) for _ in range(total_tasks)])
|
||||
names = set(locations)
|
||||
counts = [locations.count(name) for name in names]
|
||||
logger.info("Counts are {}.".format(counts))
|
||||
logger.info(f"Counts are {counts}.")
|
||||
if (len(names) == num_nodes
|
||||
and all(count >= minimum_count for count in counts)):
|
||||
break
|
||||
@@ -346,6 +346,28 @@ def test_ray_setproctitle(ray_start_2_cpus):
|
||||
ray.get(unique_1.remote())
|
||||
|
||||
|
||||
def test_ray_task_name_setproctitle(ray_start_2_cpus):
|
||||
method_task_name = "foo"
|
||||
|
||||
@ray.remote
|
||||
class UniqueName:
|
||||
def __init__(self):
|
||||
assert setproctitle.getproctitle() == "ray::UniqueName.__init__()"
|
||||
|
||||
def f(self):
|
||||
assert setproctitle.getproctitle() == f"ray::{method_task_name}"
|
||||
|
||||
task_name = "bar"
|
||||
|
||||
@ray.remote
|
||||
def unique_1():
|
||||
assert task_name in setproctitle.getproctitle()
|
||||
|
||||
actor = UniqueName.remote()
|
||||
ray.get(actor.f.options(name=method_task_name).remote())
|
||||
ray.get(unique_1.options(name=task_name).remote())
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.getenv("TRAVIS") is None,
|
||||
reason="This test should only be run on Travis.")
|
||||
@@ -508,7 +530,7 @@ def test_invalid_unicode_in_worker_log(shutdown_only):
|
||||
|
||||
# Wait till first worker log file is created.
|
||||
while True:
|
||||
log_file_paths = glob.glob("{}/worker*.out".format(logs_dir))
|
||||
log_file_paths = glob.glob(f"{logs_dir}/worker*.out")
|
||||
if len(log_file_paths) == 0:
|
||||
time.sleep(0.2)
|
||||
else:
|
||||
@@ -546,13 +568,13 @@ def test_move_log_files_to_old(shutdown_only):
|
||||
|
||||
# Make sure no log files are in the "old" directory before the actors
|
||||
# are killed.
|
||||
assert len(glob.glob("{}/old/worker*.out".format(logs_dir))) == 0
|
||||
assert len(glob.glob(f"{logs_dir}/old/worker*.out")) == 0
|
||||
|
||||
# Now kill the actors so the files get moved to logs/old/.
|
||||
[a.__ray_terminate__.remote() for a in actors]
|
||||
|
||||
while True:
|
||||
log_file_paths = glob.glob("{}/old/worker*.out".format(logs_dir))
|
||||
log_file_paths = glob.glob(f"{logs_dir}/old/worker*.out")
|
||||
if len(log_file_paths) > 0:
|
||||
with open(log_file_paths[0], "r") as f:
|
||||
assert "function f finished\n" in f.readlines()
|
||||
@@ -641,7 +663,7 @@ Blacklisted: No
|
||||
"""
|
||||
constraints_dict = resource_spec._constraints_from_gpu_info(info_string)
|
||||
expected_dict = {
|
||||
"{}V100".format(ray_constants.RESOURCE_CONSTRAINT_PREFIX): 1
|
||||
f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}V100": 1,
|
||||
}
|
||||
assert constraints_dict == expected_dict
|
||||
|
||||
@@ -658,7 +680,7 @@ Blacklisted: No
|
||||
"""
|
||||
constraints_dict = resource_spec._constraints_from_gpu_info(info_string)
|
||||
expected_dict = {
|
||||
"{}T4".format(ray_constants.RESOURCE_CONSTRAINT_PREFIX): 1
|
||||
f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}T4": 1,
|
||||
}
|
||||
assert constraints_dict == expected_dict
|
||||
|
||||
|
||||
Reference in New Issue
Block a user