mirror of
https://github.com/wassname/ray.git
synced 2026-07-01 14:36:45 +08:00
Print warning when defining very large remote function or actor. (#2179)
* Print warning when defining very large remote function or actor. * Add weak test. * Check that warnings appear in test. * Make wait_for_errors actually fail in failure_test.py. * Use constants for error types. * Fix
This commit is contained in:
committed by
Philipp Moritz
parent
1475600c81
commit
125fe1c09c
+18
-3
@@ -10,6 +10,7 @@ import traceback
|
||||
|
||||
import ray.cloudpickle as pickle
|
||||
import ray.local_scheduler
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray.signature as signature
|
||||
import ray.worker
|
||||
from ray.utils import _random_string, is_cython, push_error_to_driver
|
||||
@@ -164,7 +165,7 @@ def save_and_log_checkpoint(worker, actor):
|
||||
# Log the error message.
|
||||
ray.utils.push_error_to_driver(
|
||||
worker.redis_client,
|
||||
"checkpoint",
|
||||
ray_constants.CHECKPOINT_PUSH_ERROR,
|
||||
traceback_str,
|
||||
driver_id=worker.task_driver_id.id(),
|
||||
data={
|
||||
@@ -188,7 +189,7 @@ def restore_and_log_checkpoint(worker, actor):
|
||||
# Log the error message.
|
||||
ray.utils.push_error_to_driver(
|
||||
worker.redis_client,
|
||||
"checkpoint",
|
||||
ray_constants.CHECKPOINT_PUSH_ERROR,
|
||||
traceback_str,
|
||||
driver_id=worker.task_driver_id.id(),
|
||||
data={
|
||||
@@ -330,7 +331,7 @@ def fetch_and_register_actor(actor_class_key, worker):
|
||||
# Log the error message.
|
||||
push_error_to_driver(
|
||||
worker.redis_client,
|
||||
"register_actor_signatures",
|
||||
ray_constants.REGISTER_ACTOR_PUSH_ERROR,
|
||||
traceback_str,
|
||||
driver_id,
|
||||
data={"actor_id": actor_id_str})
|
||||
@@ -392,6 +393,20 @@ def export_actor_class(class_id, Class, actor_method_names,
|
||||
"actor_method_names": json.dumps(list(actor_method_names))
|
||||
}
|
||||
|
||||
if (len(actor_class_info["class"]) >
|
||||
ray_constants.PICKLE_OBJECT_WARNING_SIZE):
|
||||
warning_message = ("Warning: The actor {} has size {} when pickled. "
|
||||
"It will be stored in Redis, which could cause "
|
||||
"memory issues. This may mean that the actor "
|
||||
"definition uses a large array or other object."
|
||||
.format(actor_class_info["class_name"],
|
||||
len(actor_class_info["class"])))
|
||||
ray.utils.push_error_to_driver(
|
||||
worker.redis_client,
|
||||
ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR,
|
||||
warning_message,
|
||||
driver_id=worker.task_driver_id.id())
|
||||
|
||||
if worker.mode is None:
|
||||
# This means that 'ray.init()' has not been called yet and so we must
|
||||
# cache the actor class definition and export it when 'ray.init()' is
|
||||
|
||||
@@ -12,6 +12,27 @@ def env_integer(key, default):
|
||||
return default
|
||||
|
||||
|
||||
# Different types of Ray errors that can be pushed to the driver.
|
||||
# TODO(rkn): These should be defined in flatbuffers and must be synced with
|
||||
# the existing C++ definitions.
|
||||
WAIT_FOR_CLASS_PUSH_ERROR = "wait_for_class"
|
||||
PICKLING_LARGE_OBJECT_PUSH_ERROR = "pickling_large_object"
|
||||
WAIT_FOR_FUNCTION_PUSH_ERROR = "wait_for_function"
|
||||
TASK_PUSH_ERROR = "task"
|
||||
REGISTER_REMOTE_FUNCTION_PUSH_ERROR = "register_remote_function"
|
||||
FUNCTION_TO_RUN_PUSH_ERROR = "function_to_run"
|
||||
VERSION_MISMATCH_PUSH_ERROR = "version_mismatch"
|
||||
CHECKPOINT_PUSH_ERROR = "checkpoint"
|
||||
REGISTER_ACTOR_PUSH_ERROR = "register_actor"
|
||||
WORKER_CRASH_PUSH_ERROR = "worker_crash"
|
||||
WORKER_DIED_PUSH_ERROR = "worker_died"
|
||||
PUT_RECONSTRUCTION_PUSH_ERROR = "put_reconstruction"
|
||||
HASH_MISMATCH_PUSH_ERROR = "object_hash_mismatch"
|
||||
|
||||
# If a remote function or actor (or some other export) has serialized size
|
||||
# greater than this quantity, print an warning.
|
||||
PICKLE_OBJECT_WARNING_SIZE = 10**7
|
||||
|
||||
# Abort autoscaling if more than this number of errors are encountered. This
|
||||
# is a safety feature to prevent e.g. runaway node launches.
|
||||
AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)
|
||||
|
||||
+41
-6
@@ -28,6 +28,7 @@ import ray.services as services
|
||||
import ray.signature
|
||||
import ray.local_scheduler
|
||||
import ray.plasma
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.utils import random_string, binary_to_hex, is_cython
|
||||
|
||||
# Import flatbuffer bindings.
|
||||
@@ -415,7 +416,7 @@ class Worker(object):
|
||||
if not warning_sent:
|
||||
ray.utils.push_error_to_driver(
|
||||
self.redis_client,
|
||||
"wait_for_class",
|
||||
ray_constants.WAIT_FOR_CLASS_PUSH_ERROR,
|
||||
warning_message,
|
||||
driver_id=self.task_driver_id.id())
|
||||
warning_sent = True
|
||||
@@ -637,6 +638,19 @@ class Worker(object):
|
||||
else:
|
||||
del function.__globals__[function.__name__]
|
||||
|
||||
if len(pickled_function) > ray_constants.PICKLE_OBJECT_WARNING_SIZE:
|
||||
warning_message = ("Warning: The remote function {} has size {} "
|
||||
"when pickled. It will be stored in Redis, "
|
||||
"which could cause memory issues. This may "
|
||||
"mean that the function definition uses a "
|
||||
"large array or other object.".format(
|
||||
function_name, len(pickled_function)))
|
||||
ray.utils.push_error_to_driver(
|
||||
self.redis_client,
|
||||
ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR,
|
||||
warning_message,
|
||||
driver_id=self.task_driver_id.id())
|
||||
|
||||
self.redis_client.hmset(
|
||||
key, {
|
||||
"driver_id": self.task_driver_id.id(),
|
||||
@@ -684,6 +698,22 @@ class Worker(object):
|
||||
# In this case, the function has already been exported, so
|
||||
# we don't need to export it again.
|
||||
return
|
||||
|
||||
if (len(pickled_function) >
|
||||
ray_constants.PICKLE_OBJECT_WARNING_SIZE):
|
||||
warning_message = ("Warning: The function {} has size {} when "
|
||||
"pickled. It will be stored in Redis, "
|
||||
"which could cause memory issues. This may "
|
||||
"mean that the remote function definition "
|
||||
"uses a large array or other object."
|
||||
.format(function.__name__,
|
||||
len(pickled_function)))
|
||||
ray.utils.push_error_to_driver(
|
||||
self.redis_client,
|
||||
ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR,
|
||||
warning_message,
|
||||
driver_id=self.task_driver_id.id())
|
||||
|
||||
# Run the function on all workers.
|
||||
self.redis_client.hmset(
|
||||
key, {
|
||||
@@ -735,7 +765,7 @@ class Worker(object):
|
||||
if not warning_sent:
|
||||
ray.utils.push_error_to_driver(
|
||||
self.redis_client,
|
||||
"wait_for_function",
|
||||
ray_constants.WAIT_FOR_FUNCTION_PUSH_ERROR,
|
||||
warning_message,
|
||||
driver_id=driver_id)
|
||||
warning_sent = True
|
||||
@@ -896,7 +926,7 @@ class Worker(object):
|
||||
# Log the error message.
|
||||
ray.utils.push_error_to_driver(
|
||||
self.redis_client,
|
||||
"task",
|
||||
ray_constants.TASK_PUSH_ERROR,
|
||||
str(failure_object),
|
||||
driver_id=self.task_driver_id.id(),
|
||||
data={
|
||||
@@ -1132,6 +1162,11 @@ def error_info(worker=global_worker):
|
||||
for error_key in error_keys:
|
||||
if error_applies_to_driver(error_key, worker=worker):
|
||||
error_contents = worker.redis_client.hgetall(error_key)
|
||||
error_contents = {
|
||||
"type": error_contents[b"type"].decode("ascii"),
|
||||
"message": error_contents[b"message"].decode("ascii"),
|
||||
"data": error_contents[b"data"].decode("ascii")
|
||||
}
|
||||
errors.append(error_contents)
|
||||
|
||||
return errors
|
||||
@@ -1823,7 +1858,7 @@ def fetch_and_register_remote_function(key, worker=global_worker):
|
||||
# Log the error message.
|
||||
ray.utils.push_error_to_driver(
|
||||
worker.redis_client,
|
||||
"register_remote_function",
|
||||
ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR,
|
||||
traceback_str,
|
||||
driver_id=driver_id,
|
||||
data={
|
||||
@@ -1868,7 +1903,7 @@ def fetch_and_execute_function_to_run(key, worker=global_worker):
|
||||
and hasattr(function, "__name__")) else ""
|
||||
ray.utils.push_error_to_driver(
|
||||
worker.redis_client,
|
||||
"function_to_run",
|
||||
ray_constants.FUNCTION_TO_RUN_PUSH_ERROR,
|
||||
traceback_str,
|
||||
driver_id=driver_id,
|
||||
data={"name": name})
|
||||
@@ -2028,7 +2063,7 @@ def connect(info,
|
||||
traceback_str = traceback.format_exc()
|
||||
ray.utils.push_error_to_driver(
|
||||
worker.redis_client,
|
||||
"version_mismatch",
|
||||
ray_constants.VERSION_MISMATCH_PUSH_ERROR,
|
||||
traceback_str,
|
||||
driver_id=None)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user