Print warning when defining very large remote function or actor. (#2179)

* Print warning when defining very large remote function or actor.

* Add weak test.

* Check that warnings appear in test.

* Make wait_for_errors actually fail in failure_test.py.

* Use constants for error types.

* Fix
This commit is contained in:
Robert Nishihara
2018-06-09 19:59:15 -07:00
committed by Philipp Moritz
parent 1475600c81
commit 125fe1c09c
7 changed files with 161 additions and 56 deletions
+18 -3
View File
@@ -10,6 +10,7 @@ import traceback
import ray.cloudpickle as pickle
import ray.local_scheduler
import ray.ray_constants as ray_constants
import ray.signature as signature
import ray.worker
from ray.utils import _random_string, is_cython, push_error_to_driver
@@ -164,7 +165,7 @@ def save_and_log_checkpoint(worker, actor):
# Log the error message.
ray.utils.push_error_to_driver(
worker.redis_client,
"checkpoint",
ray_constants.CHECKPOINT_PUSH_ERROR,
traceback_str,
driver_id=worker.task_driver_id.id(),
data={
@@ -188,7 +189,7 @@ def restore_and_log_checkpoint(worker, actor):
# Log the error message.
ray.utils.push_error_to_driver(
worker.redis_client,
"checkpoint",
ray_constants.CHECKPOINT_PUSH_ERROR,
traceback_str,
driver_id=worker.task_driver_id.id(),
data={
@@ -330,7 +331,7 @@ def fetch_and_register_actor(actor_class_key, worker):
# Log the error message.
push_error_to_driver(
worker.redis_client,
"register_actor_signatures",
ray_constants.REGISTER_ACTOR_PUSH_ERROR,
traceback_str,
driver_id,
data={"actor_id": actor_id_str})
@@ -392,6 +393,20 @@ def export_actor_class(class_id, Class, actor_method_names,
"actor_method_names": json.dumps(list(actor_method_names))
}
if (len(actor_class_info["class"]) >
ray_constants.PICKLE_OBJECT_WARNING_SIZE):
warning_message = ("Warning: The actor {} has size {} when pickled. "
"It will be stored in Redis, which could cause "
"memory issues. This may mean that the actor "
"definition uses a large array or other object."
.format(actor_class_info["class_name"],
len(actor_class_info["class"])))
ray.utils.push_error_to_driver(
worker.redis_client,
ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR,
warning_message,
driver_id=worker.task_driver_id.id())
if worker.mode is None:
# This means that 'ray.init()' has not been called yet and so we must
# cache the actor class definition and export it when 'ray.init()' is
+21
View File
@@ -12,6 +12,27 @@ def env_integer(key, default):
return default
# Different types of Ray errors that can be pushed to the driver.
# TODO(rkn): These should be defined in flatbuffers and must be synced with
# the existing C++ definitions.
WAIT_FOR_CLASS_PUSH_ERROR = "wait_for_class"
PICKLING_LARGE_OBJECT_PUSH_ERROR = "pickling_large_object"
WAIT_FOR_FUNCTION_PUSH_ERROR = "wait_for_function"
TASK_PUSH_ERROR = "task"
REGISTER_REMOTE_FUNCTION_PUSH_ERROR = "register_remote_function"
FUNCTION_TO_RUN_PUSH_ERROR = "function_to_run"
VERSION_MISMATCH_PUSH_ERROR = "version_mismatch"
CHECKPOINT_PUSH_ERROR = "checkpoint"
REGISTER_ACTOR_PUSH_ERROR = "register_actor"
WORKER_CRASH_PUSH_ERROR = "worker_crash"
WORKER_DIED_PUSH_ERROR = "worker_died"
PUT_RECONSTRUCTION_PUSH_ERROR = "put_reconstruction"
HASH_MISMATCH_PUSH_ERROR = "object_hash_mismatch"
# If a remote function or actor (or some other export) has serialized size
# greater than this quantity, print an warning.
PICKLE_OBJECT_WARNING_SIZE = 10**7
# Abort autoscaling if more than this number of errors are encountered. This
# is a safety feature to prevent e.g. runaway node launches.
AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)
+41 -6
View File
@@ -28,6 +28,7 @@ import ray.services as services
import ray.signature
import ray.local_scheduler
import ray.plasma
import ray.ray_constants as ray_constants
from ray.utils import random_string, binary_to_hex, is_cython
# Import flatbuffer bindings.
@@ -415,7 +416,7 @@ class Worker(object):
if not warning_sent:
ray.utils.push_error_to_driver(
self.redis_client,
"wait_for_class",
ray_constants.WAIT_FOR_CLASS_PUSH_ERROR,
warning_message,
driver_id=self.task_driver_id.id())
warning_sent = True
@@ -637,6 +638,19 @@ class Worker(object):
else:
del function.__globals__[function.__name__]
if len(pickled_function) > ray_constants.PICKLE_OBJECT_WARNING_SIZE:
warning_message = ("Warning: The remote function {} has size {} "
"when pickled. It will be stored in Redis, "
"which could cause memory issues. This may "
"mean that the function definition uses a "
"large array or other object.".format(
function_name, len(pickled_function)))
ray.utils.push_error_to_driver(
self.redis_client,
ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR,
warning_message,
driver_id=self.task_driver_id.id())
self.redis_client.hmset(
key, {
"driver_id": self.task_driver_id.id(),
@@ -684,6 +698,22 @@ class Worker(object):
# In this case, the function has already been exported, so
# we don't need to export it again.
return
if (len(pickled_function) >
ray_constants.PICKLE_OBJECT_WARNING_SIZE):
warning_message = ("Warning: The function {} has size {} when "
"pickled. It will be stored in Redis, "
"which could cause memory issues. This may "
"mean that the remote function definition "
"uses a large array or other object."
.format(function.__name__,
len(pickled_function)))
ray.utils.push_error_to_driver(
self.redis_client,
ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR,
warning_message,
driver_id=self.task_driver_id.id())
# Run the function on all workers.
self.redis_client.hmset(
key, {
@@ -735,7 +765,7 @@ class Worker(object):
if not warning_sent:
ray.utils.push_error_to_driver(
self.redis_client,
"wait_for_function",
ray_constants.WAIT_FOR_FUNCTION_PUSH_ERROR,
warning_message,
driver_id=driver_id)
warning_sent = True
@@ -896,7 +926,7 @@ class Worker(object):
# Log the error message.
ray.utils.push_error_to_driver(
self.redis_client,
"task",
ray_constants.TASK_PUSH_ERROR,
str(failure_object),
driver_id=self.task_driver_id.id(),
data={
@@ -1132,6 +1162,11 @@ def error_info(worker=global_worker):
for error_key in error_keys:
if error_applies_to_driver(error_key, worker=worker):
error_contents = worker.redis_client.hgetall(error_key)
error_contents = {
"type": error_contents[b"type"].decode("ascii"),
"message": error_contents[b"message"].decode("ascii"),
"data": error_contents[b"data"].decode("ascii")
}
errors.append(error_contents)
return errors
@@ -1823,7 +1858,7 @@ def fetch_and_register_remote_function(key, worker=global_worker):
# Log the error message.
ray.utils.push_error_to_driver(
worker.redis_client,
"register_remote_function",
ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR,
traceback_str,
driver_id=driver_id,
data={
@@ -1868,7 +1903,7 @@ def fetch_and_execute_function_to_run(key, worker=global_worker):
and hasattr(function, "__name__")) else ""
ray.utils.push_error_to_driver(
worker.redis_client,
"function_to_run",
ray_constants.FUNCTION_TO_RUN_PUSH_ERROR,
traceback_str,
driver_id=driver_id,
data={"name": name})
@@ -2028,7 +2063,7 @@ def connect(info,
traceback_str = traceback.format_exc()
ray.utils.push_error_to_driver(
worker.redis_client,
"version_mismatch",
ray_constants.VERSION_MISMATCH_PUSH_ERROR,
traceback_str,
driver_id=None)