Implement unsafe method for flushing entire object table and task table. (#1824)

* Implement unsafe method for flushing entire object table and task table.

* Add test.

* Fix test.
This commit is contained in:
Robert Nishihara
2018-04-04 18:29:24 -07:00
committed by Philipp Moritz
parent 888e70f1be
commit 5bde5e75e7
3 changed files with 103 additions and 2 deletions
+3 -2
View File
@@ -3,6 +3,7 @@ from __future__ import division
from __future__ import print_function
from .tfutils import TensorFlowVariables
from .features import flush_redis_unsafe
from .features import flush_redis_unsafe, flush_task_and_object_metadata_unsafe
__all__ = ["TensorFlowVariables", "flush_redis_unsafe"]
__all__ = ["TensorFlowVariables", "flush_redis_unsafe",
"flush_task_and_object_metadata_unsafe"]
+46
View File
@@ -4,6 +4,10 @@ from __future__ import print_function
import ray
# Key prefixes used to build the "<prefix>*" match patterns with which
# flush_task_and_object_metadata_unsafe scans each Redis shard for object
# info, object location, and task table entries to delete.
OBJECT_INFO_PREFIX = b"OI:"
OBJECT_LOCATION_PREFIX = b"OL:"
TASK_TABLE_PREFIX = b"TT:"
def flush_redis_unsafe():
"""This removes some non-critical state from the primary Redis shard.
@@ -35,3 +39,45 @@ def flush_redis_unsafe():
else:
num_deleted = 0
print("Deleted {} event logs from Redis.".format(num_deleted))
def flush_task_and_object_metadata_unsafe():
    """Remove all task and object metadata from every Redis shard.

    This removes some critical state from the Redis shards. In a multitenant
    environment, this will flush metadata for all jobs, which may be
    undesirable.

    This removes all of the object and task metadata. This can be used to try
    to address out-of-memory errors caused by the accumulation of metadata in
    Redis. However, after running this command, fault tolerance will most
    likely not work.

    For each shard, the number of keys deleted per category is printed to
    standard output.

    Raises:
        Exception: If ray.init() has not been called yet, i.e. the global
            worker has no "redis_client" attribute.
    """
    if not hasattr(ray.worker.global_worker, "redis_client"):
        # Name this function (not flush_redis_unsafe) so the message points
        # the user at the call that actually failed.
        raise Exception(
            "ray.experimental.flush_task_and_object_metadata_unsafe cannot "
            "be called before ray.init() has been called.")

    # Pairs of (key prefix, label used in the progress message). The task
    # table comes first. Note that flushing it also removes the driver
    # tasks, which may be undesirable.
    tables_to_flush = [
        (TASK_TABLE_PREFIX, "task"),
        (OBJECT_INFO_PREFIX, "object info"),
        (OBJECT_LOCATION_PREFIX, "object location"),
    ]

    def flush_shard(redis_client):
        """Delete every key matching each prefix on a single shard."""
        for prefix, label in tables_to_flush:
            num_deleted = 0
            # delete() returns the number of keys it removed, so keys that
            # vanished between the scan and the delete are not counted.
            for key in redis_client.scan_iter(match=prefix + b"*"):
                num_deleted += redis_client.delete(key)
            print("Deleted {} {} keys from Redis.".format(num_deleted, label))

    # Loop over the shards and flush all of them.
    for redis_client in ray.worker.global_state.redis_clients:
        flush_shard(redis_client)