[Object Spilling] Remove LRU eviction (#13977)

* done.

* formatting.

* done.

* done.
This commit is contained in:
SangBin Cho
2021-02-15 14:24:53 -08:00
committed by GitHub
parent e457872fe1
commit 4ad79ca963
20 changed files with 128 additions and 310 deletions
-4
View File
@@ -120,10 +120,6 @@ class Node:
raise ValueError(
"Internal config parameters can only be set on the head node.")
if ray_params._lru_evict:
assert (connect_only or
head), "LRU Evict can only be passed into the head node."
self._raylet_ip_address = raylet_ip_address
ray_params.update_if_absent(
+5 -14
View File
@@ -102,7 +102,6 @@ class RayParams:
_system_config (dict): Configuration for overriding RayConfig
defaults. Used to set system configuration and for experimental Ray
core feature flags.
lru_evict (bool): Enable LRU eviction if space is needed.
enable_object_reconstruction (bool): Enable plasma reconstruction on
failure.
start_initial_python_workers_for_first_job (bool): If true, start
@@ -199,30 +198,22 @@ class RayParams:
self.start_initial_python_workers_for_first_job = (
start_initial_python_workers_for_first_job)
self._system_config = _system_config or {}
self._lru_evict = lru_evict
self._enable_object_reconstruction = enable_object_reconstruction
self._check_usage()
# Set the internal config options for LRU eviction.
if lru_evict:
# Turn off object pinning.
if self._system_config is None:
self._system_config = dict()
if self._system_config.get("object_pinning_enabled", False):
raise Exception(
"Object pinning cannot be enabled if using LRU eviction.")
self._system_config["object_pinning_enabled"] = False
self._system_config["free_objects_period_milliseconds"] = 1000
raise DeprecationWarning(
"The lru_evict flag is deprecated as Ray natively "
"supports object spilling. Please read "
"https://docs.ray.io/en/master/memory-management.html#object-spilling " # noqa
"for more details.")
# Set the internal config options for object reconstruction.
if enable_object_reconstruction:
# Turn off object pinning.
if self._system_config is None:
self._system_config = dict()
if lru_evict:
raise Exception(
"Object reconstruction cannot be enabled if using LRU "
"eviction.")
print(self._system_config)
self._system_config["lineage_pinning_enabled"] = True
self._system_config["free_objects_period_milliseconds"] = -1
+8 -14
View File
@@ -32,10 +32,9 @@ def ray_init_with_task_retry_delay():
@pytest.mark.parametrize(
"ray_start_regular", [{
"object_store_memory": 150 * 1024 * 1024,
"_lru_evict": True,
}],
indirect=True)
def test_actor_eviction(ray_start_regular):
def test_actor_spilled(ray_start_regular):
object_store_memory = 150 * 1024 * 1024
@ray.remote
@@ -58,19 +57,14 @@ def test_actor_eviction(ray_start_regular):
ray.get(obj)
# Get each object again. At this point, the earlier objects should have
# been evicted.
num_evicted, num_success = 0, 0
# been spilled.
num_success = 0
for obj in objects:
try:
val = ray.get(obj)
assert isinstance(val, np.ndarray), val
num_success += 1
except ray.exceptions.ObjectLostError:
num_evicted += 1
# Some objects should have been evicted, and some should still be in the
# object store.
assert num_evicted > 0
assert num_success > 0
val = ray.get(obj)
assert isinstance(val, np.ndarray), val
num_success += 1
# All of the objects should've been spilled, so all of them should succeed.
assert num_success == len(objects)
@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.")
+1 -4
View File
@@ -344,10 +344,7 @@ def test_initialized_local_mode(shutdown_only_with_initialization_check):
def test_wait_reconstruction(shutdown_only):
ray.init(
num_cpus=1,
object_store_memory=int(10**8),
_system_config={"object_pinning_enabled": 0})
ray.init(num_cpus=1, object_store_memory=int(10**8))
@ray.remote
def f():
+2 -4
View File
@@ -342,7 +342,7 @@ def test_call_chain(ray_start_cluster):
@pytest.mark.skipif(client_test_enabled(), reason="message size")
def test_system_config_when_connecting(ray_start_cluster):
config = {"object_pinning_enabled": 0, "object_timeout_milliseconds": 200}
config = {"object_timeout_milliseconds": 200}
cluster = ray.cluster_utils.Cluster()
cluster.add_node(
_system_config=config, object_store_memory=100 * 1024 * 1024)
@@ -360,9 +360,7 @@ def test_system_config_when_connecting(ray_start_cluster):
put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
del put_ref
# This would not raise an exception if object pinning was enabled.
with pytest.raises(ray.exceptions.ObjectLostError):
ray.get(obj_ref)
ray.get(obj_ref)
def test_get_multiple(ray_start_regular_shared):
-50
View File
@@ -1120,56 +1120,6 @@ def test_fill_object_store_exception(shutdown_only):
ray.put(np.zeros(10**8 + 2, dtype=np.uint8))
def test_fill_object_store_lru_fallback(shutdown_only):
config = {
"free_objects_batch_size": 1,
}
ray.init(
num_cpus=2,
object_store_memory=10**8,
_lru_evict=True,
_system_config=config)
@ray.remote
def expensive_task():
return np.zeros((10**8) // 2, dtype=np.uint8)
# Check that objects out of scope are cleaned up quickly.
ray.get(expensive_task.remote())
start = time.time()
for _ in range(3):
ray.get(expensive_task.remote())
end = time.time()
assert end - start < 3
obj_refs = []
for _ in range(3):
obj_ref = expensive_task.remote()
ray.get(obj_ref)
obj_refs.append(obj_ref)
@ray.remote
class LargeMemoryActor:
def some_expensive_task(self):
return np.zeros(10**8 // 2, dtype=np.uint8)
def test(self):
return 1
actor = LargeMemoryActor.remote()
for _ in range(3):
obj_ref = actor.some_expensive_task.remote()
ray.get(obj_ref)
obj_refs.append(obj_ref)
# Make sure actor does not die
ray.get(actor.test.remote())
for _ in range(3):
obj_ref = ray.put(np.zeros(10**8 // 2, dtype=np.uint8))
ray.get(obj_ref)
obj_refs.append(obj_ref)
@pytest.mark.parametrize(
"ray_start_cluster", [{
"num_nodes": 1,
+1 -3
View File
@@ -245,9 +245,7 @@ def test_pending_task_dependency_pinning(one_worker_100MiB):
def test_feature_flag(shutdown_only):
ray.init(
object_store_memory=100 * 1024 * 1024,
_system_config={"object_pinning_enabled": 0})
ray.init(object_store_memory=100 * 1024 * 1024)
@ray.remote
def f(array):
-9
View File
@@ -601,12 +601,6 @@ def init(
directory for the Ray process. Defaults to an OS-specific
conventional location, e.g., "/tmp/ray".
_java_worker_options: Overwrite the options to start Java workers.
_lru_evict (bool): If True, when an object store is full, it will evict
objects in LRU order to make more space and when under memory
pressure, ray.ObjectLostError may be thrown. If False, then
reference counting will be used to decide which objects are safe
to evict and when under memory pressure, ray.ObjectStoreFullError
may be thrown.
_metrics_export_port(int): Port number Ray exposes system metrics
through a Prometheus endpoint. It is currently under active
development, and the API is subject to change.
@@ -744,9 +738,6 @@ def init(
if _system_config is not None and len(_system_config) != 0:
raise ValueError("When connecting to an existing cluster, "
"_system_config must not be provided.")
if _lru_evict:
raise ValueError("When connecting to an existing cluster, "
"_lru_evict must not be provided.")
if _enable_object_reconstruction:
raise ValueError(
"When connecting to an existing cluster, "