mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 22:54:45 +08:00
[Object Spilling] Remove LRU eviction (#13977)
* done. * formatting. * done. * done.
This commit is contained in:
@@ -120,10 +120,6 @@ class Node:
|
||||
raise ValueError(
|
||||
"Internal config parameters can only be set on the head node.")
|
||||
|
||||
if ray_params._lru_evict:
|
||||
assert (connect_only or
|
||||
head), "LRU Evict can only be passed into the head node."
|
||||
|
||||
self._raylet_ip_address = raylet_ip_address
|
||||
|
||||
ray_params.update_if_absent(
|
||||
|
||||
+5
-14
@@ -102,7 +102,6 @@ class RayParams:
|
||||
_system_config (dict): Configuration for overriding RayConfig
|
||||
defaults. Used to set system configuration and for experimental Ray
|
||||
core feature flags.
|
||||
lru_evict (bool): Enable LRU eviction if space is needed.
|
||||
enable_object_reconstruction (bool): Enable plasma reconstruction on
|
||||
failure.
|
||||
start_initial_python_workers_for_first_job (bool): If true, start
|
||||
@@ -199,30 +198,22 @@ class RayParams:
|
||||
self.start_initial_python_workers_for_first_job = (
|
||||
start_initial_python_workers_for_first_job)
|
||||
self._system_config = _system_config or {}
|
||||
self._lru_evict = lru_evict
|
||||
self._enable_object_reconstruction = enable_object_reconstruction
|
||||
self._check_usage()
|
||||
|
||||
# Set the internal config options for LRU eviction.
|
||||
if lru_evict:
|
||||
# Turn off object pinning.
|
||||
if self._system_config is None:
|
||||
self._system_config = dict()
|
||||
if self._system_config.get("object_pinning_enabled", False):
|
||||
raise Exception(
|
||||
"Object pinning cannot be enabled if using LRU eviction.")
|
||||
self._system_config["object_pinning_enabled"] = False
|
||||
self._system_config["free_objects_period_milliseconds"] = 1000
|
||||
raise DeprecationWarning(
|
||||
"The lru_evict flag is deprecated as Ray natively "
|
||||
"supports object spilling. Please read "
|
||||
"https://docs.ray.io/en/master/memory-management.html#object-spilling " # noqa
|
||||
"for more details.")
|
||||
|
||||
# Set the internal config options for object reconstruction.
|
||||
if enable_object_reconstruction:
|
||||
# Turn off object pinning.
|
||||
if self._system_config is None:
|
||||
self._system_config = dict()
|
||||
if lru_evict:
|
||||
raise Exception(
|
||||
"Object reconstruction cannot be enabled if using LRU "
|
||||
"eviction.")
|
||||
print(self._system_config)
|
||||
self._system_config["lineage_pinning_enabled"] = True
|
||||
self._system_config["free_objects_period_milliseconds"] = -1
|
||||
|
||||
@@ -32,10 +32,9 @@ def ray_init_with_task_retry_delay():
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_regular", [{
|
||||
"object_store_memory": 150 * 1024 * 1024,
|
||||
"_lru_evict": True,
|
||||
}],
|
||||
indirect=True)
|
||||
def test_actor_eviction(ray_start_regular):
|
||||
def test_actor_spilled(ray_start_regular):
|
||||
object_store_memory = 150 * 1024 * 1024
|
||||
|
||||
@ray.remote
|
||||
@@ -58,19 +57,14 @@ def test_actor_eviction(ray_start_regular):
|
||||
ray.get(obj)
|
||||
|
||||
# Get each object again. At this point, the earlier objects should have
|
||||
# been evicted.
|
||||
num_evicted, num_success = 0, 0
|
||||
# been spilled.
|
||||
num_success = 0
|
||||
for obj in objects:
|
||||
try:
|
||||
val = ray.get(obj)
|
||||
assert isinstance(val, np.ndarray), val
|
||||
num_success += 1
|
||||
except ray.exceptions.ObjectLostError:
|
||||
num_evicted += 1
|
||||
# Some objects should have been evicted, and some should still be in the
|
||||
# object store.
|
||||
assert num_evicted > 0
|
||||
assert num_success > 0
|
||||
val = ray.get(obj)
|
||||
assert isinstance(val, np.ndarray), val
|
||||
num_success += 1
|
||||
# All of the objects should've been spilled, so all of them should succeed.
|
||||
assert num_success == len(objects)
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="Very flaky on Windows.")
|
||||
|
||||
@@ -344,10 +344,7 @@ def test_initialized_local_mode(shutdown_only_with_initialization_check):
|
||||
|
||||
|
||||
def test_wait_reconstruction(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=int(10**8),
|
||||
_system_config={"object_pinning_enabled": 0})
|
||||
ray.init(num_cpus=1, object_store_memory=int(10**8))
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
|
||||
@@ -342,7 +342,7 @@ def test_call_chain(ray_start_cluster):
|
||||
|
||||
@pytest.mark.skipif(client_test_enabled(), reason="message size")
|
||||
def test_system_config_when_connecting(ray_start_cluster):
|
||||
config = {"object_pinning_enabled": 0, "object_timeout_milliseconds": 200}
|
||||
config = {"object_timeout_milliseconds": 200}
|
||||
cluster = ray.cluster_utils.Cluster()
|
||||
cluster.add_node(
|
||||
_system_config=config, object_store_memory=100 * 1024 * 1024)
|
||||
@@ -360,9 +360,7 @@ def test_system_config_when_connecting(ray_start_cluster):
|
||||
put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
|
||||
del put_ref
|
||||
|
||||
# This would not raise an exception if object pinning was enabled.
|
||||
with pytest.raises(ray.exceptions.ObjectLostError):
|
||||
ray.get(obj_ref)
|
||||
ray.get(obj_ref)
|
||||
|
||||
|
||||
def test_get_multiple(ray_start_regular_shared):
|
||||
|
||||
@@ -1120,56 +1120,6 @@ def test_fill_object_store_exception(shutdown_only):
|
||||
ray.put(np.zeros(10**8 + 2, dtype=np.uint8))
|
||||
|
||||
|
||||
def test_fill_object_store_lru_fallback(shutdown_only):
|
||||
config = {
|
||||
"free_objects_batch_size": 1,
|
||||
}
|
||||
ray.init(
|
||||
num_cpus=2,
|
||||
object_store_memory=10**8,
|
||||
_lru_evict=True,
|
||||
_system_config=config)
|
||||
|
||||
@ray.remote
|
||||
def expensive_task():
|
||||
return np.zeros((10**8) // 2, dtype=np.uint8)
|
||||
|
||||
# Check that objects out of scope are cleaned up quickly.
|
||||
ray.get(expensive_task.remote())
|
||||
start = time.time()
|
||||
for _ in range(3):
|
||||
ray.get(expensive_task.remote())
|
||||
end = time.time()
|
||||
assert end - start < 3
|
||||
|
||||
obj_refs = []
|
||||
for _ in range(3):
|
||||
obj_ref = expensive_task.remote()
|
||||
ray.get(obj_ref)
|
||||
obj_refs.append(obj_ref)
|
||||
|
||||
@ray.remote
|
||||
class LargeMemoryActor:
|
||||
def some_expensive_task(self):
|
||||
return np.zeros(10**8 // 2, dtype=np.uint8)
|
||||
|
||||
def test(self):
|
||||
return 1
|
||||
|
||||
actor = LargeMemoryActor.remote()
|
||||
for _ in range(3):
|
||||
obj_ref = actor.some_expensive_task.remote()
|
||||
ray.get(obj_ref)
|
||||
obj_refs.append(obj_ref)
|
||||
# Make sure actor does not die
|
||||
ray.get(actor.test.remote())
|
||||
|
||||
for _ in range(3):
|
||||
obj_ref = ray.put(np.zeros(10**8 // 2, dtype=np.uint8))
|
||||
ray.get(obj_ref)
|
||||
obj_refs.append(obj_ref)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
"num_nodes": 1,
|
||||
|
||||
@@ -245,9 +245,7 @@ def test_pending_task_dependency_pinning(one_worker_100MiB):
|
||||
|
||||
|
||||
def test_feature_flag(shutdown_only):
|
||||
ray.init(
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
_system_config={"object_pinning_enabled": 0})
|
||||
ray.init(object_store_memory=100 * 1024 * 1024)
|
||||
|
||||
@ray.remote
|
||||
def f(array):
|
||||
|
||||
@@ -601,12 +601,6 @@ def init(
|
||||
directory for the Ray process. Defaults to an OS-specific
|
||||
conventional location, e.g., "/tmp/ray".
|
||||
_java_worker_options: Overwrite the options to start Java workers.
|
||||
_lru_evict (bool): If True, when an object store is full, it will evict
|
||||
objects in LRU order to make more space and when under memory
|
||||
pressure, ray.ObjectLostError may be thrown. If False, then
|
||||
reference counting will be used to decide which objects are safe
|
||||
to evict and when under memory pressure, ray.ObjectStoreFullError
|
||||
may be thrown.
|
||||
_metrics_export_port(int): Port number Ray exposes system metrics
|
||||
through a Prometheus endpoint. It is currently under active
|
||||
development, and the API is subject to change.
|
||||
@@ -744,9 +738,6 @@ def init(
|
||||
if _system_config is not None and len(_system_config) != 0:
|
||||
raise ValueError("When connecting to an existing cluster, "
|
||||
"_system_config must not be provided.")
|
||||
if _lru_evict:
|
||||
raise ValueError("When connecting to an existing cluster, "
|
||||
"_lru_evict must not be provided.")
|
||||
if _enable_object_reconstruction:
|
||||
raise ValueError(
|
||||
"When connecting to an existing cluster, "
|
||||
|
||||
Reference in New Issue
Block a user