[core] Move out-of-memory handling into the plasma store and support async object creation (#12186)

* Refactor to extract creation request queue

* Timer on OOM

* move timer out

* Move evict_if_full and on_store_full into plasma store

* Remove client-side code

* revert

* Distinguish between transient and permanent OOM delays

* update

* Move out create request queue, unit test

* unit test

* Fix max retries

* test

* Do not pin restored objects

* First pass to add polling requests, unit test passes

* Worker plasma client retries plasma requests

* cleanup

* Clean up after disconnected clients, check memory leaks

* Support immediate requests in request queue

* Option to try creating immediately

* lint

* Fix build, address comments

* doc

* fixes

* debug travis

* debug

* debug

* debug

* debug

* Revert "debug"

This reverts commit 6bf2f6ee5640e71630c4aecdb7ebf54911ea32db.

Revert "debug"

This reverts commit 73017099c9b06cdaae1217bf0e0f4d23ed68a9e5.

Revert "debug"

This reverts commit 5a155529e28cee9461a598b0cdf7b6a3cc194c93.

Revert "debug"

This reverts commit b50c2101afd45d4cf663daae857bfe1b40387703.

Revert "debug travis"

This reverts commit 012b8721dedf9bca46294ae75eee2815b160368b.

* Skip if new scheduler enabled

* error message

* merge
This commit is contained in:
Stephanie Wang
2020-12-02 13:25:54 -05:00
committed by GitHub
parent 786f839ff3
commit 443339ab19
32 changed files with 1010 additions and 278 deletions
+2 -1
View File
@@ -348,7 +348,8 @@ def test_system_config_when_connecting(ray_start_cluster):
obj_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
for _ in range(5):
ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
del put_ref
# This would not raise an exception if object pinning was enabled.
with pytest.raises(ray.exceptions.ObjectLostError):
+24
View File
@@ -7,6 +7,7 @@ import warnings
import ray
from ray.cluster_utils import Cluster
from ray.exceptions import GetTimeoutError
if (multiprocessing.cpu_count() < 40
or ray.utils.get_system_memory() < 50 * 10**9):
@@ -33,6 +34,29 @@ def ray_start_cluster_with_resource():
cluster.shutdown()
@pytest.mark.parametrize(
    "ray_start_cluster_head",
    [dict(num_cpus=0, object_store_memory=75 * 1024 * 1024)],
    indirect=True)
def test_object_transfer_during_oom(ray_start_cluster_head):
    """Check that fetching a task result blocks while the store is full.

    The head node is started with zero CPUs and a 75 MiB object store, and
    a second node with a 75 MiB object store is added. The driver puts one
    ~40 MiB array and keeps a reference to it, then requests a second
    ~40 MiB array produced by a remote task. The two arrays together exceed
    75 MiB, so the fetch is expected to time out until the driver's own
    reference is dropped.

    NOTE(review): presumably the task is scheduled on the added node
    because the head has no CPUs -- confirm against the default resources
    that ``add_node`` assigns.
    """
    # Second node, with the same object store capacity as the head.
    ray_start_cluster_head.add_node(object_store_memory=75 * 1024 * 1024)

    @ray.remote
    def put():
        # 5 * 1024 * 1024 float64 values, i.e. ~40 MiB of data.
        return np.random.rand(5 * 1024 * 1024)

    # Hold a reference to a ~40 MiB object created by the driver.
    pinned_ref = ray.put(np.random.rand(5 * 1024 * 1024))
    task_ref = put.remote()

    # While the driver's object is still referenced, the get must not
    # complete within the one-second timeout.
    with pytest.raises(GetTimeoutError):
        ray.get(task_ref, timeout=1)

    # After dropping the driver's reference, the same get is expected to
    # complete (no timeout is given here).
    del pinned_ref
    ray.get(task_ref)
# This test is here to make sure that when we broadcast an object to a bunch of
# machines, we don't have too many excess object transfers.
@pytest.mark.skip(reason="TODO(ekl)")
+8 -5
View File
@@ -12,7 +12,7 @@ import psutil
import ray
from ray.external_storage import (create_url_with_offset,
parse_url_with_offset)
from ray.test_utils import wait_for_condition
from ray.test_utils import new_scheduler_enabled, wait_for_condition
bucket_name = "object-spilling-test"
spill_local_path = "/tmp/spill"
@@ -338,16 +338,19 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only):
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
@pytest.mark.skip(
"Temporarily disabled until OutOfMemory retries can be moved "
"into the plasma store")
@pytest.mark.skipif(new_scheduler_enabled(), reason="hangs")
def test_spill_during_get(object_spilling_config, shutdown_only):
ray.init(
num_cpus=4,
object_store_memory=100 * 1024 * 1024,
_system_config={
"automatic_object_spilling_enabled": True,
"max_io_workers": 2,
"object_store_full_initial_delay_ms": 100,
# NOTE(swang): Use infinite retries because the OOM timer can still
# get accidentally triggered when objects are released too slowly
# (see github.com/ray-project/ray/issues/12040).
"object_store_full_max_retries": -1,
"max_io_workers": 1,
"object_spilling_config": object_spilling_config,
"min_spilling_size": 0,
},
+13 -4
View File
@@ -10,8 +10,8 @@ import pytest
import ray
import ray.cluster_utils
from ray.test_utils import SignalActor, put_object, wait_for_condition, \
new_scheduler_enabled
from ray.test_utils import (SignalActor, put_object, wait_for_condition,
new_scheduler_enabled)
logger = logging.getLogger(__name__)
@@ -167,7 +167,7 @@ def test_dependency_refcounts(ray_start_regular):
check_refcounts({})
@pytest.mark.skipif(new_scheduler_enabled(), reason="dynres notimpl")
@pytest.mark.skipif(new_scheduler_enabled(), reason="hangs")
def test_actor_creation_task(ray_start_regular):
@ray.remote
def large_object():
@@ -269,7 +269,16 @@ def test_feature_flag(shutdown_only):
# The ray.get below fails with only LRU eviction, as the object
# that was ray.put by the actor should have been evicted.
_fill_object_store_and_get(actor.get_large_object.remote(), succeed=False)
ref = actor.get_large_object.remote()
ray.get(ref)
# Keep refs in scope so that they don't get GCed immediately.
for _ in range(5):
put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
del put_ref
wait_for_condition(
lambda: not ray.worker.global_worker.core_worker.object_exists(ref))
def test_out_of_band_serialized_object_ref(one_worker_100MiB):