Move plasma retry logic into plasma store provider (#7328)

This commit is contained in:
Edward Oakes
2020-02-26 16:57:02 -08:00
committed by GitHub
parent aec03656d5
commit 2ad9bc5684
6 changed files with 67 additions and 63 deletions
+11 -30
View File
@@ -93,10 +93,6 @@ from ray.exceptions import (
)
from ray.experimental.no_return import NoReturn
from ray.utils import decode
from ray.ray_constants import (
DEFAULT_PUT_OBJECT_DELAY,
DEFAULT_PUT_OBJECT_RETRIES,
)
cimport cpython
@@ -673,32 +669,17 @@ cdef class CoreWorker:
size_t data_size, ObjectID object_id,
c_vector[CObjectID] contained_ids,
CObjectID *c_object_id, shared_ptr[CBuffer] *data):
delay = ray_constants.DEFAULT_PUT_OBJECT_DELAY
for attempt in reversed(
range(ray_constants.DEFAULT_PUT_OBJECT_RETRIES)):
try:
if object_id is None:
with nogil:
check_status(self.core_worker.get().Create(
metadata, data_size, contained_ids,
c_object_id, data))
else:
c_object_id[0] = object_id.native()
with nogil:
check_status(self.core_worker.get().Create(
metadata, data_size,
c_object_id[0], data))
break
except ObjectStoreFullError as e:
if attempt:
logger.warning("Waiting {} seconds for space to free up "
"in the object store.".format(delay))
gc.collect()
time.sleep(delay)
delay *= 2
else:
self.dump_object_store_memory_usage()
raise e
if object_id is None:
with nogil:
check_status(self.core_worker.get().Create(
metadata, data_size, contained_ids,
c_object_id, data))
else:
c_object_id[0] = object_id.native()
with nogil:
check_status(self.core_worker.get().Create(
metadata, data_size,
c_object_id[0], data))
# If data is nullptr, that means the ObjectID already existed,
# which we ignore.
-6
View File
@@ -22,12 +22,6 @@ ID_SIZE = 20
# The default maximum number of bytes to allocate to the object store unless
# overridden by the user.
DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = 20 * 10**9
# The default number of retries to call `put` when the object store is full.
DEFAULT_PUT_OBJECT_RETRIES = 5
# The default seconds for delay between calls to retry `put` when
# the object store is full. This delay is exponentially doubled up to
# DEFAULT_PUT_OBJECT_RETRIES times.
DEFAULT_PUT_OBJECT_DELAY = 1
# The smallest cap on the memory used by the object store that we allow.
# This must be greater than MEMORY_RESOURCE_UNIT_BYTES * 0.7
OBJECT_STORE_MINIMUM_MEMORY_BYTES = 75 * 1024 * 1024
+14 -11
View File
@@ -892,30 +892,33 @@ def test_connect_with_disconnected_node(shutdown_only):
@pytest.mark.parametrize(
"ray_start_cluster_head", [{
"num_cpus": 5,
"object_store_memory": 10**8
"object_store_memory": 10**8,
"_internal_config": json.dumps({
"object_store_full_max_retries": 0
})
}],
indirect=True)
@pytest.mark.parametrize("num_actors", [1, 2, 5])
def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head, num_actors):
def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head):
@ray.remote
class LargeMemoryActor:
def some_expensive_task(self):
return np.zeros(10**8 // 2, dtype=np.uint8)
actors = [LargeMemoryActor.remote() for _ in range(num_actors)]
actors = [LargeMemoryActor.remote() for _ in range(5)]
for _ in range(10):
pending = [a.some_expensive_task.remote() for a in actors]
while pending:
[done], pending = ray.wait(pending, num_returns=1)
@pytest.mark.parametrize(
"ray_start_cluster_head", [{
"num_cpus": 2,
"object_store_memory": 10**8
}],
indirect=True)
def test_fill_object_store_exception(ray_start_cluster_head):
def test_fill_object_store_exception(shutdown_only):
ray.init(
num_cpus=2,
object_store_memory=10**8,
_internal_config=json.dumps({
"object_store_full_max_retries": 0
}))
@ray.remote
def expensive_task():
return np.zeros((10**8) // 10, dtype=np.uint8)
@@ -25,6 +25,7 @@ logger = logging.getLogger(__name__)
def one_worker_100MiB(request):
config = json.dumps({
"distributed_ref_counting_enabled": 1,
"object_store_full_max_retries": 1,
})
yield ray.init(
num_cpus=1,