mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 18:29:08 +08:00
Move plasma retry logic into plasma store provider (#7328)
This commit is contained in:
+11
-30
@@ -93,10 +93,6 @@ from ray.exceptions import (
|
||||
)
|
||||
from ray.experimental.no_return import NoReturn
|
||||
from ray.utils import decode
|
||||
from ray.ray_constants import (
|
||||
DEFAULT_PUT_OBJECT_DELAY,
|
||||
DEFAULT_PUT_OBJECT_RETRIES,
|
||||
)
|
||||
|
||||
cimport cpython
|
||||
|
||||
@@ -673,32 +669,17 @@ cdef class CoreWorker:
|
||||
size_t data_size, ObjectID object_id,
|
||||
c_vector[CObjectID] contained_ids,
|
||||
CObjectID *c_object_id, shared_ptr[CBuffer] *data):
|
||||
delay = ray_constants.DEFAULT_PUT_OBJECT_DELAY
|
||||
for attempt in reversed(
|
||||
range(ray_constants.DEFAULT_PUT_OBJECT_RETRIES)):
|
||||
try:
|
||||
if object_id is None:
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Create(
|
||||
metadata, data_size, contained_ids,
|
||||
c_object_id, data))
|
||||
else:
|
||||
c_object_id[0] = object_id.native()
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Create(
|
||||
metadata, data_size,
|
||||
c_object_id[0], data))
|
||||
break
|
||||
except ObjectStoreFullError as e:
|
||||
if attempt:
|
||||
logger.warning("Waiting {} seconds for space to free up "
|
||||
"in the object store.".format(delay))
|
||||
gc.collect()
|
||||
time.sleep(delay)
|
||||
delay *= 2
|
||||
else:
|
||||
self.dump_object_store_memory_usage()
|
||||
raise e
|
||||
if object_id is None:
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Create(
|
||||
metadata, data_size, contained_ids,
|
||||
c_object_id, data))
|
||||
else:
|
||||
c_object_id[0] = object_id.native()
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Create(
|
||||
metadata, data_size,
|
||||
c_object_id[0], data))
|
||||
|
||||
# If data is nullptr, that means the ObjectID already existed,
|
||||
# which we ignore.
|
||||
|
||||
@@ -22,12 +22,6 @@ ID_SIZE = 20
|
||||
# The default maximum number of bytes to allocate to the object store unless
|
||||
# overridden by the user.
|
||||
DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = 20 * 10**9
|
||||
# The default number of retries to call `put` when the object store is full.
|
||||
DEFAULT_PUT_OBJECT_RETRIES = 5
|
||||
# The default seconds for delay between calls to retry `put` when
|
||||
# the object store is full. This delay is exponentially doubled up to
|
||||
# DEFAULT_PUT_OBJECT_RETRIES times.
|
||||
DEFAULT_PUT_OBJECT_DELAY = 1
|
||||
# The smallest cap on the memory used by the object store that we allow.
|
||||
# This must be greater than MEMORY_RESOURCE_UNIT_BYTES * 0.7
|
||||
OBJECT_STORE_MINIMUM_MEMORY_BYTES = 75 * 1024 * 1024
|
||||
|
||||
@@ -892,30 +892,33 @@ def test_connect_with_disconnected_node(shutdown_only):
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [{
|
||||
"num_cpus": 5,
|
||||
"object_store_memory": 10**8
|
||||
"object_store_memory": 10**8,
|
||||
"_internal_config": json.dumps({
|
||||
"object_store_full_max_retries": 0
|
||||
})
|
||||
}],
|
||||
indirect=True)
|
||||
@pytest.mark.parametrize("num_actors", [1, 2, 5])
|
||||
def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head, num_actors):
|
||||
def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head):
|
||||
@ray.remote
|
||||
class LargeMemoryActor:
|
||||
def some_expensive_task(self):
|
||||
return np.zeros(10**8 // 2, dtype=np.uint8)
|
||||
|
||||
actors = [LargeMemoryActor.remote() for _ in range(num_actors)]
|
||||
actors = [LargeMemoryActor.remote() for _ in range(5)]
|
||||
for _ in range(10):
|
||||
pending = [a.some_expensive_task.remote() for a in actors]
|
||||
while pending:
|
||||
[done], pending = ray.wait(pending, num_returns=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [{
|
||||
"num_cpus": 2,
|
||||
"object_store_memory": 10**8
|
||||
}],
|
||||
indirect=True)
|
||||
def test_fill_object_store_exception(ray_start_cluster_head):
|
||||
def test_fill_object_store_exception(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=2,
|
||||
object_store_memory=10**8,
|
||||
_internal_config=json.dumps({
|
||||
"object_store_full_max_retries": 0
|
||||
}))
|
||||
|
||||
@ray.remote
|
||||
def expensive_task():
|
||||
return np.zeros((10**8) // 10, dtype=np.uint8)
|
||||
|
||||
@@ -25,6 +25,7 @@ logger = logging.getLogger(__name__)
|
||||
def one_worker_100MiB(request):
|
||||
config = json.dumps({
|
||||
"distributed_ref_counting_enabled": 1,
|
||||
"object_store_full_max_retries": 1,
|
||||
})
|
||||
yield ray.init(
|
||||
num_cpus=1,
|
||||
|
||||
Reference in New Issue
Block a user