[core] Option to fallback to LRU on OutOfMemory (#7410)

* Add a test for LRU fallback

* Update error message

* Upgrade arrow to master

* Integrate with arrow

* Revert "Bazel mirrors (#7385)"

This reverts commit 44aded5272.

* Don't LRU evict

* Revert "Revert "Bazel mirrors (#7385)""

This reverts commit b6359fea78d1bd3925452ca88ac71e0c9e5c7dd3.

* Add lru_evict flag

* fix internal config

* Fix

* upgrade arrow

* debug

* Set free period in config for lru_evict, override max retries to fix
test

* Fix test?

* fix test

* Revert "debug"

This reverts commit 98f01c63a267f38218f5047b1866e4c1c8280017.

* fix exception str

* Fix ref count test

* Shorten travis test?
This commit is contained in:
Stephanie Wang
2020-03-14 11:28:43 -07:00
committed by GitHub
parent 52cf77f5a9
commit 53549314c5
10 changed files with 123 additions and 36 deletions
+26 -6
View File
@@ -550,7 +550,8 @@ def init(address=None,
temp_dir=None,
load_code_from_local=False,
use_pickle=True,
_internal_config=None):
_internal_config=None,
lru_evict=False):
"""Connect to an existing Ray cluster or start one and connect to it.
This method handles two cases. Either a Ray cluster already exists and we
@@ -646,6 +647,12 @@ def init(address=None,
use_pickle: Deprecated.
_internal_config (str): JSON configuration for overriding
RayConfig defaults. For testing purposes ONLY.
lru_evict (bool): If True, when an object store is full, it will evict
objects in LRU order to make more space and when under memory
pressure, ray.UnreconstructableError may be thrown. If False, then
reference counting will be used to decide which objects are safe to
evict and when under memory pressure, ray.ObjectStoreFullError may
be thrown.
Returns:
Address information about the started processes.
@@ -695,6 +702,18 @@ def init(address=None,
if node_ip_address is not None:
node_ip_address = services.address_to_ip(node_ip_address)
_internal_config = (json.loads(_internal_config)
if _internal_config else {})
# Set the internal config options for LRU eviction.
if lru_evict:
# Turn off object pinning.
if _internal_config.get("object_pinning_enabled", False):
raise Exception(
"Object pinning cannot be enabled if using LRU eviction.")
_internal_config["object_pinning_enabled"] = False
_internal_config["object_store_full_max_retries"] = -1
_internal_config["free_objects_period_milliseconds"] = 1000
global _global_node
if driver_mode == LOCAL_MODE:
# If starting Ray in LOCAL_MODE, don't start any other processes.
@@ -779,8 +798,9 @@ def init(address=None,
raise ValueError("When connecting to an existing cluster, "
"raylet_socket_name must not be provided.")
if _internal_config is not None:
raise ValueError("When connecting to an existing cluster, "
"_internal_config must not be provided.")
logger.warning(
"When connecting to an existing cluster, "
"_internal_config must match the cluster's _internal_config.")
# In this case, we only need to connect the node.
ray_params = ray.parameter.RayParams(
@@ -789,7 +809,8 @@ def init(address=None,
redis_password=redis_password,
object_id_seed=object_id_seed,
temp_dir=temp_dir,
load_code_from_local=load_code_from_local)
load_code_from_local=load_code_from_local,
_internal_config=_internal_config)
_global_node = ray.node.Node(
ray_params,
head=False,
@@ -804,8 +825,7 @@ def init(address=None,
worker=global_worker,
driver_object_store_memory=driver_object_store_memory,
job_id=job_id,
internal_config=json.loads(_internal_config)
if _internal_config else {})
internal_config=_internal_config)
for hook in _post_init_hooks:
hook()