mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 01:57:09 +08:00
[core] Option to fallback to LRU on OutOfMemory (#7410)
* Add a test for LRU fallback
* Update error message
* Upgrade arrow to master
* Integrate with arrow
* Revert "Bazel mirrors (#7385)"
This reverts commit 44aded5272.
* Don't LRU evict
* Revert "Revert "Bazel mirrors (#7385)""
This reverts commit b6359fea78d1bd3925452ca88ac71e0c9e5c7dd3.
* Add lru_evict flag
* fix internal config
* Fix
* upgrade arrow
* debug
* Set free period in config for lru_evict, override max retries to fix
test
* Fix test?
* fix test
* Revert "debug"
This reverts commit 98f01c63a267f38218f5047b1866e4c1c8280017.
* fix exception str
* Fix ref count test
* Shorten travis test?
This commit is contained in:
+26
-6
@@ -550,7 +550,8 @@ def init(address=None,
|
||||
temp_dir=None,
|
||||
load_code_from_local=False,
|
||||
use_pickle=True,
|
||||
_internal_config=None):
|
||||
_internal_config=None,
|
||||
lru_evict=False):
|
||||
"""Connect to an existing Ray cluster or start one and connect to it.
|
||||
|
||||
This method handles two cases. Either a Ray cluster already exists and we
|
||||
@@ -646,6 +647,12 @@ def init(address=None,
|
||||
use_pickle: Deprecated.
|
||||
_internal_config (str): JSON configuration for overriding
|
||||
RayConfig defaults. For testing purposes ONLY.
|
||||
lru_evict (bool): If True, when an object store is full, it will evict
|
||||
objects in LRU order to make more space and when under memory
|
||||
pressure, ray.UnreconstructableError may be thrown. If False, then
|
||||
reference counting will be used to decide which objects are safe to
|
||||
evict and when under memory pressure, ray.ObjectStoreFullError may
|
||||
be thrown.
|
||||
|
||||
Returns:
|
||||
Address information about the started processes.
|
||||
@@ -695,6 +702,18 @@ def init(address=None,
|
||||
if node_ip_address is not None:
|
||||
node_ip_address = services.address_to_ip(node_ip_address)
|
||||
|
||||
_internal_config = (json.loads(_internal_config)
|
||||
if _internal_config else {})
|
||||
# Set the internal config options for LRU eviction.
|
||||
if lru_evict:
|
||||
# Turn off object pinning.
|
||||
if _internal_config.get("object_pinning_enabled", False):
|
||||
raise Exception(
|
||||
"Object pinning cannot be enabled if using LRU eviction.")
|
||||
_internal_config["object_pinning_enabled"] = False
|
||||
_internal_config["object_store_full_max_retries"] = -1
|
||||
_internal_config["free_objects_period_milliseconds"] = 1000
|
||||
|
||||
global _global_node
|
||||
if driver_mode == LOCAL_MODE:
|
||||
# If starting Ray in LOCAL_MODE, don't start any other processes.
|
||||
@@ -779,8 +798,9 @@ def init(address=None,
|
||||
raise ValueError("When connecting to an existing cluster, "
|
||||
"raylet_socket_name must not be provided.")
|
||||
if _internal_config is not None:
|
||||
raise ValueError("When connecting to an existing cluster, "
|
||||
"_internal_config must not be provided.")
|
||||
logger.warning(
|
||||
"When connecting to an existing cluster, "
|
||||
"_internal_config must match the cluster's _internal_config.")
|
||||
|
||||
# In this case, we only need to connect the node.
|
||||
ray_params = ray.parameter.RayParams(
|
||||
@@ -789,7 +809,8 @@ def init(address=None,
|
||||
redis_password=redis_password,
|
||||
object_id_seed=object_id_seed,
|
||||
temp_dir=temp_dir,
|
||||
load_code_from_local=load_code_from_local)
|
||||
load_code_from_local=load_code_from_local,
|
||||
_internal_config=_internal_config)
|
||||
_global_node = ray.node.Node(
|
||||
ray_params,
|
||||
head=False,
|
||||
@@ -804,8 +825,7 @@ def init(address=None,
|
||||
worker=global_worker,
|
||||
driver_object_store_memory=driver_object_store_memory,
|
||||
job_id=job_id,
|
||||
internal_config=json.loads(_internal_config)
|
||||
if _internal_config else {})
|
||||
internal_config=_internal_config)
|
||||
|
||||
for hook in _post_init_hooks:
|
||||
hook()
|
||||
|
||||
Reference in New Issue
Block a user