mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 13:54:27 +08:00
[Object spilling] Update object directory and reload spilled objects automatically (#11021)
* Fix pytest...
* Release objects that have been spilled
* GCS object table interface refactor
* Add spilled URL to object location info
* refactor to include spilled URL in notifications
* improve tests
* Add spilled URL to object directory results
* Remove force restore call
* Merge spilled URL and location
* fix
* CI
* build
* osx
* Fix multitenancy issues
* Skip windows tests
This commit is contained in:
@@ -1497,13 +1497,6 @@ cdef class CoreWorker:
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker()
|
||||
.SpillObjects(object_ids))
|
||||
|
||||
def force_restore_spilled_objects(self, object_refs):
|
||||
cdef c_vector[CObjectID] object_ids
|
||||
object_ids = ObjectRefsToVector(object_refs)
|
||||
with nogil:
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker()
|
||||
.ForceRestoreSpilledObjects(object_ids))
|
||||
|
||||
cdef void async_set_result(shared_ptr[CRayObject] obj,
|
||||
CObjectID object_ref,
|
||||
void *future) with gil:
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from .dynamic_resources import set_resource
|
||||
from .object_spilling import force_spill_objects, force_restore_spilled_objects
|
||||
from .object_spilling import force_spill_objects
|
||||
__all__ = [
|
||||
"set_resource",
|
||||
"force_spill_objects",
|
||||
"force_restore_spilled_objects",
|
||||
]
|
||||
|
||||
@@ -16,20 +16,3 @@ def force_spill_objects(object_refs):
|
||||
f"Attempting to call `force_spill_objects` on the "
|
||||
f"value {object_ref}, which is not an ray.ObjectRef.")
|
||||
return core_worker.force_spill_objects(object_refs)
|
||||
|
||||
|
||||
def force_restore_spilled_objects(object_refs):
|
||||
"""Force restoring objects from external storage.
|
||||
|
||||
Args:
|
||||
object_refs: Object refs of the objects to be
|
||||
restored.
|
||||
"""
|
||||
core_worker = ray.worker.global_worker.core_worker
|
||||
# Make sure that the values are object refs.
|
||||
for object_ref in object_refs:
|
||||
if not isinstance(object_ref, ray.ObjectRef):
|
||||
raise TypeError(
|
||||
f"Attempting to call `force_restore_spilled_objects` on the "
|
||||
f"value {object_ref}, which is not an ray.ObjectRef.")
|
||||
return core_worker.force_restore_spilled_objects(object_refs)
|
||||
|
||||
@@ -200,8 +200,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
const double capacity,
|
||||
const CNodeID &client_Id)
|
||||
CRayStatus SpillObjects(const c_vector[CObjectID] &object_ids)
|
||||
CRayStatus ForceRestoreSpilledObjects(
|
||||
const c_vector[CObjectID] &object_ids)
|
||||
|
||||
cdef cppclass CCoreWorkerOptions "ray::CoreWorkerOptions":
|
||||
CWorkerType worker_type
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import json
|
||||
import random
|
||||
import platform
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
@@ -8,6 +10,8 @@ import psutil
|
||||
import ray
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_spill_objects_manually(shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
ray.init(
|
||||
@@ -25,7 +29,6 @@ def test_spill_objects_manually(shutdown_only):
|
||||
arr = np.random.rand(1024 * 1024) # 8 MB data
|
||||
replay_buffer = []
|
||||
pinned_objects = set()
|
||||
spilled_objects = set()
|
||||
|
||||
# Create objects of more than 200 MiB.
|
||||
for _ in range(25):
|
||||
@@ -38,7 +41,6 @@ def test_spill_objects_manually(shutdown_only):
|
||||
except ray.exceptions.ObjectStoreFullError:
|
||||
ref_to_spill = pinned_objects.pop()
|
||||
ray.experimental.force_spill_objects([ref_to_spill])
|
||||
spilled_objects.add(ref_to_spill)
|
||||
|
||||
def is_worker(cmdline):
|
||||
return cmdline and cmdline[0].startswith("ray::")
|
||||
@@ -54,17 +56,16 @@ def test_spill_objects_manually(shutdown_only):
|
||||
# restoring objects back.
|
||||
refs_to_spill = (pinned_objects.pop(), pinned_objects.pop())
|
||||
ray.experimental.force_spill_objects(refs_to_spill)
|
||||
spilled_objects.update(refs_to_spill)
|
||||
|
||||
# randomly sample objects
|
||||
for _ in range(100):
|
||||
ref = random.choice(replay_buffer)
|
||||
if ref in spilled_objects:
|
||||
ray.experimental.force_restore_spilled_objects([ref])
|
||||
sample = ray.get(ref)
|
||||
assert np.array_equal(sample, arr)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_spill_objects_manually_from_workers(shutdown_only):
|
||||
# Limit our object store to 100 MiB of memory.
|
||||
ray.init(
|
||||
@@ -82,15 +83,22 @@ def test_spill_objects_manually_from_workers(shutdown_only):
|
||||
|
||||
@ray.remote
|
||||
def _worker():
|
||||
arr = np.random.rand(100 * 1024)
|
||||
arr = np.random.rand(1024 * 1024) # 8 MB data
|
||||
ref = ray.put(arr)
|
||||
ray.experimental.force_spill_objects([ref])
|
||||
ray.experimental.force_restore_spilled_objects([ref])
|
||||
assert np.array_equal(ray.get(ref), arr)
|
||||
return ref
|
||||
|
||||
ray.get([_worker.remote() for _ in range(50)])
|
||||
# Create objects of more than 200 MiB.
|
||||
replay_buffer = [ray.get(_worker.remote()) for _ in range(25)]
|
||||
values = {ref: np.copy(ray.get(ref)) for ref in replay_buffer}
|
||||
# Randomly sample objects.
|
||||
for _ in range(100):
|
||||
ref = random.choice(replay_buffer)
|
||||
sample = ray.get(ref)
|
||||
assert np.array_equal(sample, values[ref])
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Not implemented yet.")
|
||||
def test_spill_objects_manually_with_workers(shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
ray.init(
|
||||
@@ -118,27 +126,29 @@ def test_spill_objects_manually_with_workers(shutdown_only):
|
||||
assert np.array_equal(restored, arr)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [{
|
||||
"num_cpus": 0,
|
||||
"object_store_memory": 75 * 1024 * 1024,
|
||||
"_object_spilling_config": {
|
||||
"object_spilling_config": {
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": "/tmp"
|
||||
}
|
||||
},
|
||||
"_system_config": json.dumps({
|
||||
"_system_config": {
|
||||
"object_store_full_max_retries": 0,
|
||||
"max_io_workers": 4,
|
||||
}),
|
||||
},
|
||||
}],
|
||||
indirect=True)
|
||||
def test_spill_remote_object(ray_start_cluster_head):
|
||||
cluster = ray_start_cluster_head
|
||||
cluster.add_node(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_object_spilling_config={
|
||||
object_spilling_config={
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": "/tmp"
|
||||
@@ -149,23 +159,33 @@ def test_spill_remote_object(ray_start_cluster_head):
|
||||
def put():
|
||||
return np.random.rand(5 * 1024 * 1024) # 40 MB data
|
||||
|
||||
# Create 2 objects. Only 1 should fit.
|
||||
@ray.remote
|
||||
def depends(arg):
|
||||
return
|
||||
|
||||
ref = put.remote()
|
||||
ray.get(ref)
|
||||
copy = np.copy(ray.get(ref))
|
||||
# Evict local copy.
|
||||
ray.put(np.random.rand(5 * 1024 * 1024)) # 40 MB data
|
||||
# Remote copy should not fit.
|
||||
with pytest.raises(ray.exceptions.RayTaskError):
|
||||
ray.get(put.remote())
|
||||
time.sleep(1)
|
||||
# Spill 1 object. The second should now fit.
|
||||
ray.experimental.force_spill_objects([ref])
|
||||
ray.get(put.remote())
|
||||
|
||||
# TODO(swang): Restoring from the object directory is not yet supported.
|
||||
# ray.experimental.force_restore_spilled_objects([ref])
|
||||
# sample = ray.get(ref)
|
||||
# assert np.array_equal(sample, copy)
|
||||
sample = ray.get(ref)
|
||||
assert np.array_equal(sample, copy)
|
||||
# Evict the spilled object.
|
||||
del sample
|
||||
ray.get(put.remote())
|
||||
ray.put(np.random.rand(5 * 1024 * 1024)) # 40 MB data
|
||||
|
||||
# Test passing the spilled object as an arg to another task.
|
||||
ray.get(depends.remote(ref))
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="have not been fully implemented")
|
||||
@pytest.mark.skip(reason="Not implemented yet.")
|
||||
def test_spill_objects_automatically(shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
ray.init(
|
||||
@@ -196,3 +216,7 @@ def test_spill_objects_automatically(shutdown_only):
|
||||
ref = random.choice(replay_buffer)
|
||||
sample = ray.get(ref, timeout=0)
|
||||
assert np.array_equal(sample, arr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(pytest.main(["-sv", __file__]))
|
||||
|
||||
Reference in New Issue
Block a user