[Object spilling] Update object directory and reload spilled objects automatically (#11021)

* Fix pytest...

* Release objects that have been spilled

* GCS object table interface refactor

* Add spilled URL to object location info

* Refactor to include spilled URL in notifications

* Improve tests

* Add spilled URL to object directory results

* Remove force restore call

* Merge spilled URL and location

* fix

* CI

* build

* osx

* Fix multitenancy issues

* Skip Windows tests
This commit is contained in:
Stephanie Wang
2020-10-02 15:52:42 -07:00
committed by GitHub
parent c17169dc11
commit ada58abcd9
43 changed files with 499 additions and 430 deletions
+45 -21
View File
@@ -1,5 +1,7 @@
import json
import random
import platform
import sys
import time
import numpy as np
@@ -8,6 +10,8 @@ import psutil
import ray
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_spill_objects_manually(shutdown_only):
# Limit our object store to 75 MiB of memory.
ray.init(
@@ -25,7 +29,6 @@ def test_spill_objects_manually(shutdown_only):
arr = np.random.rand(1024 * 1024) # 8 MB data
replay_buffer = []
pinned_objects = set()
spilled_objects = set()
# Create objects of more than 200 MiB.
for _ in range(25):
@@ -38,7 +41,6 @@ def test_spill_objects_manually(shutdown_only):
except ray.exceptions.ObjectStoreFullError:
ref_to_spill = pinned_objects.pop()
ray.experimental.force_spill_objects([ref_to_spill])
spilled_objects.add(ref_to_spill)
def is_worker(cmdline):
return cmdline and cmdline[0].startswith("ray::")
@@ -54,17 +56,16 @@ def test_spill_objects_manually(shutdown_only):
# restoring objects back.
refs_to_spill = (pinned_objects.pop(), pinned_objects.pop())
ray.experimental.force_spill_objects(refs_to_spill)
spilled_objects.update(refs_to_spill)
# randomly sample objects
for _ in range(100):
ref = random.choice(replay_buffer)
if ref in spilled_objects:
ray.experimental.force_restore_spilled_objects([ref])
sample = ray.get(ref)
assert np.array_equal(sample, arr)
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_spill_objects_manually_from_workers(shutdown_only):
# Limit our object store to 100 MiB of memory.
ray.init(
@@ -82,15 +83,22 @@ def test_spill_objects_manually_from_workers(shutdown_only):
@ray.remote
def _worker():
arr = np.random.rand(100 * 1024)
arr = np.random.rand(1024 * 1024) # 8 MB data
ref = ray.put(arr)
ray.experimental.force_spill_objects([ref])
ray.experimental.force_restore_spilled_objects([ref])
assert np.array_equal(ray.get(ref), arr)
return ref
ray.get([_worker.remote() for _ in range(50)])
# Create objects of more than 200 MiB.
replay_buffer = [ray.get(_worker.remote()) for _ in range(25)]
values = {ref: np.copy(ray.get(ref)) for ref in replay_buffer}
# Randomly sample objects.
for _ in range(100):
ref = random.choice(replay_buffer)
sample = ray.get(ref)
assert np.array_equal(sample, values[ref])
@pytest.mark.skip(reason="Not implemented yet.")
def test_spill_objects_manually_with_workers(shutdown_only):
# Limit our object store to 75 MiB of memory.
ray.init(
@@ -118,27 +126,29 @@ def test_spill_objects_manually_with_workers(shutdown_only):
assert np.array_equal(restored, arr)
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
@pytest.mark.parametrize(
"ray_start_cluster_head", [{
"num_cpus": 0,
"object_store_memory": 75 * 1024 * 1024,
"_object_spilling_config": {
"object_spilling_config": {
"type": "filesystem",
"params": {
"directory_path": "/tmp"
}
},
"_system_config": json.dumps({
"_system_config": {
"object_store_full_max_retries": 0,
"max_io_workers": 4,
}),
},
}],
indirect=True)
def test_spill_remote_object(ray_start_cluster_head):
cluster = ray_start_cluster_head
cluster.add_node(
object_store_memory=75 * 1024 * 1024,
_object_spilling_config={
object_spilling_config={
"type": "filesystem",
"params": {
"directory_path": "/tmp"
@@ -149,23 +159,33 @@ def test_spill_remote_object(ray_start_cluster_head):
def put():
return np.random.rand(5 * 1024 * 1024) # 40 MB data
# Create 2 objects. Only 1 should fit.
@ray.remote
def depends(arg):
return
ref = put.remote()
ray.get(ref)
copy = np.copy(ray.get(ref))
# Evict local copy.
ray.put(np.random.rand(5 * 1024 * 1024)) # 40 MB data
# Remote copy should not fit.
with pytest.raises(ray.exceptions.RayTaskError):
ray.get(put.remote())
time.sleep(1)
# Spill 1 object. The second should now fit.
ray.experimental.force_spill_objects([ref])
ray.get(put.remote())
# TODO(swang): Restoring from the object directory is not yet supported.
# ray.experimental.force_restore_spilled_objects([ref])
# sample = ray.get(ref)
# assert np.array_equal(sample, copy)
sample = ray.get(ref)
assert np.array_equal(sample, copy)
# Evict the spilled object.
del sample
ray.get(put.remote())
ray.put(np.random.rand(5 * 1024 * 1024)) # 40 MB data
# Test passing the spilled object as an arg to another task.
ray.get(depends.remote(ref))
@pytest.mark.skip(reason="have not been fully implemented")
@pytest.mark.skip(reason="Not implemented yet.")
def test_spill_objects_automatically(shutdown_only):
# Limit our object store to 75 MiB of memory.
ray.init(
@@ -196,3 +216,7 @@ def test_spill_objects_automatically(shutdown_only):
ref = random.choice(replay_buffer)
sample = ray.get(ref, timeout=0)
assert np.array_equal(sample, arr)
if __name__ == "__main__":
sys.exit(pytest.main(["-sv", __file__]))