[Core] put small objects in memory store (#8972)

* remove the put in memory store

* put small objects directly in memory store

* cast data type

* fix another place that uses Put to spill to plasma store

* fix multiple tests related to memory limits

* partially fix test_metrics

* remove non-functioning code

* fix core_worker_test

* refactor put-to-plasma code

* add a flag for the new feature

* add flag to more places

* do a warmup round for the plasma store

* lint

* lint again

* fix warmup store

* Update _raylet.pyx

Co-authored-by: Eric Liang <ekhliang@gmail.com>
This commit is contained in:
Zhuohan Li
2020-07-09 15:39:40 -07:00
committed by GitHub
parent 34b85659d4
commit 8a76f4cbb5
18 changed files with 132 additions and 51 deletions
+9 -2
View File
@@ -774,9 +774,14 @@ cdef class CoreWorker:
CObjectID c_object_id
shared_ptr[CBuffer] data
shared_ptr[CBuffer] metadata
int64_t put_threshold
c_bool put_small_object_in_memory_store
c_vector[CObjectID] c_object_id_vector
metadata = string_to_buffer(serialized_object.metadata)
put_threshold = RayConfig.instance().max_direct_call_object_size()
put_small_object_in_memory_store = (
RayConfig.instance().put_small_object_in_memory_store())
total_bytes = serialized_object.total_bytes
object_already_exists = self._create_put_buffer(
metadata, total_bytes, object_id,
@@ -787,7 +792,8 @@ cdef class CoreWorker:
if total_bytes > 0:
(<SerializedObject>serialized_object).write_to(
Buffer.make(data))
if self.is_local_mode:
if self.is_local_mode or (put_small_object_in_memory_store
and <int64_t>total_bytes < put_threshold):
c_object_id_vector.push_back(c_object_id)
check_status(CCoreWorkerProcess.GetCoreWorker().Put(
CRayObject(data, metadata, c_object_id_vector),
@@ -1103,7 +1109,8 @@ cdef class CoreWorker:
cdef:
CObjectID c_object_id = object_id.native()
CAddress c_owner_address = CAddress()
CCoreWorkerProcess.GetCoreWorker().PromoteToPlasmaAndGetOwnershipInfo(
CCoreWorkerProcess.GetCoreWorker().PromoteObjectToPlasma(c_object_id)
CCoreWorkerProcess.GetCoreWorker().GetOwnershipInfo(
c_object_id, &c_owner_address)
return (object_id,
c_owner_address.SerializeAsString())
+4 -2
View File
@@ -127,11 +127,13 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
const CActorHandle* GetNamedActorHandle(const c_string &name)
void AddLocalReference(const CObjectID &object_id)
void RemoveLocalReference(const CObjectID &object_id)
void PutObjectIntoPlasma(const CRayObject &object,
const CObjectID &object_id)
const CAddress &GetRpcAddress() const
CAddress GetOwnerAddress(const CObjectID &object_id) const
void PromoteObjectToPlasma(const CObjectID &object_id)
void PromoteToPlasmaAndGetOwnershipInfo(const CObjectID &object_id,
CAddress *owner_address)
void GetOwnershipInfo(const CObjectID &object_id,
CAddress *owner_address)
void RegisterOwnershipInfoAndResolveFuture(
const CObjectID &object_id,
const CObjectID &outer_object_id,
+2
View File
@@ -88,3 +88,5 @@ cdef extern from "ray/common/ray_config.h" nogil:
int64_t max_direct_call_object_size() const
c_bool gcs_actor_service_enabled() const
c_bool put_small_object_in_memory_store() const
+4
View File
@@ -157,3 +157,7 @@ cdef class Config:
@staticmethod
def maximum_gcs_deletion_batch_size():
return RayConfig.instance().maximum_gcs_deletion_batch_size()
@staticmethod
def put_small_object_in_memory_store():
# Expose the RayConfig flag of the same name to Python callers; the value
# is read from the RayConfig singleton on every call.
return RayConfig.instance().put_small_object_in_memory_store()
+6 -5
View File
@@ -484,15 +484,16 @@ def test_shutdown_disconnect_global_state():
@pytest.mark.parametrize(
"ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True)
def test_put_pins_object(ray_start_object_store_memory):
x_id = ray.put("HI")
obj = np.ones(200 * 1024, dtype=np.uint8)
x_id = ray.put(obj)
x_binary = x_id.binary()
assert ray.get(ray.ObjectID(x_binary)) == "HI"
assert (ray.get(ray.ObjectID(x_binary)) == obj).all()
# x cannot be evicted since x_id pins it
for _ in range(10):
ray.put(np.zeros(10 * 1024 * 1024))
assert ray.get(x_id) == "HI"
assert ray.get(ray.ObjectID(x_binary)) == "HI"
assert (ray.get(x_id) == obj).all()
assert (ray.get(ray.ObjectID(x_binary)) == obj).all()
# now it can be evicted since x_id pins it but x_binary does not
del x_id
@@ -502,7 +503,7 @@ def test_put_pins_object(ray_start_object_store_memory):
ray.ObjectID(x_binary))
# weakref put
y_id = ray.put("HI", weakref=True)
y_id = ray.put(obj, weakref=True)
for _ in range(10):
ray.put(np.zeros(10 * 1024 * 1024))
with pytest.raises(ray.exceptions.UnreconstructableError):
+9 -4
View File
@@ -2,6 +2,7 @@ import os
import signal
import sys
import time
import numpy as np
import pytest
@@ -54,7 +55,8 @@ def test_dying_worker_get(ray_start_2_cpus):
assert len(ready_ids) == 0
# Seal the object so the store attempts to notify the worker that the
# get has been fulfilled.
ray.worker.global_worker.put_object(1, x_id)
obj = np.ones(200 * 1024, dtype=np.uint8)
ray.worker.global_worker.put_object(obj, x_id)
time.sleep(0.1)
# Make sure that nothing has died.
@@ -97,7 +99,8 @@ ray.get(ray.ObjectID(ray.utils.hex_to_binary("{}")))
assert len(ready_ids) == 0
# Seal the object so the store attempts to notify the worker that the
# get has been fulfilled.
ray.worker.global_worker.put_object(1, x_id)
obj = np.ones(200 * 1024, dtype=np.uint8)
ray.worker.global_worker.put_object(obj, x_id)
time.sleep(0.1)
# Make sure that nothing has died.
@@ -137,7 +140,8 @@ def test_dying_worker_wait(ray_start_2_cpus):
time.sleep(0.1)
# Create the object.
ray.worker.global_worker.put_object(1, x_id)
obj = np.ones(200 * 1024, dtype=np.uint8)
ray.worker.global_worker.put_object(obj, x_id)
time.sleep(0.1)
# Make sure that nothing has died.
@@ -180,7 +184,8 @@ ray.wait([ray.ObjectID(ray.utils.hex_to_binary("{}"))])
assert len(ready_ids) == 0
# Seal the object so the store attempts to notify the worker that the
# wait can return.
ray.worker.global_worker.put_object(1, x_id)
obj = np.ones(200 * 1024, dtype=np.uint8)
ray.worker.global_worker.put_object(obj, x_id)
time.sleep(0.1)
# Make sure that nothing has died.
+1 -1
View File
@@ -819,7 +819,7 @@ def test_raylet_crash_when_get(ray_start_regular):
time.sleep(2)
ray.worker._global_node.kill_raylet()
object_id = ray.put(None)
object_id = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
ray.internal.free(object_id)
while ray.worker.global_worker.core_worker.object_exists(object_id):
time.sleep(1)
+2 -1
View File
@@ -62,7 +62,8 @@ class TestMemoryLimits(unittest.TestCase):
num_cpus=1,
object_store_memory=300 * MB,
driver_object_store_memory=driver_quota)
z = ray.put("hi", weakref=True)
obj = np.ones(200 * 1024, dtype=np.uint8)
z = ray.put(obj, weakref=True)
a = LightActor._remote(object_store_memory=a_quota)
b = GreedyActor._remote(object_store_memory=b_quota)
for _ in range(5):
+4 -4
View File
@@ -4,6 +4,7 @@ import grpc
import pytest
import requests
import time
import numpy as np
import ray
from ray.core.generated import node_manager_pb2
@@ -180,7 +181,7 @@ def test_raylet_info_endpoint(shutdown_only):
self.local_storage = [f.remote() for _ in range(10)]
def remote_store(self):
self.remote_storage = ray.put("test")
self.remote_storage = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
def getpid(self):
return os.getpid()
@@ -443,9 +444,8 @@ def test_memory_dashboard(shutdown_only):
return True
def test_object_pineed_in_memory():
import numpy as np
a = ray.put(np.zeros(1))
a = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
b = ray.get(a) # Noqa F841
del a
@@ -469,7 +469,7 @@ def test_memory_dashboard(shutdown_only):
def f(arg):
time.sleep(1)
a = ray.put(None) # Noqa F841
a = ray.put(np.zeros(200 * 1024, dtype=np.uint8)) # Noqa F841
b = f.remote(a) # Noqa F841
wait_for_condition(memory_table_ready)
+3 -1
View File
@@ -174,8 +174,10 @@ def test_cleanup_on_driver_exit(call_ray_start):
driver_script = """
import time
import ray
import numpy as np
ray.init(address="{}")
object_ids = [ray.put(i) for i in range(1000)]
object_ids = [ray.put(np.zeros(200 * 1024, dtype=np.uint8))
for i in range(1000)]
start_time = time.time()
while time.time() - start_time < 30:
if len(ray.objects()) == 1000:
-8
View File
@@ -1286,14 +1286,6 @@ def connect(node,
worker.core_worker.set_object_store_client_options(
"ray_driver_{}".format(os.getpid()), driver_object_store_memory)
# Put something in the plasma store so that subsequent plasma store
# accesses will be faster. Currently the first access is always slow, and
# we don't want the user to experience this.
if mode != LOCAL_MODE:
temporary_object_id = ray.ObjectID.from_random()
worker.put_object(1, object_id=temporary_object_id)
ray.internal.free([temporary_object_id])
# Start the import thread
worker.import_thread = import_thread.ImportThread(worker, mode,
worker.threads_stopped)