mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 21:23:10 +08:00
[Core] put small objects in memory store (#8972)
* remove the put in memory store
* put small objects directly in memory store
* cast data type
* fix another place that uses Put to spill to plasma store
* fix multiple tests related to memory limits
* partially fix test_metrics
* remove not functioning codes
* fix core_worker_test
* refactor put to plasma codes
* add a flag for the new feature
* add flag to more places
* do a warmup round for the plasma store
* lint
* lint again
* fix warmup store
* Update _raylet.pyx

Co-authored-by: Eric Liang <ekhliang@gmail.com>
This commit is contained in:
@@ -774,9 +774,14 @@ cdef class CoreWorker:
|
||||
CObjectID c_object_id
|
||||
shared_ptr[CBuffer] data
|
||||
shared_ptr[CBuffer] metadata
|
||||
int64_t put_threshold
|
||||
c_bool put_small_object_in_memory_store
|
||||
c_vector[CObjectID] c_object_id_vector
|
||||
|
||||
metadata = string_to_buffer(serialized_object.metadata)
|
||||
put_threshold = RayConfig.instance().max_direct_call_object_size()
|
||||
put_small_object_in_memory_store = (
|
||||
RayConfig.instance().put_small_object_in_memory_store())
|
||||
total_bytes = serialized_object.total_bytes
|
||||
object_already_exists = self._create_put_buffer(
|
||||
metadata, total_bytes, object_id,
|
||||
@@ -787,7 +792,8 @@ cdef class CoreWorker:
|
||||
if total_bytes > 0:
|
||||
(<SerializedObject>serialized_object).write_to(
|
||||
Buffer.make(data))
|
||||
if self.is_local_mode:
|
||||
if self.is_local_mode or (put_small_object_in_memory_store
|
||||
and <int64_t>total_bytes < put_threshold):
|
||||
c_object_id_vector.push_back(c_object_id)
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().Put(
|
||||
CRayObject(data, metadata, c_object_id_vector),
|
||||
@@ -1103,7 +1109,8 @@ cdef class CoreWorker:
|
||||
cdef:
|
||||
CObjectID c_object_id = object_id.native()
|
||||
CAddress c_owner_address = CAddress()
|
||||
CCoreWorkerProcess.GetCoreWorker().PromoteToPlasmaAndGetOwnershipInfo(
|
||||
CCoreWorkerProcess.GetCoreWorker().PromoteObjectToPlasma(c_object_id)
|
||||
CCoreWorkerProcess.GetCoreWorker().GetOwnershipInfo(
|
||||
c_object_id, &c_owner_address)
|
||||
return (object_id,
|
||||
c_owner_address.SerializeAsString())
|
||||
|
||||
@@ -127,11 +127,13 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
const CActorHandle* GetNamedActorHandle(const c_string &name)
|
||||
void AddLocalReference(const CObjectID &object_id)
|
||||
void RemoveLocalReference(const CObjectID &object_id)
|
||||
void PutObjectIntoPlasma(const CRayObject &object,
|
||||
const CObjectID &object_id)
|
||||
const CAddress &GetRpcAddress() const
|
||||
CAddress GetOwnerAddress(const CObjectID &object_id) const
|
||||
void PromoteObjectToPlasma(const CObjectID &object_id)
|
||||
void PromoteToPlasmaAndGetOwnershipInfo(const CObjectID &object_id,
|
||||
CAddress *owner_address)
|
||||
void GetOwnershipInfo(const CObjectID &object_id,
|
||||
CAddress *owner_address)
|
||||
void RegisterOwnershipInfoAndResolveFuture(
|
||||
const CObjectID &object_id,
|
||||
const CObjectID &outer_object_id,
|
||||
|
||||
@@ -88,3 +88,5 @@ cdef extern from "ray/common/ray_config.h" nogil:
|
||||
int64_t max_direct_call_object_size() const
|
||||
|
||||
c_bool gcs_actor_service_enabled() const
|
||||
|
||||
c_bool put_small_object_in_memory_store() const
|
||||
|
||||
@@ -157,3 +157,7 @@ cdef class Config:
|
||||
@staticmethod
|
||||
def maximum_gcs_deletion_batch_size():
|
||||
return RayConfig.instance().maximum_gcs_deletion_batch_size()
|
||||
|
||||
@staticmethod
|
||||
def put_small_object_in_memory_store():
|
||||
return RayConfig.instance().put_small_object_in_memory_store()
|
||||
|
||||
@@ -484,15 +484,16 @@ def test_shutdown_disconnect_global_state():
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True)
|
||||
def test_put_pins_object(ray_start_object_store_memory):
|
||||
x_id = ray.put("HI")
|
||||
obj = np.ones(200 * 1024, dtype=np.uint8)
|
||||
x_id = ray.put(obj)
|
||||
x_binary = x_id.binary()
|
||||
assert ray.get(ray.ObjectID(x_binary)) == "HI"
|
||||
assert (ray.get(ray.ObjectID(x_binary)) == obj).all()
|
||||
|
||||
# x cannot be evicted since x_id pins it
|
||||
for _ in range(10):
|
||||
ray.put(np.zeros(10 * 1024 * 1024))
|
||||
assert ray.get(x_id) == "HI"
|
||||
assert ray.get(ray.ObjectID(x_binary)) == "HI"
|
||||
assert (ray.get(x_id) == obj).all()
|
||||
assert (ray.get(ray.ObjectID(x_binary)) == obj).all()
|
||||
|
||||
# now it can be evicted since x_id pins it but x_binary does not
|
||||
del x_id
|
||||
@@ -502,7 +503,7 @@ def test_put_pins_object(ray_start_object_store_memory):
|
||||
ray.ObjectID(x_binary))
|
||||
|
||||
# weakref put
|
||||
y_id = ray.put("HI", weakref=True)
|
||||
y_id = ray.put(obj, weakref=True)
|
||||
for _ in range(10):
|
||||
ray.put(np.zeros(10 * 1024 * 1024))
|
||||
with pytest.raises(ray.exceptions.UnreconstructableError):
|
||||
|
||||
@@ -2,6 +2,7 @@ import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
import pytest
|
||||
|
||||
@@ -54,7 +55,8 @@ def test_dying_worker_get(ray_start_2_cpus):
|
||||
assert len(ready_ids) == 0
|
||||
# Seal the object so the store attempts to notify the worker that the
|
||||
# get has been fulfilled.
|
||||
ray.worker.global_worker.put_object(1, x_id)
|
||||
obj = np.ones(200 * 1024, dtype=np.uint8)
|
||||
ray.worker.global_worker.put_object(obj, x_id)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
@@ -97,7 +99,8 @@ ray.get(ray.ObjectID(ray.utils.hex_to_binary("{}")))
|
||||
assert len(ready_ids) == 0
|
||||
# Seal the object so the store attempts to notify the worker that the
|
||||
# get has been fulfilled.
|
||||
ray.worker.global_worker.put_object(1, x_id)
|
||||
obj = np.ones(200 * 1024, dtype=np.uint8)
|
||||
ray.worker.global_worker.put_object(obj, x_id)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
@@ -137,7 +140,8 @@ def test_dying_worker_wait(ray_start_2_cpus):
|
||||
time.sleep(0.1)
|
||||
|
||||
# Create the object.
|
||||
ray.worker.global_worker.put_object(1, x_id)
|
||||
obj = np.ones(200 * 1024, dtype=np.uint8)
|
||||
ray.worker.global_worker.put_object(obj, x_id)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
@@ -180,7 +184,8 @@ ray.wait([ray.ObjectID(ray.utils.hex_to_binary("{}"))])
|
||||
assert len(ready_ids) == 0
|
||||
# Seal the object so the store attempts to notify the worker that the
|
||||
# wait can return.
|
||||
ray.worker.global_worker.put_object(1, x_id)
|
||||
obj = np.ones(200 * 1024, dtype=np.uint8)
|
||||
ray.worker.global_worker.put_object(obj, x_id)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
|
||||
@@ -819,7 +819,7 @@ def test_raylet_crash_when_get(ray_start_regular):
|
||||
time.sleep(2)
|
||||
ray.worker._global_node.kill_raylet()
|
||||
|
||||
object_id = ray.put(None)
|
||||
object_id = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
|
||||
ray.internal.free(object_id)
|
||||
while ray.worker.global_worker.core_worker.object_exists(object_id):
|
||||
time.sleep(1)
|
||||
|
||||
@@ -62,7 +62,8 @@ class TestMemoryLimits(unittest.TestCase):
|
||||
num_cpus=1,
|
||||
object_store_memory=300 * MB,
|
||||
driver_object_store_memory=driver_quota)
|
||||
z = ray.put("hi", weakref=True)
|
||||
obj = np.ones(200 * 1024, dtype=np.uint8)
|
||||
z = ray.put(obj, weakref=True)
|
||||
a = LightActor._remote(object_store_memory=a_quota)
|
||||
b = GreedyActor._remote(object_store_memory=b_quota)
|
||||
for _ in range(5):
|
||||
|
||||
@@ -4,6 +4,7 @@ import grpc
|
||||
import pytest
|
||||
import requests
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
from ray.core.generated import node_manager_pb2
|
||||
@@ -180,7 +181,7 @@ def test_raylet_info_endpoint(shutdown_only):
|
||||
self.local_storage = [f.remote() for _ in range(10)]
|
||||
|
||||
def remote_store(self):
|
||||
self.remote_storage = ray.put("test")
|
||||
self.remote_storage = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
|
||||
|
||||
def getpid(self):
|
||||
return os.getpid()
|
||||
@@ -443,9 +444,8 @@ def test_memory_dashboard(shutdown_only):
|
||||
return True
|
||||
|
||||
def test_object_pineed_in_memory():
|
||||
import numpy as np
|
||||
|
||||
a = ray.put(np.zeros(1))
|
||||
a = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
|
||||
b = ray.get(a) # Noqa F841
|
||||
del a
|
||||
|
||||
@@ -469,7 +469,7 @@ def test_memory_dashboard(shutdown_only):
|
||||
def f(arg):
|
||||
time.sleep(1)
|
||||
|
||||
a = ray.put(None) # Noqa F841
|
||||
a = ray.put(np.zeros(200 * 1024, dtype=np.uint8)) # Noqa F841
|
||||
b = f.remote(a) # Noqa F841
|
||||
|
||||
wait_for_condition(memory_table_ready)
|
||||
|
||||
@@ -174,8 +174,10 @@ def test_cleanup_on_driver_exit(call_ray_start):
|
||||
driver_script = """
|
||||
import time
|
||||
import ray
|
||||
import numpy as np
|
||||
ray.init(address="{}")
|
||||
object_ids = [ray.put(i) for i in range(1000)]
|
||||
object_ids = [ray.put(np.zeros(200 * 1024, dtype=np.uint8))
|
||||
for i in range(1000)]
|
||||
start_time = time.time()
|
||||
while time.time() - start_time < 30:
|
||||
if len(ray.objects()) == 1000:
|
||||
|
||||
@@ -1286,14 +1286,6 @@ def connect(node,
|
||||
worker.core_worker.set_object_store_client_options(
|
||||
"ray_driver_{}".format(os.getpid()), driver_object_store_memory)
|
||||
|
||||
# Put something in the plasma store so that subsequent plasma store
|
||||
# accesses will be faster. Currently the first access is always slow, and
|
||||
# we don't want the user to experience this.
|
||||
if mode != LOCAL_MODE:
|
||||
temporary_object_id = ray.ObjectID.from_random()
|
||||
worker.put_object(1, object_id=temporary_object_id)
|
||||
ray.internal.free([temporary_object_id])
|
||||
|
||||
# Start the import thread
|
||||
worker.import_thread = import_thread.ImportThread(worker, mode,
|
||||
worker.threads_stopped)
|
||||
|
||||
Reference in New Issue
Block a user