mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 13:06:49 +08:00
Reconstruction for evicted objects (#181)
* First pass at reconstruction in the worker Modify reconstruction stress testing to start Plasma service before rest of Ray cluster TODO about reconstructing ray.puts Fix ray.put error for double creates Distinguish between empty entry and no entry in object table Fix test case Fix Python test Fix tests * Only call reconstruct on objects we have not yet received * Address review comments * Fix reconstruction for Python3 * remove unused code * Address Robert's comments, stress tests are crashing * Test and update the task's scheduling state to suppress duplicate reconstruction requests. * Split result table into two lookups, one for task ID and the other as a test-and-set for the task state * Fix object table tests * Fix redis module result_table_lookup test case * Multinode reconstruction tests * Fix python3 test case * rename * Use new start_redis * Remove unused code * lint * indent * Address Robert's comments * Use start_redis from ray.services in state table tests * Remove unnecessary memset
This commit is contained in:
committed by
Robert Nishihara
parent
f69d4aaaa7
commit
241b539ff8
@@ -55,6 +55,13 @@ ObjectStoreAddress = namedtuple("ObjectStoreAddress", ["name",
|
||||
def address(ip_address, port):
  """Build an "ip:port" address string.

  Args:
    ip_address: The IP address portion, as a string.
    port: The port; it is converted with str() before joining.

  Returns:
    The IP address and the port joined by a single colon.
  """
  pieces = (ip_address, str(port))
  return ":".join(pieces)
|
||||
|
||||
def get_ip_address(address):
  """Extract the IP address portion of an "ip:port" address string.

  Args:
    address: A string of the form "<ip_address>:<port>".

  Returns:
    Everything before the first colon in address.

  Raises:
    Exception: If address cannot be split (for example, it is not a string).
  """
  try:
    ip_address = address.split(":")[0]
  # Catch Exception rather than using a bare except so that KeyboardInterrupt
  # and SystemExit propagate instead of being reported as a parsing failure.
  except Exception:
    raise Exception("Unable to parse IP address from address {}".format(address))
  return ip_address
|
||||
|
||||
def get_port(address):
|
||||
try:
|
||||
port = int(address.split(":")[1])
|
||||
@@ -430,7 +437,8 @@ def start_ray_processes(address_info=None,
|
||||
# A Redis address was provided, so start a Redis server with the given
|
||||
# port. TODO(rkn): We should check that the IP address corresponds to the
|
||||
# machine that this method is running on.
|
||||
redis_ip_address, redis_port = redis_address.split(":")
|
||||
redis_ip_address = get_ip_address(redis_address)
|
||||
redis_port = get_port(redis_address)
|
||||
new_redis_port = start_redis(port=int(redis_port),
|
||||
num_retries=1,
|
||||
cleanup=cleanup,
|
||||
|
||||
+35
-12
@@ -39,6 +39,10 @@ ERROR_KEY_PREFIX = b"Error:"
|
||||
DRIVER_ID_LENGTH = 20
|
||||
ERROR_ID_LENGTH = 20
|
||||
|
||||
# When performing ray.get, wait 1 second before attempting to reconstruct and
|
||||
# fetch the object again.
|
||||
GET_TIMEOUT_MILLISECONDS = 1000
|
||||
|
||||
def random_string():
  """Return a freshly generated random byte string of length 20.

  The bytes are produced by np.random.bytes.
  """
  num_bytes = 20
  return np.random.bytes(num_bytes)
|
||||
|
||||
@@ -421,13 +425,13 @@ class Worker(object):
|
||||
# Serialize and put the object in the object store.
|
||||
try:
|
||||
numbuf.store_list(objectid.id(), self.plasma_client.conn, [value])
|
||||
except plasma.plasma_object_exists_error as e:
|
||||
except numbuf.numbuf_plasma_object_exists_error as e:
|
||||
# The object already exists in the object store, so there is no need to
|
||||
# add it again. TODO(rkn): We need to compare the hashes and make sure
|
||||
# that the objects are in fact the same. We also should return an error
|
||||
# code to the caller instead of printing a message.
|
||||
print("This object already exists in the object store.")
|
||||
return
|
||||
|
||||
global contained_objectids
|
||||
# Optionally do something with the contained_objectids here.
|
||||
contained_objectids = []
|
||||
@@ -443,18 +447,37 @@ class Worker(object):
|
||||
values should be retrieved.
|
||||
"""
|
||||
self.plasma_client.fetch([object_id.id() for object_id in object_ids])
|
||||
# We currently pass in a timeout of one second.
|
||||
unready_ids = object_ids
|
||||
|
||||
# Get the objects. We initially try to get the objects immediately.
|
||||
final_results = numbuf.retrieve_list(
|
||||
[object_id.id() for object_id in object_ids],
|
||||
self.plasma_client.conn,
|
||||
0)
|
||||
# Construct a dictionary mapping object IDs that we haven't gotten yet to
|
||||
# their original index in the object_ids argument.
|
||||
unready_ids = dict((object_id, i) for (i, (object_id, val)) in
|
||||
enumerate(final_results) if val is None)
|
||||
# Try reconstructing any objects we haven't gotten yet. Try to get them
|
||||
# until GET_TIMEOUT_MILLISECONDS milliseconds passes, then repeat.
|
||||
while len(unready_ids) > 0:
|
||||
results = numbuf.retrieve_list([object_id.id() for object_id in object_ids], self.plasma_client.conn, 1000)
|
||||
unready_ids = [object_id for (object_id, val) in results if val is None]
|
||||
# This would be a natural place to issue a command to reconstruct some of
|
||||
# the objects.
|
||||
for unready_id in unready_ids:
|
||||
self.photon_client.reconstruct_object(unready_id)
|
||||
results = numbuf.retrieve_list(list(unready_ids.keys()),
|
||||
self.plasma_client.conn,
|
||||
GET_TIMEOUT_MILLISECONDS)
|
||||
# Remove any entries for objects we received during this iteration so we
|
||||
# don't retrieve the same object twice.
|
||||
for object_id, val in results:
|
||||
if val is not None:
|
||||
index = unready_ids[object_id]
|
||||
final_results[index] = (object_id, val)
|
||||
unready_ids.pop(object_id)
|
||||
|
||||
# Unwrap the object from the list (it was wrapped in put_object).
|
||||
assert len(results) == len(object_ids)
|
||||
for i in range(len(results)):
|
||||
assert results[i][0] == object_ids[i].id()
|
||||
return [result[1][0] for result in results]
|
||||
assert len(final_results) == len(object_ids)
|
||||
for i in range(len(final_results)):
|
||||
assert final_results[i][0] == object_ids[i].id()
|
||||
return [result[1][0] for result in final_results]
|
||||
|
||||
def submit_task(self, function_id, func_name, args):
|
||||
"""Submit a remote task to the scheduler.
|
||||
|
||||
Reference in New Issue
Block a user