Dynamic Custom Resources - create and delete resources (#3742)

This commit is contained in:
Romil Bhardwaj
2019-05-11 05:06:04 -07:00
committed by Hao Chen
parent 351753aae5
commit 004440f526
23 changed files with 1041 additions and 65 deletions
+4
View File
@@ -32,6 +32,7 @@ from ray.includes.libraylet cimport (
from ray.includes.unique_ids cimport (
CActorCheckpointID,
CObjectID,
CClientID,
)
from ray.includes.task cimport CTaskSpecification
from ray.includes.ray_config cimport RayConfig
@@ -368,6 +369,9 @@ cdef class RayletClient:
check_status(self.client.get().NotifyActorResumedFromCheckpoint(
actor_id.native(), checkpoint_id.native()))
def set_resource(self, basestring resource_name, double capacity, ClientID client_id):
self.client.get().SetResource(resource_name.encode("ascii"), capacity, CClientID.from_binary(client_id.binary()))
@property
def language(self):
return Language.from_native(self.client.get().GetLanguage())
+2 -1
View File
@@ -10,6 +10,7 @@ from .gcs_flush_policy import (set_flushing_policy, GcsFlushPolicy,
SimpleGcsFlushPolicy)
from .named_actors import get_actor, register_actor
from .api import get, wait
from .dynamic_resources import set_resource
def TensorFlowVariables(*args, **kwargs):
@@ -24,5 +25,5 @@ __all__ = [
"flush_evicted_objects_unsafe", "_flush_finished_tasks_unsafe_shard",
"_flush_evicted_objects_unsafe_shard", "get_actor", "register_actor",
"get", "wait", "set_flushing_policy", "GcsFlushPolicy",
"SimpleGcsFlushPolicy"
"SimpleGcsFlushPolicy", "set_resource"
]
@@ -0,0 +1,35 @@
import ray
def set_resource(resource_name, capacity, client_id=None):
""" Set a resource to a specified capacity.
This creates, updates or deletes a custom resource for a target clientId.
If the resource already exists, it's capacity is updated to the new value.
If the capacity is set to 0, the resource is deleted.
If ClientID is not specified or set to None,
the resource is created on the local client where the actor is running.
Args:
resource_name (str): Name of the resource to be created
capacity (int): Capacity of the new resource. Resource is deleted if
capacity is 0.
client_id (str): The ClientId of the node where the resource is to be
set.
Returns:
None
Raises:
ValueError: This exception is raised when a non-negative capacity is
specified.
"""
if client_id is not None:
client_id_obj = ray.ClientID(ray.utils.hex_to_binary(client_id))
else:
client_id_obj = ray.ClientID.nil()
if (capacity < 0) or (capacity != int(capacity)):
raise ValueError(
"Capacity {} must be a non-negative integer.".format(capacity))
return ray.worker.global_worker.raylet_client.set_resource(
resource_name, capacity, client_id_obj)
+39 -24
View File
@@ -13,6 +13,7 @@ import ray.gcs_utils
from ray.ray_constants import ID_SIZE
from ray import services
from ray.core.generated.EntryType import EntryType
from ray.utils import (decode, binary_to_object_id, binary_to_hex,
hex_to_binary)
@@ -54,29 +55,43 @@ def parse_client_table(redis_client):
}
client_id = ray.utils.binary_to_hex(client.ClientId())
# If this client is being removed, then it must
if client.EntryType() == EntryType.INSERTION:
ordered_client_ids.append(client_id)
node_info[client_id] = {
"ClientID": client_id,
"EntryType": client.EntryType(),
"NodeManagerAddress": decode(
client.NodeManagerAddress(), allow_none=True),
"NodeManagerPort": client.NodeManagerPort(),
"ObjectManagerPort": client.ObjectManagerPort(),
"ObjectStoreSocketName": decode(
client.ObjectStoreSocketName(), allow_none=True),
"RayletSocketName": decode(
client.RayletSocketName(), allow_none=True),
"Resources": resources
}
# If this client is being updated, then it must
# have previously been inserted, and
# it cannot have previously been removed.
if not client.IsInsertion():
assert client_id in node_info, "Client removed not found!"
assert node_info[client_id]["IsInsertion"], (
"Unexpected duplicate removal of client.")
else:
ordered_client_ids.append(client_id)
node_info[client_id] = {
"ClientID": client_id,
"IsInsertion": client.IsInsertion(),
"NodeManagerAddress": decode(
client.NodeManagerAddress(), allow_none=True),
"NodeManagerPort": client.NodeManagerPort(),
"ObjectManagerPort": client.ObjectManagerPort(),
"ObjectStoreSocketName": decode(
client.ObjectStoreSocketName(), allow_none=True),
"RayletSocketName": decode(
client.RayletSocketName(), allow_none=True),
"Resources": resources
}
assert client_id in node_info, "Client not found!"
assert node_info[client_id]["EntryType"] != EntryType.DELETION, (
"Unexpected updation of deleted client.")
res_map = node_info[client_id]["Resources"]
if client.EntryType() == EntryType.RES_CREATEUPDATE:
for res in resources:
res_map[res] = resources[res]
elif client.EntryType() == EntryType.RES_DELETE:
for res in resources:
res_map.pop(res, None)
elif client.EntryType() == EntryType.DELETION:
pass # Do nothing with the resmap if client deletion
else:
raise RuntimeError("Unexpected EntryType {}".format(
client.EntryType()))
node_info[client_id]["Resources"] = res_map
node_info[client_id]["EntryType"] = client.EntryType()
# NOTE: We return the list comprehension below instead of simply doing
# 'list(node_info.values())' in order to have the nodes appear in the order
# that they joined the cluster. Python dictionaries do not preserve
@@ -757,18 +772,18 @@ class GlobalState(object):
resources = defaultdict(int)
clients = self.client_table()
for client in clients:
# Only count resources from live clients.
if client["IsInsertion"]:
# Only count resources from latest entries of live clients.
if client["EntryType"] != EntryType.DELETION:
for key, value in client["Resources"].items():
resources[key] += value
return dict(resources)
def _live_client_ids(self):
"""Returns a set of client IDs corresponding to clients still alive."""
return {
client["ClientID"]
for client in self.client_table() if client["IsInsertion"]
for client in self.client_table()
if (client["EntryType"] != EntryType.DELETION)
}
def available_resources(self):
+1
View File
@@ -72,6 +72,7 @@ cdef extern from "ray/raylet/raylet_client.h" nogil:
CActorCheckpointID &checkpoint_id)
CRayStatus NotifyActorResumedFromCheckpoint(
const CActorID &actor_id, const CActorCheckpointID &checkpoint_id)
CRayStatus SetResource(const c_string &resource_name, const double capacity, const CClientID &client_Id)
CLanguage GetLanguage() const
CClientID GetClientID() const
CDriverID GetDriverID() const
+3 -1
View File
@@ -8,6 +8,7 @@ import time
import redis
import ray
from ray.core.generated.EntryType import EntryType
logger = logging.getLogger(__name__)
@@ -175,7 +176,8 @@ class Cluster(object):
while time.time() - start_time < timeout:
clients = ray.experimental.state.parse_client_table(redis_client)
live_clients = [
client for client in clients if client["IsInsertion"]
client for client in clients
if client["EntryType"] == EntryType.INSERTION
]
expected = len(self.list_all_nodes())
+586
View File
@@ -0,0 +1,586 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import time
import ray
import ray.tests.cluster_utils
import ray.tests.utils
logger = logging.getLogger(__name__)
def test_dynamic_res_creation(ray_start_regular):
# This test creates a resource locally (without specifying the client_id)
res_name = "test_res"
res_capacity = 1.0
@ray.remote
def set_res(resource_name, resource_capacity):
ray.experimental.set_resource(resource_name, resource_capacity)
ray.get(set_res.remote(res_name, res_capacity))
available_res = ray.global_state.available_resources()
cluster_res = ray.global_state.cluster_resources()
assert available_res[res_name] == res_capacity
assert cluster_res[res_name] == res_capacity
def test_dynamic_res_deletion(shutdown_only):
# This test deletes a resource locally (without specifying the client_id)
res_name = "test_res"
res_capacity = 1.0
ray.init(num_cpus=1, resources={res_name: res_capacity})
@ray.remote
def delete_res(resource_name):
ray.experimental.set_resource(resource_name, 0)
ray.get(delete_res.remote(res_name))
available_res = ray.global_state.available_resources()
cluster_res = ray.global_state.cluster_resources()
assert res_name not in available_res
assert res_name not in cluster_res
def test_dynamic_res_infeasible_rescheduling(ray_start_regular):
# This test launches an infeasible task and then creates a
# resource to make the task feasible. This tests if the
# infeasible tasks get rescheduled when resources are
# created at runtime.
res_name = "test_res"
res_capacity = 1.0
@ray.remote
def set_res(resource_name, resource_capacity):
ray.experimental.set_resource(resource_name, resource_capacity)
def f():
return 1
remote_task = ray.remote(resources={res_name: res_capacity})(f)
oid = remote_task.remote() # This is infeasible
ray.get(set_res.remote(res_name, res_capacity)) # Now should be feasible
available_res = ray.global_state.available_resources()
assert available_res[res_name] == res_capacity
successful, unsuccessful = ray.wait([oid], timeout=1)
assert successful # The task completed
def test_dynamic_res_updation_clientid(ray_start_cluster):
# This test does a simple resource capacity update
cluster = ray_start_cluster
res_name = "test_res"
res_capacity = 1.0
num_nodes = 3
for i in range(num_nodes):
cluster.add_node()
ray.init(redis_address=cluster.redis_address)
target_clientid = ray.global_state.client_table()[1]["ClientID"]
@ray.remote
def set_res(resource_name, resource_capacity, client_id):
ray.experimental.set_resource(
resource_name, resource_capacity, client_id=client_id)
# Create resource
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
# Update resource
new_capacity = res_capacity + 1
ray.get(set_res.remote(res_name, new_capacity, target_clientid))
target_client = next(client for client in ray.global_state.client_table()
if client["ClientID"] == target_clientid)
resources = target_client["Resources"]
assert res_name in resources
assert resources[res_name] == new_capacity
def test_dynamic_res_creation_clientid(ray_start_cluster):
# Creates a resource on a specific client and verifies creation.
cluster = ray_start_cluster
res_name = "test_res"
res_capacity = 1.0
num_nodes = 3
for i in range(num_nodes):
cluster.add_node()
ray.init(redis_address=cluster.redis_address)
target_clientid = ray.global_state.client_table()[1]["ClientID"]
@ray.remote
def set_res(resource_name, resource_capacity, res_client_id):
ray.experimental.set_resource(
resource_name, resource_capacity, client_id=res_client_id)
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
target_client = next(client for client in ray.global_state.client_table()
if client["ClientID"] == target_clientid)
resources = target_client["Resources"]
assert res_name in resources
assert resources[res_name] == res_capacity
def test_dynamic_res_creation_clientid_multiple(ray_start_cluster):
# This test creates resources on multiple clients using the clientid
# specifier
cluster = ray_start_cluster
TIMEOUT = 5
res_name = "test_res"
res_capacity = 1.0
num_nodes = 3
for i in range(num_nodes):
cluster.add_node()
ray.init(redis_address=cluster.redis_address)
target_clientids = [
client["ClientID"] for client in ray.global_state.client_table()
]
@ray.remote
def set_res(resource_name, resource_capacity, res_client_id):
ray.experimental.set_resource(
resource_name, resource_capacity, client_id=res_client_id)
results = []
for cid in target_clientids:
results.append(set_res.remote(res_name, res_capacity, cid))
ray.get(results)
success = False
start_time = time.time()
while time.time() - start_time < TIMEOUT and not success:
resources_created = []
for cid in target_clientids:
target_client = next(client
for client in ray.global_state.client_table()
if client["ClientID"] == cid)
resources = target_client["Resources"]
resources_created.append(resources[res_name] == res_capacity)
success = all(resources_created)
assert success
def test_dynamic_res_deletion_clientid(ray_start_cluster):
# This test deletes a resource on a given client id
cluster = ray_start_cluster
res_name = "test_res"
res_capacity = 1.0
num_nodes = 5
for i in range(num_nodes):
# Create resource on all nodes, but later we'll delete it from a
# target node
cluster.add_node(resources={res_name: res_capacity})
ray.init(redis_address=cluster.redis_address)
target_clientid = ray.global_state.client_table()[1]["ClientID"]
# Launch the delete task
@ray.remote
def delete_res(resource_name, res_client_id):
ray.experimental.set_resource(
resource_name, 0, client_id=res_client_id)
ray.get(delete_res.remote(res_name, target_clientid))
target_client = next(client for client in ray.global_state.client_table()
if client["ClientID"] == target_clientid)
resources = target_client["Resources"]
print(ray.global_state.cluster_resources())
assert res_name not in resources
def test_dynamic_res_creation_scheduler_consistency(ray_start_cluster):
# This makes sure the resource is actually created and the state is
# consistent in the scheduler
# by launching a task which requests the created resource
cluster = ray_start_cluster
res_name = "test_res"
res_capacity = 1.0
num_nodes = 5
for i in range(num_nodes):
cluster.add_node()
ray.init(redis_address=cluster.redis_address)
clientids = [
client["ClientID"] for client in ray.global_state.client_table()
]
@ray.remote
def set_res(resource_name, resource_capacity, res_client_id):
ray.experimental.set_resource(
resource_name, resource_capacity, client_id=res_client_id)
# Create the resource on node1
target_clientid = clientids[1]
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
# Define a task which requires this resource
@ray.remote(resources={res_name: res_capacity})
def test_func():
return 1
result = test_func.remote()
successful, unsuccessful = ray.wait([result], timeout=5)
assert successful # The task completed
def test_dynamic_res_deletion_scheduler_consistency(ray_start_cluster):
# This makes sure the resource is actually deleted and the state is
# consistent in the scheduler by launching an infeasible task which
# requests the created resource
cluster = ray_start_cluster
res_name = "test_res"
res_capacity = 1.0
num_nodes = 5
TIMEOUT_DURATION = 1
for i in range(num_nodes):
cluster.add_node()
ray.init(redis_address=cluster.redis_address)
clientids = [
client["ClientID"] for client in ray.global_state.client_table()
]
@ray.remote
def delete_res(resource_name, res_client_id):
ray.experimental.set_resource(
resource_name, 0, client_id=res_client_id)
@ray.remote
def set_res(resource_name, resource_capacity, res_client_id):
ray.experimental.set_resource(
resource_name, resource_capacity, client_id=res_client_id)
# Create the resource on node1
target_clientid = clientids[1]
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
assert ray.global_state.cluster_resources()[res_name] == res_capacity
# Delete the resource
ray.get(delete_res.remote(res_name, target_clientid))
# Define a task which requires this resource. This should not run
@ray.remote(resources={res_name: res_capacity})
def test_func():
return 1
result = test_func.remote()
successful, unsuccessful = ray.wait([result], timeout=TIMEOUT_DURATION)
assert unsuccessful # The task did not complete because it's infeasible
def test_dynamic_res_concurrent_res_increment(ray_start_cluster):
# This test makes sure resource capacity is updated (increment) correctly
# when a task has already acquired some of the resource.
cluster = ray_start_cluster
res_name = "test_res"
res_capacity = 5
updated_capacity = 10
num_nodes = 5
TIMEOUT_DURATION = 1
# Create a object ID to have the task wait on
WAIT_OBJECT_ID_STR = ("a" * 20).encode("ascii")
# Create a object ID to signal that the task is running
TASK_RUNNING_OBJECT_ID_STR = ("b" * 20).encode("ascii")
for i in range(num_nodes):
cluster.add_node()
ray.init(redis_address=cluster.redis_address)
clientids = [
client["ClientID"] for client in ray.global_state.client_table()
]
target_clientid = clientids[1]
@ray.remote
def set_res(resource_name, resource_capacity, res_client_id):
ray.experimental.set_resource(
resource_name, resource_capacity, client_id=res_client_id)
# Create the resource on node 1
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
assert ray.global_state.cluster_resources()[res_name] == res_capacity
# Task to hold the resource till the driver signals to finish
@ray.remote
def wait_func(running_oid, wait_oid):
# Signal that the task is running
ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
# Make the task wait till signalled by driver
ray.get(ray.ObjectID(wait_oid))
@ray.remote
def test_func():
return 1
# Launch the task with resource requirement of 4, thus the new available
# capacity becomes 1
task = wait_func._remote(
args=[TASK_RUNNING_OBJECT_ID_STR, WAIT_OBJECT_ID_STR],
resources={res_name: 4})
# Wait till wait_func is launched before updating resource
ray.get(ray.ObjectID(TASK_RUNNING_OBJECT_ID_STR))
# Update the resource capacity
ray.get(set_res.remote(res_name, updated_capacity, target_clientid))
# Signal task to complete
ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
ray.get(task)
# Check if scheduler state is consistent by launching a task requiring
# updated capacity
task_2 = test_func._remote(args=[], resources={res_name: updated_capacity})
successful, unsuccessful = ray.wait([task_2], timeout=TIMEOUT_DURATION)
assert successful # The task completed
# Check if scheduler state is consistent by launching a task requiring
# updated capacity + 1. This should not execute
task_3 = test_func._remote(
args=[], resources={res_name: updated_capacity + 1
}) # This should be infeasible
successful, unsuccessful = ray.wait([task_3], timeout=TIMEOUT_DURATION)
assert unsuccessful # The task did not complete because it's infeasible
assert ray.global_state.available_resources()[res_name] == updated_capacity
def test_dynamic_res_concurrent_res_decrement(ray_start_cluster):
# This test makes sure resource capacity is updated (decremented)
# correctly when a task has already acquired some
# of the resource.
cluster = ray_start_cluster
res_name = "test_res"
res_capacity = 5
updated_capacity = 2
num_nodes = 5
TIMEOUT_DURATION = 1
# Create a object ID to have the task wait on
WAIT_OBJECT_ID_STR = ("a" * 20).encode("ascii")
# Create a object ID to signal that the task is running
TASK_RUNNING_OBJECT_ID_STR = ("b" * 20).encode("ascii")
for i in range(num_nodes):
cluster.add_node()
ray.init(redis_address=cluster.redis_address)
clientids = [
client["ClientID"] for client in ray.global_state.client_table()
]
target_clientid = clientids[1]
@ray.remote
def set_res(resource_name, resource_capacity, res_client_id):
ray.experimental.set_resource(
resource_name, resource_capacity, client_id=res_client_id)
# Create the resource on node 1
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
assert ray.global_state.cluster_resources()[res_name] == res_capacity
# Task to hold the resource till the driver signals to finish
@ray.remote
def wait_func(running_oid, wait_oid):
# Signal that the task is running
ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
# Make the task wait till signalled by driver
ray.get(ray.ObjectID(wait_oid))
@ray.remote
def test_func():
return 1
# Launch the task with resource requirement of 4, thus the new available
# capacity becomes 1
task = wait_func._remote(
args=[TASK_RUNNING_OBJECT_ID_STR, WAIT_OBJECT_ID_STR],
resources={res_name: 4})
# Wait till wait_func is launched before updating resource
ray.get(ray.ObjectID(TASK_RUNNING_OBJECT_ID_STR))
# Decrease the resource capacity
ray.get(set_res.remote(res_name, updated_capacity, target_clientid))
# Signal task to complete
ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
ray.get(task)
# Check if scheduler state is consistent by launching a task requiring
# updated capacity
task_2 = test_func._remote(args=[], resources={res_name: updated_capacity})
successful, unsuccessful = ray.wait([task_2], timeout=TIMEOUT_DURATION)
assert successful # The task completed
# Check if scheduler state is consistent by launching a task requiring
# updated capacity + 1. This should not execute
task_3 = test_func._remote(
args=[], resources={res_name: updated_capacity + 1
}) # This should be infeasible
successful, unsuccessful = ray.wait([task_3], timeout=TIMEOUT_DURATION)
assert unsuccessful # The task did not complete because it's infeasible
assert ray.global_state.available_resources()[res_name] == updated_capacity
def test_dynamic_res_concurrent_res_delete(ray_start_cluster):
# This test makes sure resource gets deleted correctly when a task has
# already acquired the resource
cluster = ray_start_cluster
res_name = "test_res"
res_capacity = 5
num_nodes = 5
TIMEOUT_DURATION = 1
# Create a object ID to have the task wait on
WAIT_OBJECT_ID_STR = ("a" * 20).encode("ascii")
# Create a object ID to signal that the task is running
TASK_RUNNING_OBJECT_ID_STR = ("b" * 20).encode("ascii")
for i in range(num_nodes):
cluster.add_node()
ray.init(redis_address=cluster.redis_address)
clientids = [
client["ClientID"] for client in ray.global_state.client_table()
]
target_clientid = clientids[1]
@ray.remote
def set_res(resource_name, resource_capacity, res_client_id):
ray.experimental.set_resource(
resource_name, resource_capacity, client_id=res_client_id)
@ray.remote
def delete_res(resource_name, res_client_id):
ray.experimental.set_resource(
resource_name, 0, client_id=res_client_id)
# Create the resource on node 1
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
assert ray.global_state.cluster_resources()[res_name] == res_capacity
# Task to hold the resource till the driver signals to finish
@ray.remote
def wait_func(running_oid, wait_oid):
# Signal that the task is running
ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
# Make the task wait till signalled by driver
ray.get(ray.ObjectID(wait_oid))
@ray.remote
def test_func():
return 1
# Launch the task with resource requirement of 4, thus the new available
# capacity becomes 1
task = wait_func._remote(
args=[TASK_RUNNING_OBJECT_ID_STR, WAIT_OBJECT_ID_STR],
resources={res_name: 4})
# Wait till wait_func is launched before updating resource
ray.get(ray.ObjectID(TASK_RUNNING_OBJECT_ID_STR))
# Delete the resource
ray.get(delete_res.remote(res_name, target_clientid))
# Signal task to complete
ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
ray.get(task)
# Check if scheduler state is consistent by launching a task requiring
# the deleted resource This should not execute
task_2 = test_func._remote(
args=[], resources={res_name: 1}) # This should be infeasible
successful, unsuccessful = ray.wait([task_2], timeout=TIMEOUT_DURATION)
assert unsuccessful # The task did not complete because it's infeasible
assert res_name not in ray.global_state.available_resources()
def test_dynamic_res_creation_stress(ray_start_cluster):
# This stress tests creates many resources simultaneously on the same
# client and then checks if the final state is consistent
cluster = ray_start_cluster
TIMEOUT = 5
res_capacity = 1
num_nodes = 5
NUM_RES_TO_CREATE = 500
for i in range(num_nodes):
cluster.add_node()
ray.init(redis_address=cluster.redis_address)
clientids = [
client["ClientID"] for client in ray.global_state.client_table()
]
target_clientid = clientids[1]
@ray.remote
def set_res(resource_name, resource_capacity, res_client_id):
ray.experimental.set_resource(
resource_name, resource_capacity, client_id=res_client_id)
@ray.remote
def delete_res(resource_name, res_client_id):
ray.experimental.set_resource(
resource_name, 0, client_id=res_client_id)
results = [
set_res.remote(str(i), res_capacity, target_clientid)
for i in range(0, NUM_RES_TO_CREATE)
]
ray.get(results)
success = False
start_time = time.time()
while time.time() - start_time < TIMEOUT and not success:
resources = ray.global_state.cluster_resources()
all_resources_created = []
for i in range(0, NUM_RES_TO_CREATE):
all_resources_created.append(str(i) in resources)
success = all(all_resources_created)
assert success