mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 20:35:31 +08:00
Dynamic Custom Resources - create and delete resources (#3742)
This commit is contained in:
@@ -32,6 +32,7 @@ from ray.includes.libraylet cimport (
|
||||
from ray.includes.unique_ids cimport (
|
||||
CActorCheckpointID,
|
||||
CObjectID,
|
||||
CClientID,
|
||||
)
|
||||
from ray.includes.task cimport CTaskSpecification
|
||||
from ray.includes.ray_config cimport RayConfig
|
||||
@@ -368,6 +369,9 @@ cdef class RayletClient:
|
||||
check_status(self.client.get().NotifyActorResumedFromCheckpoint(
|
||||
actor_id.native(), checkpoint_id.native()))
|
||||
|
||||
def set_resource(self, basestring resource_name, double capacity, ClientID client_id):
|
||||
self.client.get().SetResource(resource_name.encode("ascii"), capacity, CClientID.from_binary(client_id.binary()))
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
return Language.from_native(self.client.get().GetLanguage())
|
||||
|
||||
@@ -10,6 +10,7 @@ from .gcs_flush_policy import (set_flushing_policy, GcsFlushPolicy,
|
||||
SimpleGcsFlushPolicy)
|
||||
from .named_actors import get_actor, register_actor
|
||||
from .api import get, wait
|
||||
from .dynamic_resources import set_resource
|
||||
|
||||
|
||||
def TensorFlowVariables(*args, **kwargs):
|
||||
@@ -24,5 +25,5 @@ __all__ = [
|
||||
"flush_evicted_objects_unsafe", "_flush_finished_tasks_unsafe_shard",
|
||||
"_flush_evicted_objects_unsafe_shard", "get_actor", "register_actor",
|
||||
"get", "wait", "set_flushing_policy", "GcsFlushPolicy",
|
||||
"SimpleGcsFlushPolicy"
|
||||
"SimpleGcsFlushPolicy", "set_resource"
|
||||
]
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
import ray
|
||||
|
||||
|
||||
def set_resource(resource_name, capacity, client_id=None):
|
||||
""" Set a resource to a specified capacity.
|
||||
|
||||
This creates, updates or deletes a custom resource for a target clientId.
|
||||
If the resource already exists, it's capacity is updated to the new value.
|
||||
If the capacity is set to 0, the resource is deleted.
|
||||
If ClientID is not specified or set to None,
|
||||
the resource is created on the local client where the actor is running.
|
||||
|
||||
Args:
|
||||
resource_name (str): Name of the resource to be created
|
||||
capacity (int): Capacity of the new resource. Resource is deleted if
|
||||
capacity is 0.
|
||||
client_id (str): The ClientId of the node where the resource is to be
|
||||
set.
|
||||
|
||||
Returns:
|
||||
None
|
||||
|
||||
Raises:
|
||||
ValueError: This exception is raised when a non-negative capacity is
|
||||
specified.
|
||||
"""
|
||||
if client_id is not None:
|
||||
client_id_obj = ray.ClientID(ray.utils.hex_to_binary(client_id))
|
||||
else:
|
||||
client_id_obj = ray.ClientID.nil()
|
||||
if (capacity < 0) or (capacity != int(capacity)):
|
||||
raise ValueError(
|
||||
"Capacity {} must be a non-negative integer.".format(capacity))
|
||||
return ray.worker.global_worker.raylet_client.set_resource(
|
||||
resource_name, capacity, client_id_obj)
|
||||
@@ -13,6 +13,7 @@ import ray.gcs_utils
|
||||
|
||||
from ray.ray_constants import ID_SIZE
|
||||
from ray import services
|
||||
from ray.core.generated.EntryType import EntryType
|
||||
from ray.utils import (decode, binary_to_object_id, binary_to_hex,
|
||||
hex_to_binary)
|
||||
|
||||
@@ -54,29 +55,43 @@ def parse_client_table(redis_client):
|
||||
}
|
||||
client_id = ray.utils.binary_to_hex(client.ClientId())
|
||||
|
||||
# If this client is being removed, then it must
|
||||
if client.EntryType() == EntryType.INSERTION:
|
||||
ordered_client_ids.append(client_id)
|
||||
node_info[client_id] = {
|
||||
"ClientID": client_id,
|
||||
"EntryType": client.EntryType(),
|
||||
"NodeManagerAddress": decode(
|
||||
client.NodeManagerAddress(), allow_none=True),
|
||||
"NodeManagerPort": client.NodeManagerPort(),
|
||||
"ObjectManagerPort": client.ObjectManagerPort(),
|
||||
"ObjectStoreSocketName": decode(
|
||||
client.ObjectStoreSocketName(), allow_none=True),
|
||||
"RayletSocketName": decode(
|
||||
client.RayletSocketName(), allow_none=True),
|
||||
"Resources": resources
|
||||
}
|
||||
|
||||
# If this client is being updated, then it must
|
||||
# have previously been inserted, and
|
||||
# it cannot have previously been removed.
|
||||
if not client.IsInsertion():
|
||||
assert client_id in node_info, "Client removed not found!"
|
||||
assert node_info[client_id]["IsInsertion"], (
|
||||
"Unexpected duplicate removal of client.")
|
||||
else:
|
||||
ordered_client_ids.append(client_id)
|
||||
|
||||
node_info[client_id] = {
|
||||
"ClientID": client_id,
|
||||
"IsInsertion": client.IsInsertion(),
|
||||
"NodeManagerAddress": decode(
|
||||
client.NodeManagerAddress(), allow_none=True),
|
||||
"NodeManagerPort": client.NodeManagerPort(),
|
||||
"ObjectManagerPort": client.ObjectManagerPort(),
|
||||
"ObjectStoreSocketName": decode(
|
||||
client.ObjectStoreSocketName(), allow_none=True),
|
||||
"RayletSocketName": decode(
|
||||
client.RayletSocketName(), allow_none=True),
|
||||
"Resources": resources
|
||||
}
|
||||
assert client_id in node_info, "Client not found!"
|
||||
assert node_info[client_id]["EntryType"] != EntryType.DELETION, (
|
||||
"Unexpected updation of deleted client.")
|
||||
res_map = node_info[client_id]["Resources"]
|
||||
if client.EntryType() == EntryType.RES_CREATEUPDATE:
|
||||
for res in resources:
|
||||
res_map[res] = resources[res]
|
||||
elif client.EntryType() == EntryType.RES_DELETE:
|
||||
for res in resources:
|
||||
res_map.pop(res, None)
|
||||
elif client.EntryType() == EntryType.DELETION:
|
||||
pass # Do nothing with the resmap if client deletion
|
||||
else:
|
||||
raise RuntimeError("Unexpected EntryType {}".format(
|
||||
client.EntryType()))
|
||||
node_info[client_id]["Resources"] = res_map
|
||||
node_info[client_id]["EntryType"] = client.EntryType()
|
||||
# NOTE: We return the list comprehension below instead of simply doing
|
||||
# 'list(node_info.values())' in order to have the nodes appear in the order
|
||||
# that they joined the cluster. Python dictionaries do not preserve
|
||||
@@ -757,18 +772,18 @@ class GlobalState(object):
|
||||
resources = defaultdict(int)
|
||||
clients = self.client_table()
|
||||
for client in clients:
|
||||
# Only count resources from live clients.
|
||||
if client["IsInsertion"]:
|
||||
# Only count resources from latest entries of live clients.
|
||||
if client["EntryType"] != EntryType.DELETION:
|
||||
for key, value in client["Resources"].items():
|
||||
resources[key] += value
|
||||
|
||||
return dict(resources)
|
||||
|
||||
def _live_client_ids(self):
|
||||
"""Returns a set of client IDs corresponding to clients still alive."""
|
||||
return {
|
||||
client["ClientID"]
|
||||
for client in self.client_table() if client["IsInsertion"]
|
||||
for client in self.client_table()
|
||||
if (client["EntryType"] != EntryType.DELETION)
|
||||
}
|
||||
|
||||
def available_resources(self):
|
||||
|
||||
@@ -72,6 +72,7 @@ cdef extern from "ray/raylet/raylet_client.h" nogil:
|
||||
CActorCheckpointID &checkpoint_id)
|
||||
CRayStatus NotifyActorResumedFromCheckpoint(
|
||||
const CActorID &actor_id, const CActorCheckpointID &checkpoint_id)
|
||||
CRayStatus SetResource(const c_string &resource_name, const double capacity, const CClientID &client_Id)
|
||||
CLanguage GetLanguage() const
|
||||
CClientID GetClientID() const
|
||||
CDriverID GetDriverID() const
|
||||
|
||||
@@ -8,6 +8,7 @@ import time
|
||||
import redis
|
||||
|
||||
import ray
|
||||
from ray.core.generated.EntryType import EntryType
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -175,7 +176,8 @@ class Cluster(object):
|
||||
while time.time() - start_time < timeout:
|
||||
clients = ray.experimental.state.parse_client_table(redis_client)
|
||||
live_clients = [
|
||||
client for client in clients if client["IsInsertion"]
|
||||
client for client in clients
|
||||
if client["EntryType"] == EntryType.INSERTION
|
||||
]
|
||||
|
||||
expected = len(self.list_all_nodes())
|
||||
|
||||
@@ -0,0 +1,586 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
import ray
|
||||
import ray.tests.cluster_utils
|
||||
import ray.tests.utils
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def test_dynamic_res_creation(ray_start_regular):
|
||||
# This test creates a resource locally (without specifying the client_id)
|
||||
res_name = "test_res"
|
||||
res_capacity = 1.0
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity):
|
||||
ray.experimental.set_resource(resource_name, resource_capacity)
|
||||
|
||||
ray.get(set_res.remote(res_name, res_capacity))
|
||||
|
||||
available_res = ray.global_state.available_resources()
|
||||
cluster_res = ray.global_state.cluster_resources()
|
||||
|
||||
assert available_res[res_name] == res_capacity
|
||||
assert cluster_res[res_name] == res_capacity
|
||||
|
||||
|
||||
def test_dynamic_res_deletion(shutdown_only):
|
||||
# This test deletes a resource locally (without specifying the client_id)
|
||||
res_name = "test_res"
|
||||
res_capacity = 1.0
|
||||
|
||||
ray.init(num_cpus=1, resources={res_name: res_capacity})
|
||||
|
||||
@ray.remote
|
||||
def delete_res(resource_name):
|
||||
ray.experimental.set_resource(resource_name, 0)
|
||||
|
||||
ray.get(delete_res.remote(res_name))
|
||||
|
||||
available_res = ray.global_state.available_resources()
|
||||
cluster_res = ray.global_state.cluster_resources()
|
||||
|
||||
assert res_name not in available_res
|
||||
assert res_name not in cluster_res
|
||||
|
||||
|
||||
def test_dynamic_res_infeasible_rescheduling(ray_start_regular):
|
||||
# This test launches an infeasible task and then creates a
|
||||
# resource to make the task feasible. This tests if the
|
||||
# infeasible tasks get rescheduled when resources are
|
||||
# created at runtime.
|
||||
res_name = "test_res"
|
||||
res_capacity = 1.0
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity):
|
||||
ray.experimental.set_resource(resource_name, resource_capacity)
|
||||
|
||||
def f():
|
||||
return 1
|
||||
|
||||
remote_task = ray.remote(resources={res_name: res_capacity})(f)
|
||||
oid = remote_task.remote() # This is infeasible
|
||||
ray.get(set_res.remote(res_name, res_capacity)) # Now should be feasible
|
||||
|
||||
available_res = ray.global_state.available_resources()
|
||||
assert available_res[res_name] == res_capacity
|
||||
|
||||
successful, unsuccessful = ray.wait([oid], timeout=1)
|
||||
assert successful # The task completed
|
||||
|
||||
|
||||
def test_dynamic_res_updation_clientid(ray_start_cluster):
|
||||
# This test does a simple resource capacity update
|
||||
cluster = ray_start_cluster
|
||||
|
||||
res_name = "test_res"
|
||||
res_capacity = 1.0
|
||||
num_nodes = 3
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node()
|
||||
|
||||
ray.init(redis_address=cluster.redis_address)
|
||||
|
||||
target_clientid = ray.global_state.client_table()[1]["ClientID"]
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity, client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, resource_capacity, client_id=client_id)
|
||||
|
||||
# Create resource
|
||||
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
|
||||
|
||||
# Update resource
|
||||
new_capacity = res_capacity + 1
|
||||
ray.get(set_res.remote(res_name, new_capacity, target_clientid))
|
||||
|
||||
target_client = next(client for client in ray.global_state.client_table()
|
||||
if client["ClientID"] == target_clientid)
|
||||
resources = target_client["Resources"]
|
||||
|
||||
assert res_name in resources
|
||||
assert resources[res_name] == new_capacity
|
||||
|
||||
|
||||
def test_dynamic_res_creation_clientid(ray_start_cluster):
|
||||
# Creates a resource on a specific client and verifies creation.
|
||||
cluster = ray_start_cluster
|
||||
|
||||
res_name = "test_res"
|
||||
res_capacity = 1.0
|
||||
num_nodes = 3
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node()
|
||||
|
||||
ray.init(redis_address=cluster.redis_address)
|
||||
|
||||
target_clientid = ray.global_state.client_table()[1]["ClientID"]
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, resource_capacity, client_id=res_client_id)
|
||||
|
||||
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
|
||||
target_client = next(client for client in ray.global_state.client_table()
|
||||
if client["ClientID"] == target_clientid)
|
||||
resources = target_client["Resources"]
|
||||
|
||||
assert res_name in resources
|
||||
assert resources[res_name] == res_capacity
|
||||
|
||||
|
||||
def test_dynamic_res_creation_clientid_multiple(ray_start_cluster):
|
||||
# This test creates resources on multiple clients using the clientid
|
||||
# specifier
|
||||
cluster = ray_start_cluster
|
||||
|
||||
TIMEOUT = 5
|
||||
res_name = "test_res"
|
||||
res_capacity = 1.0
|
||||
num_nodes = 3
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node()
|
||||
|
||||
ray.init(redis_address=cluster.redis_address)
|
||||
|
||||
target_clientids = [
|
||||
client["ClientID"] for client in ray.global_state.client_table()
|
||||
]
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, resource_capacity, client_id=res_client_id)
|
||||
|
||||
results = []
|
||||
for cid in target_clientids:
|
||||
results.append(set_res.remote(res_name, res_capacity, cid))
|
||||
ray.get(results)
|
||||
|
||||
success = False
|
||||
start_time = time.time()
|
||||
|
||||
while time.time() - start_time < TIMEOUT and not success:
|
||||
resources_created = []
|
||||
for cid in target_clientids:
|
||||
target_client = next(client
|
||||
for client in ray.global_state.client_table()
|
||||
if client["ClientID"] == cid)
|
||||
resources = target_client["Resources"]
|
||||
resources_created.append(resources[res_name] == res_capacity)
|
||||
success = all(resources_created)
|
||||
assert success
|
||||
|
||||
|
||||
def test_dynamic_res_deletion_clientid(ray_start_cluster):
|
||||
# This test deletes a resource on a given client id
|
||||
cluster = ray_start_cluster
|
||||
|
||||
res_name = "test_res"
|
||||
res_capacity = 1.0
|
||||
num_nodes = 5
|
||||
|
||||
for i in range(num_nodes):
|
||||
# Create resource on all nodes, but later we'll delete it from a
|
||||
# target node
|
||||
cluster.add_node(resources={res_name: res_capacity})
|
||||
|
||||
ray.init(redis_address=cluster.redis_address)
|
||||
|
||||
target_clientid = ray.global_state.client_table()[1]["ClientID"]
|
||||
|
||||
# Launch the delete task
|
||||
@ray.remote
|
||||
def delete_res(resource_name, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, 0, client_id=res_client_id)
|
||||
|
||||
ray.get(delete_res.remote(res_name, target_clientid))
|
||||
|
||||
target_client = next(client for client in ray.global_state.client_table()
|
||||
if client["ClientID"] == target_clientid)
|
||||
resources = target_client["Resources"]
|
||||
print(ray.global_state.cluster_resources())
|
||||
assert res_name not in resources
|
||||
|
||||
|
||||
def test_dynamic_res_creation_scheduler_consistency(ray_start_cluster):
|
||||
# This makes sure the resource is actually created and the state is
|
||||
# consistent in the scheduler
|
||||
# by launching a task which requests the created resource
|
||||
cluster = ray_start_cluster
|
||||
|
||||
res_name = "test_res"
|
||||
res_capacity = 1.0
|
||||
num_nodes = 5
|
||||
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node()
|
||||
|
||||
ray.init(redis_address=cluster.redis_address)
|
||||
|
||||
clientids = [
|
||||
client["ClientID"] for client in ray.global_state.client_table()
|
||||
]
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, resource_capacity, client_id=res_client_id)
|
||||
|
||||
# Create the resource on node1
|
||||
target_clientid = clientids[1]
|
||||
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
|
||||
|
||||
# Define a task which requires this resource
|
||||
@ray.remote(resources={res_name: res_capacity})
|
||||
def test_func():
|
||||
return 1
|
||||
|
||||
result = test_func.remote()
|
||||
successful, unsuccessful = ray.wait([result], timeout=5)
|
||||
assert successful # The task completed
|
||||
|
||||
|
||||
def test_dynamic_res_deletion_scheduler_consistency(ray_start_cluster):
|
||||
# This makes sure the resource is actually deleted and the state is
|
||||
# consistent in the scheduler by launching an infeasible task which
|
||||
# requests the created resource
|
||||
cluster = ray_start_cluster
|
||||
|
||||
res_name = "test_res"
|
||||
res_capacity = 1.0
|
||||
num_nodes = 5
|
||||
TIMEOUT_DURATION = 1
|
||||
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node()
|
||||
|
||||
ray.init(redis_address=cluster.redis_address)
|
||||
|
||||
clientids = [
|
||||
client["ClientID"] for client in ray.global_state.client_table()
|
||||
]
|
||||
|
||||
@ray.remote
|
||||
def delete_res(resource_name, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, 0, client_id=res_client_id)
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, resource_capacity, client_id=res_client_id)
|
||||
|
||||
# Create the resource on node1
|
||||
target_clientid = clientids[1]
|
||||
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
|
||||
assert ray.global_state.cluster_resources()[res_name] == res_capacity
|
||||
|
||||
# Delete the resource
|
||||
ray.get(delete_res.remote(res_name, target_clientid))
|
||||
|
||||
# Define a task which requires this resource. This should not run
|
||||
@ray.remote(resources={res_name: res_capacity})
|
||||
def test_func():
|
||||
return 1
|
||||
|
||||
result = test_func.remote()
|
||||
successful, unsuccessful = ray.wait([result], timeout=TIMEOUT_DURATION)
|
||||
assert unsuccessful # The task did not complete because it's infeasible
|
||||
|
||||
|
||||
def test_dynamic_res_concurrent_res_increment(ray_start_cluster):
|
||||
# This test makes sure resource capacity is updated (increment) correctly
|
||||
# when a task has already acquired some of the resource.
|
||||
|
||||
cluster = ray_start_cluster
|
||||
|
||||
res_name = "test_res"
|
||||
res_capacity = 5
|
||||
updated_capacity = 10
|
||||
num_nodes = 5
|
||||
TIMEOUT_DURATION = 1
|
||||
|
||||
# Create a object ID to have the task wait on
|
||||
WAIT_OBJECT_ID_STR = ("a" * 20).encode("ascii")
|
||||
|
||||
# Create a object ID to signal that the task is running
|
||||
TASK_RUNNING_OBJECT_ID_STR = ("b" * 20).encode("ascii")
|
||||
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node()
|
||||
|
||||
ray.init(redis_address=cluster.redis_address)
|
||||
|
||||
clientids = [
|
||||
client["ClientID"] for client in ray.global_state.client_table()
|
||||
]
|
||||
target_clientid = clientids[1]
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, resource_capacity, client_id=res_client_id)
|
||||
|
||||
# Create the resource on node 1
|
||||
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
|
||||
assert ray.global_state.cluster_resources()[res_name] == res_capacity
|
||||
|
||||
# Task to hold the resource till the driver signals to finish
|
||||
@ray.remote
|
||||
def wait_func(running_oid, wait_oid):
|
||||
# Signal that the task is running
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
|
||||
# Make the task wait till signalled by driver
|
||||
ray.get(ray.ObjectID(wait_oid))
|
||||
|
||||
@ray.remote
|
||||
def test_func():
|
||||
return 1
|
||||
|
||||
# Launch the task with resource requirement of 4, thus the new available
|
||||
# capacity becomes 1
|
||||
task = wait_func._remote(
|
||||
args=[TASK_RUNNING_OBJECT_ID_STR, WAIT_OBJECT_ID_STR],
|
||||
resources={res_name: 4})
|
||||
# Wait till wait_func is launched before updating resource
|
||||
ray.get(ray.ObjectID(TASK_RUNNING_OBJECT_ID_STR))
|
||||
|
||||
# Update the resource capacity
|
||||
ray.get(set_res.remote(res_name, updated_capacity, target_clientid))
|
||||
|
||||
# Signal task to complete
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
|
||||
ray.get(task)
|
||||
|
||||
# Check if scheduler state is consistent by launching a task requiring
|
||||
# updated capacity
|
||||
task_2 = test_func._remote(args=[], resources={res_name: updated_capacity})
|
||||
successful, unsuccessful = ray.wait([task_2], timeout=TIMEOUT_DURATION)
|
||||
assert successful # The task completed
|
||||
|
||||
# Check if scheduler state is consistent by launching a task requiring
|
||||
# updated capacity + 1. This should not execute
|
||||
task_3 = test_func._remote(
|
||||
args=[], resources={res_name: updated_capacity + 1
|
||||
}) # This should be infeasible
|
||||
successful, unsuccessful = ray.wait([task_3], timeout=TIMEOUT_DURATION)
|
||||
assert unsuccessful # The task did not complete because it's infeasible
|
||||
assert ray.global_state.available_resources()[res_name] == updated_capacity
|
||||
|
||||
|
||||
def test_dynamic_res_concurrent_res_decrement(ray_start_cluster):
|
||||
# This test makes sure resource capacity is updated (decremented)
|
||||
# correctly when a task has already acquired some
|
||||
# of the resource.
|
||||
|
||||
cluster = ray_start_cluster
|
||||
|
||||
res_name = "test_res"
|
||||
res_capacity = 5
|
||||
updated_capacity = 2
|
||||
num_nodes = 5
|
||||
TIMEOUT_DURATION = 1
|
||||
|
||||
# Create a object ID to have the task wait on
|
||||
WAIT_OBJECT_ID_STR = ("a" * 20).encode("ascii")
|
||||
|
||||
# Create a object ID to signal that the task is running
|
||||
TASK_RUNNING_OBJECT_ID_STR = ("b" * 20).encode("ascii")
|
||||
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node()
|
||||
|
||||
ray.init(redis_address=cluster.redis_address)
|
||||
|
||||
clientids = [
|
||||
client["ClientID"] for client in ray.global_state.client_table()
|
||||
]
|
||||
target_clientid = clientids[1]
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, resource_capacity, client_id=res_client_id)
|
||||
|
||||
# Create the resource on node 1
|
||||
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
|
||||
assert ray.global_state.cluster_resources()[res_name] == res_capacity
|
||||
|
||||
# Task to hold the resource till the driver signals to finish
|
||||
@ray.remote
|
||||
def wait_func(running_oid, wait_oid):
|
||||
# Signal that the task is running
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
|
||||
# Make the task wait till signalled by driver
|
||||
ray.get(ray.ObjectID(wait_oid))
|
||||
|
||||
@ray.remote
|
||||
def test_func():
|
||||
return 1
|
||||
|
||||
# Launch the task with resource requirement of 4, thus the new available
|
||||
# capacity becomes 1
|
||||
task = wait_func._remote(
|
||||
args=[TASK_RUNNING_OBJECT_ID_STR, WAIT_OBJECT_ID_STR],
|
||||
resources={res_name: 4})
|
||||
# Wait till wait_func is launched before updating resource
|
||||
ray.get(ray.ObjectID(TASK_RUNNING_OBJECT_ID_STR))
|
||||
|
||||
# Decrease the resource capacity
|
||||
ray.get(set_res.remote(res_name, updated_capacity, target_clientid))
|
||||
|
||||
# Signal task to complete
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
|
||||
ray.get(task)
|
||||
|
||||
# Check if scheduler state is consistent by launching a task requiring
|
||||
# updated capacity
|
||||
task_2 = test_func._remote(args=[], resources={res_name: updated_capacity})
|
||||
successful, unsuccessful = ray.wait([task_2], timeout=TIMEOUT_DURATION)
|
||||
assert successful # The task completed
|
||||
|
||||
# Check if scheduler state is consistent by launching a task requiring
|
||||
# updated capacity + 1. This should not execute
|
||||
task_3 = test_func._remote(
|
||||
args=[], resources={res_name: updated_capacity + 1
|
||||
}) # This should be infeasible
|
||||
successful, unsuccessful = ray.wait([task_3], timeout=TIMEOUT_DURATION)
|
||||
assert unsuccessful # The task did not complete because it's infeasible
|
||||
assert ray.global_state.available_resources()[res_name] == updated_capacity
|
||||
|
||||
|
||||
def test_dynamic_res_concurrent_res_delete(ray_start_cluster):
|
||||
# This test makes sure resource gets deleted correctly when a task has
|
||||
# already acquired the resource
|
||||
|
||||
cluster = ray_start_cluster
|
||||
|
||||
res_name = "test_res"
|
||||
res_capacity = 5
|
||||
num_nodes = 5
|
||||
TIMEOUT_DURATION = 1
|
||||
|
||||
# Create a object ID to have the task wait on
|
||||
WAIT_OBJECT_ID_STR = ("a" * 20).encode("ascii")
|
||||
|
||||
# Create a object ID to signal that the task is running
|
||||
TASK_RUNNING_OBJECT_ID_STR = ("b" * 20).encode("ascii")
|
||||
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node()
|
||||
|
||||
ray.init(redis_address=cluster.redis_address)
|
||||
|
||||
clientids = [
|
||||
client["ClientID"] for client in ray.global_state.client_table()
|
||||
]
|
||||
target_clientid = clientids[1]
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, resource_capacity, client_id=res_client_id)
|
||||
|
||||
@ray.remote
|
||||
def delete_res(resource_name, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, 0, client_id=res_client_id)
|
||||
|
||||
# Create the resource on node 1
|
||||
ray.get(set_res.remote(res_name, res_capacity, target_clientid))
|
||||
assert ray.global_state.cluster_resources()[res_name] == res_capacity
|
||||
|
||||
# Task to hold the resource till the driver signals to finish
|
||||
@ray.remote
|
||||
def wait_func(running_oid, wait_oid):
|
||||
# Signal that the task is running
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
|
||||
# Make the task wait till signalled by driver
|
||||
ray.get(ray.ObjectID(wait_oid))
|
||||
|
||||
@ray.remote
|
||||
def test_func():
|
||||
return 1
|
||||
|
||||
# Launch the task with resource requirement of 4, thus the new available
|
||||
# capacity becomes 1
|
||||
task = wait_func._remote(
|
||||
args=[TASK_RUNNING_OBJECT_ID_STR, WAIT_OBJECT_ID_STR],
|
||||
resources={res_name: 4})
|
||||
# Wait till wait_func is launched before updating resource
|
||||
ray.get(ray.ObjectID(TASK_RUNNING_OBJECT_ID_STR))
|
||||
|
||||
# Delete the resource
|
||||
ray.get(delete_res.remote(res_name, target_clientid))
|
||||
|
||||
# Signal task to complete
|
||||
ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
|
||||
ray.get(task)
|
||||
|
||||
# Check if scheduler state is consistent by launching a task requiring
|
||||
# the deleted resource This should not execute
|
||||
task_2 = test_func._remote(
|
||||
args=[], resources={res_name: 1}) # This should be infeasible
|
||||
successful, unsuccessful = ray.wait([task_2], timeout=TIMEOUT_DURATION)
|
||||
assert unsuccessful # The task did not complete because it's infeasible
|
||||
assert res_name not in ray.global_state.available_resources()
|
||||
|
||||
|
||||
def test_dynamic_res_creation_stress(ray_start_cluster):
|
||||
# This stress tests creates many resources simultaneously on the same
|
||||
# client and then checks if the final state is consistent
|
||||
|
||||
cluster = ray_start_cluster
|
||||
|
||||
TIMEOUT = 5
|
||||
res_capacity = 1
|
||||
num_nodes = 5
|
||||
NUM_RES_TO_CREATE = 500
|
||||
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node()
|
||||
|
||||
ray.init(redis_address=cluster.redis_address)
|
||||
|
||||
clientids = [
|
||||
client["ClientID"] for client in ray.global_state.client_table()
|
||||
]
|
||||
target_clientid = clientids[1]
|
||||
|
||||
@ray.remote
|
||||
def set_res(resource_name, resource_capacity, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, resource_capacity, client_id=res_client_id)
|
||||
|
||||
@ray.remote
|
||||
def delete_res(resource_name, res_client_id):
|
||||
ray.experimental.set_resource(
|
||||
resource_name, 0, client_id=res_client_id)
|
||||
|
||||
results = [
|
||||
set_res.remote(str(i), res_capacity, target_clientid)
|
||||
for i in range(0, NUM_RES_TO_CREATE)
|
||||
]
|
||||
ray.get(results)
|
||||
|
||||
success = False
|
||||
start_time = time.time()
|
||||
|
||||
while time.time() - start_time < TIMEOUT and not success:
|
||||
resources = ray.global_state.cluster_resources()
|
||||
all_resources_created = []
|
||||
for i in range(0, NUM_RES_TO_CREATE):
|
||||
all_resources_created.append(str(i) in resources)
|
||||
success = all(all_resources_created)
|
||||
assert success
|
||||
Reference in New Issue
Block a user