Files
ray/python/ray/tests/test_actor_resources.py
T
Amog Kamsetty 1d3941e41a [Tests] Skip failing windows tests (#13495)
* skip failing windows tests

* skip more

* remove

* updates
2021-01-15 20:51:33 -08:00

643 lines
21 KiB
Python

import collections
import os
import pytest
try:
import pytest_timeout
except ImportError:
pytest_timeout = None
import sys
import time
import ray
import ray.test_utils
import ray.cluster_utils
def test_actor_deletion_with_gpus(shutdown_only):
    """Check that a GPU is handed back once the actor holding it exits.

    Ray is started with a single GPU, and a GPU actor is created five times
    in a row; each creation can only succeed if the GPU is available again.
    """
    store_bytes = int(150 * 1024 * 1024)
    ray.init(num_cpus=1, num_gpus=1, object_store_memory=store_bytes)

    @ray.remote(num_gpus=1)
    class Actor:
        def getpid(self):
            return os.getpid()

    attempts = 5
    for _ in range(attempts):
        # Rebinding ``handle`` drops the reference to the previous actor.
        # Getting a result back from the new actor shows that it could be
        # created, i.e. that enough GPU resources were available.
        handle = Actor.remote()
        ray.get(handle.getpid.remote())
def test_actor_state(ray_start_regular):
    """Check that every actor instance keeps its own independent state.

    Two counters are created and incremented a different number of times;
    each must report only its own increments.
    """

    @ray.remote
    class Counter:
        def __init__(self):
            # Kept under a private name so that the instance attribute does
            # not shadow the ``value`` method defined below.
            self._value = 0

        def increase(self):
            self._value += 1

        def value(self):
            return self._value

    c1 = Counter.remote()
    c1.increase.remote()
    assert ray.get(c1.value.remote()) == 1

    c2 = Counter.remote()
    c2.increase.remote()
    c2.increase.remote()
    assert ray.get(c2.value.remote()) == 2
def test_actor_class_methods(ray_start_regular):
    """Check that classmethods remain callable through an actor handle."""

    class Foo:
        x = 2

        @classmethod
        def as_remote(cls):
            return ray.remote(cls)

        @classmethod
        def f(cls):
            return cls.x

        @classmethod
        def g(cls, y):
            return cls.x + y

        def echo(self, value):
            return value

    # Wrap the plain class via its own classmethod, then instantiate it.
    remote_foo = Foo.as_remote()
    handle = remote_foo.remote()

    echoed = ray.get(handle.echo.remote(2))
    assert echoed == 2
    class_attribute = ray.get(handle.f.remote())
    assert class_attribute == 2
    attribute_plus_arg = ray.get(handle.g.remote(2))
    assert attribute_plus_arg == 4
@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.")
def test_actor_gpus(ray_start_cluster):
    """Check that single-GPU actors each receive a distinct GPU.

    One actor is created per GPU in a three-node cluster; afterwards every
    (node, GPU) pair must be taken exactly once and a further actor must not
    become ready.
    """
    cluster = ray_start_cluster
    num_nodes = 3
    num_gpus_per_raylet = 4
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
    ray.init(address=cluster.address)

    @ray.remote(num_gpus=1)
    class Actor1:
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            assert ray.get_gpu_ids() == self.gpu_ids
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # One actor for every GPU in the cluster.
    total_gpus = num_nodes * num_gpus_per_raylet
    actors = [Actor1.remote() for _ in range(total_gpus)]

    # No two actors may have been assigned the same GPU.
    locations_and_ids = ray.get(
        [handle.get_location_and_ids.remote() for handle in actors])
    node_names = {location for location, _ in locations_and_ids}
    assert len(node_names) == num_nodes
    expected_assignments = {(node_name, (gpu_id, ))
                            for node_name in node_names
                            for gpu_id in range(num_gpus_per_raylet)}
    assert set(locations_and_ids) == expected_assignments

    # With every GPU in use, one more actor must not become ready.
    extra_actor = Actor1.remote()
    pending_ref = extra_actor.get_location_and_ids.remote()
    ready_ids, _ = ray.wait([pending_ref], timeout=0.01)
    assert ready_ids == []
def test_actor_multiple_gpus(ray_start_cluster):
    """Check GPU assignment for two-GPU actors followed by one-GPU actors.

    Each of the three nodes has five GPUs. Two two-GPU actors per node use
    four of them, after which one single-GPU actor per node takes the last.
    """
    cluster = ray_start_cluster
    num_nodes = 3
    num_gpus_per_raylet = 5
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
    ray.init(address=cluster.address)

    @ray.remote(num_gpus=2)
    class Actor1:
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            assert ray.get_gpu_ids() == self.gpu_ids
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # Two double-GPU actors per node.
    actors1 = [Actor1.remote() for _ in range(num_nodes * 2)]

    # No two actors may have been assigned the same GPU.
    locations_and_ids = ray.get(
        [handle.get_location_and_ids.remote() for handle in actors1])
    node_names = {location for location, _ in locations_and_ids}
    assert len(node_names) == num_nodes

    # Record, per node, every GPU ID handed out so far.
    gpus_in_use = {name: [] for name in node_names}
    for location, gpu_ids in locations_and_ids:
        gpus_in_use[location].extend(gpu_ids)
    for name in node_names:
        distinct_ids = set(gpus_in_use[name])
        assert len(distinct_ids) == 4

    # Another double-GPU actor must not become ready.
    blocked_actor = Actor1.remote()
    blocked_ref = blocked_actor.get_location_and_ids.remote()
    ready_ids, _ = ray.wait([blocked_ref], timeout=0.01)
    assert ready_ids == []

    # Actors that need only a single GPU should still be creatable.
    @ray.remote(num_gpus=1)
    class Actor2:
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # One single-GPU actor per node.
    actors2 = [Actor2.remote() for _ in range(num_nodes)]

    # Again, no two actors may share a GPU.
    locations_and_ids = ray.get(
        [handle.get_location_and_ids.remote() for handle in actors2])
    names = {location for location, _ in locations_and_ids}
    assert node_names == names
    for location, gpu_ids in locations_and_ids:
        gpus_in_use[location].extend(gpu_ids)
    for name in node_names:
        node_gpu_ids = gpus_in_use[name]
        assert len(node_gpu_ids) == 5
        assert set(node_gpu_ids) == set(range(5))

    # Now every GPU is in use, so one more actor must not become ready.
    blocked_actor = Actor2.remote()
    blocked_ref = blocked_actor.get_location_and_ids.remote()
    ready_ids, _ = ray.wait([blocked_ref], timeout=0.01)
    assert ready_ids == []
def test_actor_different_numbers_of_gpus(ray_start_cluster):
    """Check actor placement across nodes with differing GPU counts.

    The cluster has three nodes with 0, 5 and 10 GPUs. Fifteen single-GPU
    actors must land on exactly two nodes and use every GPU there once.
    """
    cluster = ray_start_cluster
    gpus_per_node = (0, 5, 10)
    for gpu_count in gpus_per_node:
        cluster.add_node(num_cpus=10, num_gpus=gpu_count)
    ray.init(address=cluster.address)

    @ray.remote(num_gpus=1)
    class Actor1:
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # One actor for every GPU in the cluster.
    actors = [Actor1.remote() for _ in range(sum(gpus_per_node))]

    # No two actors may have been assigned the same GPU.
    locations_and_ids = ray.get(
        [handle.get_location_and_ids.remote() for handle in actors])
    node_names = {location for location, _ in locations_and_ids}
    assert len(node_names) == 2
    for name in node_names:
        node_gpu_ids = [
            gpu_id for location, gpu_id in locations_and_ids
            if location == name
        ]
        gpu_total = len(node_gpu_ids)
        assert gpu_total in [5, 10]
        # Each entry is a one-element tuple holding the assigned GPU ID.
        assert set(node_gpu_ids) == {(i, ) for i in range(gpu_total)}

    # With every GPU in use, one more actor must not become ready.
    extra_actor = Actor1.remote()
    pending_ref = extra_actor.get_location_and_ids.remote()
    ready_ids, _ = ray.wait([pending_ref], timeout=0.01)
    assert ready_ids == []
@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.")
def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
    """Check GPU assignment when actors are created from within tasks.

    Five remote tasks each create five single-GPU actors on a five-node
    cluster with five GPUs per node; together they must occupy every GPU.
    """
    cluster = ray_start_cluster
    num_nodes = 5
    num_gpus_per_raylet = 5
    for node_index in range(num_nodes):
        # Only the first node added is given the custom system config.
        if node_index == 0:
            system_config = {"num_heartbeats_timeout": 1000}
        else:
            system_config = {}
        cluster.add_node(
            num_cpus=10 * num_gpus_per_raylet,
            num_gpus=num_gpus_per_raylet,
            _system_config=system_config)
    ray.init(address=cluster.address)

    @ray.remote
    def create_actors(i, n):
        @ray.remote(num_gpus=1)
        class Actor:
            def __init__(self, i, j):
                self.gpu_ids = ray.get_gpu_ids()

            def get_location_and_ids(self):
                return ((ray.worker.global_worker.node.unique_id),
                        tuple(self.gpu_ids))

            def sleep(self):
                time.sleep(100)

        # Create n actors.
        handles = [Actor.remote(i, j) for j in range(n)]
        placements = ray.get(
            [handle.get_location_and_ids.remote() for handle in handles])

        # Put each actor to sleep for a long time to prevent them from
        # getting terminated.
        for handle in handles:
            handle.sleep.remote()

        return placements

    all_locations = ray.get([
        create_actors.remote(task_index, num_gpus_per_raylet)
        for task_index in range(num_nodes)
    ])

    # Flatten the per-task results into a single list of placements.
    placements = [
        placement for locations in all_locations for placement in locations
    ]

    # Every node must have received some actors.
    node_names = {location for location, _ in placements}
    assert len(node_names) == num_nodes

    # Record, per node, every GPU ID handed out, and check that no two
    # actors on a node share a GPU.
    gpus_in_use = {name: [] for name in node_names}
    for location, gpu_ids in placements:
        gpus_in_use[location].extend(gpu_ids)
    for name in node_names:
        assert len(set(gpus_in_use[name])) == num_gpus_per_raylet

    @ray.remote(num_gpus=1)
    class Actor:
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # All the GPUs should be used up now.
    extra_actor = Actor.remote()
    pending_ref = extra_actor.get_location_and_ids.remote()
    ready_ids, _ = ray.wait([pending_ref], timeout=0.01)
    assert ready_ids == []
def test_actors_and_tasks_with_gpus(ray_start_cluster):
    """Check that GPU tasks and GPU actors never use one GPU concurrently.

    Tasks record the time interval during which they held their GPUs. For
    every (node, GPU) pair those intervals must not overlap, and a GPU held
    by an actor must not be given to any task.
    """
    cluster = ray_start_cluster
    num_nodes = 3
    num_gpus_per_raylet = 2
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
    ray.init(address=cluster.address)

    def check_intervals_non_overlapping(list_of_intervals):
        # Compare each interval with every interval that precedes it.
        for index, first_interval in enumerate(list_of_intervals):
            for second_interval in list_of_intervals[:index]:
                first_start, first_end = first_interval
                second_start, second_end = second_interval
                # Both intervals must be well formed before comparing them.
                assert first_start < first_end
                assert second_start < second_end
                intervals_nonoverlapping = (first_end <= second_start
                                            or second_end <= first_start)
                assert intervals_nonoverlapping, (
                    "Intervals {} and {} are overlapping.".format(
                        first_interval, second_interval))

    @ray.remote(num_gpus=1)
    def f1():
        started = time.monotonic()
        time.sleep(0.1)
        finished = time.monotonic()
        gpu_ids = ray.get_gpu_ids()
        assert len(gpu_ids) == 1
        assert gpu_ids[0] in range(num_gpus_per_raylet)
        node_id = ray.worker.global_worker.node.unique_id
        return (node_id, tuple(gpu_ids), [started, finished])

    @ray.remote(num_gpus=2)
    def f2():
        started = time.monotonic()
        time.sleep(0.1)
        finished = time.monotonic()
        gpu_ids = ray.get_gpu_ids()
        assert len(gpu_ids) == 2
        for gpu_id in gpu_ids:
            assert gpu_id in range(num_gpus_per_raylet)
        node_id = ray.worker.global_worker.node.unique_id
        return (node_id, tuple(gpu_ids), [started, finished])

    @ray.remote(num_gpus=1)
    class Actor1:
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()
            assert len(self.gpu_ids) == 1
            assert self.gpu_ids[0] in range(num_gpus_per_raylet)

        def get_location_and_ids(self):
            assert ray.get_gpu_ids() == self.gpu_ids
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    def locations_to_intervals_for_many_tasks():
        # Launch a bunch of GPU tasks: single-GPU, double-GPU, single-GPU.
        batch_size = 5 * num_nodes * num_gpus_per_raylet
        task_refs = [f1.remote() for _ in range(batch_size)]
        task_refs += [f2.remote() for _ in range(batch_size)]
        task_refs += [f1.remote() for _ in range(batch_size)]

        # Group the recorded intervals by the (node, GPU) pair used.
        intervals_by_gpu = collections.defaultdict(list)
        for location, gpu_ids, interval in ray.get(task_refs):
            for gpu_id in gpu_ids:
                intervals_by_gpu[(location, gpu_id)].append(interval)
        return intervals_by_gpu

    # Run a bunch of GPU tasks and verify, for each GPU, that the tasks
    # which used it did not overlap in time.
    locations_to_intervals = locations_to_intervals_for_many_tasks()
    for intervals in locations_to_intervals.values():
        check_intervals_non_overlapping(intervals)

    # Create an actor that uses a GPU.
    a = Actor1.remote()
    actor_node, actor_gpu_ids = ray.get(a.get_location_and_ids.remote())
    actor_location = (actor_node, actor_gpu_ids[0])
    # This check makes sure that actor_location is formatted the same way
    # that the keys of locations_to_intervals are formatted.
    assert actor_location in locations_to_intervals

    # Run the tasks again; the per-GPU intervals must still be disjoint.
    locations_to_intervals = locations_to_intervals_for_many_tasks()
    for intervals in locations_to_intervals.values():
        check_intervals_non_overlapping(intervals)
    # Make sure that the actor's GPU was not used.
    assert actor_location not in locations_to_intervals

    # Create more actors to fill up all the GPUs.
    remaining_gpus = num_nodes * num_gpus_per_raylet - 1
    more_actors = [Actor1.remote() for _ in range(remaining_gpus)]
    # Wait for the actors to finish being created.
    ray.get([handle.get_location_and_ids.remote() for handle in more_actors])

    # Now if we run some GPU tasks, they should not be scheduled.
    results = [f1.remote() for _ in range(30)]
    ready_ids, remaining_ids = ray.wait(results, timeout=1.0)
    assert len(ready_ids) == 0
def test_actors_and_tasks_with_gpus_version_two(shutdown_only):
    """Check that concurrent GPU tasks and GPU actors get different GPUs.

    Half of the GPUs are claimed by long-running tasks and the other half by
    actors. Each of them reports its GPU ID to a recorder actor, and the
    reported IDs must cover every GPU exactly.
    """
    num_gpus = 4
    ray.init(
        num_cpus=(num_gpus + 1),
        num_gpus=num_gpus,
        object_store_memory=int(150 * 1024 * 1024))

    # The point of this actor is to record which GPU IDs have been seen. We
    # can't just return them from the tasks, because the tasks don't return
    # for a long time in order to make sure the GPU is not released
    # prematurely.
    @ray.remote
    class RecordGPUs:
        def __init__(self):
            self.gpu_ids_seen = []
            self.num_calls = 0

        def add_ids(self, gpu_ids):
            self.gpu_ids_seen += gpu_ids
            self.num_calls += 1

        def get_gpu_ids_and_calls(self):
            return self.gpu_ids_seen, self.num_calls

    @ray.remote(num_gpus=1)
    def f(record_gpu_actor):
        gpu_ids = ray.get_gpu_ids()
        assert len(gpu_ids) == 1
        record_gpu_actor.add_ids.remote(gpu_ids)
        # Sleep for a long time so that the GPU never gets released. This
        # task will be killed by ray.shutdown() before it actually finishes.
        time.sleep(1000)

    @ray.remote(num_gpus=1)
    class Actor:
        def __init__(self, record_gpu_actor):
            self.gpu_ids = ray.get_gpu_ids()
            assert len(self.gpu_ids) == 1
            record_gpu_actor.add_ids.remote(self.gpu_ids)

        def check_gpu_ids(self):
            assert ray.get_gpu_ids() == self.gpu_ids

    record_gpu_actor = RecordGPUs.remote()

    actors = []
    actor_results = []
    for _ in range(num_gpus // 2):
        f.remote(record_gpu_actor)
        a = Actor.remote(record_gpu_actor)
        actor_results.append(a.check_gpu_ids.remote())
        # Prevent the actor handle from going out of scope so that its GPU
        # resources don't get released.
        actors.append(a)

    # Make sure that the actor method calls succeeded.
    ray.get(actor_results)

    # Poll the recorder until every task and actor has reported, giving up
    # after 30 seconds. A monotonic clock keeps the deadline immune to
    # wall-clock adjustments, and the short sleep avoids flooding the
    # recorder actor with back-to-back requests.
    deadline = time.monotonic() + 30
    while True:
        seen_gpu_ids, num_calls = ray.get(
            record_gpu_actor.get_gpu_ids_and_calls.remote())
        if num_calls == num_gpus or time.monotonic() >= deadline:
            break
        time.sleep(0.1)

    # Fail with an explicit message if the deadline passed first.
    assert num_calls == num_gpus, (
        "Only {} of {} GPU users reported their GPU IDs in time.".format(
            num_calls, num_gpus))
    assert set(seen_gpu_ids) == set(range(num_gpus))
def test_blocking_actor_task(shutdown_only):
    """Check how resources are held while an actor method blocks.

    An actor declared without resource requirements can block on a GPU
    task, whereas actors declared with a CPU or a GPU keep holding it while
    blocked, so the nested task does not finish within the timeout.
    """
    store_bytes = int(150 * 1024 * 1024)
    ray.init(num_cpus=1, num_gpus=1, object_store_memory=store_bytes)

    @ray.remote(num_gpus=1)
    def f():
        return 1

    @ray.remote
    class Foo:
        def __init__(self):
            pass

        def blocking_method(self):
            ray.get(f.remote())

    # Make sure we can execute a blocking actor method even if there is
    # only one CPU.
    actor = Foo.remote()
    ray.get(actor.blocking_method.remote())

    @ray.remote(num_cpus=1)
    class CPUFoo:
        def __init__(self):
            pass

        def blocking_method(self):
            ray.get(f.remote())

    # Make sure that lifetime CPU resources are not released when actors
    # block.
    actor = CPUFoo.remote()
    x_id = actor.blocking_method.remote()
    finished, unfinished = ray.wait([x_id], timeout=1.0)
    assert finished == []
    assert unfinished == [x_id]

    @ray.remote(num_gpus=1)
    class GPUFoo:
        def __init__(self):
            pass

        def blocking_method(self):
            ray.get(f.remote())

    # Make sure that GPU resources are not released when actors block.
    actor = GPUFoo.remote()
    x_id = actor.blocking_method.remote()
    finished, unfinished = ray.wait([x_id], timeout=1.0)
    assert finished == []
    assert unfinished == [x_id]
def test_lifetime_and_transient_resources(ray_start_regular):
    """Contrast actors with and without an explicit CPU requirement.

    Ten actors declared without resources all answer a method call, while
    only one of two actors declared with ``num_cpus=1`` answers within the
    timeout.
    """

    # This actor acquires resources only when running methods.
    @ray.remote
    class Actor1:
        def method(self):
            pass

    # This actor acquires resources for its lifetime.
    @ray.remote(num_cpus=1)
    class Actor2:
        def method(self):
            pass

    transient_actors = [Actor1.remote() for _ in range(10)]
    ray.get([handle.method.remote() for handle in transient_actors])

    lifetime_actors = [Actor2.remote() for _ in range(2)]
    results = [handle.method.remote() for handle in lifetime_actors]
    # Ask for every result, but expect only one before the timeout expires.
    ready_ids, remaining_ids = ray.wait(
        results, num_returns=len(results), timeout=5.0)
    assert len(ready_ids) == 1
def test_custom_label_placement(ray_start_cluster):
    """Check that actors land on the node offering their custom resource."""
    cluster = ray_start_cluster
    custom_resource1_node = cluster.add_node(
        num_cpus=2, resources={"CustomResource1": 2})
    custom_resource2_node = cluster.add_node(
        num_cpus=2, resources={"CustomResource2": 2})
    ray.init(address=cluster.address)

    @ray.remote(resources={"CustomResource1": 1})
    class ResourceActor1:
        def get_location(self):
            return ray.worker.global_worker.node.unique_id

    @ray.remote(resources={"CustomResource2": 1})
    class ResourceActor2:
        def get_location(self):
            return ray.worker.global_worker.node.unique_id

    # Two actors for each custom resource.
    actors1 = [ResourceActor1.remote() for _ in range(2)]
    actors2 = [ResourceActor2.remote() for _ in range(2)]
    locations1 = ray.get([handle.get_location.remote() for handle in actors1])
    locations2 = ray.get([handle.get_location.remote() for handle in actors2])

    # Every actor must report the node that advertises its resource.
    expected_node1 = custom_resource1_node.unique_id
    expected_node2 = custom_resource2_node.unique_id
    assert all(location == expected_node1 for location in locations1)
    assert all(location == expected_node2 for location in locations2)
def test_creating_more_actors_than_resources(shutdown_only):
    """Check that a waiting actor is created once resources are freed.

    With two GPUs, a third GPU actor must wait until the first actor's
    handle is deleted. The same is then exercised for a custom resource of
    which only one unit exists.
    """
    ray.init(num_cpus=10, num_gpus=2, resources={"CustomResource1": 1})

    @ray.remote(num_gpus=1)
    class ResourceActor1:
        def method(self):
            return ray.get_gpu_ids()[0]

    @ray.remote(resources={"CustomResource1": 1})
    class ResourceActor2:
        def method(self):
            pass

    # Make sure the first two actors get created and the third one does
    # not.
    first_actor = ResourceActor1.remote()
    first_result = first_actor.method.remote()
    ray.wait([first_result])

    second_actor = ResourceActor1.remote()
    second_result = second_actor.method.remote()
    ray.wait([second_result])

    third_actor = ResourceActor1.remote()
    third_result = third_actor.method.remote()
    ready_ids, _ = ray.wait([third_result], timeout=0.2)
    assert len(ready_ids) == 0

    # By deleting the first handle, we free up resources to create the
    # third actor.
    del first_actor

    results = ray.get([first_result, second_result, third_result])
    # The third actor reports the same GPU ID as the first one did.
    assert results[0] == results[2]
    assert set(results) == {0, 1}

    # Make sure that when one actor goes out of scope a new actor is
    # created because some resources have been freed up.
    results = []
    for _ in range(3):
        # Rebinding ``actor`` drops the handle from the previous iteration.
        actor = ResourceActor2.remote()
        object_ref = actor.method.remote()
        results.append(object_ref)
        # Wait for the task to execute. We do this because otherwise it may
        # be possible for the __ray_terminate__ task to execute before the
        # method.
        ray.wait([object_ref])

    ray.get(results)
if __name__ == "__main__":
    # Allow running this test module directly as a script: hand the file to
    # pytest in verbose mode and exit with pytest's return code.
    import pytest

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)