[Core] Multi-tenancy: Worker capping (#10500)

This commit is contained in:
Kai Yang
2020-09-04 20:34:06 +08:00
committed by GitHub
parent 2a7f56e429
commit 5f5160ead9
8 changed files with 188 additions and 16 deletions
+1 -1
View File
@@ -45,6 +45,7 @@ py_test_module_list(
"test_memory_scheduling.py",
"test_metrics.py",
"test_multi_node_2.py",
"test_multi_tenancy.py",
"test_multinode_failures_2.py",
"test_multinode_failures.py",
"test_multi_node.py",
@@ -85,7 +86,6 @@ py_test_module_list(
"test_metrics_agent.py",
"test_microbenchmarks.py",
"test_mini.py",
"test_multi_tenancy.py",
"test_node_manager.py",
"test_numba.py",
"test_ray_init.py",
+99 -11
View File
@@ -1,6 +1,7 @@
# coding: utf-8
import os
import sys
import time
import grpc
import pytest
@@ -11,6 +12,19 @@ from ray.core.generated import node_manager_pb2, node_manager_pb2_grpc
from ray.test_utils import wait_for_condition, run_string_as_driver_nonblocking
def get_num_workers():
    """Return how many non-driver workers the first raylet reports.

    Looks up the first node returned by ``ray.nodes()``, queries that
    node's NodeManager service over gRPC for its node stats, and counts
    the entries in ``workers_stats`` whose ``is_driver`` flag is false.

    Returns:
        int: The number of worker entries that are not drivers.
    """
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    # This helper is polled repeatedly through `wait_for_condition`, so the
    # channel is scoped with `with` to close it on every call instead of
    # leaking one open channel per poll.
    with grpc.insecure_channel(raylet_address) as channel:
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        reply = stub.GetNodeStats(node_manager_pb2.GetNodeStatsRequest())
        # Count directly; no need to materialize a list just to take len().
        return sum(1 for worker in reply.workers_stats
                   if not worker.is_driver)
# Test that when `redis_address` and `job_config` are not set in
# `ray.init(...)`, Raylet will start `num_cpus` Python workers for the driver.
def test_initial_workers(shutdown_only):
@@ -19,17 +33,7 @@ def test_initial_workers(shutdown_only):
num_cpus=1,
include_dashboard=True,
_system_config={"enable_multi_tenancy": True})
raylet = ray.nodes()[0]
raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
raylet["NodeManagerPort"])
channel = grpc.insecure_channel(raylet_address)
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
wait_for_condition(lambda: len([
worker for worker in stub.GetNodeStats(
node_manager_pb2.GetNodeStatsRequest()).workers_stats
if not worker.is_driver
]) == 1,
timeout=10)
wait_for_condition(lambda: get_num_workers() == 1)
# This test case starts some driver processes. Each driver process submits
@@ -123,5 +127,89 @@ def test_worker_env(shutdown_only):
assert ray.get(get_env.remote("foo2")) == "bar2"
def test_worker_capping_kill_idle_workers(shutdown_only):
    """Check that workers which become idle after a task are killed.

    One actor worker is kept alive for the whole test; two extra workers
    are started for normal tasks and are expected to disappear once their
    tasks have finished.
    """

    def worker_count_is(expected):
        # Build a zero-argument predicate for `wait_for_condition`.
        return lambda: get_num_workers() == expected

    # num_cpus=0 keeps the raylet from starting any initial workers.
    ray.init(num_cpus=0, _system_config={"enable_multi_tenancy": True})
    assert get_num_workers() == 0

    @ray.remote(num_cpus=0)
    class PingActor:
        def ping(self):
            return None

    # The handle is held until the end of the test.
    ping_actor = PingActor.remote()
    ray.get(ping_actor.ping.remote())
    # Worker 1 hosts the actor, which is alive once `ping` has returned.
    assert get_num_workers() == 1

    @ray.remote(num_cpus=0)
    def slow_task():
        # Keep the worker busy for a while.
        time.sleep(10)

    # Worker 2 picks up the first normal task.
    first_ref = slow_task.remote()
    wait_for_condition(worker_count_is(2))

    # Worker 3 picks up the second normal task.
    second_ref = slow_task.remote()
    wait_for_condition(worker_count_is(3))

    # Once the first task is done, worker 2 is idle and should be killed.
    ray.get(first_ref)
    wait_for_condition(worker_count_is(2))

    # Once the second task is done, worker 3 is idle and should be killed.
    ray.get(second_ref)
    wait_for_condition(worker_count_is(1))
def test_worker_capping_run_many_small_tasks(shutdown_only):
    """Check worker capping when many fractional-CPU tasks run at once.

    Four tasks of 0.5 CPU each are submitted on a 2-CPU node. Four workers
    are expected while they run, and two once they have finished.
    """
    ray.init(num_cpus=2, _system_config={"enable_multi_tenancy": True})

    @ray.remote(num_cpus=0.5)
    def half_cpu_task():
        time.sleep(5)

    def has_four_workers():
        return get_num_workers() == 4

    def has_two_workers():
        return get_num_workers() == 2

    # The number of tasks exceeds `num_cpus`, yet their combined CPU
    # requirement (4 * 0.5) still fits within `num_cpus`.
    pending_refs = [half_cpu_task.remote() for _ in range(4)]
    wait_for_condition(has_four_workers)

    ray.get(pending_refs)
    # Once the tasks have finished, some workers are killed so that the
    # total number of workers is <= num_cpus.
    wait_for_condition(has_two_workers)

    time.sleep(1)
    # The two remaining workers are expected to stay alive.
    assert get_num_workers() == 2
def test_worker_capping_run_chained_tasks(shutdown_only):
    """Check worker capping when tasks are chained through nested calls.

    A task of depth 4 submits a task of depth 3 and waits for it, and so
    on down to depth 1. Four workers are expected while the chain runs,
    and two once it has finished.
    """
    ray.init(num_cpus=2, _system_config={"enable_multi_tenancy": True})

    @ray.remote(num_cpus=0.5)
    def chained_task(depth):
        if not depth > 1:
            # Innermost task: hold its worker for a while before returning.
            time.sleep(5)
            return depth
        # Outer task: wait on the next task in the chain, then add its own
        # depth to the result.
        inner_result = ray.get(chained_task.remote(depth - 1))
        return inner_result + depth

    def workers_equal(expected):
        # Build a zero-argument predicate for `wait_for_condition`.
        return lambda: get_num_workers() == expected

    # The chain contains more tasks than `num_cpus`, yet the CPU resource
    # requirement is still within `num_cpus`.
    chain_ref = chained_task.remote(4)
    wait_for_condition(workers_equal(4))

    ray.get(chain_ref)
    # Once the tasks have finished, some workers are killed so that the
    # total number of workers is <= num_cpus.
    wait_for_condition(workers_equal(2))

    time.sleep(1)
    # The two remaining workers are expected to stay alive.
    assert get_num_workers() == 2
if __name__ == "__main__":
    # When executed directly, run this file's tests verbosely with pytest
    # and use pytest's return value as the process exit status.
    pytest_args = ["-v", __file__]
    sys.exit(pytest.main(pytest_args))