mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 19:49:04 +08:00
[Core] Multi-tenancy: Worker capping (#10500)
This commit is contained in:
@@ -45,6 +45,7 @@ py_test_module_list(
|
||||
"test_memory_scheduling.py",
|
||||
"test_metrics.py",
|
||||
"test_multi_node_2.py",
|
||||
"test_multi_tenancy.py",
|
||||
"test_multinode_failures_2.py",
|
||||
"test_multinode_failures.py",
|
||||
"test_multi_node.py",
|
||||
@@ -85,7 +86,6 @@ py_test_module_list(
|
||||
"test_metrics_agent.py",
|
||||
"test_microbenchmarks.py",
|
||||
"test_mini.py",
|
||||
"test_multi_tenancy.py",
|
||||
"test_node_manager.py",
|
||||
"test_numba.py",
|
||||
"test_ray_init.py",
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# coding: utf-8
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import grpc
|
||||
import pytest
|
||||
@@ -11,6 +12,19 @@ from ray.core.generated import node_manager_pb2, node_manager_pb2_grpc
|
||||
from ray.test_utils import wait_for_condition, run_string_as_driver_nonblocking
|
||||
|
||||
|
||||
def get_num_workers():
    """Return the number of non-driver workers reported by the first raylet.

    Sends a ``GetNodeStats`` request to the node manager of
    ``ray.nodes()[0]`` over gRPC and counts the ``workers_stats`` entries
    whose ``is_driver`` flag is not set.
    """
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    # Tests poll this helper repeatedly through `wait_for_condition`, so the
    # channel is closed on exit instead of leaking one channel per call.
    with grpc.insecure_channel(raylet_address) as channel:
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        reply = stub.GetNodeStats(node_manager_pb2.GetNodeStatsRequest())
        return sum(1 for worker in reply.workers_stats
                   if not worker.is_driver)
|
||||
|
||||
|
||||
# Test that when `redis_address` and `job_config` are not set in
|
||||
# `ray.init(...)`, Raylet will start `num_cpus` Python workers for the driver.
|
||||
def test_initial_workers(shutdown_only):
|
||||
@@ -19,17 +33,7 @@ def test_initial_workers(shutdown_only):
|
||||
num_cpus=1,
|
||||
include_dashboard=True,
|
||||
_system_config={"enable_multi_tenancy": True})
|
||||
raylet = ray.nodes()[0]
|
||||
raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
|
||||
raylet["NodeManagerPort"])
|
||||
channel = grpc.insecure_channel(raylet_address)
|
||||
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
|
||||
wait_for_condition(lambda: len([
|
||||
worker for worker in stub.GetNodeStats(
|
||||
node_manager_pb2.GetNodeStatsRequest()).workers_stats
|
||||
if not worker.is_driver
|
||||
]) == 1,
|
||||
timeout=10)
|
||||
wait_for_condition(lambda: get_num_workers() == 1)
|
||||
|
||||
|
||||
# This test case starts some driver processes. Each driver process submits
|
||||
@@ -123,5 +127,89 @@ def test_worker_env(shutdown_only):
|
||||
assert ray.get(get_env.remote("foo2")) == "bar2"
|
||||
|
||||
|
||||
def test_worker_capping_kill_idle_workers(shutdown_only):
    """Check that workers which become idle after a normal task are killed.

    One actor worker plus two normal-task workers are started; as each
    normal task finishes, the worker count is expected to drop by one.
    """

    def worker_count_is(expected):
        # Build a zero-argument predicate suitable for `wait_for_condition`.
        return lambda: get_num_workers() == expected

    # num_cpus=0 avoids starting any initial workers.
    ray.init(num_cpus=0, _system_config={"enable_multi_tenancy": True})
    assert get_num_workers() == 0

    @ray.remote(num_cpus=0)
    class Actor:
        def ping(self):
            pass

    actor = Actor.remote()
    ray.get(actor.ping.remote())
    # The actor is alive now, and so is worker 1, which hosts it.
    assert get_num_workers() == 1

    @ray.remote(num_cpus=0)
    def foo():
        # Keep the worker busy for a while.
        time.sleep(10)

    # Worker 2 picks up the first normal task.
    first_ref = foo.remote()
    wait_for_condition(worker_count_is(2))

    # Worker 3 picks up the second normal task.
    second_ref = foo.remote()
    wait_for_condition(worker_count_is(3))

    # Once the first task is done, worker 2 is idle and should be killed.
    ray.get(first_ref)
    wait_for_condition(worker_count_is(2))

    # Once the second task is done, worker 3 is idle and should be killed.
    ray.get(second_ref)
    wait_for_condition(worker_count_is(1))
|
||||
|
||||
|
||||
def test_worker_capping_run_many_small_tasks(shutdown_only):
    """Check worker capping after running more small tasks than `num_cpus`.

    Four tasks needing 0.5 CPU each run on a 2-CPU node; afterwards the
    number of workers is expected to shrink back to 2 and stay there.
    """

    def worker_count_is(expected):
        # Build a zero-argument predicate suitable for `wait_for_condition`.
        return lambda: get_num_workers() == expected

    ray.init(num_cpus=2, _system_config={"enable_multi_tenancy": True})

    @ray.remote(num_cpus=0.5)
    def foo():
        time.sleep(5)

    # The number of tasks exceeds `num_cpus`, yet their combined CPU
    # requirement still fits within `num_cpus`.
    pending_refs = [foo.remote() for _ in range(4)]
    wait_for_condition(worker_count_is(4))

    ray.get(pending_refs)
    # With the tasks finished, surplus workers are killed so that the total
    # number of workers is <= num_cpus.
    wait_for_condition(worker_count_is(2))

    # The two remaining workers are expected to stay alive.
    time.sleep(1)
    assert get_num_workers() == 2
|
||||
|
||||
|
||||
def test_worker_capping_run_chained_tasks(shutdown_only):
    """Check worker capping after running a chain of nested tasks.

    A chain of four 0.5-CPU tasks runs on a 2-CPU node; afterwards the
    number of workers is expected to shrink back to 2 and stay there.
    """

    def worker_count_is(expected):
        # Build a zero-argument predicate suitable for `wait_for_condition`.
        return lambda: get_num_workers() == expected

    ray.init(num_cpus=2, _system_config={"enable_multi_tenancy": True})

    @ray.remote(num_cpus=0.5)
    def foo(x):
        # Innermost link of the chain: wait, then return.
        if x <= 1:
            time.sleep(5)
            return x
        # Otherwise block on the next link of the chain.
        return ray.get(foo.remote(x - 1)) + x

    # The chain has more tasks than `num_cpus`, yet the CPU resource
    # requirement still fits within `num_cpus`.
    chain_ref = foo.remote(4)
    wait_for_condition(worker_count_is(4))

    ray.get(chain_ref)
    # With the tasks finished, surplus workers are killed so that the total
    # number of workers is <= num_cpus.
    wait_for_condition(worker_count_is(2))

    # The two remaining workers are expected to stay alive.
    time.sleep(1)
    assert get_num_workers() == 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this file's tests verbosely and exit with pytest's status code.
    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)
|
||||
|
||||
Reference in New Issue
Block a user