mirror of
https://github.com/wassname/ray.git
synced 2026-07-03 09:45:10 +08:00
[Core] Multi-tenancy: enable multi-tenancy by default (#10570)
* Add new job in Travis to enable multi-tenancy
* fix
* Update .bazelrc
* Update .travis.yml
* fix test_job_gc_with_detached_actor
* fix test_multiple_downstream_tasks
* fix lint
* Enable multi-tenancy by default
* Kill idle workers in FIFO order
* Update test
* minor update
* Address comments
* fix some cases
* fix test_remote_cancel
* Address comments
* fix after merge
* remove kill
* fix worker_pool_test
* fix java test timeout
* fix test_two_custom_resources
* Add a delay when killing idle workers
* fix test_worker_failure
* fix test_worker_failed again
* fix DisconnectWorker
* update test_worker_failed
* Revert some python tests
* lint
* address comments
This commit is contained in:
@@ -207,10 +207,13 @@ def run_string_as_driver_nonblocking(driver_script):
|
||||
return proc
|
||||
|
||||
|
||||
def wait_for_num_actors(num_actors, timeout=10):
|
||||
def wait_for_num_actors(num_actors, state=None, timeout=10):
|
||||
start_time = time.time()
|
||||
while time.time() - start_time < timeout:
|
||||
if len(ray.actors()) >= num_actors:
|
||||
if len([
|
||||
_ for _ in ray.actors().values()
|
||||
if state is None or _["State"] == state
|
||||
]) >= num_actors:
|
||||
return
|
||||
time.sleep(0.1)
|
||||
raise RayTestTimeoutException("Timed out while waiting for global state.")
|
||||
|
||||
@@ -11,7 +11,10 @@ import ray
|
||||
import ray.cluster_utils
|
||||
import ray.test_utils
|
||||
|
||||
from ray.test_utils import RayTestTimeoutException
|
||||
from ray.test_utils import (
|
||||
RayTestTimeoutException,
|
||||
wait_for_condition,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -505,6 +508,17 @@ def test_two_custom_resources(ray_start_cluster):
|
||||
})
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@ray.remote
|
||||
def foo():
|
||||
# Sleep a while to emulate a slow operation. This is needed to make
|
||||
# sure tasks are scheduled to different nodes.
|
||||
time.sleep(0.1)
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
# Make sure each node has at least one idle worker.
|
||||
wait_for_condition(
|
||||
lambda: len(set(ray.get([foo.remote() for _ in range(6)]))) == 2)
|
||||
|
||||
@ray.remote(resources={"CustomResource1": 1})
|
||||
def f():
|
||||
time.sleep(0.001)
|
||||
|
||||
@@ -63,7 +63,11 @@ def test_worker_failed(ray_start_workers_separate_multinode):
|
||||
time.sleep(0.1)
|
||||
# Kill the workers as the tasks execute.
|
||||
for pid in pids:
|
||||
os.kill(pid, SIGKILL)
|
||||
try:
|
||||
os.kill(pid, SIGKILL)
|
||||
except OSError:
|
||||
# The process may have already exited due to worker capping.
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
# Make sure that we either get the object or we get an appropriate
|
||||
# exception.
|
||||
|
||||
@@ -67,11 +67,13 @@ class Actor:
|
||||
return 1
|
||||
|
||||
_ = Actor.options(lifetime="detached", name="DetachedActor").remote()
|
||||
# Make sure the actor is created before the driver exits.
|
||||
ray.get(_.value.remote())
|
||||
""".format(address)
|
||||
|
||||
p = run_string_as_driver_nonblocking(driver)
|
||||
# Wait for actor to be created
|
||||
wait_for_num_actors(1)
|
||||
wait_for_num_actors(1, ray.gcs_utils.ActorTableData.ALIVE)
|
||||
|
||||
actor_table = ray.actors()
|
||||
assert len(actor_table) == 1
|
||||
|
||||
@@ -31,10 +31,7 @@ def get_workers():
|
||||
# `ray.init(...)`, Raylet will start `num_cpus` Python workers for the driver.
|
||||
def test_initial_workers(shutdown_only):
|
||||
# `num_cpus` should be <=2 because a Travis CI machine only has 2 CPU cores
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
include_dashboard=True,
|
||||
_system_config={"enable_multi_tenancy": True})
|
||||
ray.init(num_cpus=1, include_dashboard=True)
|
||||
wait_for_condition(lambda: len(get_workers()) == 1)
|
||||
|
||||
|
||||
@@ -46,7 +43,7 @@ def test_initial_workers(shutdown_only):
|
||||
# different drivers were scheduled to the same worker process, that is, tasks
|
||||
# of different jobs were not correctly isolated during execution.
|
||||
def test_multi_drivers(shutdown_only):
|
||||
info = ray.init(num_cpus=10, _system_config={"enable_multi_tenancy": True})
|
||||
info = ray.init(num_cpus=10)
|
||||
|
||||
driver_code = """
|
||||
import os
|
||||
@@ -118,8 +115,7 @@ def test_worker_env(shutdown_only):
|
||||
job_config=ray.job_config.JobConfig(worker_env={
|
||||
"foo1": "bar1",
|
||||
"foo2": "bar2"
|
||||
}),
|
||||
_system_config={"enable_multi_tenancy": True})
|
||||
}))
|
||||
|
||||
@ray.remote
|
||||
def get_env(key):
|
||||
@@ -131,7 +127,7 @@ def test_worker_env(shutdown_only):
|
||||
|
||||
def test_worker_capping_kill_idle_workers(shutdown_only):
|
||||
# Avoid starting initial workers by setting num_cpus to 0.
|
||||
ray.init(num_cpus=0, _system_config={"enable_multi_tenancy": True})
|
||||
ray.init(num_cpus=0)
|
||||
assert len(get_workers()) == 0
|
||||
|
||||
@ray.remote(num_cpus=0)
|
||||
@@ -157,16 +153,13 @@ def test_worker_capping_kill_idle_workers(shutdown_only):
|
||||
# Worker 3 runs a normal task
|
||||
wait_for_condition(lambda: len(get_workers()) == 3)
|
||||
|
||||
ray.get(obj1)
|
||||
# Worker 2 now becomes idle and should be killed
|
||||
wait_for_condition(lambda: len(get_workers()) == 2)
|
||||
ray.get(obj2)
|
||||
# Worker 3 now becomes idle and should be killed
|
||||
ray.get([obj1, obj2])
|
||||
# Worker 2 and 3 now become idle and should be killed
|
||||
wait_for_condition(lambda: len(get_workers()) == 1)
|
||||
|
||||
|
||||
def test_worker_capping_run_many_small_tasks(shutdown_only):
|
||||
ray.init(num_cpus=2, _system_config={"enable_multi_tenancy": True})
|
||||
ray.init(num_cpus=2)
|
||||
|
||||
@ray.remote(num_cpus=0.5)
|
||||
def foo():
|
||||
@@ -188,7 +181,7 @@ def test_worker_capping_run_many_small_tasks(shutdown_only):
|
||||
|
||||
|
||||
def test_worker_capping_run_chained_tasks(shutdown_only):
|
||||
ray.init(num_cpus=2, _system_config={"enable_multi_tenancy": True})
|
||||
ray.init(num_cpus=2)
|
||||
|
||||
@ray.remote(num_cpus=0.5)
|
||||
def foo(x):
|
||||
@@ -215,7 +208,7 @@ def test_worker_capping_run_chained_tasks(shutdown_only):
|
||||
|
||||
def test_worker_capping_fifo(shutdown_only):
|
||||
# Start 2 initial workers by setting num_cpus to 2.
|
||||
info = ray.init(num_cpus=2, _system_config={"enable_multi_tenancy": True})
|
||||
info = ray.init(num_cpus=2)
|
||||
wait_for_condition(lambda: len(get_workers()) == 2)
|
||||
|
||||
time.sleep(1)
|
||||
@@ -233,6 +226,7 @@ def test_worker_capping_fifo(shutdown_only):
|
||||
|
||||
driver_code = """
|
||||
import ray
|
||||
import time
|
||||
|
||||
ray.init(address="{}")
|
||||
|
||||
@@ -241,6 +235,8 @@ def foo():
|
||||
pass
|
||||
|
||||
ray.get(foo.remote())
|
||||
# Sleep a while to make sure an idle worker exits before this driver exits.
|
||||
time.sleep(2)
|
||||
ray.shutdown()
|
||||
""".format(info["redis_address"])
|
||||
|
||||
@@ -254,7 +250,7 @@ ray.shutdown()
|
||||
|
||||
|
||||
def test_worker_registration_failure_after_driver_exit(shutdown_only):
|
||||
info = ray.init(num_cpus=1, _system_config={"enable_multi_tenancy": True})
|
||||
info = ray.init(num_cpus=1)
|
||||
|
||||
driver_code = """
|
||||
import ray
|
||||
|
||||
@@ -63,7 +63,11 @@ def test_worker_failed(ray_start_workers_separate_multinode):
|
||||
time.sleep(0.1)
|
||||
# Kill the workers as the tasks execute.
|
||||
for pid in pids:
|
||||
os.kill(pid, SIGKILL)
|
||||
try:
|
||||
os.kill(pid, SIGKILL)
|
||||
except OSError:
|
||||
# The process may have already exited due to worker capping.
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
# Make sure that we either get the object or we get an appropriate
|
||||
# exception.
|
||||
|
||||
@@ -410,7 +410,11 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
|
||||
return
|
||||
|
||||
obj = large_object.options(resources={"node2": 1}).remote()
|
||||
downstream = [chain.remote(obj) for _ in range(4)]
|
||||
downstream = [
|
||||
chain.options(resources={
|
||||
"node1": 1
|
||||
}).remote(obj) for _ in range(4)
|
||||
]
|
||||
for obj in downstream:
|
||||
ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user