[Core] Multi-tenancy: enable multi-tenancy by default (#10570)

* Add new job in Travis to enable multi-tenancy

* fix

* Update .bazelrc

* Update .travis.yml

* fix test_job_gc_with_detached_actor

* fix test_multiple_downstream_tasks

* fix lint

* Enable multi-tenancy by default

* Kill idle workers in FIFO order

* Update test

* minor update

* Address comments

* fix some cases

* fix test_remote_cancel

* Address comments

* fix after merge

* remove kill

* fix worker_pool_test

* fix java test timeout

* fix test_two_custom_resources

* Add a delay when killing idle workers

* fix test_worker_failure

* fix test_worker_failed again

* fix DisconnectWorker

* update test_worker_failed

* Revert some python tests

* lint

* address comments
This commit is contained in:
Kai Yang
2020-09-30 14:54:53 +08:00
committed by GitHub
parent f54f7b23f6
commit 3504391fd2
17 changed files with 205 additions and 72 deletions
+5 -2
View File
@@ -207,10 +207,13 @@ def run_string_as_driver_nonblocking(driver_script):
return proc
def wait_for_num_actors(num_actors, timeout=10):
def wait_for_num_actors(num_actors, state=None, timeout=10):
start_time = time.time()
while time.time() - start_time < timeout:
if len(ray.actors()) >= num_actors:
if len([
_ for _ in ray.actors().values()
if state is None or _["State"] == state
]) >= num_actors:
return
time.sleep(0.1)
raise RayTestTimeoutException("Timed out while waiting for global state.")
+15 -1
View File
@@ -11,7 +11,10 @@ import ray
import ray.cluster_utils
import ray.test_utils
from ray.test_utils import RayTestTimeoutException
from ray.test_utils import (
RayTestTimeoutException,
wait_for_condition,
)
logger = logging.getLogger(__name__)
@@ -505,6 +508,17 @@ def test_two_custom_resources(ray_start_cluster):
})
ray.init(address=cluster.address)
@ray.remote
def foo():
# Sleep a while to emulate a slow operation. This is needed to make
# sure tasks are scheduled to different nodes.
time.sleep(0.1)
return ray.worker.global_worker.node.unique_id
# Make sure each node has at least one idle worker.
wait_for_condition(
lambda: len(set(ray.get([foo.remote() for _ in range(6)]))) == 2)
@ray.remote(resources={"CustomResource1": 1})
def f():
time.sleep(0.001)
@@ -63,7 +63,11 @@ def test_worker_failed(ray_start_workers_separate_multinode):
time.sleep(0.1)
# Kill the workers as the tasks execute.
for pid in pids:
os.kill(pid, SIGKILL)
try:
os.kill(pid, SIGKILL)
except OSError:
# The process may have already exited due to worker capping.
pass
time.sleep(0.1)
# Make sure that we either get the object or we get an appropriate
# exception.
+3 -1
View File
@@ -67,11 +67,13 @@ class Actor:
return 1
_ = Actor.options(lifetime="detached", name="DetachedActor").remote()
# Make sure the actor is created before the driver exits.
ray.get(_.value.remote())
""".format(address)
p = run_string_as_driver_nonblocking(driver)
# Wait for the actor to be created.
wait_for_num_actors(1)
wait_for_num_actors(1, ray.gcs_utils.ActorTableData.ALIVE)
actor_table = ray.actors()
assert len(actor_table) == 1
+13 -17
View File
@@ -31,10 +31,7 @@ def get_workers():
# `ray.init(...)`, Raylet will start `num_cpus` Python workers for the driver.
def test_initial_workers(shutdown_only):
# `num_cpus` should be <=2 because a Travis CI machine only has 2 CPU cores
ray.init(
num_cpus=1,
include_dashboard=True,
_system_config={"enable_multi_tenancy": True})
ray.init(num_cpus=1, include_dashboard=True)
wait_for_condition(lambda: len(get_workers()) == 1)
@@ -46,7 +43,7 @@ def test_initial_workers(shutdown_only):
# different drivers were scheduled to the same worker process, that is, tasks
# of different jobs were not correctly isolated during execution.
def test_multi_drivers(shutdown_only):
info = ray.init(num_cpus=10, _system_config={"enable_multi_tenancy": True})
info = ray.init(num_cpus=10)
driver_code = """
import os
@@ -118,8 +115,7 @@ def test_worker_env(shutdown_only):
job_config=ray.job_config.JobConfig(worker_env={
"foo1": "bar1",
"foo2": "bar2"
}),
_system_config={"enable_multi_tenancy": True})
}))
@ray.remote
def get_env(key):
@@ -131,7 +127,7 @@ def test_worker_env(shutdown_only):
def test_worker_capping_kill_idle_workers(shutdown_only):
# Avoid starting initial workers by setting num_cpus to 0.
ray.init(num_cpus=0, _system_config={"enable_multi_tenancy": True})
ray.init(num_cpus=0)
assert len(get_workers()) == 0
@ray.remote(num_cpus=0)
@@ -157,16 +153,13 @@ def test_worker_capping_kill_idle_workers(shutdown_only):
# Worker 3 runs a normal task
wait_for_condition(lambda: len(get_workers()) == 3)
ray.get(obj1)
# Worker 2 now becomes idle and should be killed
wait_for_condition(lambda: len(get_workers()) == 2)
ray.get(obj2)
# Worker 3 now becomes idle and should be killed
ray.get([obj1, obj2])
# Workers 2 and 3 now become idle and should be killed
wait_for_condition(lambda: len(get_workers()) == 1)
def test_worker_capping_run_many_small_tasks(shutdown_only):
ray.init(num_cpus=2, _system_config={"enable_multi_tenancy": True})
ray.init(num_cpus=2)
@ray.remote(num_cpus=0.5)
def foo():
@@ -188,7 +181,7 @@ def test_worker_capping_run_many_small_tasks(shutdown_only):
def test_worker_capping_run_chained_tasks(shutdown_only):
ray.init(num_cpus=2, _system_config={"enable_multi_tenancy": True})
ray.init(num_cpus=2)
@ray.remote(num_cpus=0.5)
def foo(x):
@@ -215,7 +208,7 @@ def test_worker_capping_run_chained_tasks(shutdown_only):
def test_worker_capping_fifo(shutdown_only):
# Start 2 initial workers by setting num_cpus to 2.
info = ray.init(num_cpus=2, _system_config={"enable_multi_tenancy": True})
info = ray.init(num_cpus=2)
wait_for_condition(lambda: len(get_workers()) == 2)
time.sleep(1)
@@ -233,6 +226,7 @@ def test_worker_capping_fifo(shutdown_only):
driver_code = """
import ray
import time
ray.init(address="{}")
@@ -241,6 +235,8 @@ def foo():
pass
ray.get(foo.remote())
# Sleep a while to make sure an idle worker exits before this driver exits.
time.sleep(2)
ray.shutdown()
""".format(info["redis_address"])
@@ -254,7 +250,7 @@ ray.shutdown()
def test_worker_registration_failure_after_driver_exit(shutdown_only):
info = ray.init(num_cpus=1, _system_config={"enable_multi_tenancy": True})
info = ray.init(num_cpus=1)
driver_code = """
import ray
+5 -1
View File
@@ -63,7 +63,11 @@ def test_worker_failed(ray_start_workers_separate_multinode):
time.sleep(0.1)
# Kill the workers as the tasks execute.
for pid in pids:
os.kill(pid, SIGKILL)
try:
os.kill(pid, SIGKILL)
except OSError:
# The process may have already exited due to worker capping.
pass
time.sleep(0.1)
# Make sure that we either get the object or we get an appropriate
# exception.
+5 -1
View File
@@ -410,7 +410,11 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
return
obj = large_object.options(resources={"node2": 1}).remote()
downstream = [chain.remote(obj) for _ in range(4)]
downstream = [
chain.options(resources={
"node1": 1
}).remote(obj) for _ in range(4)
]
for obj in downstream:
ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))