mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 11:37:28 +08:00
1d3941e41a
* skip failing windows tests * skip more * remove * updates
241 lines
6.9 KiB
Python
241 lines
6.9 KiB
Python
# coding: utf-8
|
|
import os
|
|
import sys
|
|
import time
|
|
|
|
import grpc
|
|
import pytest
|
|
|
|
import ray
|
|
import ray.test_utils
|
|
from ray.core.generated import common_pb2
|
|
from ray.core.generated import node_manager_pb2, node_manager_pb2_grpc
|
|
from ray.test_utils import (wait_for_condition, run_string_as_driver,
|
|
run_string_as_driver_nonblocking)
|
|
|
|
|
|
def get_workers():
    """Return the stats entries of all non-driver workers on the first node.

    Takes the first entry of ``ray.nodes()``, calls ``GetNodeStats`` on its
    node manager over an insecure gRPC channel, and drops the driver entries
    from ``core_workers_stats``.

    Returns:
        list: The ``core_workers_stats`` entries whose ``worker_type`` is not
        ``common_pb2.DRIVER``.
    """
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    # The tests call this helper repeatedly while polling, so close the
    # channel on every call instead of leaving one open per poll.
    with grpc.insecure_channel(raylet_address) as channel:
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        reply = stub.GetNodeStats(node_manager_pb2.GetNodeStatsRequest())
        # Build the list before the channel is closed on leaving the block.
        return [
            worker for worker in reply.core_workers_stats
            if worker.worker_type != common_pb2.DRIVER
        ]
|
|
|
|
|
|
def test_initial_workers(shutdown_only):
    """Check that the raylet starts `num_cpus` Python workers for the driver.

    This applies when neither `redis_address` nor `job_config` is passed to
    `ray.init(...)`.
    """
    # Keep `num_cpus` <= 2 because a Travis CI machine only has 2 CPU cores.
    expected_workers = 1
    ray.init(num_cpus=expected_workers, include_dashboard=True)

    def initial_workers_started():
        return len(get_workers()) == expected_workers

    wait_for_condition(initial_workers_started)
|
|
|
|
|
|
@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.")
def test_multi_drivers(shutdown_only):
    """Check that workers are never shared between different drivers.

    The test starts several driver processes. Each driver submits some tasks
    and actor calls, collects the PIDs of the workers that ran them, and
    prints the PID list on a line starting with ``PID:``. The test reads
    those lines and checks that no PID is reported by more than one driver.
    An overlap would mean tasks owned by different drivers were scheduled to
    the same worker process, i.e. tasks of different jobs were not isolated
    during execution.
    """
    info = ray.init(num_cpus=10)

    driver_code = """
import os
import sys
import ray


ray.init(address="{}")

@ray.remote
class Actor:
    def get_pid(self):
        return os.getpid()

@ray.remote
def get_pid():
    return os.getpid()

pid_objs = []
# Submit some normal tasks and get the PIDs of workers which execute the tasks.
pid_objs = pid_objs + [get_pid.remote() for _ in range(2)]
# Create some actors and get the PIDs of actors.
actors = [Actor.remote() for _ in range(2)]
pid_objs = pid_objs + [actor.get_pid.remote() for actor in actors]

pids = set([ray.get(obj) for obj in pid_objs])
# Write pids to stdout
print("PID:" + str.join(",", [str(_) for _ in pids]))

ray.shutdown()
""".format(info["redis_address"])

    driver_count = 3
    processes = [
        run_string_as_driver_nonblocking(driver_code)
        for _ in range(driver_count)
    ]
    outputs = []
    for p in processes:
        # NOTE(review): stdout is drained to EOF before stderr is read; a
        # driver that writes a lot to stderr could block here. An earlier
        # revision had a `p.communicate()` attempt commented out at this
        # spot -- confirm why it was dropped before switching to it.
        #
        # Decode leniently so that a stray non-ASCII byte in the driver
        # output cannot raise here and hide the real driver failure.
        out = p.stdout.read().decode("ascii", errors="replace")
        err = p.stderr.read().decode("ascii", errors="replace")
        p.wait()
        if p.returncode != 0:
            # Dump the driver output so the failure asserted below can be
            # diagnosed from the test log.
            print("Driver with PID {} returned error code {}".format(
                p.pid, p.returncode))
            print("STDOUT:\n{}".format(out))
            print("STDERR:\n{}".format(err))
        outputs.append((p, out))

    all_worker_pids = set()
    for p, out in outputs:
        assert p.returncode == 0
        reported_pids = False
        for line in out.splitlines():
            if line.startswith("PID:"):
                reported_pids = True
                worker_pids = [
                    int(pid_text)
                    for pid_text in line.split(":")[1].split(",")
                ]
                assert len(worker_pids) > 0
                for worker_pid in worker_pids:
                    assert worker_pid not in all_worker_pids, (
                        ("Worker process with PID {} is shared" +
                         " by multiple drivers.").format(worker_pid))
                    all_worker_pids.add(worker_pid)
        # A driver that exits cleanly but prints no PID line would otherwise
        # be skipped by the overlap check without anyone noticing.
        assert reported_pids, (
            "Driver with PID {} did not report any worker PIDs.".format(
                p.pid))
|
|
|
|
|
|
def test_worker_env(shutdown_only):
    """Check that `worker_env` from the job config is visible inside tasks.

    Two variables are set through `JobConfig(worker_env=...)`; a remote task
    then reads each one back from `os.environ`.
    """
    expected_env = {"foo1": "bar1", "foo2": "bar2"}
    ray.init(job_config=ray.job_config.JobConfig(
        worker_env=dict(expected_env)))

    @ray.remote
    def get_env(key):
        return os.environ.get(key)

    # Look the variables up one at a time, in declaration order.
    for name, value in expected_env.items():
        assert ray.get(get_env.remote(name)) == value
|
|
|
|
|
|
def test_worker_capping_kill_idle_workers(shutdown_only):
    """Check that idle task workers are killed while an actor worker stays.

    An actor occupies one worker, then two sleeping tasks bring the count to
    three. Once both tasks have finished, the count is expected to drop back
    to one.
    """

    def worker_count_is(expected):
        # Build a zero-argument predicate for `wait_for_condition`.
        return lambda: len(get_workers()) == expected

    # Setting num_cpus to 0 avoids starting initial workers.
    ray.init(num_cpus=0)
    assert not get_workers()

    @ray.remote(num_cpus=0)
    class Actor:
        def ping(self):
            pass

    actor = Actor.remote()
    ray.get(actor.ping.remote())
    # The actor is alive now, and so is worker 1, which holds it.
    assert len(get_workers()) == 1

    @ray.remote(num_cpus=0)
    def foo():
        # Wait for a while
        time.sleep(10)

    # Workers 2 and 3 each pick up one normal task; wait for each worker to
    # show up before submitting the next task.
    pending = []
    for expected in (2, 3):
        pending.append(foo.remote())
        wait_for_condition(worker_count_is(expected))

    ray.get(pending)
    # Workers 2 and 3 are idle now and should be killed.
    wait_for_condition(worker_count_is(1))
|
|
|
|
|
|
def test_worker_capping_run_many_small_tasks(shutdown_only):
    """Check worker capping with more concurrent tasks than `num_cpus`.

    Four tasks of 0.5 CPU each are submitted on a 2-CPU node, so four workers
    are expected while they run. After they finish, the worker count is
    expected to shrink to `num_cpus` and to stay there.
    """
    ray.init(num_cpus=2)

    @ray.remote(num_cpus=0.5)
    def foo():
        time.sleep(5)

    def worker_count():
        return len(get_workers())

    # More tasks than `num_cpus`, but their total CPU requirement still fits
    # within `num_cpus`.
    obj_refs = [foo.remote() for _ in range(4)]
    wait_for_condition(lambda: worker_count() == 4)

    ray.get(obj_refs)
    # Once the tasks are done, some workers are killed so that the total
    # number of workers is <= num_cpus.
    wait_for_condition(lambda: worker_count() == 2)

    # The two remaining workers must still be there a moment later.
    time.sleep(1)
    assert worker_count() == 2
|
|
|
|
|
|
def test_worker_capping_run_chained_tasks(shutdown_only):
    """Check worker capping with a chain of nested tasks.

    `foo(4)` blocks on `foo(3)`, which blocks on `foo(2)`, and so on, so four
    tasks of 0.5 CPU each are in flight on a 2-CPU node. After the chain
    completes, the worker count is expected to shrink to `num_cpus` and to
    stay there.
    """
    ray.init(num_cpus=2)

    @ray.remote(num_cpus=0.5)
    def foo(x):
        if x <= 1:
            # End of the chain: hold the worker for a while, then unwind.
            time.sleep(5)
            return x
        return ray.get(foo.remote(x - 1)) + x

    def worker_count():
        return len(get_workers())

    # The chain has more tasks than `num_cpus`, but their total CPU
    # requirement still fits within `num_cpus`.
    obj = foo.remote(4)
    wait_for_condition(lambda: worker_count() == 4)

    ray.get(obj)
    # Once the tasks are done, some workers are killed so that the total
    # number of workers is <= num_cpus.
    wait_for_condition(lambda: worker_count() == 2)

    # The two remaining workers must still be there a moment later.
    time.sleep(1)
    assert worker_count() == 2
|
|
|
|
|
|
def test_worker_registration_failure_after_driver_exit(shutdown_only):
    """Check the worker count recovers after a short-lived driver exits.

    A second driver connects, submits 100 tasks without waiting for them,
    and shuts down. Afterwards the number of workers is expected to return
    to what it was before that driver ran.
    """
    info = ray.init(num_cpus=1)

    driver_code = """
import ray
import time


ray.init(address="{}")

@ray.remote
def foo():
    pass

[foo.remote() for _ in range(100)]

ray.shutdown()
""".format(info["redis_address"])

    baseline_count = len(get_workers())
    assert baseline_count == 1

    run_string_as_driver(driver_code)

    def back_to_baseline():
        return len(get_workers()) == baseline_count

    # Give workers a moment to register before polling the count.
    time.sleep(2)
    wait_for_condition(back_to_baseline)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this file's tests verbosely and pass pytest's exit status on.
    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)
|