Convert some unittests to pytest. (#2779)

* Convert multi_node_test.py to pytest.

* Convert array_test.py to pytest.

* Convert failure_test.py to pytest.

* Convert microbenchmarks to pytest.

* Convert component_failures_test.py to pytest and some minor quotes changes.

* Convert tensorflow_test.py to pytest.

* Convert actor_test.py to pytest.

* Fix.

* Fix
This commit is contained in:
Robert Nishihara
2018-08-31 11:24:15 -07:00
committed by Philipp Moritz
parent 3813ae34b3
commit eda6ebb87d
10 changed files with 3185 additions and 3205 deletions
+3 -3
View File
@@ -189,7 +189,7 @@ class TestGlobalScheduler(unittest.TestCase):
assert (db_client_id is not None)
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False),
os.environ.get("RAY_USE_NEW_GCS", False),
"New GCS API doesn't have a Python API yet.")
def test_integration_single_task(self):
# There should be three db clients, the global scheduler, the local
@@ -307,13 +307,13 @@ class TestGlobalScheduler(unittest.TestCase):
self.assertEqual(num_tasks_done + num_tasks_waiting, num_tasks)
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False),
os.environ.get("RAY_USE_NEW_GCS", False),
"New GCS API doesn't have a Python API yet.")
def test_integration_many_tasks_handler_sync(self):
self.integration_many_tasks_helper(timesync=True)
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False),
os.environ.get("RAY_USE_NEW_GCS", False),
"New GCS API doesn't have a Python API yet.")
def test_integration_many_tasks(self):
# More realistic case: should handle out of order object and task
+1814 -1817
View File
File diff suppressed because it is too large Load Diff
+215 -220
View File
@@ -4,8 +4,8 @@ from __future__ import print_function
import numpy as np
from numpy.testing import assert_equal, assert_almost_equal
import pytest
import sys
import unittest
import ray
import ray.experimental.array.remote as ra
@@ -15,229 +15,224 @@ if sys.version_info >= (3, 0):
from importlib import reload
class RemoteArrayTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
def testMethods(self):
for module in [
ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
]:
reload(module)
ray.init()
# test eye
object_id = ra.eye.remote(3)
val = ray.get(object_id)
assert_almost_equal(val, np.eye(3))
# test zeros
object_id = ra.zeros.remote([3, 4, 5])
val = ray.get(object_id)
assert_equal(val, np.zeros([3, 4, 5]))
# test qr - pass by value
a_val = np.random.normal(size=[10, 11])
q_id, r_id = ra.linalg.qr.remote(a_val)
q_val = ray.get(q_id)
r_val = ray.get(r_id)
assert_almost_equal(np.dot(q_val, r_val), a_val)
# test qr - pass by objectid
a = ra.random.normal.remote([10, 13])
q_id, r_id = ra.linalg.qr.remote(a)
a_val = ray.get(a)
q_val = ray.get(q_id)
r_val = ray.get(r_id)
assert_almost_equal(np.dot(q_val, r_val), a_val)
@pytest.fixture
def ray_start_regular():
for module in [
ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
]:
reload(module)
# Start the Ray processes.
ray.init(num_cpus=2)
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
class DistributedArrayTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
def test_remote_array_methods(ray_start_regular):
# test eye
object_id = ra.eye.remote(3)
val = ray.get(object_id)
assert_almost_equal(val, np.eye(3))
def testAssemble(self):
for module in [
ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
]:
reload(module)
ray.init()
# test zeros
object_id = ra.zeros.remote([3, 4, 5])
val = ray.get(object_id)
assert_equal(val, np.zeros([3, 4, 5]))
a = ra.ones.remote([da.BLOCK_SIZE, da.BLOCK_SIZE])
b = ra.zeros.remote([da.BLOCK_SIZE, da.BLOCK_SIZE])
x = da.DistArray([2 * da.BLOCK_SIZE, da.BLOCK_SIZE],
np.array([[a], [b]]))
assert_equal(
x.assemble(),
np.vstack([
np.ones([da.BLOCK_SIZE, da.BLOCK_SIZE]),
np.zeros([da.BLOCK_SIZE, da.BLOCK_SIZE])
]))
# test qr - pass by value
a_val = np.random.normal(size=[10, 11])
q_id, r_id = ra.linalg.qr.remote(a_val)
q_val = ray.get(q_id)
r_val = ray.get(r_id)
assert_almost_equal(np.dot(q_val, r_val), a_val)
def testMethods(self):
for module in [
ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
]:
reload(module)
ray.worker._init(
start_ray_local=True, num_local_schedulers=2, num_cpus=[10, 10])
x = da.zeros.remote([9, 25, 51], "float")
assert_equal(ray.get(da.assemble.remote(x)), np.zeros([9, 25, 51]))
x = da.ones.remote([11, 25, 49], dtype_name="float")
assert_equal(ray.get(da.assemble.remote(x)), np.ones([11, 25, 49]))
x = da.random.normal.remote([11, 25, 49])
y = da.copy.remote(x)
assert_equal(
ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(y)))
x = da.eye.remote(25, dtype_name="float")
assert_equal(ray.get(da.assemble.remote(x)), np.eye(25))
x = da.random.normal.remote([25, 49])
y = da.triu.remote(x)
assert_equal(
ray.get(da.assemble.remote(y)),
np.triu(ray.get(da.assemble.remote(x))))
x = da.random.normal.remote([25, 49])
y = da.tril.remote(x)
assert_equal(
ray.get(da.assemble.remote(y)),
np.tril(ray.get(da.assemble.remote(x))))
x = da.random.normal.remote([25, 49])
y = da.random.normal.remote([49, 18])
z = da.dot.remote(x, y)
w = da.assemble.remote(z)
u = da.assemble.remote(x)
v = da.assemble.remote(y)
assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v)))
assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v)))
# test add
x = da.random.normal.remote([23, 42])
y = da.random.normal.remote([23, 42])
z = da.add.remote(x, y)
assert_almost_equal(
ray.get(da.assemble.remote(z)),
ray.get(da.assemble.remote(x)) + ray.get(da.assemble.remote(y)))
# test subtract
x = da.random.normal.remote([33, 40])
y = da.random.normal.remote([33, 40])
z = da.subtract.remote(x, y)
assert_almost_equal(
ray.get(da.assemble.remote(z)),
ray.get(da.assemble.remote(x)) - ray.get(da.assemble.remote(y)))
# test transpose
x = da.random.normal.remote([234, 432])
y = da.transpose.remote(x)
assert_equal(
ray.get(da.assemble.remote(x)).T, ray.get(da.assemble.remote(y)))
# test numpy_to_dist
x = da.random.normal.remote([23, 45])
y = da.assemble.remote(x)
z = da.numpy_to_dist.remote(y)
w = da.assemble.remote(z)
assert_equal(
ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(z)))
assert_equal(ray.get(y), ray.get(w))
# test da.tsqr
for shape in [[123, da.BLOCK_SIZE], [7, da.BLOCK_SIZE],
[da.BLOCK_SIZE, da.BLOCK_SIZE], [da.BLOCK_SIZE, 7],
[10 * da.BLOCK_SIZE, da.BLOCK_SIZE]]:
x = da.random.normal.remote(shape)
K = min(shape)
q, r = da.linalg.tsqr.remote(x)
x_val = ray.get(da.assemble.remote(x))
q_val = ray.get(da.assemble.remote(q))
r_val = ray.get(r)
assert r_val.shape == (K, shape[1])
assert_equal(r_val, np.triu(r_val))
assert_almost_equal(x_val, np.dot(q_val, r_val))
assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K))
# test da.linalg.modified_lu
def test_modified_lu(d1, d2):
print("testing dist_modified_lu with d1 = " + str(d1) + ", d2 = " +
str(d2))
assert d1 >= d2
m = ra.random.normal.remote([d1, d2])
q, r = ra.linalg.qr.remote(m)
l, u, s = da.linalg.modified_lu.remote(da.numpy_to_dist.remote(q))
q_val = ray.get(q)
ray.get(r)
l_val = ray.get(da.assemble.remote(l))
u_val = ray.get(u)
s_val = ray.get(s)
s_mat = np.zeros((d1, d2))
for i in range(len(s_val)):
s_mat[i, i] = s_val[i]
# Check that q - s = l * u.
assert_almost_equal(q_val - s_mat, np.dot(l_val, u_val))
# Check that u is upper triangular.
assert_equal(np.triu(u_val), u_val)
# Check that l is lower triangular.
assert_equal(np.tril(l_val), l_val)
for d1, d2 in [(100, 100), (99, 98), (7, 5), (7, 7), (20, 7), (20,
10)]:
test_modified_lu(d1, d2)
# test dist_tsqr_hr
def test_dist_tsqr_hr(d1, d2):
print("testing dist_tsqr_hr with d1 = " + str(d1) + ", d2 = " +
str(d2))
a = da.random.normal.remote([d1, d2])
y, t, y_top, r = da.linalg.tsqr_hr.remote(a)
a_val = ray.get(da.assemble.remote(a))
y_val = ray.get(da.assemble.remote(y))
t_val = ray.get(t)
y_top_val = ray.get(y_top)
r_val = ray.get(r)
tall_eye = np.zeros((d1, min(d1, d2)))
np.fill_diagonal(tall_eye, 1)
q = tall_eye - np.dot(y_val, np.dot(t_val, y_top_val.T))
# Check that q.T * q = I.
assert_almost_equal(np.dot(q.T, q), np.eye(min(d1, d2)))
# Check that a = (I - y * t * y_top.T) * r.
assert_almost_equal(np.dot(q, r_val), a_val)
for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE),
(da.BLOCK_SIZE, da.BLOCK_SIZE), (da.BLOCK_SIZE, 7),
(10 * da.BLOCK_SIZE, da.BLOCK_SIZE)]:
test_dist_tsqr_hr(d1, d2)
def test_dist_qr(d1, d2):
print("testing qr with d1 = {}, and d2 = {}.".format(d1, d2))
a = da.random.normal.remote([d1, d2])
K = min(d1, d2)
q, r = da.linalg.qr.remote(a)
a_val = ray.get(da.assemble.remote(a))
q_val = ray.get(da.assemble.remote(q))
r_val = ray.get(da.assemble.remote(r))
assert q_val.shape == (d1, K)
assert r_val.shape == (K, d2)
assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K))
assert_equal(r_val, np.triu(r_val))
assert_almost_equal(a_val, np.dot(q_val, r_val))
for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE),
(da.BLOCK_SIZE, da.BLOCK_SIZE), (da.BLOCK_SIZE, 7),
(13, 21), (34, 35), (8, 7)]:
test_dist_qr(d1, d2)
test_dist_qr(d2, d1)
for _ in range(20):
d1 = np.random.randint(1, 35)
d2 = np.random.randint(1, 35)
test_dist_qr(d1, d2)
# test qr - pass by objectid
a = ra.random.normal.remote([10, 13])
q_id, r_id = ra.linalg.qr.remote(a)
a_val = ray.get(a)
q_val = ray.get(q_id)
r_val = ray.get(r_id)
assert_almost_equal(np.dot(q_val, r_val), a_val)
if __name__ == "__main__":
unittest.main(verbosity=2)
def test_distributed_array_assemble(ray_start_regular):
a = ra.ones.remote([da.BLOCK_SIZE, da.BLOCK_SIZE])
b = ra.zeros.remote([da.BLOCK_SIZE, da.BLOCK_SIZE])
x = da.DistArray([2 * da.BLOCK_SIZE, da.BLOCK_SIZE], np.array([[a], [b]]))
assert_equal(
x.assemble(),
np.vstack([
np.ones([da.BLOCK_SIZE, da.BLOCK_SIZE]),
np.zeros([da.BLOCK_SIZE, da.BLOCK_SIZE])
]))
@pytest.fixture
def ray_start_two_nodes():
for module in [
ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
]:
reload(module)
# Start the Ray processes.
ray.worker._init(
start_ray_local=True, num_local_schedulers=2, num_cpus=[10, 10])
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
def test_distributed_array_methods(ray_start_two_nodes):
x = da.zeros.remote([9, 25, 51], "float")
assert_equal(ray.get(da.assemble.remote(x)), np.zeros([9, 25, 51]))
x = da.ones.remote([11, 25, 49], dtype_name="float")
assert_equal(ray.get(da.assemble.remote(x)), np.ones([11, 25, 49]))
x = da.random.normal.remote([11, 25, 49])
y = da.copy.remote(x)
assert_equal(
ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(y)))
x = da.eye.remote(25, dtype_name="float")
assert_equal(ray.get(da.assemble.remote(x)), np.eye(25))
x = da.random.normal.remote([25, 49])
y = da.triu.remote(x)
assert_equal(
ray.get(da.assemble.remote(y)), np.triu(
ray.get(da.assemble.remote(x))))
x = da.random.normal.remote([25, 49])
y = da.tril.remote(x)
assert_equal(
ray.get(da.assemble.remote(y)), np.tril(
ray.get(da.assemble.remote(x))))
x = da.random.normal.remote([25, 49])
y = da.random.normal.remote([49, 18])
z = da.dot.remote(x, y)
w = da.assemble.remote(z)
u = da.assemble.remote(x)
v = da.assemble.remote(y)
assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v)))
assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v)))
# test add
x = da.random.normal.remote([23, 42])
y = da.random.normal.remote([23, 42])
z = da.add.remote(x, y)
assert_almost_equal(
ray.get(da.assemble.remote(z)),
ray.get(da.assemble.remote(x)) + ray.get(da.assemble.remote(y)))
# test subtract
x = da.random.normal.remote([33, 40])
y = da.random.normal.remote([33, 40])
z = da.subtract.remote(x, y)
assert_almost_equal(
ray.get(da.assemble.remote(z)),
ray.get(da.assemble.remote(x)) - ray.get(da.assemble.remote(y)))
# test transpose
x = da.random.normal.remote([234, 432])
y = da.transpose.remote(x)
assert_equal(
ray.get(da.assemble.remote(x)).T, ray.get(da.assemble.remote(y)))
# test numpy_to_dist
x = da.random.normal.remote([23, 45])
y = da.assemble.remote(x)
z = da.numpy_to_dist.remote(y)
w = da.assemble.remote(z)
assert_equal(
ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(z)))
assert_equal(ray.get(y), ray.get(w))
# test da.tsqr
for shape in [[123, da.BLOCK_SIZE], [7, da.BLOCK_SIZE],
[da.BLOCK_SIZE, da.BLOCK_SIZE], [da.BLOCK_SIZE, 7],
[10 * da.BLOCK_SIZE, da.BLOCK_SIZE]]:
x = da.random.normal.remote(shape)
K = min(shape)
q, r = da.linalg.tsqr.remote(x)
x_val = ray.get(da.assemble.remote(x))
q_val = ray.get(da.assemble.remote(q))
r_val = ray.get(r)
assert r_val.shape == (K, shape[1])
assert_equal(r_val, np.triu(r_val))
assert_almost_equal(x_val, np.dot(q_val, r_val))
assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K))
# test da.linalg.modified_lu
def test_modified_lu(d1, d2):
print("testing dist_modified_lu with d1 = " + str(d1) + ", d2 = " +
str(d2))
assert d1 >= d2
m = ra.random.normal.remote([d1, d2])
q, r = ra.linalg.qr.remote(m)
l, u, s = da.linalg.modified_lu.remote(da.numpy_to_dist.remote(q))
q_val = ray.get(q)
ray.get(r)
l_val = ray.get(da.assemble.remote(l))
u_val = ray.get(u)
s_val = ray.get(s)
s_mat = np.zeros((d1, d2))
for i in range(len(s_val)):
s_mat[i, i] = s_val[i]
# Check that q - s = l * u.
assert_almost_equal(q_val - s_mat, np.dot(l_val, u_val))
# Check that u is upper triangular.
assert_equal(np.triu(u_val), u_val)
# Check that l is lower triangular.
assert_equal(np.tril(l_val), l_val)
for d1, d2 in [(100, 100), (99, 98), (7, 5), (7, 7), (20, 7), (20, 10)]:
test_modified_lu(d1, d2)
# test dist_tsqr_hr
def test_dist_tsqr_hr(d1, d2):
print("testing dist_tsqr_hr with d1 = " + str(d1) + ", d2 = " +
str(d2))
a = da.random.normal.remote([d1, d2])
y, t, y_top, r = da.linalg.tsqr_hr.remote(a)
a_val = ray.get(da.assemble.remote(a))
y_val = ray.get(da.assemble.remote(y))
t_val = ray.get(t)
y_top_val = ray.get(y_top)
r_val = ray.get(r)
tall_eye = np.zeros((d1, min(d1, d2)))
np.fill_diagonal(tall_eye, 1)
q = tall_eye - np.dot(y_val, np.dot(t_val, y_top_val.T))
# Check that q.T * q = I.
assert_almost_equal(np.dot(q.T, q), np.eye(min(d1, d2)))
# Check that a = (I - y * t * y_top.T) * r.
assert_almost_equal(np.dot(q, r_val), a_val)
for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE), (da.BLOCK_SIZE,
da.BLOCK_SIZE),
(da.BLOCK_SIZE, 7), (10 * da.BLOCK_SIZE, da.BLOCK_SIZE)]:
test_dist_tsqr_hr(d1, d2)
def test_dist_qr(d1, d2):
print("testing qr with d1 = {}, and d2 = {}.".format(d1, d2))
a = da.random.normal.remote([d1, d2])
K = min(d1, d2)
q, r = da.linalg.qr.remote(a)
a_val = ray.get(da.assemble.remote(a))
q_val = ray.get(da.assemble.remote(q))
r_val = ray.get(da.assemble.remote(r))
assert q_val.shape == (d1, K)
assert r_val.shape == (K, d2)
assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K))
assert_equal(r_val, np.triu(r_val))
assert_almost_equal(a_val, np.dot(q_val, r_val))
for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE), (da.BLOCK_SIZE,
da.BLOCK_SIZE),
(da.BLOCK_SIZE, 7), (13, 21), (34, 35), (8, 7)]:
test_dist_qr(d1, d2)
test_dist_qr(d2, d1)
for _ in range(20):
d1 = np.random.randint(1, 35)
d2 = np.random.randint(1, 35)
test_dist_qr(d1, d2)
+306 -294
View File
@@ -2,351 +2,363 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pytest
import os
import ray
import time
import unittest
import pyarrow as pa
class ComponentFailureTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
@pytest.fixture
def ray_start_workers_separate():
# Start the Ray processes.
ray.worker._init(
num_cpus=1,
start_workers_from_local_scheduler=False,
start_ray_local=True,
redirect_output=True)
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
# This test checks that when a worker dies in the middle of a get, the
# plasma store and manager will not die.
@unittest.skipIf(
os.environ.get('RAY_USE_XRAY', False),
"Workers are all started by Raylet, so cannot be killed from Python.")
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False),
"Not working with new GCS API.")
def testDyingWorkerGet(self):
obj_id = 20 * b"a"
@ray.remote
def f():
ray.worker.global_worker.plasma_client.get(obj_id)
# This test checks that when a worker dies in the middle of a get, the
# plasma store and manager will not die.
@pytest.mark.skipif(
os.environ.get("RAY_USE_XRAY", False),
reason="This test does not work with xray yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Not working with new GCS API.")
def test_dying_worker_get(ray_start_workers_separate):
obj_id = 20 * b"a"
ray.worker._init(
num_workers=1,
start_workers_from_local_scheduler=False,
start_ray_local=True,
redirect_output=True)
@ray.remote
def f():
ray.worker.global_worker.plasma_client.get(ray.ObjectID(obj_id))
# Have the worker wait in a get call.
f.remote()
# Have the worker wait in a get call.
f.remote()
# Kill the worker.
time.sleep(1)
(ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0]
.terminate())
# Kill the worker.
time.sleep(1)
(ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0]
.terminate())
time.sleep(0.1)
# Seal the object so the store attempts to notify the worker that the
# get has been fulfilled.
ray.worker.global_worker.plasma_client.create(
pa.plasma.ObjectID(obj_id), 100)
ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id))
time.sleep(0.1)
# Make sure that nothing has died.
assert ray.services.all_processes_alive(
exclude=[ray.services.PROCESS_TYPE_WORKER])
# This test checks that when a worker dies in the middle of a wait, the
# plasma store and manager will not die.
@pytest.mark.skipif(
os.environ.get("RAY_USE_XRAY", False),
reason="This test does not work with xray yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Not working with new GCS API.")
def test_dying_worker_wait(ray_start_workers_separate):
obj_id = 20 * b"a"
@ray.remote
def f():
ray.worker.global_worker.plasma_client.wait([ray.ObjectID(obj_id)])
# Have the worker wait in a get call.
f.remote()
# Kill the worker.
time.sleep(1)
(ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0]
.terminate())
time.sleep(0.1)
# Seal the object so the store attempts to notify the worker that the
# get has been fulfilled.
ray.worker.global_worker.plasma_client.create(
pa.plasma.ObjectID(obj_id), 100)
ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id))
time.sleep(0.1)
# Make sure that nothing has died.
assert ray.services.all_processes_alive(
exclude=[ray.services.PROCESS_TYPE_WORKER])
@pytest.fixture(params=[(1, 4), (4, 4)])
def ray_start_workers_separate_multinode(request):
num_local_schedulers = request.param[0]
num_initial_workers = request.param[1]
# Start the Ray processes.
ray.worker._init(
num_workers=(num_initial_workers * num_local_schedulers),
num_local_schedulers=num_local_schedulers,
start_workers_from_local_scheduler=False,
start_ray_local=True,
num_cpus=[num_initial_workers] * num_local_schedulers,
redirect_output=True)
yield num_local_schedulers, num_initial_workers
# The code after the yield will run as teardown code.
ray.shutdown()
def test_worker_failed(ray_start_workers_separate_multinode):
num_local_schedulers, num_initial_workers = (
ray_start_workers_separate_multinode)
@ray.remote
def f(x):
time.sleep(0.5)
return x
# Submit more tasks than there are workers so that all workers and
# cores are utilized.
object_ids = [
f.remote(i) for i in range(num_initial_workers * num_local_schedulers)
]
object_ids += [f.remote(object_id) for object_id in object_ids]
# Allow the tasks some time to begin executing.
time.sleep(0.1)
# Kill the workers as the tasks execute.
for worker in (
ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER]):
worker.terminate()
time.sleep(0.1)
# Make sure that we can still get the objects after the executing tasks
# died.
ray.get(object_ids)
# Seal the object so the store attempts to notify the worker that the
# get has been fulfilled.
ray.worker.global_worker.plasma_client.create(
pa.plasma.ObjectID(obj_id), 100)
ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id))
time.sleep(0.1)
# Make sure that nothing has died.
assert ray.services.all_processes_alive(
exclude=[ray.services.PROCESS_TYPE_WORKER])
def _test_component_failed(component_type):
"""Kill a component on all worker nodes and check workload succeeds."""
# Raylet is able to pass a harder failure test than legacy ray.
use_raylet = os.environ.get("RAY_USE_XRAY") == "1"
# This test checks that when a worker dies in the middle of a wait, the
# plasma store and manager will not die.
@unittest.skipIf(
os.environ.get('RAY_USE_XRAY', False),
"Workers are all started by Raylet, so cannot be killed from Python.")
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False),
"Not working with new GCS API.")
def testDyingWorkerWait(self):
obj_id = 20 * b"a"
# Start with 4 workers and 4 cores.
num_local_schedulers = 4
num_workers_per_scheduler = 8
ray.worker._init(
num_workers=num_workers_per_scheduler,
num_local_schedulers=num_local_schedulers,
start_ray_local=True,
num_cpus=[num_workers_per_scheduler] * num_local_schedulers,
redirect_output=True)
@ray.remote
def f():
ray.worker.global_worker.plasma_client.wait([obj_id])
ray.worker._init(
num_workers=1,
start_workers_from_local_scheduler=False,
start_ray_local=True,
redirect_output=True)
# Have the worker wait in a get call.
f.remote()
# Kill the worker.
time.sleep(1)
(ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0]
.terminate())
time.sleep(0.1)
# Seal the object so the store attempts to notify the worker that the
# get has been fulfilled.
ray.worker.global_worker.plasma_client.create(
pa.plasma.ObjectID(obj_id), 100)
ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id))
time.sleep(0.1)
# Make sure that nothing has died.
assert ray.services.all_processes_alive(
exclude=[ray.services.PROCESS_TYPE_WORKER])
def _testWorkerFailed(self, num_local_schedulers):
if use_raylet:
# Submit many tasks with many dependencies.
@ray.remote
def f(x):
time.sleep(0.5)
return x
num_initial_workers = 4
ray.worker._init(
num_workers=(num_initial_workers * num_local_schedulers),
num_local_schedulers=num_local_schedulers,
start_workers_from_local_scheduler=False,
start_ray_local=True,
num_cpus=[num_initial_workers] * num_local_schedulers,
redirect_output=True)
@ray.remote
def g(*xs):
return 1
# Kill the component on all nodes except the head node as the tasks
# execute. Do this in a loop while submitting tasks between each
# component failure.
# NOTE(swang): Legacy ray hangs on this test if the plasma manager
# is killed.
time.sleep(0.1)
components = ray.services.all_processes[component_type]
for process in components[1:]:
# Submit a round of tasks with many dependencies.
x = 1
for _ in range(1000):
x = f.remote(x)
xs = [g.remote(1)]
for _ in range(100):
xs.append(g.remote(*xs))
xs.append(g.remote(1))
# Kill a component on one of the nodes.
process.terminate()
time.sleep(1)
process.kill()
process.wait()
assert not process.poll() is None
# Make sure that we can still get the objects after the
# executing tasks died.
ray.get(x)
ray.get(xs)
else:
@ray.remote
def f(x, j):
time.sleep(0.2)
return x
# Submit more tasks than there are workers so that all workers and
# cores are utilized.
object_ids = [
f.remote(i)
for i in range(num_initial_workers * num_local_schedulers)
f.remote(i, 0)
for i in range(num_workers_per_scheduler * num_local_schedulers)
]
object_ids += [f.remote(object_id) for object_id in object_ids]
# Allow the tasks some time to begin executing.
object_ids += [f.remote(object_id, 1) for object_id in object_ids]
object_ids += [f.remote(object_id, 2) for object_id in object_ids]
# Kill the component on all nodes except the head node as the tasks
# execute.
time.sleep(0.1)
# Kill the workers as the tasks execute.
for worker in (
ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER]):
worker.terminate()
time.sleep(0.1)
# Make sure that we can still get the objects after the executing tasks
# died.
ray.get(object_ids)
components = ray.services.all_processes[component_type]
for process in components[1:]:
process.terminate()
time.sleep(1)
def testWorkerFailed(self):
self._testWorkerFailed(1)
for process in components[1:]:
process.kill()
process.wait()
assert not process.poll() is None
def testWorkerFailedMultinode(self):
self._testWorkerFailed(4)
# Make sure that we can still get the objects after the executing
# tasks died.
results = ray.get(object_ids)
expected_results = 4 * list(
range(num_workers_per_scheduler * num_local_schedulers))
assert results == expected_results
def _testComponentFailed(self, component_type):
"""Kill a component on all worker nodes and check workload succeeds."""
# Raylet is able to pass a harder failure test than legacy ray.
use_raylet = os.environ.get("RAY_USE_XRAY") == "1"
# Start with 4 workers and 4 cores.
num_local_schedulers = 4
num_workers_per_scheduler = 8
ray.worker._init(
num_workers=num_workers_per_scheduler,
num_local_schedulers=num_local_schedulers,
start_ray_local=True,
num_cpus=[num_workers_per_scheduler] * num_local_schedulers,
redirect_output=True)
if use_raylet:
# Submit many tasks with many dependencies.
@ray.remote
def f(x):
return x
@ray.remote
def g(*xs):
return 1
# Kill the component on all nodes except the head node as the tasks
# execute. Do this in a loop while submitting tasks between each
# component failure.
# NOTE(swang): Legacy ray hangs on this test if the plasma manager
# is killed.
time.sleep(0.1)
components = ray.services.all_processes[component_type]
for process in components[1:]:
# Submit a round of tasks with many dependencies.
x = 1
for _ in range(1000):
x = f.remote(x)
xs = [g.remote(1)]
for _ in range(100):
xs.append(g.remote(*xs))
xs.append(g.remote(1))
# Kill a component on one of the nodes.
process.terminate()
time.sleep(1)
process.kill()
process.wait()
assert not process.poll() is None
# Make sure that we can still get the objects after the
# executing tasks died.
ray.get(x)
ray.get(xs)
def check_components_alive(component_type, check_component_alive):
"""Check that a given component type is alive on all worker nodes.
"""
components = ray.services.all_processes[component_type][1:]
for component in components:
if check_component_alive:
assert component.poll() is None
else:
print("waiting for " + component_type + " with PID " +
str(component.pid) + "to terminate")
component.wait()
print("done waiting for " + component_type + " with PID " +
str(component.pid) + "to terminate")
assert not component.poll() is None
@ray.remote
def f(x, j):
time.sleep(0.2)
return x
# Submit more tasks than there are workers so that all workers and
# cores are utilized.
object_ids = [
f.remote(i, 0) for i in range(num_workers_per_scheduler *
num_local_schedulers)
]
object_ids += [f.remote(object_id, 1) for object_id in object_ids]
object_ids += [f.remote(object_id, 2) for object_id in object_ids]
@pytest.mark.skipif(
os.environ.get("RAY_USE_XRAY") != "1",
reason="This test only makes sense with xray.")
def test_raylet_failed():
# Kill all local schedulers on worker nodes.
_test_component_failed(ray.services.PROCESS_TYPE_RAYLET)
# Kill the component on all nodes except the head node as the tasks
# execute.
time.sleep(0.1)
components = ray.services.all_processes[component_type]
for process in components[1:]:
process.terminate()
time.sleep(1)
# The plasma stores and plasma managers should still be alive on the
# worker nodes.
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, True)
for process in components[1:]:
process.kill()
process.wait()
assert not process.poll() is None
ray.shutdown()
# Make sure that we can still get the objects after the executing
# tasks died.
results = ray.get(object_ids)
expected_results = 4 * list(
range(num_workers_per_scheduler * num_local_schedulers))
assert results == expected_results
def check_components_alive(self, component_type, check_component_alive):
"""Check that a given component type is alive on all worker nodes.
"""
components = ray.services.all_processes[component_type][1:]
for component in components:
if check_component_alive:
assert component.poll() is None
else:
print("waiting for " + component_type + " with PID " +
str(component.pid) + "to terminate")
component.wait()
print("done waiting for " + component_type + " with PID " +
str(component.pid) + "to terminate")
assert not component.poll() is None
@pytest.mark.skipif(
os.environ.get("RAY_USE_XRAY") == "1",
reason="This test does not make sense with xray.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_local_scheduler_failed():
# Kill all local schedulers on worker nodes.
_test_component_failed(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER)
@unittest.skipIf(not os.environ.get('RAY_USE_XRAY', False),
"Only tests Raylet failure.")
def testRayletFailed(self):
# Kill all local schedulers on worker nodes.
self._testComponentFailed(ray.services.PROCESS_TYPE_RAYLET)
# The plasma stores and plasma managers should still be alive on the
# worker nodes.
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, True)
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, True)
check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, False)
# The plasma stores and plasma managers should still be alive on the
# worker nodes.
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE,
True)
ray.shutdown()
@unittest.skipIf(
os.environ.get('RAY_USE_XRAY', False),
"Raylet codepath does not have this component")
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.")
def testLocalSchedulerFailed(self):
# Kill all local schedulers on worker nodes.
self._testComponentFailed(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER)
# The plasma stores and plasma managers should still be alive on the
# worker nodes.
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE,
True)
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER,
True)
self.check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER,
False)
@pytest.mark.skipif(
os.environ.get("RAY_USE_XRAY") == "1",
reason="This test does not make sense with xray.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_plasma_manager_failed():
# Kill all plasma managers on worker nodes.
_test_component_failed(ray.services.PROCESS_TYPE_PLASMA_MANAGER)
@unittest.skipIf(
os.environ.get('RAY_USE_XRAY', False),
"Raylet codepath does not have this component")
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.")
def testPlasmaManagerFailed(self):
# Kill all plasma managers on worker nodes.
self._testComponentFailed(ray.services.PROCESS_TYPE_PLASMA_MANAGER)
# The plasma stores should still be alive (but unreachable) on the
# worker nodes.
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, True)
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, False)
check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, False)
# The plasma stores should still be alive (but unreachable) on the
# worker nodes.
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE,
True)
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER,
False)
self.check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER,
False)
ray.shutdown()
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.")
def testPlasmaStoreFailed(self):
# Kill all plasma stores on worker nodes.
self._testComponentFailed(ray.services.PROCESS_TYPE_PLASMA_STORE)
# No processes should be left alive on the worker nodes.
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE,
False)
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER,
False)
self.check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER,
False)
self.check_components_alive(ray.services.PROCESS_TYPE_RAYLET, False)
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_plasma_store_failed():
# Kill all plasma stores on worker nodes.
_test_component_failed(ray.services.PROCESS_TYPE_PLASMA_STORE)
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False),
"Not working with new GCS API.")
def testDriverLivesSequential(self):
ray.worker.init(redirect_output=True)
all_processes = ray.services.all_processes
processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] +
all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] +
all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] +
all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER]
+ all_processes[ray.services.PROCESS_TYPE_RAYLET])
# No processes should be left alive on the worker nodes.
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, False)
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, False)
check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, False)
check_components_alive(ray.services.PROCESS_TYPE_RAYLET, False)
# Kill all the components sequentially.
for process in processes:
process.terminate()
time.sleep(0.1)
process.kill()
process.wait()
ray.shutdown()
# If the driver can reach the tearDown method, then it is still alive.
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False),
"Not working with new GCS API.")
def testDriverLivesParallel(self):
ray.worker.init(redirect_output=True)
all_processes = ray.services.all_processes
processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] +
all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] +
all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] +
all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER]
+ all_processes[ray.services.PROCESS_TYPE_RAYLET])
# Kill all the components in parallel.
for process in processes:
process.terminate()
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_driver_lives_sequential():
ray.worker.init()
all_processes = ray.services.all_processes
processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] +
all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] +
all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] +
all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER] +
all_processes[ray.services.PROCESS_TYPE_RAYLET])
# Kill all the components sequentially.
for process in processes:
process.terminate()
time.sleep(0.1)
for process in processes:
process.kill()
process.kill()
process.wait()
for process in processes:
process.wait()
# If the driver can reach the tearDown method, then it is still alive.
ray.shutdown()
# If the driver can reach the tearDown method, then it is still alive.
if __name__ == "__main__":
unittest.main(verbosity=2)
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_driver_lives_parallel():
ray.worker.init()
all_processes = ray.services.all_processes
processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] +
all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] +
all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] +
all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER] +
all_processes[ray.services.PROCESS_TYPE_RAYLET])
# Kill all the components in parallel.
for process in processes:
process.terminate()
time.sleep(0.1)
for process in processes:
process.kill()
for process in processes:
process.wait()
# If the driver can reach the tearDown method, then it is still alive.
ray.shutdown()
+1 -1
View File
@@ -13,7 +13,7 @@ def parse_client(addr_port_str):
return redis.StrictRedis(host=redis_address, port=redis_port)
@unittest.skipIf(not os.environ.get('RAY_USE_NEW_GCS', False),
@unittest.skipIf(not os.environ.get("RAY_USE_NEW_GCS", False),
"Tests functionality of the new GCS.")
class CredisTest(unittest.TestCase):
def setUp(self):
+371 -396
View File
@@ -8,7 +8,6 @@ import ray
import sys
import tempfile
import time
import unittest
import ray.ray_constants as ray_constants
import pytest
@@ -27,481 +26,457 @@ def wait_for_errors(error_type, num_errors, timeout=10):
raise Exception("Timing out of wait.")
class TaskStatusTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
@pytest.fixture
def ray_start_regular():
# Start the Ray processes.
ray.init(num_cpus=2)
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
def testFailedTask(self):
@ray.remote
def throw_exception_fct1():
raise Exception("Test function 1 intentionally failed.")
@ray.remote
def throw_exception_fct2():
raise Exception("Test function 2 intentionally failed.")
def test_failed_task(ray_start_regular):
@ray.remote
def throw_exception_fct1():
raise Exception("Test function 1 intentionally failed.")
@ray.remote(num_return_vals=3)
def throw_exception_fct3(x):
raise Exception("Test function 3 intentionally failed.")
@ray.remote
def throw_exception_fct2():
raise Exception("Test function 2 intentionally failed.")
ray.init(num_workers=3)
@ray.remote(num_return_vals=3)
def throw_exception_fct3(x):
raise Exception("Test function 3 intentionally failed.")
throw_exception_fct1.remote()
throw_exception_fct1.remote()
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
assert len(relevant_errors(ray_constants.TASK_PUSH_ERROR)) == 2
for task in relevant_errors(ray_constants.TASK_PUSH_ERROR):
msg = task.get("message")
assert "Test function 1 intentionally failed." in msg
throw_exception_fct1.remote()
throw_exception_fct1.remote()
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
assert len(relevant_errors(ray_constants.TASK_PUSH_ERROR)) == 2
for task in relevant_errors(ray_constants.TASK_PUSH_ERROR):
msg = task.get("message")
assert "Test function 1 intentionally failed." in msg
x = throw_exception_fct2.remote()
x = throw_exception_fct2.remote()
try:
ray.get(x)
except Exception as e:
assert "Test function 2 intentionally failed." in str(e)
else:
# ray.get should throw an exception.
assert False
x, y, z = throw_exception_fct3.remote(1.0)
for ref in [x, y, z]:
try:
ray.get(x)
ray.get(ref)
except Exception as e:
assert "Test function 2 intentionally failed." in str(e)
assert "Test function 3 intentionally failed." in str(e)
else:
# ray.get should throw an exception.
assert False
x, y, z = throw_exception_fct3.remote(1.0)
for ref in [x, y, z]:
try:
ray.get(ref)
except Exception as e:
assert "Test function 3 intentionally failed." in str(e)
else:
# ray.get should throw an exception.
assert False
@ray.remote
def f():
raise Exception("This function failed.")
@ray.remote
def f():
raise Exception("This function failed.")
try:
ray.get(f.remote())
except Exception as e:
assert "This function failed." in str(e)
else:
# ray.get should throw an exception.
assert False
try:
ray.get(f.remote())
except Exception as e:
assert "This function failed." in str(e)
else:
# ray.get should throw an exception.
assert False
def testFailImportingRemoteFunction(self):
ray.init(num_workers=2)
# Create the contents of a temporary Python file.
temporary_python_file = """
def test_fail_importing_remote_function(ray_start_regular):
# Create the contents of a temporary Python file.
temporary_python_file = """
def temporary_helper_function():
return 1
"""
f = tempfile.NamedTemporaryFile(suffix=".py")
f.write(temporary_python_file.encode("ascii"))
f.flush()
directory = os.path.dirname(f.name)
# Get the module name and strip ".py" from the end.
module_name = os.path.basename(f.name)[:-3]
sys.path.append(directory)
module = __import__(module_name)
f = tempfile.NamedTemporaryFile(suffix=".py")
f.write(temporary_python_file.encode("ascii"))
f.flush()
directory = os.path.dirname(f.name)
# Get the module name and strip ".py" from the end.
module_name = os.path.basename(f.name)[:-3]
sys.path.append(directory)
module = __import__(module_name)
# Define a function that closes over this temporary module. This should
# fail when it is unpickled.
@ray.remote
def g():
return module.temporary_python_file()
# Define a function that closes over this temporary module. This should
# fail when it is unpickled.
@ray.remote
def g():
return module.temporary_python_file()
wait_for_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR, 2)
assert "No module named" in ray.error_info()[0]["message"]
assert "No module named" in ray.error_info()[1]["message"]
wait_for_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR, 2)
assert "No module named" in ray.error_info()[0]["message"]
assert "No module named" in ray.error_info()[1]["message"]
# Check that if we try to call the function it throws an exception and
# does not hang.
for _ in range(10):
with pytest.raises(Exception):
ray.get(g.remote())
# Check that if we try to call the function it throws an exception and
# does not hang.
for _ in range(10):
with pytest.raises(Exception):
ray.get(g.remote())
f.close()
f.close()
# Clean up the junk we added to sys.path.
sys.path.pop(-1)
# Clean up the junk we added to sys.path.
sys.path.pop(-1)
def testFailedFunctionToRun(self):
ray.init(num_workers=2)
def f(worker):
if ray.worker.global_worker.mode == ray.WORKER_MODE:
raise Exception("Function to run failed.")
def test_failed_function_to_run(ray_start_regular):
def f(worker):
if ray.worker.global_worker.mode == ray.WORKER_MODE:
raise Exception("Function to run failed.")
ray.worker.global_worker.run_function_on_all_workers(f)
wait_for_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR, 2)
# Check that the error message is in the task info.
error_info = ray.error_info()
assert len(error_info) == 2
assert "Function to run failed." in error_info[0]["message"]
assert "Function to run failed." in error_info[1]["message"]
ray.worker.global_worker.run_function_on_all_workers(f)
wait_for_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR, 2)
# Check that the error message is in the task info.
error_info = ray.error_info()
assert len(error_info) == 2
assert "Function to run failed." in error_info[0]["message"]
assert "Function to run failed." in error_info[1]["message"]
def testFailImportingActor(self):
ray.init(num_workers=2)
# Create the contents of a temporary Python file.
temporary_python_file = """
def test_fail_importing_actor(ray_start_regular):
# Create the contents of a temporary Python file.
temporary_python_file = """
def temporary_helper_function():
return 1
"""
f = tempfile.NamedTemporaryFile(suffix=".py")
f.write(temporary_python_file.encode("ascii"))
f.flush()
directory = os.path.dirname(f.name)
# Get the module name and strip ".py" from the end.
module_name = os.path.basename(f.name)[:-3]
sys.path.append(directory)
module = __import__(module_name)
f = tempfile.NamedTemporaryFile(suffix=".py")
f.write(temporary_python_file.encode("ascii"))
f.flush()
directory = os.path.dirname(f.name)
# Get the module name and strip ".py" from the end.
module_name = os.path.basename(f.name)[:-3]
sys.path.append(directory)
module = __import__(module_name)
# Define an actor that closes over this temporary module. This should
# fail when it is unpickled.
@ray.remote
class Foo(object):
def __init__(self):
self.x = module.temporary_python_file()
# Define an actor that closes over this temporary module. This should
# fail when it is unpickled.
@ray.remote
class Foo(object):
def __init__(self):
self.x = module.temporary_python_file()
def get_val(self):
return 1
def get_val(self):
return 1
# There should be no errors yet.
assert len(ray.error_info()) == 0
# There should be no errors yet.
assert len(ray.error_info()) == 0
# Create an actor.
foo = Foo.remote()
# Create an actor.
foo = Foo.remote()
# Wait for the error to arrive.
wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1)
assert "No module named" in ray.error_info()[0]["message"]
# Wait for the error to arrive.
wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1)
assert "No module named" in ray.error_info()[0]["message"]
# Wait for the error from when the __init__ tries to run.
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
assert ("failed to be imported, and so cannot execute this method" in
ray.error_info()[1]["message"])
# Wait for the error from when the __init__ tries to run.
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
assert ("failed to be imported, and so cannot execute this method" in
ray.error_info()[1]["message"])
# Check that if we try to get the function it throws an exception and
# does not hang.
with pytest.raises(Exception):
ray.get(foo.get_val.remote())
# Check that if we try to get the function it throws an exception and
# does not hang.
with pytest.raises(Exception):
ray.get(foo.get_val.remote())
# Wait for the error from when the call to get_val.
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
assert ("failed to be imported, and so cannot execute this method" in
ray.error_info()[2]["message"])
# Wait for the error from when the call to get_val.
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
assert ("failed to be imported, and so cannot execute this method" in
ray.error_info()[2]["message"])
f.close()
f.close()
# Clean up the junk we added to sys.path.
sys.path.pop(-1)
# Clean up the junk we added to sys.path.
sys.path.pop(-1)
class ActorTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
def test_failed_actor_init(ray_start_regular):
error_message1 = "actor constructor failed"
error_message2 = "actor method failed"
def testFailedActorInit(self):
ray.init(num_workers=0)
@ray.remote
class FailedActor(object):
def __init__(self):
raise Exception(error_message1)
error_message1 = "actor constructor failed"
error_message2 = "actor method failed"
def get_val(self):
return 1
@ray.remote
class FailedActor(object):
def __init__(self):
raise Exception(error_message1)
def fail_method(self):
raise Exception(error_message2)
def get_val(self):
return 1
a = FailedActor.remote()
def fail_method(self):
raise Exception(error_message2)
# Make sure that we get errors from a failed constructor.
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
assert len(ray.error_info()) == 1
assert error_message1 in ray.error_info()[0]["message"]
a = FailedActor.remote()
# Make sure that we get errors from a failed constructor.
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
assert len(ray.error_info()) == 1
assert error_message1 in ray.error_info()[0]["message"]
# Make sure that we get errors from a failed method.
a.fail_method.remote()
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
assert len(ray.error_info()) == 2
assert error_message2 in ray.error_info()[1]["message"]
def testIncorrectMethodCalls(self):
ray.init(num_workers=0)
@ray.remote
class Actor(object):
def __init__(self, missing_variable_name):
pass
def get_val(self, x):
pass
# Make sure that we get errors if we call the constructor incorrectly.
# Create an actor with too few arguments.
with pytest.raises(Exception):
a = Actor.remote()
# Create an actor with too many arguments.
with pytest.raises(Exception):
a = Actor.remote(1, 2)
# Create an actor the correct number of arguments.
a = Actor.remote(1)
# Call a method with too few arguments.
with pytest.raises(Exception):
a.get_val.remote()
# Call a method with too many arguments.
with pytest.raises(Exception):
a.get_val.remote(1, 2)
# Call a method that doesn't exist.
with pytest.raises(AttributeError):
a.nonexistent_method()
with pytest.raises(AttributeError):
a.nonexistent_method.remote()
# Make sure that we get errors from a failed method.
a.fail_method.remote()
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
assert len(ray.error_info()) == 2
assert error_message2 in ray.error_info()[1]["message"]
class WorkerDeath(unittest.TestCase):
def tearDown(self):
ray.shutdown()
def testWorkerRaisingException(self):
ray.init(num_workers=1)
@ray.remote
def f():
ray.worker.global_worker._get_next_task_from_local_scheduler = None
# Running this task should cause the worker to raise an exception after
# the task has successfully completed.
f.remote()
wait_for_errors(ray_constants.WORKER_CRASH_PUSH_ERROR, 1)
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
assert len(ray.error_info()) == 2
def testWorkerDying(self):
ray.init(num_workers=0)
# Define a remote function that will kill the worker that runs it.
@ray.remote
def f():
eval("exit()")
f.remote()
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
error_info = ray.error_info()
assert len(error_info) == 1
assert "died or was killed while executing" in error_info[0]["message"]
def testActorWorkerDying(self):
ray.init(num_workers=0)
@ray.remote
class Actor(object):
def kill(self):
eval("exit()")
@ray.remote
def consume(x):
def test_incorrect_method_calls(ray_start_regular):
@ray.remote
class Actor(object):
def __init__(self, missing_variable_name):
pass
def get_val(self, x):
pass
# Make sure that we get errors if we call the constructor incorrectly.
# Create an actor with too few arguments.
with pytest.raises(Exception):
a = Actor.remote()
[obj], _ = ray.wait([a.kill.remote()], timeout=5000)
# Create an actor with too many arguments.
with pytest.raises(Exception):
a = Actor.remote(1, 2)
# Create an actor the correct number of arguments.
a = Actor.remote(1)
# Call a method with too few arguments.
with pytest.raises(Exception):
a.get_val.remote()
# Call a method with too many arguments.
with pytest.raises(Exception):
a.get_val.remote(1, 2)
# Call a method that doesn't exist.
with pytest.raises(AttributeError):
a.nonexistent_method()
with pytest.raises(AttributeError):
a.nonexistent_method.remote()
def test_worker_raising_exception(ray_start_regular):
@ray.remote
def f():
ray.worker.global_worker._get_next_task_from_local_scheduler = None
# Running this task should cause the worker to raise an exception after
# the task has successfully completed.
f.remote()
wait_for_errors(ray_constants.WORKER_CRASH_PUSH_ERROR, 1)
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
assert len(ray.error_info()) == 2
def test_worker_dying(ray_start_regular):
# Define a remote function that will kill the worker that runs it.
@ray.remote
def f():
eval("exit()")
f.remote()
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
error_info = ray.error_info()
assert len(error_info) == 1
assert "died or was killed while executing" in error_info[0]["message"]
def test_actor_worker_dying(ray_start_regular):
@ray.remote
class Actor(object):
def kill(self):
eval("exit()")
@ray.remote
def consume(x):
pass
a = Actor.remote()
[obj], _ = ray.wait([a.kill.remote()], timeout=5000)
with pytest.raises(Exception):
ray.get(obj)
with pytest.raises(Exception):
ray.get(consume.remote(obj))
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
def test_actor_worker_dying_future_tasks(ray_start_regular):
@ray.remote
class Actor(object):
def getpid(self):
return os.getpid()
def sleep(self):
time.sleep(1)
a = Actor.remote()
pid = ray.get(a.getpid.remote())
tasks1 = [a.sleep.remote() for _ in range(10)]
os.kill(pid, 9)
time.sleep(0.1)
tasks2 = [a.sleep.remote() for _ in range(10)]
for obj in tasks1 + tasks2:
with pytest.raises(Exception):
ray.get(obj)
with pytest.raises(Exception):
ray.get(consume.remote(obj))
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
def testActorWorkerDyingFutureTasks(self):
ray.init(num_workers=0)
@ray.remote
class Actor(object):
def getpid(self):
return os.getpid()
def sleep(self):
time.sleep(1)
a = Actor.remote()
pid = ray.get(a.getpid.remote())
tasks1 = [a.sleep.remote() for _ in range(10)]
os.kill(pid, 9)
time.sleep(0.1)
tasks2 = [a.sleep.remote() for _ in range(10)]
for obj in tasks1 + tasks2:
with pytest.raises(Exception):
ray.get(obj)
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
def testActorWorkerDyingNothingInProgress(self):
ray.init(num_workers=0)
@ray.remote
class Actor(object):
def getpid(self):
return os.getpid()
a = Actor.remote()
pid = ray.get(a.getpid.remote())
os.kill(pid, 9)
time.sleep(0.1)
task2 = a.getpid.remote()
with pytest.raises(Exception):
ray.get(task2)
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
@unittest.skipIf(
def test_actor_worker_dying_nothing_in_progress(ray_start_regular):
@ray.remote
class Actor(object):
def getpid(self):
return os.getpid()
a = Actor.remote()
pid = ray.get(a.getpid.remote())
os.kill(pid, 9)
time.sleep(0.1)
task2 = a.getpid.remote()
with pytest.raises(Exception):
ray.get(task2)
@pytest.fixture
def ray_start_object_store_memory():
# Start the Ray processes.
store_size = 10**6
ray.init(num_cpus=1, object_store_memory=store_size)
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
@pytest.mark.skipif(
os.environ.get("RAY_USE_XRAY") == "1",
"This test does not work with xray yet.")
class PutErrorTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
reason="This test does not work with xray yet.")
def test_put_error1(ray_start_object_store_memory):
num_objects = 3
object_size = 4 * 10**5
def testPutError1(self):
store_size = 10**6
ray.worker._init(start_ray_local=True, object_store_memory=store_size)
# Define a task with a single dependency, a numpy array, that returns
# another array.
@ray.remote
def single_dependency(i, arg):
arg = np.copy(arg)
arg[0] = i
return arg
num_objects = 3
object_size = 4 * 10**5
@ray.remote
def put_arg_task():
# Launch num_objects instances of the remote task, each dependent
# on the one before it. The result of the first task should get
# evicted.
args = []
arg = single_dependency.remote(0, np.zeros(
object_size, dtype=np.uint8))
for i in range(num_objects):
arg = single_dependency.remote(i, arg)
args.append(arg)
# Define a task with a single dependency, a numpy array, that returns
# another array.
@ray.remote
def single_dependency(i, arg):
arg = np.copy(arg)
arg[0] = i
return arg
# Get the last value to force all tasks to finish.
value = ray.get(args[-1])
assert value[0] == i
@ray.remote
def put_arg_task():
# Launch num_objects instances of the remote task, each dependent
# on the one before it. The result of the first task should get
# evicted.
args = []
arg = single_dependency.remote(
0, np.zeros(object_size, dtype=np.uint8))
for i in range(num_objects):
arg = single_dependency.remote(i, arg)
args.append(arg)
# Get the first value (which should have been evicted) to force
# reconstruction. Currently, since we're not able to reconstruct
# `ray.put` objects that were evicted and whose originating tasks
# are still running, this for-loop should hang and push an error to
# the driver.
ray.get(args[0])
# Get the last value to force all tasks to finish.
value = ray.get(args[-1])
assert value[0] == i
put_arg_task.remote()
# Get the first value (which should have been evicted) to force
# reconstruction. Currently, since we're not able to reconstruct
# `ray.put` objects that were evicted and whose originating tasks
# are still running, this for-loop should hang and push an error to
# the driver.
ray.get(args[0])
put_arg_task.remote()
# Make sure we receive the correct error message.
wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)
def testPutError2(self):
# This is the same as the previous test, but it calls ray.put directly.
store_size = 10**6
ray.worker._init(start_ray_local=True, object_store_memory=store_size)
num_objects = 3
object_size = 4 * 10**5
# Define a task with a single dependency, a numpy array, that returns
# another array.
@ray.remote
def single_dependency(i, arg):
arg = np.copy(arg)
arg[0] = i
return arg
@ray.remote
def put_task():
# Launch num_objects instances of the remote task, each dependent
# on the one before it. The result of the first task should get
# evicted.
args = []
arg = ray.put(np.zeros(object_size, dtype=np.uint8))
for i in range(num_objects):
arg = single_dependency.remote(i, arg)
args.append(arg)
# Get the last value to force all tasks to finish.
value = ray.get(args[-1])
assert value[0] == i
# Get the first value (which should have been evicted) to force
# reconstruction. Currently, since we're not able to reconstruct
# `ray.put` objects that were evicted and whose originating tasks
# are still running, this for-loop should hang and push an error to
# the driver.
ray.get(args[0])
put_task.remote()
# Make sure we receive the correct error message.
wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)
# Make sure we receive the correct error message.
wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)
class ConfigurationTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
@pytest.mark.skipif(
os.environ.get("RAY_USE_XRAY") == "1",
reason="This test does not work with xray yet.")
def test_put_error2(ray_start_object_store_memory):
# This is the same as the previous test, but it calls ray.put directly.
num_objects = 3
object_size = 4 * 10**5
def testVersionMismatch(self):
ray_version = ray.__version__
ray.__version__ = "fake ray version"
# Define a task with a single dependency, a numpy array, that returns
# another array.
@ray.remote
def single_dependency(i, arg):
arg = np.copy(arg)
arg[0] = i
return arg
ray.init(num_workers=1)
@ray.remote
def put_task():
# Launch num_objects instances of the remote task, each dependent
# on the one before it. The result of the first task should get
# evicted.
args = []
arg = ray.put(np.zeros(object_size, dtype=np.uint8))
for i in range(num_objects):
arg = single_dependency.remote(i, arg)
args.append(arg)
wait_for_errors(ray_constants.VERSION_MISMATCH_PUSH_ERROR, 1)
# Get the last value to force all tasks to finish.
value = ray.get(args[-1])
assert value[0] == i
ray.__version__ = ray_version
# Get the first value (which should have been evicted) to force
# reconstruction. Currently, since we're not able to reconstruct
# `ray.put` objects that were evicted and whose originating tasks
# are still running, this for-loop should hang and push an error to
# the driver.
ray.get(args[0])
put_task.remote()
# Make sure we receive the correct error message.
wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)
class WarningTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
def test_version_mismatch():
ray_version = ray.__version__
ray.__version__ = "fake ray version"
def testExportLargeObjects(self):
import ray.ray_constants as ray_constants
ray.init(num_cpus=1)
ray.init(num_workers=1)
wait_for_errors(ray_constants.VERSION_MISMATCH_PUSH_ERROR, 1)
large_object = np.zeros(2 * ray_constants.PICKLE_OBJECT_WARNING_SIZE)
# Reset the version.
ray.__version__ = ray_version
@ray.remote
def f():
ray.shutdown()
def test_export_large_objects(ray_start_regular):
import ray.ray_constants as ray_constants
large_object = np.zeros(2 * ray_constants.PICKLE_OBJECT_WARNING_SIZE)
@ray.remote
def f():
large_object
# Make sure that a warning is generated.
wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 1)
@ray.remote
class Foo(object):
def __init__(self):
large_object
# Make sure that a warning is generated.
wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 1)
Foo.remote()
@ray.remote
class Foo(object):
def __init__(self):
large_object
Foo.remote()
# Make sure that a warning is generated.
wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 2)
if __name__ == "__main__":
unittest.main(verbosity=2)
# Make sure that a warning is generated.
wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 2)
+107 -109
View File
@@ -2,120 +2,118 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pytest
import os
import unittest
import ray
import time
import numpy as np
class MicroBenchmarkTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
def testTiming(self):
@ray.remote
def empty_function():
pass
@ray.remote
def trivial_function():
return 1
ray.init(num_workers=3)
# Measure the time required to submit a remote task to the scheduler.
elapsed_times = []
for _ in range(1000):
start_time = time.time()
empty_function.remote()
end_time = time.time()
elapsed_times.append(end_time - start_time)
elapsed_times = np.sort(elapsed_times)
average_elapsed_time = sum(elapsed_times) / 1000
print("Time required to submit an empty function call:")
print(" Average: {}".format(average_elapsed_time))
print(" 90th percentile: {}".format(elapsed_times[900]))
print(" 99th percentile: {}".format(elapsed_times[990]))
print(" worst: {}".format(elapsed_times[999]))
# average_elapsed_time should be about 0.00038.
# Measure the time required to submit a remote task to the scheduler
# (where the remote task returns one value).
elapsed_times = []
for _ in range(1000):
start_time = time.time()
trivial_function.remote()
end_time = time.time()
elapsed_times.append(end_time - start_time)
elapsed_times = np.sort(elapsed_times)
average_elapsed_time = sum(elapsed_times) / 1000
print("Time required to submit a trivial function call:")
print(" Average: {}".format(average_elapsed_time))
print(" 90th percentile: {}".format(elapsed_times[900]))
print(" 99th percentile: {}".format(elapsed_times[990]))
print(" worst: {}".format(elapsed_times[999]))
# average_elapsed_time should be about 0.001.
# Measure the time required to submit a remote task to the scheduler
# and get the result.
elapsed_times = []
for _ in range(1000):
start_time = time.time()
x = trivial_function.remote()
ray.get(x)
end_time = time.time()
elapsed_times.append(end_time - start_time)
elapsed_times = np.sort(elapsed_times)
average_elapsed_time = sum(elapsed_times) / 1000
print("Time required to submit a trivial function call and get the "
"result:")
print(" Average: {}".format(average_elapsed_time))
print(" 90th percentile: {}".format(elapsed_times[900]))
print(" 99th percentile: {}".format(elapsed_times[990]))
print(" worst: {}".format(elapsed_times[999]))
# average_elapsed_time should be about 0.0013.
# Measure the time required to do do a put.
elapsed_times = []
for _ in range(1000):
start_time = time.time()
ray.put(1)
end_time = time.time()
elapsed_times.append(end_time - start_time)
elapsed_times = np.sort(elapsed_times)
average_elapsed_time = sum(elapsed_times) / 1000
print("Time required to put an int:")
print(" Average: {}".format(average_elapsed_time))
print(" 90th percentile: {}".format(elapsed_times[900]))
print(" 99th percentile: {}".format(elapsed_times[990]))
print(" worst: {}".format(elapsed_times[999]))
# average_elapsed_time should be about 0.00087.
def testCache(self):
ray.init(num_workers=1)
A = np.random.rand(1, 1000000)
v = np.random.rand(1000000)
A_id = ray.put(A)
v_id = ray.put(v)
a = time.time()
for i in range(100):
A.dot(v)
b = time.time() - a
c = time.time()
for i in range(100):
ray.get(A_id).dot(ray.get(v_id))
d = time.time() - c
if d > 1.5 * b:
if os.getenv("TRAVIS") is None:
raise Exception("The caching test was too slow. "
"d = {}, b = {}".format(d, b))
else:
print("WARNING: The caching test was too slow. "
"d = {}, b = {}".format(d, b))
@pytest.fixture
def ray_start_regular():
# Start the Ray processes.
ray.init(num_cpus=3)
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
if __name__ == "__main__":
unittest.main(verbosity=2)
def test_timing(ray_start_regular):
@ray.remote
def empty_function():
pass
@ray.remote
def trivial_function():
return 1
# Measure the time required to submit a remote task to the scheduler.
elapsed_times = []
for _ in range(1000):
start_time = time.time()
empty_function.remote()
end_time = time.time()
elapsed_times.append(end_time - start_time)
elapsed_times = np.sort(elapsed_times)
average_elapsed_time = sum(elapsed_times) / 1000
print("Time required to submit an empty function call:")
print(" Average: {}".format(average_elapsed_time))
print(" 90th percentile: {}".format(elapsed_times[900]))
print(" 99th percentile: {}".format(elapsed_times[990]))
print(" worst: {}".format(elapsed_times[999]))
# average_elapsed_time should be about 0.00038.
# Measure the time required to submit a remote task to the scheduler
# (where the remote task returns one value).
elapsed_times = []
for _ in range(1000):
start_time = time.time()
trivial_function.remote()
end_time = time.time()
elapsed_times.append(end_time - start_time)
elapsed_times = np.sort(elapsed_times)
average_elapsed_time = sum(elapsed_times) / 1000
print("Time required to submit a trivial function call:")
print(" Average: {}".format(average_elapsed_time))
print(" 90th percentile: {}".format(elapsed_times[900]))
print(" 99th percentile: {}".format(elapsed_times[990]))
print(" worst: {}".format(elapsed_times[999]))
# average_elapsed_time should be about 0.001.
# Measure the time required to submit a remote task to the scheduler
# and get the result.
elapsed_times = []
for _ in range(1000):
start_time = time.time()
x = trivial_function.remote()
ray.get(x)
end_time = time.time()
elapsed_times.append(end_time - start_time)
elapsed_times = np.sort(elapsed_times)
average_elapsed_time = sum(elapsed_times) / 1000
print("Time required to submit a trivial function call and get the "
"result:")
print(" Average: {}".format(average_elapsed_time))
print(" 90th percentile: {}".format(elapsed_times[900]))
print(" 99th percentile: {}".format(elapsed_times[990]))
print(" worst: {}".format(elapsed_times[999]))
# average_elapsed_time should be about 0.0013.
# Measure the time required to do do a put.
elapsed_times = []
for _ in range(1000):
start_time = time.time()
ray.put(1)
end_time = time.time()
elapsed_times.append(end_time - start_time)
elapsed_times = np.sort(elapsed_times)
average_elapsed_time = sum(elapsed_times) / 1000
print("Time required to put an int:")
print(" Average: {}".format(average_elapsed_time))
print(" 90th percentile: {}".format(elapsed_times[900]))
print(" 99th percentile: {}".format(elapsed_times[990]))
print(" worst: {}".format(elapsed_times[999]))
# average_elapsed_time should be about 0.00087.
def test_cache(ray_start_regular):
A = np.random.rand(1, 1000000)
v = np.random.rand(1000000)
A_id = ray.put(A)
v_id = ray.put(v)
a = time.time()
for i in range(100):
A.dot(v)
b = time.time() - a
c = time.time()
for i in range(100):
ray.get(A_id).dot(ray.get(v_id))
d = time.time() - c
if d > 1.5 * b:
if os.getenv("TRAVIS") is None:
raise Exception("The caching test was too slow. "
"d = {}, b = {}".format(d, b))
else:
print("WARNING: The caching test was too slow. "
"d = {}, b = {}".format(d, b))
+201 -189
View File
@@ -1,15 +1,16 @@
from __future__ import absolute_import, division, print_function
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pytest
import subprocess
import sys
import tempfile
import time
import unittest
import ray
from ray.test.test_utils import run_and_get_output
import pytest
def run_string_as_driver(driver_script):
@@ -30,52 +31,56 @@ def run_string_as_driver(driver_script):
return out
class MultiNodeTest(unittest.TestCase):
def setUp(self):
out = run_and_get_output(["ray", "start", "--head"])
# Get the redis address from the output.
redis_substring_prefix = "redis_address=\""
redis_address_location = (
out.find(redis_substring_prefix) + len(redis_substring_prefix))
redis_address = out[redis_address_location:]
self.redis_address = redis_address.split("\"")[0]
@pytest.fixture
def ray_start_head():
out = run_and_get_output(["ray", "start", "--head"])
# Get the redis address from the output.
redis_substring_prefix = "redis_address=\""
redis_address_location = (
out.find(redis_substring_prefix) + len(redis_substring_prefix))
redis_address = out[redis_address_location:]
redis_address = redis_address.split("\"")[0]
def tearDown(self):
ray.shutdown()
# Kill the Ray cluster.
subprocess.Popen(["ray", "stop"]).wait()
yield redis_address
def testErrorIsolation(self):
# Connect a driver to the Ray cluster.
ray.init(redis_address=self.redis_address)
# Disconnect from the Ray cluster.
ray.shutdown()
# Kill the Ray cluster.
subprocess.Popen(["ray", "stop"]).wait()
# There shouldn't be any errors yet.
assert len(ray.error_info()) == 0
error_string1 = "error_string1"
error_string2 = "error_string2"
def test_error_isolation(ray_start_head):
redis_address = ray_start_head
# Connect a driver to the Ray cluster.
ray.init(redis_address=redis_address)
@ray.remote
def f():
raise Exception(error_string1)
# There shouldn't be any errors yet.
assert len(ray.error_info()) == 0
# Run a remote function that throws an error.
with pytest.raises(Exception):
ray.get(f.remote())
error_string1 = "error_string1"
error_string2 = "error_string2"
# Wait for the error to appear in Redis.
while len(ray.error_info()) != 1:
time.sleep(0.1)
print("Waiting for error to appear.")
@ray.remote
def f():
raise Exception(error_string1)
# Make sure we got the error.
assert len(ray.error_info()) == 1
assert error_string1 in ray.error_info()[0]["message"]
# Run a remote function that throws an error.
with pytest.raises(Exception):
ray.get(f.remote())
# Start another driver and make sure that it does not receive this
# error. Make the other driver throw an error, and make sure it
# receives that error.
driver_script = """
# Wait for the error to appear in Redis.
while len(ray.error_info()) != 1:
time.sleep(0.1)
print("Waiting for error to appear.")
# Make sure we got the error.
assert len(ray.error_info()) == 1
assert error_string1 in ray.error_info()[0]["message"]
# Start another driver and make sure that it does not receive this
# error. Make the other driver throw an error, and make sure it
# receives that error.
driver_script = """
import ray
import time
@@ -101,25 +106,28 @@ assert len(ray.error_info()) == 1
assert "{}" in ray.error_info()[0]["message"]
print("success")
""".format(self.redis_address, error_string2, error_string2)
""".format(redis_address, error_string2, error_string2)
out = run_string_as_driver(driver_script)
# Make sure the other driver succeeded.
assert "success" in out
out = run_string_as_driver(driver_script)
# Make sure the other driver succeeded.
assert "success" in out
# Make sure that the other error message doesn't show up for this
# driver.
assert len(ray.error_info()) == 1
assert error_string1 in ray.error_info()[0]["message"]
# Make sure that the other error message doesn't show up for this
# driver.
assert len(ray.error_info()) == 1
assert error_string1 in ray.error_info()[0]["message"]
def testRemoteFunctionIsolation(self):
# This test will run multiple remote functions with the same names in
# two different drivers. Connect a driver to the Ray cluster.
ray.init(redis_address=self.redis_address)
# Start another driver and make sure that it can define and call its
# own commands with the same names.
driver_script = """
def test_remote_function_isolation(ray_start_head):
# This test will run multiple remote functions with the same names in
# two different drivers. Connect a driver to the Ray cluster.
redis_address = ray_start_head
ray.init(redis_address=redis_address)
# Start another driver and make sure that it can define and call its
# own commands with the same names.
driver_script = """
import ray
import time
ray.init(redis_address="{}")
@@ -133,32 +141,35 @@ for _ in range(10000):
result = ray.get([f.remote(), g.remote(0, 0)])
assert result == [3, 4]
print("success")
""".format(self.redis_address)
""".format(redis_address)
out = run_string_as_driver(driver_script)
out = run_string_as_driver(driver_script)
@ray.remote
def f():
return 1
@ray.remote
def f():
return 1
@ray.remote
def g(x):
return 2
@ray.remote
def g(x):
return 2
for _ in range(10000):
result = ray.get([f.remote(), g.remote(0)])
assert result == [1, 2]
for _ in range(10000):
result = ray.get([f.remote(), g.remote(0)])
assert result == [1, 2]
# Make sure the other driver succeeded.
assert "success" in out
# Make sure the other driver succeeded.
assert "success" in out
def testDriverExitingQuickly(self):
# This test will create some drivers that submit some tasks and then
# exit without waiting for the tasks to complete.
ray.init(redis_address=self.redis_address)
# Define a driver that creates an actor and exits.
driver_script1 = """
def test_driver_exiting_quickly(ray_start_head):
# This test will create some drivers that submit some tasks and then
# exit without waiting for the tasks to complete.
redis_address = ray_start_head
ray.init(redis_address=redis_address)
# Define a driver that creates an actor and exits.
driver_script1 = """
import ray
ray.init(redis_address="{}")
@ray.remote
@@ -167,10 +178,10 @@ class Foo(object):
pass
Foo.remote()
print("success")
""".format(self.redis_address)
""".format(redis_address)
# Define a driver that creates some tasks and exits.
driver_script2 = """
# Define a driver that creates some tasks and exits.
driver_script2 = """
import ray
ray.init(redis_address="{}")
@ray.remote
@@ -178,137 +189,142 @@ def f():
return 1
f.remote()
print("success")
""".format(self.redis_address)
""".format(redis_address)
# Create some drivers and let them exit and make sure everything is
# still alive.
for _ in range(3):
out = run_string_as_driver(driver_script1)
# Make sure the first driver ran to completion.
assert "success" in out
out = run_string_as_driver(driver_script2)
# Make sure the first driver ran to completion.
assert "success" in out
assert ray.services.all_processes_alive()
# Create some drivers and let them exit and make sure everything is
# still alive.
for _ in range(3):
out = run_string_as_driver(driver_script1)
# Make sure the first driver ran to completion.
assert "success" in out
out = run_string_as_driver(driver_script2)
# Make sure the first driver ran to completion.
assert "success" in out
assert ray.services.all_processes_alive()
class StartRayScriptTest(unittest.TestCase):
def testCallingStartRayHead(self):
# Test that we can call start-ray.sh with various command line
# parameters. TODO(rkn): This test only tests the --head code path. We
# should also test the non-head node code path.
def test_calling_start_ray_head():
# Test that we can call start-ray.sh with various command line
# parameters. TODO(rkn): This test only tests the --head code path. We
# should also test the non-head node code path.
# Test starting Ray with no arguments.
run_and_get_output(["ray", "start", "--head"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with no arguments.
run_and_get_output(["ray", "start", "--head"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with a number of workers specified.
run_and_get_output(["ray", "start", "--head", "--num-workers", "20"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with a number of workers specified.
run_and_get_output(["ray", "start", "--head", "--num-workers", "20"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with a redis port specified.
run_and_get_output(["ray", "start", "--head", "--redis-port", "6379"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with a redis port specified.
run_and_get_output(["ray", "start", "--head", "--redis-port", "6379"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with a node IP address specified.
run_and_get_output(
["ray", "start", "--head", "--node-ip-address", "127.0.0.1"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with a node IP address specified.
run_and_get_output(
["ray", "start", "--head", "--node-ip-address", "127.0.0.1"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with an object manager port specified.
run_and_get_output(
["ray", "start", "--head", "--object-manager-port", "12345"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with an object manager port specified.
run_and_get_output(
["ray", "start", "--head", "--object-manager-port", "12345"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with the number of CPUs specified.
run_and_get_output(["ray", "start", "--head", "--num-cpus", "2"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with the number of CPUs specified.
run_and_get_output(["ray", "start", "--head", "--num-cpus", "2"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with the number of GPUs specified.
run_and_get_output(["ray", "start", "--head", "--num-gpus", "100"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with the number of GPUs specified.
run_and_get_output(["ray", "start", "--head", "--num-gpus", "100"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with the max redis clients specified.
run_and_get_output(
["ray", "start", "--head", "--redis-max-clients", "100"])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with the max redis clients specified.
run_and_get_output(
["ray", "start", "--head", "--redis-max-clients", "100"])
subprocess.Popen(["ray", "stop"]).wait()
if "RAY_USE_NEW_GCS" not in os.environ:
# Test starting Ray with redis shard ports specified.
run_and_get_output([
"ray", "start", "--head", "--redis-shard-ports",
"6380,6381,6382"
])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with all arguments specified.
run_and_get_output([
"ray", "start", "--head", "--num-workers", "2", "--redis-port",
"6379", "--redis-shard-ports", "6380,6381,6382",
"--object-manager-port", "12345", "--num-cpus", "2",
"--num-gpus", "0", "--redis-max-clients", "100", "--resources",
"{\"Custom\": 1}"
])
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with invalid arguments.
with pytest.raises(Exception):
run_and_get_output([
"ray", "start", "--head", "--redis-address", "127.0.0.1:6379"
])
subprocess.Popen(["ray", "stop"]).wait()
def testUsingHostnames(self):
# Start the Ray processes on this machine.
if "RAY_USE_NEW_GCS" not in os.environ:
# Test starting Ray with redis shard ports specified.
run_and_get_output([
"ray", "start", "--head", "--node-ip-address=localhost",
"--redis-port=6379"
"ray", "start", "--head", "--redis-shard-ports", "6380,6381,6382"
])
ray.init(node_ip_address="localhost", redis_address="localhost:6379")
@ray.remote
def f():
return 1
assert ray.get(f.remote()) == 1
# Kill the Ray cluster.
subprocess.Popen(["ray", "stop"]).wait()
# Test starting Ray with all arguments specified.
run_and_get_output([
"ray", "start", "--head", "--num-workers", "2", "--redis-port",
"6379", "--redis-shard-ports", "6380,6381,6382",
"--object-manager-port", "12345", "--num-cpus", "2", "--num-gpus",
"0", "--redis-max-clients", "100", "--resources", "{\"Custom\": 1}"
])
subprocess.Popen(["ray", "stop"]).wait()
class MiscellaneousTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
# Test starting Ray with invalid arguments.
with pytest.raises(Exception):
run_and_get_output(
["ray", "start", "--head", "--redis-address", "127.0.0.1:6379"])
subprocess.Popen(["ray", "stop"]).wait()
def testConnectingInLocalCase(self):
address_info = ray.init(num_cpus=0)
# Define a driver that just connects to Redis.
driver_script = """
@pytest.fixture
def ray_start_head_local():
# Start the Ray processes on this machine.
run_and_get_output([
"ray", "start", "--head", "--node-ip-address=localhost",
"--redis-port=6379"
])
yield None
# Disconnect from the Ray cluster.
ray.shutdown()
# Kill the Ray cluster.
subprocess.Popen(["ray", "stop"]).wait()
def test_using_hostnames(ray_start_head_local):
ray.init(node_ip_address="localhost", redis_address="localhost:6379")
@ray.remote
def f():
return 1
assert ray.get(f.remote()) == 1
@pytest.fixture
def ray_start_regular():
# Start the Ray processes.
address_info = ray.init(num_cpus=1)
yield address_info
# The code after the yield will run as teardown code.
ray.shutdown()
def test_connecting_in_local_case(ray_start_regular):
address_info = ray_start_regular
# Define a driver that just connects to Redis.
driver_script = """
import ray
ray.init(redis_address="{}")
print("success")
""".format(address_info["redis_address"])
out = run_string_as_driver(driver_script)
# Make sure the other driver succeeded.
assert "success" in out
out = run_string_as_driver(driver_script)
# Make sure the other driver succeeded.
assert "success" in out
class RunDriverForMultipleTimesTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
def testRunDriverForTwice(self):
# We used to have issue 2165 and 2288:
# https://github.com/ray-project/ray/issues/2165
# https://github.com/ray-project/ray/issues/2288
# both complain that driver will hang when run for the second time.
# This test is used to verify the fix for above issue, it will run the
# same driver for twice and verify whether both of them succeed.
address_info = ray.init()
driver_script = """
def test_run_driver_twice(ray_start_regular):
# We used to have issue 2165 and 2288:
# https://github.com/ray-project/ray/issues/2165
# https://github.com/ray-project/ray/issues/2288
# both complain that driver will hang when run for the second time.
# This test is used to verify the fix for above issue, it will run the
# same driver for twice and verify whether both of them succeed.
address_info = ray_start_regular
driver_script = """
import ray
import ray.tune as tune
import os
@@ -338,10 +354,6 @@ tune.run_experiments({{
print("success")
""".format(address_info["redis_address"])
for i in range(2):
out = run_string_as_driver(driver_script)
assert "success" in out
if __name__ == "__main__":
unittest.main(verbosity=2)
for i in range(2):
out = run_string_as_driver(driver_script)
assert "success" in out
+2 -2
View File
@@ -1287,7 +1287,7 @@ class APITest(unittest.TestCase):
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False),
os.environ.get("RAY_USE_NEW_GCS", False),
"For now, RAY_USE_NEW_GCS supports 1 shard, and credis "
"supports 1-node chain for that shard only.")
class APITestSharded(APITest):
@@ -2187,7 +2187,7 @@ def wait_for_num_objects(num_objects, timeout=10):
@unittest.skipIf(
os.environ.get('RAY_USE_NEW_GCS', False),
os.environ.get("RAY_USE_NEW_GCS", False),
"New GCS API doesn't have a Python API yet.")
class GlobalStateAPI(unittest.TestCase):
def tearDown(self):
+165 -174
View File
@@ -3,8 +3,8 @@ from __future__ import division
from __future__ import print_function
from numpy.testing import assert_almost_equal
import pytest
import tensorflow as tf
import unittest
import ray
@@ -93,178 +93,169 @@ class TrainActor(object):
return self.values[1].get_weights()
class TensorFlowTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
def testTensorFlowVariables(self):
ray.init(num_workers=2)
sess = tf.Session()
loss, init, _, _ = make_linear_network()
sess.run(init)
variables = ray.experimental.TensorFlowVariables(loss, sess)
weights = variables.get_weights()
for (name, val) in weights.items():
weights[name] += 1.0
variables.set_weights(weights)
assert weights == variables.get_weights()
loss2, init2, _, _ = make_linear_network("w", "b")
sess.run(init2)
variables2 = ray.experimental.TensorFlowVariables(loss2, sess)
weights2 = variables2.get_weights()
for (name, val) in weights2.items():
weights2[name] += 2.0
variables2.set_weights(weights2)
assert weights2 == variables2.get_weights()
flat_weights = variables2.get_flat() + 2.0
variables2.set_flat(flat_weights)
assert_almost_equal(flat_weights, variables2.get_flat())
variables3 = ray.experimental.TensorFlowVariables([loss2])
assert variables3.sess is None
sess = tf.Session()
variables3.set_session(sess)
assert variables3.sess == sess
# Test that the variable names for the two different nets are not
# modified by TensorFlow to be unique (i.e., they should already
# be unique because of the variable prefix).
def testVariableNameCollision(self):
ray.init(num_workers=2)
net1 = NetActor()
net2 = NetActor()
# This is checking that the variable names of the two nets are the
# same, i.e., that the names in the weight dictionaries are the same.
net1.values[0].set_weights(net2.values[0].get_weights())
# Test that TensorFlowVariables can take in addition variables through
# input_variables arg and with no loss.
def testAdditionalVariablesNoLoss(self):
ray.init(num_workers=1)
net = LossActor(use_loss=False)
assert len(net.values[0].variables.items()) == 1
assert len(net.values[0].placeholders.items()) == 1
net.values[0].set_weights(net.values[0].get_weights())
# Test that TensorFlowVariables can take in addition variables through
# input_variables arg and with a loss.
def testAdditionalVariablesWithLoss(self):
ray.init(num_workers=1)
net = LossActor()
assert len(net.values[0].variables.items()) == 3
assert len(net.values[0].placeholders.items()) == 3
net.values[0].set_weights(net.values[0].get_weights())
# Test that different networks on the same worker are independent and
# we can get/set their weights without any interaction.
def testNetworksIndependent(self):
# Note we use only one worker to ensure that all of the remote
# functions run on the same worker.
ray.init(num_workers=1)
net1 = NetActor()
net2 = NetActor()
# Make sure the two networks have different weights. TODO(rkn): Note
# that equality comparisons of numpy arrays normally does not work.
# This only works because at the moment they have size 1.
weights1 = net1.get_weights()
weights2 = net2.get_weights()
assert weights1 != weights2
# Set the weights and get the weights, and make sure they are
# unchanged.
new_weights1 = net1.set_and_get_weights(weights1)
new_weights2 = net2.set_and_get_weights(weights2)
assert weights1 == new_weights1
assert weights2 == new_weights2
# Swap the weights.
new_weights1 = net2.set_and_get_weights(weights1)
new_weights2 = net1.set_and_get_weights(weights2)
assert weights1 == new_weights1
assert weights2 == new_weights2
# This test creates an additional network on the driver so that the
# tensorflow variables on the driver and the worker differ.
def testNetworkDriverWorkerIndependent(self):
ray.init(num_workers=1)
# Create a network on the driver locally.
sess1 = tf.Session()
loss1, init1, _, _ = make_linear_network()
ray.experimental.TensorFlowVariables(loss1, sess1)
sess1.run(init1)
net2 = ray.remote(NetActor).remote()
weights2 = ray.get(net2.get_weights.remote())
new_weights2 = ray.get(
net2.set_and_get_weights.remote(net2.get_weights.remote()))
assert weights2 == new_weights2
def testVariablesControlDependencies(self):
ray.init(num_workers=1)
# Creates a network and appends a momentum optimizer.
sess = tf.Session()
loss, init, _, _ = make_linear_network()
minimizer = tf.train.MomentumOptimizer(0.9, 0.9).minimize(loss)
net_vars = ray.experimental.TensorFlowVariables(minimizer, sess)
sess.run(init)
# Tests if all variables are properly retrieved, 2 variables and 2
# momentum variables.
assert len(net_vars.variables.items()) == 4
def testRemoteTrainingStep(self):
ray.init(num_workers=1)
net = ray.remote(TrainActor).remote()
ray.get(net.training_step.remote(net.get_weights.remote()))
def testRemoteTrainingLoss(self):
ray.init(num_workers=2)
net = ray.remote(TrainActor).remote()
net_values = TrainActor().values
loss, variables, _, sess, grads, train, placeholders = net_values
before_acc = sess.run(
loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
for _ in range(3):
gradients_list = ray.get([
net.training_step.remote(variables.get_weights())
for _ in range(2)
])
mean_grads = [
sum(gradients[i]
for gradients in gradients_list) / len(gradients_list)
for i in range(len(gradients_list[0]))
]
feed_dict = {
grad[0]: mean_grad
for (grad, mean_grad) in zip(grads, mean_grads)
}
sess.run(train, feed_dict=feed_dict)
after_acc = sess.run(
loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
assert before_acc < after_acc
@pytest.fixture
def ray_start_regular():
# Start the Ray processes.
ray.init(num_cpus=2)
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
if __name__ == "__main__":
unittest.main(verbosity=2)
def test_tensorflow_variables(ray_start_regular):
sess = tf.Session()
loss, init, _, _ = make_linear_network()
sess.run(init)
variables = ray.experimental.TensorFlowVariables(loss, sess)
weights = variables.get_weights()
for (name, val) in weights.items():
weights[name] += 1.0
variables.set_weights(weights)
assert weights == variables.get_weights()
loss2, init2, _, _ = make_linear_network("w", "b")
sess.run(init2)
variables2 = ray.experimental.TensorFlowVariables(loss2, sess)
weights2 = variables2.get_weights()
for (name, val) in weights2.items():
weights2[name] += 2.0
variables2.set_weights(weights2)
assert weights2 == variables2.get_weights()
flat_weights = variables2.get_flat() + 2.0
variables2.set_flat(flat_weights)
assert_almost_equal(flat_weights, variables2.get_flat())
variables3 = ray.experimental.TensorFlowVariables([loss2])
assert variables3.sess is None
sess = tf.Session()
variables3.set_session(sess)
assert variables3.sess == sess
# Test that the variable names for the two different nets are not
# modified by TensorFlow to be unique (i.e., they should already
# be unique because of the variable prefix).
def test_variable_name_collision(ray_start_regular):
net1 = NetActor()
net2 = NetActor()
# This is checking that the variable names of the two nets are the
# same, i.e., that the names in the weight dictionaries are the same.
net1.values[0].set_weights(net2.values[0].get_weights())
# Test that TensorFlowVariables can take in addition variables through
# input_variables arg and with no loss.
def test_additional_variables_no_loss(ray_start_regular):
net = LossActor(use_loss=False)
assert len(net.values[0].variables.items()) == 1
assert len(net.values[0].placeholders.items()) == 1
net.values[0].set_weights(net.values[0].get_weights())
# Test that TensorFlowVariables can take in addition variables through
# input_variables arg and with a loss.
def test_additional_variables_with_loss(ray_start_regular):
net = LossActor()
assert len(net.values[0].variables.items()) == 3
assert len(net.values[0].placeholders.items()) == 3
net.values[0].set_weights(net.values[0].get_weights())
# Test that different networks on the same worker are independent and
# we can get/set their weights without any interaction.
def test_networks_independent(ray_start_regular):
# Note we use only one worker to ensure that all of the remote
# functions run on the same worker.
net1 = NetActor()
net2 = NetActor()
# Make sure the two networks have different weights. TODO(rkn): Note
# that equality comparisons of numpy arrays normally does not work.
# This only works because at the moment they have size 1.
weights1 = net1.get_weights()
weights2 = net2.get_weights()
assert weights1 != weights2
# Set the weights and get the weights, and make sure they are
# unchanged.
new_weights1 = net1.set_and_get_weights(weights1)
new_weights2 = net2.set_and_get_weights(weights2)
assert weights1 == new_weights1
assert weights2 == new_weights2
# Swap the weights.
new_weights1 = net2.set_and_get_weights(weights1)
new_weights2 = net1.set_and_get_weights(weights2)
assert weights1 == new_weights1
assert weights2 == new_weights2
# This test creates an additional network on the driver so that the
# tensorflow variables on the driver and the worker differ.
def test_network_driver_worker_independent(ray_start_regular):
# Create a network on the driver locally.
sess1 = tf.Session()
loss1, init1, _, _ = make_linear_network()
ray.experimental.TensorFlowVariables(loss1, sess1)
sess1.run(init1)
net2 = ray.remote(NetActor).remote()
weights2 = ray.get(net2.get_weights.remote())
new_weights2 = ray.get(
net2.set_and_get_weights.remote(net2.get_weights.remote()))
assert weights2 == new_weights2
def test_variables_control_dependencies(ray_start_regular):
# Creates a network and appends a momentum optimizer.
sess = tf.Session()
loss, init, _, _ = make_linear_network()
minimizer = tf.train.MomentumOptimizer(0.9, 0.9).minimize(loss)
net_vars = ray.experimental.TensorFlowVariables(minimizer, sess)
sess.run(init)
# Tests if all variables are properly retrieved, 2 variables and 2
# momentum variables.
assert len(net_vars.variables.items()) == 4
def test_remote_training_step(ray_start_regular):
net = ray.remote(TrainActor).remote()
ray.get(net.training_step.remote(net.get_weights.remote()))
def test_remote_training_loss(ray_start_regular):
net = ray.remote(TrainActor).remote()
net_values = TrainActor().values
loss, variables, _, sess, grads, train, placeholders = net_values
before_acc = sess.run(
loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
for _ in range(3):
gradients_list = ray.get([
net.training_step.remote(variables.get_weights()) for _ in range(2)
])
mean_grads = [
sum(gradients[i]
for gradients in gradients_list) / len(gradients_list)
for i in range(len(gradients_list[0]))
]
feed_dict = {
grad[0]: mean_grad
for (grad, mean_grad) in zip(grads, mean_grads)
}
sess.run(train, feed_dict=feed_dict)
after_acc = sess.run(
loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
assert before_acc < after_acc