mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:38:19 +08:00
Convert some unittests to pytest. (#2779)
* Convert multi_node_test.py to pytest. * Convert array_test.py to pytest. * Convert failure_test.py to pytest. * Convert microbenchmarks to pytest. * Convert component_failures_test.py to pytest and some minor quotes changes. * Convert tensorflow_test.py to pytest. * Convert actor_test.py to pytest. * Fix. * Fix
This commit is contained in:
committed by
Philipp Moritz
parent
3813ae34b3
commit
eda6ebb87d
@@ -189,7 +189,7 @@ class TestGlobalScheduler(unittest.TestCase):
|
||||
assert (db_client_id is not None)
|
||||
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False),
|
||||
os.environ.get("RAY_USE_NEW_GCS", False),
|
||||
"New GCS API doesn't have a Python API yet.")
|
||||
def test_integration_single_task(self):
|
||||
# There should be three db clients, the global scheduler, the local
|
||||
@@ -307,13 +307,13 @@ class TestGlobalScheduler(unittest.TestCase):
|
||||
self.assertEqual(num_tasks_done + num_tasks_waiting, num_tasks)
|
||||
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False),
|
||||
os.environ.get("RAY_USE_NEW_GCS", False),
|
||||
"New GCS API doesn't have a Python API yet.")
|
||||
def test_integration_many_tasks_handler_sync(self):
|
||||
self.integration_many_tasks_helper(timesync=True)
|
||||
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False),
|
||||
os.environ.get("RAY_USE_NEW_GCS", False),
|
||||
"New GCS API doesn't have a Python API yet.")
|
||||
def test_integration_many_tasks(self):
|
||||
# More realistic case: should handle out of order object and task
|
||||
|
||||
+1814
-1817
File diff suppressed because it is too large
Load Diff
+215
-220
@@ -4,8 +4,8 @@ from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_equal, assert_almost_equal
|
||||
import pytest
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
import ray.experimental.array.remote as ra
|
||||
@@ -15,229 +15,224 @@ if sys.version_info >= (3, 0):
|
||||
from importlib import reload
|
||||
|
||||
|
||||
class RemoteArrayTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
|
||||
def testMethods(self):
|
||||
for module in [
|
||||
ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
|
||||
]:
|
||||
reload(module)
|
||||
ray.init()
|
||||
|
||||
# test eye
|
||||
object_id = ra.eye.remote(3)
|
||||
val = ray.get(object_id)
|
||||
assert_almost_equal(val, np.eye(3))
|
||||
|
||||
# test zeros
|
||||
object_id = ra.zeros.remote([3, 4, 5])
|
||||
val = ray.get(object_id)
|
||||
assert_equal(val, np.zeros([3, 4, 5]))
|
||||
|
||||
# test qr - pass by value
|
||||
a_val = np.random.normal(size=[10, 11])
|
||||
q_id, r_id = ra.linalg.qr.remote(a_val)
|
||||
q_val = ray.get(q_id)
|
||||
r_val = ray.get(r_id)
|
||||
assert_almost_equal(np.dot(q_val, r_val), a_val)
|
||||
|
||||
# test qr - pass by objectid
|
||||
a = ra.random.normal.remote([10, 13])
|
||||
q_id, r_id = ra.linalg.qr.remote(a)
|
||||
a_val = ray.get(a)
|
||||
q_val = ray.get(q_id)
|
||||
r_val = ray.get(r_id)
|
||||
assert_almost_equal(np.dot(q_val, r_val), a_val)
|
||||
@pytest.fixture
|
||||
def ray_start_regular():
|
||||
for module in [
|
||||
ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
|
||||
]:
|
||||
reload(module)
|
||||
# Start the Ray processes.
|
||||
ray.init(num_cpus=2)
|
||||
yield None
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
class DistributedArrayTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
def test_remote_array_methods(ray_start_regular):
|
||||
# test eye
|
||||
object_id = ra.eye.remote(3)
|
||||
val = ray.get(object_id)
|
||||
assert_almost_equal(val, np.eye(3))
|
||||
|
||||
def testAssemble(self):
|
||||
for module in [
|
||||
ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
|
||||
]:
|
||||
reload(module)
|
||||
ray.init()
|
||||
# test zeros
|
||||
object_id = ra.zeros.remote([3, 4, 5])
|
||||
val = ray.get(object_id)
|
||||
assert_equal(val, np.zeros([3, 4, 5]))
|
||||
|
||||
a = ra.ones.remote([da.BLOCK_SIZE, da.BLOCK_SIZE])
|
||||
b = ra.zeros.remote([da.BLOCK_SIZE, da.BLOCK_SIZE])
|
||||
x = da.DistArray([2 * da.BLOCK_SIZE, da.BLOCK_SIZE],
|
||||
np.array([[a], [b]]))
|
||||
assert_equal(
|
||||
x.assemble(),
|
||||
np.vstack([
|
||||
np.ones([da.BLOCK_SIZE, da.BLOCK_SIZE]),
|
||||
np.zeros([da.BLOCK_SIZE, da.BLOCK_SIZE])
|
||||
]))
|
||||
# test qr - pass by value
|
||||
a_val = np.random.normal(size=[10, 11])
|
||||
q_id, r_id = ra.linalg.qr.remote(a_val)
|
||||
q_val = ray.get(q_id)
|
||||
r_val = ray.get(r_id)
|
||||
assert_almost_equal(np.dot(q_val, r_val), a_val)
|
||||
|
||||
def testMethods(self):
|
||||
for module in [
|
||||
ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
|
||||
]:
|
||||
reload(module)
|
||||
ray.worker._init(
|
||||
start_ray_local=True, num_local_schedulers=2, num_cpus=[10, 10])
|
||||
|
||||
x = da.zeros.remote([9, 25, 51], "float")
|
||||
assert_equal(ray.get(da.assemble.remote(x)), np.zeros([9, 25, 51]))
|
||||
|
||||
x = da.ones.remote([11, 25, 49], dtype_name="float")
|
||||
assert_equal(ray.get(da.assemble.remote(x)), np.ones([11, 25, 49]))
|
||||
|
||||
x = da.random.normal.remote([11, 25, 49])
|
||||
y = da.copy.remote(x)
|
||||
assert_equal(
|
||||
ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(y)))
|
||||
|
||||
x = da.eye.remote(25, dtype_name="float")
|
||||
assert_equal(ray.get(da.assemble.remote(x)), np.eye(25))
|
||||
|
||||
x = da.random.normal.remote([25, 49])
|
||||
y = da.triu.remote(x)
|
||||
assert_equal(
|
||||
ray.get(da.assemble.remote(y)),
|
||||
np.triu(ray.get(da.assemble.remote(x))))
|
||||
|
||||
x = da.random.normal.remote([25, 49])
|
||||
y = da.tril.remote(x)
|
||||
assert_equal(
|
||||
ray.get(da.assemble.remote(y)),
|
||||
np.tril(ray.get(da.assemble.remote(x))))
|
||||
|
||||
x = da.random.normal.remote([25, 49])
|
||||
y = da.random.normal.remote([49, 18])
|
||||
z = da.dot.remote(x, y)
|
||||
w = da.assemble.remote(z)
|
||||
u = da.assemble.remote(x)
|
||||
v = da.assemble.remote(y)
|
||||
assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v)))
|
||||
assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v)))
|
||||
|
||||
# test add
|
||||
x = da.random.normal.remote([23, 42])
|
||||
y = da.random.normal.remote([23, 42])
|
||||
z = da.add.remote(x, y)
|
||||
assert_almost_equal(
|
||||
ray.get(da.assemble.remote(z)),
|
||||
ray.get(da.assemble.remote(x)) + ray.get(da.assemble.remote(y)))
|
||||
|
||||
# test subtract
|
||||
x = da.random.normal.remote([33, 40])
|
||||
y = da.random.normal.remote([33, 40])
|
||||
z = da.subtract.remote(x, y)
|
||||
assert_almost_equal(
|
||||
ray.get(da.assemble.remote(z)),
|
||||
ray.get(da.assemble.remote(x)) - ray.get(da.assemble.remote(y)))
|
||||
|
||||
# test transpose
|
||||
x = da.random.normal.remote([234, 432])
|
||||
y = da.transpose.remote(x)
|
||||
assert_equal(
|
||||
ray.get(da.assemble.remote(x)).T, ray.get(da.assemble.remote(y)))
|
||||
|
||||
# test numpy_to_dist
|
||||
x = da.random.normal.remote([23, 45])
|
||||
y = da.assemble.remote(x)
|
||||
z = da.numpy_to_dist.remote(y)
|
||||
w = da.assemble.remote(z)
|
||||
assert_equal(
|
||||
ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(z)))
|
||||
assert_equal(ray.get(y), ray.get(w))
|
||||
|
||||
# test da.tsqr
|
||||
for shape in [[123, da.BLOCK_SIZE], [7, da.BLOCK_SIZE],
|
||||
[da.BLOCK_SIZE, da.BLOCK_SIZE], [da.BLOCK_SIZE, 7],
|
||||
[10 * da.BLOCK_SIZE, da.BLOCK_SIZE]]:
|
||||
x = da.random.normal.remote(shape)
|
||||
K = min(shape)
|
||||
q, r = da.linalg.tsqr.remote(x)
|
||||
x_val = ray.get(da.assemble.remote(x))
|
||||
q_val = ray.get(da.assemble.remote(q))
|
||||
r_val = ray.get(r)
|
||||
assert r_val.shape == (K, shape[1])
|
||||
assert_equal(r_val, np.triu(r_val))
|
||||
assert_almost_equal(x_val, np.dot(q_val, r_val))
|
||||
assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K))
|
||||
|
||||
# test da.linalg.modified_lu
|
||||
def test_modified_lu(d1, d2):
|
||||
print("testing dist_modified_lu with d1 = " + str(d1) + ", d2 = " +
|
||||
str(d2))
|
||||
assert d1 >= d2
|
||||
m = ra.random.normal.remote([d1, d2])
|
||||
q, r = ra.linalg.qr.remote(m)
|
||||
l, u, s = da.linalg.modified_lu.remote(da.numpy_to_dist.remote(q))
|
||||
q_val = ray.get(q)
|
||||
ray.get(r)
|
||||
l_val = ray.get(da.assemble.remote(l))
|
||||
u_val = ray.get(u)
|
||||
s_val = ray.get(s)
|
||||
s_mat = np.zeros((d1, d2))
|
||||
for i in range(len(s_val)):
|
||||
s_mat[i, i] = s_val[i]
|
||||
# Check that q - s = l * u.
|
||||
assert_almost_equal(q_val - s_mat, np.dot(l_val, u_val))
|
||||
# Check that u is upper triangular.
|
||||
assert_equal(np.triu(u_val), u_val)
|
||||
# Check that l is lower triangular.
|
||||
assert_equal(np.tril(l_val), l_val)
|
||||
|
||||
for d1, d2 in [(100, 100), (99, 98), (7, 5), (7, 7), (20, 7), (20,
|
||||
10)]:
|
||||
test_modified_lu(d1, d2)
|
||||
|
||||
# test dist_tsqr_hr
|
||||
def test_dist_tsqr_hr(d1, d2):
|
||||
print("testing dist_tsqr_hr with d1 = " + str(d1) + ", d2 = " +
|
||||
str(d2))
|
||||
a = da.random.normal.remote([d1, d2])
|
||||
y, t, y_top, r = da.linalg.tsqr_hr.remote(a)
|
||||
a_val = ray.get(da.assemble.remote(a))
|
||||
y_val = ray.get(da.assemble.remote(y))
|
||||
t_val = ray.get(t)
|
||||
y_top_val = ray.get(y_top)
|
||||
r_val = ray.get(r)
|
||||
tall_eye = np.zeros((d1, min(d1, d2)))
|
||||
np.fill_diagonal(tall_eye, 1)
|
||||
q = tall_eye - np.dot(y_val, np.dot(t_val, y_top_val.T))
|
||||
# Check that q.T * q = I.
|
||||
assert_almost_equal(np.dot(q.T, q), np.eye(min(d1, d2)))
|
||||
# Check that a = (I - y * t * y_top.T) * r.
|
||||
assert_almost_equal(np.dot(q, r_val), a_val)
|
||||
|
||||
for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE),
|
||||
(da.BLOCK_SIZE, da.BLOCK_SIZE), (da.BLOCK_SIZE, 7),
|
||||
(10 * da.BLOCK_SIZE, da.BLOCK_SIZE)]:
|
||||
test_dist_tsqr_hr(d1, d2)
|
||||
|
||||
def test_dist_qr(d1, d2):
|
||||
print("testing qr with d1 = {}, and d2 = {}.".format(d1, d2))
|
||||
a = da.random.normal.remote([d1, d2])
|
||||
K = min(d1, d2)
|
||||
q, r = da.linalg.qr.remote(a)
|
||||
a_val = ray.get(da.assemble.remote(a))
|
||||
q_val = ray.get(da.assemble.remote(q))
|
||||
r_val = ray.get(da.assemble.remote(r))
|
||||
assert q_val.shape == (d1, K)
|
||||
assert r_val.shape == (K, d2)
|
||||
assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K))
|
||||
assert_equal(r_val, np.triu(r_val))
|
||||
assert_almost_equal(a_val, np.dot(q_val, r_val))
|
||||
|
||||
for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE),
|
||||
(da.BLOCK_SIZE, da.BLOCK_SIZE), (da.BLOCK_SIZE, 7),
|
||||
(13, 21), (34, 35), (8, 7)]:
|
||||
test_dist_qr(d1, d2)
|
||||
test_dist_qr(d2, d1)
|
||||
for _ in range(20):
|
||||
d1 = np.random.randint(1, 35)
|
||||
d2 = np.random.randint(1, 35)
|
||||
test_dist_qr(d1, d2)
|
||||
# test qr - pass by objectid
|
||||
a = ra.random.normal.remote([10, 13])
|
||||
q_id, r_id = ra.linalg.qr.remote(a)
|
||||
a_val = ray.get(a)
|
||||
q_val = ray.get(q_id)
|
||||
r_val = ray.get(r_id)
|
||||
assert_almost_equal(np.dot(q_val, r_val), a_val)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
def test_distributed_array_assemble(ray_start_regular):
|
||||
a = ra.ones.remote([da.BLOCK_SIZE, da.BLOCK_SIZE])
|
||||
b = ra.zeros.remote([da.BLOCK_SIZE, da.BLOCK_SIZE])
|
||||
x = da.DistArray([2 * da.BLOCK_SIZE, da.BLOCK_SIZE], np.array([[a], [b]]))
|
||||
assert_equal(
|
||||
x.assemble(),
|
||||
np.vstack([
|
||||
np.ones([da.BLOCK_SIZE, da.BLOCK_SIZE]),
|
||||
np.zeros([da.BLOCK_SIZE, da.BLOCK_SIZE])
|
||||
]))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_start_two_nodes():
|
||||
for module in [
|
||||
ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
|
||||
]:
|
||||
reload(module)
|
||||
# Start the Ray processes.
|
||||
ray.worker._init(
|
||||
start_ray_local=True, num_local_schedulers=2, num_cpus=[10, 10])
|
||||
yield None
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def test_distributed_array_methods(ray_start_two_nodes):
|
||||
x = da.zeros.remote([9, 25, 51], "float")
|
||||
assert_equal(ray.get(da.assemble.remote(x)), np.zeros([9, 25, 51]))
|
||||
|
||||
x = da.ones.remote([11, 25, 49], dtype_name="float")
|
||||
assert_equal(ray.get(da.assemble.remote(x)), np.ones([11, 25, 49]))
|
||||
|
||||
x = da.random.normal.remote([11, 25, 49])
|
||||
y = da.copy.remote(x)
|
||||
assert_equal(
|
||||
ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(y)))
|
||||
|
||||
x = da.eye.remote(25, dtype_name="float")
|
||||
assert_equal(ray.get(da.assemble.remote(x)), np.eye(25))
|
||||
|
||||
x = da.random.normal.remote([25, 49])
|
||||
y = da.triu.remote(x)
|
||||
assert_equal(
|
||||
ray.get(da.assemble.remote(y)), np.triu(
|
||||
ray.get(da.assemble.remote(x))))
|
||||
|
||||
x = da.random.normal.remote([25, 49])
|
||||
y = da.tril.remote(x)
|
||||
assert_equal(
|
||||
ray.get(da.assemble.remote(y)), np.tril(
|
||||
ray.get(da.assemble.remote(x))))
|
||||
|
||||
x = da.random.normal.remote([25, 49])
|
||||
y = da.random.normal.remote([49, 18])
|
||||
z = da.dot.remote(x, y)
|
||||
w = da.assemble.remote(z)
|
||||
u = da.assemble.remote(x)
|
||||
v = da.assemble.remote(y)
|
||||
assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v)))
|
||||
assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v)))
|
||||
|
||||
# test add
|
||||
x = da.random.normal.remote([23, 42])
|
||||
y = da.random.normal.remote([23, 42])
|
||||
z = da.add.remote(x, y)
|
||||
assert_almost_equal(
|
||||
ray.get(da.assemble.remote(z)),
|
||||
ray.get(da.assemble.remote(x)) + ray.get(da.assemble.remote(y)))
|
||||
|
||||
# test subtract
|
||||
x = da.random.normal.remote([33, 40])
|
||||
y = da.random.normal.remote([33, 40])
|
||||
z = da.subtract.remote(x, y)
|
||||
assert_almost_equal(
|
||||
ray.get(da.assemble.remote(z)),
|
||||
ray.get(da.assemble.remote(x)) - ray.get(da.assemble.remote(y)))
|
||||
|
||||
# test transpose
|
||||
x = da.random.normal.remote([234, 432])
|
||||
y = da.transpose.remote(x)
|
||||
assert_equal(
|
||||
ray.get(da.assemble.remote(x)).T, ray.get(da.assemble.remote(y)))
|
||||
|
||||
# test numpy_to_dist
|
||||
x = da.random.normal.remote([23, 45])
|
||||
y = da.assemble.remote(x)
|
||||
z = da.numpy_to_dist.remote(y)
|
||||
w = da.assemble.remote(z)
|
||||
assert_equal(
|
||||
ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(z)))
|
||||
assert_equal(ray.get(y), ray.get(w))
|
||||
|
||||
# test da.tsqr
|
||||
for shape in [[123, da.BLOCK_SIZE], [7, da.BLOCK_SIZE],
|
||||
[da.BLOCK_SIZE, da.BLOCK_SIZE], [da.BLOCK_SIZE, 7],
|
||||
[10 * da.BLOCK_SIZE, da.BLOCK_SIZE]]:
|
||||
x = da.random.normal.remote(shape)
|
||||
K = min(shape)
|
||||
q, r = da.linalg.tsqr.remote(x)
|
||||
x_val = ray.get(da.assemble.remote(x))
|
||||
q_val = ray.get(da.assemble.remote(q))
|
||||
r_val = ray.get(r)
|
||||
assert r_val.shape == (K, shape[1])
|
||||
assert_equal(r_val, np.triu(r_val))
|
||||
assert_almost_equal(x_val, np.dot(q_val, r_val))
|
||||
assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K))
|
||||
|
||||
# test da.linalg.modified_lu
|
||||
def test_modified_lu(d1, d2):
|
||||
print("testing dist_modified_lu with d1 = " + str(d1) + ", d2 = " +
|
||||
str(d2))
|
||||
assert d1 >= d2
|
||||
m = ra.random.normal.remote([d1, d2])
|
||||
q, r = ra.linalg.qr.remote(m)
|
||||
l, u, s = da.linalg.modified_lu.remote(da.numpy_to_dist.remote(q))
|
||||
q_val = ray.get(q)
|
||||
ray.get(r)
|
||||
l_val = ray.get(da.assemble.remote(l))
|
||||
u_val = ray.get(u)
|
||||
s_val = ray.get(s)
|
||||
s_mat = np.zeros((d1, d2))
|
||||
for i in range(len(s_val)):
|
||||
s_mat[i, i] = s_val[i]
|
||||
# Check that q - s = l * u.
|
||||
assert_almost_equal(q_val - s_mat, np.dot(l_val, u_val))
|
||||
# Check that u is upper triangular.
|
||||
assert_equal(np.triu(u_val), u_val)
|
||||
# Check that l is lower triangular.
|
||||
assert_equal(np.tril(l_val), l_val)
|
||||
|
||||
for d1, d2 in [(100, 100), (99, 98), (7, 5), (7, 7), (20, 7), (20, 10)]:
|
||||
test_modified_lu(d1, d2)
|
||||
|
||||
# test dist_tsqr_hr
|
||||
def test_dist_tsqr_hr(d1, d2):
|
||||
print("testing dist_tsqr_hr with d1 = " + str(d1) + ", d2 = " +
|
||||
str(d2))
|
||||
a = da.random.normal.remote([d1, d2])
|
||||
y, t, y_top, r = da.linalg.tsqr_hr.remote(a)
|
||||
a_val = ray.get(da.assemble.remote(a))
|
||||
y_val = ray.get(da.assemble.remote(y))
|
||||
t_val = ray.get(t)
|
||||
y_top_val = ray.get(y_top)
|
||||
r_val = ray.get(r)
|
||||
tall_eye = np.zeros((d1, min(d1, d2)))
|
||||
np.fill_diagonal(tall_eye, 1)
|
||||
q = tall_eye - np.dot(y_val, np.dot(t_val, y_top_val.T))
|
||||
# Check that q.T * q = I.
|
||||
assert_almost_equal(np.dot(q.T, q), np.eye(min(d1, d2)))
|
||||
# Check that a = (I - y * t * y_top.T) * r.
|
||||
assert_almost_equal(np.dot(q, r_val), a_val)
|
||||
|
||||
for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE), (da.BLOCK_SIZE,
|
||||
da.BLOCK_SIZE),
|
||||
(da.BLOCK_SIZE, 7), (10 * da.BLOCK_SIZE, da.BLOCK_SIZE)]:
|
||||
test_dist_tsqr_hr(d1, d2)
|
||||
|
||||
def test_dist_qr(d1, d2):
|
||||
print("testing qr with d1 = {}, and d2 = {}.".format(d1, d2))
|
||||
a = da.random.normal.remote([d1, d2])
|
||||
K = min(d1, d2)
|
||||
q, r = da.linalg.qr.remote(a)
|
||||
a_val = ray.get(da.assemble.remote(a))
|
||||
q_val = ray.get(da.assemble.remote(q))
|
||||
r_val = ray.get(da.assemble.remote(r))
|
||||
assert q_val.shape == (d1, K)
|
||||
assert r_val.shape == (K, d2)
|
||||
assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K))
|
||||
assert_equal(r_val, np.triu(r_val))
|
||||
assert_almost_equal(a_val, np.dot(q_val, r_val))
|
||||
|
||||
for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE), (da.BLOCK_SIZE,
|
||||
da.BLOCK_SIZE),
|
||||
(da.BLOCK_SIZE, 7), (13, 21), (34, 35), (8, 7)]:
|
||||
test_dist_qr(d1, d2)
|
||||
test_dist_qr(d2, d1)
|
||||
for _ in range(20):
|
||||
d1 = np.random.randint(1, 35)
|
||||
d2 = np.random.randint(1, 35)
|
||||
test_dist_qr(d1, d2)
|
||||
|
||||
+306
-294
@@ -2,351 +2,363 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
import os
|
||||
import ray
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import pyarrow as pa
|
||||
|
||||
|
||||
class ComponentFailureTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
@pytest.fixture
|
||||
def ray_start_workers_separate():
|
||||
# Start the Ray processes.
|
||||
ray.worker._init(
|
||||
num_cpus=1,
|
||||
start_workers_from_local_scheduler=False,
|
||||
start_ray_local=True,
|
||||
redirect_output=True)
|
||||
yield None
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
# This test checks that when a worker dies in the middle of a get, the
|
||||
# plasma store and manager will not die.
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_XRAY', False),
|
||||
"Workers are all started by Raylet, so cannot be killed from Python.")
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False),
|
||||
"Not working with new GCS API.")
|
||||
def testDyingWorkerGet(self):
|
||||
obj_id = 20 * b"a"
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
ray.worker.global_worker.plasma_client.get(obj_id)
|
||||
# This test checks that when a worker dies in the middle of a get, the
|
||||
# plasma store and manager will not die.
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_XRAY", False),
|
||||
reason="This test does not work with xray yet.")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Not working with new GCS API.")
|
||||
def test_dying_worker_get(ray_start_workers_separate):
|
||||
obj_id = 20 * b"a"
|
||||
|
||||
ray.worker._init(
|
||||
num_workers=1,
|
||||
start_workers_from_local_scheduler=False,
|
||||
start_ray_local=True,
|
||||
redirect_output=True)
|
||||
@ray.remote
|
||||
def f():
|
||||
ray.worker.global_worker.plasma_client.get(ray.ObjectID(obj_id))
|
||||
|
||||
# Have the worker wait in a get call.
|
||||
f.remote()
|
||||
# Have the worker wait in a get call.
|
||||
f.remote()
|
||||
|
||||
# Kill the worker.
|
||||
time.sleep(1)
|
||||
(ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0]
|
||||
.terminate())
|
||||
# Kill the worker.
|
||||
time.sleep(1)
|
||||
(ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0]
|
||||
.terminate())
|
||||
time.sleep(0.1)
|
||||
|
||||
# Seal the object so the store attempts to notify the worker that the
|
||||
# get has been fulfilled.
|
||||
ray.worker.global_worker.plasma_client.create(
|
||||
pa.plasma.ObjectID(obj_id), 100)
|
||||
ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id))
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
assert ray.services.all_processes_alive(
|
||||
exclude=[ray.services.PROCESS_TYPE_WORKER])
|
||||
|
||||
|
||||
# This test checks that when a worker dies in the middle of a wait, the
|
||||
# plasma store and manager will not die.
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_XRAY", False),
|
||||
reason="This test does not work with xray yet.")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Not working with new GCS API.")
|
||||
def test_dying_worker_wait(ray_start_workers_separate):
|
||||
obj_id = 20 * b"a"
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
ray.worker.global_worker.plasma_client.wait([ray.ObjectID(obj_id)])
|
||||
|
||||
# Have the worker wait in a get call.
|
||||
f.remote()
|
||||
|
||||
# Kill the worker.
|
||||
time.sleep(1)
|
||||
(ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0]
|
||||
.terminate())
|
||||
time.sleep(0.1)
|
||||
|
||||
# Seal the object so the store attempts to notify the worker that the
|
||||
# get has been fulfilled.
|
||||
ray.worker.global_worker.plasma_client.create(
|
||||
pa.plasma.ObjectID(obj_id), 100)
|
||||
ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id))
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
assert ray.services.all_processes_alive(
|
||||
exclude=[ray.services.PROCESS_TYPE_WORKER])
|
||||
|
||||
|
||||
@pytest.fixture(params=[(1, 4), (4, 4)])
|
||||
def ray_start_workers_separate_multinode(request):
|
||||
num_local_schedulers = request.param[0]
|
||||
num_initial_workers = request.param[1]
|
||||
# Start the Ray processes.
|
||||
ray.worker._init(
|
||||
num_workers=(num_initial_workers * num_local_schedulers),
|
||||
num_local_schedulers=num_local_schedulers,
|
||||
start_workers_from_local_scheduler=False,
|
||||
start_ray_local=True,
|
||||
num_cpus=[num_initial_workers] * num_local_schedulers,
|
||||
redirect_output=True)
|
||||
yield num_local_schedulers, num_initial_workers
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def test_worker_failed(ray_start_workers_separate_multinode):
|
||||
num_local_schedulers, num_initial_workers = (
|
||||
ray_start_workers_separate_multinode)
|
||||
|
||||
@ray.remote
|
||||
def f(x):
|
||||
time.sleep(0.5)
|
||||
return x
|
||||
|
||||
# Submit more tasks than there are workers so that all workers and
|
||||
# cores are utilized.
|
||||
object_ids = [
|
||||
f.remote(i) for i in range(num_initial_workers * num_local_schedulers)
|
||||
]
|
||||
object_ids += [f.remote(object_id) for object_id in object_ids]
|
||||
# Allow the tasks some time to begin executing.
|
||||
time.sleep(0.1)
|
||||
# Kill the workers as the tasks execute.
|
||||
for worker in (
|
||||
ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER]):
|
||||
worker.terminate()
|
||||
time.sleep(0.1)
|
||||
# Make sure that we can still get the objects after the executing tasks
|
||||
# died.
|
||||
ray.get(object_ids)
|
||||
|
||||
# Seal the object so the store attempts to notify the worker that the
|
||||
# get has been fulfilled.
|
||||
ray.worker.global_worker.plasma_client.create(
|
||||
pa.plasma.ObjectID(obj_id), 100)
|
||||
ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id))
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
assert ray.services.all_processes_alive(
|
||||
exclude=[ray.services.PROCESS_TYPE_WORKER])
|
||||
def _test_component_failed(component_type):
|
||||
"""Kill a component on all worker nodes and check workload succeeds."""
|
||||
# Raylet is able to pass a harder failure test than legacy ray.
|
||||
use_raylet = os.environ.get("RAY_USE_XRAY") == "1"
|
||||
|
||||
# This test checks that when a worker dies in the middle of a wait, the
|
||||
# plasma store and manager will not die.
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_XRAY', False),
|
||||
"Workers are all started by Raylet, so cannot be killed from Python.")
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False),
|
||||
"Not working with new GCS API.")
|
||||
def testDyingWorkerWait(self):
|
||||
obj_id = 20 * b"a"
|
||||
# Start with 4 workers and 4 cores.
|
||||
num_local_schedulers = 4
|
||||
num_workers_per_scheduler = 8
|
||||
ray.worker._init(
|
||||
num_workers=num_workers_per_scheduler,
|
||||
num_local_schedulers=num_local_schedulers,
|
||||
start_ray_local=True,
|
||||
num_cpus=[num_workers_per_scheduler] * num_local_schedulers,
|
||||
redirect_output=True)
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
ray.worker.global_worker.plasma_client.wait([obj_id])
|
||||
|
||||
ray.worker._init(
|
||||
num_workers=1,
|
||||
start_workers_from_local_scheduler=False,
|
||||
start_ray_local=True,
|
||||
redirect_output=True)
|
||||
|
||||
# Have the worker wait in a get call.
|
||||
f.remote()
|
||||
|
||||
# Kill the worker.
|
||||
time.sleep(1)
|
||||
(ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0]
|
||||
.terminate())
|
||||
time.sleep(0.1)
|
||||
|
||||
# Seal the object so the store attempts to notify the worker that the
|
||||
# get has been fulfilled.
|
||||
ray.worker.global_worker.plasma_client.create(
|
||||
pa.plasma.ObjectID(obj_id), 100)
|
||||
ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id))
|
||||
time.sleep(0.1)
|
||||
|
||||
# Make sure that nothing has died.
|
||||
assert ray.services.all_processes_alive(
|
||||
exclude=[ray.services.PROCESS_TYPE_WORKER])
|
||||
|
||||
def _testWorkerFailed(self, num_local_schedulers):
|
||||
if use_raylet:
|
||||
# Submit many tasks with many dependencies.
|
||||
@ray.remote
|
||||
def f(x):
|
||||
time.sleep(0.5)
|
||||
return x
|
||||
|
||||
num_initial_workers = 4
|
||||
ray.worker._init(
|
||||
num_workers=(num_initial_workers * num_local_schedulers),
|
||||
num_local_schedulers=num_local_schedulers,
|
||||
start_workers_from_local_scheduler=False,
|
||||
start_ray_local=True,
|
||||
num_cpus=[num_initial_workers] * num_local_schedulers,
|
||||
redirect_output=True)
|
||||
@ray.remote
|
||||
def g(*xs):
|
||||
return 1
|
||||
|
||||
# Kill the component on all nodes except the head node as the tasks
|
||||
# execute. Do this in a loop while submitting tasks between each
|
||||
# component failure.
|
||||
# NOTE(swang): Legacy ray hangs on this test if the plasma manager
|
||||
# is killed.
|
||||
time.sleep(0.1)
|
||||
components = ray.services.all_processes[component_type]
|
||||
for process in components[1:]:
|
||||
# Submit a round of tasks with many dependencies.
|
||||
x = 1
|
||||
for _ in range(1000):
|
||||
x = f.remote(x)
|
||||
|
||||
xs = [g.remote(1)]
|
||||
for _ in range(100):
|
||||
xs.append(g.remote(*xs))
|
||||
xs.append(g.remote(1))
|
||||
|
||||
# Kill a component on one of the nodes.
|
||||
process.terminate()
|
||||
time.sleep(1)
|
||||
process.kill()
|
||||
process.wait()
|
||||
assert not process.poll() is None
|
||||
|
||||
# Make sure that we can still get the objects after the
|
||||
# executing tasks died.
|
||||
ray.get(x)
|
||||
ray.get(xs)
|
||||
else:
|
||||
|
||||
@ray.remote
|
||||
def f(x, j):
|
||||
time.sleep(0.2)
|
||||
return x
|
||||
|
||||
# Submit more tasks than there are workers so that all workers and
|
||||
# cores are utilized.
|
||||
object_ids = [
|
||||
f.remote(i)
|
||||
for i in range(num_initial_workers * num_local_schedulers)
|
||||
f.remote(i, 0)
|
||||
for i in range(num_workers_per_scheduler * num_local_schedulers)
|
||||
]
|
||||
object_ids += [f.remote(object_id) for object_id in object_ids]
|
||||
# Allow the tasks some time to begin executing.
|
||||
object_ids += [f.remote(object_id, 1) for object_id in object_ids]
|
||||
object_ids += [f.remote(object_id, 2) for object_id in object_ids]
|
||||
|
||||
# Kill the component on all nodes except the head node as the tasks
|
||||
# execute.
|
||||
time.sleep(0.1)
|
||||
# Kill the workers as the tasks execute.
|
||||
for worker in (
|
||||
ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER]):
|
||||
worker.terminate()
|
||||
time.sleep(0.1)
|
||||
# Make sure that we can still get the objects after the executing tasks
|
||||
# died.
|
||||
ray.get(object_ids)
|
||||
components = ray.services.all_processes[component_type]
|
||||
for process in components[1:]:
|
||||
process.terminate()
|
||||
time.sleep(1)
|
||||
|
||||
def testWorkerFailed(self):
|
||||
self._testWorkerFailed(1)
|
||||
for process in components[1:]:
|
||||
process.kill()
|
||||
process.wait()
|
||||
assert not process.poll() is None
|
||||
|
||||
def testWorkerFailedMultinode(self):
|
||||
self._testWorkerFailed(4)
|
||||
# Make sure that we can still get the objects after the executing
|
||||
# tasks died.
|
||||
results = ray.get(object_ids)
|
||||
expected_results = 4 * list(
|
||||
range(num_workers_per_scheduler * num_local_schedulers))
|
||||
assert results == expected_results
|
||||
|
||||
def _testComponentFailed(self, component_type):
|
||||
"""Kill a component on all worker nodes and check workload succeeds."""
|
||||
# Raylet is able to pass a harder failure test than legacy ray.
|
||||
use_raylet = os.environ.get("RAY_USE_XRAY") == "1"
|
||||
|
||||
# Start with 4 workers and 4 cores.
|
||||
num_local_schedulers = 4
|
||||
num_workers_per_scheduler = 8
|
||||
ray.worker._init(
|
||||
num_workers=num_workers_per_scheduler,
|
||||
num_local_schedulers=num_local_schedulers,
|
||||
start_ray_local=True,
|
||||
num_cpus=[num_workers_per_scheduler] * num_local_schedulers,
|
||||
redirect_output=True)
|
||||
|
||||
if use_raylet:
|
||||
# Submit many tasks with many dependencies.
|
||||
@ray.remote
|
||||
def f(x):
|
||||
return x
|
||||
|
||||
@ray.remote
|
||||
def g(*xs):
|
||||
return 1
|
||||
|
||||
# Kill the component on all nodes except the head node as the tasks
|
||||
# execute. Do this in a loop while submitting tasks between each
|
||||
# component failure.
|
||||
# NOTE(swang): Legacy ray hangs on this test if the plasma manager
|
||||
# is killed.
|
||||
time.sleep(0.1)
|
||||
components = ray.services.all_processes[component_type]
|
||||
for process in components[1:]:
|
||||
# Submit a round of tasks with many dependencies.
|
||||
x = 1
|
||||
for _ in range(1000):
|
||||
x = f.remote(x)
|
||||
|
||||
xs = [g.remote(1)]
|
||||
for _ in range(100):
|
||||
xs.append(g.remote(*xs))
|
||||
xs.append(g.remote(1))
|
||||
|
||||
# Kill a component on one of the nodes.
|
||||
process.terminate()
|
||||
time.sleep(1)
|
||||
process.kill()
|
||||
process.wait()
|
||||
assert not process.poll() is None
|
||||
|
||||
# Make sure that we can still get the objects after the
|
||||
# executing tasks died.
|
||||
ray.get(x)
|
||||
ray.get(xs)
|
||||
def check_components_alive(component_type, check_component_alive):
|
||||
"""Check that a given component type is alive on all worker nodes.
|
||||
"""
|
||||
components = ray.services.all_processes[component_type][1:]
|
||||
for component in components:
|
||||
if check_component_alive:
|
||||
assert component.poll() is None
|
||||
else:
|
||||
print("waiting for " + component_type + " with PID " +
|
||||
str(component.pid) + "to terminate")
|
||||
component.wait()
|
||||
print("done waiting for " + component_type + " with PID " +
|
||||
str(component.pid) + "to terminate")
|
||||
assert not component.poll() is None
|
||||
|
||||
@ray.remote
|
||||
def f(x, j):
|
||||
time.sleep(0.2)
|
||||
return x
|
||||
|
||||
# Submit more tasks than there are workers so that all workers and
|
||||
# cores are utilized.
|
||||
object_ids = [
|
||||
f.remote(i, 0) for i in range(num_workers_per_scheduler *
|
||||
num_local_schedulers)
|
||||
]
|
||||
object_ids += [f.remote(object_id, 1) for object_id in object_ids]
|
||||
object_ids += [f.remote(object_id, 2) for object_id in object_ids]
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_XRAY") != "1",
|
||||
reason="This test only makes sense with xray.")
|
||||
def test_raylet_failed():
|
||||
# Kill all local schedulers on worker nodes.
|
||||
_test_component_failed(ray.services.PROCESS_TYPE_RAYLET)
|
||||
|
||||
# Kill the component on all nodes except the head node as the tasks
|
||||
# execute.
|
||||
time.sleep(0.1)
|
||||
components = ray.services.all_processes[component_type]
|
||||
for process in components[1:]:
|
||||
process.terminate()
|
||||
time.sleep(1)
|
||||
# The plasma stores and plasma managers should still be alive on the
|
||||
# worker nodes.
|
||||
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, True)
|
||||
|
||||
for process in components[1:]:
|
||||
process.kill()
|
||||
process.wait()
|
||||
assert not process.poll() is None
|
||||
ray.shutdown()
|
||||
|
||||
# Make sure that we can still get the objects after the executing
|
||||
# tasks died.
|
||||
results = ray.get(object_ids)
|
||||
expected_results = 4 * list(
|
||||
range(num_workers_per_scheduler * num_local_schedulers))
|
||||
assert results == expected_results
|
||||
|
||||
def check_components_alive(self, component_type, check_component_alive):
|
||||
"""Check that a given component type is alive on all worker nodes.
|
||||
"""
|
||||
components = ray.services.all_processes[component_type][1:]
|
||||
for component in components:
|
||||
if check_component_alive:
|
||||
assert component.poll() is None
|
||||
else:
|
||||
print("waiting for " + component_type + " with PID " +
|
||||
str(component.pid) + "to terminate")
|
||||
component.wait()
|
||||
print("done waiting for " + component_type + " with PID " +
|
||||
str(component.pid) + "to terminate")
|
||||
assert not component.poll() is None
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_XRAY") == "1",
|
||||
reason="This test does not make sense with xray.")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Hanging with new GCS API.")
|
||||
def test_local_scheduler_failed():
|
||||
# Kill all local schedulers on worker nodes.
|
||||
_test_component_failed(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER)
|
||||
|
||||
@unittest.skipIf(not os.environ.get('RAY_USE_XRAY', False),
|
||||
"Only tests Raylet failure.")
|
||||
def testRayletFailed(self):
|
||||
# Kill all local schedulers on worker nodes.
|
||||
self._testComponentFailed(ray.services.PROCESS_TYPE_RAYLET)
|
||||
# The plasma stores and plasma managers should still be alive on the
|
||||
# worker nodes.
|
||||
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, True)
|
||||
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, True)
|
||||
check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, False)
|
||||
|
||||
# The plasma stores and plasma managers should still be alive on the
|
||||
# worker nodes.
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE,
|
||||
True)
|
||||
ray.shutdown()
|
||||
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_XRAY', False),
|
||||
"Raylet codepath does not have this component")
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.")
|
||||
def testLocalSchedulerFailed(self):
|
||||
# Kill all local schedulers on worker nodes.
|
||||
self._testComponentFailed(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER)
|
||||
|
||||
# The plasma stores and plasma managers should still be alive on the
|
||||
# worker nodes.
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE,
|
||||
True)
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER,
|
||||
True)
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER,
|
||||
False)
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_XRAY") == "1",
|
||||
reason="This test does not make sense with xray.")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Hanging with new GCS API.")
|
||||
def test_plasma_manager_failed():
|
||||
# Kill all plasma managers on worker nodes.
|
||||
_test_component_failed(ray.services.PROCESS_TYPE_PLASMA_MANAGER)
|
||||
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_XRAY', False),
|
||||
"Raylet codepath does not have this component")
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.")
|
||||
def testPlasmaManagerFailed(self):
|
||||
# Kill all plasma managers on worker nodes.
|
||||
self._testComponentFailed(ray.services.PROCESS_TYPE_PLASMA_MANAGER)
|
||||
# The plasma stores should still be alive (but unreachable) on the
|
||||
# worker nodes.
|
||||
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, True)
|
||||
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, False)
|
||||
check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, False)
|
||||
|
||||
# The plasma stores should still be alive (but unreachable) on the
|
||||
# worker nodes.
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE,
|
||||
True)
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER,
|
||||
False)
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER,
|
||||
False)
|
||||
ray.shutdown()
|
||||
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.")
|
||||
def testPlasmaStoreFailed(self):
|
||||
# Kill all plasma stores on worker nodes.
|
||||
self._testComponentFailed(ray.services.PROCESS_TYPE_PLASMA_STORE)
|
||||
|
||||
# No processes should be left alive on the worker nodes.
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE,
|
||||
False)
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER,
|
||||
False)
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER,
|
||||
False)
|
||||
self.check_components_alive(ray.services.PROCESS_TYPE_RAYLET, False)
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Hanging with new GCS API.")
|
||||
def test_plasma_store_failed():
|
||||
# Kill all plasma stores on worker nodes.
|
||||
_test_component_failed(ray.services.PROCESS_TYPE_PLASMA_STORE)
|
||||
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False),
|
||||
"Not working with new GCS API.")
|
||||
def testDriverLivesSequential(self):
|
||||
ray.worker.init(redirect_output=True)
|
||||
all_processes = ray.services.all_processes
|
||||
processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] +
|
||||
all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] +
|
||||
all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] +
|
||||
all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER]
|
||||
+ all_processes[ray.services.PROCESS_TYPE_RAYLET])
|
||||
# No processes should be left alive on the worker nodes.
|
||||
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, False)
|
||||
check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, False)
|
||||
check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, False)
|
||||
check_components_alive(ray.services.PROCESS_TYPE_RAYLET, False)
|
||||
|
||||
# Kill all the components sequentially.
|
||||
for process in processes:
|
||||
process.terminate()
|
||||
time.sleep(0.1)
|
||||
process.kill()
|
||||
process.wait()
|
||||
ray.shutdown()
|
||||
|
||||
# If the driver can reach the tearDown method, then it is still alive.
|
||||
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False),
|
||||
"Not working with new GCS API.")
|
||||
def testDriverLivesParallel(self):
|
||||
ray.worker.init(redirect_output=True)
|
||||
all_processes = ray.services.all_processes
|
||||
processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] +
|
||||
all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] +
|
||||
all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] +
|
||||
all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER]
|
||||
+ all_processes[ray.services.PROCESS_TYPE_RAYLET])
|
||||
|
||||
# Kill all the components in parallel.
|
||||
for process in processes:
|
||||
process.terminate()
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Hanging with new GCS API.")
|
||||
def test_driver_lives_sequential():
|
||||
ray.worker.init()
|
||||
all_processes = ray.services.all_processes
|
||||
processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] +
|
||||
all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] +
|
||||
all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] +
|
||||
all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER] +
|
||||
all_processes[ray.services.PROCESS_TYPE_RAYLET])
|
||||
|
||||
# Kill all the components sequentially.
|
||||
for process in processes:
|
||||
process.terminate()
|
||||
time.sleep(0.1)
|
||||
for process in processes:
|
||||
process.kill()
|
||||
process.kill()
|
||||
process.wait()
|
||||
|
||||
for process in processes:
|
||||
process.wait()
|
||||
|
||||
# If the driver can reach the tearDown method, then it is still alive.
|
||||
ray.shutdown()
|
||||
# If the driver can reach the tearDown method, then it is still alive.
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Hanging with new GCS API.")
|
||||
def test_driver_lives_parallel():
|
||||
ray.worker.init()
|
||||
all_processes = ray.services.all_processes
|
||||
processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] +
|
||||
all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] +
|
||||
all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] +
|
||||
all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER] +
|
||||
all_processes[ray.services.PROCESS_TYPE_RAYLET])
|
||||
|
||||
# Kill all the components in parallel.
|
||||
for process in processes:
|
||||
process.terminate()
|
||||
|
||||
time.sleep(0.1)
|
||||
for process in processes:
|
||||
process.kill()
|
||||
|
||||
for process in processes:
|
||||
process.wait()
|
||||
|
||||
# If the driver can reach the tearDown method, then it is still alive.
|
||||
ray.shutdown()
|
||||
|
||||
+1
-1
@@ -13,7 +13,7 @@ def parse_client(addr_port_str):
|
||||
return redis.StrictRedis(host=redis_address, port=redis_port)
|
||||
|
||||
|
||||
@unittest.skipIf(not os.environ.get('RAY_USE_NEW_GCS', False),
|
||||
@unittest.skipIf(not os.environ.get("RAY_USE_NEW_GCS", False),
|
||||
"Tests functionality of the new GCS.")
|
||||
class CredisTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
|
||||
+371
-396
@@ -8,7 +8,6 @@ import ray
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import ray.ray_constants as ray_constants
|
||||
import pytest
|
||||
@@ -27,481 +26,457 @@ def wait_for_errors(error_type, num_errors, timeout=10):
|
||||
raise Exception("Timing out of wait.")
|
||||
|
||||
|
||||
class TaskStatusTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
@pytest.fixture
|
||||
def ray_start_regular():
|
||||
# Start the Ray processes.
|
||||
ray.init(num_cpus=2)
|
||||
yield None
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
def testFailedTask(self):
|
||||
@ray.remote
|
||||
def throw_exception_fct1():
|
||||
raise Exception("Test function 1 intentionally failed.")
|
||||
|
||||
@ray.remote
|
||||
def throw_exception_fct2():
|
||||
raise Exception("Test function 2 intentionally failed.")
|
||||
def test_failed_task(ray_start_regular):
|
||||
@ray.remote
|
||||
def throw_exception_fct1():
|
||||
raise Exception("Test function 1 intentionally failed.")
|
||||
|
||||
@ray.remote(num_return_vals=3)
|
||||
def throw_exception_fct3(x):
|
||||
raise Exception("Test function 3 intentionally failed.")
|
||||
@ray.remote
|
||||
def throw_exception_fct2():
|
||||
raise Exception("Test function 2 intentionally failed.")
|
||||
|
||||
ray.init(num_workers=3)
|
||||
@ray.remote(num_return_vals=3)
|
||||
def throw_exception_fct3(x):
|
||||
raise Exception("Test function 3 intentionally failed.")
|
||||
|
||||
throw_exception_fct1.remote()
|
||||
throw_exception_fct1.remote()
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
|
||||
assert len(relevant_errors(ray_constants.TASK_PUSH_ERROR)) == 2
|
||||
for task in relevant_errors(ray_constants.TASK_PUSH_ERROR):
|
||||
msg = task.get("message")
|
||||
assert "Test function 1 intentionally failed." in msg
|
||||
throw_exception_fct1.remote()
|
||||
throw_exception_fct1.remote()
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
|
||||
assert len(relevant_errors(ray_constants.TASK_PUSH_ERROR)) == 2
|
||||
for task in relevant_errors(ray_constants.TASK_PUSH_ERROR):
|
||||
msg = task.get("message")
|
||||
assert "Test function 1 intentionally failed." in msg
|
||||
|
||||
x = throw_exception_fct2.remote()
|
||||
x = throw_exception_fct2.remote()
|
||||
try:
|
||||
ray.get(x)
|
||||
except Exception as e:
|
||||
assert "Test function 2 intentionally failed." in str(e)
|
||||
else:
|
||||
# ray.get should throw an exception.
|
||||
assert False
|
||||
|
||||
x, y, z = throw_exception_fct3.remote(1.0)
|
||||
for ref in [x, y, z]:
|
||||
try:
|
||||
ray.get(x)
|
||||
ray.get(ref)
|
||||
except Exception as e:
|
||||
assert "Test function 2 intentionally failed." in str(e)
|
||||
assert "Test function 3 intentionally failed." in str(e)
|
||||
else:
|
||||
# ray.get should throw an exception.
|
||||
assert False
|
||||
|
||||
x, y, z = throw_exception_fct3.remote(1.0)
|
||||
for ref in [x, y, z]:
|
||||
try:
|
||||
ray.get(ref)
|
||||
except Exception as e:
|
||||
assert "Test function 3 intentionally failed." in str(e)
|
||||
else:
|
||||
# ray.get should throw an exception.
|
||||
assert False
|
||||
@ray.remote
|
||||
def f():
|
||||
raise Exception("This function failed.")
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
raise Exception("This function failed.")
|
||||
try:
|
||||
ray.get(f.remote())
|
||||
except Exception as e:
|
||||
assert "This function failed." in str(e)
|
||||
else:
|
||||
# ray.get should throw an exception.
|
||||
assert False
|
||||
|
||||
try:
|
||||
ray.get(f.remote())
|
||||
except Exception as e:
|
||||
assert "This function failed." in str(e)
|
||||
else:
|
||||
# ray.get should throw an exception.
|
||||
assert False
|
||||
|
||||
def testFailImportingRemoteFunction(self):
|
||||
ray.init(num_workers=2)
|
||||
|
||||
# Create the contents of a temporary Python file.
|
||||
temporary_python_file = """
|
||||
def test_fail_importing_remote_function(ray_start_regular):
|
||||
# Create the contents of a temporary Python file.
|
||||
temporary_python_file = """
|
||||
def temporary_helper_function():
|
||||
return 1
|
||||
"""
|
||||
|
||||
f = tempfile.NamedTemporaryFile(suffix=".py")
|
||||
f.write(temporary_python_file.encode("ascii"))
|
||||
f.flush()
|
||||
directory = os.path.dirname(f.name)
|
||||
# Get the module name and strip ".py" from the end.
|
||||
module_name = os.path.basename(f.name)[:-3]
|
||||
sys.path.append(directory)
|
||||
module = __import__(module_name)
|
||||
f = tempfile.NamedTemporaryFile(suffix=".py")
|
||||
f.write(temporary_python_file.encode("ascii"))
|
||||
f.flush()
|
||||
directory = os.path.dirname(f.name)
|
||||
# Get the module name and strip ".py" from the end.
|
||||
module_name = os.path.basename(f.name)[:-3]
|
||||
sys.path.append(directory)
|
||||
module = __import__(module_name)
|
||||
|
||||
# Define a function that closes over this temporary module. This should
|
||||
# fail when it is unpickled.
|
||||
@ray.remote
|
||||
def g():
|
||||
return module.temporary_python_file()
|
||||
# Define a function that closes over this temporary module. This should
|
||||
# fail when it is unpickled.
|
||||
@ray.remote
|
||||
def g():
|
||||
return module.temporary_python_file()
|
||||
|
||||
wait_for_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR, 2)
|
||||
assert "No module named" in ray.error_info()[0]["message"]
|
||||
assert "No module named" in ray.error_info()[1]["message"]
|
||||
wait_for_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR, 2)
|
||||
assert "No module named" in ray.error_info()[0]["message"]
|
||||
assert "No module named" in ray.error_info()[1]["message"]
|
||||
|
||||
# Check that if we try to call the function it throws an exception and
|
||||
# does not hang.
|
||||
for _ in range(10):
|
||||
with pytest.raises(Exception):
|
||||
ray.get(g.remote())
|
||||
# Check that if we try to call the function it throws an exception and
|
||||
# does not hang.
|
||||
for _ in range(10):
|
||||
with pytest.raises(Exception):
|
||||
ray.get(g.remote())
|
||||
|
||||
f.close()
|
||||
f.close()
|
||||
|
||||
# Clean up the junk we added to sys.path.
|
||||
sys.path.pop(-1)
|
||||
# Clean up the junk we added to sys.path.
|
||||
sys.path.pop(-1)
|
||||
|
||||
def testFailedFunctionToRun(self):
|
||||
ray.init(num_workers=2)
|
||||
|
||||
def f(worker):
|
||||
if ray.worker.global_worker.mode == ray.WORKER_MODE:
|
||||
raise Exception("Function to run failed.")
|
||||
def test_failed_function_to_run(ray_start_regular):
|
||||
def f(worker):
|
||||
if ray.worker.global_worker.mode == ray.WORKER_MODE:
|
||||
raise Exception("Function to run failed.")
|
||||
|
||||
ray.worker.global_worker.run_function_on_all_workers(f)
|
||||
wait_for_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR, 2)
|
||||
# Check that the error message is in the task info.
|
||||
error_info = ray.error_info()
|
||||
assert len(error_info) == 2
|
||||
assert "Function to run failed." in error_info[0]["message"]
|
||||
assert "Function to run failed." in error_info[1]["message"]
|
||||
ray.worker.global_worker.run_function_on_all_workers(f)
|
||||
wait_for_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR, 2)
|
||||
# Check that the error message is in the task info.
|
||||
error_info = ray.error_info()
|
||||
assert len(error_info) == 2
|
||||
assert "Function to run failed." in error_info[0]["message"]
|
||||
assert "Function to run failed." in error_info[1]["message"]
|
||||
|
||||
def testFailImportingActor(self):
|
||||
ray.init(num_workers=2)
|
||||
|
||||
# Create the contents of a temporary Python file.
|
||||
temporary_python_file = """
|
||||
def test_fail_importing_actor(ray_start_regular):
|
||||
# Create the contents of a temporary Python file.
|
||||
temporary_python_file = """
|
||||
def temporary_helper_function():
|
||||
return 1
|
||||
"""
|
||||
|
||||
f = tempfile.NamedTemporaryFile(suffix=".py")
|
||||
f.write(temporary_python_file.encode("ascii"))
|
||||
f.flush()
|
||||
directory = os.path.dirname(f.name)
|
||||
# Get the module name and strip ".py" from the end.
|
||||
module_name = os.path.basename(f.name)[:-3]
|
||||
sys.path.append(directory)
|
||||
module = __import__(module_name)
|
||||
f = tempfile.NamedTemporaryFile(suffix=".py")
|
||||
f.write(temporary_python_file.encode("ascii"))
|
||||
f.flush()
|
||||
directory = os.path.dirname(f.name)
|
||||
# Get the module name and strip ".py" from the end.
|
||||
module_name = os.path.basename(f.name)[:-3]
|
||||
sys.path.append(directory)
|
||||
module = __import__(module_name)
|
||||
|
||||
# Define an actor that closes over this temporary module. This should
|
||||
# fail when it is unpickled.
|
||||
@ray.remote
|
||||
class Foo(object):
|
||||
def __init__(self):
|
||||
self.x = module.temporary_python_file()
|
||||
# Define an actor that closes over this temporary module. This should
|
||||
# fail when it is unpickled.
|
||||
@ray.remote
|
||||
class Foo(object):
|
||||
def __init__(self):
|
||||
self.x = module.temporary_python_file()
|
||||
|
||||
def get_val(self):
|
||||
return 1
|
||||
def get_val(self):
|
||||
return 1
|
||||
|
||||
# There should be no errors yet.
|
||||
assert len(ray.error_info()) == 0
|
||||
# There should be no errors yet.
|
||||
assert len(ray.error_info()) == 0
|
||||
|
||||
# Create an actor.
|
||||
foo = Foo.remote()
|
||||
# Create an actor.
|
||||
foo = Foo.remote()
|
||||
|
||||
# Wait for the error to arrive.
|
||||
wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1)
|
||||
assert "No module named" in ray.error_info()[0]["message"]
|
||||
# Wait for the error to arrive.
|
||||
wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1)
|
||||
assert "No module named" in ray.error_info()[0]["message"]
|
||||
|
||||
# Wait for the error from when the __init__ tries to run.
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
|
||||
assert ("failed to be imported, and so cannot execute this method" in
|
||||
ray.error_info()[1]["message"])
|
||||
# Wait for the error from when the __init__ tries to run.
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
|
||||
assert ("failed to be imported, and so cannot execute this method" in
|
||||
ray.error_info()[1]["message"])
|
||||
|
||||
# Check that if we try to get the function it throws an exception and
|
||||
# does not hang.
|
||||
with pytest.raises(Exception):
|
||||
ray.get(foo.get_val.remote())
|
||||
# Check that if we try to get the function it throws an exception and
|
||||
# does not hang.
|
||||
with pytest.raises(Exception):
|
||||
ray.get(foo.get_val.remote())
|
||||
|
||||
# Wait for the error from when the call to get_val.
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
|
||||
assert ("failed to be imported, and so cannot execute this method" in
|
||||
ray.error_info()[2]["message"])
|
||||
# Wait for the error from when the call to get_val.
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
|
||||
assert ("failed to be imported, and so cannot execute this method" in
|
||||
ray.error_info()[2]["message"])
|
||||
|
||||
f.close()
|
||||
f.close()
|
||||
|
||||
# Clean up the junk we added to sys.path.
|
||||
sys.path.pop(-1)
|
||||
# Clean up the junk we added to sys.path.
|
||||
sys.path.pop(-1)
|
||||
|
||||
|
||||
class ActorTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
def test_failed_actor_init(ray_start_regular):
|
||||
error_message1 = "actor constructor failed"
|
||||
error_message2 = "actor method failed"
|
||||
|
||||
def testFailedActorInit(self):
|
||||
ray.init(num_workers=0)
|
||||
@ray.remote
|
||||
class FailedActor(object):
|
||||
def __init__(self):
|
||||
raise Exception(error_message1)
|
||||
|
||||
error_message1 = "actor constructor failed"
|
||||
error_message2 = "actor method failed"
|
||||
def get_val(self):
|
||||
return 1
|
||||
|
||||
@ray.remote
|
||||
class FailedActor(object):
|
||||
def __init__(self):
|
||||
raise Exception(error_message1)
|
||||
def fail_method(self):
|
||||
raise Exception(error_message2)
|
||||
|
||||
def get_val(self):
|
||||
return 1
|
||||
a = FailedActor.remote()
|
||||
|
||||
def fail_method(self):
|
||||
raise Exception(error_message2)
|
||||
# Make sure that we get errors from a failed constructor.
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
|
||||
assert len(ray.error_info()) == 1
|
||||
assert error_message1 in ray.error_info()[0]["message"]
|
||||
|
||||
a = FailedActor.remote()
|
||||
|
||||
# Make sure that we get errors from a failed constructor.
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
|
||||
assert len(ray.error_info()) == 1
|
||||
assert error_message1 in ray.error_info()[0]["message"]
|
||||
|
||||
# Make sure that we get errors from a failed method.
|
||||
a.fail_method.remote()
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
|
||||
assert len(ray.error_info()) == 2
|
||||
assert error_message2 in ray.error_info()[1]["message"]
|
||||
|
||||
def testIncorrectMethodCalls(self):
|
||||
ray.init(num_workers=0)
|
||||
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def __init__(self, missing_variable_name):
|
||||
pass
|
||||
|
||||
def get_val(self, x):
|
||||
pass
|
||||
|
||||
# Make sure that we get errors if we call the constructor incorrectly.
|
||||
|
||||
# Create an actor with too few arguments.
|
||||
with pytest.raises(Exception):
|
||||
a = Actor.remote()
|
||||
|
||||
# Create an actor with too many arguments.
|
||||
with pytest.raises(Exception):
|
||||
a = Actor.remote(1, 2)
|
||||
|
||||
# Create an actor the correct number of arguments.
|
||||
a = Actor.remote(1)
|
||||
|
||||
# Call a method with too few arguments.
|
||||
with pytest.raises(Exception):
|
||||
a.get_val.remote()
|
||||
|
||||
# Call a method with too many arguments.
|
||||
with pytest.raises(Exception):
|
||||
a.get_val.remote(1, 2)
|
||||
# Call a method that doesn't exist.
|
||||
with pytest.raises(AttributeError):
|
||||
a.nonexistent_method()
|
||||
with pytest.raises(AttributeError):
|
||||
a.nonexistent_method.remote()
|
||||
# Make sure that we get errors from a failed method.
|
||||
a.fail_method.remote()
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
|
||||
assert len(ray.error_info()) == 2
|
||||
assert error_message2 in ray.error_info()[1]["message"]
|
||||
|
||||
|
||||
class WorkerDeath(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
|
||||
def testWorkerRaisingException(self):
|
||||
ray.init(num_workers=1)
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
ray.worker.global_worker._get_next_task_from_local_scheduler = None
|
||||
|
||||
# Running this task should cause the worker to raise an exception after
|
||||
# the task has successfully completed.
|
||||
f.remote()
|
||||
|
||||
wait_for_errors(ray_constants.WORKER_CRASH_PUSH_ERROR, 1)
|
||||
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
|
||||
assert len(ray.error_info()) == 2
|
||||
|
||||
def testWorkerDying(self):
|
||||
ray.init(num_workers=0)
|
||||
|
||||
# Define a remote function that will kill the worker that runs it.
|
||||
@ray.remote
|
||||
def f():
|
||||
eval("exit()")
|
||||
|
||||
f.remote()
|
||||
|
||||
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
|
||||
|
||||
error_info = ray.error_info()
|
||||
assert len(error_info) == 1
|
||||
assert "died or was killed while executing" in error_info[0]["message"]
|
||||
|
||||
def testActorWorkerDying(self):
|
||||
ray.init(num_workers=0)
|
||||
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def kill(self):
|
||||
eval("exit()")
|
||||
|
||||
@ray.remote
|
||||
def consume(x):
|
||||
def test_incorrect_method_calls(ray_start_regular):
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def __init__(self, missing_variable_name):
|
||||
pass
|
||||
|
||||
def get_val(self, x):
|
||||
pass
|
||||
|
||||
# Make sure that we get errors if we call the constructor incorrectly.
|
||||
|
||||
# Create an actor with too few arguments.
|
||||
with pytest.raises(Exception):
|
||||
a = Actor.remote()
|
||||
[obj], _ = ray.wait([a.kill.remote()], timeout=5000)
|
||||
|
||||
# Create an actor with too many arguments.
|
||||
with pytest.raises(Exception):
|
||||
a = Actor.remote(1, 2)
|
||||
|
||||
# Create an actor the correct number of arguments.
|
||||
a = Actor.remote(1)
|
||||
|
||||
# Call a method with too few arguments.
|
||||
with pytest.raises(Exception):
|
||||
a.get_val.remote()
|
||||
|
||||
# Call a method with too many arguments.
|
||||
with pytest.raises(Exception):
|
||||
a.get_val.remote(1, 2)
|
||||
# Call a method that doesn't exist.
|
||||
with pytest.raises(AttributeError):
|
||||
a.nonexistent_method()
|
||||
with pytest.raises(AttributeError):
|
||||
a.nonexistent_method.remote()
|
||||
|
||||
|
||||
def test_worker_raising_exception(ray_start_regular):
|
||||
@ray.remote
|
||||
def f():
|
||||
ray.worker.global_worker._get_next_task_from_local_scheduler = None
|
||||
|
||||
# Running this task should cause the worker to raise an exception after
|
||||
# the task has successfully completed.
|
||||
f.remote()
|
||||
|
||||
wait_for_errors(ray_constants.WORKER_CRASH_PUSH_ERROR, 1)
|
||||
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
|
||||
assert len(ray.error_info()) == 2
|
||||
|
||||
|
||||
def test_worker_dying(ray_start_regular):
|
||||
# Define a remote function that will kill the worker that runs it.
|
||||
@ray.remote
|
||||
def f():
|
||||
eval("exit()")
|
||||
|
||||
f.remote()
|
||||
|
||||
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
|
||||
|
||||
error_info = ray.error_info()
|
||||
assert len(error_info) == 1
|
||||
assert "died or was killed while executing" in error_info[0]["message"]
|
||||
|
||||
|
||||
def test_actor_worker_dying(ray_start_regular):
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def kill(self):
|
||||
eval("exit()")
|
||||
|
||||
@ray.remote
|
||||
def consume(x):
|
||||
pass
|
||||
|
||||
a = Actor.remote()
|
||||
[obj], _ = ray.wait([a.kill.remote()], timeout=5000)
|
||||
with pytest.raises(Exception):
|
||||
ray.get(obj)
|
||||
with pytest.raises(Exception):
|
||||
ray.get(consume.remote(obj))
|
||||
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
|
||||
|
||||
|
||||
def test_actor_worker_dying_future_tasks(ray_start_regular):
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def getpid(self):
|
||||
return os.getpid()
|
||||
|
||||
def sleep(self):
|
||||
time.sleep(1)
|
||||
|
||||
a = Actor.remote()
|
||||
pid = ray.get(a.getpid.remote())
|
||||
tasks1 = [a.sleep.remote() for _ in range(10)]
|
||||
os.kill(pid, 9)
|
||||
time.sleep(0.1)
|
||||
tasks2 = [a.sleep.remote() for _ in range(10)]
|
||||
for obj in tasks1 + tasks2:
|
||||
with pytest.raises(Exception):
|
||||
ray.get(obj)
|
||||
with pytest.raises(Exception):
|
||||
ray.get(consume.remote(obj))
|
||||
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
|
||||
|
||||
def testActorWorkerDyingFutureTasks(self):
|
||||
ray.init(num_workers=0)
|
||||
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def getpid(self):
|
||||
return os.getpid()
|
||||
|
||||
def sleep(self):
|
||||
time.sleep(1)
|
||||
|
||||
a = Actor.remote()
|
||||
pid = ray.get(a.getpid.remote())
|
||||
tasks1 = [a.sleep.remote() for _ in range(10)]
|
||||
os.kill(pid, 9)
|
||||
time.sleep(0.1)
|
||||
tasks2 = [a.sleep.remote() for _ in range(10)]
|
||||
for obj in tasks1 + tasks2:
|
||||
with pytest.raises(Exception):
|
||||
ray.get(obj)
|
||||
|
||||
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
|
||||
|
||||
def testActorWorkerDyingNothingInProgress(self):
|
||||
ray.init(num_workers=0)
|
||||
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def getpid(self):
|
||||
return os.getpid()
|
||||
|
||||
a = Actor.remote()
|
||||
pid = ray.get(a.getpid.remote())
|
||||
os.kill(pid, 9)
|
||||
time.sleep(0.1)
|
||||
task2 = a.getpid.remote()
|
||||
with pytest.raises(Exception):
|
||||
ray.get(task2)
|
||||
wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
|
||||
|
||||
|
||||
@unittest.skipIf(
|
||||
def test_actor_worker_dying_nothing_in_progress(ray_start_regular):
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def getpid(self):
|
||||
return os.getpid()
|
||||
|
||||
a = Actor.remote()
|
||||
pid = ray.get(a.getpid.remote())
|
||||
os.kill(pid, 9)
|
||||
time.sleep(0.1)
|
||||
task2 = a.getpid.remote()
|
||||
with pytest.raises(Exception):
|
||||
ray.get(task2)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_start_object_store_memory():
|
||||
# Start the Ray processes.
|
||||
store_size = 10**6
|
||||
ray.init(num_cpus=1, object_store_memory=store_size)
|
||||
yield None
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_XRAY") == "1",
|
||||
"This test does not work with xray yet.")
|
||||
class PutErrorTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
reason="This test does not work with xray yet.")
|
||||
def test_put_error1(ray_start_object_store_memory):
|
||||
num_objects = 3
|
||||
object_size = 4 * 10**5
|
||||
|
||||
def testPutError1(self):
|
||||
store_size = 10**6
|
||||
ray.worker._init(start_ray_local=True, object_store_memory=store_size)
|
||||
# Define a task with a single dependency, a numpy array, that returns
|
||||
# another array.
|
||||
@ray.remote
|
||||
def single_dependency(i, arg):
|
||||
arg = np.copy(arg)
|
||||
arg[0] = i
|
||||
return arg
|
||||
|
||||
num_objects = 3
|
||||
object_size = 4 * 10**5
|
||||
@ray.remote
|
||||
def put_arg_task():
|
||||
# Launch num_objects instances of the remote task, each dependent
|
||||
# on the one before it. The result of the first task should get
|
||||
# evicted.
|
||||
args = []
|
||||
arg = single_dependency.remote(0, np.zeros(
|
||||
object_size, dtype=np.uint8))
|
||||
for i in range(num_objects):
|
||||
arg = single_dependency.remote(i, arg)
|
||||
args.append(arg)
|
||||
|
||||
# Define a task with a single dependency, a numpy array, that returns
|
||||
# another array.
|
||||
@ray.remote
|
||||
def single_dependency(i, arg):
|
||||
arg = np.copy(arg)
|
||||
arg[0] = i
|
||||
return arg
|
||||
# Get the last value to force all tasks to finish.
|
||||
value = ray.get(args[-1])
|
||||
assert value[0] == i
|
||||
|
||||
@ray.remote
|
||||
def put_arg_task():
|
||||
# Launch num_objects instances of the remote task, each dependent
|
||||
# on the one before it. The result of the first task should get
|
||||
# evicted.
|
||||
args = []
|
||||
arg = single_dependency.remote(
|
||||
0, np.zeros(object_size, dtype=np.uint8))
|
||||
for i in range(num_objects):
|
||||
arg = single_dependency.remote(i, arg)
|
||||
args.append(arg)
|
||||
# Get the first value (which should have been evicted) to force
|
||||
# reconstruction. Currently, since we're not able to reconstruct
|
||||
# `ray.put` objects that were evicted and whose originating tasks
|
||||
# are still running, this for-loop should hang and push an error to
|
||||
# the driver.
|
||||
ray.get(args[0])
|
||||
|
||||
# Get the last value to force all tasks to finish.
|
||||
value = ray.get(args[-1])
|
||||
assert value[0] == i
|
||||
put_arg_task.remote()
|
||||
|
||||
# Get the first value (which should have been evicted) to force
|
||||
# reconstruction. Currently, since we're not able to reconstruct
|
||||
# `ray.put` objects that were evicted and whose originating tasks
|
||||
# are still running, this for-loop should hang and push an error to
|
||||
# the driver.
|
||||
ray.get(args[0])
|
||||
|
||||
put_arg_task.remote()
|
||||
|
||||
# Make sure we receive the correct error message.
|
||||
wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)
|
||||
|
||||
def testPutError2(self):
|
||||
# This is the same as the previous test, but it calls ray.put directly.
|
||||
store_size = 10**6
|
||||
ray.worker._init(start_ray_local=True, object_store_memory=store_size)
|
||||
|
||||
num_objects = 3
|
||||
object_size = 4 * 10**5
|
||||
|
||||
# Define a task with a single dependency, a numpy array, that returns
|
||||
# another array.
|
||||
@ray.remote
|
||||
def single_dependency(i, arg):
|
||||
arg = np.copy(arg)
|
||||
arg[0] = i
|
||||
return arg
|
||||
|
||||
@ray.remote
|
||||
def put_task():
|
||||
# Launch num_objects instances of the remote task, each dependent
|
||||
# on the one before it. The result of the first task should get
|
||||
# evicted.
|
||||
args = []
|
||||
arg = ray.put(np.zeros(object_size, dtype=np.uint8))
|
||||
for i in range(num_objects):
|
||||
arg = single_dependency.remote(i, arg)
|
||||
args.append(arg)
|
||||
|
||||
# Get the last value to force all tasks to finish.
|
||||
value = ray.get(args[-1])
|
||||
assert value[0] == i
|
||||
|
||||
# Get the first value (which should have been evicted) to force
|
||||
# reconstruction. Currently, since we're not able to reconstruct
|
||||
# `ray.put` objects that were evicted and whose originating tasks
|
||||
# are still running, this for-loop should hang and push an error to
|
||||
# the driver.
|
||||
ray.get(args[0])
|
||||
|
||||
put_task.remote()
|
||||
|
||||
# Make sure we receive the correct error message.
|
||||
wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)
|
||||
# Make sure we receive the correct error message.
|
||||
wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)
|
||||
|
||||
|
||||
class ConfigurationTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_XRAY") == "1",
|
||||
reason="This test does not work with xray yet.")
|
||||
def test_put_error2(ray_start_object_store_memory):
|
||||
# This is the same as the previous test, but it calls ray.put directly.
|
||||
num_objects = 3
|
||||
object_size = 4 * 10**5
|
||||
|
||||
def testVersionMismatch(self):
|
||||
ray_version = ray.__version__
|
||||
ray.__version__ = "fake ray version"
|
||||
# Define a task with a single dependency, a numpy array, that returns
|
||||
# another array.
|
||||
@ray.remote
|
||||
def single_dependency(i, arg):
|
||||
arg = np.copy(arg)
|
||||
arg[0] = i
|
||||
return arg
|
||||
|
||||
ray.init(num_workers=1)
|
||||
@ray.remote
|
||||
def put_task():
|
||||
# Launch num_objects instances of the remote task, each dependent
|
||||
# on the one before it. The result of the first task should get
|
||||
# evicted.
|
||||
args = []
|
||||
arg = ray.put(np.zeros(object_size, dtype=np.uint8))
|
||||
for i in range(num_objects):
|
||||
arg = single_dependency.remote(i, arg)
|
||||
args.append(arg)
|
||||
|
||||
wait_for_errors(ray_constants.VERSION_MISMATCH_PUSH_ERROR, 1)
|
||||
# Get the last value to force all tasks to finish.
|
||||
value = ray.get(args[-1])
|
||||
assert value[0] == i
|
||||
|
||||
ray.__version__ = ray_version
|
||||
# Get the first value (which should have been evicted) to force
|
||||
# reconstruction. Currently, since we're not able to reconstruct
|
||||
# `ray.put` objects that were evicted and whose originating tasks
|
||||
# are still running, this for-loop should hang and push an error to
|
||||
# the driver.
|
||||
ray.get(args[0])
|
||||
|
||||
put_task.remote()
|
||||
|
||||
# Make sure we receive the correct error message.
|
||||
wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)
|
||||
|
||||
|
||||
class WarningTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
def test_version_mismatch():
|
||||
ray_version = ray.__version__
|
||||
ray.__version__ = "fake ray version"
|
||||
|
||||
def testExportLargeObjects(self):
|
||||
import ray.ray_constants as ray_constants
|
||||
ray.init(num_cpus=1)
|
||||
|
||||
ray.init(num_workers=1)
|
||||
wait_for_errors(ray_constants.VERSION_MISMATCH_PUSH_ERROR, 1)
|
||||
|
||||
large_object = np.zeros(2 * ray_constants.PICKLE_OBJECT_WARNING_SIZE)
|
||||
# Reset the version.
|
||||
ray.__version__ = ray_version
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def test_export_large_objects(ray_start_regular):
|
||||
import ray.ray_constants as ray_constants
|
||||
|
||||
large_object = np.zeros(2 * ray_constants.PICKLE_OBJECT_WARNING_SIZE)
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
large_object
|
||||
|
||||
# Make sure that a warning is generated.
|
||||
wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 1)
|
||||
|
||||
@ray.remote
|
||||
class Foo(object):
|
||||
def __init__(self):
|
||||
large_object
|
||||
|
||||
# Make sure that a warning is generated.
|
||||
wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 1)
|
||||
Foo.remote()
|
||||
|
||||
@ray.remote
|
||||
class Foo(object):
|
||||
def __init__(self):
|
||||
large_object
|
||||
|
||||
Foo.remote()
|
||||
|
||||
# Make sure that a warning is generated.
|
||||
wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
# Make sure that a warning is generated.
|
||||
wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 2)
|
||||
|
||||
+107
-109
@@ -2,120 +2,118 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
import os
|
||||
import unittest
|
||||
import ray
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
|
||||
class MicroBenchmarkTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
|
||||
def testTiming(self):
|
||||
@ray.remote
|
||||
def empty_function():
|
||||
pass
|
||||
|
||||
@ray.remote
|
||||
def trivial_function():
|
||||
return 1
|
||||
|
||||
ray.init(num_workers=3)
|
||||
|
||||
# Measure the time required to submit a remote task to the scheduler.
|
||||
elapsed_times = []
|
||||
for _ in range(1000):
|
||||
start_time = time.time()
|
||||
empty_function.remote()
|
||||
end_time = time.time()
|
||||
elapsed_times.append(end_time - start_time)
|
||||
elapsed_times = np.sort(elapsed_times)
|
||||
average_elapsed_time = sum(elapsed_times) / 1000
|
||||
print("Time required to submit an empty function call:")
|
||||
print(" Average: {}".format(average_elapsed_time))
|
||||
print(" 90th percentile: {}".format(elapsed_times[900]))
|
||||
print(" 99th percentile: {}".format(elapsed_times[990]))
|
||||
print(" worst: {}".format(elapsed_times[999]))
|
||||
# average_elapsed_time should be about 0.00038.
|
||||
|
||||
# Measure the time required to submit a remote task to the scheduler
|
||||
# (where the remote task returns one value).
|
||||
elapsed_times = []
|
||||
for _ in range(1000):
|
||||
start_time = time.time()
|
||||
trivial_function.remote()
|
||||
end_time = time.time()
|
||||
elapsed_times.append(end_time - start_time)
|
||||
elapsed_times = np.sort(elapsed_times)
|
||||
average_elapsed_time = sum(elapsed_times) / 1000
|
||||
print("Time required to submit a trivial function call:")
|
||||
print(" Average: {}".format(average_elapsed_time))
|
||||
print(" 90th percentile: {}".format(elapsed_times[900]))
|
||||
print(" 99th percentile: {}".format(elapsed_times[990]))
|
||||
print(" worst: {}".format(elapsed_times[999]))
|
||||
# average_elapsed_time should be about 0.001.
|
||||
|
||||
# Measure the time required to submit a remote task to the scheduler
|
||||
# and get the result.
|
||||
elapsed_times = []
|
||||
for _ in range(1000):
|
||||
start_time = time.time()
|
||||
x = trivial_function.remote()
|
||||
ray.get(x)
|
||||
end_time = time.time()
|
||||
elapsed_times.append(end_time - start_time)
|
||||
elapsed_times = np.sort(elapsed_times)
|
||||
average_elapsed_time = sum(elapsed_times) / 1000
|
||||
print("Time required to submit a trivial function call and get the "
|
||||
"result:")
|
||||
print(" Average: {}".format(average_elapsed_time))
|
||||
print(" 90th percentile: {}".format(elapsed_times[900]))
|
||||
print(" 99th percentile: {}".format(elapsed_times[990]))
|
||||
print(" worst: {}".format(elapsed_times[999]))
|
||||
# average_elapsed_time should be about 0.0013.
|
||||
|
||||
# Measure the time required to do do a put.
|
||||
elapsed_times = []
|
||||
for _ in range(1000):
|
||||
start_time = time.time()
|
||||
ray.put(1)
|
||||
end_time = time.time()
|
||||
elapsed_times.append(end_time - start_time)
|
||||
elapsed_times = np.sort(elapsed_times)
|
||||
average_elapsed_time = sum(elapsed_times) / 1000
|
||||
print("Time required to put an int:")
|
||||
print(" Average: {}".format(average_elapsed_time))
|
||||
print(" 90th percentile: {}".format(elapsed_times[900]))
|
||||
print(" 99th percentile: {}".format(elapsed_times[990]))
|
||||
print(" worst: {}".format(elapsed_times[999]))
|
||||
# average_elapsed_time should be about 0.00087.
|
||||
|
||||
def testCache(self):
|
||||
ray.init(num_workers=1)
|
||||
|
||||
A = np.random.rand(1, 1000000)
|
||||
v = np.random.rand(1000000)
|
||||
A_id = ray.put(A)
|
||||
v_id = ray.put(v)
|
||||
a = time.time()
|
||||
for i in range(100):
|
||||
A.dot(v)
|
||||
b = time.time() - a
|
||||
c = time.time()
|
||||
for i in range(100):
|
||||
ray.get(A_id).dot(ray.get(v_id))
|
||||
d = time.time() - c
|
||||
|
||||
if d > 1.5 * b:
|
||||
if os.getenv("TRAVIS") is None:
|
||||
raise Exception("The caching test was too slow. "
|
||||
"d = {}, b = {}".format(d, b))
|
||||
else:
|
||||
print("WARNING: The caching test was too slow. "
|
||||
"d = {}, b = {}".format(d, b))
|
||||
@pytest.fixture
|
||||
def ray_start_regular():
|
||||
# Start the Ray processes.
|
||||
ray.init(num_cpus=3)
|
||||
yield None
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
def test_timing(ray_start_regular):
|
||||
@ray.remote
|
||||
def empty_function():
|
||||
pass
|
||||
|
||||
@ray.remote
|
||||
def trivial_function():
|
||||
return 1
|
||||
|
||||
# Measure the time required to submit a remote task to the scheduler.
|
||||
elapsed_times = []
|
||||
for _ in range(1000):
|
||||
start_time = time.time()
|
||||
empty_function.remote()
|
||||
end_time = time.time()
|
||||
elapsed_times.append(end_time - start_time)
|
||||
elapsed_times = np.sort(elapsed_times)
|
||||
average_elapsed_time = sum(elapsed_times) / 1000
|
||||
print("Time required to submit an empty function call:")
|
||||
print(" Average: {}".format(average_elapsed_time))
|
||||
print(" 90th percentile: {}".format(elapsed_times[900]))
|
||||
print(" 99th percentile: {}".format(elapsed_times[990]))
|
||||
print(" worst: {}".format(elapsed_times[999]))
|
||||
# average_elapsed_time should be about 0.00038.
|
||||
|
||||
# Measure the time required to submit a remote task to the scheduler
|
||||
# (where the remote task returns one value).
|
||||
elapsed_times = []
|
||||
for _ in range(1000):
|
||||
start_time = time.time()
|
||||
trivial_function.remote()
|
||||
end_time = time.time()
|
||||
elapsed_times.append(end_time - start_time)
|
||||
elapsed_times = np.sort(elapsed_times)
|
||||
average_elapsed_time = sum(elapsed_times) / 1000
|
||||
print("Time required to submit a trivial function call:")
|
||||
print(" Average: {}".format(average_elapsed_time))
|
||||
print(" 90th percentile: {}".format(elapsed_times[900]))
|
||||
print(" 99th percentile: {}".format(elapsed_times[990]))
|
||||
print(" worst: {}".format(elapsed_times[999]))
|
||||
# average_elapsed_time should be about 0.001.
|
||||
|
||||
# Measure the time required to submit a remote task to the scheduler
|
||||
# and get the result.
|
||||
elapsed_times = []
|
||||
for _ in range(1000):
|
||||
start_time = time.time()
|
||||
x = trivial_function.remote()
|
||||
ray.get(x)
|
||||
end_time = time.time()
|
||||
elapsed_times.append(end_time - start_time)
|
||||
elapsed_times = np.sort(elapsed_times)
|
||||
average_elapsed_time = sum(elapsed_times) / 1000
|
||||
print("Time required to submit a trivial function call and get the "
|
||||
"result:")
|
||||
print(" Average: {}".format(average_elapsed_time))
|
||||
print(" 90th percentile: {}".format(elapsed_times[900]))
|
||||
print(" 99th percentile: {}".format(elapsed_times[990]))
|
||||
print(" worst: {}".format(elapsed_times[999]))
|
||||
# average_elapsed_time should be about 0.0013.
|
||||
|
||||
# Measure the time required to do do a put.
|
||||
elapsed_times = []
|
||||
for _ in range(1000):
|
||||
start_time = time.time()
|
||||
ray.put(1)
|
||||
end_time = time.time()
|
||||
elapsed_times.append(end_time - start_time)
|
||||
elapsed_times = np.sort(elapsed_times)
|
||||
average_elapsed_time = sum(elapsed_times) / 1000
|
||||
print("Time required to put an int:")
|
||||
print(" Average: {}".format(average_elapsed_time))
|
||||
print(" 90th percentile: {}".format(elapsed_times[900]))
|
||||
print(" 99th percentile: {}".format(elapsed_times[990]))
|
||||
print(" worst: {}".format(elapsed_times[999]))
|
||||
# average_elapsed_time should be about 0.00087.
|
||||
|
||||
|
||||
def test_cache(ray_start_regular):
|
||||
A = np.random.rand(1, 1000000)
|
||||
v = np.random.rand(1000000)
|
||||
A_id = ray.put(A)
|
||||
v_id = ray.put(v)
|
||||
a = time.time()
|
||||
for i in range(100):
|
||||
A.dot(v)
|
||||
b = time.time() - a
|
||||
c = time.time()
|
||||
for i in range(100):
|
||||
ray.get(A_id).dot(ray.get(v_id))
|
||||
d = time.time() - c
|
||||
|
||||
if d > 1.5 * b:
|
||||
if os.getenv("TRAVIS") is None:
|
||||
raise Exception("The caching test was too slow. "
|
||||
"d = {}, b = {}".format(d, b))
|
||||
else:
|
||||
print("WARNING: The caching test was too slow. "
|
||||
"d = {}, b = {}".format(d, b))
|
||||
|
||||
+201
-189
@@ -1,15 +1,16 @@
|
||||
from __future__ import absolute_import, division, print_function
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import pytest
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
from ray.test.test_utils import run_and_get_output
|
||||
import pytest
|
||||
|
||||
|
||||
def run_string_as_driver(driver_script):
|
||||
@@ -30,52 +31,56 @@ def run_string_as_driver(driver_script):
|
||||
return out
|
||||
|
||||
|
||||
class MultiNodeTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
out = run_and_get_output(["ray", "start", "--head"])
|
||||
# Get the redis address from the output.
|
||||
redis_substring_prefix = "redis_address=\""
|
||||
redis_address_location = (
|
||||
out.find(redis_substring_prefix) + len(redis_substring_prefix))
|
||||
redis_address = out[redis_address_location:]
|
||||
self.redis_address = redis_address.split("\"")[0]
|
||||
@pytest.fixture
|
||||
def ray_start_head():
|
||||
out = run_and_get_output(["ray", "start", "--head"])
|
||||
# Get the redis address from the output.
|
||||
redis_substring_prefix = "redis_address=\""
|
||||
redis_address_location = (
|
||||
out.find(redis_substring_prefix) + len(redis_substring_prefix))
|
||||
redis_address = out[redis_address_location:]
|
||||
redis_address = redis_address.split("\"")[0]
|
||||
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
# Kill the Ray cluster.
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
yield redis_address
|
||||
|
||||
def testErrorIsolation(self):
|
||||
# Connect a driver to the Ray cluster.
|
||||
ray.init(redis_address=self.redis_address)
|
||||
# Disconnect from the Ray cluster.
|
||||
ray.shutdown()
|
||||
# Kill the Ray cluster.
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# There shouldn't be any errors yet.
|
||||
assert len(ray.error_info()) == 0
|
||||
|
||||
error_string1 = "error_string1"
|
||||
error_string2 = "error_string2"
|
||||
def test_error_isolation(ray_start_head):
|
||||
redis_address = ray_start_head
|
||||
# Connect a driver to the Ray cluster.
|
||||
ray.init(redis_address=redis_address)
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
raise Exception(error_string1)
|
||||
# There shouldn't be any errors yet.
|
||||
assert len(ray.error_info()) == 0
|
||||
|
||||
# Run a remote function that throws an error.
|
||||
with pytest.raises(Exception):
|
||||
ray.get(f.remote())
|
||||
error_string1 = "error_string1"
|
||||
error_string2 = "error_string2"
|
||||
|
||||
# Wait for the error to appear in Redis.
|
||||
while len(ray.error_info()) != 1:
|
||||
time.sleep(0.1)
|
||||
print("Waiting for error to appear.")
|
||||
@ray.remote
|
||||
def f():
|
||||
raise Exception(error_string1)
|
||||
|
||||
# Make sure we got the error.
|
||||
assert len(ray.error_info()) == 1
|
||||
assert error_string1 in ray.error_info()[0]["message"]
|
||||
# Run a remote function that throws an error.
|
||||
with pytest.raises(Exception):
|
||||
ray.get(f.remote())
|
||||
|
||||
# Start another driver and make sure that it does not receive this
|
||||
# error. Make the other driver throw an error, and make sure it
|
||||
# receives that error.
|
||||
driver_script = """
|
||||
# Wait for the error to appear in Redis.
|
||||
while len(ray.error_info()) != 1:
|
||||
time.sleep(0.1)
|
||||
print("Waiting for error to appear.")
|
||||
|
||||
# Make sure we got the error.
|
||||
assert len(ray.error_info()) == 1
|
||||
assert error_string1 in ray.error_info()[0]["message"]
|
||||
|
||||
# Start another driver and make sure that it does not receive this
|
||||
# error. Make the other driver throw an error, and make sure it
|
||||
# receives that error.
|
||||
driver_script = """
|
||||
import ray
|
||||
import time
|
||||
|
||||
@@ -101,25 +106,28 @@ assert len(ray.error_info()) == 1
|
||||
assert "{}" in ray.error_info()[0]["message"]
|
||||
|
||||
print("success")
|
||||
""".format(self.redis_address, error_string2, error_string2)
|
||||
""".format(redis_address, error_string2, error_string2)
|
||||
|
||||
out = run_string_as_driver(driver_script)
|
||||
# Make sure the other driver succeeded.
|
||||
assert "success" in out
|
||||
out = run_string_as_driver(driver_script)
|
||||
# Make sure the other driver succeeded.
|
||||
assert "success" in out
|
||||
|
||||
# Make sure that the other error message doesn't show up for this
|
||||
# driver.
|
||||
assert len(ray.error_info()) == 1
|
||||
assert error_string1 in ray.error_info()[0]["message"]
|
||||
# Make sure that the other error message doesn't show up for this
|
||||
# driver.
|
||||
assert len(ray.error_info()) == 1
|
||||
assert error_string1 in ray.error_info()[0]["message"]
|
||||
|
||||
def testRemoteFunctionIsolation(self):
|
||||
# This test will run multiple remote functions with the same names in
|
||||
# two different drivers. Connect a driver to the Ray cluster.
|
||||
ray.init(redis_address=self.redis_address)
|
||||
|
||||
# Start another driver and make sure that it can define and call its
|
||||
# own commands with the same names.
|
||||
driver_script = """
|
||||
def test_remote_function_isolation(ray_start_head):
|
||||
# This test will run multiple remote functions with the same names in
|
||||
# two different drivers. Connect a driver to the Ray cluster.
|
||||
redis_address = ray_start_head
|
||||
|
||||
ray.init(redis_address=redis_address)
|
||||
|
||||
# Start another driver and make sure that it can define and call its
|
||||
# own commands with the same names.
|
||||
driver_script = """
|
||||
import ray
|
||||
import time
|
||||
ray.init(redis_address="{}")
|
||||
@@ -133,32 +141,35 @@ for _ in range(10000):
|
||||
result = ray.get([f.remote(), g.remote(0, 0)])
|
||||
assert result == [3, 4]
|
||||
print("success")
|
||||
""".format(self.redis_address)
|
||||
""".format(redis_address)
|
||||
|
||||
out = run_string_as_driver(driver_script)
|
||||
out = run_string_as_driver(driver_script)
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
return 1
|
||||
@ray.remote
|
||||
def f():
|
||||
return 1
|
||||
|
||||
@ray.remote
|
||||
def g(x):
|
||||
return 2
|
||||
@ray.remote
|
||||
def g(x):
|
||||
return 2
|
||||
|
||||
for _ in range(10000):
|
||||
result = ray.get([f.remote(), g.remote(0)])
|
||||
assert result == [1, 2]
|
||||
for _ in range(10000):
|
||||
result = ray.get([f.remote(), g.remote(0)])
|
||||
assert result == [1, 2]
|
||||
|
||||
# Make sure the other driver succeeded.
|
||||
assert "success" in out
|
||||
# Make sure the other driver succeeded.
|
||||
assert "success" in out
|
||||
|
||||
def testDriverExitingQuickly(self):
|
||||
# This test will create some drivers that submit some tasks and then
|
||||
# exit without waiting for the tasks to complete.
|
||||
ray.init(redis_address=self.redis_address)
|
||||
|
||||
# Define a driver that creates an actor and exits.
|
||||
driver_script1 = """
|
||||
def test_driver_exiting_quickly(ray_start_head):
|
||||
# This test will create some drivers that submit some tasks and then
|
||||
# exit without waiting for the tasks to complete.
|
||||
redis_address = ray_start_head
|
||||
|
||||
ray.init(redis_address=redis_address)
|
||||
|
||||
# Define a driver that creates an actor and exits.
|
||||
driver_script1 = """
|
||||
import ray
|
||||
ray.init(redis_address="{}")
|
||||
@ray.remote
|
||||
@@ -167,10 +178,10 @@ class Foo(object):
|
||||
pass
|
||||
Foo.remote()
|
||||
print("success")
|
||||
""".format(self.redis_address)
|
||||
""".format(redis_address)
|
||||
|
||||
# Define a driver that creates some tasks and exits.
|
||||
driver_script2 = """
|
||||
# Define a driver that creates some tasks and exits.
|
||||
driver_script2 = """
|
||||
import ray
|
||||
ray.init(redis_address="{}")
|
||||
@ray.remote
|
||||
@@ -178,137 +189,142 @@ def f():
|
||||
return 1
|
||||
f.remote()
|
||||
print("success")
|
||||
""".format(self.redis_address)
|
||||
""".format(redis_address)
|
||||
|
||||
# Create some drivers and let them exit and make sure everything is
|
||||
# still alive.
|
||||
for _ in range(3):
|
||||
out = run_string_as_driver(driver_script1)
|
||||
# Make sure the first driver ran to completion.
|
||||
assert "success" in out
|
||||
out = run_string_as_driver(driver_script2)
|
||||
# Make sure the first driver ran to completion.
|
||||
assert "success" in out
|
||||
assert ray.services.all_processes_alive()
|
||||
# Create some drivers and let them exit and make sure everything is
|
||||
# still alive.
|
||||
for _ in range(3):
|
||||
out = run_string_as_driver(driver_script1)
|
||||
# Make sure the first driver ran to completion.
|
||||
assert "success" in out
|
||||
out = run_string_as_driver(driver_script2)
|
||||
# Make sure the first driver ran to completion.
|
||||
assert "success" in out
|
||||
assert ray.services.all_processes_alive()
|
||||
|
||||
|
||||
class StartRayScriptTest(unittest.TestCase):
|
||||
def testCallingStartRayHead(self):
|
||||
# Test that we can call start-ray.sh with various command line
|
||||
# parameters. TODO(rkn): This test only tests the --head code path. We
|
||||
# should also test the non-head node code path.
|
||||
def test_calling_start_ray_head():
|
||||
# Test that we can call start-ray.sh with various command line
|
||||
# parameters. TODO(rkn): This test only tests the --head code path. We
|
||||
# should also test the non-head node code path.
|
||||
|
||||
# Test starting Ray with no arguments.
|
||||
run_and_get_output(["ray", "start", "--head"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
# Test starting Ray with no arguments.
|
||||
run_and_get_output(["ray", "start", "--head"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# Test starting Ray with a number of workers specified.
|
||||
run_and_get_output(["ray", "start", "--head", "--num-workers", "20"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
# Test starting Ray with a number of workers specified.
|
||||
run_and_get_output(["ray", "start", "--head", "--num-workers", "20"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# Test starting Ray with a redis port specified.
|
||||
run_and_get_output(["ray", "start", "--head", "--redis-port", "6379"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
# Test starting Ray with a redis port specified.
|
||||
run_and_get_output(["ray", "start", "--head", "--redis-port", "6379"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# Test starting Ray with a node IP address specified.
|
||||
run_and_get_output(
|
||||
["ray", "start", "--head", "--node-ip-address", "127.0.0.1"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
# Test starting Ray with a node IP address specified.
|
||||
run_and_get_output(
|
||||
["ray", "start", "--head", "--node-ip-address", "127.0.0.1"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# Test starting Ray with an object manager port specified.
|
||||
run_and_get_output(
|
||||
["ray", "start", "--head", "--object-manager-port", "12345"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
# Test starting Ray with an object manager port specified.
|
||||
run_and_get_output(
|
||||
["ray", "start", "--head", "--object-manager-port", "12345"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# Test starting Ray with the number of CPUs specified.
|
||||
run_and_get_output(["ray", "start", "--head", "--num-cpus", "2"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
# Test starting Ray with the number of CPUs specified.
|
||||
run_and_get_output(["ray", "start", "--head", "--num-cpus", "2"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# Test starting Ray with the number of GPUs specified.
|
||||
run_and_get_output(["ray", "start", "--head", "--num-gpus", "100"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
# Test starting Ray with the number of GPUs specified.
|
||||
run_and_get_output(["ray", "start", "--head", "--num-gpus", "100"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# Test starting Ray with the max redis clients specified.
|
||||
run_and_get_output(
|
||||
["ray", "start", "--head", "--redis-max-clients", "100"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
# Test starting Ray with the max redis clients specified.
|
||||
run_and_get_output(
|
||||
["ray", "start", "--head", "--redis-max-clients", "100"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
if "RAY_USE_NEW_GCS" not in os.environ:
|
||||
# Test starting Ray with redis shard ports specified.
|
||||
run_and_get_output([
|
||||
"ray", "start", "--head", "--redis-shard-ports",
|
||||
"6380,6381,6382"
|
||||
])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# Test starting Ray with all arguments specified.
|
||||
run_and_get_output([
|
||||
"ray", "start", "--head", "--num-workers", "2", "--redis-port",
|
||||
"6379", "--redis-shard-ports", "6380,6381,6382",
|
||||
"--object-manager-port", "12345", "--num-cpus", "2",
|
||||
"--num-gpus", "0", "--redis-max-clients", "100", "--resources",
|
||||
"{\"Custom\": 1}"
|
||||
])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# Test starting Ray with invalid arguments.
|
||||
with pytest.raises(Exception):
|
||||
run_and_get_output([
|
||||
"ray", "start", "--head", "--redis-address", "127.0.0.1:6379"
|
||||
])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
def testUsingHostnames(self):
|
||||
# Start the Ray processes on this machine.
|
||||
if "RAY_USE_NEW_GCS" not in os.environ:
|
||||
# Test starting Ray with redis shard ports specified.
|
||||
run_and_get_output([
|
||||
"ray", "start", "--head", "--node-ip-address=localhost",
|
||||
"--redis-port=6379"
|
||||
"ray", "start", "--head", "--redis-shard-ports", "6380,6381,6382"
|
||||
])
|
||||
|
||||
ray.init(node_ip_address="localhost", redis_address="localhost:6379")
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
return 1
|
||||
|
||||
assert ray.get(f.remote()) == 1
|
||||
|
||||
# Kill the Ray cluster.
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
# Test starting Ray with all arguments specified.
|
||||
run_and_get_output([
|
||||
"ray", "start", "--head", "--num-workers", "2", "--redis-port",
|
||||
"6379", "--redis-shard-ports", "6380,6381,6382",
|
||||
"--object-manager-port", "12345", "--num-cpus", "2", "--num-gpus",
|
||||
"0", "--redis-max-clients", "100", "--resources", "{\"Custom\": 1}"
|
||||
])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
class MiscellaneousTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
# Test starting Ray with invalid arguments.
|
||||
with pytest.raises(Exception):
|
||||
run_and_get_output(
|
||||
["ray", "start", "--head", "--redis-address", "127.0.0.1:6379"])
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
def testConnectingInLocalCase(self):
|
||||
address_info = ray.init(num_cpus=0)
|
||||
|
||||
# Define a driver that just connects to Redis.
|
||||
driver_script = """
|
||||
@pytest.fixture
|
||||
def ray_start_head_local():
|
||||
# Start the Ray processes on this machine.
|
||||
run_and_get_output([
|
||||
"ray", "start", "--head", "--node-ip-address=localhost",
|
||||
"--redis-port=6379"
|
||||
])
|
||||
|
||||
yield None
|
||||
|
||||
# Disconnect from the Ray cluster.
|
||||
ray.shutdown()
|
||||
# Kill the Ray cluster.
|
||||
subprocess.Popen(["ray", "stop"]).wait()
|
||||
|
||||
|
||||
def test_using_hostnames(ray_start_head_local):
|
||||
ray.init(node_ip_address="localhost", redis_address="localhost:6379")
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
return 1
|
||||
|
||||
assert ray.get(f.remote()) == 1
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_start_regular():
|
||||
# Start the Ray processes.
|
||||
address_info = ray.init(num_cpus=1)
|
||||
yield address_info
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def test_connecting_in_local_case(ray_start_regular):
|
||||
address_info = ray_start_regular
|
||||
|
||||
# Define a driver that just connects to Redis.
|
||||
driver_script = """
|
||||
import ray
|
||||
ray.init(redis_address="{}")
|
||||
print("success")
|
||||
""".format(address_info["redis_address"])
|
||||
|
||||
out = run_string_as_driver(driver_script)
|
||||
# Make sure the other driver succeeded.
|
||||
assert "success" in out
|
||||
out = run_string_as_driver(driver_script)
|
||||
# Make sure the other driver succeeded.
|
||||
assert "success" in out
|
||||
|
||||
|
||||
class RunDriverForMultipleTimesTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
|
||||
def testRunDriverForTwice(self):
|
||||
# We used to have issue 2165 and 2288:
|
||||
# https://github.com/ray-project/ray/issues/2165
|
||||
# https://github.com/ray-project/ray/issues/2288
|
||||
# both complain that driver will hang when run for the second time.
|
||||
# This test is used to verify the fix for above issue, it will run the
|
||||
# same driver for twice and verify whether both of them succeed.
|
||||
address_info = ray.init()
|
||||
driver_script = """
|
||||
def test_run_driver_twice(ray_start_regular):
|
||||
# We used to have issue 2165 and 2288:
|
||||
# https://github.com/ray-project/ray/issues/2165
|
||||
# https://github.com/ray-project/ray/issues/2288
|
||||
# both complain that driver will hang when run for the second time.
|
||||
# This test is used to verify the fix for above issue, it will run the
|
||||
# same driver for twice and verify whether both of them succeed.
|
||||
address_info = ray_start_regular
|
||||
driver_script = """
|
||||
import ray
|
||||
import ray.tune as tune
|
||||
import os
|
||||
@@ -338,10 +354,6 @@ tune.run_experiments({{
|
||||
print("success")
|
||||
""".format(address_info["redis_address"])
|
||||
|
||||
for i in range(2):
|
||||
out = run_string_as_driver(driver_script)
|
||||
assert "success" in out
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
for i in range(2):
|
||||
out = run_string_as_driver(driver_script)
|
||||
assert "success" in out
|
||||
|
||||
+2
-2
@@ -1287,7 +1287,7 @@ class APITest(unittest.TestCase):
|
||||
|
||||
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False),
|
||||
os.environ.get("RAY_USE_NEW_GCS", False),
|
||||
"For now, RAY_USE_NEW_GCS supports 1 shard, and credis "
|
||||
"supports 1-node chain for that shard only.")
|
||||
class APITestSharded(APITest):
|
||||
@@ -2187,7 +2187,7 @@ def wait_for_num_objects(num_objects, timeout=10):
|
||||
|
||||
|
||||
@unittest.skipIf(
|
||||
os.environ.get('RAY_USE_NEW_GCS', False),
|
||||
os.environ.get("RAY_USE_NEW_GCS", False),
|
||||
"New GCS API doesn't have a Python API yet.")
|
||||
class GlobalStateAPI(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
|
||||
+165
-174
@@ -3,8 +3,8 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from numpy.testing import assert_almost_equal
|
||||
import pytest
|
||||
import tensorflow as tf
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
|
||||
@@ -93,178 +93,169 @@ class TrainActor(object):
|
||||
return self.values[1].get_weights()
|
||||
|
||||
|
||||
class TensorFlowTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
|
||||
def testTensorFlowVariables(self):
|
||||
ray.init(num_workers=2)
|
||||
|
||||
sess = tf.Session()
|
||||
loss, init, _, _ = make_linear_network()
|
||||
sess.run(init)
|
||||
|
||||
variables = ray.experimental.TensorFlowVariables(loss, sess)
|
||||
weights = variables.get_weights()
|
||||
|
||||
for (name, val) in weights.items():
|
||||
weights[name] += 1.0
|
||||
|
||||
variables.set_weights(weights)
|
||||
assert weights == variables.get_weights()
|
||||
|
||||
loss2, init2, _, _ = make_linear_network("w", "b")
|
||||
sess.run(init2)
|
||||
|
||||
variables2 = ray.experimental.TensorFlowVariables(loss2, sess)
|
||||
weights2 = variables2.get_weights()
|
||||
|
||||
for (name, val) in weights2.items():
|
||||
weights2[name] += 2.0
|
||||
|
||||
variables2.set_weights(weights2)
|
||||
assert weights2 == variables2.get_weights()
|
||||
flat_weights = variables2.get_flat() + 2.0
|
||||
variables2.set_flat(flat_weights)
|
||||
assert_almost_equal(flat_weights, variables2.get_flat())
|
||||
|
||||
variables3 = ray.experimental.TensorFlowVariables([loss2])
|
||||
assert variables3.sess is None
|
||||
sess = tf.Session()
|
||||
variables3.set_session(sess)
|
||||
assert variables3.sess == sess
|
||||
|
||||
# Test that the variable names for the two different nets are not
|
||||
# modified by TensorFlow to be unique (i.e., they should already
|
||||
# be unique because of the variable prefix).
|
||||
def testVariableNameCollision(self):
|
||||
ray.init(num_workers=2)
|
||||
|
||||
net1 = NetActor()
|
||||
net2 = NetActor()
|
||||
|
||||
# This is checking that the variable names of the two nets are the
|
||||
# same, i.e., that the names in the weight dictionaries are the same.
|
||||
net1.values[0].set_weights(net2.values[0].get_weights())
|
||||
|
||||
# Test that TensorFlowVariables can take in addition variables through
|
||||
# input_variables arg and with no loss.
|
||||
def testAdditionalVariablesNoLoss(self):
|
||||
ray.init(num_workers=1)
|
||||
|
||||
net = LossActor(use_loss=False)
|
||||
assert len(net.values[0].variables.items()) == 1
|
||||
assert len(net.values[0].placeholders.items()) == 1
|
||||
|
||||
net.values[0].set_weights(net.values[0].get_weights())
|
||||
|
||||
# Test that TensorFlowVariables can take in addition variables through
|
||||
# input_variables arg and with a loss.
|
||||
def testAdditionalVariablesWithLoss(self):
|
||||
ray.init(num_workers=1)
|
||||
|
||||
net = LossActor()
|
||||
assert len(net.values[0].variables.items()) == 3
|
||||
assert len(net.values[0].placeholders.items()) == 3
|
||||
|
||||
net.values[0].set_weights(net.values[0].get_weights())
|
||||
|
||||
# Test that different networks on the same worker are independent and
|
||||
# we can get/set their weights without any interaction.
|
||||
def testNetworksIndependent(self):
|
||||
# Note we use only one worker to ensure that all of the remote
|
||||
# functions run on the same worker.
|
||||
ray.init(num_workers=1)
|
||||
net1 = NetActor()
|
||||
net2 = NetActor()
|
||||
|
||||
# Make sure the two networks have different weights. TODO(rkn): Note
|
||||
# that equality comparisons of numpy arrays normally does not work.
|
||||
# This only works because at the moment they have size 1.
|
||||
weights1 = net1.get_weights()
|
||||
weights2 = net2.get_weights()
|
||||
assert weights1 != weights2
|
||||
|
||||
# Set the weights and get the weights, and make sure they are
|
||||
# unchanged.
|
||||
new_weights1 = net1.set_and_get_weights(weights1)
|
||||
new_weights2 = net2.set_and_get_weights(weights2)
|
||||
assert weights1 == new_weights1
|
||||
assert weights2 == new_weights2
|
||||
|
||||
# Swap the weights.
|
||||
new_weights1 = net2.set_and_get_weights(weights1)
|
||||
new_weights2 = net1.set_and_get_weights(weights2)
|
||||
assert weights1 == new_weights1
|
||||
assert weights2 == new_weights2
|
||||
|
||||
# This test creates an additional network on the driver so that the
|
||||
# tensorflow variables on the driver and the worker differ.
|
||||
def testNetworkDriverWorkerIndependent(self):
|
||||
ray.init(num_workers=1)
|
||||
|
||||
# Create a network on the driver locally.
|
||||
sess1 = tf.Session()
|
||||
loss1, init1, _, _ = make_linear_network()
|
||||
ray.experimental.TensorFlowVariables(loss1, sess1)
|
||||
sess1.run(init1)
|
||||
|
||||
net2 = ray.remote(NetActor).remote()
|
||||
weights2 = ray.get(net2.get_weights.remote())
|
||||
|
||||
new_weights2 = ray.get(
|
||||
net2.set_and_get_weights.remote(net2.get_weights.remote()))
|
||||
assert weights2 == new_weights2
|
||||
|
||||
def testVariablesControlDependencies(self):
|
||||
ray.init(num_workers=1)
|
||||
|
||||
# Creates a network and appends a momentum optimizer.
|
||||
sess = tf.Session()
|
||||
loss, init, _, _ = make_linear_network()
|
||||
minimizer = tf.train.MomentumOptimizer(0.9, 0.9).minimize(loss)
|
||||
net_vars = ray.experimental.TensorFlowVariables(minimizer, sess)
|
||||
sess.run(init)
|
||||
|
||||
# Tests if all variables are properly retrieved, 2 variables and 2
|
||||
# momentum variables.
|
||||
assert len(net_vars.variables.items()) == 4
|
||||
|
||||
def testRemoteTrainingStep(self):
|
||||
ray.init(num_workers=1)
|
||||
|
||||
net = ray.remote(TrainActor).remote()
|
||||
ray.get(net.training_step.remote(net.get_weights.remote()))
|
||||
|
||||
def testRemoteTrainingLoss(self):
|
||||
ray.init(num_workers=2)
|
||||
|
||||
net = ray.remote(TrainActor).remote()
|
||||
net_values = TrainActor().values
|
||||
loss, variables, _, sess, grads, train, placeholders = net_values
|
||||
|
||||
before_acc = sess.run(
|
||||
loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
|
||||
|
||||
for _ in range(3):
|
||||
gradients_list = ray.get([
|
||||
net.training_step.remote(variables.get_weights())
|
||||
for _ in range(2)
|
||||
])
|
||||
mean_grads = [
|
||||
sum(gradients[i]
|
||||
for gradients in gradients_list) / len(gradients_list)
|
||||
for i in range(len(gradients_list[0]))
|
||||
]
|
||||
feed_dict = {
|
||||
grad[0]: mean_grad
|
||||
for (grad, mean_grad) in zip(grads, mean_grads)
|
||||
}
|
||||
sess.run(train, feed_dict=feed_dict)
|
||||
after_acc = sess.run(
|
||||
loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
|
||||
assert before_acc < after_acc
|
||||
@pytest.fixture
|
||||
def ray_start_regular():
|
||||
# Start the Ray processes.
|
||||
ray.init(num_cpus=2)
|
||||
yield None
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
def test_tensorflow_variables(ray_start_regular):
|
||||
sess = tf.Session()
|
||||
loss, init, _, _ = make_linear_network()
|
||||
sess.run(init)
|
||||
|
||||
variables = ray.experimental.TensorFlowVariables(loss, sess)
|
||||
weights = variables.get_weights()
|
||||
|
||||
for (name, val) in weights.items():
|
||||
weights[name] += 1.0
|
||||
|
||||
variables.set_weights(weights)
|
||||
assert weights == variables.get_weights()
|
||||
|
||||
loss2, init2, _, _ = make_linear_network("w", "b")
|
||||
sess.run(init2)
|
||||
|
||||
variables2 = ray.experimental.TensorFlowVariables(loss2, sess)
|
||||
weights2 = variables2.get_weights()
|
||||
|
||||
for (name, val) in weights2.items():
|
||||
weights2[name] += 2.0
|
||||
|
||||
variables2.set_weights(weights2)
|
||||
assert weights2 == variables2.get_weights()
|
||||
flat_weights = variables2.get_flat() + 2.0
|
||||
variables2.set_flat(flat_weights)
|
||||
assert_almost_equal(flat_weights, variables2.get_flat())
|
||||
|
||||
variables3 = ray.experimental.TensorFlowVariables([loss2])
|
||||
assert variables3.sess is None
|
||||
sess = tf.Session()
|
||||
variables3.set_session(sess)
|
||||
assert variables3.sess == sess
|
||||
|
||||
|
||||
# Test that the variable names for the two different nets are not
|
||||
# modified by TensorFlow to be unique (i.e., they should already
|
||||
# be unique because of the variable prefix).
|
||||
def test_variable_name_collision(ray_start_regular):
|
||||
net1 = NetActor()
|
||||
net2 = NetActor()
|
||||
|
||||
# This is checking that the variable names of the two nets are the
|
||||
# same, i.e., that the names in the weight dictionaries are the same.
|
||||
net1.values[0].set_weights(net2.values[0].get_weights())
|
||||
|
||||
|
||||
# Test that TensorFlowVariables can take in addition variables through
|
||||
# input_variables arg and with no loss.
|
||||
def test_additional_variables_no_loss(ray_start_regular):
|
||||
net = LossActor(use_loss=False)
|
||||
assert len(net.values[0].variables.items()) == 1
|
||||
assert len(net.values[0].placeholders.items()) == 1
|
||||
|
||||
net.values[0].set_weights(net.values[0].get_weights())
|
||||
|
||||
|
||||
# Test that TensorFlowVariables can take in addition variables through
|
||||
# input_variables arg and with a loss.
|
||||
def test_additional_variables_with_loss(ray_start_regular):
|
||||
net = LossActor()
|
||||
assert len(net.values[0].variables.items()) == 3
|
||||
assert len(net.values[0].placeholders.items()) == 3
|
||||
|
||||
net.values[0].set_weights(net.values[0].get_weights())
|
||||
|
||||
|
||||
# Test that different networks on the same worker are independent and
|
||||
# we can get/set their weights without any interaction.
|
||||
def test_networks_independent(ray_start_regular):
|
||||
# Note we use only one worker to ensure that all of the remote
|
||||
# functions run on the same worker.
|
||||
net1 = NetActor()
|
||||
net2 = NetActor()
|
||||
|
||||
# Make sure the two networks have different weights. TODO(rkn): Note
|
||||
# that equality comparisons of numpy arrays normally does not work.
|
||||
# This only works because at the moment they have size 1.
|
||||
weights1 = net1.get_weights()
|
||||
weights2 = net2.get_weights()
|
||||
assert weights1 != weights2
|
||||
|
||||
# Set the weights and get the weights, and make sure they are
|
||||
# unchanged.
|
||||
new_weights1 = net1.set_and_get_weights(weights1)
|
||||
new_weights2 = net2.set_and_get_weights(weights2)
|
||||
assert weights1 == new_weights1
|
||||
assert weights2 == new_weights2
|
||||
|
||||
# Swap the weights.
|
||||
new_weights1 = net2.set_and_get_weights(weights1)
|
||||
new_weights2 = net1.set_and_get_weights(weights2)
|
||||
assert weights1 == new_weights1
|
||||
assert weights2 == new_weights2
|
||||
|
||||
|
||||
# This test creates an additional network on the driver so that the
|
||||
# tensorflow variables on the driver and the worker differ.
|
||||
def test_network_driver_worker_independent(ray_start_regular):
|
||||
# Create a network on the driver locally.
|
||||
sess1 = tf.Session()
|
||||
loss1, init1, _, _ = make_linear_network()
|
||||
ray.experimental.TensorFlowVariables(loss1, sess1)
|
||||
sess1.run(init1)
|
||||
|
||||
net2 = ray.remote(NetActor).remote()
|
||||
weights2 = ray.get(net2.get_weights.remote())
|
||||
|
||||
new_weights2 = ray.get(
|
||||
net2.set_and_get_weights.remote(net2.get_weights.remote()))
|
||||
assert weights2 == new_weights2
|
||||
|
||||
|
||||
def test_variables_control_dependencies(ray_start_regular):
|
||||
# Creates a network and appends a momentum optimizer.
|
||||
sess = tf.Session()
|
||||
loss, init, _, _ = make_linear_network()
|
||||
minimizer = tf.train.MomentumOptimizer(0.9, 0.9).minimize(loss)
|
||||
net_vars = ray.experimental.TensorFlowVariables(minimizer, sess)
|
||||
sess.run(init)
|
||||
|
||||
# Tests if all variables are properly retrieved, 2 variables and 2
|
||||
# momentum variables.
|
||||
assert len(net_vars.variables.items()) == 4
|
||||
|
||||
|
||||
def test_remote_training_step(ray_start_regular):
|
||||
net = ray.remote(TrainActor).remote()
|
||||
ray.get(net.training_step.remote(net.get_weights.remote()))
|
||||
|
||||
|
||||
def test_remote_training_loss(ray_start_regular):
|
||||
net = ray.remote(TrainActor).remote()
|
||||
net_values = TrainActor().values
|
||||
loss, variables, _, sess, grads, train, placeholders = net_values
|
||||
|
||||
before_acc = sess.run(
|
||||
loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
|
||||
|
||||
for _ in range(3):
|
||||
gradients_list = ray.get([
|
||||
net.training_step.remote(variables.get_weights()) for _ in range(2)
|
||||
])
|
||||
mean_grads = [
|
||||
sum(gradients[i]
|
||||
for gradients in gradients_list) / len(gradients_list)
|
||||
for i in range(len(gradients_list[0]))
|
||||
]
|
||||
feed_dict = {
|
||||
grad[0]: mean_grad
|
||||
for (grad, mean_grad) in zip(grads, mean_grads)
|
||||
}
|
||||
sess.run(train, feed_dict=feed_dict)
|
||||
after_acc = sess.run(
|
||||
loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
|
||||
assert before_acc < after_acc
|
||||
|
||||
Reference in New Issue
Block a user