Fix frequent failure of Jenkins CI. (#2490)

This commit is contained in:
Yuhong Guo
2018-08-03 01:28:28 +08:00
committed by Robert Nishihara
parent d8ba667175
commit d2ebe4d9a3
5 changed files with 63 additions and 31 deletions
+12 -7
View File
@@ -37,14 +37,19 @@ def _wait_for_nodes_to_join(num_nodes, timeout=20):
ready = True
# Check that for each node, a local scheduler and a plasma manager
# are present.
for ip_address, clients in client_table.items():
client_types = [client["ClientType"] for client in clients]
if "local_scheduler" not in client_types:
ready = False
if "plasma_manager" not in client_types:
ready = False
if ready:
if ray.global_state.use_raylet:
# In raylet mode, this is a list of maps.
# The GCS info will appear as a whole instead of part by part.
return
else:
for ip_address, clients in client_table.items():
client_types = [client["ClientType"] for client in clients]
if "local_scheduler" not in client_types:
ready = False
if "plasma_manager" not in client_types:
ready = False
if ready:
return
if num_ready_nodes > num_nodes:
# Too many nodes have joined. Something must be wrong.
raise Exception("{} nodes have joined the cluster, but we were "
+6 -1
View File
@@ -1411,7 +1411,12 @@ def get_address_info_from_redis_helper(redis_address,
(client_node_ip_address == "127.0.0.1"
and redis_ip_address == ray.services.get_node_ip_address())):
raylets.append(client)
# Make sure that at least one raylet has started locally.
# This handles a race condition where Redis has started but
# the raylet has not connected.
if len(raylets) == 0:
raise Exception(
"Redis has started but no raylets have registered yet.")
object_store_addresses = [
services.ObjectStoreAddress(
name=ray.utils.decode(raylet.ObjectStoreSocketName()),