From dff601727233ab069d458355a16353b7fa180992 Mon Sep 17 00:00:00 2001
From: Edward Oakes
Date: Mon, 2 Dec 2019 15:22:00 -0800
Subject: [PATCH] Fix "failed to create head node" issue (#6304)

* Fix failed to create head node issue

* comments
---
 python/ray/autoscaler/commands.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/ray/autoscaler/commands.py b/python/ray/autoscaler/commands.py
index f0b836748..0961642ad 100644
--- a/python/ray/autoscaler/commands.py
+++ b/python/ray/autoscaler/commands.py
@@ -199,9 +199,16 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
                 config["cluster_name"])
             provider.create_node(config["head_node"], head_node_tags, 1)
 
-        nodes = provider.non_terminated_nodes(head_node_tags)
-        assert len(nodes) == 1, "Failed to create head node."
-        head_node = nodes[0]
+        start = time.time()
+        head_node = None
+        while True:
+            if time.time() - start > 5:
+                raise RuntimeError("Failed to create head node.")
+            nodes = provider.non_terminated_nodes(head_node_tags)
+            if len(nodes) == 1:
+                head_node = nodes[0]
+                break
+            time.sleep(1)
 
         # TODO(ekl) right now we always update the head node even if the hash
         # matches. We could prompt the user for what they want to do here.