diff --git a/python/ray/autoscaler/commands.py b/python/ray/autoscaler/commands.py index f0b836748..0961642ad 100644 --- a/python/ray/autoscaler/commands.py +++ b/python/ray/autoscaler/commands.py @@ -199,9 +199,16 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes, config["cluster_name"]) provider.create_node(config["head_node"], head_node_tags, 1) - nodes = provider.non_terminated_nodes(head_node_tags) - assert len(nodes) == 1, "Failed to create head node." - head_node = nodes[0] + start = time.time() + head_node = None + while True: + if time.time() - start > 5: + raise RuntimeError("Failed to create head node.") + nodes = provider.non_terminated_nodes(head_node_tags) + if len(nodes) == 1: + head_node = nodes[0] + break + time.sleep(1) # TODO(ekl) right now we always update the head node even if the hash # matches. We could prompt the user for what they want to do here.