From c7ae4e5e1f35a46079d04d96b955c262dee5c684 Mon Sep 17 00:00:00 2001
From: Edward Oakes
Date: Sat, 17 Aug 2019 20:44:08 -0700
Subject: [PATCH] Check for dead processes in blocked ray start (#5458)

---
 python/ray/scripts/scripts.py       | 12 +++++-
 python/ray/tests/test_multi_node.py | 57 +++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py
index 1e8a5c7c2..8dbb1e963 100644
--- a/python/ray/scripts/scripts.py
+++ b/python/ray/scripts/scripts.py
@@ -8,6 +8,7 @@ import json
 import logging
 import os
 import subprocess
+import sys
 
 import ray.services as services
 from ray.autoscaler.commands import (
@@ -384,7 +385,16 @@ def start(node_ip_address, redis_address, address, redis_port,
     if block:
         import time
         while True:
-            time.sleep(30)
+            time.sleep(1)
+            deceased = node.dead_processes()
+            if len(deceased) > 0:
+                logger.error("Ray processes died unexpectedly:")
+                for process_type, process in deceased:
+                    logger.error("\t{} died with exit code {}".format(
+                        process_type, process.returncode))
+                logger.error("Killing remaining processes and exiting...")
+                node.kill_all_processes(check_alive=False, allow_graceful=True)
+                sys.exit(1)
 
 
 @cli.command()
diff --git a/python/ray/tests/test_multi_node.py b/python/ray/tests/test_multi_node.py
index e4043957f..50eac65ab 100644
--- a/python/ray/tests/test_multi_node.py
+++ b/python/ray/tests/test_multi_node.py
@@ -327,6 +327,63 @@ def test_calling_start_ray_head():
         ["ray", "start", "--head", "--redis-address", "127.0.0.1:6379"])
     subprocess.Popen(["ray", "stop"]).wait()
 
+    # Test --block. Killing any child process should cause the command to exit.
+    blocked = subprocess.Popen(["ray", "start", "--head", "--block"])
+    blocked.poll()
+
+    # Wait for up to 10s for the ray command to spawn a child process.
+    for _ in range(10):
+        try:
+            subprocess.check_output(["pgrep", "-P", str(blocked.pid)])
+            break
+        except subprocess.CalledProcessError:
+            time.sleep(1)
+    else:
+        assert False, "ray start didn't spawn children within 10s of starting"
+
+    blocked.poll()
+    assert blocked.returncode is None
+
+    # Kill all child processes of the ray command and check that it exits.
+    subprocess.check_output(["pkill", "-P", str(blocked.pid)])
+    for _ in range(10):
+        time.sleep(1)
+        blocked.poll()
+        if blocked.returncode is not None:
+            break
+    else:
+        assert False, "ray start didn't exit within 10s of child process dying"
+
+    assert blocked.returncode != 0
+
+    # Test --block. Killing the command should clean up all child processes.
+    blocked = subprocess.Popen(["ray", "start", "--head", "--block"])
+    blocked.poll()
+    assert blocked.returncode is None
+
+    # Wait for up to 10s for the ray command to spawn a child process.
+    for _ in range(10):
+        try:
+            subprocess.check_output(["pgrep", "-P", str(blocked.pid)])
+            break
+        except subprocess.CalledProcessError:
+            time.sleep(1)
+    else:
+        assert False, "ray start didn't spawn children within 10s of starting"
+
+    blocked.terminate()
+
+    # Check that the child processes are cleaned up within 10s.
+    for _ in range(10):
+        try:
+            subprocess.check_output(
+                ["pgrep", "-P", str(blocked.pid), "raylet"])
+        except subprocess.CalledProcessError:
+            # pgrep didn't find anything, so the child processes are dead.
+            break
+    else:
+        assert False, "ray start didn't kill children within 10s of exiting."
+
 
 @pytest.mark.parametrize(
     "call_ray_start", [