mirror of
https://github.com/wassname/ray.git
synced 2026-07-01 20:57:51 +08:00
Use process reaper for cleanup (#6253)
This commit is contained in:
@@ -0,0 +1,49 @@
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
"""
|
||||
This is a lightweight "reaper" process used to ensure that ray processes are
|
||||
cleaned up properly when the main ray process dies unexpectedly (e.g.,
|
||||
segfaults or gets SIGKILLed). Note that processes may not be cleaned up
|
||||
properly if this process is SIGTERMed or SIGKILLed.
|
||||
|
||||
It detects that its parent has died by reading from stdin, which must be
|
||||
inherited from the parent process so that the OS will deliver an EOF if the
|
||||
parent dies. When this happens, the reaper process kills the rest of its
|
||||
process group (first attempting graceful shutdown with SIGTERM, then escalating
|
||||
to SIGKILL).
|
||||
"""
|
||||
|
||||
# Seconds the SIGTERM handler sleeps, giving the other processes in the group
# time to clean up, before it escalates to SIGKILL.
SIGTERM_GRACE_PERIOD_SECONDS = 1
|
||||
|
||||
|
||||
def reap_process_group(*args):
    """Terminate the current process group: SIGTERM first, then SIGKILL.

    A SIGTERM handler is installed in this process and then SIGTERM is sent
    to the whole process group (this process included). When the handler
    runs it sleeps for SIGTERM_GRACE_PERIOD_SECONDS and then sends SIGKILL
    to the group as a last resort.

    Any positional arguments are accepted and ignored.
    """
    # A pgid of 0 targets the process group of the calling process.
    own_group = 0

    def escalate_to_sigkill(*_signal_args):
        # Leave the other processes a short grace period to clean up.
        time.sleep(SIGTERM_GRACE_PERIOD_SECONDS)
        # Last resort: SIGKILL the whole group, ourselves included.
        os.killpg(own_group, signal.SIGKILL)

    # The SIGTERM sent below is delivered to this process as well, so the
    # escalation handler has to be in place before the group is signalled.
    signal.signal(signal.SIGTERM, escalate_to_sigkill)

    # Our parent must have died; SIGTERM the group (including ourselves).
    # TODO(mehrdadn): killpg isn't supported on Windows.
    os.killpg(own_group, signal.SIGTERM)
|
||||
|
||||
|
||||
def main():
    """Block until stdin reaches EOF, then reap the process group.

    An empty result from ``sys.stdin.read()`` is treated as the signal that
    the parent process has died; ``reap_process_group()`` is then called.
    """
    # Read from stdin forever. Because stdin is a file descriptor inherited
    # from our parent process, we will get an EOF if the parent dies, which
    # is signaled by an empty return from read().
    # We intentionally don't set any signal handlers here, so a SIGTERM from
    # the parent can be used to kill this process gracefully without it
    # killing the rest of the process group.
    while sys.stdin.read():
        pass
    reap_process_group()
|
||||
|
||||
|
||||
# Script entry point: only start the reaper when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user