Windows process issues (#7739)

This commit is contained in:
mehrdadn
2020-03-29 12:48:32 -07:00
committed by GitHub
parent 6ce8b63bb6
commit fc23f79f82
10 changed files with 150 additions and 34 deletions
+14 -18
View File
@@ -6,9 +6,9 @@ import os
import logging
import signal
import socket
import subprocess
import sys
import tempfile
import threading
import time
import ray
@@ -185,7 +185,7 @@ class Node:
self.kill_all_processes(check_alive=False, allow_graceful=True)
sys.exit(1)
signal.signal(signal.SIGTERM, sigterm_handler)
ray.utils.set_sigterm_handler(sigterm_handler)
def _init_temp(self, redis_client):
# Create an dictionary to store temp file index.
@@ -718,25 +718,21 @@ class Node:
time.sleep(0.1)
if allow_graceful:
# Allow the process one second to exit gracefully.
process.terminate()
timer = threading.Timer(1, lambda process: process.kill(),
[process])
# Allow the process one second to exit gracefully.
timeout_seconds = 1
try:
timer.start()
process.wait(timeout_seconds)
except subprocess.TimeoutExpired:
pass
# If the process did not exit, force kill it.
if process.poll() is None:
process.kill()
# The reason we usually don't call process.wait() here is that
# there's some chance we'd end up waiting a really long time.
if wait:
process.wait()
finally:
timer.cancel()
if process.poll() is not None:
continue
# If the process did not exit within one second, force kill it.
process.kill()
# The reason we usually don't call process.wait() here is that
# there's some chance we'd end up waiting a really long time.
if wait:
process.wait()
del self.all_processes[process_type]
+29 -1
View File
@@ -1,10 +1,12 @@
import collections
import errno
import io
import json
import logging
import multiprocessing
import os
import random
import signal
import socket
import subprocess
import sys
@@ -76,6 +78,32 @@ ProcessInfo = collections.namedtuple("ProcessInfo", [
])
class ConsolePopen(subprocess.Popen):
if sys.platform == "win32":
def terminate(self):
if isinstance(self.stdin, io.IOBase):
self.stdin.close()
if self._use_signals:
self.send_signal(signal.CTRL_BREAK_EVENT)
else:
super(ConsolePopen, self).terminate()
def __init__(self, *args, **kwargs):
# CREATE_NEW_PROCESS_GROUP is used to send Ctrl+C on Windows:
# https://docs.python.org/3/library/subprocess.html#subprocess.Popen.send_signal
new_pgroup = subprocess.CREATE_NEW_PROCESS_GROUP
flags = 0
if ray.utils.detect_fate_sharing_support():
# If we don't have kernel-mode fate-sharing, then don't do this
# because our children need to be in out process group for
# the process reaper to properly terminate them.
flags = new_pgroup
kwargs.setdefault("creationflags", flags)
self._use_signals = (kwargs["creationflags"] & new_pgroup)
super(ConsolePopen, self).__init__(*args, **kwargs)
def address(ip_address, port):
return ip_address + ":" + str(port)
@@ -464,7 +492,7 @@ def start_ray_process(command,
if fate_share and sys.platform.startswith("linux"):
ray.utils.set_kill_on_parent_death_linux()
process = subprocess.Popen(
process = ConsolePopen(
command,
env=modified_env,
cwd=cwd,
+14
View File
@@ -5,6 +5,7 @@ import inspect
import logging
import numpy as np
import os
import signal
import subprocess
import sys
import tempfile
@@ -561,10 +562,12 @@ def detect_fate_sharing_support_win32():
# https://docs.microsoft.com/en-us/windows/win32/api/jobapi2/nf-jobapi2-setinformationjobobject
JobObjectExtendedLimitInformation = 9
JOB_OBJECT_LIMIT_BREAKAWAY_OK = 0x00000800
JOB_OBJECT_LIMIT_DIE_ON_UNHANDLED_EXCEPTION = 0x00000400
JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x00002000
buf = JOBOBJECT_EXTENDED_LIMIT_INFORMATION()
buf.BasicLimitInformation.LimitFlags = (
JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE
| JOB_OBJECT_LIMIT_DIE_ON_UNHANDLED_EXCEPTION
| JOB_OBJECT_LIMIT_BREAKAWAY_OK)
infoclass = JobObjectExtendedLimitInformation
if not kernel32.SetInformationJobObject(
@@ -636,6 +639,17 @@ def set_kill_child_on_death_win32(child_proc):
assert False, "AssignProcessToJobObject used despite being unavailable"
def set_sigterm_handler(sigterm_handler):
"""Registers a handler for SIGTERM in a platform-compatible manner."""
if sys.platform == "win32":
# Note that these signal handlers only work for console applications.
# TODO(mehrdadn): implement graceful process termination mechanism
# SIGINT is Ctrl+C, SIGBREAK is Ctrl+Break.
signal.signal(signal.SIGBREAK, sigterm_handler)
else:
signal.signal(signal.SIGTERM, sigterm_handler)
def try_make_directory_shared(directory_path):
try:
os.chmod(directory_path, 0o0777)
+16 -8
View File
@@ -422,7 +422,7 @@ class Worker:
shutdown(True)
sys.exit(1)
signal.signal(signal.SIGTERM, sigterm_handler)
ray.utils.set_sigterm_handler(sigterm_handler)
self.core_worker.run_task_loop()
sys.exit(0)
@@ -882,7 +882,7 @@ def sigterm_handler(signum, frame):
try:
signal.signal(signal.SIGTERM, sigterm_handler)
ray.utils.set_sigterm_handler(sigterm_handler)
except ValueError:
logger.warning("Failed to set SIGTERM handler, processes might"
"not be cleaned up properly on exit.")
@@ -1222,14 +1222,18 @@ def connect(node,
# Redirect stdout/stderr at the file descriptor level. If we simply
# set sys.stdout and sys.stderr, then logging from C++ can fail to
# be redirected.
os.dup2(log_stdout_file.fileno(), sys.stdout.fileno())
os.dup2(log_stderr_file.fileno(), sys.stderr.fileno())
if log_stdout_file is not None:
os.dup2(log_stdout_file.fileno(), sys.stdout.fileno())
if log_stderr_file is not None:
os.dup2(log_stderr_file.fileno(), sys.stderr.fileno())
# We also manually set sys.stdout and sys.stderr because that seems
# to have an affect on the output buffering. Without doing this,
# stdout and stderr are heavily buffered resulting in seemingly
# lost logging statements.
sys.stdout = log_stdout_file
sys.stderr = log_stderr_file
if log_stdout_file is not None:
sys.stdout = log_stdout_file
if log_stderr_file is not None:
sys.stderr = log_stderr_file
# This should always be the first message to appear in the worker's
# stdout and stderr log files. The string "Ray worker pid:" is
# parsed in the log monitor process.
@@ -1238,8 +1242,12 @@ def connect(node,
sys.stdout.flush()
sys.stderr.flush()
worker_dict["stdout_file"] = os.path.abspath(log_stdout_file.name)
worker_dict["stderr_file"] = os.path.abspath(log_stderr_file.name)
worker_dict["stdout_file"] = os.path.abspath(
(log_stdout_file
if log_stdout_file is not None else sys.stdout).name)
worker_dict["stderr_file"] = os.path.abspath(
(log_stderr_file
if log_stderr_file is not None else sys.stderr).name)
worker.redis_client.hmset(b"Workers:" + worker.worker_id, worker_dict)
else:
raise ValueError("Invalid worker mode. Expected DRIVER or WORKER.")