mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 03:50:57 +08:00
Windows process issues (#7739)
This commit is contained in:
+14
-18
@@ -6,9 +6,9 @@ import os
|
||||
import logging
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
|
||||
import ray
|
||||
@@ -185,7 +185,7 @@ class Node:
|
||||
self.kill_all_processes(check_alive=False, allow_graceful=True)
|
||||
sys.exit(1)
|
||||
|
||||
signal.signal(signal.SIGTERM, sigterm_handler)
|
||||
ray.utils.set_sigterm_handler(sigterm_handler)
|
||||
|
||||
def _init_temp(self, redis_client):
|
||||
# Create an dictionary to store temp file index.
|
||||
@@ -718,25 +718,21 @@ class Node:
|
||||
time.sleep(0.1)
|
||||
|
||||
if allow_graceful:
|
||||
# Allow the process one second to exit gracefully.
|
||||
process.terminate()
|
||||
timer = threading.Timer(1, lambda process: process.kill(),
|
||||
[process])
|
||||
# Allow the process one second to exit gracefully.
|
||||
timeout_seconds = 1
|
||||
try:
|
||||
timer.start()
|
||||
process.wait(timeout_seconds)
|
||||
except subprocess.TimeoutExpired:
|
||||
pass
|
||||
|
||||
# If the process did not exit, force kill it.
|
||||
if process.poll() is None:
|
||||
process.kill()
|
||||
# The reason we usually don't call process.wait() here is that
|
||||
# there's some chance we'd end up waiting a really long time.
|
||||
if wait:
|
||||
process.wait()
|
||||
finally:
|
||||
timer.cancel()
|
||||
|
||||
if process.poll() is not None:
|
||||
continue
|
||||
|
||||
# If the process did not exit within one second, force kill it.
|
||||
process.kill()
|
||||
# The reason we usually don't call process.wait() here is that
|
||||
# there's some chance we'd end up waiting a really long time.
|
||||
if wait:
|
||||
process.wait()
|
||||
|
||||
del self.all_processes[process_type]
|
||||
|
||||
|
||||
+29
-1
@@ -1,10 +1,12 @@
|
||||
import collections
|
||||
import errno
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
import random
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
@@ -76,6 +78,32 @@ ProcessInfo = collections.namedtuple("ProcessInfo", [
|
||||
])
|
||||
|
||||
|
||||
class ConsolePopen(subprocess.Popen):
|
||||
if sys.platform == "win32":
|
||||
|
||||
def terminate(self):
|
||||
if isinstance(self.stdin, io.IOBase):
|
||||
self.stdin.close()
|
||||
if self._use_signals:
|
||||
self.send_signal(signal.CTRL_BREAK_EVENT)
|
||||
else:
|
||||
super(ConsolePopen, self).terminate()
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
# CREATE_NEW_PROCESS_GROUP is used to send Ctrl+C on Windows:
|
||||
# https://docs.python.org/3/library/subprocess.html#subprocess.Popen.send_signal
|
||||
new_pgroup = subprocess.CREATE_NEW_PROCESS_GROUP
|
||||
flags = 0
|
||||
if ray.utils.detect_fate_sharing_support():
|
||||
# If we don't have kernel-mode fate-sharing, then don't do this
|
||||
# because our children need to be in out process group for
|
||||
# the process reaper to properly terminate them.
|
||||
flags = new_pgroup
|
||||
kwargs.setdefault("creationflags", flags)
|
||||
self._use_signals = (kwargs["creationflags"] & new_pgroup)
|
||||
super(ConsolePopen, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
def address(ip_address, port):
|
||||
return ip_address + ":" + str(port)
|
||||
|
||||
@@ -464,7 +492,7 @@ def start_ray_process(command,
|
||||
if fate_share and sys.platform.startswith("linux"):
|
||||
ray.utils.set_kill_on_parent_death_linux()
|
||||
|
||||
process = subprocess.Popen(
|
||||
process = ConsolePopen(
|
||||
command,
|
||||
env=modified_env,
|
||||
cwd=cwd,
|
||||
|
||||
@@ -5,6 +5,7 @@ import inspect
|
||||
import logging
|
||||
import numpy as np
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
@@ -561,10 +562,12 @@ def detect_fate_sharing_support_win32():
|
||||
# https://docs.microsoft.com/en-us/windows/win32/api/jobapi2/nf-jobapi2-setinformationjobobject
|
||||
JobObjectExtendedLimitInformation = 9
|
||||
JOB_OBJECT_LIMIT_BREAKAWAY_OK = 0x00000800
|
||||
JOB_OBJECT_LIMIT_DIE_ON_UNHANDLED_EXCEPTION = 0x00000400
|
||||
JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x00002000
|
||||
buf = JOBOBJECT_EXTENDED_LIMIT_INFORMATION()
|
||||
buf.BasicLimitInformation.LimitFlags = (
|
||||
JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE
|
||||
| JOB_OBJECT_LIMIT_DIE_ON_UNHANDLED_EXCEPTION
|
||||
| JOB_OBJECT_LIMIT_BREAKAWAY_OK)
|
||||
infoclass = JobObjectExtendedLimitInformation
|
||||
if not kernel32.SetInformationJobObject(
|
||||
@@ -636,6 +639,17 @@ def set_kill_child_on_death_win32(child_proc):
|
||||
assert False, "AssignProcessToJobObject used despite being unavailable"
|
||||
|
||||
|
||||
def set_sigterm_handler(sigterm_handler):
|
||||
"""Registers a handler for SIGTERM in a platform-compatible manner."""
|
||||
if sys.platform == "win32":
|
||||
# Note that these signal handlers only work for console applications.
|
||||
# TODO(mehrdadn): implement graceful process termination mechanism
|
||||
# SIGINT is Ctrl+C, SIGBREAK is Ctrl+Break.
|
||||
signal.signal(signal.SIGBREAK, sigterm_handler)
|
||||
else:
|
||||
signal.signal(signal.SIGTERM, sigterm_handler)
|
||||
|
||||
|
||||
def try_make_directory_shared(directory_path):
|
||||
try:
|
||||
os.chmod(directory_path, 0o0777)
|
||||
|
||||
+16
-8
@@ -422,7 +422,7 @@ class Worker:
|
||||
shutdown(True)
|
||||
sys.exit(1)
|
||||
|
||||
signal.signal(signal.SIGTERM, sigterm_handler)
|
||||
ray.utils.set_sigterm_handler(sigterm_handler)
|
||||
self.core_worker.run_task_loop()
|
||||
sys.exit(0)
|
||||
|
||||
@@ -882,7 +882,7 @@ def sigterm_handler(signum, frame):
|
||||
|
||||
|
||||
try:
|
||||
signal.signal(signal.SIGTERM, sigterm_handler)
|
||||
ray.utils.set_sigterm_handler(sigterm_handler)
|
||||
except ValueError:
|
||||
logger.warning("Failed to set SIGTERM handler, processes might"
|
||||
"not be cleaned up properly on exit.")
|
||||
@@ -1222,14 +1222,18 @@ def connect(node,
|
||||
# Redirect stdout/stderr at the file descriptor level. If we simply
|
||||
# set sys.stdout and sys.stderr, then logging from C++ can fail to
|
||||
# be redirected.
|
||||
os.dup2(log_stdout_file.fileno(), sys.stdout.fileno())
|
||||
os.dup2(log_stderr_file.fileno(), sys.stderr.fileno())
|
||||
if log_stdout_file is not None:
|
||||
os.dup2(log_stdout_file.fileno(), sys.stdout.fileno())
|
||||
if log_stderr_file is not None:
|
||||
os.dup2(log_stderr_file.fileno(), sys.stderr.fileno())
|
||||
# We also manually set sys.stdout and sys.stderr because that seems
|
||||
# to have an affect on the output buffering. Without doing this,
|
||||
# stdout and stderr are heavily buffered resulting in seemingly
|
||||
# lost logging statements.
|
||||
sys.stdout = log_stdout_file
|
||||
sys.stderr = log_stderr_file
|
||||
if log_stdout_file is not None:
|
||||
sys.stdout = log_stdout_file
|
||||
if log_stderr_file is not None:
|
||||
sys.stderr = log_stderr_file
|
||||
# This should always be the first message to appear in the worker's
|
||||
# stdout and stderr log files. The string "Ray worker pid:" is
|
||||
# parsed in the log monitor process.
|
||||
@@ -1238,8 +1242,12 @@ def connect(node,
|
||||
sys.stdout.flush()
|
||||
sys.stderr.flush()
|
||||
|
||||
worker_dict["stdout_file"] = os.path.abspath(log_stdout_file.name)
|
||||
worker_dict["stderr_file"] = os.path.abspath(log_stderr_file.name)
|
||||
worker_dict["stdout_file"] = os.path.abspath(
|
||||
(log_stdout_file
|
||||
if log_stdout_file is not None else sys.stdout).name)
|
||||
worker_dict["stderr_file"] = os.path.abspath(
|
||||
(log_stderr_file
|
||||
if log_stderr_file is not None else sys.stderr).name)
|
||||
worker.redis_client.hmset(b"Workers:" + worker.worker_id, worker_dict)
|
||||
else:
|
||||
raise ValueError("Invalid worker mode. Expected DRIVER or WORKER.")
|
||||
|
||||
Reference in New Issue
Block a user