Make more tests run on Windows (#8446)

* Remove worker Wait() call due to SIGCHLD being ignored

* Port _pid_alive to Windows

* Show PID as well as TID in glog

* Update TensorFlow version for Python 3.8 on Windows

* Handle missing Pillow on Windows

* Work around dm-tree PermissionError on Windows

* Fix some lint errors on Windows with Python 3.8

* Simplify torch requirements

* Quiet git clean

* Handle finalizer issues

* Exit with the signal number

* Get rid of wget

* Fix some Windows compatibility issues with tests

Co-authored-by: Mehrdad <noreply@github.com>
This commit is contained in:
mehrdadn
2020-05-20 12:25:04 -07:00
committed by GitHub
parent aa7a58e92f
commit ebf060d484
17 changed files with 103 additions and 44 deletions
+6 -1
View File
@@ -53,7 +53,12 @@ class Monitor:
def __del__(self):
"""Destruct the monitor object."""
# We close the pubsub client to avoid leaking file descriptors.
self.primary_subscribe_client.close()
try:
primary_subscribe_client = self.primary_subscribe_client
except AttributeError:
primary_subscribe_client = None
if primary_subscribe_client is not None:
primary_subscribe_client.close()
def subscribe(self, channel):
"""Subscribe to the given channel on the primary Redis shard.
+21 -4
View File
@@ -1,4 +1,5 @@
import asyncio
import errno
import json
import fnmatch
import os
@@ -12,6 +13,9 @@ import ray
import psutil # We must import psutil after ray because we bundle it with ray.
if sys.platform == "win32":
import _winapi
class RayTestTimeoutException(Exception):
"""Exception used to identify timeouts from test utilities."""
@@ -27,11 +31,24 @@ def _pid_alive(pid):
Returns:
This returns false if the process is dead. Otherwise, it returns true.
"""
no_such_process = errno.EINVAL if sys.platform == "win32" else errno.ESRCH
alive = True
try:
os.kill(pid, 0)
return True
except OSError:
return False
if sys.platform == "win32":
SYNCHRONIZE = 0x00100000 # access mask defined in <winnt.h>
handle = _winapi.OpenProcess(SYNCHRONIZE, False, pid)
try:
alive = (_winapi.WaitForSingleObject(handle, 0) !=
_winapi.WAIT_OBJECT_0)
finally:
_winapi.CloseHandle(handle)
else:
os.kill(pid, 0)
except OSError as ex:
if ex.errno != no_such_process:
raise
alive = False
return alive
def wait_for_pid_to_exit(pid, timeout=20):
+5 -5
View File
@@ -358,7 +358,7 @@ def test_decorator_args(ray_start_regular):
with pytest.raises(Exception):
@ray.remote(invalid_kwarg=0) # noqa: F811
class Actor:
class Actor: # noqa: F811
def __init__(self):
pass
@@ -366,25 +366,25 @@ def test_decorator_args(ray_start_regular):
with pytest.raises(Exception):
@ray.remote(num_cpus=0, invalid_kwarg=0) # noqa: F811
class Actor:
class Actor: # noqa: F811
def __init__(self):
pass
# This is a valid way of using the decorator.
@ray.remote(num_cpus=1) # noqa: F811
class Actor:
class Actor: # noqa: F811
def __init__(self):
pass
# This is a valid way of using the decorator.
@ray.remote(num_gpus=1) # noqa: F811
class Actor:
class Actor: # noqa: F811
def __init__(self):
pass
# This is a valid way of using the decorator.
@ray.remote(num_cpus=1, num_gpus=1) # noqa: F811
class Actor:
class Actor: # noqa: F811
def __init__(self):
pass
+8 -6
View File
@@ -15,6 +15,8 @@ from ray.test_utils import (relevant_errors, wait_for_condition,
wait_for_errors, wait_for_pid_to_exit,
generate_internal_config_map)
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
@pytest.fixture
def ray_checkpointable_actor_cls(request):
@@ -153,7 +155,7 @@ def test_actor_restart():
pid = ray.get(actor.get_pid.remote())
results = [actor.increase.remote() for _ in range(100)]
# Kill actor process, while the above task is still being executed.
os.kill(pid, signal.SIGKILL)
os.kill(pid, SIGKILL)
# Make sure that all tasks were executed in order before the actor's death.
res = results.pop(0)
i = 1
@@ -194,7 +196,7 @@ def test_actor_restart():
# kill actor process one more time.
results = [actor.increase.remote() for _ in range(100)]
pid = ray.get(actor.get_pid.remote())
os.kill(pid, signal.SIGKILL)
os.kill(pid, SIGKILL)
# The actor has exceeded max restarts, and this task should fail.
with pytest.raises(ray.exceptions.RayActorError):
ray.get(actor.increase.remote())
@@ -235,7 +237,7 @@ def test_actor_restart_with_retry():
pid = ray.get(actor.get_pid.remote())
results = [actor.increase.remote() for _ in range(100)]
# Kill actor process, while the above task is still being executed.
os.kill(pid, signal.SIGKILL)
os.kill(pid, SIGKILL)
# Check that none of the tasks failed and the actor is restarted.
seq = list(range(1, 101))
results = ray.get(results)
@@ -255,7 +257,7 @@ def test_actor_restart_with_retry():
# kill actor process one more time.
results = [actor.increase.remote() for _ in range(100)]
pid = ray.get(actor.get_pid.remote())
os.kill(pid, signal.SIGKILL)
os.kill(pid, SIGKILL)
# The actor has exceeded max restarts, and this task should fail.
with pytest.raises(ray.exceptions.RayActorError):
ray.get(actor.increase.remote())
@@ -351,7 +353,7 @@ def test_actor_restart_without_task(ray_start_regular):
pid = ray.get(actor.get_pid.remote())
p = probe.remote()
os.kill(pid, signal.SIGKILL)
os.kill(pid, SIGKILL)
ray.get(p)
assert wait_for_condition(lambda: not actor_resource_available())
@@ -507,7 +509,7 @@ def test_multiple_actor_restart(ray_start_cluster_head):
def kill_actor(actor):
"""A helper function that kills an actor process."""
pid = ray.get(actor.get_pid.remote())
os.kill(pid, signal.SIGKILL)
os.kill(pid, SIGKILL)
wait_for_pid_to_exit(pid)
+4 -4
View File
@@ -334,19 +334,19 @@ def test_identical_function_names(ray_start_regular):
return 1
@ray.remote # noqa: F811
def g():
def g(): # noqa: F811
return 2
@ray.remote # noqa: F811
def g():
def g(): # noqa: F811
return 3
@ray.remote # noqa: F811
def g():
def g(): # noqa: F811
return 4
@ray.remote # noqa: F811
def g():
def g(): # noqa: F811
return 5
result_values = ray.get([g.remote() for _ in range(num_calls)])
+3 -2
View File
@@ -1,3 +1,4 @@
import os
import shutil
import tempfile
import threading
@@ -165,7 +166,7 @@ SMALL_CLUSTER = {
},
"auth": {
"ssh_user": "ubuntu",
"ssh_private_key": "/dev/null",
"ssh_private_key": os.devnull,
},
"head_node": {
"TestProp": 1,
@@ -310,7 +311,7 @@ class AutoscalingTest(unittest.TestCase):
return path
def testInvalidConfig(self):
invalid_config = "/dev/null"
invalid_config = os.devnull
with pytest.raises(ValueError):
StandardAutoscaler(
invalid_config, LoadMetrics(), update_interval_s=0)
+3 -1
View File
@@ -1,5 +1,5 @@
import os
from signal import SIGKILL
import signal
import sys
import time
@@ -8,6 +8,8 @@ import pytest
import ray
from ray.test_utils import run_string_as_driver_nonblocking, SignalActor
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
# This test checks that when a worker dies in the middle of a get, the plasma
# store and raylet will not die.
@@ -11,6 +11,8 @@ import ray.ray_constants as ray_constants
from ray.cluster_utils import Cluster
from ray.test_utils import RayTestTimeoutException
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
@pytest.fixture(params=[(1, 4), (4, 4)])
def ray_start_workers_separate_multinode(request):
@@ -62,7 +64,7 @@ def test_worker_failed(ray_start_workers_separate_multinode):
time.sleep(0.1)
# Kill the workers as the tasks execute.
for pid in pids:
os.kill(pid, signal.SIGKILL)
os.kill(pid, SIGKILL)
time.sleep(0.1)
# Make sure that we either get the object or we get an appropriate
# exception.
+3 -1
View File
@@ -11,6 +11,8 @@ import ray.ray_constants as ray_constants
from ray.cluster_utils import Cluster
from ray.test_utils import RayTestTimeoutException
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
@pytest.fixture(params=[(1, 4), (4, 4)])
def ray_start_workers_separate_multinode(request):
@@ -62,7 +64,7 @@ def test_worker_failed(ray_start_workers_separate_multinode):
time.sleep(0.1)
# Kill the workers as the tasks execute.
for pid in pids:
os.kill(pid, signal.SIGKILL)
os.kill(pid, SIGKILL)
time.sleep(0.1)
# Make sure that we either get the object or we get an appropriate
# exception.
@@ -3,6 +3,7 @@ import json
import logging
import os
import signal
import sys
import numpy as np
@@ -12,6 +13,8 @@ import ray
import ray.cluster_utils
from ray.test_utils import SignalActor, put_object, wait_for_condition
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
logger = logging.getLogger(__name__)
@@ -269,7 +272,7 @@ def test_recursively_return_borrowed_object_id(one_worker_100MiB, use_ray_put,
_fill_object_store_and_get(final_oid_bytes)
if failure:
os.kill(owner_pid, signal.SIGKILL)
os.kill(owner_pid, SIGKILL)
else:
# Remove all references.
del head_oid
+7 -3
View File
@@ -9,7 +9,6 @@ import json
import logging
import os
import redis
import signal
from six.moves import queue
import sys
import threading
@@ -900,7 +899,7 @@ atexit.register(shutdown, True)
# TODO(edoakes): this should only be set in the driver.
def sigterm_handler(signum, frame):
sys.exit(signal.SIGTERM)
sys.exit(signum)
try:
@@ -1366,7 +1365,12 @@ def disconnect(exiting_interpreter=False):
worker.node = None # Disconnect the worker from the node.
worker.cached_functions_to_run = []
worker.serialization_context_map.clear()
ray.actor.ActorClassMethodMetadata.reset_cache()
try:
ray_actor = ray.actor
except AttributeError:
ray_actor = None # This can occur during program termination
if ray_actor is not None:
ray_actor.ActorClassMethodMetadata.reset_cache()
@contextmanager