mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 05:43:03 +08:00
Make more tests run on Windows (#8446)
* Remove worker Wait() call due to SIGCHLD being ignored * Port _pid_alive to Windows * Show PID as well as TID in glog * Update TensorFlow version for Python 3.8 on Windows * Handle missing Pillow on Windows * Work around dm-tree PermissionError on Windows * Fix some lint errors on Windows with Python 3.8 * Simplify torch requirements * Quiet git clean * Handle finalizer issues * Exit with the signal number * Get rid of wget * Fix some Windows compatibility issues with tests Co-authored-by: Mehrdad <noreply@github.com>
This commit is contained in:
@@ -53,7 +53,12 @@ class Monitor:
|
||||
def __del__(self):
|
||||
"""Destruct the monitor object."""
|
||||
# We close the pubsub client to avoid leaking file descriptors.
|
||||
self.primary_subscribe_client.close()
|
||||
try:
|
||||
primary_subscribe_client = self.primary_subscribe_client
|
||||
except AttributeError:
|
||||
primary_subscribe_client = None
|
||||
if primary_subscribe_client is not None:
|
||||
primary_subscribe_client.close()
|
||||
|
||||
def subscribe(self, channel):
|
||||
"""Subscribe to the given channel on the primary Redis shard.
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
import errno
|
||||
import json
|
||||
import fnmatch
|
||||
import os
|
||||
@@ -12,6 +13,9 @@ import ray
|
||||
|
||||
import psutil # We must import psutil after ray because we bundle it with ray.
|
||||
|
||||
if sys.platform == "win32":
|
||||
import _winapi
|
||||
|
||||
|
||||
class RayTestTimeoutException(Exception):
|
||||
"""Exception used to identify timeouts from test utilities."""
|
||||
@@ -27,11 +31,24 @@ def _pid_alive(pid):
|
||||
Returns:
|
||||
This returns false if the process is dead. Otherwise, it returns true.
|
||||
"""
|
||||
no_such_process = errno.EINVAL if sys.platform == "win32" else errno.ESRCH
|
||||
alive = True
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
if sys.platform == "win32":
|
||||
SYNCHRONIZE = 0x00100000 # access mask defined in <winnt.h>
|
||||
handle = _winapi.OpenProcess(SYNCHRONIZE, False, pid)
|
||||
try:
|
||||
alive = (_winapi.WaitForSingleObject(handle, 0) !=
|
||||
_winapi.WAIT_OBJECT_0)
|
||||
finally:
|
||||
_winapi.CloseHandle(handle)
|
||||
else:
|
||||
os.kill(pid, 0)
|
||||
except OSError as ex:
|
||||
if ex.errno != no_such_process:
|
||||
raise
|
||||
alive = False
|
||||
return alive
|
||||
|
||||
|
||||
def wait_for_pid_to_exit(pid, timeout=20):
|
||||
|
||||
@@ -358,7 +358,7 @@ def test_decorator_args(ray_start_regular):
|
||||
with pytest.raises(Exception):
|
||||
|
||||
@ray.remote(invalid_kwarg=0) # noqa: F811
|
||||
class Actor:
|
||||
class Actor: # noqa: F811
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@@ -366,25 +366,25 @@ def test_decorator_args(ray_start_regular):
|
||||
with pytest.raises(Exception):
|
||||
|
||||
@ray.remote(num_cpus=0, invalid_kwarg=0) # noqa: F811
|
||||
class Actor:
|
||||
class Actor: # noqa: F811
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
# This is a valid way of using the decorator.
|
||||
@ray.remote(num_cpus=1) # noqa: F811
|
||||
class Actor:
|
||||
class Actor: # noqa: F811
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
# This is a valid way of using the decorator.
|
||||
@ray.remote(num_gpus=1) # noqa: F811
|
||||
class Actor:
|
||||
class Actor: # noqa: F811
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
# This is a valid way of using the decorator.
|
||||
@ray.remote(num_cpus=1, num_gpus=1) # noqa: F811
|
||||
class Actor:
|
||||
class Actor: # noqa: F811
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
|
||||
@@ -15,6 +15,8 @@ from ray.test_utils import (relevant_errors, wait_for_condition,
|
||||
wait_for_errors, wait_for_pid_to_exit,
|
||||
generate_internal_config_map)
|
||||
|
||||
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_checkpointable_actor_cls(request):
|
||||
@@ -153,7 +155,7 @@ def test_actor_restart():
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
results = [actor.increase.remote() for _ in range(100)]
|
||||
# Kill actor process, while the above task is still being executed.
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
os.kill(pid, SIGKILL)
|
||||
# Make sure that all tasks were executed in order before the actor's death.
|
||||
res = results.pop(0)
|
||||
i = 1
|
||||
@@ -194,7 +196,7 @@ def test_actor_restart():
|
||||
# kill actor process one more time.
|
||||
results = [actor.increase.remote() for _ in range(100)]
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
os.kill(pid, SIGKILL)
|
||||
# The actor has exceeded max restarts, and this task should fail.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
@@ -235,7 +237,7 @@ def test_actor_restart_with_retry():
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
results = [actor.increase.remote() for _ in range(100)]
|
||||
# Kill actor process, while the above task is still being executed.
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
os.kill(pid, SIGKILL)
|
||||
# Check that none of the tasks failed and the actor is restarted.
|
||||
seq = list(range(1, 101))
|
||||
results = ray.get(results)
|
||||
@@ -255,7 +257,7 @@ def test_actor_restart_with_retry():
|
||||
# kill actor process one more time.
|
||||
results = [actor.increase.remote() for _ in range(100)]
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
os.kill(pid, SIGKILL)
|
||||
# The actor has exceeded max restarts, and this task should fail.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
@@ -351,7 +353,7 @@ def test_actor_restart_without_task(ray_start_regular):
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
|
||||
p = probe.remote()
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
os.kill(pid, SIGKILL)
|
||||
ray.get(p)
|
||||
assert wait_for_condition(lambda: not actor_resource_available())
|
||||
|
||||
@@ -507,7 +509,7 @@ def test_multiple_actor_restart(ray_start_cluster_head):
|
||||
def kill_actor(actor):
|
||||
"""A helper function that kills an actor process."""
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
os.kill(pid, SIGKILL)
|
||||
wait_for_pid_to_exit(pid)
|
||||
|
||||
|
||||
|
||||
@@ -334,19 +334,19 @@ def test_identical_function_names(ray_start_regular):
|
||||
return 1
|
||||
|
||||
@ray.remote # noqa: F811
|
||||
def g():
|
||||
def g(): # noqa: F811
|
||||
return 2
|
||||
|
||||
@ray.remote # noqa: F811
|
||||
def g():
|
||||
def g(): # noqa: F811
|
||||
return 3
|
||||
|
||||
@ray.remote # noqa: F811
|
||||
def g():
|
||||
def g(): # noqa: F811
|
||||
return 4
|
||||
|
||||
@ray.remote # noqa: F811
|
||||
def g():
|
||||
def g(): # noqa: F811
|
||||
return 5
|
||||
|
||||
result_values = ray.get([g.remote() for _ in range(num_calls)])
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import threading
|
||||
@@ -165,7 +166,7 @@ SMALL_CLUSTER = {
|
||||
},
|
||||
"auth": {
|
||||
"ssh_user": "ubuntu",
|
||||
"ssh_private_key": "/dev/null",
|
||||
"ssh_private_key": os.devnull,
|
||||
},
|
||||
"head_node": {
|
||||
"TestProp": 1,
|
||||
@@ -310,7 +311,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
return path
|
||||
|
||||
def testInvalidConfig(self):
|
||||
invalid_config = "/dev/null"
|
||||
invalid_config = os.devnull
|
||||
with pytest.raises(ValueError):
|
||||
StandardAutoscaler(
|
||||
invalid_config, LoadMetrics(), update_interval_s=0)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import os
|
||||
from signal import SIGKILL
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
|
||||
@@ -8,6 +8,8 @@ import pytest
|
||||
import ray
|
||||
from ray.test_utils import run_string_as_driver_nonblocking, SignalActor
|
||||
|
||||
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
|
||||
# This test checks that when a worker dies in the middle of a get, the plasma
|
||||
# store and raylet will not die.
|
||||
|
||||
@@ -11,6 +11,8 @@ import ray.ray_constants as ray_constants
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import RayTestTimeoutException
|
||||
|
||||
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
|
||||
@pytest.fixture(params=[(1, 4), (4, 4)])
|
||||
def ray_start_workers_separate_multinode(request):
|
||||
@@ -62,7 +64,7 @@ def test_worker_failed(ray_start_workers_separate_multinode):
|
||||
time.sleep(0.1)
|
||||
# Kill the workers as the tasks execute.
|
||||
for pid in pids:
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
os.kill(pid, SIGKILL)
|
||||
time.sleep(0.1)
|
||||
# Make sure that we either get the object or we get an appropriate
|
||||
# exception.
|
||||
|
||||
@@ -11,6 +11,8 @@ import ray.ray_constants as ray_constants
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import RayTestTimeoutException
|
||||
|
||||
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
|
||||
@pytest.fixture(params=[(1, 4), (4, 4)])
|
||||
def ray_start_workers_separate_multinode(request):
|
||||
@@ -62,7 +64,7 @@ def test_worker_failed(ray_start_workers_separate_multinode):
|
||||
time.sleep(0.1)
|
||||
# Kill the workers as the tasks execute.
|
||||
for pid in pids:
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
os.kill(pid, SIGKILL)
|
||||
time.sleep(0.1)
|
||||
# Make sure that we either get the object or we get an appropriate
|
||||
# exception.
|
||||
|
||||
@@ -3,6 +3,7 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -12,6 +13,8 @@ import ray
|
||||
import ray.cluster_utils
|
||||
from ray.test_utils import SignalActor, put_object, wait_for_condition
|
||||
|
||||
SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -269,7 +272,7 @@ def test_recursively_return_borrowed_object_id(one_worker_100MiB, use_ray_put,
|
||||
_fill_object_store_and_get(final_oid_bytes)
|
||||
|
||||
if failure:
|
||||
os.kill(owner_pid, signal.SIGKILL)
|
||||
os.kill(owner_pid, SIGKILL)
|
||||
else:
|
||||
# Remove all references.
|
||||
del head_oid
|
||||
|
||||
@@ -9,7 +9,6 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import redis
|
||||
import signal
|
||||
from six.moves import queue
|
||||
import sys
|
||||
import threading
|
||||
@@ -900,7 +899,7 @@ atexit.register(shutdown, True)
|
||||
|
||||
# TODO(edoakes): this should only be set in the driver.
|
||||
def sigterm_handler(signum, frame):
|
||||
sys.exit(signal.SIGTERM)
|
||||
sys.exit(signum)
|
||||
|
||||
|
||||
try:
|
||||
@@ -1366,7 +1365,12 @@ def disconnect(exiting_interpreter=False):
|
||||
worker.node = None # Disconnect the worker from the node.
|
||||
worker.cached_functions_to_run = []
|
||||
worker.serialization_context_map.clear()
|
||||
ray.actor.ActorClassMethodMetadata.reset_cache()
|
||||
try:
|
||||
ray_actor = ray.actor
|
||||
except AttributeError:
|
||||
ray_actor = None # This can occur during program termination
|
||||
if ray_actor is not None:
|
||||
ray_actor.ActorClassMethodMetadata.reset_cache()
|
||||
|
||||
|
||||
@contextmanager
|
||||
|
||||
Reference in New Issue
Block a user