mirror of
https://github.com/wassname/ray.git
synced 2026-07-04 00:33:25 +08:00
Availability after worker failure (#316)
* Availability after a killed worker
* Workers exit cleanly
* Memory cleanup in photon C tests
* Worker failure in multinode
* Consolidate worker cleanup handlers
* Update the result table before handling a task submission
* KILL_WORKER_TIMEOUT -> KILL_WORKER_TIMEOUT_MILLISECONDS
* Log a warning instead of crashing if no result table entry found
This commit is contained in:
committed by
Robert Nishihara
parent
232601f90d
commit
be1618f041
@@ -216,14 +216,18 @@ class TestGlobalStateStore(unittest.TestCase):
|
||||
"node_id")
|
||||
|
||||
def testTaskTableAddAndLookup(self):
|
||||
TASK_STATUS_WAITING = 1
|
||||
TASK_STATUS_SCHEDULED = 2
|
||||
TASK_STATUS_QUEUED = 4
|
||||
|
||||
# Check that task table adds, updates, and lookups work correctly.
|
||||
task_args = [1, b"node_id", b"task_spec"]
|
||||
task_args = [TASK_STATUS_WAITING, b"node_id", b"task_spec"]
|
||||
response = self.redis.execute_command("RAY.TASK_TABLE_ADD", "task_id",
|
||||
*task_args)
|
||||
response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
|
||||
self.assertEqual(response, task_args)
|
||||
|
||||
task_args[0] = 2
|
||||
task_args[0] = TASK_STATUS_SCHEDULED
|
||||
self.redis.execute_command("RAY.TASK_TABLE_UPDATE", "task_id", *task_args[:2])
|
||||
response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
|
||||
self.assertEqual(response, task_args)
|
||||
@@ -241,7 +245,7 @@ class TestGlobalStateStore(unittest.TestCase):
|
||||
|
||||
# If the current value is the same as the test value, and the set value is
|
||||
# different, the update happens, and the response is the entire task.
|
||||
task_args[1] += 1
|
||||
task_args[1] = TASK_STATUS_QUEUED
|
||||
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
|
||||
"task_id",
|
||||
*task_args[:3])
|
||||
@@ -252,7 +256,7 @@ class TestGlobalStateStore(unittest.TestCase):
|
||||
|
||||
# If the current value is no longer the same as the test value, the
|
||||
# response is nil.
|
||||
task_args[1] += 1
|
||||
task_args[1] = TASK_STATUS_WAITING
|
||||
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
|
||||
"task_id",
|
||||
*task_args[:3])
|
||||
@@ -262,6 +266,27 @@ class TestGlobalStateStore(unittest.TestCase):
|
||||
self.assertEqual(get_response2, get_response)
|
||||
self.assertNotEqual(get_response2, task_args[1:])
|
||||
|
||||
# If the test value is a bitmask that matches the current value, the update
|
||||
# happens.
|
||||
task_args[0] = TASK_STATUS_SCHEDULED | TASK_STATUS_QUEUED
|
||||
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
|
||||
"task_id",
|
||||
*task_args[:3])
|
||||
self.assertEqual(response, task_args[1:])
|
||||
|
||||
# If the test value is a bitmask that does not match the current value, the
|
||||
# update does not happen.
|
||||
task_args[1] = TASK_STATUS_SCHEDULED
|
||||
old_response = response
|
||||
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
|
||||
"task_id",
|
||||
*task_args[:3])
|
||||
self.assertEqual(response, None)
|
||||
# Check that the update did not happen.
|
||||
get_response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
|
||||
self.assertEqual(get_response, old_response)
|
||||
self.assertNotEqual(get_response, task_args[1:])
|
||||
|
||||
def testTaskTableSubscribe(self):
|
||||
scheduling_state = 1
|
||||
node_id = "node_id"
|
||||
|
||||
+23
-8
@@ -14,6 +14,7 @@ import numpy as np
|
||||
import os
|
||||
import random
|
||||
import redis
|
||||
import signal
|
||||
import string
|
||||
import sys
|
||||
import threading
|
||||
@@ -936,23 +937,31 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
|
||||
num_gpus=num_gpus)
|
||||
|
||||
def cleanup(worker=global_worker):
|
||||
"""Disconnect the driver, and terminate any processes started in init.
|
||||
"""Disconnect the worker, and terminate any processes started in init.
|
||||
|
||||
This will automatically run at the end when a Python process that uses Ray
|
||||
exits. It is ok to run this twice in a row. Note that we manually call
|
||||
services.cleanup() in the tests because we need to start and stop many
|
||||
clusters in the tests, but the import and exit only happen once.
|
||||
"""
|
||||
# If this is a driver, push the finish time to Redis.
|
||||
if worker.mode in [SCRIPT_MODE, SILENT_MODE]:
|
||||
worker.redis_client.hmset(b"Drivers:" + worker.worker_id,
|
||||
{"end_time": time.time()})
|
||||
|
||||
disconnect(worker)
|
||||
worker.set_mode(None)
|
||||
if hasattr(worker, "photon_client"):
|
||||
del worker.photon_client
|
||||
if hasattr(worker, "plasma_client"):
|
||||
worker.plasma_client.shutdown()
|
||||
services.cleanup()
|
||||
|
||||
if worker.mode in [SCRIPT_MODE, SILENT_MODE]:
|
||||
# If this is a driver, push the finish time to Redis and clean up any
|
||||
# other services that were started with the driver.
|
||||
worker.redis_client.hmset(b"Drivers:" + worker.worker_id,
|
||||
{"end_time": time.time()})
|
||||
services.cleanup()
|
||||
else:
|
||||
# If this is not a driver, make sure there are no orphan processes.
|
||||
for process_type, processes in services.all_processes.items():
|
||||
assert(len(processes) == 0)
|
||||
|
||||
worker.set_mode(None)
|
||||
|
||||
atexit.register(cleanup)
|
||||
|
||||
@@ -1559,6 +1568,12 @@ def main_loop(worker=global_worker):
|
||||
that occurred while executing the command, and waits for the next command.
|
||||
"""
|
||||
|
||||
def exit(signum, frame):
|
||||
cleanup(worker=worker)
|
||||
sys.exit(0)
|
||||
|
||||
signal.signal(signal.SIGTERM, exit)
|
||||
|
||||
def process_task(task): # wrapping these lines in a function should cause the local variables to go out of scope more quickly, which is useful for inspecting reference counts
|
||||
"""Execute a task assigned to this worker.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user