Availability after worker failure (#316)

* Availability after a killed worker

* Workers exit cleanly

* Memory cleanup in photon C tests

* Worker failure in multinode

* Consolidate worker cleanup handlers

* Update the result table before handling a task submission

* KILL_WORKER_TIMEOUT -> KILL_WORKER_TIMEOUT_MILLISECONDS

* Log a warning instead of crashing if no result table entry found
This commit is contained in:
Stephanie Wang
2017-02-25 20:19:36 -08:00
committed by Robert Nishihara
parent 232601f90d
commit be1618f041
14 changed files with 307 additions and 118 deletions
+29 -4
View File
@@ -216,14 +216,18 @@ class TestGlobalStateStore(unittest.TestCase):
"node_id")
def testTaskTableAddAndLookup(self):
TASK_STATUS_WAITING = 1
TASK_STATUS_SCHEDULED = 2
TASK_STATUS_QUEUED = 4
# Check that task table adds, updates, and lookups work correctly.
task_args = [1, b"node_id", b"task_spec"]
task_args = [TASK_STATUS_WAITING, b"node_id", b"task_spec"]
response = self.redis.execute_command("RAY.TASK_TABLE_ADD", "task_id",
*task_args)
response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
self.assertEqual(response, task_args)
task_args[0] = 2
task_args[0] = TASK_STATUS_SCHEDULED
self.redis.execute_command("RAY.TASK_TABLE_UPDATE", "task_id", *task_args[:2])
response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
self.assertEqual(response, task_args)
@@ -241,7 +245,7 @@ class TestGlobalStateStore(unittest.TestCase):
# If the current value is the same as the test value, and the set value is
# different, the update happens, and the response is the entire task.
task_args[1] += 1
task_args[1] = TASK_STATUS_QUEUED
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
"task_id",
*task_args[:3])
@@ -252,7 +256,7 @@ class TestGlobalStateStore(unittest.TestCase):
# If the current value is no longer the same as the test value, the
# response is nil.
task_args[1] += 1
task_args[1] = TASK_STATUS_WAITING
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
"task_id",
*task_args[:3])
@@ -262,6 +266,27 @@ class TestGlobalStateStore(unittest.TestCase):
self.assertEqual(get_response2, get_response)
self.assertNotEqual(get_response2, task_args[1:])
# If the test value is a bitmask that matches the current value, the update
# happens.
task_args[0] = TASK_STATUS_SCHEDULED | TASK_STATUS_QUEUED
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
"task_id",
*task_args[:3])
self.assertEqual(response, task_args[1:])
# If the test value is a bitmask that does not match the current value, the
# update does not happen.
task_args[1] = TASK_STATUS_SCHEDULED
old_response = response
response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
"task_id",
*task_args[:3])
self.assertEqual(response, None)
# Check that the update did not happen.
get_response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
self.assertEqual(get_response, old_response)
self.assertNotEqual(get_response, task_args[1:])
def testTaskTableSubscribe(self):
scheduling_state = 1
node_id = "node_id"
+23 -8
View File
@@ -14,6 +14,7 @@ import numpy as np
import os
import random
import redis
import signal
import string
import sys
import threading
@@ -936,23 +937,31 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
num_gpus=num_gpus)
def cleanup(worker=global_worker):
"""Disconnect the driver, and terminate any processes started in init.
"""Disconnect the worker, and terminate any processes started in init.
This will automatically run at the end when a Python process that uses Ray
exits. It is ok to run this twice in a row. Note that we manually call
services.cleanup() in the tests because we need to start and stop many
clusters in the tests, but the import and exit only happen once.
"""
# If this is a driver, push the finish time to Redis.
if worker.mode in [SCRIPT_MODE, SILENT_MODE]:
worker.redis_client.hmset(b"Drivers:" + worker.worker_id,
{"end_time": time.time()})
disconnect(worker)
worker.set_mode(None)
if hasattr(worker, "photon_client"):
del worker.photon_client
if hasattr(worker, "plasma_client"):
worker.plasma_client.shutdown()
services.cleanup()
if worker.mode in [SCRIPT_MODE, SILENT_MODE]:
# If this is a driver, push the finish time to Redis and clean up any
# other services that were started with the driver.
worker.redis_client.hmset(b"Drivers:" + worker.worker_id,
{"end_time": time.time()})
services.cleanup()
else:
# If this is not a driver, make sure there are no orphan processes.
for process_type, processes in services.all_processes.items():
assert(len(processes) == 0)
worker.set_mode(None)
atexit.register(cleanup)
@@ -1559,6 +1568,12 @@ def main_loop(worker=global_worker):
that occurred while executing the command, and waits for the next command.
"""
# Handler for SIGTERM: run cleanup() for this worker, then end the process
# with exit status 0.
# NOTE(review): the name `exit` shadows the builtin exit() inside this scope.
def exit(signum, frame):
cleanup(worker=worker)
sys.exit(0)
# Install the handler above for SIGTERM.
signal.signal(signal.SIGTERM, exit)
def process_task(task): # wrapping these lines in a function should cause the local variables to go out of scope more quickly, which is useful for inspecting reference counts
"""Execute a task assigned to this worker.