Availability after worker failure (#316)

* Availability after a killed worker
* Workers exit cleanly
* Memory cleanup in photon C tests
* Worker failure in multinode
* Consolidate worker cleanup handlers
* Update the result table before handling a task submission
* KILL_WORKER_TIMEOUT -> KILL_WORKER_TIMEOUT_MILLISECONDS
* Log a warning instead of crashing if no result table entry found
2026-07-04 00:33:25 +08:00 · 2017-02-25 20:19:36 -08:00
parent 232601f90d
commit be1618f041
14 changed files with 307 additions and 118 deletions
@@ -216,14 +216,18 @@ class TestGlobalStateStore(unittest.TestCase):
                                 "node_id")

  def testTaskTableAddAndLookup(self):
+    TASK_STATUS_WAITING = 1
+    TASK_STATUS_SCHEDULED = 2
+    TASK_STATUS_QUEUED = 4
+
    # Check that task table adds, updates, and lookups work correctly.
-    task_args = [1, b"node_id", b"task_spec"]
+    task_args = [TASK_STATUS_WAITING, b"node_id", b"task_spec"]
    response = self.redis.execute_command("RAY.TASK_TABLE_ADD", "task_id",
                                          *task_args)
    response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
    self.assertEqual(response, task_args)

-    task_args[0] = 2
+    task_args[0] = TASK_STATUS_SCHEDULED
    self.redis.execute_command("RAY.TASK_TABLE_UPDATE", "task_id", *task_args[:2])
    response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
    self.assertEqual(response, task_args)
@@ -241,7 +245,7 @@ class TestGlobalStateStore(unittest.TestCase):

    # If the current value is the same as the test value, and the set value is
    # different, the update happens, and the response is the entire task.
-    task_args[1] += 1
+    task_args[1] = TASK_STATUS_QUEUED
    response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
                                          "task_id",
                                          *task_args[:3])
@@ -252,7 +256,7 @@ class TestGlobalStateStore(unittest.TestCase):

    # If the current value is no longer the same as the test value, the
    # response is nil.
-    task_args[1] += 1
+    task_args[1] = TASK_STATUS_WAITING
    response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
                                          "task_id",
                                          *task_args[:3])
@@ -262,6 +266,27 @@ class TestGlobalStateStore(unittest.TestCase):
    self.assertEqual(get_response2, get_response)
    self.assertNotEqual(get_response2, task_args[1:])

+    # If the test value is a bitmask that matches the current value, the update
+    # happens.
+    task_args[0] = TASK_STATUS_SCHEDULED | TASK_STATUS_QUEUED
+    response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
+                                          "task_id",
+                                          *task_args[:3])
+    self.assertEqual(response, task_args[1:])
+
+    # If the test value is a bitmask that does not match the current value, the
+    # update does not happen.
+    task_args[1] = TASK_STATUS_SCHEDULED
+    old_response = response
+    response = self.redis.execute_command("RAY.TASK_TABLE_TEST_AND_UPDATE",
+                                          "task_id",
+                                          *task_args[:3])
+    self.assertEqual(response, None)
+    # Check that the update did not happen.
+    get_response = self.redis.execute_command("RAY.TASK_TABLE_GET", "task_id")
+    self.assertEqual(get_response, old_response)
+    self.assertNotEqual(get_response, task_args[1:])
+
  def testTaskTableSubscribe(self):
    scheduling_state = 1
    node_id = "node_id"
@@ -14,6 +14,7 @@ import numpy as np
 import os
 import random
 import redis
+import signal
 import string
 import sys
 import threading
@@ -936,23 +937,31 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
               num_gpus=num_gpus)

 def cleanup(worker=global_worker):
-  """Disconnect the driver, and terminate any processes started in init.
+  """Disconnect the worker, and terminate any processes started in init.

  This will automatically run at the end when a Python process that uses Ray
  exits. It is ok to run this twice in a row. Note that we manually call
  services.cleanup() in the tests because we need to start and stop many
  clusters in the tests, but the import and exit only happen once.
  """
-  # If this is a driver, push the finish time to Redis.
-  if worker.mode in [SCRIPT_MODE, SILENT_MODE]:
-    worker.redis_client.hmset(b"Drivers:" + worker.worker_id,
-                              {"end_time": time.time()})
-
  disconnect(worker)
-  worker.set_mode(None)
+  if hasattr(worker, "photon_client"):
+    del worker.photon_client
  if hasattr(worker, "plasma_client"):
    worker.plasma_client.shutdown()
-  services.cleanup()
+
+  if worker.mode in [SCRIPT_MODE, SILENT_MODE]:
+    # If this is a driver, push the finish time to Redis and clean up any
+    # other services that were started with the driver.
+    worker.redis_client.hmset(b"Drivers:" + worker.worker_id,
+                              {"end_time": time.time()})
+    services.cleanup()
+  else:
+    # If this is not a driver, make sure there are no orphan processes.
+    for process_type, processes in services.all_processes.items():
+      assert(len(processes) == 0)
+
+  worker.set_mode(None)

 atexit.register(cleanup)

@@ -1559,6 +1568,12 @@ def main_loop(worker=global_worker):
  that occurred while executing the command, and waits for the next command.
  """

+  def exit(signum, frame):
+    cleanup(worker=worker)
+    sys.exit(0)
+
+  signal.signal(signal.SIGTERM, exit)
+
  def process_task(task): # wrapping these lines in a function should cause the local variables to go out of scope more quickly, which is useful for inspecting reference counts
    """Execute a task assigned to this worker.