Clean up when a driver disconnects. (#462)

* Clean up state when drivers exit.

* Remove unnecessary field in ActorMapEntry struct.

* Have monitor release GPU resources in Redis when driver exits.

* Enable multiple drivers in multi-node tests and test driver cleanup.

* Make redis GPU allocation a redis transaction and small cleanups.

* Fix multi-node test.

* Small cleanups.

* Make global scheduler take node_ip_address so it appears in the right place in the client table.

* Cleanups.

* Fix linting and cleanups in local scheduler.

* Fix removed_driver_test.

* Fix bug related to vector -> list.

* Fix linting.

* Cleanup.

* Fix multi node tests.

* Fix jenkins tests.

* Add another multi node test with many drivers.

* Fix linting.

* Make the actor creation notification a flatbuffer message.

* Revert "Make the actor creation notification a flatbuffer message."

This reverts commit af99099c8084dbf9177fb4e34c0c9b1a12c78f39.

* Add comment explaining flatbuffer problems.
This commit is contained in:
Robert Nishihara
2017-04-24 18:10:21 -07:00
committed by Philipp Moritz
parent 8194b71f32
commit 0ac125e9b2
31 changed files with 1119 additions and 168 deletions
+86
View File
@@ -0,0 +1,86 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import redis
import time
import ray
# Name of the Redis list shared by the helpers below: _broadcast_event appends
# event names to it and _wait_for_event reads it back while polling.
EVENT_KEY = "RAY_MULTI_NODE_TEST_KEY"
"""This key is used internally within this file for coordinating drivers."""
def _wait_for_nodes_to_join(num_nodes, timeout=20):
"""Wait until the nodes have joined the cluster.
This will wait until exactly num_nodes have joined the cluster and each node
has a local scheduler and a plasma manager.
Args:
num_nodes: The number of nodes to wait for.
timeout: The amount of time in seconds to wait before failing.
Raises:
Exception: An exception is raised if too many nodes join the cluster or if
the timeout expires while we are waiting.
"""
start_time = time.time()
while time.time() - start_time < timeout:
client_table = ray.global_state.client_table()
num_ready_nodes = len(client_table)
if num_ready_nodes == num_nodes:
ready = True
# Check that for each node, a local scheduler and a plasma manager are
# present.
for ip_address, clients in client_table.items():
client_types = [client["ClientType"] for client in clients]
if "local_scheduler" not in client_types:
ready = False
if "plasma_manager" not in client_types:
ready = False
if ready:
return
if num_ready_nodes > num_nodes:
# Too many nodes have joined. Something must be wrong.
raise Exception("{} nodes have joined the cluster, but we were "
"expecting {} nodes.".format(num_ready_nodes, num_nodes))
time.sleep(0.1)
# If we get here then we timed out.
raise Exception("Timed out while waiting for {} nodes to join. Only {} "
"nodes have joined so far.".format(num_ready_nodes,
num_nodes))
def _broadcast_event(event_name, redis_address):
    """Broadcast an event.

    This is used to synchronize drivers for the multi-node tests.

    Args:
        event_name: The name of the event to broadcast.
        redis_address: The address of the Redis server to use for
            synchronization, given as "host:port".
    """
    address_parts = redis_address.split(":")
    host, port = address_parts
    # Append the event name to the shared list that waiting drivers read.
    redis.StrictRedis(host=host, port=int(port)).rpush(EVENT_KEY, event_name)
def _wait_for_event(event_name, redis_address, extra_buffer=1):
    """Block until an event has been broadcast.

    This is used to synchronize drivers for the multi-node tests.

    Args:
        event_name: The name of the event to wait for.
        redis_address: The address of the Redis server to use for
            synchronization, given as "host:port".
        extra_buffer: An amount of time in seconds to wait after the event.
    """
    redis_host, redis_port = redis_address.split(":")
    redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port))
    # The name is compared in its ASCII-encoded form against the list
    # entries; encode it once rather than on every poll.
    encoded_event_name = event_name.encode("ascii")
    # Pause between polls so that waiting drivers do not spin on the Redis
    # server. 0.1 seconds matches the interval in _wait_for_nodes_to_join.
    poll_interval = 0.1
    while encoded_event_name not in redis_client.lrange(EVENT_KEY, 0, -1):
        time.sleep(poll_interval)
    # Wait a little longer after the event has been observed.
    time.sleep(extra_buffer)