mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 07:18:24 +08:00
c21e189371
* Enable scheduling with custom resource labels. * Fix. * Minor fixes and ref counting fix. * Linting * Use .data() instead of .c_str(). * Fix linting. * Fix ResourcesTest.testGPUIDs test by waiting for workers to start up. * Sleep in test so that all tasks are submitted before any completes.
304 lines
12 KiB
Python
304 lines
12 KiB
Python
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import binascii
|
|
import collections
|
|
import json
|
|
import numpy as np
|
|
import redis
|
|
import sys
|
|
|
|
import ray.local_scheduler
|
|
|
|
ERROR_KEY_PREFIX = b"Error:"
|
|
DRIVER_ID_LENGTH = 20
|
|
|
|
|
|
def _random_string():
|
|
return np.random.bytes(20)
|
|
|
|
|
|
def push_error_to_driver(redis_client, error_type, message, driver_id=None,
|
|
data=None):
|
|
"""Push an error message to the driver to be printed in the background.
|
|
|
|
Args:
|
|
redis_client: The redis client to use.
|
|
error_type (str): The type of the error.
|
|
message (str): The message that will be printed in the background
|
|
on the driver.
|
|
driver_id: The ID of the driver to push the error message to. If this
|
|
is None, then the message will be pushed to all drivers.
|
|
data: This should be a dictionary mapping strings to strings. It
|
|
will be serialized with json and stored in Redis.
|
|
"""
|
|
if driver_id is None:
|
|
driver_id = DRIVER_ID_LENGTH * b"\x00"
|
|
error_key = ERROR_KEY_PREFIX + driver_id + b":" + _random_string()
|
|
data = {} if data is None else data
|
|
redis_client.hmset(error_key, {"type": error_type,
|
|
"message": message,
|
|
"data": data})
|
|
redis_client.rpush("ErrorKeys", error_key)
|
|
|
|
|
|
def is_cython(obj):
|
|
"""Check if an object is a Cython function or method"""
|
|
|
|
# TODO(suo): We could split these into two functions, one for Cython
|
|
# functions and another for Cython methods.
|
|
# TODO(suo): There doesn't appear to be a Cython function 'type' we can
|
|
# check against via isinstance. Please correct me if I'm wrong.
|
|
def check_cython(x):
|
|
return type(x).__name__ == "cython_function_or_method"
|
|
|
|
# Check if function or method, respectively
|
|
return check_cython(obj) or \
|
|
(hasattr(obj, "__func__") and check_cython(obj.__func__))
|
|
|
|
|
|
def random_string():
|
|
"""Generate a random string to use as an ID.
|
|
|
|
Note that users may seed numpy, which could cause this function to generate
|
|
duplicate IDs. Therefore, we need to seed numpy ourselves, but we can't
|
|
interfere with the state of the user's random number generator, so we
|
|
extract the state of the random number generator and reset it after we are
|
|
done.
|
|
|
|
TODO(rkn): If we want to later guarantee that these are generated in a
|
|
deterministic manner, then we will need to make some changes here.
|
|
|
|
Returns:
|
|
A random byte string of length 20.
|
|
"""
|
|
# Get the state of the numpy random number generator.
|
|
numpy_state = np.random.get_state()
|
|
# Try to use true randomness.
|
|
np.random.seed(None)
|
|
# Generate the random ID.
|
|
random_id = np.random.bytes(20)
|
|
# Reset the state of the numpy random number generator.
|
|
np.random.set_state(numpy_state)
|
|
return random_id
|
|
|
|
|
|
def decode(byte_str):
|
|
"""Make this unicode in Python 3, otherwise leave it as bytes."""
|
|
if sys.version_info >= (3, 0):
|
|
return byte_str.decode("ascii")
|
|
else:
|
|
return byte_str
|
|
|
|
|
|
def binary_to_object_id(binary_object_id):
|
|
return ray.local_scheduler.ObjectID(binary_object_id)
|
|
|
|
|
|
def binary_to_hex(identifier):
|
|
hex_identifier = binascii.hexlify(identifier)
|
|
if sys.version_info >= (3, 0):
|
|
hex_identifier = hex_identifier.decode()
|
|
return hex_identifier
|
|
|
|
|
|
def hex_to_binary(hex_identifier):
|
|
return binascii.unhexlify(hex_identifier)
|
|
|
|
|
|
FunctionProperties = collections.namedtuple("FunctionProperties",
|
|
["num_return_vals",
|
|
"resources",
|
|
"max_calls"])
|
|
"""FunctionProperties: A named tuple storing remote functions information."""
|
|
|
|
|
|
def attempt_to_reserve_gpus(num_gpus, driver_id, local_scheduler,
|
|
redis_client):
|
|
"""Attempt to acquire GPUs on a particular local scheduler for an actor.
|
|
|
|
Args:
|
|
num_gpus: The number of GPUs to acquire.
|
|
driver_id: The ID of the driver responsible for creating the actor.
|
|
local_scheduler: Information about the local scheduler.
|
|
redis_client: The redis client to use for interacting with Redis.
|
|
|
|
Returns:
|
|
True if the GPUs were successfully reserved and false otherwise.
|
|
"""
|
|
assert num_gpus != 0
|
|
local_scheduler_id = local_scheduler["DBClientID"]
|
|
local_scheduler_total_gpus = int(local_scheduler["GPU"])
|
|
|
|
success = False
|
|
|
|
# Attempt to acquire GPU IDs atomically.
|
|
with redis_client.pipeline() as pipe:
|
|
while True:
|
|
try:
|
|
# If this key is changed before the transaction below (the
|
|
# multi/exec block), then the transaction will not take place.
|
|
pipe.watch(local_scheduler_id)
|
|
|
|
# Figure out which GPUs are currently in use.
|
|
result = redis_client.hget(local_scheduler_id, "gpus_in_use")
|
|
gpus_in_use = dict() if result is None else json.loads(
|
|
result.decode("ascii"))
|
|
num_gpus_in_use = 0
|
|
for key in gpus_in_use:
|
|
num_gpus_in_use += gpus_in_use[key]
|
|
assert num_gpus_in_use <= local_scheduler_total_gpus
|
|
|
|
pipe.multi()
|
|
|
|
if local_scheduler_total_gpus - num_gpus_in_use >= num_gpus:
|
|
# There are enough available GPUs, so try to reserve some.
|
|
# We use the hex driver ID in hex as a dictionary key so
|
|
# that the dictionary is JSON serializable.
|
|
driver_id_hex = binary_to_hex(driver_id)
|
|
if driver_id_hex not in gpus_in_use:
|
|
gpus_in_use[driver_id_hex] = 0
|
|
gpus_in_use[driver_id_hex] += num_gpus
|
|
|
|
# Stick the updated GPU IDs back in Redis
|
|
pipe.hset(local_scheduler_id, "gpus_in_use",
|
|
json.dumps(gpus_in_use))
|
|
success = True
|
|
|
|
pipe.execute()
|
|
# If a WatchError is not raised, then the operations should
|
|
# have gone through atomically.
|
|
break
|
|
except redis.WatchError:
|
|
# Another client must have changed the watched key between the
|
|
# time we started WATCHing it and the pipeline's execution. We
|
|
# should just retry.
|
|
success = False
|
|
continue
|
|
|
|
return success
|
|
|
|
|
|
def release_gpus_in_use(driver_id, local_scheduler_id, gpu_ids, redis_client):
|
|
"""Release the GPUs that a given worker was using.
|
|
|
|
Note that this does not affect the local scheduler's bookkeeping. It only
|
|
affects the GPU allocations which are recorded in the primary Redis shard,
|
|
which are redundant with the local scheduler bookkeeping.
|
|
|
|
Args:
|
|
driver_id: The ID of the driver that is releasing some GPUs.
|
|
local_scheduler_id: The ID of the local scheduler that owns the GPUs
|
|
being released.
|
|
gpu_ids: The IDs of the GPUs being released.
|
|
redis_client: A client for the primary Redis shard.
|
|
"""
|
|
# Attempt to release GPU IDs atomically.
|
|
with redis_client.pipeline() as pipe:
|
|
while True:
|
|
try:
|
|
# If this key is changed before the transaction below (the
|
|
# multi/exec block), then the transaction will not take place.
|
|
pipe.watch(local_scheduler_id)
|
|
|
|
# Figure out which GPUs are currently in use.
|
|
result = redis_client.hget(local_scheduler_id, "gpus_in_use")
|
|
gpus_in_use = dict() if result is None else json.loads(
|
|
result.decode("ascii"))
|
|
|
|
assert driver_id in gpus_in_use
|
|
assert gpus_in_use[driver_id] >= len(gpu_ids)
|
|
|
|
gpus_in_use[driver_id] -= len(gpu_ids)
|
|
|
|
pipe.multi()
|
|
|
|
pipe.hset(local_scheduler_id, "gpus_in_use",
|
|
json.dumps(gpus_in_use))
|
|
|
|
pipe.execute()
|
|
# If a WatchError is not raised, then the operations should
|
|
# have gone through atomically.
|
|
break
|
|
except redis.WatchError:
|
|
# Another client must have changed the watched key between the
|
|
# time we started WATCHing it and the pipeline's execution. We
|
|
# should just retry.
|
|
continue
|
|
|
|
|
|
def select_local_scheduler(driver_id, local_schedulers, num_gpus,
|
|
redis_client):
|
|
"""Select a local scheduler to assign this actor to.
|
|
|
|
Args:
|
|
driver_id: The ID of the driver who the actor is for.
|
|
local_schedulers: A list of dictionaries of information about the local
|
|
schedulers.
|
|
num_gpus (int): The number of GPUs that must be reserved for this
|
|
actor.
|
|
redis_client: The Redis client to use for interacting with Redis.
|
|
|
|
Returns:
|
|
The ID of the local scheduler that has been chosen.
|
|
|
|
Raises:
|
|
Exception: An exception is raised if no local scheduler can be found
|
|
with sufficient resources.
|
|
"""
|
|
local_scheduler_id = None
|
|
# Loop through all of the local schedulers in a random order.
|
|
local_schedulers = np.random.permutation(local_schedulers)
|
|
for local_scheduler in local_schedulers:
|
|
if local_scheduler["CPU"] < 1:
|
|
continue
|
|
if local_scheduler.get("GPU", 0) < num_gpus:
|
|
continue
|
|
if num_gpus == 0:
|
|
local_scheduler_id = hex_to_binary(local_scheduler["DBClientID"])
|
|
break
|
|
else:
|
|
# Try to reserve enough GPUs on this local scheduler.
|
|
success = attempt_to_reserve_gpus(num_gpus, driver_id,
|
|
local_scheduler, redis_client)
|
|
if success:
|
|
local_scheduler_id = hex_to_binary(
|
|
local_scheduler["DBClientID"])
|
|
break
|
|
|
|
if local_scheduler_id is None:
|
|
raise Exception("Could not find a node with enough GPUs or other "
|
|
"resources to create this actor. The local scheduler "
|
|
"information is {}.".format(local_schedulers))
|
|
|
|
return local_scheduler_id
|
|
|
|
|
|
def publish_actor_creation(actor_id, driver_id, local_scheduler_id,
|
|
reconstruct, redis_client):
|
|
"""Publish a notification that an actor should be created.
|
|
|
|
This broadcast will be received by all of the local schedulers. The local
|
|
scheduler whose ID is being broadcast will create the actor. Any other
|
|
local schedulers that have already created the actor will kill it. All
|
|
local schedulers will update their internal data structures to redirect
|
|
tasks for this actor to the new local scheduler.
|
|
|
|
Args:
|
|
actor_id: The ID of the actor involved.
|
|
driver_id: The ID of the driver responsible for the actor.
|
|
local_scheduler_id: The ID of the local scheduler that is suposed to
|
|
create the actor.
|
|
reconstruct: True if the actor should be created in "reconstruct" mode.
|
|
redis_client: The client used to interact with Redis.
|
|
"""
|
|
reconstruct_bit = b"1" if reconstruct else b"0"
|
|
# Really we should encode this message as a flatbuffer object. However,
|
|
# we're having trouble getting that to work. It almost works, but in Python
|
|
# 2.7, builder.CreateString fails on byte strings that contain characters
|
|
# outside range(128).
|
|
redis_client.publish("actor_notifications",
|
|
actor_id + driver_id + local_scheduler_id +
|
|
reconstruct_bit)
|