mirror of
https://github.com/wassname/ray.git
synced 2026-07-04 11:37:51 +08:00
Treat actor creation like a regular task. (#1668)
* Treat actor creation like a regular task. * Small cleanups. * Change semantics of actor resource handling. * Bug fix. * Minor linting * Bug fix * Fix Jenkins test. * Fix actor tests * Some cleanups * Bug fix * Fix bug. * Remove cached actor tasks when a driver is removed. * Add more info to taskspec in global state API. * Fix cyclic import bug in tune. * Fix * Fix linting. * Fix linting. * Don't schedule any tasks (especially actor creation tasks) on local schedulers with 0 CPUs. * Bug fix. * Add test for 0 CPU case * Fix linting * Address comments. * Fix typos and add comment. * Add assertion and fix test.
This commit is contained in:
committed by
Stephanie Wang
parent
3c080f4baa
commit
96913be939
@@ -4,10 +4,8 @@ from __future__ import print_function
|
||||
|
||||
import binascii
|
||||
import collections
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import redis
|
||||
import sys
|
||||
|
||||
import ray.local_scheduler
|
||||
@@ -162,192 +160,3 @@ def set_cuda_visible_devices(gpu_ids):
|
||||
gpu_ids: This is a list of integers representing GPU IDs.
|
||||
"""
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids])
|
||||
|
||||
|
||||
def attempt_to_reserve_gpus(num_gpus, driver_id, local_scheduler,
                            redis_client):
    """Try to reserve GPUs for an actor on one local scheduler.

    The per-driver GPU counts are stored as a JSON dictionary in the
    "gpus_in_use" field of the Redis hash keyed by the local scheduler's
    "DBClientID". The update is done with optimistic locking: the key is
    WATCHed, the current counts are read, and the new counts are written in
    a MULTI/EXEC transaction. If the key changes in between, the whole
    attempt is retried.

    Args:
        num_gpus: The number of GPUs to acquire. Must be nonzero.
        driver_id: The ID of the driver responsible for creating the actor.
        local_scheduler: Information about the local scheduler. The
            "DBClientID" and "GPU" entries are read.
        redis_client: The redis client to use for interacting with Redis.

    Returns:
        True if the GPUs were successfully reserved and false otherwise.
    """
    assert num_gpus != 0
    scheduler_key = local_scheduler["DBClientID"]
    total_gpus = int(local_scheduler["GPU"])

    reserved = False

    with redis_client.pipeline() as pipe:
        while True:
            try:
                # Any change to this key between now and EXEC aborts the
                # transaction with a WatchError.
                pipe.watch(scheduler_key)

                # Load the current per-driver GPU counts. Note that the read
                # is issued through redis_client, not through the pipeline.
                raw = redis_client.hget(scheduler_key, "gpus_in_use")
                if raw is None:
                    usage = {}
                else:
                    usage = json.loads(raw.decode("ascii"))
                used_gpus = sum(usage.values())
                assert used_gpus <= total_gpus

                pipe.multi()

                if num_gpus <= total_gpus - used_gpus:
                    # Enough GPUs are free. The driver ID is stored in hex so
                    # that the dictionary stays JSON serializable.
                    hex_id = binary_to_hex(driver_id)
                    usage[hex_id] = usage.get(hex_id, 0) + num_gpus

                    # Queue the write of the updated counts.
                    pipe.hset(scheduler_key, "gpus_in_use",
                              json.dumps(usage))
                    reserved = True

                # The transaction is executed even when nothing was queued.
                pipe.execute()
            except redis.WatchError:
                # Somebody else modified the watched key before EXEC ran, so
                # nothing was written. Start over with a fresh read.
                reserved = False
            else:
                # No WatchError, so the transaction went through atomically.
                break

    return reserved
|
||||
|
||||
|
||||
def release_gpus_in_use(driver_id, local_scheduler_id, gpu_ids, redis_client):
    """Give back GPUs that a worker had been using.

    Only the GPU allocations recorded in the primary Redis shard are
    updated. The local scheduler keeps its own (redundant) bookkeeping,
    which this function does not touch.

    The count stored for the driver in the "gpus_in_use" field of the
    local scheduler's Redis hash is decreased by len(gpu_ids) inside a
    WATCH/MULTI/EXEC transaction, retrying whenever the key is modified
    concurrently.

    Args:
        driver_id: The ID of the driver that is releasing some GPUs. It is
            used as-is as the key into the decoded "gpus_in_use"
            dictionary, so it presumably has to be in the same (JSON
            string) form that was used when the GPUs were reserved --
            verify against the caller.
        local_scheduler_id: The ID of the local scheduler that owns the GPUs
            being released.
        gpu_ids: The IDs of the GPUs being released. Only the number of IDs
            is used.
        redis_client: A client for the primary Redis shard.
    """
    with redis_client.pipeline() as pipe:
        while True:
            try:
                # Any change to this key between now and EXEC aborts the
                # transaction with a WatchError.
                pipe.watch(local_scheduler_id)

                # Load the current per-driver GPU counts.
                raw = redis_client.hget(local_scheduler_id, "gpus_in_use")
                if raw is None:
                    usage = {}
                else:
                    usage = json.loads(raw.decode("ascii"))

                # The driver must hold at least as many GPUs as it returns.
                assert driver_id in usage
                num_released = len(gpu_ids)
                assert usage[driver_id] >= num_released

                usage[driver_id] -= num_released

                pipe.multi()
                pipe.hset(local_scheduler_id, "gpus_in_use",
                          json.dumps(usage))
                pipe.execute()
            except redis.WatchError:
                # Somebody else modified the watched key before EXEC ran, so
                # nothing was written. Start over with a fresh read.
                continue
            else:
                # No WatchError, so the transaction went through atomically.
                break
|
||||
|
||||
|
||||
def select_local_scheduler(driver_id, local_schedulers, num_gpus,
                           redis_client):
    """Pick the local scheduler that a new actor will be assigned to.

    The candidates are visited in random order. A candidate is skipped if
    it has fewer than one CPU or fewer GPUs than requested. When GPUs are
    requested, the candidate is only chosen if the GPUs can actually be
    reserved on it.

    Args:
        driver_id: The ID of the driver who the actor is for.
        local_schedulers: A list of dictionaries of information about the
            local schedulers.
        num_gpus (int): The number of GPUs that must be reserved for this
            actor.
        redis_client: The Redis client to use for interacting with Redis.

    Returns:
        The ID of the local scheduler that has been chosen.

    Raises:
        Exception: An exception is raised if no local scheduler can be found
            with sufficient resources.
    """
    # Visit the local schedulers in a random order.
    shuffled = np.random.permutation(local_schedulers)

    chosen_id = None
    for candidate in shuffled:
        # Skip nodes without a CPU or without enough GPUs in total.
        if candidate["CPU"] < 1 or candidate.get("GPU", 0) < num_gpus:
            continue
        # When GPUs are needed, they must be reserved before the node can be
        # picked. A failed reservation means we move on to the next node.
        if num_gpus != 0 and not attempt_to_reserve_gpus(
                num_gpus, driver_id, candidate, redis_client):
            continue
        chosen_id = hex_to_binary(candidate["DBClientID"])
        break

    if chosen_id is not None:
        return chosen_id

    raise Exception("Could not find a node with enough GPUs or other "
                    "resources to create this actor. The local scheduler "
                    "information is {}.".format(shuffled))
|
||||
|
||||
|
||||
def publish_actor_creation(actor_id, driver_id, local_scheduler_id,
                           reconstruct, redis_client):
    """Broadcast a request to create an actor.

    Every local scheduler receives this broadcast. The local scheduler
    whose ID is included in the message will create the actor. Any other
    local scheduler that has already created the actor will kill it. All
    local schedulers will update their internal data structures so that
    tasks for this actor are redirected to the new local scheduler.

    The message is the concatenation of the actor ID, the driver ID, the
    local scheduler ID and a single trailing byte, b"1" or b"0", which
    carries the reconstruct flag. It is published on the
    "actor_notifications" channel.

    Args:
        actor_id: The ID of the actor involved.
        driver_id: The ID of the driver responsible for the actor.
        local_scheduler_id: The ID of the local scheduler that is supposed
            to create the actor.
        reconstruct: True if the actor should be created in "reconstruct"
            mode.
        redis_client: The client used to interact with Redis.
    """
    if reconstruct:
        mode_flag = b"1"
    else:
        mode_flag = b"0"
    # Ideally this message would be a flatbuffer object, but that does not
    # quite work yet: in Python 2.7, builder.CreateString fails on byte
    # strings that contain characters outside range(128). So the fields are
    # simply concatenated instead.
    payload = actor_id + driver_id + local_scheduler_id + mode_flag
    redis_client.publish("actor_notifications", payload)
|
||||
|
||||
Reference in New Issue
Block a user