Treat actor creation like a regular task. (#1668)

* Treat actor creation like a regular task. * Small cleanups. * Change semantics of actor resource handling. * Bug fix. * Minor linting * Bug fix * Fix jenkins test. * Fix actor tests * Some cleanups * Bug fix * Fix bug. * Remove cached actor tasks when a driver is removed. * Add more info to taskspec in global state API. * Fix cyclic import bug in tune. * Fix * Fix linting. * Fix linting. * Don't schedule any tasks (especially actor creation tasks) on local schedulers with 0 CPUs. * Bug fix. * Add test for 0 CPU case * Fix linting * Address comments. * Fix typos and add comment. * Add assertion and fix test.
2026-07-04 11:37:51 +08:00 · 2018-03-16 11:18:07 -07:00
parent 3c080f4baa
commit 96913be939
36 changed files with 901 additions and 798 deletions
@@ -4,10 +4,8 @@ from __future__ import print_function

 import binascii
 import collections
-import json
 import numpy as np
 import os
-import redis
 import sys

 import ray.local_scheduler
@@ -162,192 +160,3 @@ def set_cuda_visible_devices(gpu_ids):
        gpu_ids: This is a list of integers representing GPU IDs.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids])
-
-
-def attempt_to_reserve_gpus(num_gpus, driver_id, local_scheduler,
-                            redis_client):
-    """Attempt to acquire GPUs on a particular local scheduler for an actor.
-
-    Args:
-        num_gpus: The number of GPUs to acquire.
-        driver_id: The ID of the driver responsible for creating the actor.
-        local_scheduler: Information about the local scheduler.
-        redis_client: The redis client to use for interacting with Redis.
-
-    Returns:
-        True if the GPUs were successfully reserved and false otherwise.
-    """
-    assert num_gpus != 0
-    local_scheduler_id = local_scheduler["DBClientID"]
-    local_scheduler_total_gpus = int(local_scheduler["GPU"])
-
-    success = False
-
-    # Attempt to acquire GPU IDs atomically.
-    with redis_client.pipeline() as pipe:
-        while True:
-            try:
-                # If this key is changed before the transaction below (the
-                # multi/exec block), then the transaction will not take place.
-                pipe.watch(local_scheduler_id)
-
-                # Figure out which GPUs are currently in use.
-                result = redis_client.hget(local_scheduler_id, "gpus_in_use")
-                gpus_in_use = dict() if result is None else json.loads(
-                    result.decode("ascii"))
-                num_gpus_in_use = 0
-                for key in gpus_in_use:
-                    num_gpus_in_use += gpus_in_use[key]
-                assert num_gpus_in_use <= local_scheduler_total_gpus
-
-                pipe.multi()
-
-                if local_scheduler_total_gpus - num_gpus_in_use >= num_gpus:
-                    # There are enough available GPUs, so try to reserve some.
-                    # We use the hex driver ID in hex as a dictionary key so
-                    # that the dictionary is JSON serializable.
-                    driver_id_hex = binary_to_hex(driver_id)
-                    if driver_id_hex not in gpus_in_use:
-                        gpus_in_use[driver_id_hex] = 0
-                    gpus_in_use[driver_id_hex] += num_gpus
-
-                    # Stick the updated GPU IDs back in Redis
-                    pipe.hset(local_scheduler_id, "gpus_in_use",
-                              json.dumps(gpus_in_use))
-                    success = True
-
-                pipe.execute()
-                # If a WatchError is not raised, then the operations should
-                # have gone through atomically.
-                break
-            except redis.WatchError:
-                # Another client must have changed the watched key between the
-                # time we started WATCHing it and the pipeline's execution. We
-                # should just retry.
-                success = False
-                continue
-
-    return success
-
-
-def release_gpus_in_use(driver_id, local_scheduler_id, gpu_ids, redis_client):
-    """Release the GPUs that a given worker was using.
-
-    Note that this does not affect the local scheduler's bookkeeping. It only
-    affects the GPU allocations which are recorded in the primary Redis shard,
-    which are redundant with the local scheduler bookkeeping.
-
-    Args:
-        driver_id: The ID of the driver that is releasing some GPUs.
-        local_scheduler_id: The ID of the local scheduler that owns the GPUs
-            being released.
-        gpu_ids: The IDs of the GPUs being released.
-        redis_client: A client for the primary Redis shard.
-    """
-    # Attempt to release GPU IDs atomically.
-    with redis_client.pipeline() as pipe:
-        while True:
-            try:
-                # If this key is changed before the transaction below (the
-                # multi/exec block), then the transaction will not take place.
-                pipe.watch(local_scheduler_id)
-
-                # Figure out which GPUs are currently in use.
-                result = redis_client.hget(local_scheduler_id, "gpus_in_use")
-                gpus_in_use = dict() if result is None else json.loads(
-                    result.decode("ascii"))
-
-                assert driver_id in gpus_in_use
-                assert gpus_in_use[driver_id] >= len(gpu_ids)
-
-                gpus_in_use[driver_id] -= len(gpu_ids)
-
-                pipe.multi()
-
-                pipe.hset(local_scheduler_id, "gpus_in_use",
-                          json.dumps(gpus_in_use))
-
-                pipe.execute()
-                # If a WatchError is not raised, then the operations should
-                # have gone through atomically.
-                break
-            except redis.WatchError:
-                # Another client must have changed the watched key between the
-                # time we started WATCHing it and the pipeline's execution. We
-                # should just retry.
-                continue
-
-
-def select_local_scheduler(driver_id, local_schedulers, num_gpus,
-                           redis_client):
-    """Select a local scheduler to assign this actor to.
-
-    Args:
-        driver_id: The ID of the driver who the actor is for.
-        local_schedulers: A list of dictionaries of information about the local
-            schedulers.
-        num_gpus (int): The number of GPUs that must be reserved for this
-            actor.
-        redis_client: The Redis client to use for interacting with Redis.
-
-    Returns:
-        The ID of the local scheduler that has been chosen.
-
-    Raises:
-        Exception: An exception is raised if no local scheduler can be found
-            with sufficient resources.
-    """
-    local_scheduler_id = None
-    # Loop through all of the local schedulers in a random order.
-    local_schedulers = np.random.permutation(local_schedulers)
-    for local_scheduler in local_schedulers:
-        if local_scheduler["CPU"] < 1:
-            continue
-        if local_scheduler.get("GPU", 0) < num_gpus:
-            continue
-        if num_gpus == 0:
-            local_scheduler_id = hex_to_binary(local_scheduler["DBClientID"])
-            break
-        else:
-            # Try to reserve enough GPUs on this local scheduler.
-            success = attempt_to_reserve_gpus(num_gpus, driver_id,
-                                              local_scheduler, redis_client)
-            if success:
-                local_scheduler_id = hex_to_binary(
-                                         local_scheduler["DBClientID"])
-                break
-
-    if local_scheduler_id is None:
-        raise Exception("Could not find a node with enough GPUs or other "
-                        "resources to create this actor. The local scheduler "
-                        "information is {}.".format(local_schedulers))
-
-    return local_scheduler_id
-
-
-def publish_actor_creation(actor_id, driver_id, local_scheduler_id,
-                           reconstruct, redis_client):
-    """Publish a notification that an actor should be created.
-
-    This broadcast will be received by all of the local schedulers. The local
-    scheduler whose ID is being broadcast will create the actor. Any other
-    local schedulers that have already created the actor will kill it. All
-    local schedulers will update their internal data structures to redirect
-    tasks for this actor to the new local scheduler.
-
-    Args:
-        actor_id: The ID of the actor involved.
-        driver_id: The ID of the driver responsible for the actor.
-        local_scheduler_id: The ID of the local scheduler that is suposed to
-            create the actor.
-        reconstruct: True if the actor should be created in "reconstruct" mode.
-        redis_client: The client used to interact with Redis.
-    """
-    reconstruct_bit = b"1" if reconstruct else b"0"
-    # Really we should encode this message as a flatbuffer object. However,
-    # we're having trouble getting that to work. It almost works, but in Python
-    # 2.7, builder.CreateString fails on byte strings that contain characters
-    # outside range(128).
-    redis_client.publish("actor_notifications",
-                         actor_id + driver_id + local_scheduler_id +
-                         reconstruct_bit)