[autoscaler] Add a 'request_resources' function for manual autoscaling (#4754)

2026-06-28 18:45:03 +08:00 · 2019-07-27 01:14:45 +01:00
parent d9e81da3b8
commit 06fec63c87
5 changed files with 137 additions and 11 deletions
@@ -17,6 +17,7 @@ from collections import defaultdict
 import numpy as np
 import ray.services as services
 import yaml
+from ray.worker import global_worker
 from ray.autoscaler.docker import dockerize_if_needed
 from ray.autoscaler.node_provider import get_node_provider, \
    get_default_config
@@ -26,7 +27,8 @@ from ray.autoscaler.tags import (TAG_RAY_LAUNCH_CONFIG, TAG_RAY_RUNTIME_CONFIG,
 from ray.autoscaler.updater import NodeUpdaterThread
 from ray.ray_constants import AUTOSCALER_MAX_NUM_FAILURES, \
    AUTOSCALER_MAX_LAUNCH_BATCH, AUTOSCALER_MAX_CONCURRENT_LAUNCHES, \
-    AUTOSCALER_UPDATE_INTERVAL_S, AUTOSCALER_HEARTBEAT_TIMEOUT_S
+    AUTOSCALER_UPDATE_INTERVAL_S, AUTOSCALER_HEARTBEAT_TIMEOUT_S, \
+    AUTOSCALER_RESOURCE_REQUEST_CHANNEL
 from six import string_types
 from six.moves import queue

@@ -159,6 +161,7 @@ class LoadMetrics(object):

    def update(self, ip, static_resources, dynamic_resources):
        self.static_resources_by_ip[ip] = static_resources
+
        # We are not guaranteed to have a corresponding dynamic resource for
        # every static resource because dynamic resources are based on the
        # available resources in the heartbeat, which does not exist if it is
@@ -406,6 +409,8 @@ class StandardAutoscaler(object):
        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)

+        self.resource_requests = defaultdict(int)
+
        logger.info("StandardAutoscaler: {}".format(self.config))

    def update(self):
@@ -432,11 +437,16 @@ class StandardAutoscaler(object):
        self.last_update_time = now
        num_pending = self.num_launches_pending.value
        nodes = self.workers()
-        self.log_info_string(nodes)
        self.load_metrics.prune_active_ips(
            [self.provider.internal_ip(node_id) for node_id in nodes])
        target_workers = self.target_num_workers()

+        if len(nodes) >= target_workers:
+            if "CPU" in self.resource_requests:
+                del self.resource_requests["CPU"]
+
+        self.log_info_string(nodes, target_workers)
+
        # Terminate any idle or out of date nodes
        last_used = self.load_metrics.last_used_time_by_ip
        horizon = now - (60 * self.config["idle_timeout_minutes"])
@@ -457,7 +467,7 @@ class StandardAutoscaler(object):
        if nodes_to_terminate:
            self.provider.terminate_nodes(nodes_to_terminate)
            nodes = self.workers()
-            self.log_info_string(nodes)
+            self.log_info_string(nodes, target_workers)

        # Terminate nodes if there are too many
        nodes_to_terminate = []
@@ -470,20 +480,22 @@ class StandardAutoscaler(object):
        if nodes_to_terminate:
            self.provider.terminate_nodes(nodes_to_terminate)
            nodes = self.workers()
-            self.log_info_string(nodes)
+            self.log_info_string(nodes, target_workers)

        # Launch new nodes if needed
        num_workers = len(nodes) + num_pending
        if num_workers < target_workers:
            max_allowed = min(self.max_launch_batch,
                              self.max_concurrent_launches - num_pending)
+
            num_launches = min(max_allowed, target_workers - num_workers)
            self.launch_new_node(num_launches)
            nodes = self.workers()
-            self.log_info_string(nodes)
+            self.log_info_string(nodes, target_workers)
        elif self.load_metrics.num_workers_connected() >= target_workers:
            logger.info("Ending bringup phase")
            self.bringup = False
+            self.log_info_string(nodes, target_workers)

        # Process any completed updates
        completed = []
@@ -501,7 +513,7 @@ class StandardAutoscaler(object):
            # immediately trying to restart Ray on the new node.
            self.load_metrics.mark_active(self.provider.internal_ip(node_id))
            nodes = self.workers()
-            self.log_info_string(nodes)
+            self.log_info_string(nodes, target_workers)

        # Update nodes with out-of-date files
        T = [
@@ -556,6 +568,20 @@ class StandardAutoscaler(object):
            # If we want any workers, we want at least initial_workers
            ideal_num_workers = max(ideal_num_workers, initial_workers)

+        # Only "CPU" requests are honored; other resources are ignored here.
+        if "CPU" in self.resource_requests:
+            try:
+                cores_per_worker = self.config["worker_nodes"]["Resources"][
+                    "CPU"]
+            except KeyError:
+                cores_per_worker = 1  # Assume the worst: 1 core per node
+
+            cores_desired = self.resource_requests["CPU"]
+
+            ideal_num_workers = max(
+                ideal_num_workers,
+                int(np.ceil(cores_desired / cores_per_worker)))
+
        return min(self.config["max_workers"],
                   max(self.config["min_workers"], ideal_num_workers))

@@ -659,11 +685,12 @@ class StandardAutoscaler(object):
        return self.provider.non_terminated_nodes(
            tag_filters={TAG_RAY_NODE_TYPE: "worker"})

-    def log_info_string(self, nodes):
-        logger.info("StandardAutoscaler: {}".format(self.info_string(nodes)))
+    def log_info_string(self, nodes, target):
+        logger.info("StandardAutoscaler: {}".format(
+            self.info_string(nodes, target)))
        logger.info("LoadMetrics: {}".format(self.load_metrics.info_string()))

-    def info_string(self, nodes):
+    def info_string(self, nodes, target):
        suffix = ""
        if self.num_launches_pending:
            suffix += " ({} pending)".format(self.num_launches_pending.value)
@@ -675,8 +702,15 @@ class StandardAutoscaler(object):
        if self.bringup:
            suffix += " (bringup=True)"

-        return "{}/{} target nodes{}".format(
-            len(nodes), self.target_num_workers(), suffix)
+        return "{}/{} target nodes{}".format(len(nodes), target, suffix)
+
+    def request_resources(self, resources):
+        for resource, count in resources.items():
+            self.resource_requests[resource] = max(
+                self.resource_requests[resource], count)
+
+        logger.info("StandardAutoscaler: resource_requests={}".format(
+            self.resource_requests))

    def kill_workers(self):
        logger.error("StandardAutoscaler: kill_workers triggered")
@@ -824,3 +858,34 @@ def hash_runtime_conf(file_mounts, extra_objs):
        _hash_cache[conf_str] = hasher.hexdigest()

    return _hash_cache[conf_str]
+
+
+def request_resources(num_cpus=None, num_gpus=None):
+    """Remotely request some CPU or GPU resources from the autoscaler.
+
+    Call this function, e.g. on a node, before submitting a batch of
+    ray.remote calls so that the needed resources become available quickly.
+
+    Only CPU requests are supported at present; in the future this could be
+    extended to GPUs or other custom resources.
+
+    This function is non-blocking.
+
+    Args:
+
+        num_cpus: int -- the number of CPU cores to request (required)
+        num_gpus: int -- not implemented yet; must be left as None
+
+    """
+    if num_gpus is not None:
+        raise NotImplementedError(
+            "GPU resource is not yet supported through request_resources")
+    r = services.create_redis_client(
+        global_worker.node.redis_address,
+        password=global_worker.node.redis_password)
+    assert isinstance(num_cpus, int)
+    if num_cpus > 0:
+        r.publish(AUTOSCALER_RESOURCE_REQUEST_CHANNEL,
+                  json.dumps({
+                      "CPU": num_cpus
+                  }))
@@ -173,6 +173,13 @@ class AWSNodeProvider(NodeProvider):
    def create_node(self, node_config, tags, count):
        tags = to_aws_format(tags)
        conf = node_config.copy()
+
+        # Delete unsupported keys from the node config
+        try:
+            del conf["Resources"]
+        except KeyError:
+            pass
+
        tag_pairs = [{
            "Key": TAG_RAY_CLUSTER_NAME,
            "Value": self.cluster_name,
@@ -7,6 +7,7 @@ import logging
 import os
 import time
 import traceback
+import json

 import redis

@@ -212,6 +213,23 @@ class Monitor(object):
                            binary_to_hex(job_id)))
            self._xray_clean_up_entries_for_job(job_id)

+    def autoscaler_resource_request_handler(self, _, data):
+        """Handle a notification of a resource request for the autoscaler.
+
+        Args:
+            _: the channel (unused)
+            data: a resource request as JSON, e.g. {"CPU": 1}
+        """
+
+        if not self.autoscaler:
+            return
+
+        try:
+            self.autoscaler.request_resources(json.loads(data))
+        except Exception:
+            # We don't want this to kill the monitor.
+            traceback.print_exc()
+
    def process_messages(self, max_messages=10000):
        """Process all messages ready in the subscription channels.

@@ -241,6 +259,9 @@ class Monitor(object):
                elif channel == ray.gcs_utils.XRAY_JOB_CHANNEL:
                    # Handles driver death.
                    message_handler = self.xray_job_notification_handler
+                elif (channel ==
+                      ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL):
+                    message_handler = self.autoscaler_resource_request_handler
                else:
                    raise Exception("This code should be unreachable.")

@@ -307,6 +328,10 @@ class Monitor(object):
        self.subscribe(ray.gcs_utils.XRAY_HEARTBEAT_BATCH_CHANNEL)
        self.subscribe(ray.gcs_utils.XRAY_JOB_CHANNEL)

+        if self.autoscaler:
+            self.subscribe(
+                ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL)
+
        # TODO(rkn): If there were any dead clients at startup, we should clean
        # up the associated state in the state tables.

@@ -125,3 +125,5 @@ LOG_MONITOR_MAX_OPEN_FILES = 200

 # A constant used as object metadata to indicate the object is raw binary.
 RAW_BUFFER_METADATA = b"RAW"
+
+AUTOSCALER_RESOURCE_REQUEST_CHANNEL = b"autoscaler_resource_request"
@@ -275,6 +275,33 @@ class AutoscalingTest(unittest.TestCase):
        autoscaler.update()
        self.waitForNodes(2)

+    def testManualAutoscaling(self):
+        config = SMALL_CLUSTER.copy()
+        config["min_workers"] = 0
+        config["max_workers"] = 50
+        cores_per_node = 2
+        config["worker_nodes"] = {"Resources": {"CPU": cores_per_node}}
+        config_path = self.write_config(config)
+        self.provider = MockProvider()
+        autoscaler = StandardAutoscaler(
+            config_path,
+            LoadMetrics(),
+            max_launch_batch=5,
+            max_concurrent_launches=5,
+            max_failures=0,
+            update_interval_s=0)
+        assert len(self.provider.non_terminated_nodes({})) == 0
+        autoscaler.update()
+        self.waitForNodes(0)
+        autoscaler.request_resources({"CPU": cores_per_node * 10})
+        for _ in range(3):  # Maximum launch batch is 5
+            autoscaler.update()
+        self.waitForNodes(10)
+        autoscaler.request_resources({"CPU": cores_per_node * 30})
+        for _ in range(4):  # Maximum launch batch is 5
+            autoscaler.update()
+        self.waitForNodes(30)
+
    def testTerminateOutdatedNodesGracefully(self):
        config = SMALL_CLUSTER.copy()
        config["min_workers"] = 5