[Autoscaler] Account for resource backlog size (#11261)

2026-06-28 03:02:56 +08:00 · 2020-10-12 09:43:48 -07:00
parent d3bc20b727
commit 175fc41fbc
5 changed files with 84 additions and 28 deletions
@@ -15,6 +15,8 @@ from ray.core.generated.gcs_pb2 import (
    TablePrefix,
    TablePubsub,
    TaskTableData,
+    ResourceDemand,
+    ResourceLoad,
    ResourceMap,
    ResourceTableData,
    ObjectLocationInfo,
@@ -40,6 +42,8 @@ __all__ = [
    "TablePrefix",
    "TablePubsub",
    "TaskTableData",
+    "ResourceDemand",
+    "ResourceLoad",
    "ResourceMap",
    "ResourceTableData",
    "construct_error_message",
@@ -18,6 +18,44 @@ import redis
 logger = logging.getLogger(__name__)


+def parse_resource_demands(resource_load_by_shape):
+    """Handle the message.resource_load_by_shape protobuf for the demand
+    based autoscaling. Catch and log all exceptions so this doesn't
+    interfere with the utilization based autoscaler until we're confident
+    this is stable. Worker queue backlogs are added to the appropriate
+    resource demand vector.
+
+    Args:
+        resource_load_by_shape (pb2.gcs.ResourceLoad): The resource demands
+            in protobuf form or None.
+
+    Returns:
+        List[ResourceDict]: Waiting bundles (ready and feasible).
+        List[ResourceDict]: Infeasible bundles.
+    """
+    waiting_bundles, infeasible_bundles = [], []
+    try:
+        for resource_demand_pb in list(
+                resource_load_by_shape.resource_demands):
+            request_shape = dict(resource_demand_pb.shape)
+            for _ in range(resource_demand_pb.num_ready_requests_queued):
+                waiting_bundles.append(request_shape)
+            for _ in range(resource_demand_pb.num_infeasible_requests_queued):
+                infeasible_bundles.append(request_shape)
+
+            # Infeasible and ready states for tasks are (logically)
+            # mutually exclusive.
+            if resource_demand_pb.num_infeasible_requests_queued > 0:
+                backlog_queue = infeasible_bundles
+            else:
+                backlog_queue = waiting_bundles
+            for _ in range(resource_demand_pb.backlog_size):
+                backlog_queue.append(request_shape)
+    except Exception:
+        logger.exception("Failed to parse resource demands.")
+    return waiting_bundles, infeasible_bundles
+
+
 class Monitor:
    """A monitor for Ray processes.

@@ -89,32 +127,6 @@ class Monitor:
        """
        self.primary_subscribe_client.psubscribe(pattern)

-    def parse_resource_demands(self, resource_load_by_shape):
-        """Handle the message.resource_load_by_shape protobuf for the demand
-        based autoscaling. Catch and log all exceptions so this doesn't
-        interfere with the utilization based autoscaler until we're confident
-        this is stable.
-
-        Args:
-            resource_load_by_shape (pb2.gcs.ResourceLoad): The resource demands
-                in protobuf form or None.
-        """
-        waiting_bundles, infeasible_bundles = [], []
-        try:
-            if self.autoscaler:
-                for resource_demand_pb in list(
-                        resource_load_by_shape.resource_demands):
-                    request_shape = dict(resource_demand_pb.shape)
-                    for _ in range(
-                            resource_demand_pb.num_ready_requests_queued):
-                        waiting_bundles.append(request_shape)
-                    for _ in range(
-                            resource_demand_pb.num_infeasible_requests_queued):
-                        infeasible_bundles.append(request_shape)
-        except Exception as e:
-            logger.exception(e)
-        return waiting_bundles, infeasible_bundles
-
    def xray_heartbeat_batch_handler(self, unused_channel, data):
        """Handle an xray heartbeat batch message from Redis."""

@@ -129,7 +141,7 @@ class Monitor:
            available_resources = dict(heartbeat_message.resources_available)

            waiting_bundles, infeasible_bundles = \
-                self.parse_resource_demands(message.resource_load_by_shape)
+                parse_resource_demands(message.resource_load_by_shape)

            # Update the load metrics for this raylet.
            client_id = ray.utils.binary_to_hex(heartbeat_message.client_id)
@@ -87,6 +87,7 @@ py_test_module_list(
    "test_metrics_agent.py",
    "test_microbenchmarks.py",
    "test_mini.py",
+    "test_monitor.py",
    "test_node_manager.py",
    "test_numba.py",
    "test_queue.py",
@@ -0,0 +1,39 @@
+import ray
+from ray.monitor import parse_resource_demands
+
+
+def test_parse_resource_demands():
+    resource_load_by_shape = ray.gcs_utils.ResourceLoad(resource_demands=[
+        ray.gcs_utils.ResourceDemand(
+            shape={"CPU": 1},
+            num_ready_requests_queued=1,
+            num_infeasible_requests_queued=0,
+            backlog_size=0),
+        ray.gcs_utils.ResourceDemand(
+            shape={"CPU": 2},
+            num_ready_requests_queued=1,
+            num_infeasible_requests_queued=0,
+            backlog_size=1),
+        ray.gcs_utils.ResourceDemand(
+            shape={"CPU": 3},
+            num_ready_requests_queued=0,
+            num_infeasible_requests_queued=1,
+            backlog_size=2),
+        ray.gcs_utils.ResourceDemand(
+            shape={"CPU": 4},
+            num_ready_requests_queued=1,
+            num_infeasible_requests_queued=1,
+            backlog_size=2),
+    ])
+
+    waiting, infeasible = \
+        parse_resource_demands(resource_load_by_shape)
+
+    assert waiting.count({"CPU": 1}) == 1
+    assert waiting.count({"CPU": 2}) == 2
+    assert infeasible.count({"CPU": 3}) == 3
+    # The {"CPU": 4} case here is inconsistent, but could happen. Since the
+    # heartbeats are eventually consistent, we won't worry about whether it's
+    # counted as infeasible or waiting, as long as it's accounted for and
+    # doesn't cause an error.
+    assert len(waiting + infeasible) == 10