[Autoscaler] Account for resource backlog size (#11261)

2026-06-28 12:10:40 +08:00 · 2020-10-12 09:43:48 -07:00
parent d3bc20b727
commit 175fc41fbc
5 changed files with 84 additions and 28 deletions
@@ -18,6 +18,44 @@ import redis
 logger = logging.getLogger(__name__)


+def parse_resource_demands(resource_load_by_shape):
+    """Expand the resource_load_by_shape protobuf into demand bundles.
+
+    Each shape is repeated once per ready, infeasible and backlogged
+    request. All exceptions are caught and logged so this cannot
+    interfere with the utilization based autoscaler until it is stable.
+
+    Args:
+        resource_load_by_shape (pb2.gcs.ResourceLoad): The resource demands
+            in protobuf form or None.
+
+    Returns:
+        Tuple[List[ResourceDict], List[ResourceDict]]: Waiting bundles
+            (ready and feasible), then infeasible bundles.
+    """
+    waiting_bundles, infeasible_bundles = [], []
+    try:
+        for resource_demand_pb in list(
+                resource_load_by_shape.resource_demands):
+            request_shape = dict(resource_demand_pb.shape)
+            for _ in range(resource_demand_pb.num_ready_requests_queued):
+                waiting_bundles.append(request_shape)
+            for _ in range(resource_demand_pb.num_infeasible_requests_queued):
+                infeasible_bundles.append(request_shape)
+
+            # A shape is (logically) either infeasible or ready, never
+            # both, so its backlog goes to whichever list it belongs in.
+            if resource_demand_pb.num_infeasible_requests_queued > 0:
+                backlog_queue = infeasible_bundles
+            else:
+                backlog_queue = waiting_bundles
+            for _ in range(resource_demand_pb.backlog_size):
+                backlog_queue.append(request_shape)
+    except Exception:
+        logger.exception("Failed to parse resource demands.")
+    return waiting_bundles, infeasible_bundles
+
+
 class Monitor:
    """A monitor for Ray processes.

@@ -89,32 +127,6 @@ class Monitor:
        """
        self.primary_subscribe_client.psubscribe(pattern)

-    def parse_resource_demands(self, resource_load_by_shape):
-        """Handle the message.resource_load_by_shape protobuf for the demand
-        based autoscaling. Catch and log all exceptions so this doesn't
-        interfere with the utilization based autoscaler until we're confident
-        this is stable.
-
-        Args:
-            resource_load_by_shape (pb2.gcs.ResourceLoad): The resource demands
-                in protobuf form or None.
-        """
-        waiting_bundles, infeasible_bundles = [], []
-        try:
-            if self.autoscaler:
-                for resource_demand_pb in list(
-                        resource_load_by_shape.resource_demands):
-                    request_shape = dict(resource_demand_pb.shape)
-                    for _ in range(
-                            resource_demand_pb.num_ready_requests_queued):
-                        waiting_bundles.append(request_shape)
-                    for _ in range(
-                            resource_demand_pb.num_infeasible_requests_queued):
-                        infeasible_bundles.append(request_shape)
-        except Exception as e:
-            logger.exception(e)
-        return waiting_bundles, infeasible_bundles
-
    def xray_heartbeat_batch_handler(self, unused_channel, data):
        """Handle an xray heartbeat batch message from Redis."""

@@ -129,7 +141,7 @@ class Monitor:
            available_resources = dict(heartbeat_message.resources_available)

            waiting_bundles, infeasible_bundles = \
-                self.parse_resource_demands(message.resource_load_by_shape)
+                parse_resource_demands(message.resource_load_by_shape)

            # Update the load metrics for this raylet.
            client_id = ray.utils.binary_to_hex(heartbeat_message.client_id)