[Autoscaler] New output log format (#12772)

2026-07-02 22:47:18 +08:00 · 2020-12-23 12:02:55 -08:00
parent d95c8b8a41
commit 8df94e33e0
11 changed files with 907 additions and 154 deletions
@@ -15,11 +15,14 @@ from ray.autoscaler._private.constants import AUTOSCALER_UPDATE_INTERVAL_S
 from ray.autoscaler._private.load_metrics import LoadMetrics
 from ray.autoscaler._private.constants import \
    AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE
+from ray.autoscaler._private.util import DEBUG_AUTOSCALING_STATUS
 import ray.gcs_utils
 import ray.utils
 import ray.ray_constants as ray_constants
 from ray.ray_logging import setup_component_logger
 from ray._raylet import GlobalStateAccessor
+from ray.experimental.internal_kv import _internal_kv_put, \
+    _internal_kv_initialized

 import redis

@@ -65,11 +68,7 @@ def parse_resource_demands(resource_load_by_shape):
    except Exception:
        logger.exception("Failed to parse resource demands.")

-    # Bound the total number of bundles to 2xMAX_RESOURCE_DEMAND_VECTOR_SIZE.
-    # This guarantees the resource demand scheduler bin packing algorithm takes
-    # a reasonable amount of time to run.
-    return waiting_bundles[:AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE], \
-        infeasible_bundles[:AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE]
+    return waiting_bundles, infeasible_bundles


 class Monitor:
@@ -184,14 +183,8 @@ class Monitor:
            data: a resource request as JSON, e.g. {"CPU": 1}
        """

-        if not self.autoscaler:
-            return
-
-        try:
-            self.autoscaler.request_resources(json.loads(data))
-        except Exception:
-            # We don't want this to kill the monitor.
-            traceback.print_exc()
+        resource_request = json.loads(data)
+        self.load_metrics.set_resource_requests(resource_request)

    def process_messages(self, max_messages=10000):
        """Process all messages ready in the subscription channels.
@@ -257,12 +250,23 @@ class Monitor:

        # Handle messages from the subscription channels.
        while True:
+            self.update_raylet_map()
+            self.update_load_metrics()
+            status = {
+                "load_metrics_report": self.load_metrics.summary()._asdict()
+            }
+
            # Process autoscaling actions
            if self.autoscaler:
                # Only used to update the load metrics for the autoscaler.
-                self.update_raylet_map()
-                self.update_load_metrics()
                self.autoscaler.update()
+                status[
+                    "autoscaler_report"] = self.autoscaler.summary()._asdict()
+
+            as_json = json.dumps(status)
+            if _internal_kv_initialized():
+                _internal_kv_put(
+                    DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True)

            # Process a round of messages.
            self.process_messages()