Batch heartbeats from node manager together in the monitor. (#3011)

2026-06-29 22:00:17 +08:00 · 2018-11-20 09:52:27 -08:00
parent abdc3b592e
commit b0bfd104f2
13 changed files with 135 additions and 37 deletions
@@ -109,6 +109,7 @@ AsyncGcsClient::AsyncGcsClient(const std::string &address, int port,
  client_table_.reset(new ClientTable({primary_context_}, this, client_id));
  error_table_.reset(new ErrorTable({primary_context_}, this));
  driver_table_.reset(new DriverTable({primary_context_}, this));
+  heartbeat_batch_table_.reset(new HeartbeatBatchTable({primary_context_}, this));
  // Tables below would be sharded.
  object_table_.reset(new ObjectTable(shard_contexts_, this, command_type));
  actor_table_.reset(new ActorTable(shard_contexts_, this));
@@ -214,6 +215,10 @@ ClassTable &AsyncGcsClient::class_table() { return *class_table_; }

 HeartbeatTable &AsyncGcsClient::heartbeat_table() { return *heartbeat_table_; }

+HeartbeatBatchTable &AsyncGcsClient::heartbeat_batch_table() {
+  return *heartbeat_batch_table_;
+}
+
 ErrorTable &AsyncGcsClient::error_table() { return *error_table_; }

 DriverTable &AsyncGcsClient::driver_table() { return *driver_table_; }
@@ -60,6 +60,7 @@ class RAY_EXPORT AsyncGcsClient {
  TaskLeaseTable &task_lease_table();
  ClientTable &client_table();
  HeartbeatTable &heartbeat_table();
+  HeartbeatBatchTable &heartbeat_batch_table();
  ErrorTable &error_table();
  DriverTable &driver_table();
  ProfileTable &profile_table();
@@ -89,6 +90,7 @@ class RAY_EXPORT AsyncGcsClient {
  std::unique_ptr<TaskReconstructionLog> task_reconstruction_log_;
  std::unique_ptr<TaskLeaseTable> task_lease_table_;
  std::unique_ptr<HeartbeatTable> heartbeat_table_;
+  std::unique_ptr<HeartbeatBatchTable> heartbeat_batch_table_;
  std::unique_ptr<ErrorTable> error_table_;
  std::unique_ptr<ProfileTable> profile_table_;
  std::unique_ptr<ClientTable> client_table_;
@@ -15,6 +15,7 @@ enum TablePrefix:int {
  FUNCTION,
  TASK_RECONSTRUCTION,
  HEARTBEAT,
+  HEARTBEAT_BATCH,
  ERROR_INFO,
  DRIVER,
  PROFILE,
@@ -30,6 +31,7 @@ enum TablePubsub:int {
  OBJECT,
  ACTOR,
  HEARTBEAT,
+  HEARTBEAT_BATCH,
  ERROR_INFO,
  TASK_LEASE,
  DRIVER,
@@ -262,6 +264,10 @@ table HeartbeatTableData {
  resource_load_capacity: [double];
 }

+table HeartbeatBatchTableData {
+  batch: [HeartbeatTableData];
+}
+
 // Data for a lease on task execution.
 table TaskLeaseData {
  // Node manager client ID.
@@ -479,6 +479,7 @@ template class Log<ActorID, ActorTableData>;
 template class Log<TaskID, TaskReconstructionData>;
 template class Table<TaskID, TaskLeaseData>;
 template class Table<ClientID, HeartbeatTableData>;
+template class Table<ClientID, HeartbeatBatchTableData>;
 template class Log<JobID, ErrorTableData>;
 template class Log<UniqueID, ClientTableData>;
 template class Log<JobID, DriverTableData>;
@@ -351,6 +351,17 @@ class HeartbeatTable : public Table<ClientID, HeartbeatTableData> {
  virtual ~HeartbeatTable() {}
 };

+class HeartbeatBatchTable : public Table<ClientID, HeartbeatBatchTableData> {
+ public:
+  HeartbeatBatchTable(const std::vector<std::shared_ptr<RedisContext>> &contexts,
+                      AsyncGcsClient *client)
+      : Table(contexts, client) {
+    pubsub_channel_ = TablePubsub::HEARTBEAT_BATCH;
+    prefix_ = TablePrefix::HEARTBEAT_BATCH;
+  }
+  virtual ~HeartbeatBatchTable() {}
+};
+
 class DriverTable : public Log<JobID, DriverTableData> {
 public:
  DriverTable(const std::vector<std::shared_ptr<RedisContext>> &contexts,
@@ -359,6 +370,7 @@ class DriverTable : public Log<JobID, DriverTableData> {
    pubsub_channel_ = TablePubsub::DRIVER;
    prefix_ = TablePrefix::DRIVER;
  };
+
  virtual ~DriverTable() {}

  /// Appends driver data to the driver table.
@@ -23,14 +23,16 @@ Monitor::Monitor(boost::asio::io_service &io_service, const std::string &redis_a
  RAY_CHECK_OK(gcs_client_.Attach(io_service));
 }

-void Monitor::HandleHeartbeat(const ClientID &client_id) {
+void Monitor::HandleHeartbeat(const ClientID &client_id,
+                              const HeartbeatTableDataT &heartbeat_data) {
  heartbeats_[client_id] = num_heartbeats_timeout_;
+  heartbeat_buffer_[client_id] = heartbeat_data;
 }

 void Monitor::Start() {
  const auto heartbeat_callback = [this](gcs::AsyncGcsClient *client, const ClientID &id,
                                         const HeartbeatTableDataT &heartbeat_data) {
-    HandleHeartbeat(id);
+    HandleHeartbeat(id, heartbeat_data);
  };
  RAY_CHECK_OK(gcs_client_.heartbeat_table().Subscribe(
      UniqueID::nil(), UniqueID::nil(), heartbeat_callback, nullptr, nullptr));
@@ -66,6 +68,18 @@ void Monitor::Tick() {
    }
  }

+  // Send any buffered heartbeats as a single publish.
+  if (!heartbeat_buffer_.empty()) {
+    auto batch = std::make_shared<HeartbeatBatchTableDataT>();
+    for (const auto &heartbeat : heartbeat_buffer_) {
+      batch->batch.push_back(std::unique_ptr<HeartbeatTableDataT>(
+          new HeartbeatTableDataT(heartbeat.second)));
+    }
+    RAY_CHECK_OK(gcs_client_.heartbeat_batch_table().Add(UniqueID::nil(), UniqueID::nil(),
+                                                         batch, nullptr));
+    heartbeat_buffer_.clear();
+  }
+
  auto heartbeat_period = boost::posix_time::milliseconds(
      RayConfig::instance().heartbeat_timeout_milliseconds());
  heartbeat_timer_.expires_from_now(heartbeat_period);
@@ -33,7 +33,9 @@ class Monitor {
  /// Handle a heartbeat from a Raylet.
  ///
  /// \param client_id The client ID of the Raylet that sent the heartbeat.
-  void HandleHeartbeat(const ClientID &client_id);
+  /// \param heartbeat_data The heartbeat sent by the client.
+  void HandleHeartbeat(const ClientID &client_id,
+                       const HeartbeatTableDataT &heartbeat_data);

 private:
  /// A client to the GCS, through which heartbeats are received.
@@ -47,6 +49,8 @@ class Monitor {
  std::unordered_map<ClientID, int64_t> heartbeats_;
  /// The Raylets that have been marked as dead in the client table.
  std::unordered_set<ClientID> dead_clients_;
+  /// The most recent heartbeat received from each node manager since the last tick.
+  std::unordered_map<ClientID, HeartbeatTableDataT> heartbeat_buffer_;
 };

 }  // namespace raylet
@@ -156,15 +156,16 @@ ray::Status NodeManager::RegisterGcs() {
  };
  gcs_client_->client_table().RegisterClientRemovedCallback(node_manager_client_removed);

-  // Subscribe to node manager heartbeats.
-  const auto heartbeat_added = [this](gcs::AsyncGcsClient *client, const ClientID &id,
-                                      const HeartbeatTableDataT &heartbeat_data) {
-    HeartbeatAdded(client, id, heartbeat_data);
+  // Subscribe to heartbeat batches from the monitor.
+  const auto &heartbeat_batch_added = [this](
+      gcs::AsyncGcsClient *client, const ClientID &id,
+      const HeartbeatBatchTableDataT &heartbeat_batch) {
+    HeartbeatBatchAdded(heartbeat_batch);
  };
-  RAY_RETURN_NOT_OK(gcs_client_->heartbeat_table().Subscribe(
-      UniqueID::nil(), UniqueID::nil(), heartbeat_added, nullptr,
+  RAY_RETURN_NOT_OK(gcs_client_->heartbeat_batch_table().Subscribe(
+      UniqueID::nil(), UniqueID::nil(), heartbeat_batch_added, nullptr,
      [](gcs::AsyncGcsClient *client) {
-        RAY_LOG(DEBUG) << "heartbeat table subscription done callback called.";
+        RAY_LOG(DEBUG) << "Heartbeat batch table subscription done.";
      }));

  // Subscribe to driver table updates.
@@ -399,14 +400,9 @@ void NodeManager::ClientRemoved(const ClientTableDataT &client_data) {
  remote_server_connections_.erase(client_id);
 }

-void NodeManager::HeartbeatAdded(gcs::AsyncGcsClient *client, const ClientID &client_id,
+void NodeManager::HeartbeatAdded(const ClientID &client_id,
                                 const HeartbeatTableDataT &heartbeat_data) {
  RAY_LOG(DEBUG) << "[HeartbeatAdded]: received heartbeat from client id " << client_id;
-  const ClientID &local_client_id = gcs_client_->client_table().GetLocalClientId();
-  if (client_id == local_client_id) {
-    // Skip heartbeats from self.
-    return;
-  }
  // Locate the client id in remote client table and update available resources based on
  // the received heartbeat information.
  auto it = cluster_resource_map_.find(client_id);
@@ -427,9 +423,8 @@ void NodeManager::HeartbeatAdded(gcs::AsyncGcsClient *client, const ClientID &cl
  remote_resources.SetAvailableResources(std::move(remote_available));
  // Extract the load information and save it locally.
  remote_resources.SetLoadResources(std::move(remote_load));
-
-  auto decision = scheduling_policy_.SpillOver(remote_resources);
  // Extract decision for this local scheduler.
+  auto decision = scheduling_policy_.SpillOver(remote_resources);
  std::unordered_set<TaskID> local_task_ids;
  for (const auto &task_id : decision) {
    // (See design_docs/task_states.rst for the state transition diagram.)
@@ -448,6 +443,19 @@ void NodeManager::HeartbeatAdded(gcs::AsyncGcsClient *client, const ClientID &cl
  }
 }

+void NodeManager::HeartbeatBatchAdded(const HeartbeatBatchTableDataT &heartbeat_batch) {
+  const ClientID &local_client_id = gcs_client_->client_table().GetLocalClientId();
+  // Process each heartbeat in the batch as if it had been received individually.
+  for (const auto &heartbeat_data : heartbeat_batch.batch) {
+    const ClientID &client_id = ClientID::from_binary(heartbeat_data->client_id);
+    if (client_id == local_client_id) {
+      // The batch may contain this node manager's own heartbeat; ignore it.
+      continue;
+    }
+    HeartbeatAdded(client_id, *heartbeat_data);
+  }
+}
+
 void NodeManager::HandleActorCreation(const ActorID &actor_id,
                                      const std::vector<ActorTableDataT> &data) {
  RAY_LOG(DEBUG) << "Actor creation notification received: " << actor_id;
@@ -127,12 +127,14 @@ class NodeManager {

  /// Handler for a heartbeat notification from the GCS.
  ///
-  /// \param client The GCS client.
  /// \param id The ID of the node manager that sent the heartbeat.
  /// \param data The heartbeat data including load information.
  /// \return Void.
-  void HeartbeatAdded(gcs::AsyncGcsClient *client, const ClientID &id,
-                      const HeartbeatTableDataT &data);
+  void HeartbeatAdded(const ClientID &id, const HeartbeatTableDataT &data);
+  /// Handler for a heartbeat batch notification from the GCS.
+  ///
+  /// \param heartbeat_batch The batch of heartbeats published by the monitor.
+  void HeartbeatBatchAdded(const HeartbeatBatchTableDataT &heartbeat_batch);

  /// Methods for task scheduling.

@@ -1,6 +1,8 @@
-#include "scheduling_policy.h"
-
+#include <algorithm>
 #include <chrono>
+#include <random>
+
+#include "scheduling_policy.h"

 #include "ray/util/logging.h"

@@ -123,21 +125,23 @@ std::vector<TaskID> SchedulingPolicy::SpillOver(

  ResourceSet new_load(remote_scheduling_resources.GetLoadResources());

-  // Check if we can accommodate an infeasible task.
+  // Check if we can accommodate infeasible tasks.
  for (const auto &task : scheduling_queue_.GetInfeasibleTasks()) {
    const auto &spec = task.GetTaskSpecification();
-    if (spec.GetRequiredPlacementResources().IsSubset(
-            remote_scheduling_resources.GetTotalResources())) {
+    const auto &placement_resources = spec.GetRequiredPlacementResources();
+    if (placement_resources.IsSubset(remote_scheduling_resources.GetTotalResources())) {
      decision.push_back(spec.TaskId());
      new_load.AddResources(spec.GetRequiredResources());
    }
  }

+  // Try to accommodate up to a single ready task.
  for (const auto &task : scheduling_queue_.GetReadyTasks()) {
    const auto &spec = task.GetTaskSpecification();
    if (!spec.IsActorTask()) {
+      // Make sure the node has enough available resources to prevent forwarding cycles.
      if (spec.GetRequiredPlacementResources().IsSubset(
-              remote_scheduling_resources.GetTotalResources())) {
+              remote_scheduling_resources.GetAvailableResources())) {
        decision.push_back(spec.TaskId());
        new_load.AddResources(spec.GetRequiredResources());
        break;
@@ -36,6 +36,13 @@ class SchedulingPolicy {
      std::unordered_map<ClientID, SchedulingResources> &cluster_resources,
      const ClientID &local_client_id);

+  /// \brief Select local tasks to spill over to a single remote node manager.
+  ///
+  /// \param remote_scheduling_resources The resource and load information of the
+  /// remote node manager. Infeasible tasks are checked against its total
+  /// resources; at most one ready, non-actor task is checked against its
+  /// available resources.
+  /// \return The IDs of the local tasks selected for spill-over to that node manager.
  std::vector<TaskID> SpillOver(SchedulingResources &remote_scheduling_resources) const;

  /// \brief SchedulingPolicy destructor.
@@ -145,8 +145,8 @@ const std::string ResourceSet::ToString() const {
  // Convert the first element to a string.
  if (it != resource_capacity_.end()) {
    return_string += "{" + it->first + "," + std::to_string(it->second) + "}";
+    it++;
  }
-  it++;

  // Add the remaining elements to the string (along with a comma).
  for (; it != resource_capacity_.end(); ++it) {
@@ -3,6 +3,7 @@ from __future__ import division
 from __future__ import print_function

 import collections
+import json
 import random
 import numpy as np
 import os
@@ -919,13 +920,16 @@ def test_actor_multiple_gpus_from_multiple_tasks(shutdown_only):
        num_local_schedulers=num_local_schedulers,
        redirect_output=True,
        num_cpus=(num_local_schedulers * [10 * num_gpus_per_scheduler]),
-        num_gpus=(num_local_schedulers * [num_gpus_per_scheduler]))
+        num_gpus=(num_local_schedulers * [num_gpus_per_scheduler]),
+        _internal_config=json.dumps({
+            "num_heartbeats_timeout": 1000
+        }))

    @ray.remote
-    def create_actors(n):
+    def create_actors(i, n):
        @ray.remote(num_gpus=1)
        class Actor(object):
-            def __init__(self):
+            def __init__(self, i, j):
                self.gpu_ids = ray.get_gpu_ids()

            def get_location_and_ids(self):
@@ -933,15 +937,44 @@ def test_actor_multiple_gpus_from_multiple_tasks(shutdown_only):
                    ray.worker.global_worker.plasma_client.store_socket_name),
                        tuple(self.gpu_ids))

-        # Create n actors.
-        for _ in range(n):
-            Actor.remote()
+            def sleep(self):
+                time.sleep(100)

-    ray.get([
-        create_actors.remote(num_gpus_per_scheduler)
-        for _ in range(num_local_schedulers)
+        # Create n actors.
+        actors = []
+        for j in range(n):
+            actors.append(Actor.remote(i, j))
+
+        locations = ray.get(
+            [actor.get_location_and_ids.remote() for actor in actors])
+
+        # Put each actor to sleep for a long time to prevent them from getting
+        # terminated.
+        for actor in actors:
+            actor.sleep.remote()
+
+        return locations
+
+    all_locations = ray.get([
+        create_actors.remote(i, num_gpus_per_scheduler)
+        for i in range(num_local_schedulers)
    ])

+    # Make sure every node hosts actors and that no two actors share a GPU.
+    node_names = {
+        location
+        for locations in all_locations for location, gpu_id in locations
+    }
+    assert len(node_names) == num_local_schedulers
+
+    # Keep track of which GPU IDs are being used for each location.
+    gpus_in_use = {node_name: [] for node_name in node_names}
+    for locations in all_locations:
+        for location, gpu_ids in locations:
+            gpus_in_use[location].extend(gpu_ids)
+    for node_name in node_names:
+        assert len(set(gpus_in_use[node_name])) == num_gpus_per_scheduler
+
    @ray.remote(num_gpus=1)
    class Actor(object):
        def __init__(self):