From 44aa9c173f9cadd29d9e642ba1c1b9f07f9835ec Mon Sep 17 00:00:00 2001 From: Tao Wang Date: Thu, 4 Feb 2021 10:37:28 +0800 Subject: [PATCH] Rename timeout to period with heartbeat interval (#13872) --- python/ray/includes/ray_config.pxd | 2 +- python/ray/includes/ray_config.pxi | 4 ++-- python/ray/tests/test_actor_failures.py | 2 +- python/ray/tests/test_failure.py | 4 ++-- python/ray/tests/test_reconstruction.py | 18 +++++++++--------- src/ray/common/ray_config_def.h | 4 ++-- .../gcs/gcs_server/gcs_heartbeat_manager.cc | 2 +- src/ray/raylet/main.cc | 2 +- src/ray/raylet/node_manager.cc | 4 ++-- 9 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/ray/includes/ray_config.pxd b/python/ray/includes/ray_config.pxd index 079f30690..309132cf7 100644 --- a/python/ray/includes/ray_config.pxd +++ b/python/ray/includes/ray_config.pxd @@ -13,7 +13,7 @@ cdef extern from "ray/common/ray_config.h" nogil: int64_t handler_warning_timeout_ms() const - int64_t raylet_heartbeat_timeout_milliseconds() const + int64_t raylet_heartbeat_period_milliseconds() const int64_t debug_dump_period_milliseconds() const diff --git a/python/ray/includes/ray_config.pxi b/python/ray/includes/ray_config.pxi index 96a2a14f2..d6c28805c 100644 --- a/python/ray/includes/ray_config.pxi +++ b/python/ray/includes/ray_config.pxi @@ -10,8 +10,8 @@ cdef class Config: return RayConfig.instance().handler_warning_timeout_ms() @staticmethod - def raylet_heartbeat_timeout_milliseconds(): - return RayConfig.instance().raylet_heartbeat_timeout_milliseconds() + def raylet_heartbeat_period_milliseconds(): + return RayConfig.instance().raylet_heartbeat_period_milliseconds() @staticmethod def debug_dump_period_milliseconds(): diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py index 227fb48d2..4e2e19f1b 100644 --- a/python/ray/tests/test_actor_failures.py +++ b/python/ray/tests/test_actor_failures.py @@ -275,7 +275,7 @@ def test_named_actor_max_task_retries(ray_init_with_task_retry_delay): def test_actor_restart_on_node_failure(ray_start_cluster): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, "object_timeout_milliseconds": 1000, "task_retry_delay_ms": 100, } diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index abd82011d..f6aad1fa3 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -990,7 +990,7 @@ def test_raylet_crash_when_get(ray_start_regular): def test_connect_with_disconnected_node(shutdown_only): config = { "num_heartbeats_timeout": 50, - "raylet_heartbeat_timeout_milliseconds": 10, + "raylet_heartbeat_period_milliseconds": 10, } cluster = Cluster() cluster.add_node(num_cpus=0, _system_config=config) @@ -1202,7 +1202,7 @@ def test_serialized_id(ray_start_cluster): def test_fate_sharing(ray_start_cluster, use_actors, node_failure): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, } cluster = Cluster() # Head node with no resources. diff --git a/python/ray/tests/test_reconstruction.py b/python/ray/tests/test_reconstruction.py index 1cd1f133a..35d00a9b8 100644 --- a/python/ray/tests/test_reconstruction.py +++ b/python/ray/tests/test_reconstruction.py @@ -17,7 +17,7 @@ SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM def test_cached_object(ray_start_cluster): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, "object_timeout_milliseconds": 200, } cluster = ray_start_cluster @@ -59,7 +59,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -118,7 +118,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster, def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -167,7 +167,7 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled): def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -224,7 +224,7 @@ def test_basic_reconstruction_actor_task(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -297,7 +297,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -377,7 +377,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster, def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -442,7 +442,7 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled): def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, "object_timeout_milliseconds": 200, } # Workaround to reset the config to the default value. @@ -494,7 +494,7 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled): def test_reconstruction_stress(ray_start_cluster): config = { "num_heartbeats_timeout": 10, - "raylet_heartbeat_timeout_milliseconds": 100, + "raylet_heartbeat_period_milliseconds": 100, "max_direct_call_object_size": 100, "task_retry_delay_ms": 100, "object_timeout_milliseconds": 200, diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index cd6bd84ce..f109bbd59 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -35,7 +35,7 @@ RAY_CONFIG(int64_t, ray_cookie, 0x5241590000000000) RAY_CONFIG(int64_t, handler_warning_timeout_ms, 1000) /// The duration between heartbeats sent by the raylets. -RAY_CONFIG(int64_t, raylet_heartbeat_timeout_milliseconds, 100) +RAY_CONFIG(int64_t, raylet_heartbeat_period_milliseconds, 100) /// If a component has not sent a heartbeat in the last num_heartbeats_timeout /// heartbeat intervals, the raylet monitor process will report /// it as dead to the db_client table. @@ -93,7 +93,7 @@ RAY_CONFIG(bool, record_ref_creation_sites, true) /// serialized, then either passed as an argument or returned from a task. /// NOTE(swang): The timer is checked by the raylet during every heartbeat, so /// this should be set to a value larger than -/// raylet_heartbeat_timeout_milliseconds. +/// raylet_heartbeat_period_milliseconds. RAY_CONFIG(int64_t, free_objects_period_milliseconds, 1000) /// If object_pinning_enabled is on, then objects that have been unpinned are diff --git a/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc b/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc index b6dd56945..5991c20a8 100644 --- a/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_heartbeat_manager.cc @@ -103,7 +103,7 @@ void GcsHeartbeatManager::DetectDeadNodes() { void GcsHeartbeatManager::ScheduleTick() { auto heartbeat_period = boost::posix_time::milliseconds( - RayConfig::instance().raylet_heartbeat_timeout_milliseconds()); + RayConfig::instance().raylet_heartbeat_period_milliseconds()); detect_timer_.expires_from_now(heartbeat_period); detect_timer_.async_wait([this](const boost::system::error_code &error) { if (error == boost::asio::error::operation_aborted) { diff --git a/src/ray/raylet/main.cc b/src/ray/raylet/main.cc index ba6a53ee4..1d47f23b3 100644 --- a/src/ray/raylet/main.cc +++ b/src/ray/raylet/main.cc @@ -196,7 +196,7 @@ int main(int argc, char *argv[]) { } node_manager_config.heartbeat_period_ms = - RayConfig::instance().raylet_heartbeat_timeout_milliseconds(); + RayConfig::instance().raylet_heartbeat_period_milliseconds(); node_manager_config.report_resources_period_ms = RayConfig::instance().raylet_report_resources_period_milliseconds(); node_manager_config.debug_dump_period_ms = diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 2c20bab40..d0e3be78b 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -410,7 +410,7 @@ void NodeManager::Heartbeat() { uint64_t now_ms = current_time_ms(); uint64_t interval = now_ms - last_heartbeat_at_ms_; if (interval > RayConfig::instance().num_heartbeats_warning() * - RayConfig::instance().raylet_heartbeat_timeout_milliseconds()) { + RayConfig::instance().raylet_heartbeat_period_milliseconds()) { RAY_LOG(WARNING) << "Last heartbeat was sent " << interval << " ms ago. There might be resource pressure on this node. If heartbeat keeps " @@ -723,7 +723,7 @@ void NodeManager::NodeRemoved(const NodeID &node_id) { << "Exiting because this node manager has mistakenly been marked dead by the " << "monitor: GCS didn't receive heartbeats within timeout " << RayConfig::instance().num_heartbeats_timeout() * - RayConfig::instance().raylet_heartbeat_timeout_milliseconds() + RayConfig::instance().raylet_heartbeat_period_milliseconds() << " ms. This is likely since the machine or raylet became overloaded."; // Below, when we remove node_id from all of these data structures, we could