Rename heartbeat interval config from "timeout" to "period" (#13872)

This commit is contained in:
Tao Wang
2021-02-04 10:37:28 +08:00
committed by GitHub
parent e0d9c8f0a8
commit 44aa9c173f
9 changed files with 21 additions and 21 deletions
+1 -1
View File
@@ -13,7 +13,7 @@ cdef extern from "ray/common/ray_config.h" nogil:
int64_t handler_warning_timeout_ms() const
int64_t raylet_heartbeat_timeout_milliseconds() const
int64_t raylet_heartbeat_period_milliseconds() const
int64_t debug_dump_period_milliseconds() const
+2 -2
View File
@@ -10,8 +10,8 @@ cdef class Config:
return RayConfig.instance().handler_warning_timeout_ms()
@staticmethod
def raylet_heartbeat_timeout_milliseconds():
return RayConfig.instance().raylet_heartbeat_timeout_milliseconds()
def raylet_heartbeat_period_milliseconds():
return RayConfig.instance().raylet_heartbeat_period_milliseconds()
@staticmethod
def debug_dump_period_milliseconds():
+1 -1
View File
@@ -275,7 +275,7 @@ def test_named_actor_max_task_retries(ray_init_with_task_retry_delay):
def test_actor_restart_on_node_failure(ray_start_cluster):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
"object_timeout_milliseconds": 1000,
"task_retry_delay_ms": 100,
}
+2 -2
View File
@@ -990,7 +990,7 @@ def test_raylet_crash_when_get(ray_start_regular):
def test_connect_with_disconnected_node(shutdown_only):
config = {
"num_heartbeats_timeout": 50,
"raylet_heartbeat_timeout_milliseconds": 10,
"raylet_heartbeat_period_milliseconds": 10,
}
cluster = Cluster()
cluster.add_node(num_cpus=0, _system_config=config)
@@ -1202,7 +1202,7 @@ def test_serialized_id(ray_start_cluster):
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
}
cluster = Cluster()
# Head node with no resources.
+9 -9
View File
@@ -17,7 +17,7 @@ SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
def test_cached_object(ray_start_cluster):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
"object_timeout_milliseconds": 200,
}
cluster = ray_start_cluster
@@ -59,7 +59,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
@@ -118,7 +118,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
@@ -167,7 +167,7 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
@@ -224,7 +224,7 @@ def test_basic_reconstruction_actor_task(ray_start_cluster,
reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
@@ -297,7 +297,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster,
reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
@@ -377,7 +377,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster,
def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
@@ -442,7 +442,7 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
"object_timeout_milliseconds": 200,
}
# Workaround to reset the config to the default value.
@@ -494,7 +494,7 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
def test_reconstruction_stress(ray_start_cluster):
config = {
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
"raylet_heartbeat_period_milliseconds": 100,
"max_direct_call_object_size": 100,
"task_retry_delay_ms": 100,
"object_timeout_milliseconds": 200,
+2 -2
View File
@@ -35,7 +35,7 @@ RAY_CONFIG(int64_t, ray_cookie, 0x5241590000000000)
RAY_CONFIG(int64_t, handler_warning_timeout_ms, 1000)
/// The duration between heartbeats sent by the raylets.
RAY_CONFIG(int64_t, raylet_heartbeat_timeout_milliseconds, 100)
RAY_CONFIG(int64_t, raylet_heartbeat_period_milliseconds, 100)
/// If a component has not sent a heartbeat in the last num_heartbeats_timeout
/// heartbeat intervals, the raylet monitor process will report
/// it as dead to the db_client table.
@@ -93,7 +93,7 @@ RAY_CONFIG(bool, record_ref_creation_sites, true)
/// serialized, then either passed as an argument or returned from a task.
/// NOTE(swang): The timer is checked by the raylet during every heartbeat, so
/// this should be set to a value larger than
/// raylet_heartbeat_timeout_milliseconds.
/// raylet_heartbeat_period_milliseconds.
RAY_CONFIG(int64_t, free_objects_period_milliseconds, 1000)
/// If object_pinning_enabled is on, then objects that have been unpinned are
@@ -103,7 +103,7 @@ void GcsHeartbeatManager::DetectDeadNodes() {
void GcsHeartbeatManager::ScheduleTick() {
auto heartbeat_period = boost::posix_time::milliseconds(
RayConfig::instance().raylet_heartbeat_timeout_milliseconds());
RayConfig::instance().raylet_heartbeat_period_milliseconds());
detect_timer_.expires_from_now(heartbeat_period);
detect_timer_.async_wait([this](const boost::system::error_code &error) {
if (error == boost::asio::error::operation_aborted) {
+1 -1
View File
@@ -196,7 +196,7 @@ int main(int argc, char *argv[]) {
}
node_manager_config.heartbeat_period_ms =
RayConfig::instance().raylet_heartbeat_timeout_milliseconds();
RayConfig::instance().raylet_heartbeat_period_milliseconds();
node_manager_config.report_resources_period_ms =
RayConfig::instance().raylet_report_resources_period_milliseconds();
node_manager_config.debug_dump_period_ms =
+2 -2
View File
@@ -410,7 +410,7 @@ void NodeManager::Heartbeat() {
uint64_t now_ms = current_time_ms();
uint64_t interval = now_ms - last_heartbeat_at_ms_;
if (interval > RayConfig::instance().num_heartbeats_warning() *
RayConfig::instance().raylet_heartbeat_timeout_milliseconds()) {
RayConfig::instance().raylet_heartbeat_period_milliseconds()) {
RAY_LOG(WARNING)
<< "Last heartbeat was sent " << interval
<< " ms ago. There might be resource pressure on this node. If heartbeat keeps "
@@ -723,7 +723,7 @@ void NodeManager::NodeRemoved(const NodeID &node_id) {
<< "Exiting because this node manager has mistakenly been marked dead by the "
<< "monitor: GCS didn't receive heartbeats within timeout "
<< RayConfig::instance().num_heartbeats_timeout() *
RayConfig::instance().raylet_heartbeat_timeout_milliseconds()
RayConfig::instance().raylet_heartbeat_period_milliseconds()
<< " ms. This is likely since the machine or raylet became overloaded.";
// Below, when we remove node_id from all of these data structures, we could