mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 18:06:25 +08:00
Rename timeout to period with heartbeat interval (#13872)
This commit is contained in:
@@ -13,7 +13,7 @@ cdef extern from "ray/common/ray_config.h" nogil:
|
||||
|
||||
int64_t handler_warning_timeout_ms() const
|
||||
|
||||
int64_t raylet_heartbeat_timeout_milliseconds() const
|
||||
int64_t raylet_heartbeat_period_milliseconds() const
|
||||
|
||||
int64_t debug_dump_period_milliseconds() const
|
||||
|
||||
|
||||
@@ -10,8 +10,8 @@ cdef class Config:
|
||||
return RayConfig.instance().handler_warning_timeout_ms()
|
||||
|
||||
@staticmethod
|
||||
def raylet_heartbeat_timeout_milliseconds():
|
||||
return RayConfig.instance().raylet_heartbeat_timeout_milliseconds()
|
||||
def raylet_heartbeat_period_milliseconds():
|
||||
return RayConfig.instance().raylet_heartbeat_period_milliseconds()
|
||||
|
||||
@staticmethod
|
||||
def debug_dump_period_milliseconds():
|
||||
|
||||
@@ -275,7 +275,7 @@ def test_named_actor_max_task_retries(ray_init_with_task_retry_delay):
|
||||
def test_actor_restart_on_node_failure(ray_start_cluster):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
"object_timeout_milliseconds": 1000,
|
||||
"task_retry_delay_ms": 100,
|
||||
}
|
||||
|
||||
@@ -990,7 +990,7 @@ def test_raylet_crash_when_get(ray_start_regular):
|
||||
def test_connect_with_disconnected_node(shutdown_only):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 50,
|
||||
"raylet_heartbeat_timeout_milliseconds": 10,
|
||||
"raylet_heartbeat_period_milliseconds": 10,
|
||||
}
|
||||
cluster = Cluster()
|
||||
cluster.add_node(num_cpus=0, _system_config=config)
|
||||
@@ -1202,7 +1202,7 @@ def test_serialized_id(ray_start_cluster):
|
||||
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
}
|
||||
cluster = Cluster()
|
||||
# Head node with no resources.
|
||||
|
||||
@@ -17,7 +17,7 @@ SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
|
||||
def test_cached_object(ray_start_cluster):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
cluster = ray_start_cluster
|
||||
@@ -59,7 +59,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
|
||||
reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
@@ -118,7 +118,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
|
||||
def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
@@ -167,7 +167,7 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
|
||||
def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
@@ -224,7 +224,7 @@ def test_basic_reconstruction_actor_task(ray_start_cluster,
|
||||
reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
@@ -297,7 +297,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster,
|
||||
reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
@@ -377,7 +377,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster,
|
||||
def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
@@ -442,7 +442,7 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
|
||||
def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
"object_timeout_milliseconds": 200,
|
||||
}
|
||||
# Workaround to reset the config to the default value.
|
||||
@@ -494,7 +494,7 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
|
||||
def test_reconstruction_stress(ray_start_cluster):
|
||||
config = {
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"raylet_heartbeat_period_milliseconds": 100,
|
||||
"max_direct_call_object_size": 100,
|
||||
"task_retry_delay_ms": 100,
|
||||
"object_timeout_milliseconds": 200,
|
||||
|
||||
@@ -35,7 +35,7 @@ RAY_CONFIG(int64_t, ray_cookie, 0x5241590000000000)
|
||||
RAY_CONFIG(int64_t, handler_warning_timeout_ms, 1000)
|
||||
|
||||
/// The duration between heartbeats sent by the raylets.
|
||||
RAY_CONFIG(int64_t, raylet_heartbeat_timeout_milliseconds, 100)
|
||||
RAY_CONFIG(int64_t, raylet_heartbeat_period_milliseconds, 100)
|
||||
/// If a component has not sent a heartbeat in the last num_heartbeats_timeout
|
||||
/// heartbeat intervals, the raylet monitor process will report
|
||||
/// it as dead to the db_client table.
|
||||
@@ -93,7 +93,7 @@ RAY_CONFIG(bool, record_ref_creation_sites, true)
|
||||
/// serialized, then either passed as an argument or returned from a task.
|
||||
/// NOTE(swang): The timer is checked by the raylet during every heartbeat, so
|
||||
/// this should be set to a value larger than
|
||||
/// raylet_heartbeat_timeout_milliseconds.
|
||||
/// raylet_heartbeat_period_milliseconds.
|
||||
RAY_CONFIG(int64_t, free_objects_period_milliseconds, 1000)
|
||||
|
||||
/// If object_pinning_enabled is on, then objects that have been unpinned are
|
||||
|
||||
@@ -103,7 +103,7 @@ void GcsHeartbeatManager::DetectDeadNodes() {
|
||||
|
||||
void GcsHeartbeatManager::ScheduleTick() {
|
||||
auto heartbeat_period = boost::posix_time::milliseconds(
|
||||
RayConfig::instance().raylet_heartbeat_timeout_milliseconds());
|
||||
RayConfig::instance().raylet_heartbeat_period_milliseconds());
|
||||
detect_timer_.expires_from_now(heartbeat_period);
|
||||
detect_timer_.async_wait([this](const boost::system::error_code &error) {
|
||||
if (error == boost::asio::error::operation_aborted) {
|
||||
|
||||
@@ -196,7 +196,7 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
|
||||
node_manager_config.heartbeat_period_ms =
|
||||
RayConfig::instance().raylet_heartbeat_timeout_milliseconds();
|
||||
RayConfig::instance().raylet_heartbeat_period_milliseconds();
|
||||
node_manager_config.report_resources_period_ms =
|
||||
RayConfig::instance().raylet_report_resources_period_milliseconds();
|
||||
node_manager_config.debug_dump_period_ms =
|
||||
|
||||
@@ -410,7 +410,7 @@ void NodeManager::Heartbeat() {
|
||||
uint64_t now_ms = current_time_ms();
|
||||
uint64_t interval = now_ms - last_heartbeat_at_ms_;
|
||||
if (interval > RayConfig::instance().num_heartbeats_warning() *
|
||||
RayConfig::instance().raylet_heartbeat_timeout_milliseconds()) {
|
||||
RayConfig::instance().raylet_heartbeat_period_milliseconds()) {
|
||||
RAY_LOG(WARNING)
|
||||
<< "Last heartbeat was sent " << interval
|
||||
<< " ms ago. There might be resource pressure on this node. If heartbeat keeps "
|
||||
@@ -723,7 +723,7 @@ void NodeManager::NodeRemoved(const NodeID &node_id) {
|
||||
<< "Exiting because this node manager has mistakenly been marked dead by the "
|
||||
<< "monitor: GCS didn't receive heartbeats within timeout "
|
||||
<< RayConfig::instance().num_heartbeats_timeout() *
|
||||
RayConfig::instance().raylet_heartbeat_timeout_milliseconds()
|
||||
RayConfig::instance().raylet_heartbeat_period_milliseconds()
|
||||
<< " ms. This is likely since the machine or raylet became overloaded.";
|
||||
|
||||
// Below, when we remove node_id from all of these data structures, we could
|
||||
|
||||
Reference in New Issue
Block a user