Revert "Update Node Monitor Grace Period default duration to 50s"

benluddy · benluddy · commit 2712f53cb232 · 2025-07-24T13:41:47.000-04:00
This reverts commit f8bf6b9.
diff --git a/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go b/cmd/kube-controller-manager/app/options/nodelifecyclecontroller.go
@@ -41,8 +41,7 @@ func (o *NodeLifecycleControllerOptions) AddFlags(fs *pflag.FlagSet) {
 	fs.DurationVar(&o.NodeMonitorGracePeriod.Duration, "node-monitor-grace-period", o.NodeMonitorGracePeriod.Duration,
 		"Amount of time which we allow running Node to be unresponsive before marking it unhealthy. "+
 			"Must be N times more than kubelet's nodeStatusUpdateFrequency, "+
-			"where N means number of retries allowed for kubelet to post node status. "+
-			"This value should also be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS")
+			"where N means number of retries allowed for kubelet to post node status.")
 	fs.Float32Var(&o.NodeEvictionRate, "node-eviction-rate", 0.1, "Number of nodes per second on which pods are deleted in case of node failure when a zone is healthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters.")
 	fs.Float32Var(&o.SecondaryNodeEvictionRate, "secondary-node-eviction-rate", 0.01, "Number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters. This value is implicitly overridden to 0 if the cluster size is smaller than --large-cluster-size-threshold.")
 	fs.Int32Var(&o.LargeClusterSizeThreshold, "large-cluster-size-threshold", 50, fmt.Sprintf("Number of nodes from which %s treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller. Notice: If nodes reside in multiple zones, this threshold will be considered as zone node size threshold for each zone to determine node eviction rate independently.", names.NodeLifecycleController))
diff --git a/pkg/controller/nodelifecycle/config/types.go b/pkg/controller/nodelifecycle/config/types.go
@@ -32,8 +32,7 @@ type NodeLifecycleControllerConfiguration struct {
 	// NodeMonitorGracePeriod is the amount of time which we allow a running node to be
 	// unresponsive before marking it unhealthy. Must be N times more than kubelet's
 	// nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet
-	// to post node status. This value should also be greater than the sum of
-	// HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS.
+	// to post node status.
 	NodeMonitorGracePeriod metav1.Duration
 	// secondaryNodeEvictionRate is implicitly overridden to 0 for clusters smaller than or equal to largeClusterSizeThreshold
 	LargeClusterSizeThreshold int32
diff --git a/pkg/controller/nodelifecycle/config/v1alpha1/defaults.go b/pkg/controller/nodelifecycle/config/v1alpha1/defaults.go
@@ -37,13 +37,8 @@ func RecommendedDefaultNodeLifecycleControllerConfiguration(obj *kubectrlmgrconf
 	if obj.PodEvictionTimeout == zero {
 		obj.PodEvictionTimeout = metav1.Duration{Duration: 5 * time.Minute}
 	}
-	// NodeMonitorGracePeriod is set to a default value of 50 seconds.
-	// This value should be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS (30s)
-	// and HTTP2_READ_IDLE_TIMEOUT_SECONDS (15s) from the http2 health check
-	// to ensure that the server has adequate time to handle slow or idle connections
-	// properly before marking a node as unhealthy.
 	if obj.NodeMonitorGracePeriod == zero {
-		obj.NodeMonitorGracePeriod = metav1.Duration{Duration: 50 * time.Second}
+		obj.NodeMonitorGracePeriod = metav1.Duration{Duration: 40 * time.Second}
 	}
 	if obj.NodeStartupGracePeriod == zero {
 		obj.NodeStartupGracePeriod = metav1.Duration{Duration: 60 * time.Second}
diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go
@@ -282,11 +282,7 @@ type Controller struct {
 	//    be less than the node health signal update frequency, since there will
 	//    only be fresh values from Kubelet at an interval of node health signal
 	//    update frequency.
-	// 2. nodeMonitorGracePeriod should be greater than the sum of HTTP2_PING_TIMEOUT_SECONDS (30s)
-	// 	  and HTTP2_READ_IDLE_TIMEOUT_SECONDS (15s) from the http2 health check
-	// 	  to ensure that the server has adequate time to handle slow or idle connections
-	//    properly before marking a node as unhealthy.
-	// 3. nodeMonitorGracePeriod can't be too large for user experience - larger
+	// 2. nodeMonitorGracePeriod can't be too large for user experience - larger
 	//    value takes longer for user to see up-to-date node health.
 	nodeMonitorGracePeriod time.Duration
 
diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go
@@ -52,7 +52,7 @@ import (
 )
 
 const (
-	testNodeMonitorGracePeriod = 50 * time.Second
+	testNodeMonitorGracePeriod = 40 * time.Second
 	testNodeStartupGracePeriod = 60 * time.Second
 	testNodeMonitorPeriod      = 5 * time.Second
 	testRateLimiterQPS         = float32(100000)
diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go
diff --git a/staging/src/k8s.io/kube-controller-manager/config/v1alpha1/types.go b/staging/src/k8s.io/kube-controller-manager/config/v1alpha1/types.go
@@ -403,8 +403,7 @@ type NodeLifecycleControllerConfiguration struct {
 	// nodeMonitorGracePeriod is the amount of time which we allow a running node to be
 	// unresponsive before marking it unhealthy. Must be N times more than kubelet's
 	// nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet
-	// to post node status. This value should also be greater than the sum of
-	// HTTP2_PING_TIMEOUT_SECONDS and HTTP2_READ_IDLE_TIMEOUT_SECONDS.
+	// to post node status.
 	NodeMonitorGracePeriod metav1.Duration
 	// podEvictionTimeout is the grace period for deleting pods on failed nodes.
 	PodEvictionTimeout metav1.Duration

Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,7 @@ import (`
`52`	`52`	`)`
`53`	`53`
`54`	`54`	`const (`
`55`		`- testNodeMonitorGracePeriod = 50 * time.Second`
	`55`	`+ testNodeMonitorGracePeriod = 40 * time.Second`
`56`	`56`	`testNodeStartupGracePeriod = 60 * time.Second`
`57`	`57`	`testNodeMonitorPeriod = 5 * time.Second`
`58`	`58`	`testRateLimiterQPS = float32(100000)`