Skip to content

Commit

Permalink
Do not recommend increasing max_shards_per_node (#120458)
Browse files Browse the repository at this point in the history
Today, if the `shards_capacity` health indicator detects a problem, it
recommends increasing the limit. This goes against the manual's advice
not to increase these limits, and it also makes having a limit in the
first place rather pointless.

This commit improves the recommendation to suggest either adding nodes
or else reducing the shard count.
  • Loading branch information
DaveCTurner committed Jan 20, 2025
1 parent 7678eb3 commit de5be24
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 14 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/120458.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 120458
summary: Do not recommend increasing `max_shards_per_node`
area: Health
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ public enum ReferenceDocs {
JDK_LOCALE_DIFFERENCES,
ALLOCATION_EXPLAIN_MAX_RETRY,
SECURE_SETTINGS,
CLUSTER_SHARD_LIMIT,
// this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.ReferenceDocs;
import org.elasticsearch.common.TriFunction;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.features.FeatureService;
Expand Down Expand Up @@ -55,21 +56,18 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
"The cluster is running low on room to add new shards. Adding data to new indices is at risk";
private static final String INDEX_CREATION_RISK =
"The cluster is running low on room to add new shards. Adding data to new indices might soon fail.";
private static final String HELP_GUIDE = "https://ela.st/fix-shards-capacity";
private static final TriFunction<String, Setting<?>, String, Diagnosis> SHARD_MAX_CAPACITY_REACHED_FN = (
id,
setting,
indexType) -> new Diagnosis(
new Diagnosis.Definition(
NAME,
id,
"Elasticsearch is about to reach the maximum number of shards it can host, based on your current settings.",
"Increase the value of ["
+ setting.getKey()
+ "] cluster setting or remove "
"Elasticsearch is about to reach the maximum number of shards it can host as set by [" + setting.getKey() + "].",
"Increase the number of nodes in your cluster or remove some "
+ indexType
+ " indices to clear up resources.",
HELP_GUIDE
+ " indices to reduce the number of shards in the cluster.",
ReferenceDocs.CLUSTER_SHARD_LIMIT.toString()
),
null
);
Expand All @@ -83,12 +81,12 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
new HealthIndicatorImpact(NAME, "creation_of_new_indices_at_risk", 2, INDEX_CREATION_RISK, List.of(ImpactArea.INGEST))
);
static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_DATA_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
"increase_max_shards_per_node",
"decrease_shards_per_non_frozen_node",
ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE,
"data"
"non-frozen"
);
static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
"increase_max_shards_per_node_frozen",
"decrease_shards_per_frozen_node",
ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN,
"frozen"
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ FORMING_SINGLE_NODE_CLUSTERS modules-discover
JDK_LOCALE_DIFFERENCES mapping-date-format.html#custom-date-format-locales
ALLOCATION_EXPLAIN_MAX_RETRY cluster-allocation-explain.html#maximum-number-of-retries-exceeded
SECURE_SETTINGS secure-settings.html
CLUSTER_SHARD_LIMIT misc-cluster-settings.html#cluster-shard-limit
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@
import static org.elasticsearch.indices.ShardLimitValidator.FROZEN_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.INDEX_SETTING_SHARD_LIMIT_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.NORMAL_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN;
import static org.hamcrest.Matchers.allOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
Expand Down Expand Up @@ -93,7 +97,6 @@ public void setUp() throws Exception {
.build();

clusterService = ClusterServiceUtils.createClusterService(threadPool);

featureService = Mockito.mock(FeatureService.class);
Mockito.when(featureService.clusterHasFeature(any(), any())).thenReturn(true);
}
Expand Down Expand Up @@ -159,6 +162,30 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException {
);
}

public void testDiagnoses() {
assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().indicatorName());
assertEquals("decrease_shards_per_non_frozen_node", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().id());
assertThat(
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().cause(),
allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey()))
);
assertThat(
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().action(),
allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some non-frozen indices"))
);

assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().indicatorName());
assertEquals("decrease_shards_per_frozen_node", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().id());
assertThat(
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().cause(),
allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey()))
);
assertThat(
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().action(),
allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some frozen indices"))
);
}

public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOException {
{
// Only data_nodes does not have enough space
Expand Down Expand Up @@ -378,11 +405,11 @@ public void testCalculateMethods() {
public void testMappedFieldsForTelemetry() {
assertEquals(ShardsCapacityHealthIndicatorService.NAME, "shards_capacity");
assertEquals(
"elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node",
"elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_non_frozen_node",
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().getUniqueId()
);
assertEquals(
"elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node_frozen",
"elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_frozen_node",
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().getUniqueId()
);
}
Expand Down Expand Up @@ -442,7 +469,7 @@ private ClusterState createClusterState(
var metadata = Metadata.builder()
.persistentSettings(
Settings.builder()
.put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
.put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey(), maxShardsPerNodeFrozen)
.build()
);
Expand Down

0 comments on commit de5be24

Please sign in to comment.