Do not recommend increasing max_shards_per_node (#120458)

Today if the `shards_capacity` health indicator detects a problem then it recommends increasing the limit, which goes against the advice in the manual about not increasing these limits and also makes it rather pointless having a limit in the first place. This commit improves the recommendation to suggest either adding nodes or else reducing the shard count.
elastic · Jan 20, 2025 · 9c0709f · 9c0709f
1 parent d1a2e0d
commit 9c0709f
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 58 deletions.
diff --git a/docs/changelog/120458.yaml b/docs/changelog/120458.yaml
@@ -0,0 +1,5 @@
+pr: 120458
+summary: Do not recommend increasing `max_shards_per_node`
+area: Health
+type: bug
+issues: []
diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
@@ -83,6 +83,7 @@ public enum ReferenceDocs {
     ALLOCATION_EXPLAIN_NO_COPIES,
     ALLOCATION_EXPLAIN_MAX_RETRY,
     SECURE_SETTINGS,
+    CLUSTER_SHARD_LIMIT,
     // this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
     ;
 

diff --git a/server/src/main/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorService.java b/server/src/main/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorService.java
@@ -12,9 +12,9 @@
 import org.elasticsearch.cluster.metadata.Metadata;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.TriFunction;
 import org.elasticsearch.common.settings.Setting;
-import org.elasticsearch.features.FeatureService;
 import org.elasticsearch.health.Diagnosis;
 import org.elasticsearch.health.HealthIndicatorDetails;
 import org.elasticsearch.health.HealthIndicatorImpact;
@@ -54,21 +54,18 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
         "The cluster is running low on room to add new shards. Adding data to new indices is at risk";
     private static final String INDEX_CREATION_RISK =
         "The cluster is running low on room to add new shards. Adding data to new indices might soon fail.";
-    private static final String HELP_GUIDE = "https://ela.st/fix-shards-capacity";
     private static final TriFunction<String, Setting<?>, String, Diagnosis> SHARD_MAX_CAPACITY_REACHED_FN = (
         id,
         setting,
         indexType) -> new Diagnosis(
             new Diagnosis.Definition(
                 NAME,
                 id,
-                "Elasticsearch is about to reach the maximum number of shards it can host, based on your current settings.",
-                "Increase the value of ["
-                    + setting.getKey()
-                    + "] cluster setting or remove "
+                "Elasticsearch is about to reach the maximum number of shards it can host as set by [" + setting.getKey() + "].",
+                "Increase the number of nodes in your cluster or remove some "
                     + indexType
-                    + " indices to clear up resources.",
-                HELP_GUIDE
+                    + " indices to reduce the number of shards in the cluster.",
+                ReferenceDocs.CLUSTER_SHARD_LIMIT.toString()
             ),
             null
         );
@@ -82,22 +79,20 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
         new HealthIndicatorImpact(NAME, "creation_of_new_indices_at_risk", 2, INDEX_CREATION_RISK, List.of(ImpactArea.INGEST))
     );
     static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_DATA_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
-        "increase_max_shards_per_node",
+        "decrease_shards_per_non_frozen_node",
         ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE,
-        "data"
+        "non-frozen"
     );
     static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
-        "increase_max_shards_per_node_frozen",
+        "decrease_shards_per_frozen_node",
         ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN,
         "frozen"
     );
 
     private final ClusterService clusterService;
-    private final FeatureService featureService;
 
-    public ShardsCapacityHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
+    public ShardsCapacityHealthIndicatorService(ClusterService clusterService) {
         this.clusterService = clusterService;
-        this.featureService = featureService;
     }
 
     @Override

diff --git a/server/src/main/java/org/elasticsearch/node/NodeConstruction.java b/server/src/main/java/org/elasticsearch/node/NodeConstruction.java
@@ -1352,7 +1352,7 @@ private Module loadDiagnosticServices(
             new StableMasterHealthIndicatorService(coordinationDiagnosticsService, clusterService),
             new RepositoryIntegrityHealthIndicatorService(clusterService),
             new DiskHealthIndicatorService(clusterService, featureService),
-            new ShardsCapacityHealthIndicatorService(clusterService, featureService),
+            new ShardsCapacityHealthIndicatorService(clusterService),
             fileSettingsHealthIndicatorService
         );
         var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)

diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt
@@ -45,3 +45,4 @@ CIRCUIT_BREAKER_ERRORS                                          circuit-breaker-
 ALLOCATION_EXPLAIN_NO_COPIES                                    cluster-allocation-explain.html#no-valid-shard-copy
 ALLOCATION_EXPLAIN_MAX_RETRY                                    cluster-allocation-explain.html#maximum-number-of-retries-exceeded
 SECURE_SETTINGS                                                 secure-settings.html
+CLUSTER_SHARD_LIMIT                                             misc-cluster-settings.html#cluster-shard-limit
diff --git a/...rc/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceTests.java b/...rc/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceTests.java
@@ -20,7 +20,6 @@
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
-import org.elasticsearch.features.FeatureService;
 import org.elasticsearch.health.HealthIndicatorDetails;
 import org.elasticsearch.health.HealthStatus;
 import org.elasticsearch.health.metadata.HealthMetadata;
@@ -39,7 +38,6 @@
 import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
-import org.mockito.Mockito;
 
 import java.io.IOException;
 import java.util.List;
@@ -61,10 +59,13 @@
 import static org.elasticsearch.indices.ShardLimitValidator.FROZEN_GROUP;
 import static org.elasticsearch.indices.ShardLimitValidator.INDEX_SETTING_SHARD_LIMIT_GROUP;
 import static org.elasticsearch.indices.ShardLimitValidator.NORMAL_GROUP;
+import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
+import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN;
+import static org.hamcrest.Matchers.allOf;
+import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.is;
-import static org.mockito.ArgumentMatchers.any;
 
 public class ShardsCapacityHealthIndicatorServiceTests extends ESTestCase {
 
@@ -73,7 +74,6 @@ public class ShardsCapacityHealthIndicatorServiceTests extends ESTestCase {
     private static ThreadPool threadPool;
 
     private ClusterService clusterService;
-    private FeatureService featureService;
     private DiscoveryNode dataNode;
     private DiscoveryNode frozenNode;
 
@@ -92,9 +92,6 @@ public void setUp() throws Exception {
             .build();
 
         clusterService = ClusterServiceUtils.createClusterService(threadPool);
-
-        featureService = Mockito.mock(FeatureService.class);
-        Mockito.when(featureService.clusterHasFeature(any(), any())).thenReturn(true);
     }
 
     @After
@@ -122,7 +119,7 @@ public void testNoShardsCapacityMetadata() throws IOException {
                 createIndexInDataNode(100)
             )
         );
-        var target = new ShardsCapacityHealthIndicatorService(clusterService, featureService);
+        var target = new ShardsCapacityHealthIndicatorService(clusterService);
         var indicatorResult = target.calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
 
         assertEquals(indicatorResult.status(), HealthStatus.UNKNOWN);
@@ -136,10 +133,7 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException {
         int maxShardsPerNode = randomValidMaxShards();
         int maxShardsPerNodeFrozen = randomValidMaxShards();
         var clusterService = createClusterService(maxShardsPerNode, maxShardsPerNodeFrozen, createIndexInDataNode(maxShardsPerNode / 4));
-        var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
-            true,
-            HealthInfo.EMPTY_HEALTH_INFO
-        );
+        var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
 
         assertEquals(indicatorResult.status(), HealthStatus.GREEN);
         assertTrue(indicatorResult.impacts().isEmpty());
@@ -158,15 +152,36 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException {
         );
     }
 
+    public void testDiagnoses() {
+        assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().indicatorName());
+        assertEquals("decrease_shards_per_non_frozen_node", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().id());
+        assertThat(
+            SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().cause(),
+            allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey()))
+        );
+        assertThat(
+            SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().action(),
+            allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some non-frozen indices"))
+        );
+
+        assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().indicatorName());
+        assertEquals("decrease_shards_per_frozen_node", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().id());
+        assertThat(
+            SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().cause(),
+            allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey()))
+        );
+        assertThat(
+            SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().action(),
+            allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some frozen indices"))
+        );
+    }
+
     public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOException {
         {
             // Only data_nodes does not have enough space
             int maxShardsPerNodeFrozen = randomValidMaxShards();
             var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(4));
-            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
-                true,
-                HealthInfo.EMPTY_HEALTH_INFO
-            );
+            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
 
             assertEquals(indicatorResult.status(), YELLOW);
             assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
@@ -189,10 +204,7 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep
             // Only frozen_nodes does not have enough space
             int maxShardsPerNode = randomValidMaxShards();
             var clusterService = createClusterService(maxShardsPerNode, 25, createIndexInFrozenNode(4));
-            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
-                true,
-                HealthInfo.EMPTY_HEALTH_INFO
-            );
+            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
 
             assertEquals(indicatorResult.status(), YELLOW);
             assertEquals(
@@ -217,10 +229,7 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep
         {
             // Both data and frozen nodes does not have enough space
             var clusterService = createClusterService(25, 25, createIndexInDataNode(4), createIndexInFrozenNode(4));
-            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
-                true,
-                HealthInfo.EMPTY_HEALTH_INFO
-            );
+            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
 
             assertEquals(indicatorResult.status(), YELLOW);
             assertEquals(
@@ -251,10 +260,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
             // Only data_nodes does not have enough space
             int maxShardsPerNodeFrozen = randomValidMaxShards();
             var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(11));
-            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
-                true,
-                HealthInfo.EMPTY_HEALTH_INFO
-            );
+            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
 
             assertEquals(indicatorResult.status(), RED);
             assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
@@ -277,10 +283,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
             // Only frozen_nodes does not have enough space
             int maxShardsPerNode = randomValidMaxShards();
             var clusterService = createClusterService(maxShardsPerNode, 25, createIndexInFrozenNode(11));
-            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
-                true,
-                HealthInfo.EMPTY_HEALTH_INFO
-            );
+            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
 
             assertEquals(indicatorResult.status(), RED);
             assertEquals(
@@ -305,10 +308,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
         {
             // Both data and frozen nodes does not have enough space
             var clusterService = createClusterService(25, 25, createIndexInDataNode(11), createIndexInFrozenNode(11));
-            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
-                true,
-                HealthInfo.EMPTY_HEALTH_INFO
-            );
+            var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
 
             assertEquals(indicatorResult.status(), RED);
             assertEquals(
@@ -377,22 +377,19 @@ public void testCalculateMethods() {
     public void testMappedFieldsForTelemetry() {
         assertEquals(ShardsCapacityHealthIndicatorService.NAME, "shards_capacity");
         assertEquals(
-            "elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node",
+            "elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_non_frozen_node",
             SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().getUniqueId()
         );
         assertEquals(
-            "elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node_frozen",
+            "elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_frozen_node",
             SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().getUniqueId()
         );
     }
 
     public void testSkippingFieldsWhenVerboseIsFalse() {
         int maxShardsPerNodeFrozen = randomValidMaxShards();
         var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(11));
-        var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
-            false,
-            HealthInfo.EMPTY_HEALTH_INFO
-        );
+        var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(false, HealthInfo.EMPTY_HEALTH_INFO);
 
         assertEquals(indicatorResult.status(), RED);
         assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
@@ -441,7 +438,7 @@ private ClusterState createClusterState(
         var metadata = Metadata.builder()
             .persistentSettings(
                 Settings.builder()
-                    .put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
+                    .put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
                     .put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey(), maxShardsPerNodeFrozen)
                     .build()
             );