Skip to content

Commit

Permalink
Do not recommend increasing max_shards_per_node (#120458)
Browse files Browse the repository at this point in the history
Today if the `shards_capacity` health indicator detects a problem then
it recommends increasing the limit, which goes against the advice in the
manual about not increasing these limits and also makes it rather
pointless having a limit in the first place.

This commit improves the recommendation to suggest either adding nodes
or else reducing the shard count.
  • Loading branch information
DaveCTurner authored Jan 20, 2025
1 parent d1a2e0d commit 9c0709f
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 58 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/120458.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 120458
summary: Do not recommend increasing `max_shards_per_node`
area: Health
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ public enum ReferenceDocs {
ALLOCATION_EXPLAIN_NO_COPIES,
ALLOCATION_EXPLAIN_MAX_RETRY,
SECURE_SETTINGS,
CLUSTER_SHARD_LIMIT,
// this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.ReferenceDocs;
import org.elasticsearch.common.TriFunction;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
Expand Down Expand Up @@ -54,21 +54,18 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
"The cluster is running low on room to add new shards. Adding data to new indices is at risk";
private static final String INDEX_CREATION_RISK =
"The cluster is running low on room to add new shards. Adding data to new indices might soon fail.";
private static final String HELP_GUIDE = "https://ela.st/fix-shards-capacity";
private static final TriFunction<String, Setting<?>, String, Diagnosis> SHARD_MAX_CAPACITY_REACHED_FN = (
id,
setting,
indexType) -> new Diagnosis(
new Diagnosis.Definition(
NAME,
id,
"Elasticsearch is about to reach the maximum number of shards it can host, based on your current settings.",
"Increase the value of ["
+ setting.getKey()
+ "] cluster setting or remove "
"Elasticsearch is about to reach the maximum number of shards it can host as set by [" + setting.getKey() + "].",
"Increase the number of nodes in your cluster or remove some "
+ indexType
+ " indices to clear up resources.",
HELP_GUIDE
+ " indices to reduce the number of shards in the cluster.",
ReferenceDocs.CLUSTER_SHARD_LIMIT.toString()
),
null
);
Expand All @@ -82,22 +79,20 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
new HealthIndicatorImpact(NAME, "creation_of_new_indices_at_risk", 2, INDEX_CREATION_RISK, List.of(ImpactArea.INGEST))
);
static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_DATA_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
"increase_max_shards_per_node",
"decrease_shards_per_non_frozen_node",
ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE,
"data"
"non-frozen"
);
static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
"increase_max_shards_per_node_frozen",
"decrease_shards_per_frozen_node",
ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN,
"frozen"
);

private final ClusterService clusterService;
private final FeatureService featureService;

public ShardsCapacityHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
public ShardsCapacityHealthIndicatorService(ClusterService clusterService) {
this.clusterService = clusterService;
this.featureService = featureService;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1352,7 +1352,7 @@ private Module loadDiagnosticServices(
new StableMasterHealthIndicatorService(coordinationDiagnosticsService, clusterService),
new RepositoryIntegrityHealthIndicatorService(clusterService),
new DiskHealthIndicatorService(clusterService, featureService),
new ShardsCapacityHealthIndicatorService(clusterService, featureService),
new ShardsCapacityHealthIndicatorService(clusterService),
fileSettingsHealthIndicatorService
);
var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ CIRCUIT_BREAKER_ERRORS circuit-breaker-
ALLOCATION_EXPLAIN_NO_COPIES cluster-allocation-explain.html#no-valid-shard-copy
ALLOCATION_EXPLAIN_MAX_RETRY cluster-allocation-explain.html#maximum-number-of-retries-exceeded
SECURE_SETTINGS secure-settings.html
CLUSTER_SHARD_LIMIT misc-cluster-settings.html#cluster-shard-limit
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthStatus;
import org.elasticsearch.health.metadata.HealthMetadata;
Expand All @@ -39,7 +38,6 @@
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.mockito.Mockito;

import java.io.IOException;
import java.util.List;
Expand All @@ -61,10 +59,13 @@
import static org.elasticsearch.indices.ShardLimitValidator.FROZEN_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.INDEX_SETTING_SHARD_LIMIT_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.NORMAL_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN;
import static org.hamcrest.Matchers.allOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.mockito.ArgumentMatchers.any;

public class ShardsCapacityHealthIndicatorServiceTests extends ESTestCase {

Expand All @@ -73,7 +74,6 @@ public class ShardsCapacityHealthIndicatorServiceTests extends ESTestCase {
private static ThreadPool threadPool;

private ClusterService clusterService;
private FeatureService featureService;
private DiscoveryNode dataNode;
private DiscoveryNode frozenNode;

Expand All @@ -92,9 +92,6 @@ public void setUp() throws Exception {
.build();

clusterService = ClusterServiceUtils.createClusterService(threadPool);

featureService = Mockito.mock(FeatureService.class);
Mockito.when(featureService.clusterHasFeature(any(), any())).thenReturn(true);
}

@After
Expand Down Expand Up @@ -122,7 +119,7 @@ public void testNoShardsCapacityMetadata() throws IOException {
createIndexInDataNode(100)
)
);
var target = new ShardsCapacityHealthIndicatorService(clusterService, featureService);
var target = new ShardsCapacityHealthIndicatorService(clusterService);
var indicatorResult = target.calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), HealthStatus.UNKNOWN);
Expand All @@ -136,10 +133,7 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException {
int maxShardsPerNode = randomValidMaxShards();
int maxShardsPerNodeFrozen = randomValidMaxShards();
var clusterService = createClusterService(maxShardsPerNode, maxShardsPerNodeFrozen, createIndexInDataNode(maxShardsPerNode / 4));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), HealthStatus.GREEN);
assertTrue(indicatorResult.impacts().isEmpty());
Expand All @@ -158,15 +152,36 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException {
);
}

public void testDiagnoses() {
assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().indicatorName());
assertEquals("decrease_shards_per_non_frozen_node", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().id());
assertThat(
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().cause(),
allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey()))
);
assertThat(
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().action(),
allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some non-frozen indices"))
);

assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().indicatorName());
assertEquals("decrease_shards_per_frozen_node", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().id());
assertThat(
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().cause(),
allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey()))
);
assertThat(
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().action(),
allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some frozen indices"))
);
}

public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOException {
{
// Only data_nodes does not have enough space
int maxShardsPerNodeFrozen = randomValidMaxShards();
var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(4));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), YELLOW);
assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
Expand All @@ -189,10 +204,7 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep
// Only frozen_nodes does not have enough space
int maxShardsPerNode = randomValidMaxShards();
var clusterService = createClusterService(maxShardsPerNode, 25, createIndexInFrozenNode(4));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), YELLOW);
assertEquals(
Expand All @@ -217,10 +229,7 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep
{
// Both data and frozen nodes does not have enough space
var clusterService = createClusterService(25, 25, createIndexInDataNode(4), createIndexInFrozenNode(4));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), YELLOW);
assertEquals(
Expand Down Expand Up @@ -251,10 +260,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
// Only data_nodes does not have enough space
int maxShardsPerNodeFrozen = randomValidMaxShards();
var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(11));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), RED);
assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
Expand All @@ -277,10 +283,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
// Only frozen_nodes does not have enough space
int maxShardsPerNode = randomValidMaxShards();
var clusterService = createClusterService(maxShardsPerNode, 25, createIndexInFrozenNode(11));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), RED);
assertEquals(
Expand All @@ -305,10 +308,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
{
// Both data and frozen nodes does not have enough space
var clusterService = createClusterService(25, 25, createIndexInDataNode(11), createIndexInFrozenNode(11));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
true,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), RED);
assertEquals(
Expand Down Expand Up @@ -377,22 +377,19 @@ public void testCalculateMethods() {
public void testMappedFieldsForTelemetry() {
assertEquals(ShardsCapacityHealthIndicatorService.NAME, "shards_capacity");
assertEquals(
"elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node",
"elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_non_frozen_node",
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().getUniqueId()
);
assertEquals(
"elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node_frozen",
"elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_frozen_node",
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().getUniqueId()
);
}

public void testSkippingFieldsWhenVerboseIsFalse() {
int maxShardsPerNodeFrozen = randomValidMaxShards();
var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(11));
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
false,
HealthInfo.EMPTY_HEALTH_INFO
);
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(false, HealthInfo.EMPTY_HEALTH_INFO);

assertEquals(indicatorResult.status(), RED);
assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
Expand Down Expand Up @@ -441,7 +438,7 @@ private ClusterState createClusterState(
var metadata = Metadata.builder()
.persistentSettings(
Settings.builder()
.put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
.put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey(), maxShardsPerNodeFrozen)
.build()
);
Expand Down

0 comments on commit 9c0709f

Please sign in to comment.