Skip to content

Commit

Permalink
Do not recommend increasing max_shards_per_node (#120458)
Browse files Browse the repository at this point in the history
Today, if the `shards_capacity` health indicator detects a problem, it
recommends increasing the limit. This goes against the manual's advice
not to increase these limits, and it also makes having a limit in the
first place rather pointless.

This commit improves the recommendation to suggest either adding nodes
or else reducing the shard count.
  • Loading branch information
DaveCTurner committed Jan 20, 2025
1 parent 7678eb3 commit de5be24
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 14 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/120458.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 120458
summary: Do not recommend increasing `max_shards_per_node`
area: Health
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ public enum ReferenceDocs {
JDK_LOCALE_DIFFERENCES,
ALLOCATION_EXPLAIN_MAX_RETRY,
SECURE_SETTINGS,
CLUSTER_SHARD_LIMIT,
// this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.ReferenceDocs;
import org.elasticsearch.common.TriFunction;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.features.FeatureService;
Expand Down Expand Up @@ -55,21 +56,18 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
"The cluster is running low on room to add new shards. Adding data to new indices is at risk";
private static final String INDEX_CREATION_RISK =
"The cluster is running low on room to add new shards. Adding data to new indices might soon fail.";
private static final String HELP_GUIDE = "https://ela.st/fix-shards-capacity";
private static final TriFunction<String, Setting<?>, String, Diagnosis> SHARD_MAX_CAPACITY_REACHED_FN = (
id,
setting,
indexType) -> new Diagnosis(
new Diagnosis.Definition(
NAME,
id,
"Elasticsearch is about to reach the maximum number of shards it can host, based on your current settings.",
"Increase the value of ["
+ setting.getKey()
+ "] cluster setting or remove "
"Elasticsearch is about to reach the maximum number of shards it can host as set by [" + setting.getKey() + "].",
"Increase the number of nodes in your cluster or remove some "
+ indexType
+ " indices to clear up resources.",
HELP_GUIDE
+ " indices to reduce the number of shards in the cluster.",
ReferenceDocs.CLUSTER_SHARD_LIMIT.toString()
),
null
);
Expand All @@ -83,12 +81,12 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
new HealthIndicatorImpact(NAME, "creation_of_new_indices_at_risk", 2, INDEX_CREATION_RISK, List.of(ImpactArea.INGEST))
);
static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_DATA_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
"increase_max_shards_per_node",
"decrease_shards_per_non_frozen_node",
ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE,
"data"
"non-frozen"
);
static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
"increase_max_shards_per_node_frozen",
"decrease_shards_per_frozen_node",
ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN,
"frozen"
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ FORMING_SINGLE_NODE_CLUSTERS modules-discover
JDK_LOCALE_DIFFERENCES mapping-date-format.html#custom-date-format-locales
ALLOCATION_EXPLAIN_MAX_RETRY cluster-allocation-explain.html#maximum-number-of-retries-exceeded
SECURE_SETTINGS secure-settings.html
CLUSTER_SHARD_LIMIT misc-cluster-settings.html#cluster-shard-limit
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@
import static org.elasticsearch.indices.ShardLimitValidator.FROZEN_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.INDEX_SETTING_SHARD_LIMIT_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.NORMAL_GROUP;
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN;
import static org.hamcrest.Matchers.allOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
Expand Down Expand Up @@ -93,7 +97,6 @@ public void setUp() throws Exception {
.build();

clusterService = ClusterServiceUtils.createClusterService(threadPool);

featureService = Mockito.mock(FeatureService.class);
Mockito.when(featureService.clusterHasFeature(any(), any())).thenReturn(true);
}
Expand Down Expand Up @@ -159,6 +162,30 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException {
);
}

public void testDiagnoses() {
assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().indicatorName());
assertEquals("decrease_shards_per_non_frozen_node", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().id());
assertThat(
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().cause(),
allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey()))
);
assertThat(
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().action(),
allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some non-frozen indices"))
);

assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().indicatorName());
assertEquals("decrease_shards_per_frozen_node", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().id());
assertThat(
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().cause(),
allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey()))
);
assertThat(
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().action(),
allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some frozen indices"))
);
}

public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOException {
{
// Only data_nodes does not have enough space
Expand Down Expand Up @@ -378,11 +405,11 @@ public void testCalculateMethods() {
public void testMappedFieldsForTelemetry() {
assertEquals(ShardsCapacityHealthIndicatorService.NAME, "shards_capacity");
assertEquals(
"elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node",
"elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_non_frozen_node",
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().getUniqueId()
);
assertEquals(
"elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node_frozen",
"elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_frozen_node",
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().getUniqueId()
);
}
Expand Down Expand Up @@ -442,7 +469,7 @@ private ClusterState createClusterState(
var metadata = Metadata.builder()
.persistentSettings(
Settings.builder()
.put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
.put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey(), maxShardsPerNodeFrozen)
.build()
);
Expand Down

0 comments on commit de5be24

Please sign in to comment.