From 99ed2cb1e424964d18eb5dbad622562adb8cc51b Mon Sep 17 00:00:00 2001 From: Kyle Kim Date: Sat, 20 Jul 2024 04:59:11 +0000 Subject: [PATCH] Major revision. - Updated the aggregation to bypass nested commands, and instead account for wrapper commands (EXEC, EVAL, FCALL). - Added slot invalidation for cross-slot lua scripting. - Added resetClusterStats(). - Added documentation to valkey.conf Signed-off-by: Kyle Kim --- src/cluster.c | 10 ++++++ src/cluster.h | 1 + src/cluster_legacy.c | 2 +- src/cluster_slot_stats.c | 31 +++++++++++------- src/cluster_slot_stats.h | 2 ++ src/config.c | 1 + src/script.c | 2 ++ src/server.c | 1 - tests/unit/cluster/slot-stats.tcl | 54 ++++++++----------------------- valkey.conf | 10 ++++++ 10 files changed, 60 insertions(+), 54 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index c4949c08ee..018a7c91d2 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -35,6 +35,7 @@ #include "server.h" #include "cluster.h" +#include "cluster_slot_stats.h" #include @@ -1460,3 +1461,12 @@ void readwriteCommand(client *c) { c->flags &= ~CLIENT_READONLY; addReply(c, shared.ok); } + +/* Resets transient cluster stats that we expose via INFO or other means that we want + * to reset via CONFIG RESETSTAT. The function is also used in order to + * initialize these fields in clusterInit() at server startup. */ +void resetClusterStats(void) { + if (!server.cluster_enabled) return; + + clusterSlotStatResetAll(); +} diff --git a/src/cluster.h b/src/cluster.h index a83b4ac282..90e85cb29b 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -125,4 +125,5 @@ ConnectionType *connTypeOfCluster(void); int isNodeAvailable(clusterNode *node); long long getNodeReplicationOffset(clusterNode *node); sds aggregateClientOutputBuffer(client *c); +void resetClusterStats(void); #endif /* __CLUSTER_H */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 27ba15bb88..c930e267d6 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1043,7 +1043,7 @@ void clusterInit(void) { clusterUpdateMyselfIp(); clusterUpdateMyselfHostname(); clusterUpdateMyselfHumanNodename(); - clusterSlotStatResetAll(); + resetClusterStats(); } void clusterInitLast(void) { diff --git a/src/cluster_slot_stats.c b/src/cluster_slot_stats.c index 0ea59b8bd4..efb6f35c6a 100644 --- a/src/cluster_slot_stats.c +++ b/src/cluster_slot_stats.c @@ -8,11 +8,7 @@ #define UNASSIGNED_SLOT 0 -typedef enum { - INVALID, - KEY_COUNT, - CPU_USEC, -} slotStatTypes; +typedef enum { KEY_COUNT, CPU_USEC, SLOT_STAT_COUNT, INVALID } slotStatTypes; /* ----------------------------------------------------------------------------- * CLUSTER SLOT-STATS command @@ -82,7 +78,8 @@ static void addReplySlotStat(client *c, int slot) { addReplyArrayLen(c, 2); /* Array of size 2, where 0th index represents (int) slot, * and 1st index represents (map) usage statistics. */ addReplyLongLong(c, slot); - addReplyMapLen(c, (server.cluster_slot_stats_enabled) ? 2 : 1); /* Nested map representing slot usage statistics. */ + addReplyMapLen(c, (server.cluster_slot_stats_enabled) ? SLOT_STAT_COUNT + : 1); /* Nested map representing slot usage statistics. */ addReplyBulkCString(c, "key-count"); addReplyLongLong(c, countKeysInSlot(slot)); @@ -129,20 +126,22 @@ void clusterSlotStatReset(int slot) { } void clusterSlotStatResetAll(void) { - if (server.cluster == NULL) return; - memset(server.cluster->slot_stats, 0, sizeof(server.cluster->slot_stats)); } -/* For cpu-usec accumulation, EXEC, EVAl and FCALL commands are skipped. +/* For cpu-usec accumulation, nested commands within EXEC, EVAL, FCALL are skipped. * This is due to their unique callstack, where the c->duration for * EXEC, EVAL and FCALL already includes all of its nested commands. - * Meaning, the accumulation of cpu-usec for these wrapper commands + * Meaning, the accumulation of cpu-usec for these nested commands * would equate to repeating the same calculation twice. */ static int canAddCpuDuration(client *c) { - return server.cluster_slot_stats_enabled && server.cluster_enabled && c->slot != -1 && - c->cmd->proc != execCommand && c->cmd->proc != evalCommand && c->cmd->proc != fcallCommand; + return server.cluster_slot_stats_enabled && // Config should be enabled. + server.cluster_enabled && // Cluster mode should be enbaled. + c->slot != -1 && // Command should be slot specific. + (!server.execution_nesting || // Either; + (server.execution_nesting && // 1) Command should not be nested, or + c->realcmd->flags & CMD_BLOCKING)); // 2) If command is nested, it must be due to unblocking. } void clusterSlotStatsAddCpuDuration(client *c, ustime_t duration) { @@ -152,6 +151,14 @@ void clusterSlotStatsAddCpuDuration(client *c, ustime_t duration) { server.cluster->slot_stats[c->slot].cpu_usec += duration; } +/* For cross-slot scripting, its caller client's slot must be invalidated, + * such that its slot-stats aggregation is bypassed. */ +void clusterSlotStatsInvalidateSlotIfApplicable(scriptRunCtx *ctx) { + if (!(ctx->flags & SCRIPT_ALLOW_CROSS_SLOT)) return; + + ctx->original_client->slot = -1; +} + void clusterSlotStatsCommand(client *c) { if (server.cluster_enabled == 0) { addReplyError(c, "This instance has cluster support disabled"); diff --git a/src/cluster_slot_stats.h b/src/cluster_slot_stats.h index 8ceb46684b..9faa2a9598 100644 --- a/src/cluster_slot_stats.h +++ b/src/cluster_slot_stats.h @@ -1,7 +1,9 @@ #include "server.h" #include "cluster.h" +#include "script.h" #include "cluster_legacy.h" void clusterSlotStatReset(int slot); void clusterSlotStatResetAll(void); void clusterSlotStatsAddCpuDuration(client *c, ustime_t duration); +void clusterSlotStatsInvalidateSlotIfApplicable(scriptRunCtx *ctx); diff --git a/src/config.c b/src/config.c index 941b2aab5d..60f24da13e 100644 --- a/src/config.c +++ b/src/config.c @@ -3381,6 +3381,7 @@ void configHelpCommand(client *c) { void configResetStatCommand(client *c) { resetServerStats(); + resetClusterStats(); resetCommandTableStats(server.commands); resetErrorTableStats(); addReply(c, shared.ok); diff --git a/src/script.c b/src/script.c index 29397f81bd..ffbe77dacd 100644 --- a/src/script.c +++ b/src/script.c @@ -30,6 +30,7 @@ #include "server.h" #include "script.h" #include "cluster.h" +#include "cluster_slot_stats.h" scriptFlag scripts_flags_def[] = { {.flag = SCRIPT_FLAG_NO_WRITES, .str = "no-writes"}, @@ -583,6 +584,7 @@ void scriptCall(scriptRunCtx *run_ctx, sds *err) { } call(c, call_flags); serverAssert((c->flags & CLIENT_BLOCKED) == 0); + clusterSlotStatsInvalidateSlotIfApplicable(run_ctx); return; error: diff --git a/src/server.c b/src/server.c index 5252734a5c..40906763e1 100644 --- a/src/server.c +++ b/src/server.c @@ -2500,7 +2500,6 @@ void resetServerStats(void) { memset(server.duration_stats, 0, sizeof(durationStats) * EL_DURATION_TYPE_NUM); server.el_cmd_cnt_max = 0; lazyfreeResetStats(); - clusterSlotStatResetAll(); } /* Make the thread killable at any time, so that kill threads functions diff --git a/tests/unit/cluster/slot-stats.tcl b/tests/unit/cluster/slot-stats.tcl index f64052ba58..2443afbfbd 100644 --- a/tests/unit/cluster/slot-stats.tcl +++ b/tests/unit/cluster/slot-stats.tcl @@ -145,7 +145,7 @@ proc wait_for_replica_key_exists {key key_count} { # Test cases for CLUSTER SLOT-STATS cpu-usec metric correctness. # ----------------------------------------------------------------------------- -start_cluster 1 0 {tags {external:skip cluster}} { +start_cluster 1 0 {tags {external:skip cluster} overrides {cluster-slot-stats-enabled yes}} { # Define shared variables. set key "FOO" @@ -153,7 +153,6 @@ start_cluster 1 0 {tags {external:skip cluster}} { set key_secondary "FOO2" set key_secondary_slot [R 0 cluster keyslot $key_secondary] set metrics_to_assert [list cpu-usec] - R 0 CONFIG SET cluster-slot-stats-enabled yes test "CLUSTER SLOT-STATS cpu-usec reset upon CONFIG RESETSTAT." { R 0 SET $key VALUE @@ -264,11 +263,10 @@ start_cluster 1 0 {tags {external:skip cluster}} { # Execute transaction, and assert that all nested command times have been accumulated. $r1 EXEC set slot_stats [R 0 CLUSTER SLOT-STATS SLOTSRANGE 0 16383] - set get_usec [get_cmdstat_usec set r] - set set_usec [get_cmdstat_usec get r] + set exec_usec [get_cmdstat_usec exec r] set expected_slot_stats [ dict create $key_slot [ - dict create cpu-usec [expr $get_usec + $set_usec] + dict create cpu-usec $exec_usec ] ] assert_empty_slot_stats_with_exception $slot_stats $expected_slot_stats $metrics_to_assert @@ -277,15 +275,15 @@ start_cluster 1 0 {tags {external:skip cluster}} { R 0 FLUSHALL test "CLUSTER SLOT-STATS cpu-usec for lua-scripts, without cross-slot keys." { - r eval [format "redis.call('set', '%s', 'bar'); redis.call('get', '%s')" $key $key] 0 + r eval [format "#!lua + redis.call('set', '%s', 'bar'); redis.call('get', '%s')" $key $key] 0 - set get_usec [get_cmdstat_usec set r] - set set_usec [get_cmdstat_usec get r] + set eval_usec [get_cmdstat_usec eval r] set slot_stats [R 0 CLUSTER SLOT-STATS SLOTSRANGE 0 16383] set expected_slot_stats [ dict create $key_slot [ - dict create cpu-usec [expr $get_usec + $set_usec] + dict create cpu-usec $eval_usec ] ] assert_empty_slot_stats_with_exception $slot_stats $expected_slot_stats $metrics_to_assert @@ -298,20 +296,9 @@ start_cluster 1 0 {tags {external:skip cluster}} { redis.call('set', '%s', 'bar'); redis.call('get', '%s'); " $key $key_secondary] 0 - set set_usec [get_cmdstat_usec set r] - set get_usec [get_cmdstat_usec get r] + # For cross-slot, we do not accumulate at all. set slot_stats [R 0 CLUSTER SLOT-STATS SLOTSRANGE 0 16383] - - set expected_slot_stats [ - dict create \ - $key_slot [ - dict create cpu-usec $set_usec - ] \ - $key_secondary_slot [ - dict create cpu-usec $get_usec - ] - ] - assert_empty_slot_stats_with_exception $slot_stats $expected_slot_stats $metrics_to_assert + assert_empty_slot_stats $slot_stats $metrics_to_assert } R 0 CONFIG RESETSTAT R 0 FLUSHALL @@ -325,13 +312,12 @@ start_cluster 1 0 {tags {external:skip cluster}} { r function load replace $function_str r fcall f1 0 - set set_usec [get_cmdstat_usec set r] - set get_usec [get_cmdstat_usec get r] + set fcall_usec [get_cmdstat_usec fcall r] set slot_stats [R 0 CLUSTER SLOT-STATS SLOTSRANGE 0 16383] set expected_slot_stats [ dict create $key_slot [ - dict create cpu-usec [expr $get_usec + $set_usec] + dict create cpu-usec $fcall_usec ] ] assert_empty_slot_stats_with_exception $slot_stats $expected_slot_stats $metrics_to_assert @@ -349,20 +335,9 @@ start_cluster 1 0 {tags {external:skip cluster}} { r function load replace $function_str r fcall f1 0 - set set_usec [get_cmdstat_usec set r] - set get_usec [get_cmdstat_usec get r] + # For cross-slot, we do not accumulate at all. set slot_stats [R 0 CLUSTER SLOT-STATS SLOTSRANGE 0 16383] - - set expected_slot_stats [ - dict create \ - $key_slot [ - dict create cpu-usec $set_usec - ] \ - $key_secondary_slot [ - dict create cpu-usec $get_usec - ] - ] - assert_empty_slot_stats_with_exception $slot_stats $expected_slot_stats $metrics_to_assert + assert_empty_slot_stats $slot_stats $metrics_to_assert } R 0 CONFIG RESETSTAT R 0 FLUSHALL @@ -529,12 +504,11 @@ start_cluster 1 0 {tags {external:skip cluster}} { # Test cases for CLUSTER SLOT-STATS replication. # ----------------------------------------------------------------------------- -start_cluster 1 1 {tags {external:skip cluster}} { +start_cluster 1 1 {tags {external:skip cluster} overrides {cluster-slot-stats-enabled yes}} { # Define shared variables. set key "FOO" set key_slot [R 0 CLUSTER KEYSLOT $key] - R 0 CONFIG SET cluster-slot-stats-enabled yes # For replication, only those metrics that are deterministic upon replication are asserted. # * key-count is asserted, as both the primary and its replica must hold the same number of keys. diff --git a/valkey.conf b/valkey.conf index e4ffd0f8ad..fa4438846f 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1771,6 +1771,16 @@ aof-timestamp-enabled no # # cluster-preferred-endpoint-type ip +# Clusters can be configured to track per-slot resource statistics, +# which are further accessible by the CLUSTER SLOT-STATS command. +# +# By default, the 'cluster-slot-stats-enabled' is disabled, and only 'key-count' is captured. +# By enabling the 'cluster-slot-stats-enabled' config, the cluster will begin to capture advanced statistics. +# These statistics can be leveraged to assess general slot usage trends, identify hot / cold slots, +# migrate slots for a balanced cluster workload, and / or re-write application logic to better utilize slots. +# +# cluster-slot-stats-enabled no + # In order to setup your cluster make sure to read the documentation # available at https://valkey.io web site.