From 90a72aef7c882c36a62f7958d1302d62b77a6f4d Mon Sep 17 00:00:00 2001 From: Byron Wasti Date: Fri, 19 Apr 2024 12:52:15 -0400 Subject: [PATCH] Fix negative feedback bug in ConcurrencyController The ConcurrencyController would sometimes get itself into a negative feedback loop where it would repeatedly set itself as TPS Limited and never stabilize. The reason has to do with the interplay between two ways the CC works: 1. The concurrency search algo is only increasing 2. When it goes into TPS Limited, it sets concurrency to the previously found "optimal" concurrency and reset the history What this meant is that when the TPS is limited, it would decrease the Goal TPS and set the concurrency to the "optimal" value. However, it would then use that concurrency as the starting point and start increasing it. Leading to more contention, leading to TPS dropping, leading to the controller thinking it was TPS limited again... because it kept increasing the concurrency. The solution is a bit of a hack, but we just set the concurrency to a lower value than the optimal one found, so that the algorithm can grow into it and stabilize. Frankly, this controller needs a rewrite. --- balter/src/controllers/concurrency.rs | 14 ++++++++++++-- balter/src/transaction.rs | 2 -- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/balter/src/controllers/concurrency.rs b/balter/src/controllers/concurrency.rs index a2b3295..b8fb2d5 100644 --- a/balter/src/controllers/concurrency.rs +++ b/balter/src/controllers/concurrency.rs @@ -7,6 +7,14 @@ use tracing::{debug, error, trace}; const STARTING_CONCURRENCY_COUNT: usize = 4; const MAX_CHANGE: usize = 100; +// NOTE: Somewhat tricky to explain, but essentially our optimal concurrency search algorithm only +// increases concurrency. This means if we set concurrency to an "optimal" value, the search algo +// will immediately start increasing it (leading to a negative feedback loop with increased +// contention). This adjustment is a bit of a hack, where we always allow the concurrency to grow +// so that the algorithm stabilizes. +// TODO: Rewrite the concurrency search algorithm (see above NOTE) +const CONCURRENCY_SET_ADJUSTMENT: f64 = 0.75; + #[derive(Debug)] pub(crate) struct ConcurrencyController { prev_measurements: Vec, @@ -29,8 +37,8 @@ impl ConcurrencyController { } else { // TODO: Better numerical conversions let ratio = goal_tps.get() as f64 / self.goal_tps.get() as f64; - let new_concurrency = - (ratio * self.concurrency as f64 + 1.).max(STARTING_CONCURRENCY_COUNT as f64); + let new_concurrency = (ratio * self.concurrency as f64 * CONCURRENCY_SET_ADJUSTMENT) + .max(STARTING_CONCURRENCY_COUNT as f64); new_concurrency as usize }; @@ -97,6 +105,7 @@ impl ConcurrencyController { self.concurrency = STARTING_CONCURRENCY_COUNT; CCOutcome::AlterConcurrency(self.concurrency) } else if let Some((max_tps, concurrency)) = self.detect_underpowered() { + let concurrency = (concurrency as f64 * CONCURRENCY_SET_ADJUSTMENT) as usize; self.concurrency = concurrency; CCOutcome::TpsLimited(max_tps, concurrency) } else { @@ -119,6 +128,7 @@ impl ConcurrencyController { // NOTE: The controller can get stuck at a given concurrency, and results in NaN. // This is an edge-case of when the controller is limited. if slope.is_nan() { + debug!("NaN Slope detected. Ignoring."); return 0.; } diff --git a/balter/src/transaction.rs b/balter/src/transaction.rs index af35c3b..f128992 100644 --- a/balter/src/transaction.rs +++ b/balter/src/transaction.rs @@ -32,8 +32,6 @@ where // transaction_hooks, and to log it in the sampler. hook.latency.push(elapsed); if cfg!(feature = "metrics") { - // TODO: What are the implications of calling this every time? - metrics::describe_histogram!(labels.latency, metrics::Unit::Seconds, ""); metrics::histogram!(labels.latency).record(elapsed.as_secs_f64()); }