From 5c39335cb86e4f9e18183d28999b619cdba2f138 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Wed, 27 May 2020 00:47:43 +0100 Subject: [PATCH 01/35] Balanced measures RTT using OPTIONS requests --- .../BalancedNodeSelectionStrategyChannel.java | 78 +++++++++++++++++-- .../com/palantir/dialogue/HttpMethod.java | 1 + 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index a0ed1c400..8fec8a82f 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -24,8 +24,10 @@ import com.google.common.util.concurrent.FutureCallback; import com.google.common.util.concurrent.ListenableFuture; import com.palantir.dialogue.Endpoint; +import com.palantir.dialogue.HttpMethod; import com.palantir.dialogue.Request; import com.palantir.dialogue.Response; +import com.palantir.dialogue.UrlBuilder; import com.palantir.logsafe.Preconditions; import com.palantir.logsafe.SafeArg; import com.palantir.logsafe.UnsafeArg; @@ -37,6 +39,7 @@ import java.util.Collections; import java.util.Comparator; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; @@ -55,8 +58,10 @@ */ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { private static final Logger log = LoggerFactory.getLogger(BalancedNodeSelectionStrategyChannel.class); + private static final Request BLANK_REQUEST = Request.builder().build(); - private static final Comparator BY_SCORE = Comparator.comparingInt(SortableChannel::getScore); + private static final Comparator BY_SCORE = + Comparator.comparingInt(SortableChannel::getScore).thenComparingLong(SortableChannel::getRtt); private static final Duration FAILURE_MEMORY = Duration.ofSeconds(30); private static final double FAILURE_WEIGHT = 10; @@ -133,9 +138,12 @@ public String toString() { private static final class MutableChannelWithStats implements LimitedChannel { private final LimitedChannel delegate; private final FutureCallback updateStats; + private final Ticker clock; private final PerHostObservability observability; private final AtomicInteger inflight = new AtomicInteger(0); + private volatile long rttNanos = 0; + // private volatile long nextRttMeasurement = 0; /** * We keep track of failures within a time window to do well in scenarios where an unhealthy server returns @@ -147,6 +155,7 @@ private static final class MutableChannelWithStats implements LimitedChannel { MutableChannelWithStats(LimitedChannel delegate, Ticker clock, PerHostObservability observability) { this.delegate = delegate; this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY); + this.clock = clock; this.observability = observability; // Saves one allocation on each network call this.updateStats = new FutureCallback() { @@ -180,15 +189,35 @@ public void onFailure(Throwable throwable) { public Optional> maybeExecute(Endpoint endpoint, Request request) { inflight.incrementAndGet(); Optional> maybe = delegate.maybeExecute(endpoint, request); - if (maybe.isPresent()) { - DialogueFutures.addDirectCallback(maybe.get(), updateStats); - observability.markRequestMade(); - } else { + if (!maybe.isPresent()) { inflight.decrementAndGet(); // quickly undo + return Optional.empty(); } + + DialogueFutures.addDirectCallback(maybe.get(), updateStats); + observability.markRequestMade(); + maybeSampleRtt(); return maybe; } + private void maybeSampleRtt() { + long initialNanos = clock.read(); + // if (initialNanos < nextRttMeasurement) { + // return; + // } + + Optional> maybe = delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST); + if (maybe.isPresent()) { + // nextRttMeasurement = initialNanos + TimeUnit.SECONDS.toNanos(1); + DialogueFutures.addDirectListener(maybe.get(), () -> { + rttNanos = clock.read() - initialNanos; + if (log.isInfoEnabled()) { + log.info("rttNanos {}", SafeArg.of("rtt", rttNanos), UnsafeArg.of("channel", delegate)); + } + }); + } + } + SortableChannel computeScore() { int requestsInflight = inflight.get(); double failureReservoir = recentFailuresReservoir.get(); @@ -198,7 +227,7 @@ SortableChannel computeScore() { int score = requestsInflight + Ints.saturatedCast(Math.round(failureReservoir)); observability.debugLogComputedScore(requestsInflight, failureReservoir, score); - return new SortableChannel(score, this); + return new SortableChannel(score, rttNanos, this); } @Override @@ -217,10 +246,12 @@ public String toString() { */ private static final class SortableChannel { private final int score; + private final long rtt; private final MutableChannelWithStats delegate; - SortableChannel(int score, MutableChannelWithStats delegate) { + SortableChannel(int score, long rtt, MutableChannelWithStats delegate) { this.score = score; + this.rtt = rtt; this.delegate = delegate; } @@ -228,9 +259,13 @@ int getScore() { return score; } + public long getRtt() { + return rtt; + } + @Override public String toString() { - return "SortableChannel{score=" + score + ", delegate=" + delegate + '}'; + return "SortableChannel{" + "score=" + score + ", rtt=" + rtt + ", delegate=" + delegate + '}'; } } @@ -342,4 +377,31 @@ public void markRequestMade() { }; } } + + private enum RttEndpoint implements Endpoint { + INSTANCE; + + @Override + public void renderPath(Map params, UrlBuilder url) {} + + @Override + public HttpMethod httpMethod() { + return HttpMethod.OPTIONS; + } + + @Override + public String serviceName() { + return "Balanced"; + } + + @Override + public String endpointName() { + return "rtt"; + } + + @Override + public String version() { + return "0.0.0"; + } + } } diff --git a/dialogue-target/src/main/java/com/palantir/dialogue/HttpMethod.java b/dialogue-target/src/main/java/com/palantir/dialogue/HttpMethod.java index 13ba55512..61a608a7e 100644 --- a/dialogue-target/src/main/java/com/palantir/dialogue/HttpMethod.java +++ b/dialogue-target/src/main/java/com/palantir/dialogue/HttpMethod.java @@ -24,4 +24,5 @@ public enum HttpMethod { PATCH, POST, PUT, + OPTIONS } From bc5b31d412b56e577b2ab1186e76b3d63a57a049 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Wed, 27 May 2020 00:50:09 +0100 Subject: [PATCH 02/35] Compile --- .../dialogue/core/BalancedNodeSelectionStrategyChannel.java | 2 +- .../java/com/palantir/dialogue/core/RetryingChannel.java | 1 + .../src/main/java/com/palantir/dialogue/OkHttpChannel.java | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 8fec8a82f..231ac9529 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -382,7 +382,7 @@ private enum RttEndpoint implements Endpoint { INSTANCE; @Override - public void renderPath(Map params, UrlBuilder url) {} + public void renderPath(Map _params, UrlBuilder _url) {} @Override public HttpMethod httpMethod() { diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RetryingChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RetryingChannel.java index 2d1b7a5fb..167b8f895 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RetryingChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RetryingChannel.java @@ -381,6 +381,7 @@ private static boolean safeToRetry(HttpMethod httpMethod) { // in theory PUT and DELETE should be fine to retry too, we're just being conservative for now. case POST: case PATCH: + case OPTIONS: return false; } diff --git a/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java b/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java index a04f26ba9..011cddfbe 100644 --- a/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java +++ b/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java @@ -93,6 +93,11 @@ public ListenableFuture execute(Endpoint endpoint, Request request) { case PATCH: okRequest = okRequest.patch(toOkHttpBody(request.body())); break; + case OPTIONS: + okRequest = okRequest.method( + "OPTIONS", + request.body().isPresent() ? toOkHttpBody(request.body().get()) : null); + break; } // Fill headers From 109b05d0e41e11d6addb64f67d7cb523873ec735 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Wed, 27 May 2020 00:56:55 +0100 Subject: [PATCH 03/35] Handl OPTIONS --- .../java/com/palantir/dialogue/core/SimulationServer.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/simulation/src/main/java/com/palantir/dialogue/core/SimulationServer.java b/simulation/src/main/java/com/palantir/dialogue/core/SimulationServer.java index 46df6fed3..c8bd29aea 100644 --- a/simulation/src/main/java/com/palantir/dialogue/core/SimulationServer.java +++ b/simulation/src/main/java/com/palantir/dialogue/core/SimulationServer.java @@ -25,6 +25,7 @@ import com.google.common.util.concurrent.MoreExecutors; import com.palantir.dialogue.Channel; import com.palantir.dialogue.Endpoint; +import com.palantir.dialogue.HttpMethod; import com.palantir.dialogue.Request; import com.palantir.dialogue.Response; import com.palantir.dialogue.TestResponse; @@ -74,6 +75,10 @@ public static Builder builder() { @Override public ListenableFuture execute(Endpoint endpoint, Request request) { + if (endpoint.httpMethod() == HttpMethod.OPTIONS) { + return Futures.immediateFuture(new TestResponse().code(204)); + } + Meter perEndpointRequests = MetricNames.requestMeter(simulation.taggedMetrics(), serverName, endpoint); activeRequests.inc(); From 466154980e74106b158b83c8fe70054da6f120f8 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Wed, 27 May 2020 00:58:17 +0100 Subject: [PATCH 04/35] Update simulations --- ..._nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ..._reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ..._big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...ach_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- simulation/src/test/resources/report.md | 14 +++++++------- ...ate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ..._nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ..._reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ..._big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...ach_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...ate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- 15 files changed, 28 insertions(+), 28 deletions(-) diff --git a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 870cd83ac..3879dff8e 100644 --- a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5acd44d3f56e2a84023c8cda0f6f1da0e0848dd9c3b2b7bd0df936f6166cd27b -size 113221 +oid sha256:fa2908b6dde2dff46cd78175ea4fbf3a87b66baa0e29a495b3b6fe0daf987ef9 +size 116789 diff --git a/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 4f58c5438..94064ea2d 100644 --- a/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:301c818a4e266379a6c45dee81d725d1dad6f3c8d2abba49d38a1df8e68ac7ee -size 103151 +oid sha256:7252511a753523b3869eea5007f48298746fab76bf097cc512767c9bb1699fc6 +size 103509 diff --git a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 24e0252de..1c2789624 100644 --- a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e925ad02dd69d9f2a5ef8b47ad43aabd5a14106ce0937a73e9780a9a11f3c435 -size 123045 +oid sha256:2aa3973eac6964c1b10c22e034cb052e97dc6e22112d1e1902619a412b9a4fb3 +size 123784 diff --git a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png index a7e4baf1c..e8365be98 100644 --- a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2863197850e4d4573230ae2b4420e31baadf10421546cba011a3d271ca63834c -size 78310 +oid sha256:26ff458ff960fbfc6f8d3801799b295a0bd382400583c561c64ab170ab922f8d +size 82170 diff --git a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png index aa3d69e55..b4a658b44 100644 --- a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d492690bce9b84925c16675a75ff55af3cfb180b8b0f753c1fd036f317c2801c -size 137045 +oid sha256:ab06d72114810dbf8530e882ca71c3384cc9b3ff13a198b5ef8f9d8647c38b54 +size 135051 diff --git a/simulation/src/test/resources/report.md b/simulation/src/test/resources/report.md index 892475d24..5b5f46f6a 100644 --- a/simulation/src/test/resources/report.md +++ b/simulation/src/test/resources/report.md @@ -2,10 +2,10 @@ ``` all_nodes_500[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=79.3% client_mean=PT5.81342S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1586, 500=414} - all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=73.7% client_mean=PT3.039455S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1474, 500=526} + all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=73.3% client_mean=PT2.61872S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1465, 500=535} all_nodes_500[UNLIMITED_ROUND_ROBIN].txt: success=50.0% client_mean=PT0.6S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1000, 500=1000} black_hole[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=59.2% client_mean=PT0.600655114S server_cpu=PT11M49.8S client_received=1183/2000 server_resps=1183 codes={200=1183} - black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=91.5% client_mean=PT0.6S server_cpu=PT18M17.4S client_received=1829/2000 server_resps=1829 codes={200=1829} + black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=91.4% client_mean=PT0.6S server_cpu=PT18M16.8S client_received=1828/2000 server_resps=1828 codes={200=1828} black_hole[UNLIMITED_ROUND_ROBIN].txt: success=91.4% client_mean=PT0.6S server_cpu=PT18M16.8S client_received=1828/2000 server_resps=1828 codes={200=1828} drastic_slowdown[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT2.947028083S server_cpu=PT41M8.862333314S client_received=4000/4000 server_resps=4000 codes={200=4000} drastic_slowdown[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.251969999S server_cpu=PT16M47.879999984S client_received=4000/4000 server_resps=4000 codes={200=4000} @@ -17,16 +17,16 @@ fast_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.120107163S server_cpu=PT1H30M0.0000004S client_received=45000/45000 server_resps=45040 codes={200=45000} fast_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.120107163S server_cpu=PT1H30M0.0000004S client_received=45000/45000 server_resps=45040 codes={200=45000} live_reloading[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=94.3% client_mean=PT6.987708S server_cpu=PT1H56M8.59S client_received=2500/2500 server_resps=2500 codes={200=2357, 500=143} - live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=84.0% client_mean=PT4.6409416S server_cpu=PT1H53M13.43S client_received=2500/2500 server_resps=2500 codes={200=2101, 500=399} + live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=83.8% client_mean=PT4.53S server_cpu=PT1H53M23.21S client_received=2500/2500 server_resps=2500 codes={200=2096, 500=404} live_reloading[UNLIMITED_ROUND_ROBIN].txt: success=86.9% client_mean=PT2.802124S server_cpu=PT1H56M45.31S client_received=2500/2500 server_resps=2500 codes={200=2173, 500=327} one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT2.569766928S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} - one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT1.521339525S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} + one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT1.516129859S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} one_big_spike[UNLIMITED_ROUND_ROBIN].txt: success=99.9% client_mean=PT1.000523583S server_cpu=PT7M37.35S client_received=1000/1000 server_resps=3049 codes={200=999, 429=1} one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=64.9% client_mean=PT8.1950896S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1623, 500=877} - one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=65.4% client_mean=PT4.0117872S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1636, 500=864} + one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=67.1% client_mean=PT2.6955136S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1677, 500=823} one_endpoint_dies_on_each_server[UNLIMITED_ROUND_ROBIN].txt: success=65.1% client_mean=PT0.6S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1628, 500=872} server_side_rate_limits[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT12M44.762591546S server_cpu=PT9H55M40.4S client_received=150000/150000 server_resps=178702 codes={200=149989, 429=11} - server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.290099498S server_cpu=PT9H9M39.2S client_received=150000/150000 server_resps=164896 codes={200=150000} + server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.264165097S server_cpu=PT9H4M39S client_received=150000/150000 server_resps=163395 codes={200=150000} server_side_rate_limits[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.217416858S server_cpu=PT8H45M29.2S client_received=150000/150000 server_resps=157646 codes={200=150000} simplest_possible_case[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT0.834469696S server_cpu=PT3H3M35S client_received=13200/13200 server_resps=13200 codes={200=13200} simplest_possible_case[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.785757575S server_cpu=PT2H52M52S client_received=13200/13200 server_resps=13200 codes={200=13200} @@ -35,7 +35,7 @@ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: succe slow_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.088828705S server_cpu=PT4M22.364666657S client_received=3000/3000 server_resps=3031 codes={200=3000} slow_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.088828705S server_cpu=PT4M22.364666657S client_received=3000/3000 server_resps=3031 codes={200=3000} slowdown_and_error_thresholds[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT2M28.388896928S server_cpu=PT8H48M18.546665835S client_received=10000/10000 server_resps=10899 codes={200=10000} - slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT3M24.308654421S server_cpu=PT14H3M44.353333281S client_received=10000/10000 server_resps=13383 codes={200=9999, 500=1} + slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT3M20.776800234S server_cpu=PT14H10M8.413333255S client_received=10000/10000 server_resps=13515 codes={200=9998, 500=2} slowdown_and_error_thresholds[UNLIMITED_ROUND_ROBIN].txt: success=3.6% client_mean=PT21.551691121S server_cpu=PT54H45M57.899999949S client_received=10000/10000 server_resps=49335 codes={200=360, 500=9640} uncommon_flakes[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=99.0% client_mean=PT0.000001S server_cpu=PT0.01S client_received=10000/10000 server_resps=10000 codes={200=9900, 500=100} uncommon_flakes[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=99.0% client_mean=PT0.000001S server_cpu=PT0.01S client_received=10000/10000 server_resps=10000 codes={200=9900, 500=100} diff --git a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 3c1b66b6a..e77c67847 100644 --- a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33cfa2b188b84d30e139b5a8ff39a646f65588b5cf7b5755a2a61993c9d38c8e -size 606983 +oid sha256:8f0cb8ff527ff2327b12b0298c073ea40037f57fb08177d263d7a1c07a04bf96 +size 587465 diff --git a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png index f5dcc152c..374579fc5 100644 --- a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:160c566ff4a67cdd8ecddbb91c2262d1310a4925728dadb36ff81bf9a9237875 -size 120294 +oid sha256:76e04d90b781eade0dc4a333c8d5451cbfb9441e7c241117515e89bbb7c446f9 +size 123285 diff --git a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index a0433ed68..a99adba40 100644 --- a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=73.7% client_mean=PT3.039455S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1474, 500=526} \ No newline at end of file +success=73.3% client_mean=PT2.61872S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1465, 500=535} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 5b70ecd5f..5c1da373e 100644 --- a/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=91.5% client_mean=PT0.6S server_cpu=PT18M17.4S client_received=1829/2000 server_resps=1829 codes={200=1829} \ No newline at end of file +success=91.4% client_mean=PT0.6S server_cpu=PT18M16.8S client_received=1828/2000 server_resps=1828 codes={200=1828} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 46b26cdea..be188c732 100644 --- a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=84.0% client_mean=PT4.6409416S server_cpu=PT1H53M13.43S client_received=2500/2500 server_resps=2500 codes={200=2101, 500=399} \ No newline at end of file +success=83.8% client_mean=PT4.53S server_cpu=PT1H53M23.21S client_received=2500/2500 server_resps=2500 codes={200=2096, 500=404} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index b71e29339..09b358e6b 100644 --- a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT1.521339525S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} \ No newline at end of file +success=100.0% client_mean=PT1.516129859S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 561f056b7..0f2287fc3 100644 --- a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=65.4% client_mean=PT4.0117872S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1636, 500=864} \ No newline at end of file +success=67.1% client_mean=PT2.6955136S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1677, 500=823} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index ffdfaca53..e71d95262 100644 --- a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT0.290099498S server_cpu=PT9H9M39.2S client_received=150000/150000 server_resps=164896 codes={200=150000} \ No newline at end of file +success=100.0% client_mean=PT0.264165097S server_cpu=PT9H4M39S client_received=150000/150000 server_resps=163395 codes={200=150000} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index d250cf150..fb9afe184 100644 --- a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT3M24.308654421S server_cpu=PT14H3M44.353333281S client_received=10000/10000 server_resps=13383 codes={200=9999, 500=1} \ No newline at end of file +success=100.0% client_mean=PT3M20.776800234S server_cpu=PT14H10M8.413333255S client_received=10000/10000 server_resps=13515 codes={200=9998, 500=2} \ No newline at end of file From 17c248e52bc076d89cbaeccfbc57e2b319576ea2 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Wed, 27 May 2020 01:13:20 +0100 Subject: [PATCH 05/35] Updatee tests for rtt sampling --- .../BalancedNodeSelectionStrategyChannelTest.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 9086c290c..1e985a68e 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -76,8 +76,8 @@ void when_one_channel_is_in_use_prefer_the_other() { for (int i = 0; i < 200; i++) { channel.maybeExecute(endpoint, request); } - verify(chan1, times(199)).maybeExecute(any(), any()); - verify(chan2, times(1)).maybeExecute(any(), any()); + verify(chan1, times(398)).maybeExecute(any(), any()); + verify(chan2, times(2)).maybeExecute(any(), any()); } @Test @@ -88,8 +88,8 @@ void when_both_channels_are_free_we_get_roughly_fair_tiebreaking() { for (int i = 0; i < 200; i++) { channel.maybeExecute(endpoint, request); } - verify(chan1, times(99)).maybeExecute(any(), any()); - verify(chan2, times(101)).maybeExecute(any(), any()); + verify(chan1, times(198)).maybeExecute(any(), any()); + verify(chan2, times(202)).maybeExecute(any(), any()); } @Test @@ -116,8 +116,8 @@ void a_single_4xx_doesnt_move_the_needle() { .containsExactly(0, 0); } - verify(chan1, times(99)).maybeExecute(any(), any()); - verify(chan2, times(101)).maybeExecute(any(), any()); + verify(chan1, times(198)).maybeExecute(any(), any()); + verify(chan2, times(202)).maybeExecute(any(), any()); } @Test From a5e7531b7098b9ed49cbd44a0f58f3904e282423 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Wed, 27 May 2020 16:26:22 +0100 Subject: [PATCH 06/35] Accumulate RTT average --- .../BalancedNodeSelectionStrategyChannel.java | 42 ++++++++++++++----- ...ancedNodeSelectionStrategyChannelTest.java | 13 ++++++ 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 231ac9529..85507e06f 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -44,6 +44,7 @@ import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; +import javax.annotation.concurrent.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -142,8 +143,7 @@ private static final class MutableChannelWithStats implements LimitedChannel { private final PerHostObservability observability; private final AtomicInteger inflight = new AtomicInteger(0); - private volatile long rttNanos = 0; - // private volatile long nextRttMeasurement = 0; + private final RoundTripTimeMeasurement rtt = new RoundTripTimeMeasurement(); /** * We keep track of failures within a time window to do well in scenarios where an unhealthy server returns @@ -202,17 +202,14 @@ public Optional> maybeExecute(Endpoint endpoint, Requ private void maybeSampleRtt() { long initialNanos = clock.read(); - // if (initialNanos < nextRttMeasurement) { - // return; - // } Optional> maybe = delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST); if (maybe.isPresent()) { - // nextRttMeasurement = initialNanos + TimeUnit.SECONDS.toNanos(1); DialogueFutures.addDirectListener(maybe.get(), () -> { - rttNanos = clock.read() - initialNanos; + long rttNanos = clock.read() - initialNanos; + rtt.update(rttNanos); if (log.isInfoEnabled()) { - log.info("rttNanos {}", SafeArg.of("rtt", rttNanos), UnsafeArg.of("channel", delegate)); + log.info("rttNanos {} {}", SafeArg.of("rtt", rttNanos), UnsafeArg.of("channel", delegate)); } }); } @@ -227,7 +224,7 @@ SortableChannel computeScore() { int score = requestsInflight + Ints.saturatedCast(Math.round(failureReservoir)); observability.debugLogComputedScore(requestsInflight, failureReservoir, score); - return new SortableChannel(score, rttNanos, this); + return new SortableChannel(score, rtt.getNanos(), this); } @Override @@ -259,7 +256,7 @@ int getScore() { return score; } - public long getRtt() { + long getRtt() { return rtt; } @@ -404,4 +401,29 @@ public String version() { return "0.0.0"; } } + + @VisibleForTesting + @ThreadSafe + static class RoundTripTimeMeasurement { + + // TODO(dfox): switch to some exponentially decaying thingy? + // TODO(dfox): can we exclude outlier measurements that probably include TLS handshakes? + private volatile float rttNanos = 0; + private volatile long numSamples = 0; + + long getNanos() { + return (long) rttNanos; + } + + synchronized void update(long newMeasurement) { + float denominator = (float) numSamples / (numSamples + 1); + this.rttNanos = rttNanos * denominator + (float) newMeasurement / (numSamples + 1); + this.numSamples = numSamples + 1; + } + + @Override + public String toString() { + return "RoundTripTimeMeasurement{" + "rttNanos=" + rttNanos + ", numSamples=" + numSamples + '}'; + } + } } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 1e985a68e..977ffe2dc 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -146,6 +146,19 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu .containsExactly(0, 0); } + @Test + void rtt_accumulates_avg_nicely() { + BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement rtt = + new BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement(); + rtt.update(1); + rtt.update(2); + rtt.update(3); + assertThat(rtt.getNanos()).isEqualTo(2); + + rtt.update(500); + assertThat(rtt.getNanos()).isEqualTo(126); + } + private static void set200(LimitedChannel chan) { when(chan.maybeExecute(any(), any())).thenReturn(http(200)); } From e398af0b7afe071ffb5f1d2a42dc2c5d55d4a510 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Wed, 27 May 2020 18:02:55 +0100 Subject: [PATCH 07/35] Sample all channels, with two sequential calls --- .../BalancedNodeSelectionStrategyChannel.java | 85 ++++++++++++++----- ...ancedNodeSelectionStrategyChannelTest.java | 8 +- 2 files changed, 70 insertions(+), 23 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 85507e06f..b96705d94 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -22,7 +22,9 @@ import com.google.common.collect.ImmutableList; import com.google.common.primitives.Ints; import com.google.common.util.concurrent.FutureCallback; +import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.MoreExecutors; import com.palantir.dialogue.Endpoint; import com.palantir.dialogue.HttpMethod; import com.palantir.dialogue.Request; @@ -42,7 +44,9 @@ import java.util.Map; import java.util.Optional; import java.util.Random; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; import java.util.stream.IntStream; import javax.annotation.concurrent.ThreadSafe; import org.slf4j.Logger; @@ -103,6 +107,7 @@ public Optional> maybeExecute(Endpoint endpoint, Requ for (SortableChannel channel : sortedChannels) { Optional> maybe = channel.delegate.maybeExecute(endpoint, request); if (maybe.isPresent()) { + sampleRttsForAllChannels(); return maybe; } } @@ -110,6 +115,66 @@ public Optional> maybeExecute(Endpoint endpoint, Requ return Optional.empty(); } + private void sampleRttsForAllChannels() { + // TODO(dfox): don't do this 100% of the time + // if (random.nextDouble() > 0.01f) { + // return; + // } + + List> results = new ArrayList<>(); + for (MutableChannelWithStats channel : channels) { + long attempt1Before = clock.read(); + + Optional> attempt1 = + channel.delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST); + if (!attempt1.isPresent()) { + results.add(Futures.immediateFuture(Long.MAX_VALUE)); + continue; + } + + ListenableFuture attempt1Duration = Futures.transform( + attempt1.get(), response1 -> clock.read() - attempt1Before, MoreExecutors.directExecutor()); + + results.add(Futures.transformAsync( + attempt1Duration, + attempt1Nanos -> { + long attempt2Before = clock.read(); + Optional> attempt2 = + channel.delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST); + if (!attempt2.isPresent()) { + return Futures.immediateFuture(Long.MAX_VALUE); // signifies we couldn't get two samples + } + + return Futures.transform( + attempt2.get(), + response2 -> { + long attempt2Nanos = clock.read() - attempt2Before; + // taking min of two attempts to exclude samples that incurred a TLS handshake + long newMeasurement = Math.min(attempt1Nanos, attempt2Nanos); + channel.rtt.addMeasurement(newMeasurement); + return newMeasurement; + }, + MoreExecutors.directExecutor()); + }, + MoreExecutors.directExecutor())); + } + if (log.isInfoEnabled()) { + DialogueFutures.addDirectCallback(Futures.allAsList(results), new FutureCallback>() { + @Override + public void onSuccess(List result) { + List millis = + result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList()); + log.info("RTTs {} {}", SafeArg.of("nanos", result), SafeArg.of("millis", millis)); + } + + @Override + public void onFailure(Throwable throwable) { + log.info("Throwable", throwable); + } + }); + } + } + private static SortableChannel[] sortByScore(List preShuffled) { SortableChannel[] sorted = new SortableChannel[preShuffled.size()]; for (int i = 0; i < preShuffled.size(); i++) { @@ -139,7 +204,6 @@ public String toString() { private static final class MutableChannelWithStats implements LimitedChannel { private final LimitedChannel delegate; private final FutureCallback updateStats; - private final Ticker clock; private final PerHostObservability observability; private final AtomicInteger inflight = new AtomicInteger(0); @@ -155,7 +219,6 @@ private static final class MutableChannelWithStats implements LimitedChannel { MutableChannelWithStats(LimitedChannel delegate, Ticker clock, PerHostObservability observability) { this.delegate = delegate; this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY); - this.clock = clock; this.observability = observability; // Saves one allocation on each network call this.updateStats = new FutureCallback() { @@ -196,25 +259,9 @@ public Optional> maybeExecute(Endpoint endpoint, Requ DialogueFutures.addDirectCallback(maybe.get(), updateStats); observability.markRequestMade(); - maybeSampleRtt(); return maybe; } - private void maybeSampleRtt() { - long initialNanos = clock.read(); - - Optional> maybe = delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST); - if (maybe.isPresent()) { - DialogueFutures.addDirectListener(maybe.get(), () -> { - long rttNanos = clock.read() - initialNanos; - rtt.update(rttNanos); - if (log.isInfoEnabled()) { - log.info("rttNanos {} {}", SafeArg.of("rtt", rttNanos), UnsafeArg.of("channel", delegate)); - } - }); - } - } - SortableChannel computeScore() { int requestsInflight = inflight.get(); double failureReservoir = recentFailuresReservoir.get(); @@ -415,7 +462,7 @@ long getNanos() { return (long) rttNanos; } - synchronized void update(long newMeasurement) { + synchronized void addMeasurement(long newMeasurement) { float denominator = (float) numSamples / (numSamples + 1); this.rttNanos = rttNanos * denominator + (float) newMeasurement / (numSamples + 1); this.numSamples = numSamples + 1; diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 977ffe2dc..5fcc2c8ae 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -150,12 +150,12 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu void rtt_accumulates_avg_nicely() { BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement rtt = new BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement(); - rtt.update(1); - rtt.update(2); - rtt.update(3); + rtt.addMeasurement(1); + rtt.addMeasurement(2); + rtt.addMeasurement(3); assertThat(rtt.getNanos()).isEqualTo(2); - rtt.update(500); + rtt.addMeasurement(500); assertThat(rtt.getNanos()).isEqualTo(126); } From 0701a64e4e20e3224aa74f1ebb2bf6baf7cc6b15 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Wed, 27 May 2020 18:35:31 +0100 Subject: [PATCH 08/35] Also log accumulated ones --- .../core/BalancedNodeSelectionStrategyChannel.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index b96705d94..387994ede 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -164,7 +164,13 @@ private void sampleRttsForAllChannels() { public void onSuccess(List result) { List millis = result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList()); - log.info("RTTs {} {}", SafeArg.of("nanos", result), SafeArg.of("millis", millis)); + List accumulated = + channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList()); + log.info( + "RTTs {} {}", + SafeArg.of("nanos", result), + SafeArg.of("millis", millis), + SafeArg.of("accumulated", accumulated)); } @Override From bef11a4bfc9d6338cde8aa08d8bcb1f455e517e0 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Wed, 27 May 2020 23:36:44 +0100 Subject: [PATCH 09/35] Just store the min --- .../BalancedNodeSelectionStrategyChannel.java | 66 +++++++------------ ...ancedNodeSelectionStrategyChannelTest.java | 13 ++-- 2 files changed, 30 insertions(+), 49 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 387994ede..65de96f17 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -46,6 +46,7 @@ import java.util.Random; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; import java.util.stream.IntStream; import javax.annotation.concurrent.ThreadSafe; @@ -123,40 +124,22 @@ private void sampleRttsForAllChannels() { List> results = new ArrayList<>(); for (MutableChannelWithStats channel : channels) { - long attempt1Before = clock.read(); + long before = clock.read(); - Optional> attempt1 = + Optional> future = channel.delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST); - if (!attempt1.isPresent()) { + if (future.isPresent()) { + results.add(Futures.transform( + future.get(), + _response -> { + long durationNanos = clock.read() - before; + channel.rtt.addMeasurement(durationNanos); + return durationNanos; + }, + MoreExecutors.directExecutor())); + } else { results.add(Futures.immediateFuture(Long.MAX_VALUE)); - continue; } - - ListenableFuture attempt1Duration = Futures.transform( - attempt1.get(), response1 -> clock.read() - attempt1Before, MoreExecutors.directExecutor()); - - results.add(Futures.transformAsync( - attempt1Duration, - attempt1Nanos -> { - long attempt2Before = clock.read(); - Optional> attempt2 = - channel.delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST); - if (!attempt2.isPresent()) { - return Futures.immediateFuture(Long.MAX_VALUE); // signifies we couldn't get two samples - } - - return Futures.transform( - attempt2.get(), - response2 -> { - long attempt2Nanos = clock.read() - attempt2Before; - // taking min of two attempts to exclude samples that incurred a TLS handshake - long newMeasurement = Math.min(attempt1Nanos, attempt2Nanos); - channel.rtt.addMeasurement(newMeasurement); - return newMeasurement; - }, - MoreExecutors.directExecutor()); - }, - MoreExecutors.directExecutor())); } if (log.isInfoEnabled()) { DialogueFutures.addDirectCallback(Futures.allAsList(results), new FutureCallback>() { @@ -164,18 +147,18 @@ private void sampleRttsForAllChannels() { public void onSuccess(List result) { List millis = result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList()); - List accumulated = + List best = channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList()); log.info( - "RTTs {} {}", + "RTTs {} {} {}", SafeArg.of("nanos", result), SafeArg.of("millis", millis), - SafeArg.of("accumulated", accumulated)); + SafeArg.of("best", best)); } @Override public void onFailure(Throwable throwable) { - log.info("Throwable", throwable); + log.info("Failed to sample RTT for channels", throwable); } }); } @@ -460,23 +443,20 @@ public String version() { static class RoundTripTimeMeasurement { // TODO(dfox): switch to some exponentially decaying thingy? - // TODO(dfox): can we exclude outlier measurements that probably include TLS handshakes? - private volatile float rttNanos = 0; - private volatile long numSamples = 0; + private final AtomicLong rttNanos = new AtomicLong(Long.MAX_VALUE); long getNanos() { - return (long) rttNanos; + return rttNanos.get(); } - synchronized void addMeasurement(long newMeasurement) { - float denominator = (float) numSamples / (numSamples + 1); - this.rttNanos = rttNanos * denominator + (float) newMeasurement / (numSamples + 1); - this.numSamples = numSamples + 1; + void addMeasurement(long newMeasurement) { + // Only stores the *minimum* values, so that we exclude slow calls that might include TLS handshakes. + rttNanos.getAndUpdate(existing -> Math.min(existing, newMeasurement)); } @Override public String toString() { - return "RoundTripTimeMeasurement{" + "rttNanos=" + rttNanos + ", numSamples=" + numSamples + '}'; + return "RoundTripTimeMeasurement{" + "rttNanos=" + rttNanos + '}'; } } } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 5fcc2c8ae..f54b58f60 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -76,8 +76,8 @@ void when_one_channel_is_in_use_prefer_the_other() { for (int i = 0; i < 200; i++) { channel.maybeExecute(endpoint, request); } - verify(chan1, times(398)).maybeExecute(any(), any()); - verify(chan2, times(2)).maybeExecute(any(), any()); + verify(chan1, times(399)).maybeExecute(any(), any()); + verify(chan2, times(201)).maybeExecute(any(), any()); } @Test @@ -147,16 +147,17 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu } @Test - void rtt_accumulates_avg_nicely() { + void rtt_just_remembers_the_min() { BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement rtt = new BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement(); + rtt.addMeasurement(3); + assertThat(rtt.getNanos()).isEqualTo(3); rtt.addMeasurement(1); rtt.addMeasurement(2); - rtt.addMeasurement(3); - assertThat(rtt.getNanos()).isEqualTo(2); + assertThat(rtt.getNanos()).isEqualTo(1); rtt.addMeasurement(500); - assertThat(rtt.getNanos()).isEqualTo(126); + assertThat(rtt.getNanos()).isEqualTo(1); } private static void set200(LimitedChannel chan) { From 2284f62794ef18047857b61ceccedaf89aa14792 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 00:49:43 +0100 Subject: [PATCH 10/35] Compute score based on the range of observed rtts --- .../BalancedNodeSelectionStrategyChannel.java | 121 ++++++++++-------- ...ancedNodeSelectionStrategyChannelTest.java | 17 +-- 2 files changed, 78 insertions(+), 60 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 65de96f17..4ba468fd9 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -65,12 +65,21 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { private static final Logger log = LoggerFactory.getLogger(BalancedNodeSelectionStrategyChannel.class); private static final Request BLANK_REQUEST = Request.builder().build(); + private static final Comparator BY_SCORE = Comparator.comparingInt(ChannelStats::getScore); - private static final Comparator BY_SCORE = - Comparator.comparingInt(SortableChannel::getScore).thenComparingLong(SortableChannel::getRtt); private static final Duration FAILURE_MEMORY = Duration.ofSeconds(30); private static final double FAILURE_WEIGHT = 10; + /** + * This determines how 'sticky' we are to our 'nearest' node (as measured by RTT). If this is set too high, then we + * may deliver suboptimal perf by sending all requests to a slow node that is physically nearby, when there's + * actually a faster one further away. + * If this is too low, then we may prematurely spill across AZs and pay the $ cost. Should probably + * keep this lower than {@link #FAILURE_WEIGHT} to ensure that a single 5xx makes a nearby node less attractive + * than a faraway node that exhibited zero failures. + */ + private static final double RTT_WEIGHT = 3; + private final ImmutableList channels; private final Random random; private final Ticker clock; @@ -103,9 +112,9 @@ public Optional> maybeExecute(Endpoint endpoint, Requ // TODO(dfox): P2C optimization when we have high number of nodes to save CPU? // http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf - SortableChannel[] sortedChannels = sortByScore(preShuffled); + ChannelStats[] sortedChannels = sortByScore(preShuffled); - for (SortableChannel channel : sortedChannels) { + for (ChannelStats channel : sortedChannels) { Optional> maybe = channel.delegate.maybeExecute(endpoint, request); if (maybe.isPresent()) { sampleRttsForAllChannels(); @@ -141,7 +150,7 @@ private void sampleRttsForAllChannels() { results.add(Futures.immediateFuture(Long.MAX_VALUE)); } } - if (log.isInfoEnabled()) { + if (log.isDebugEnabled()) { DialogueFutures.addDirectCallback(Futures.allAsList(results), new FutureCallback>() { @Override public void onSuccess(List result) { @@ -149,7 +158,7 @@ public void onSuccess(List result) { result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList()); List best = channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList()); - log.info( + log.debug( "RTTs {} {} {}", SafeArg.of("nanos", result), SafeArg.of("millis", millis), @@ -164,13 +173,36 @@ public void onFailure(Throwable throwable) { } } - private static SortableChannel[] sortByScore(List preShuffled) { - SortableChannel[] sorted = new SortableChannel[preShuffled.size()]; + private static ChannelStats[] sortByScore(List preShuffled) { + ChannelStats[] snapshotArray = new ChannelStats[preShuffled.size()]; + + long bestRttNanos = Long.MAX_VALUE; + long worstRttNanos = 0; for (int i = 0; i < preShuffled.size(); i++) { - sorted[i] = preShuffled.get(i).computeScore(); + ChannelStats snapshot = preShuffled.get(i).getSnapshot(); + snapshotArray[i] = snapshot; + + if (snapshot.rttNanos != Long.MAX_VALUE) { + bestRttNanos = Math.min(bestRttNanos, snapshot.rttNanos); + worstRttNanos = Math.max(worstRttNanos, snapshot.rttNanos); + } + } + + // Latency (rtt) is measured in nanos, which is a tricky unit to include in our 'score' because adding it + // would dominate all the other data (which has the unit of 'num requests'). To avoid the need for a conversion + // fudge-factor, we instead figure out where each rtt lies on the spectrum from bestRttNanos to worstRttNanos, + // with 0 being best and 1 being worst. This ensures that if several nodes are all within the same AZ and + // can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have a similar rttScore (near zero). + // Note, this can only be computed when we have all the snapshots in front of us. + if (bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0) { + long rttRange = worstRttNanos - bestRttNanos; + for (ChannelStats snapshot : snapshotArray) { + snapshot.rttSpectrum = ((float) snapshot.rttNanos - bestRttNanos) / rttRange; + } } - Arrays.sort(sorted, BY_SCORE); - return sorted; + + Arrays.sort(snapshotArray, BY_SCORE); + return snapshotArray; } /** Returns a new shuffled list, without mutating the input list (which may be immutable). */ @@ -182,7 +214,7 @@ private static List shuffleImmutableList(ImmutableList sourceList, Ran @VisibleForTesting IntStream getScores() { - return channels.stream().mapToInt(c -> c.computeScore().score); + return channels.stream().mapToInt(c -> c.getSnapshot().getScore()); } @Override @@ -251,22 +283,14 @@ public Optional> maybeExecute(Endpoint endpoint, Requ return maybe; } - SortableChannel computeScore() { - int requestsInflight = inflight.get(); - double failureReservoir = recentFailuresReservoir.get(); - - // it's important that scores are integers because if we kept the full double precision, then a single 4xx - // would end up influencing host selection long beyond its intended lifespan in the absence of other data. - int score = requestsInflight + Ints.saturatedCast(Math.round(failureReservoir)); - - observability.debugLogComputedScore(requestsInflight, failureReservoir, score); - return new SortableChannel(score, rtt.getNanos(), this); + ChannelStats getSnapshot() { + return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getNanos()); } @Override public String toString() { - return "MutableChannelWithStats{score=" + computeScore().score - + ", inflight=" + inflight + return "MutableChannelWithStats{" + + "inflight=" + inflight + ", recentFailures=" + recentFailuresReservoir + ", delegate=" + delegate + '}'; @@ -274,31 +298,35 @@ public String toString() { } /** - * A dedicated immutable class ensures safe sorting, as otherwise there's a risk that the inflight AtomicInteger + * A dedicated value class ensures safe sorting, as otherwise there's a risk that the inflight AtomicInteger * might change mid-sort, leading to undefined behaviour. */ - private static final class SortableChannel { - private final int score; - private final long rtt; + private static final class ChannelStats { private final MutableChannelWithStats delegate; + private final int requestsInflight; + private final double recentBadness; - SortableChannel(int score, long rtt, MutableChannelWithStats delegate) { - this.score = score; - this.rtt = rtt; + private final long rttNanos; + private float rttSpectrum = 0; // assigned later once we've figured out the range of rttNanos across channels + + ChannelStats(MutableChannelWithStats delegate, int requestsInflight, double recentBadness, long rttNanos) { this.delegate = delegate; + this.requestsInflight = requestsInflight; + this.recentBadness = recentBadness; + this.rttNanos = rttNanos; } int getScore() { - return score; - } - - long getRtt() { - return rtt; + // it's important that scores are integers because if we kept the full double precision, then a single 4xx + // would end up influencing host selection long beyond its intended lifespan in the absence of other data. + return requestsInflight + + Ints.saturatedCast(Math.round(recentBadness)) + + Ints.saturatedCast(Math.round(rttSpectrum * RTT_WEIGHT)); } @Override public String toString() { - return "SortableChannel{" + "score=" + score + ", rtt=" + rtt + ", delegate=" + delegate + '}'; + return "SortableChannel{" + getScore() + '}'; } } @@ -320,7 +348,7 @@ private static void registerGauges( DialogueInternalWeakReducingGauge.getOrCreate( taggedMetrics, metricName, - c -> c.computeScore().getScore(), + c -> c.getSnapshot().getScore(), longStream -> { long[] longs = longStream.toArray(); if (log.isInfoEnabled() && longs.length > 1) { @@ -369,18 +397,6 @@ void debugLogStatusFailure(Response response) { } } - void debugLogComputedScore(int inflight, double failures, int score) { - if (log.isDebugEnabled()) { - log.debug( - "Computed score", - channelName, - hostIndex, - SafeArg.of("score", score), - SafeArg.of("inflight", inflight), - SafeArg.of("failures", failures)); - } - } - static PerHostObservability create( ImmutableList channels, TaggedMetricRegistry taggedMetrics, @@ -411,7 +427,8 @@ public void markRequestMade() { } } - private enum RttEndpoint implements Endpoint { + @VisibleForTesting + enum RttEndpoint implements Endpoint { INSTANCE; @Override @@ -442,7 +459,7 @@ public String version() { @ThreadSafe static class RoundTripTimeMeasurement { - // TODO(dfox): switch to some exponentially decaying thingy? + // TODO(dfox): switch to some exponentially decaying thingy, so we could detect if network topology changes private final AtomicLong rttNanos = new AtomicLong(Long.MAX_VALUE); long getNanos() { diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index f54b58f60..5833dfaad 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -18,6 +18,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -76,8 +77,8 @@ void when_one_channel_is_in_use_prefer_the_other() { for (int i = 0; i < 200; i++) { channel.maybeExecute(endpoint, request); } - verify(chan1, times(399)).maybeExecute(any(), any()); - verify(chan2, times(201)).maybeExecute(any(), any()); + verify(chan1, times(199)).maybeExecute(eq(endpoint), any()); + verify(chan2, times(1)).maybeExecute(eq(endpoint), any()); } @Test @@ -88,8 +89,8 @@ void when_both_channels_are_free_we_get_roughly_fair_tiebreaking() { for (int i = 0; i < 200; i++) { channel.maybeExecute(endpoint, request); } - verify(chan1, times(198)).maybeExecute(any(), any()); - verify(chan2, times(202)).maybeExecute(any(), any()); + verify(chan1, times(299)).maybeExecute(eq(endpoint), any()); + verify(chan2, times(301)).maybeExecute(eq(endpoint), any()); } @Test @@ -98,8 +99,8 @@ void when_channels_refuse_try_all_then_give_up() { when(chan2.maybeExecute(any(), any())).thenReturn(Optional.empty()); assertThat(channel.maybeExecute(endpoint, request)).isNotPresent(); - verify(chan1, times(1)).maybeExecute(any(), any()); - verify(chan2, times(1)).maybeExecute(any(), any()); + verify(chan1, times(1)).maybeExecute(eq(endpoint), any()); + verify(chan2, times(1)).maybeExecute(eq(endpoint), any()); } @Test @@ -116,8 +117,8 @@ void a_single_4xx_doesnt_move_the_needle() { .containsExactly(0, 0); } - verify(chan1, times(198)).maybeExecute(any(), any()); - verify(chan2, times(202)).maybeExecute(any(), any()); + verify(chan1, times(99)).maybeExecute(eq(endpoint), any()); + verify(chan2, times(101)).maybeExecute(eq(endpoint), any()); } @Test From 961b2404806c308ce48971dd01a7278619e8de96 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 00:53:54 +0100 Subject: [PATCH 11/35] Delete misleading method --- .../core/BalancedNodeSelectionStrategyChannel.java | 8 +++++--- .../BalancedNodeSelectionStrategyChannelTest.java | 14 +++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 4ba468fd9..1f33e0d07 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -49,6 +49,7 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; import java.util.stream.IntStream; +import java.util.stream.Stream; import javax.annotation.concurrent.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -213,8 +214,8 @@ private static List shuffleImmutableList(ImmutableList sourceList, Ran } @VisibleForTesting - IntStream getScores() { - return channels.stream().mapToInt(c -> c.getSnapshot().getScore()); + Stream getScoresForTesting() { + return Arrays.stream(sortByScore(channels)); } @Override @@ -301,7 +302,8 @@ public String toString() { * A dedicated value class ensures safe sorting, as otherwise there's a risk that the inflight AtomicInteger * might change mid-sort, leading to undefined behaviour. */ - private static final class ChannelStats { + @VisibleForTesting + static final class ChannelStats { private final MutableChannelWithStats delegate; private final int requestsInflight; private final double recentBadness; diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 5833dfaad..3a30d32d2 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -89,8 +89,8 @@ void when_both_channels_are_free_we_get_roughly_fair_tiebreaking() { for (int i = 0; i < 200; i++) { channel.maybeExecute(endpoint, request); } - verify(chan1, times(299)).maybeExecute(eq(endpoint), any()); - verify(chan2, times(301)).maybeExecute(eq(endpoint), any()); + verify(chan1, times(99)).maybeExecute(eq(endpoint), any()); + verify(chan2, times(101)).maybeExecute(eq(endpoint), any()); } @Test @@ -112,7 +112,7 @@ void a_single_4xx_doesnt_move_the_needle() { clock.read() < start + Duration.ofSeconds(10).toNanos(); incrementClockBy(Duration.ofMillis(50))) { channel.maybeExecute(endpoint, request); - assertThat(channel.getScores()) + assertThat(channel.getScoresForTesting().map(c -> c.getScore())) .describedAs("A single 400 at the beginning isn't enough to impact scores", channel) .containsExactly(0, 0); } @@ -128,19 +128,19 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu for (int i = 0; i < 11; i++) { channel.maybeExecute(endpoint, request); - assertThat(channel.getScores()) + assertThat(channel.getScoresForTesting().map(c -> c.getScore())) .describedAs("%s %s: Scores not affected yet %s", i, Duration.ofNanos(clock.read()), channel) .containsExactly(0, 0); incrementClockBy(Duration.ofMillis(50)); } channel.maybeExecute(endpoint, request); - assertThat(channel.getScores()) + assertThat(channel.getScoresForTesting().map(c -> c.getScore())) .describedAs("%s: Constant 4xxs did move the needle %s", Duration.ofNanos(clock.read()), channel) - .containsExactly(1, 0); + .containsExactly(0, 1); incrementClockBy(Duration.ofSeconds(5)); - assertThat(channel.getScores()) + assertThat(channel.getScoresForTesting().map(c -> c.getScore())) .describedAs( "%s: We quickly forget about 4xxs and go back to fair shuffling %s", Duration.ofNanos(clock.read()), channel) From 3c2dc94fb874d8cc50bb2cde2cfc52936f02990a Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 01:06:50 +0100 Subject: [PATCH 12/35] Fix unit tests --- .../BalancedNodeSelectionStrategyChannel.java | 13 +++-- ...ancedNodeSelectionStrategyChannelTest.java | 55 +++++++++++++++++++ 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 1f33e0d07..64e5bed44 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -195,8 +195,8 @@ private static ChannelStats[] sortByScore(List preShuff // with 0 being best and 1 being worst. This ensures that if several nodes are all within the same AZ and // can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have a similar rttScore (near zero). // Note, this can only be computed when we have all the snapshots in front of us. - if (bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0) { - long rttRange = worstRttNanos - bestRttNanos; + long rttRange = worstRttNanos - bestRttNanos; + if (bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0 && rttRange > 0) { for (ChannelStats snapshot : snapshotArray) { snapshot.rttSpectrum = ((float) snapshot.rttNanos - bestRttNanos) / rttRange; } @@ -291,9 +291,10 @@ ChannelStats getSnapshot() { @Override public String toString() { return "MutableChannelWithStats{" - + "inflight=" + inflight + + "delegate=" + delegate + + ", inflight=" + inflight + ", recentFailures=" + recentFailuresReservoir - + ", delegate=" + delegate + + ", rtt=" + rtt + '}'; } } @@ -328,7 +329,7 @@ int getScore() { @Override public String toString() { - return "SortableChannel{" + getScore() + '}'; + return "ChannelStats{" + "score=" + getScore() + ", delegate=" + delegate + '}'; } } @@ -475,7 +476,7 @@ void addMeasurement(long newMeasurement) { @Override public String toString() { - return "RoundTripTimeMeasurement{" + "rttNanos=" + rttNanos + '}'; + return "RoundTripTimeMeasurement{" + rttNanos + '}'; } } } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 3a30d32d2..59bbba3ae 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -147,6 +147,61 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu .containsExactly(0, 0); } + @Test + void rtt_is_measured_and_can_influence_choices() { + // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); + when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); + + SettableFuture chan1OptionsResponse = SettableFuture.create(); + SettableFuture chan2OptionsResponse = SettableFuture.create(); + BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint = + BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE; + when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan1OptionsResponse)); + when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan2OptionsResponse)); + + channel.maybeExecute(endpoint, request); + + incrementClockBy(Duration.ofNanos(123)); + chan1OptionsResponse.set(new TestResponse().code(200)); + + incrementClockBy(Duration.ofNanos(456)); + chan2OptionsResponse.set(new TestResponse().code(200)); + + assertThat(channel.getScoresForTesting().map(c -> c.getScore())) + .describedAs("The poor latency of channel2 imposes a small constant penalty in the score") + .containsExactly(0, 3); + } + + @Test + void when_rtt_measurements_are_limited_dont_freak_out() { + // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); + when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); + + BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint = + BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE; + when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty()); + when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty()); + + channel.maybeExecute(endpoint, request); + + assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0); + } + + @Test + void when_rtt_measurements_havent_returned_yet_dont_freak_out() { + // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); + when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); + + BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint = + BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE; + when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create())); + when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create())); + + channel.maybeExecute(endpoint, request); + + assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0); + } + @Test void rtt_just_remembers_the_min() { BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement rtt = From 9514f912fe9fdecb59d42b49a9f58f5ef4c3b618 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 01:08:55 +0100 Subject: [PATCH 13/35] Update simulations --- ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...ne_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- simulation/src/test/resources/report.md | 12 ++++++------ ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...ne_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- 13 files changed, 24 insertions(+), 24 deletions(-) diff --git a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 3879dff8e..dcbdc4b08 100644 --- a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa2908b6dde2dff46cd78175ea4fbf3a87b66baa0e29a495b3b6fe0daf987ef9 -size 116789 +oid sha256:1cefd6e18f6003d8146aa97dcfef902d00185395ae1f8d162c2482dacfd57d81 +size 117492 diff --git a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 1c2789624..a011807a8 100644 --- a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2aa3973eac6964c1b10c22e034cb052e97dc6e22112d1e1902619a412b9a4fb3 -size 123784 +oid sha256:32de7fbba94d7676ad359f3228e55337b61088fcb4102fe743e9105581c94280 +size 123659 diff --git a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png index e8365be98..f6d0b4252 100644 --- a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26ff458ff960fbfc6f8d3801799b295a0bd382400583c561c64ab170ab922f8d -size 82170 +oid sha256:dd0ba5b93d7672d48abf2aef421884968bac702a9c760e159990ef4b3c6ac7a9 +size 82748 diff --git a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png index b4a658b44..459bfa31b 100644 --- a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab06d72114810dbf8530e882ca71c3384cc9b3ff13a198b5ef8f9d8647c38b54 -size 135051 +oid sha256:1836375d4ea7584287820c4b7658ff7b62c990ffac3cf1abe5657b6746651463 +size 135128 diff --git a/simulation/src/test/resources/report.md b/simulation/src/test/resources/report.md index 5b5f46f6a..61cf72fe9 100644 --- a/simulation/src/test/resources/report.md +++ b/simulation/src/test/resources/report.md @@ -2,7 +2,7 @@ ``` all_nodes_500[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=79.3% client_mean=PT5.81342S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1586, 500=414} - all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=73.3% client_mean=PT2.61872S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1465, 500=535} + all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=72.8% client_mean=PT2.57287S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1456, 500=544} all_nodes_500[UNLIMITED_ROUND_ROBIN].txt: success=50.0% client_mean=PT0.6S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1000, 500=1000} black_hole[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=59.2% client_mean=PT0.600655114S server_cpu=PT11M49.8S client_received=1183/2000 server_resps=1183 codes={200=1183} black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=91.4% client_mean=PT0.6S server_cpu=PT18M16.8S client_received=1828/2000 server_resps=1828 codes={200=1828} @@ -17,16 +17,16 @@ fast_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.120107163S server_cpu=PT1H30M0.0000004S client_received=45000/45000 server_resps=45040 codes={200=45000} fast_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.120107163S server_cpu=PT1H30M0.0000004S client_received=45000/45000 server_resps=45040 codes={200=45000} live_reloading[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=94.3% client_mean=PT6.987708S server_cpu=PT1H56M8.59S client_received=2500/2500 server_resps=2500 codes={200=2357, 500=143} - live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=83.8% client_mean=PT4.53S server_cpu=PT1H53M23.21S client_received=2500/2500 server_resps=2500 codes={200=2096, 500=404} + live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=83.8% client_mean=PT4.529216S server_cpu=PT1H53M23.21S client_received=2500/2500 server_resps=2500 codes={200=2096, 500=404} live_reloading[UNLIMITED_ROUND_ROBIN].txt: success=86.9% client_mean=PT2.802124S server_cpu=PT1H56M45.31S client_received=2500/2500 server_resps=2500 codes={200=2173, 500=327} one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT2.569766928S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} - one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT1.516129859S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} + one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT1.516076793S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} one_big_spike[UNLIMITED_ROUND_ROBIN].txt: success=99.9% client_mean=PT1.000523583S server_cpu=PT7M37.35S client_received=1000/1000 server_resps=3049 codes={200=999, 429=1} one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=64.9% client_mean=PT8.1950896S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1623, 500=877} - one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=67.1% client_mean=PT2.6955136S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1677, 500=823} + one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=66.8% client_mean=PT2.6967232S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1671, 500=829} one_endpoint_dies_on_each_server[UNLIMITED_ROUND_ROBIN].txt: success=65.1% client_mean=PT0.6S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1628, 500=872} server_side_rate_limits[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT12M44.762591546S server_cpu=PT9H55M40.4S client_received=150000/150000 server_resps=178702 codes={200=149989, 429=11} - server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.264165097S server_cpu=PT9H4M39S client_received=150000/150000 server_resps=163395 codes={200=150000} + server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.244761087S server_cpu=PT9H1M59S client_received=150000/150000 server_resps=162595 codes={200=150000} server_side_rate_limits[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.217416858S server_cpu=PT8H45M29.2S client_received=150000/150000 server_resps=157646 codes={200=150000} simplest_possible_case[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT0.834469696S server_cpu=PT3H3M35S client_received=13200/13200 server_resps=13200 codes={200=13200} simplest_possible_case[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.785757575S server_cpu=PT2H52M52S client_received=13200/13200 server_resps=13200 codes={200=13200} @@ -35,7 +35,7 @@ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: succe slow_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.088828705S server_cpu=PT4M22.364666657S client_received=3000/3000 server_resps=3031 codes={200=3000} slow_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.088828705S server_cpu=PT4M22.364666657S client_received=3000/3000 server_resps=3031 codes={200=3000} slowdown_and_error_thresholds[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT2M28.388896928S server_cpu=PT8H48M18.546665835S client_received=10000/10000 server_resps=10899 codes={200=10000} - slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT3M20.776800234S server_cpu=PT14H10M8.413333255S client_received=10000/10000 server_resps=13515 codes={200=9998, 500=2} + slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT3M26.439571686S server_cpu=PT14H31M14.346666611S client_received=10000/10000 server_resps=13781 codes={200=9996, 500=4} slowdown_and_error_thresholds[UNLIMITED_ROUND_ROBIN].txt: success=3.6% client_mean=PT21.551691121S server_cpu=PT54H45M57.899999949S client_received=10000/10000 server_resps=49335 codes={200=360, 500=9640} uncommon_flakes[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=99.0% client_mean=PT0.000001S server_cpu=PT0.01S client_received=10000/10000 server_resps=10000 codes={200=9900, 500=100} uncommon_flakes[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=99.0% client_mean=PT0.000001S server_cpu=PT0.01S client_received=10000/10000 server_resps=10000 codes={200=9900, 500=100} diff --git a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png index e77c67847..69edd5eb7 100644 --- a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f0cb8ff527ff2327b12b0298c073ea40037f57fb08177d263d7a1c07a04bf96 -size 587465 +oid sha256:c80da86e1873f4bd972a3cd3f2976c1bfb6f64e3bc28c7b9043feb08388a4c51 +size 552215 diff --git a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 374579fc5..6abe35c1c 100644 --- a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76e04d90b781eade0dc4a333c8d5451cbfb9441e7c241117515e89bbb7c446f9 -size 123285 +oid sha256:dd941ae9716e18ce6d35cbef330f26979460c9124ed744d143a5e124d497c92b +size 119483 diff --git a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index a99adba40..4848ec898 100644 --- a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=73.3% client_mean=PT2.61872S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1465, 500=535} \ No newline at end of file +success=72.8% client_mean=PT2.57287S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1456, 500=544} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index be188c732..eb2d8e4b8 100644 --- a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=83.8% client_mean=PT4.53S server_cpu=PT1H53M23.21S client_received=2500/2500 server_resps=2500 codes={200=2096, 500=404} \ No newline at end of file +success=83.8% client_mean=PT4.529216S server_cpu=PT1H53M23.21S client_received=2500/2500 server_resps=2500 codes={200=2096, 500=404} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 09b358e6b..b5f6ef6a5 100644 --- a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT1.516129859S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} \ No newline at end of file +success=100.0% client_mean=PT1.516076793S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 0f2287fc3..8bec96e81 100644 --- a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=67.1% client_mean=PT2.6955136S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1677, 500=823} \ No newline at end of file +success=66.8% client_mean=PT2.6967232S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1671, 500=829} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index e71d95262..22cc13a0e 100644 --- a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT0.264165097S server_cpu=PT9H4M39S client_received=150000/150000 server_resps=163395 codes={200=150000} \ No newline at end of file +success=100.0% client_mean=PT0.244761087S server_cpu=PT9H1M59S client_received=150000/150000 server_resps=162595 codes={200=150000} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index fb9afe184..0b011e68a 100644 --- a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT3M20.776800234S server_cpu=PT14H10M8.413333255S client_received=10000/10000 server_resps=13515 codes={200=9998, 500=2} \ No newline at end of file +success=100.0% client_mean=PT3M26.439571686S server_cpu=PT14H31M14.346666611S client_received=10000/10000 server_resps=13781 codes={200=9996, 500=4} \ No newline at end of file From cb3b440dfaf9803b7d445bc43681c4ec5a8b627d Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 01:15:26 +0100 Subject: [PATCH 14/35] streams are fine here --- .../BalancedNodeSelectionStrategyChannel.java | 100 +++++++++--------- ...ancedNodeSelectionStrategyChannelTest.java | 4 +- 2 files changed, 51 insertions(+), 53 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 64e5bed44..2111fad56 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -126,54 +126,6 @@ public Optional> maybeExecute(Endpoint endpoint, Requ return Optional.empty(); } - private void sampleRttsForAllChannels() { - // TODO(dfox): don't do this 100% of the time - // if (random.nextDouble() > 0.01f) { - // return; - // } - - List> results = new ArrayList<>(); - for (MutableChannelWithStats channel : channels) { - long before = clock.read(); - - Optional> future = - channel.delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST); - if (future.isPresent()) { - results.add(Futures.transform( - future.get(), - _response -> { - long durationNanos = clock.read() - before; - channel.rtt.addMeasurement(durationNanos); - return durationNanos; - }, - MoreExecutors.directExecutor())); - } else { - results.add(Futures.immediateFuture(Long.MAX_VALUE)); - } - } - if (log.isDebugEnabled()) { - DialogueFutures.addDirectCallback(Futures.allAsList(results), new FutureCallback>() { - @Override - public void onSuccess(List result) { - List millis = - result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList()); - List best = - channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList()); - log.debug( - "RTTs {} {} {}", - SafeArg.of("nanos", result), - SafeArg.of("millis", millis), - SafeArg.of("best", best)); - } - - @Override - public void onFailure(Throwable throwable) { - log.info("Failed to sample RTT for channels", throwable); - } - }); - } - } - private static ChannelStats[] sortByScore(List preShuffled) { ChannelStats[] snapshotArray = new ChannelStats[preShuffled.size()]; @@ -206,6 +158,52 @@ private static ChannelStats[] sortByScore(List preShuff return snapshotArray; } + private void sampleRttsForAllChannels() { + // TODO(dfox): don't do this 100% of the time + // if (random.nextDouble() > 0.01f) { + // return; + // } + + List> futures = channels.stream() + .map(channel -> { + long before = clock.read(); + return channel.delegate + .maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST) + .map(future -> Futures.transform( + future, + _response -> { + long durationNanos = clock.read() - before; + channel.rtt.addMeasurement(durationNanos); + return durationNanos; + }, + MoreExecutors.directExecutor())) + .orElseGet(() -> Futures.immediateFuture(Long.MAX_VALUE)); + }) + .collect(ImmutableList.toImmutableList()); + + if (log.isDebugEnabled()) { + DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback>() { + @Override + public void onSuccess(List result) { + List millis = + result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList()); + List best = + channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList()); + log.debug( + "RTTs {} {} {}", + SafeArg.of("nanos", result), + SafeArg.of("millis", millis), + SafeArg.of("best", best)); + } + + @Override + public void onFailure(Throwable throwable) { + log.info("Failed to sample RTT for channels", throwable); + } + }); + } + } + /** Returns a new shuffled list, without mutating the input list (which may be immutable). */ private static List shuffleImmutableList(ImmutableList sourceList, Random random) { List mutableList = new ArrayList<>(sourceList); @@ -229,7 +227,7 @@ private static final class MutableChannelWithStats implements LimitedChannel { private final PerHostObservability observability; private final AtomicInteger inflight = new AtomicInteger(0); - private final RoundTripTimeMeasurement rtt = new RoundTripTimeMeasurement(); + private final RttMeasurement rtt = new RttMeasurement(); /** * We keep track of failures within a time window to do well in scenarios where an unhealthy server returns @@ -460,7 +458,7 @@ public String version() { @VisibleForTesting @ThreadSafe - static class RoundTripTimeMeasurement { + static final class RttMeasurement { // TODO(dfox): switch to some exponentially decaying thingy, so we could detect if network topology changes private final AtomicLong rttNanos = new AtomicLong(Long.MAX_VALUE); @@ -476,7 +474,7 @@ void addMeasurement(long newMeasurement) { @Override public String toString() { - return "RoundTripTimeMeasurement{" + rttNanos + '}'; + return "RttMeasurement{" + rttNanos + '}'; } } } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 59bbba3ae..5d9357b4a 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -204,8 +204,8 @@ void when_rtt_measurements_havent_returned_yet_dont_freak_out() { @Test void rtt_just_remembers_the_min() { - BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement rtt = - new BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement(); + BalancedNodeSelectionStrategyChannel.RttMeasurement rtt = + new BalancedNodeSelectionStrategyChannel.RttMeasurement(); rtt.addMeasurement(3); assertThat(rtt.getNanos()).isEqualTo(3); rtt.addMeasurement(1); From 9398d9a08f94a65363bcf7a4071e67652d8141ed Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 01:33:24 +0100 Subject: [PATCH 15/35] test for rate limiter --- .../BalancedNodeSelectionStrategyChannel.java | 38 +++++++++++++++---- ...ancedNodeSelectionStrategyChannelTest.java | 13 +++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 2111fad56..7a449c20e 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -84,6 +84,7 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { private final ImmutableList channels; private final Random random; private final Ticker clock; + private final RttMeasurementRateLimiter shouldMeasureRtts; BalancedNodeSelectionStrategyChannel( ImmutableList channels, @@ -94,6 +95,7 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { Preconditions.checkState(channels.size() >= 2, "At least two channels required"); this.random = random; this.clock = ticker; + this.shouldMeasureRtts = new RttMeasurementRateLimiter(clock); this.channels = IntStream.range(0, channels.size()) .mapToObj(index -> new MutableChannelWithStats( channels.get(index), @@ -118,7 +120,9 @@ public Optional> maybeExecute(Endpoint endpoint, Requ for (ChannelStats channel : sortedChannels) { Optional> maybe = channel.delegate.maybeExecute(endpoint, request); if (maybe.isPresent()) { - sampleRttsForAllChannels(); + if (shouldMeasureRtts.tryAcquire()) { + sampleAllChannelsRttsNonBlocking(); + } return maybe; } } @@ -158,12 +162,7 @@ private static ChannelStats[] sortByScore(List preShuff return snapshotArray; } - private void sampleRttsForAllChannels() { - // TODO(dfox): don't do this 100% of the time - // if (random.nextDouble() > 0.01f) { - // return; - // } - + private void sampleAllChannelsRttsNonBlocking() { List> futures = channels.stream() .map(channel -> { long before = clock.read(); @@ -477,4 +476,29 @@ public String toString() { return "RttMeasurement{" + rttNanos + '}'; } } + + static final class RttMeasurementRateLimiter { + private static final long BETWEEN_SAMPLES = Duration.ofSeconds(1).toNanos(); + + private final Ticker clock; + private final AtomicLong lastMeasured = new AtomicLong(0); + + private RttMeasurementRateLimiter(Ticker clock) { + this.clock = clock; + } + + boolean tryAcquire() { + long last = lastMeasured.get(); + long now = clock.read(); + if (last + BETWEEN_SAMPLES > now) { + return false; + } + + // TODO(dfox): if we have thousands of requests/sec, maybe we should re-evaluate every 1000 or so? + // TODO(dfox): do we care if multiple samples are running at the same time? + + lastMeasured.compareAndSet(last, now); + return true; + } + } } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 5d9357b4a..7fbc38dce 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -149,6 +149,8 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu @Test void rtt_is_measured_and_can_influence_choices() { + incrementClockBy(Duration.ofHours(1)); + // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); @@ -170,10 +172,20 @@ void rtt_is_measured_and_can_influence_choices() { assertThat(channel.getScoresForTesting().map(c -> c.getScore())) .describedAs("The poor latency of channel2 imposes a small constant penalty in the score") .containsExactly(0, 3); + + for (int i = 0; i < 500; i++) { + incrementClockBy(Duration.ofMillis(10)); + channel.maybeExecute(endpoint, request); + } + // rate limiter ensures a sensible amount of rtt sampling + verify(chan1, times(6)).maybeExecute(eq(rttEndpoint), any()); + verify(chan2, times(6)).maybeExecute(eq(rttEndpoint), any()); } @Test void when_rtt_measurements_are_limited_dont_freak_out() { + incrementClockBy(Duration.ofHours(1)); + // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); @@ -189,6 +201,7 @@ void when_rtt_measurements_are_limited_dont_freak_out() { @Test void when_rtt_measurements_havent_returned_yet_dont_freak_out() { + incrementClockBy(Duration.ofHours(1)); // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); From b17e311062c3cb702584457800d4f2eb0c2ed1a6 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 01:34:13 +0100 Subject: [PATCH 16/35] Simulations --- ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...ne_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- simulation/src/test/resources/report.md | 12 ++++++------ ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...ne_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- 13 files changed, 24 insertions(+), 24 deletions(-) diff --git a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png index dcbdc4b08..12fe6c646 100644 --- a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1cefd6e18f6003d8146aa97dcfef902d00185395ae1f8d162c2482dacfd57d81 -size 117492 +oid sha256:516145937579fb582d7a76100b05fd7054b6357b95d560770988fc5b460e20f0 +size 115803 diff --git a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png index a011807a8..b0f68688b 100644 --- a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32de7fbba94d7676ad359f3228e55337b61088fcb4102fe743e9105581c94280 -size 123659 +oid sha256:f6d4e200f4549b0c11a3bdd95af0c8e1dbafa2bb7fb7d2c5c27b48e9a9ed1d86 +size 122953 diff --git a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png index f6d0b4252..0972c02e4 100644 --- a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd0ba5b93d7672d48abf2aef421884968bac702a9c760e159990ef4b3c6ac7a9 -size 82748 +oid sha256:76b09c3a5e31760626407eb8f6c1f50b2091a4d2ca7efe5e7d7a84cd099f7601 +size 83448 diff --git a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 459bfa31b..437e4e988 100644 --- a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1836375d4ea7584287820c4b7658ff7b62c990ffac3cf1abe5657b6746651463 -size 135128 +oid sha256:8e2f1f9109ba521a3226d4d9fb745c3b7c9c440efb0ccc4d71f7ee5037199054 +size 133708 diff --git a/simulation/src/test/resources/report.md b/simulation/src/test/resources/report.md index 61cf72fe9..c773eef36 100644 --- a/simulation/src/test/resources/report.md +++ b/simulation/src/test/resources/report.md @@ -2,7 +2,7 @@ ``` all_nodes_500[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=79.3% client_mean=PT5.81342S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1586, 500=414} - all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=72.8% client_mean=PT2.57287S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1456, 500=544} + all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=73.6% client_mean=PT2.91032S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1472, 500=528} all_nodes_500[UNLIMITED_ROUND_ROBIN].txt: success=50.0% client_mean=PT0.6S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1000, 500=1000} black_hole[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=59.2% client_mean=PT0.600655114S server_cpu=PT11M49.8S client_received=1183/2000 server_resps=1183 codes={200=1183} black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=91.4% client_mean=PT0.6S server_cpu=PT18M16.8S client_received=1828/2000 server_resps=1828 codes={200=1828} @@ -17,16 +17,16 @@ fast_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.120107163S server_cpu=PT1H30M0.0000004S client_received=45000/45000 server_resps=45040 codes={200=45000} fast_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.120107163S server_cpu=PT1H30M0.0000004S client_received=45000/45000 server_resps=45040 codes={200=45000} live_reloading[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=94.3% client_mean=PT6.987708S server_cpu=PT1H56M8.59S client_received=2500/2500 server_resps=2500 codes={200=2357, 500=143} - live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=83.8% client_mean=PT4.529216S server_cpu=PT1H53M23.21S client_received=2500/2500 server_resps=2500 codes={200=2096, 500=404} + live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=84.0% client_mean=PT4.5915112S server_cpu=PT1H52M50.27S client_received=2500/2500 server_resps=2500 codes={200=2101, 500=399} live_reloading[UNLIMITED_ROUND_ROBIN].txt: success=86.9% client_mean=PT2.802124S server_cpu=PT1H56M45.31S client_received=2500/2500 server_resps=2500 codes={200=2173, 500=327} one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT2.569766928S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} - one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT1.516076793S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} + one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT1.521339525S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} one_big_spike[UNLIMITED_ROUND_ROBIN].txt: success=99.9% client_mean=PT1.000523583S server_cpu=PT7M37.35S client_received=1000/1000 server_resps=3049 codes={200=999, 429=1} one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=64.9% client_mean=PT8.1950896S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1623, 500=877} - one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=66.8% client_mean=PT2.6967232S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1671, 500=829} + one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=64.2% client_mean=PT3.9523312S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1605, 500=895} one_endpoint_dies_on_each_server[UNLIMITED_ROUND_ROBIN].txt: success=65.1% client_mean=PT0.6S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1628, 500=872} server_side_rate_limits[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT12M44.762591546S server_cpu=PT9H55M40.4S client_received=150000/150000 server_resps=178702 codes={200=149989, 429=11} - server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.244761087S server_cpu=PT9H1M59S client_received=150000/150000 server_resps=162595 codes={200=150000} + server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.262702651S server_cpu=PT9H5M50.6S client_received=150000/150000 server_resps=163753 codes={200=150000} server_side_rate_limits[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.217416858S server_cpu=PT8H45M29.2S client_received=150000/150000 server_resps=157646 codes={200=150000} simplest_possible_case[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT0.834469696S server_cpu=PT3H3M35S client_received=13200/13200 server_resps=13200 codes={200=13200} simplest_possible_case[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.785757575S server_cpu=PT2H52M52S client_received=13200/13200 server_resps=13200 codes={200=13200} @@ -35,7 +35,7 @@ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: succe slow_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.088828705S server_cpu=PT4M22.364666657S client_received=3000/3000 server_resps=3031 codes={200=3000} slow_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.088828705S server_cpu=PT4M22.364666657S client_received=3000/3000 server_resps=3031 codes={200=3000} slowdown_and_error_thresholds[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT2M28.388896928S server_cpu=PT8H48M18.546665835S client_received=10000/10000 server_resps=10899 codes={200=10000} - slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT3M26.439571686S server_cpu=PT14H31M14.346666611S client_received=10000/10000 server_resps=13781 codes={200=9996, 500=4} + slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT3M18.314817484S server_cpu=PT13H50M38.666666634S client_received=10000/10000 server_resps=13148 codes={200=9995, 500=5} slowdown_and_error_thresholds[UNLIMITED_ROUND_ROBIN].txt: success=3.6% client_mean=PT21.551691121S server_cpu=PT54H45M57.899999949S client_received=10000/10000 server_resps=49335 codes={200=360, 500=9640} uncommon_flakes[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=99.0% client_mean=PT0.000001S server_cpu=PT0.01S client_received=10000/10000 server_resps=10000 codes={200=9900, 500=100} uncommon_flakes[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=99.0% client_mean=PT0.000001S server_cpu=PT0.01S client_received=10000/10000 server_resps=10000 codes={200=9900, 500=100} diff --git a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 69edd5eb7..4a1422710 100644 --- a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c80da86e1873f4bd972a3cd3f2976c1bfb6f64e3bc28c7b9043feb08388a4c51 -size 552215 +oid sha256:29856bd2342821f7d0c0bb19b4b1efdcaed1af8b3c36cb2533485c55e0e00bfd +size 560290 diff --git a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 6abe35c1c..91ad7456a 100644 --- a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd941ae9716e18ce6d35cbef330f26979460c9124ed744d143a5e124d497c92b -size 119483 +oid sha256:dc3169a632ae8fd7de4e96ba5bb2d39b99e97e12416d37c5fb6516a452a1eedf +size 119379 diff --git a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 4848ec898..71eaf7e5e 100644 --- a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=72.8% client_mean=PT2.57287S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1456, 500=544} \ No newline at end of file +success=73.6% client_mean=PT2.91032S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1472, 500=528} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index eb2d8e4b8..e253a01c0 100644 --- a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=83.8% client_mean=PT4.529216S server_cpu=PT1H53M23.21S client_received=2500/2500 server_resps=2500 codes={200=2096, 500=404} \ No newline at end of file +success=84.0% client_mean=PT4.5915112S server_cpu=PT1H52M50.27S client_received=2500/2500 server_resps=2500 codes={200=2101, 500=399} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index b5f6ef6a5..b71e29339 100644 --- a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT1.516076793S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} \ No newline at end of file +success=100.0% client_mean=PT1.521339525S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 8bec96e81..a4008cf4c 100644 --- a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=66.8% client_mean=PT2.6967232S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1671, 500=829} \ No newline at end of file +success=64.2% client_mean=PT3.9523312S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1605, 500=895} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 22cc13a0e..bfc7b44c2 100644 --- a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT0.244761087S server_cpu=PT9H1M59S client_received=150000/150000 server_resps=162595 codes={200=150000} \ No newline at end of file +success=100.0% client_mean=PT0.262702651S server_cpu=PT9H5M50.6S client_received=150000/150000 server_resps=163753 codes={200=150000} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 0b011e68a..9fe47e0ea 100644 --- a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT3M26.439571686S server_cpu=PT14H31M14.346666611S client_received=10000/10000 server_resps=13781 codes={200=9996, 500=4} \ No newline at end of file +success=100.0% client_mean=PT3M18.314817484S server_cpu=PT13H50M38.666666634S client_received=10000/10000 server_resps=13148 codes={200=9995, 500=5} \ No newline at end of file From 3beac0f782f3b09ee86a2cb24646cd7f34fcd3bd Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 02:08:58 +0100 Subject: [PATCH 17/35] Smaller diff --- .../BalancedNodeSelectionStrategyChannel.java | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 7a449c20e..a0cc5fb09 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -72,12 +72,12 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { private static final double FAILURE_WEIGHT = 10; /** - * This determines how 'sticky' we are to our 'nearest' node (as measured by RTT). If this is set too high, then we - * may deliver suboptimal perf by sending all requests to a slow node that is physically nearby, when there's - * actually a faster one further away. - * If this is too low, then we may prematurely spill across AZs and pay the $ cost. Should probably - * keep this lower than {@link #FAILURE_WEIGHT} to ensure that a single 5xx makes a nearby node less attractive - * than a faraway node that exhibited zero failures. + * RTT_WEIGHT determines how sticky we are to the physically nearest node (as measured by RTT). If this is set + * too high, then we may deliver suboptimal perf by sending all requests to a slow node that is physically nearby, + * when there's actually a faster one further away. + * If this is too low, then we may prematurely spill across AZs and pay the $ cost. Keep this lower than + * {@link #FAILURE_WEIGHT} to ensure that a single 5xx makes a nearby node less attractive than a faraway node + * that exhibited zero failures. */ private static final double RTT_WEIGHT = 3; @@ -226,7 +226,7 @@ private static final class MutableChannelWithStats implements LimitedChannel { private final PerHostObservability observability; private final AtomicInteger inflight = new AtomicInteger(0); - private final RttMeasurement rtt = new RttMeasurement(); + private final RttMeasurement rtt; /** * We keep track of failures within a time window to do well in scenarios where an unhealthy server returns @@ -239,6 +239,7 @@ private static final class MutableChannelWithStats implements LimitedChannel { this.delegate = delegate; this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY); this.observability = observability; + this.rtt = new RttMeasurement(); // Saves one allocation on each network call this.updateStats = new FutureCallback() { @Override @@ -271,13 +272,12 @@ public void onFailure(Throwable throwable) { public Optional> maybeExecute(Endpoint endpoint, Request request) { inflight.incrementAndGet(); Optional> maybe = delegate.maybeExecute(endpoint, request); - if (!maybe.isPresent()) { + if (maybe.isPresent()) { + DialogueFutures.addDirectCallback(maybe.get(), updateStats); + observability.markRequestMade(); + } else { inflight.decrementAndGet(); // quickly undo - return Optional.empty(); } - - DialogueFutures.addDirectCallback(maybe.get(), updateStats); - observability.markRequestMade(); return maybe; } @@ -477,7 +477,7 @@ public String toString() { } } - static final class RttMeasurementRateLimiter { + private static final class RttMeasurementRateLimiter { private static final long BETWEEN_SAMPLES = Duration.ofSeconds(1).toNanos(); private final Ticker clock; From c40950f3dda77294f4d7d1eaa023c04060ae8c28 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 01:08:58 +0000 Subject: [PATCH 18/35] Add generated changelog entries --- changelog/@unreleased/pr-794.v2.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 changelog/@unreleased/pr-794.v2.yml diff --git a/changelog/@unreleased/pr-794.v2.yml b/changelog/@unreleased/pr-794.v2.yml new file mode 100644 index 000000000..013c4ed00 --- /dev/null +++ b/changelog/@unreleased/pr-794.v2.yml @@ -0,0 +1,6 @@ +type: feature +feature: + description: Balanced channel now biases towards whichever node has the lowest latency, + which should reduce AWS spend by routing requests within AZ. + links: + - https://github.com/palantir/dialogue/pull/794 From c91333487e37e284c5e0495e6456f94512b173aa Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 17:06:58 +0100 Subject: [PATCH 19/35] Don't allow multiple samples to run at the same time --- .../BalancedNodeSelectionStrategyChannel.java | 45 +++++++++++++------ ...ancedNodeSelectionStrategyChannelTest.java | 7 ++- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index a0cc5fb09..0d6a6c955 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -35,6 +35,7 @@ import com.palantir.logsafe.UnsafeArg; import com.palantir.tritium.metrics.registry.MetricName; import com.palantir.tritium.metrics.registry.TaggedMetricRegistry; +import java.io.Closeable; import java.time.Duration; import java.util.ArrayList; import java.util.Arrays; @@ -45,6 +46,7 @@ import java.util.Optional; import java.util.Random; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; @@ -120,9 +122,7 @@ public Optional> maybeExecute(Endpoint endpoint, Requ for (ChannelStats channel : sortedChannels) { Optional> maybe = channel.delegate.maybeExecute(endpoint, request); if (maybe.isPresent()) { - if (shouldMeasureRtts.tryAcquire()) { - sampleAllChannelsRttsNonBlocking(); - } + maybeSampleAllChannelRttsNonBlocking(); return maybe; } } @@ -162,7 +162,12 @@ private static ChannelStats[] sortByScore(List preShuff return snapshotArray; } - private void sampleAllChannelsRttsNonBlocking() { + private void maybeSampleAllChannelRttsNonBlocking() { + Optional maybePermit = shouldMeasureRtts.acquireNonBlocking(); + if (!maybePermit.isPresent()) { + return; + } + List> futures = channels.stream() .map(channel -> { long before = clock.read(); @@ -180,8 +185,13 @@ private void sampleAllChannelsRttsNonBlocking() { }) .collect(ImmutableList.toImmutableList()); + // the RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off + // more and more and more requests and eventually exhaust a threadpool + ListenableFuture> allAsList = Futures.allAsList(futures); + DialogueFutures.addDirectListener(allAsList, maybePermit.get()::close); + if (log.isDebugEnabled()) { - DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback>() { + DialogueFutures.addDirectCallback(allAsList, new FutureCallback>() { @Override public void onSuccess(List result) { List millis = @@ -482,23 +492,30 @@ private static final class RttMeasurementRateLimiter { private final Ticker clock; private final AtomicLong lastMeasured = new AtomicLong(0); + private final AtomicBoolean currentlySampling = new AtomicBoolean(false); private RttMeasurementRateLimiter(Ticker clock) { this.clock = clock; } - boolean tryAcquire() { + Optional acquireNonBlocking() { long last = lastMeasured.get(); - long now = clock.read(); - if (last + BETWEEN_SAMPLES > now) { - return false; + if (last + BETWEEN_SAMPLES > clock.read()) { + return Optional.empty(); } - // TODO(dfox): if we have thousands of requests/sec, maybe we should re-evaluate every 1000 or so? - // TODO(dfox): do we care if multiple samples are running at the same time? - - lastMeasured.compareAndSet(last, now); - return true; + if (currentlySampling.compareAndSet(false, true)) { + lastMeasured.set(clock.read()); + return Optional.of(() -> currentlySampling.compareAndSet(true, false)); + } else { + log.warn("Wanted to sample RTTs but an existing sample was still in progress"); + return Optional.empty(); + } } } + + private interface RttSamplePermit extends Closeable { + @Override + void close(); + } } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 7fbc38dce..e900b88c2 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -210,9 +210,14 @@ void when_rtt_measurements_havent_returned_yet_dont_freak_out() { when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create())); when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create())); - channel.maybeExecute(endpoint, request); + for (int i = 0; i < 20; i++) { + incrementClockBy(Duration.ofSeconds(5)); + channel.maybeExecute(endpoint, request); + } assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0); + verify(chan1, times(1)).maybeExecute(eq(rttEndpoint), any()); + verify(chan2, times(1)).maybeExecute(eq(rttEndpoint), any()); } @Test From 7005529a5540036a41e581e1ce283eebdff2213c Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 17:24:00 +0100 Subject: [PATCH 20/35] Return the min of the last 5 measurements --- .../BalancedNodeSelectionStrategyChannel.java | 45 +++++++++++++------ ...ancedNodeSelectionStrategyChannelTest.java | 14 ++++-- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 0d6a6c955..fa29c3073 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -67,9 +67,8 @@ */ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { private static final Logger log = LoggerFactory.getLogger(BalancedNodeSelectionStrategyChannel.class); - private static final Request BLANK_REQUEST = Request.builder().build(); - private static final Comparator BY_SCORE = Comparator.comparingInt(ChannelStats::getScore); + private static final Comparator BY_SCORE = Comparator.comparingInt(ChannelStats::getScore); private static final Duration FAILURE_MEMORY = Duration.ofSeconds(30); private static final double FAILURE_WEIGHT = 10; @@ -163,7 +162,7 @@ private static ChannelStats[] sortByScore(List preShuff } private void maybeSampleAllChannelRttsNonBlocking() { - Optional maybePermit = shouldMeasureRtts.acquireNonBlocking(); + Optional maybePermit = shouldMeasureRtts.acquireNonBlocking(); if (!maybePermit.isPresent()) { return; } @@ -172,7 +171,8 @@ private void maybeSampleAllChannelRttsNonBlocking() { .map(channel -> { long before = clock.read(); return channel.delegate - .maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST) + .maybeExecute( + RttEndpoint.INSTANCE, Request.builder().build()) .map(future -> Futures.transform( future, _response -> { @@ -465,25 +465,44 @@ public String version() { } } + /** + * Always returns the *minimum* value from the last few samples, so that we exclude slow calls that might include + * TLS handshakes. + */ @VisibleForTesting @ThreadSafe static final class RttMeasurement { + private static final int NUM_MEASUREMENTS = 5; - // TODO(dfox): switch to some exponentially decaying thingy, so we could detect if network topology changes - private final AtomicLong rttNanos = new AtomicLong(Long.MAX_VALUE); + private final long[] samples; + private volatile long bestRttNanos = Long.MAX_VALUE; + + RttMeasurement() { + samples = new long[NUM_MEASUREMENTS]; + Arrays.fill(samples, Long.MAX_VALUE); + } long getNanos() { - return rttNanos.get(); + return bestRttNanos; } - void addMeasurement(long newMeasurement) { - // Only stores the *minimum* values, so that we exclude slow calls that might include TLS handshakes. - rttNanos.getAndUpdate(existing -> Math.min(existing, newMeasurement)); + synchronized void addMeasurement(long newMeasurement) { + Preconditions.checkArgument(newMeasurement > 0, "Must be greater than zero"); + Preconditions.checkArgument(newMeasurement < Long.MAX_VALUE, "Must be less than MAX_VALUE"); + + if (samples[0] == Long.MAX_VALUE) { + Arrays.fill(samples, newMeasurement); + bestRttNanos = newMeasurement; + } else { + System.arraycopy(samples, 1, samples, 0, NUM_MEASUREMENTS - 1); + samples[NUM_MEASUREMENTS - 1] = newMeasurement; + bestRttNanos = Arrays.stream(samples).min().getAsLong(); + } } @Override public String toString() { - return "RttMeasurement{" + rttNanos + '}'; + return "RttMeasurement{" + "bestRttNanos=" + bestRttNanos + ", samples=" + Arrays.toString(samples) + '}'; } } @@ -498,7 +517,7 @@ private RttMeasurementRateLimiter(Ticker clock) { this.clock = clock; } - Optional acquireNonBlocking() { + Optional acquireNonBlocking() { long last = lastMeasured.get(); if (last + BETWEEN_SAMPLES > clock.read()) { return Optional.empty(); @@ -514,7 +533,7 @@ Optional acquireNonBlocking() { } } - private interface RttSamplePermit extends Closeable { + private interface RttMeasurementPermit extends Closeable { @Override void close(); } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index e900b88c2..ba3e35c5b 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -221,17 +221,23 @@ void when_rtt_measurements_havent_returned_yet_dont_freak_out() { } @Test - void rtt_just_remembers_the_min() { + void rtt_returns_the_min_of_the_last_5_measurements() { BalancedNodeSelectionStrategyChannel.RttMeasurement rtt = new BalancedNodeSelectionStrategyChannel.RttMeasurement(); rtt.addMeasurement(3); - assertThat(rtt.getNanos()).isEqualTo(3); + assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(3); rtt.addMeasurement(1); rtt.addMeasurement(2); - assertThat(rtt.getNanos()).isEqualTo(1); + assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(1); rtt.addMeasurement(500); - assertThat(rtt.getNanos()).isEqualTo(1); + assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(1); + rtt.addMeasurement(500); + rtt.addMeasurement(500); + rtt.addMeasurement(500); + assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(2); + rtt.addMeasurement(500); + assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(500); } private static void set200(LimitedChannel chan) { From 4c4ce9164bd0f6147ef864b6197847c389dee579 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 17:38:51 +0100 Subject: [PATCH 21/35] Move sorting --- .../core/BalancedNodeSelectionStrategyChannel.java | 12 ++++++------ .../BalancedNodeSelectionStrategyChannelTest.java | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index fa29c3073..55e3abc9e 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -116,9 +116,10 @@ public Optional> maybeExecute(Endpoint endpoint, Requ // TODO(dfox): P2C optimization when we have high number of nodes to save CPU? // http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf - ChannelStats[] sortedChannels = sortByScore(preShuffled); + ChannelStats[] sortableChannels = computeScores(preShuffled); + Arrays.sort(sortableChannels, BY_SCORE); - for (ChannelStats channel : sortedChannels) { + for (ChannelStats channel : sortableChannels) { Optional> maybe = channel.delegate.maybeExecute(endpoint, request); if (maybe.isPresent()) { maybeSampleAllChannelRttsNonBlocking(); @@ -129,7 +130,7 @@ public Optional> maybeExecute(Endpoint endpoint, Requ return Optional.empty(); } - private static ChannelStats[] sortByScore(List preShuffled) { + private static ChannelStats[] computeScores(List preShuffled) { ChannelStats[] snapshotArray = new ChannelStats[preShuffled.size()]; long bestRttNanos = Long.MAX_VALUE; @@ -157,7 +158,6 @@ private static ChannelStats[] sortByScore(List preShuff } } - Arrays.sort(snapshotArray, BY_SCORE); return snapshotArray; } @@ -222,7 +222,7 @@ private static List shuffleImmutableList(ImmutableList sourceList, Ran @VisibleForTesting Stream getScoresForTesting() { - return Arrays.stream(sortByScore(channels)); + return Arrays.stream(computeScores(channels)); } @Override @@ -358,7 +358,7 @@ private static void registerGauges( DialogueInternalWeakReducingGauge.getOrCreate( taggedMetrics, metricName, - c -> c.getSnapshot().getScore(), + c -> c.getSnapshot().getScore(), // note, this doesn't include rtts longStream -> { long[] longs = longStream.toArray(); if (log.isInfoEnabled() && longs.length > 1) { diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index ba3e35c5b..59a82d131 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -136,7 +136,7 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu channel.maybeExecute(endpoint, request); assertThat(channel.getScoresForTesting().map(c -> c.getScore())) .describedAs("%s: Constant 4xxs did move the needle %s", Duration.ofNanos(clock.read()), channel) - .containsExactly(0, 1); + .containsExactly(1, 0); incrementClockBy(Duration.ofSeconds(5)); From 7f9db171fb005aece822211851e81198ca9dca3f Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 17:43:51 +0100 Subject: [PATCH 22/35] Refactor --- .../BalancedNodeSelectionStrategyChannel.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 55e3abc9e..81313c819 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -187,13 +187,12 @@ private void maybeSampleAllChannelRttsNonBlocking() { // the RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off // more and more and more requests and eventually exhaust a threadpool - ListenableFuture> allAsList = Futures.allAsList(futures); - DialogueFutures.addDirectListener(allAsList, maybePermit.get()::close); + DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback>() { + @Override + public void onSuccess(List result) { + maybePermit.get().close(); - if (log.isDebugEnabled()) { - DialogueFutures.addDirectCallback(allAsList, new FutureCallback>() { - @Override - public void onSuccess(List result) { + if (log.isDebugEnabled()) { List millis = result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList()); List best = @@ -204,13 +203,14 @@ public void onSuccess(List result) { SafeArg.of("millis", millis), SafeArg.of("best", best)); } + } - @Override - public void onFailure(Throwable throwable) { - log.info("Failed to sample RTT for channels", throwable); - } - }); - } + @Override + public void onFailure(Throwable throwable) { + maybePermit.get().close(); + log.info("Failed to sample RTT for channels", throwable); + } + }); } /** Returns a new shuffled list, without mutating the input list (which may be immutable). */ From 64a955b5ee11df19795679a2d3bd940dc080f1fd Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 18:08:35 +0100 Subject: [PATCH 23/35] Pull everything out to a dedicated 'RttSampler' class --- .../BalancedNodeSelectionStrategyChannel.java | 181 +------------- .../palantir/dialogue/core/RttSampler.java | 229 ++++++++++++++++++ ...ancedNodeSelectionStrategyChannelTest.java | 29 +-- .../dialogue/core/RttSamplerTest.java | 43 ++++ 4 files changed, 285 insertions(+), 197 deletions(-) create mode 100644 dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java create mode 100644 dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 81313c819..a47712671 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -22,37 +22,27 @@ import com.google.common.collect.ImmutableList; import com.google.common.primitives.Ints; import com.google.common.util.concurrent.FutureCallback; -import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; -import com.google.common.util.concurrent.MoreExecutors; import com.palantir.dialogue.Endpoint; -import com.palantir.dialogue.HttpMethod; import com.palantir.dialogue.Request; import com.palantir.dialogue.Response; -import com.palantir.dialogue.UrlBuilder; import com.palantir.logsafe.Preconditions; import com.palantir.logsafe.SafeArg; import com.palantir.logsafe.UnsafeArg; import com.palantir.tritium.metrics.registry.MetricName; import com.palantir.tritium.metrics.registry.TaggedMetricRegistry; -import java.io.Closeable; import java.time.Duration; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; -import java.util.Map; import java.util.Optional; import java.util.Random; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; +import java.util.function.LongSupplier; import java.util.stream.IntStream; import java.util.stream.Stream; -import javax.annotation.concurrent.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -85,7 +75,7 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { private final ImmutableList channels; private final Random random; private final Ticker clock; - private final RttMeasurementRateLimiter shouldMeasureRtts; + private final RttSampler rttSampler; BalancedNodeSelectionStrategyChannel( ImmutableList channels, @@ -96,11 +86,12 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { Preconditions.checkState(channels.size() >= 2, "At least two channels required"); this.random = random; this.clock = ticker; - this.shouldMeasureRtts = new RttMeasurementRateLimiter(clock); + this.rttSampler = new RttSampler(channels, clock); this.channels = IntStream.range(0, channels.size()) .mapToObj(index -> new MutableChannelWithStats( channels.get(index), clock, + rttSampler.get(index), PerHostObservability.create(channels, taggedMetrics, channelName, index))) .collect(ImmutableList.toImmutableList()); @@ -122,7 +113,7 @@ public Optional> maybeExecute(Endpoint endpoint, Requ for (ChannelStats channel : sortableChannels) { Optional> maybe = channel.delegate.maybeExecute(endpoint, request); if (maybe.isPresent()) { - maybeSampleAllChannelRttsNonBlocking(); + rttSampler.maybeSampleRtts(); return maybe; } } @@ -161,58 +152,6 @@ private static ChannelStats[] computeScores(List preShu return snapshotArray; } - private void maybeSampleAllChannelRttsNonBlocking() { - Optional maybePermit = shouldMeasureRtts.acquireNonBlocking(); - if (!maybePermit.isPresent()) { - return; - } - - List> futures = channels.stream() - .map(channel -> { - long before = clock.read(); - return channel.delegate - .maybeExecute( - RttEndpoint.INSTANCE, Request.builder().build()) - .map(future -> Futures.transform( - future, - _response -> { - long durationNanos = clock.read() - before; - channel.rtt.addMeasurement(durationNanos); - return durationNanos; - }, - MoreExecutors.directExecutor())) - .orElseGet(() -> Futures.immediateFuture(Long.MAX_VALUE)); - }) - .collect(ImmutableList.toImmutableList()); - - // the RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off - // more and more and more requests and eventually exhaust a threadpool - DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback>() { - @Override - public void onSuccess(List result) { - maybePermit.get().close(); - - if (log.isDebugEnabled()) { - List millis = - result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList()); - List best = - channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList()); - log.debug( - "RTTs {} {} {}", - SafeArg.of("nanos", result), - SafeArg.of("millis", millis), - SafeArg.of("best", best)); - } - } - - @Override - public void onFailure(Throwable throwable) { - maybePermit.get().close(); - log.info("Failed to sample RTT for channels", throwable); - } - }); - } - /** Returns a new shuffled list, without mutating the input list (which may be immutable). */ private static List shuffleImmutableList(ImmutableList sourceList, Random random) { List mutableList = new ArrayList<>(sourceList); @@ -233,10 +172,10 @@ public String toString() { private static final class MutableChannelWithStats implements LimitedChannel { private final LimitedChannel delegate; private final FutureCallback updateStats; + private final LongSupplier rtt; private final PerHostObservability observability; private final AtomicInteger inflight = new AtomicInteger(0); - private final RttMeasurement rtt; /** * We keep track of failures within a time window to do well in scenarios where an unhealthy server returns @@ -245,11 +184,12 @@ private static final class MutableChannelWithStats implements LimitedChannel { */ private final CoarseExponentialDecayReservoir recentFailuresReservoir; - MutableChannelWithStats(LimitedChannel delegate, Ticker clock, PerHostObservability observability) { + MutableChannelWithStats( + LimitedChannel delegate, Ticker clock, LongSupplier rtt, PerHostObservability observability) { this.delegate = delegate; this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY); + this.rtt = rtt; this.observability = observability; - this.rtt = new RttMeasurement(); // Saves one allocation on each network call this.updateStats = new FutureCallback() { @Override @@ -292,7 +232,7 @@ public Optional> maybeExecute(Endpoint endpoint, Requ } ChannelStats getSnapshot() { - return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getNanos()); + return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getAsLong()); } @Override @@ -436,105 +376,4 @@ public void markRequestMade() { }; } } - - @VisibleForTesting - enum RttEndpoint implements Endpoint { - INSTANCE; - - @Override - public void renderPath(Map _params, UrlBuilder _url) {} - - @Override - public HttpMethod httpMethod() { - return HttpMethod.OPTIONS; - } - - @Override - public String serviceName() { - return "Balanced"; - } - - @Override - public String endpointName() { - return "rtt"; - } - - @Override - public String version() { - return "0.0.0"; - } - } - - /** - * Always returns the *minimum* value from the last few samples, so that we exclude slow calls that might include - * TLS handshakes. - */ - @VisibleForTesting - @ThreadSafe - static final class RttMeasurement { - private static final int NUM_MEASUREMENTS = 5; - - private final long[] samples; - private volatile long bestRttNanos = Long.MAX_VALUE; - - RttMeasurement() { - samples = new long[NUM_MEASUREMENTS]; - Arrays.fill(samples, Long.MAX_VALUE); - } - - long getNanos() { - return bestRttNanos; - } - - synchronized void addMeasurement(long newMeasurement) { - Preconditions.checkArgument(newMeasurement > 0, "Must be greater than zero"); - Preconditions.checkArgument(newMeasurement < Long.MAX_VALUE, "Must be less than MAX_VALUE"); - - if (samples[0] == Long.MAX_VALUE) { - Arrays.fill(samples, newMeasurement); - bestRttNanos = newMeasurement; - } else { - System.arraycopy(samples, 1, samples, 0, NUM_MEASUREMENTS - 1); - samples[NUM_MEASUREMENTS - 1] = newMeasurement; - bestRttNanos = Arrays.stream(samples).min().getAsLong(); - } - } - - @Override - public String toString() { - return "RttMeasurement{" + "bestRttNanos=" + bestRttNanos + ", samples=" + Arrays.toString(samples) + '}'; - } - } - - private static final class RttMeasurementRateLimiter { - private static final long BETWEEN_SAMPLES = Duration.ofSeconds(1).toNanos(); - - private final Ticker clock; - private final AtomicLong lastMeasured = new AtomicLong(0); - private final AtomicBoolean currentlySampling = new AtomicBoolean(false); - - private RttMeasurementRateLimiter(Ticker clock) { - this.clock = clock; - } - - Optional acquireNonBlocking() { - long last = lastMeasured.get(); - if (last + BETWEEN_SAMPLES > clock.read()) { - return Optional.empty(); - } - - if (currentlySampling.compareAndSet(false, true)) { - lastMeasured.set(clock.read()); - return Optional.of(() -> currentlySampling.compareAndSet(true, false)); - } else { - log.warn("Wanted to sample RTTs but an existing sample was still in progress"); - return Optional.empty(); - } - } - } - - private interface RttMeasurementPermit extends Closeable { - @Override - void close(); - } } diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java new file mode 100644 index 000000000..c7b3d97cf --- /dev/null +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java @@ -0,0 +1,229 @@ +/* + * (c) Copyright 2020 Palantir Technologies Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.palantir.dialogue.core; + +import com.github.benmanes.caffeine.cache.Ticker; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.FutureCallback; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.MoreExecutors; +import com.palantir.dialogue.Endpoint; +import com.palantir.dialogue.HttpMethod; +import com.palantir.dialogue.Request; +import com.palantir.dialogue.UrlBuilder; +import com.palantir.logsafe.Preconditions; +import com.palantir.logsafe.SafeArg; +import java.io.Closeable; +import java.time.Duration; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.LongSupplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import javax.annotation.concurrent.ThreadSafe; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +final class RttSampler { + private static final Logger log = LoggerFactory.getLogger(RttSampler.class); + + private final ImmutableList channels; + private final RttMeasurement[] rtts; + private final RttMeasurementRateLimiter rateLimiter; + private final Ticker clock; + + RttSampler(ImmutableList channels, Ticker clock) { + this.channels = channels; + this.rateLimiter = new RttMeasurementRateLimiter(clock); + this.clock = clock; + this.rtts = IntStream.range(0, channels.size()) + .mapToObj(i -> new RttMeasurement()) + .toArray(RttMeasurement[]::new); + } + + /** + * The {@link LongSupplier} returns the RTT in nanoseconds, or {@link java.lang.Long#MAX_VALUE} to signify unknown. + */ + LongSupplier get(int index) { + return rtts[index]; + } + + /** + * Non-blocking. + */ + void maybeSampleRtts() { + Optional maybePermit = rateLimiter.acquireNonBlocking(); + if (!maybePermit.isPresent()) { + return; + } + + List> futures = IntStream.range(0, channels.size()) + .mapToObj(i -> { + long before = clock.read(); + return channels.get(i) + .maybeExecute( + RttEndpoint.INSTANCE, Request.builder().build()) + .map(future -> Futures.transform( + future, + _response -> { + long durationNanos = clock.read() - before; + rtts[i].addMeasurement(durationNanos); + return durationNanos; + }, + MoreExecutors.directExecutor())) + .orElseGet(() -> Futures.immediateFuture(Long.MAX_VALUE)); + }) + .collect(ImmutableList.toImmutableList()); + + // the RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off + // more and more and more requests and eventually exhaust a threadpool + DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback>() { + @Override + public void onSuccess(List result) { + maybePermit.get().close(); + + if (log.isDebugEnabled()) { + List millis = + result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList()); + long[] best = Arrays.stream(rtts) + .mapToLong(rtt -> rtt.getAsLong()) + .toArray(); + log.debug( + "RTTs {} {} {}", + SafeArg.of("nanos", result), + SafeArg.of("millis", millis), + SafeArg.of("best", Arrays.toString(best))); + } + } + + @Override + public void onFailure(Throwable throwable) { + maybePermit.get().close(); + log.info("Failed to sample RTT for channels", throwable); + } + }); + } + + @VisibleForTesting + enum RttEndpoint implements Endpoint { + INSTANCE; + + @Override + public void renderPath(Map _params, UrlBuilder _url) {} + + @Override + public HttpMethod httpMethod() { + return HttpMethod.OPTIONS; + } + + @Override + public String serviceName() { + return "Balanced"; + } + + @Override + public String endpointName() { + return "rtt"; + } + + @Override + public String version() { + return "0.0.0"; + } + } + + /** + * Always returns the *minimum* value from the last few samples, so that we exclude slow calls that might include + * TLS handshakes. + */ + @VisibleForTesting + @ThreadSafe + static final class RttMeasurement implements LongSupplier { + private static final int NUM_MEASUREMENTS = 5; + + private final long[] samples; + private volatile long bestRttNanos = Long.MAX_VALUE; + + RttMeasurement() { + samples = new long[NUM_MEASUREMENTS]; + Arrays.fill(samples, Long.MAX_VALUE); + } + + @Override + public long getAsLong() { + return bestRttNanos; + } + + synchronized void addMeasurement(long newMeasurement) { + Preconditions.checkArgument(newMeasurement > 0, "Must be greater than zero"); + Preconditions.checkArgument(newMeasurement < Long.MAX_VALUE, "Must be less than MAX_VALUE"); + + if (samples[0] == Long.MAX_VALUE) { + Arrays.fill(samples, newMeasurement); + bestRttNanos = newMeasurement; + } else { + System.arraycopy(samples, 1, samples, 0, NUM_MEASUREMENTS - 1); + samples[NUM_MEASUREMENTS - 1] = newMeasurement; + bestRttNanos = Arrays.stream(samples).min().getAsLong(); + } + } + + @Override + public String toString() { + return "RttMeasurement{" + "bestRttNanos=" + bestRttNanos + ", samples=" + Arrays.toString(samples) + '}'; + } + } + + private static final class RttMeasurementRateLimiter { + private static final long BETWEEN_SAMPLES = Duration.ofSeconds(1).toNanos(); + + private final Ticker clock; + private final AtomicLong lastMeasured = new AtomicLong(0); + private final AtomicBoolean currentlySampling = new AtomicBoolean(false); + + private RttMeasurementRateLimiter(Ticker clock) { + this.clock = clock; + } + + Optional acquireNonBlocking() { + long last = lastMeasured.get(); + if (last + BETWEEN_SAMPLES > clock.read()) { + return Optional.empty(); + } + + if (currentlySampling.compareAndSet(false, true)) { + lastMeasured.set(clock.read()); + return Optional.of(() -> currentlySampling.compareAndSet(true, false)); + } else { + log.warn("Wanted to sample RTTs but an existing sample was still in progress"); + return Optional.empty(); + } + } + } + + private interface RttMeasurementPermit extends Closeable { + @Override + void close(); + } +} diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 59a82d131..9164375e4 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -156,8 +156,7 @@ void rtt_is_measured_and_can_influence_choices() { SettableFuture chan1OptionsResponse = SettableFuture.create(); SettableFuture chan2OptionsResponse = SettableFuture.create(); - BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint = - BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE; + RttSampler.RttEndpoint rttEndpoint = RttSampler.RttEndpoint.INSTANCE; when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan1OptionsResponse)); when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan2OptionsResponse)); @@ -189,8 +188,7 @@ void when_rtt_measurements_are_limited_dont_freak_out() { // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); - BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint = - BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE; + RttSampler.RttEndpoint rttEndpoint = RttSampler.RttEndpoint.INSTANCE; when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty()); when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty()); @@ -205,8 +203,7 @@ void when_rtt_measurements_havent_returned_yet_dont_freak_out() { // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); - BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint = - BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE; + RttSampler.RttEndpoint rttEndpoint = RttSampler.RttEndpoint.INSTANCE; when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create())); when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create())); @@ -220,26 +217,6 @@ void when_rtt_measurements_havent_returned_yet_dont_freak_out() { verify(chan2, times(1)).maybeExecute(eq(rttEndpoint), any()); } - @Test - void rtt_returns_the_min_of_the_last_5_measurements() { - BalancedNodeSelectionStrategyChannel.RttMeasurement rtt = - new BalancedNodeSelectionStrategyChannel.RttMeasurement(); - rtt.addMeasurement(3); - assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(3); - rtt.addMeasurement(1); - rtt.addMeasurement(2); - assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(1); - - rtt.addMeasurement(500); - assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(1); - rtt.addMeasurement(500); - rtt.addMeasurement(500); - rtt.addMeasurement(500); - assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(2); - rtt.addMeasurement(500); - assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(500); - } - private static void set200(LimitedChannel chan) { when(chan.maybeExecute(any(), any())).thenReturn(http(200)); } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java new file mode 100644 index 000000000..1a2b23383 --- /dev/null +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java @@ -0,0 +1,43 @@ +/* + * (c) Copyright 2020 Palantir Technologies Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.palantir.dialogue.core; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.jupiter.api.Test; + +class RttSamplerTest { + + @Test + void rtt_returns_the_min_of_the_last_5_measurements() { + RttSampler.RttMeasurement rtt = new RttSampler.RttMeasurement(); + rtt.addMeasurement(3); + assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(3); + rtt.addMeasurement(1); + rtt.addMeasurement(2); + assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(1); + + rtt.addMeasurement(500); + assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(1); + rtt.addMeasurement(500); + rtt.addMeasurement(500); + rtt.addMeasurement(500); + assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(2); + rtt.addMeasurement(500); + assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(500); + } +} From 2389a068be3014320c7b4c3b9b568c2a5fadfa6d Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 18:18:52 +0100 Subject: [PATCH 24/35] Use OptionalLong instead of Long.MAX_VALUE as special value --- .../BalancedNodeSelectionStrategyChannel.java | 30 +++++++++++-------- .../palantir/dialogue/core/RttSampler.java | 17 +++++++---- .../dialogue/core/RttSamplerTest.java | 10 +++---- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index a47712671..16ad8c8c1 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -38,9 +38,9 @@ import java.util.Comparator; import java.util.List; import java.util.Optional; +import java.util.OptionalLong; import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.LongSupplier; import java.util.stream.IntStream; import java.util.stream.Stream; import org.slf4j.Logger; @@ -123,16 +123,17 @@ public Optional> maybeExecute(Endpoint endpoint, Requ private static ChannelStats[] computeScores(List preShuffled) { ChannelStats[] snapshotArray = new ChannelStats[preShuffled.size()]; - - long bestRttNanos = Long.MAX_VALUE; - long worstRttNanos = 0; for (int i = 0; i < preShuffled.size(); i++) { ChannelStats snapshot = preShuffled.get(i).getSnapshot(); snapshotArray[i] = snapshot; + } - if (snapshot.rttNanos != Long.MAX_VALUE) { - bestRttNanos = Math.min(bestRttNanos, snapshot.rttNanos); - worstRttNanos = Math.max(worstRttNanos, snapshot.rttNanos); + long bestRttNanos = Long.MAX_VALUE; + long worstRttNanos = 0; + for (ChannelStats snapshot : snapshotArray) { + if (snapshot.rttNanos.isPresent()) { + bestRttNanos = Math.min(bestRttNanos, snapshot.rttNanos.getAsLong()); + worstRttNanos = Math.max(worstRttNanos, snapshot.rttNanos.getAsLong()); } } @@ -145,7 +146,9 @@ private static ChannelStats[] computeScores(List preShu long rttRange = worstRttNanos - bestRttNanos; if (bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0 && rttRange > 0) { for (ChannelStats snapshot : snapshotArray) { - snapshot.rttSpectrum = ((float) snapshot.rttNanos - bestRttNanos) / rttRange; + if (snapshot.rttNanos.isPresent()) { + snapshot.rttSpectrum = ((float) snapshot.rttNanos.getAsLong() - bestRttNanos) / rttRange; + } } } @@ -172,7 +175,7 @@ public String toString() { private static final class MutableChannelWithStats implements LimitedChannel { private final LimitedChannel delegate; private final FutureCallback updateStats; - private final LongSupplier rtt; + private final RttSampler.RttSupplier rtt; private final PerHostObservability observability; private final AtomicInteger inflight = new AtomicInteger(0); @@ -185,7 +188,7 @@ private static final class MutableChannelWithStats implements LimitedChannel { private final CoarseExponentialDecayReservoir recentFailuresReservoir; MutableChannelWithStats( - LimitedChannel delegate, Ticker clock, LongSupplier rtt, PerHostObservability observability) { + LimitedChannel delegate, Ticker clock, RttSampler.RttSupplier rtt, PerHostObservability observability) { this.delegate = delegate; this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY); this.rtt = rtt; @@ -232,7 +235,7 @@ public Optional> maybeExecute(Endpoint endpoint, Requ } ChannelStats getSnapshot() { - return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getAsLong()); + return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getRttNanos()); } @Override @@ -256,10 +259,11 @@ static final class ChannelStats { private final int requestsInflight; private final double recentBadness; - private final long rttNanos; + private final OptionalLong rttNanos; private float rttSpectrum = 0; // assigned later once we've figured out the range of rttNanos across channels - ChannelStats(MutableChannelWithStats delegate, int requestsInflight, double recentBadness, long rttNanos) { + ChannelStats( + MutableChannelWithStats delegate, int requestsInflight, double recentBadness, OptionalLong rttNanos) { this.delegate = delegate; this.requestsInflight = requestsInflight; this.recentBadness = recentBadness; diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java index c7b3d97cf..9873cde9f 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java @@ -35,6 +35,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.OptionalLong; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; @@ -63,9 +64,9 @@ final class RttSampler { } /** - * The {@link LongSupplier} returns the RTT in nanoseconds, or {@link java.lang.Long#MAX_VALUE} to signify unknown. + * The {@link LongSupplier} returns the RTT in nanoseconds, or {@link OptionalLong#empty()} if we have no data. */ - LongSupplier get(int index) { + RttSupplier get(int index) { return rtts[index]; } @@ -107,7 +108,7 @@ public void onSuccess(List result) { List millis = result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList()); long[] best = Arrays.stream(rtts) - .mapToLong(rtt -> rtt.getAsLong()) + .mapToLong(rtt -> rtt.getRttNanos().orElse(Long.MAX_VALUE)) .toArray(); log.debug( "RTTs {} {} {}", @@ -159,7 +160,7 @@ public String version() { */ @VisibleForTesting @ThreadSafe - static final class RttMeasurement implements LongSupplier { + static final class RttMeasurement implements RttSupplier { private static final int NUM_MEASUREMENTS = 5; private final long[] samples; @@ -171,8 +172,8 @@ static final class RttMeasurement implements LongSupplier { } @Override - public long getAsLong() { - return bestRttNanos; + public OptionalLong getRttNanos() { + return bestRttNanos == Long.MAX_VALUE ? OptionalLong.empty() : OptionalLong.of(bestRttNanos); } synchronized void addMeasurement(long newMeasurement) { @@ -226,4 +227,8 @@ private interface RttMeasurementPermit extends Closeable { @Override void close(); } + + interface RttSupplier { + OptionalLong getRttNanos(); + } } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java index 1a2b23383..00ccb1be9 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java @@ -26,18 +26,18 @@ class RttSamplerTest { void rtt_returns_the_min_of_the_last_5_measurements() { RttSampler.RttMeasurement rtt = new RttSampler.RttMeasurement(); rtt.addMeasurement(3); - assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(3); + assertThat(rtt.getRttNanos()).describedAs("%s", rtt).hasValue(3); rtt.addMeasurement(1); rtt.addMeasurement(2); - assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(1); + assertThat(rtt.getRttNanos()).describedAs("%s", rtt).hasValue(1); rtt.addMeasurement(500); - assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(1); + assertThat(rtt.getRttNanos()).describedAs("%s", rtt).hasValue(1); rtt.addMeasurement(500); rtt.addMeasurement(500); rtt.addMeasurement(500); - assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(2); + assertThat(rtt.getRttNanos()).describedAs("%s", rtt).hasValue(2); rtt.addMeasurement(500); - assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(500); + assertThat(rtt.getRttNanos()).describedAs("%s", rtt).hasValue(500); } } From 3adc16c3503bf2c303bb7ae8b409440e0f09fed6 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 18:34:19 +0100 Subject: [PATCH 25/35] be more immutable, reduce diff --- .../BalancedNodeSelectionStrategyChannel.java | 74 +++++++++---------- 1 file changed, 36 insertions(+), 38 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 16ad8c8c1..e642224d9 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -58,7 +58,7 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { private static final Logger log = LoggerFactory.getLogger(BalancedNodeSelectionStrategyChannel.class); - private static final Comparator BY_SCORE = Comparator.comparingInt(ChannelStats::getScore); + private static final Comparator BY_SCORE = Comparator.comparingInt(SortableChannel::getScore); private static final Duration FAILURE_MEMORY = Duration.ofSeconds(30); private static final double FAILURE_WEIGHT = 10; @@ -107,10 +107,10 @@ public Optional> maybeExecute(Endpoint endpoint, Requ // TODO(dfox): P2C optimization when we have high number of nodes to save CPU? // http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf - ChannelStats[] sortableChannels = computeScores(preShuffled); + SortableChannel[] sortableChannels = computeScores(preShuffled); Arrays.sort(sortableChannels, BY_SCORE); - for (ChannelStats channel : sortableChannels) { + for (SortableChannel channel : sortableChannels) { Optional> maybe = channel.delegate.maybeExecute(endpoint, request); if (maybe.isPresent()) { rttSampler.maybeSampleRtts(); @@ -121,19 +121,18 @@ public Optional> maybeExecute(Endpoint endpoint, Requ return Optional.empty(); } - private static ChannelStats[] computeScores(List preShuffled) { - ChannelStats[] snapshotArray = new ChannelStats[preShuffled.size()]; + private static SortableChannel[] computeScores(List preShuffled) { + OptionalLong[] rtts = new OptionalLong[preShuffled.size()]; for (int i = 0; i < preShuffled.size(); i++) { - ChannelStats snapshot = preShuffled.get(i).getSnapshot(); - snapshotArray[i] = snapshot; + rtts[i] = preShuffled.get(i).rtt.getRttNanos(); } long bestRttNanos = Long.MAX_VALUE; long worstRttNanos = 0; - for (ChannelStats snapshot : snapshotArray) { - if (snapshot.rttNanos.isPresent()) { - bestRttNanos = Math.min(bestRttNanos, snapshot.rttNanos.getAsLong()); - worstRttNanos = Math.max(worstRttNanos, snapshot.rttNanos.getAsLong()); + for (OptionalLong rtt : rtts) { + if (rtt.isPresent()) { + bestRttNanos = Math.min(bestRttNanos, rtt.getAsLong()); + worstRttNanos = Math.max(worstRttNanos, rtt.getAsLong()); } } @@ -144,12 +143,16 @@ private static ChannelStats[] computeScores(List preShu // can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have a similar rttScore (near zero). // Note, this can only be computed when we have all the snapshots in front of us. long rttRange = worstRttNanos - bestRttNanos; - if (bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0 && rttRange > 0) { - for (ChannelStats snapshot : snapshotArray) { - if (snapshot.rttNanos.isPresent()) { - snapshot.rttSpectrum = ((float) snapshot.rttNanos.getAsLong() - bestRttNanos) / rttRange; - } - } + boolean includeRtts = bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0 && rttRange > 0; + + SortableChannel[] snapshotArray = new SortableChannel[preShuffled.size()]; + for (int i = 0; i < preShuffled.size(); i++) { + OptionalLong rtt = rtts[i]; + float rttSpectrum = + (includeRtts && rtt.isPresent()) ? ((float) rtt.getAsLong() - bestRttNanos) / rttRange : 0; + + SortableChannel snapshot = preShuffled.get(i).computeScore(rttSpectrum); + snapshotArray[i] = snapshot; } return snapshotArray; @@ -163,7 +166,7 @@ private static List shuffleImmutableList(ImmutableList sourceList, Ran } @VisibleForTesting - Stream getScoresForTesting() { + Stream getScoresForTesting() { return Arrays.stream(computeScores(channels)); } @@ -234,8 +237,14 @@ public Optional> maybeExecute(Endpoint endpoint, Requ return maybe; } - ChannelStats getSnapshot() { - return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getRttNanos()); + SortableChannel computeScore(float rttSpectrum) { + // it's important that scores are integers because if we kept the full double precision, then a single 4xx + // would end up influencing host selection long beyond its intended lifespan in the absence of other data. + int score = inflight.get() + + Ints.saturatedCast(Math.round(recentFailuresReservoir.get())) + + Ints.saturatedCast(Math.round(rttSpectrum * RTT_WEIGHT)); + + return new SortableChannel(score, this); } @Override @@ -254,33 +263,22 @@ public String toString() { * might change mid-sort, leading to undefined behaviour. */ @VisibleForTesting - static final class ChannelStats { + static final class SortableChannel { + private final int score; private final MutableChannelWithStats delegate; - private final int requestsInflight; - private final double recentBadness; - - private final OptionalLong rttNanos; - private float rttSpectrum = 0; // assigned later once we've figured out the range of rttNanos across channels - ChannelStats( - MutableChannelWithStats delegate, int requestsInflight, double recentBadness, OptionalLong rttNanos) { + SortableChannel(int score, MutableChannelWithStats delegate) { this.delegate = delegate; - this.requestsInflight = requestsInflight; - this.recentBadness = recentBadness; - this.rttNanos = rttNanos; + this.score = score; } int getScore() { - // it's important that scores are integers because if we kept the full double precision, then a single 4xx - // would end up influencing host selection long beyond its intended lifespan in the absence of other data. - return requestsInflight - + Ints.saturatedCast(Math.round(recentBadness)) - + Ints.saturatedCast(Math.round(rttSpectrum * RTT_WEIGHT)); + return score; } @Override public String toString() { - return "ChannelStats{" + "score=" + getScore() + ", delegate=" + delegate + '}'; + return "SortableChannel{score=" + score + ", delegate=" + delegate + '}'; } } @@ -302,7 +300,7 @@ private static void registerGauges( DialogueInternalWeakReducingGauge.getOrCreate( taggedMetrics, metricName, - c -> c.getSnapshot().getScore(), // note, this doesn't include rtts + c -> c.computeScore(0).getScore(), longStream -> { long[] longs = longStream.toArray(); if (log.isInfoEnabled() && longs.length > 1) { From eb76abb96d3858f0788610c16ef61452b98e72b8 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Thu, 28 May 2020 19:01:35 +0100 Subject: [PATCH 26/35] Feature flag it off --- .../BalancedNodeSelectionStrategyChannel.java | 98 +++++++++++-------- .../core/NodeSelectionStrategyChannel.java | 4 +- ...ancedNodeSelectionStrategyChannelTest.java | 17 +++- .../dialogue/core/NodeSelectionBenchmark.java | 5 +- 4 files changed, 78 insertions(+), 46 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index e642224d9..526728470 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -43,6 +43,7 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; import java.util.stream.Stream; +import javax.annotation.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -75,6 +76,8 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { private final ImmutableList channels; private final Random random; private final Ticker clock; + + @Nullable private final RttSampler rttSampler; BalancedNodeSelectionStrategyChannel( @@ -82,16 +85,16 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { Random random, Ticker ticker, TaggedMetricRegistry taggedMetrics, - String channelName) { + String channelName, + RttSampling samplingEnabled) { Preconditions.checkState(channels.size() >= 2, "At least two channels required"); this.random = random; this.clock = ticker; - this.rttSampler = new RttSampler(channels, clock); + this.rttSampler = samplingEnabled == RttSampling.DEFAULT_OFF ? null : new RttSampler(channels, clock); this.channels = IntStream.range(0, channels.size()) .mapToObj(index -> new MutableChannelWithStats( channels.get(index), clock, - rttSampler.get(index), PerHostObservability.create(channels, taggedMetrics, channelName, index))) .collect(ImmutableList.toImmutableList()); @@ -99,6 +102,11 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel { log.debug("Initialized", SafeArg.of("count", channels.size()), UnsafeArg.of("channels", channels)); } + enum RttSampling { + DEFAULT_OFF, + ENABLED + } + @Override public Optional> maybeExecute(Endpoint endpoint, Request request) { // pre-shuffling is pretty important here, otherwise when there are no requests in flight, we'd @@ -107,13 +115,15 @@ public Optional> maybeExecute(Endpoint endpoint, Requ // TODO(dfox): P2C optimization when we have high number of nodes to save CPU? // http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf - SortableChannel[] sortableChannels = computeScores(preShuffled); + SortableChannel[] sortableChannels = computeScores(rttSampler, preShuffled); Arrays.sort(sortableChannels, BY_SCORE); for (SortableChannel channel : sortableChannels) { Optional> maybe = channel.delegate.maybeExecute(endpoint, request); if (maybe.isPresent()) { - rttSampler.maybeSampleRtts(); + if (rttSampler != null) { + rttSampler.maybeSampleRtts(); + } return maybe; } } @@ -121,41 +131,51 @@ public Optional> maybeExecute(Endpoint endpoint, Requ return Optional.empty(); } - private static SortableChannel[] computeScores(List preShuffled) { - OptionalLong[] rtts = new OptionalLong[preShuffled.size()]; - for (int i = 0; i < preShuffled.size(); i++) { - rtts[i] = preShuffled.get(i).rtt.getRttNanos(); - } + private static SortableChannel[] computeScores( + @Nullable RttSampler rttSampler, List preShuffled) { + SortableChannel[] snapshotArray = new SortableChannel[preShuffled.size()]; - long bestRttNanos = Long.MAX_VALUE; - long worstRttNanos = 0; - for (OptionalLong rtt : rtts) { - if (rtt.isPresent()) { - bestRttNanos = Math.min(bestRttNanos, rtt.getAsLong()); - worstRttNanos = Math.max(worstRttNanos, rtt.getAsLong()); - } - } + if (rttSampler == null) { - // Latency (rtt) is measured in nanos, which is a tricky unit to include in our 'score' because adding it - // would dominate all the other data (which has the unit of 'num requests'). To avoid the need for a conversion - // fudge-factor, we instead figure out where each rtt lies on the spectrum from bestRttNanos to worstRttNanos, - // with 0 being best and 1 being worst. This ensures that if several nodes are all within the same AZ and - // can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have a similar rttScore (near zero). - // Note, this can only be computed when we have all the snapshots in front of us. - long rttRange = worstRttNanos - bestRttNanos; - boolean includeRtts = bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0 && rttRange > 0; + for (int i = 0; i < preShuffled.size(); i++) { + SortableChannel snapshot = preShuffled.get(i).computeScore(0); + snapshotArray[i] = snapshot; + } + return snapshotArray; + + } else { + // Latency (rtt) is measured in nanos, which is a tricky unit to include in our 'score' because adding + // it would dominate all the other data (which has the unit of 'num requests'). To avoid the need for a + // conversion fudge-factor, we instead figure out where each rtt lies on the spectrum from bestRttNanos + // to worstRttNanos, with 0 being best and 1 being worst. This ensures that if several nodes are all + // within the same AZ and can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have + // a similar rttScore (near zero). Note, this can only be computed when we have all the snapshots in + // front of us. + long bestRttNanos = Long.MAX_VALUE; + long worstRttNanos = 0; + + OptionalLong[] rtts = new OptionalLong[preShuffled.size()]; + for (int i = 0; i < preShuffled.size(); i++) { + OptionalLong rtt = rttSampler.get(i).getRttNanos(); + rtts[i] = rtt; + + if (rtt.isPresent()) { + bestRttNanos = Math.min(bestRttNanos, rtt.getAsLong()); + worstRttNanos = Math.max(worstRttNanos, rtt.getAsLong()); + } + } + long rttRange = worstRttNanos - bestRttNanos; - SortableChannel[] snapshotArray = new SortableChannel[preShuffled.size()]; - for (int i = 0; i < preShuffled.size(); i++) { - OptionalLong rtt = rtts[i]; - float rttSpectrum = - (includeRtts && rtt.isPresent()) ? ((float) rtt.getAsLong() - bestRttNanos) / rttRange : 0; + for (int i = 0; i < preShuffled.size(); i++) { + OptionalLong rtt = rtts[i]; + float rttSpectrum = + (rttRange > 0 && rtt.isPresent()) ? ((float) rtt.getAsLong() - bestRttNanos) / rttRange : 0; - SortableChannel snapshot = preShuffled.get(i).computeScore(rttSpectrum); - snapshotArray[i] = snapshot; + SortableChannel snapshot = preShuffled.get(i).computeScore(rttSpectrum); + snapshotArray[i] = snapshot; + } + return snapshotArray; } - - return snapshotArray; } /** Returns a new shuffled list, without mutating the input list (which may be immutable). */ @@ -167,7 +187,7 @@ private static List shuffleImmutableList(ImmutableList sourceList, Ran @VisibleForTesting Stream getScoresForTesting() { - return Arrays.stream(computeScores(channels)); + return Arrays.stream(computeScores(rttSampler, channels)); } @Override @@ -178,7 +198,6 @@ public String toString() { private static final class MutableChannelWithStats implements LimitedChannel { private final LimitedChannel delegate; private final FutureCallback updateStats; - private final RttSampler.RttSupplier rtt; private final PerHostObservability observability; private final AtomicInteger inflight = new AtomicInteger(0); @@ -190,11 +209,9 @@ private static final class MutableChannelWithStats implements LimitedChannel { */ private final CoarseExponentialDecayReservoir recentFailuresReservoir; - MutableChannelWithStats( - LimitedChannel delegate, Ticker clock, RttSampler.RttSupplier rtt, PerHostObservability observability) { + MutableChannelWithStats(LimitedChannel delegate, Ticker clock, PerHostObservability observability) { this.delegate = delegate; this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY); - this.rtt = rtt; this.observability = observability; // Saves one allocation on each network call this.updateStats = new FutureCallback() { @@ -253,7 +270,6 @@ public String toString() { + "delegate=" + delegate + ", inflight=" + inflight + ", recentFailures=" + recentFailuresReservoir - + ", rtt=" + rtt + '}'; } } diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java index 461e56c39..69bf6155d 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java @@ -24,6 +24,7 @@ import com.palantir.dialogue.Endpoint; import com.palantir.dialogue.Request; import com.palantir.dialogue.Response; +import com.palantir.dialogue.core.BalancedNodeSelectionStrategyChannel.RttSampling; import com.palantir.logsafe.SafeArg; import com.palantir.logsafe.exceptions.SafeRuntimeException; import com.palantir.tritium.metrics.registry.TaggedMetricRegistry; @@ -140,7 +141,8 @@ private NodeSelectionChannel createNodeSelectionChannel( // When people ask for 'ROUND_ROBIN', they usually just want something to load balance better. // We used to have a naive RoundRobinChannel, then tried RandomSelection and now use this heuristic: return channelBuilder - .channel(new BalancedNodeSelectionStrategyChannel(channels, random, tick, metrics, channelName)) + .channel(new BalancedNodeSelectionStrategyChannel( + channels, random, tick, metrics, channelName, RttSampling.DEFAULT_OFF)) .build(); case UNKNOWN: } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 9164375e4..606a94e5e 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -33,6 +33,7 @@ import com.palantir.dialogue.Response; import com.palantir.dialogue.TestEndpoint; import com.palantir.dialogue.TestResponse; +import com.palantir.dialogue.core.BalancedNodeSelectionStrategyChannel.RttSampling; import com.palantir.tritium.metrics.registry.DefaultTaggedMetricRegistry; import java.time.Duration; import java.util.Optional; @@ -65,7 +66,12 @@ class BalancedNodeSelectionStrategyChannelTest { @BeforeEach public void before() { channel = new BalancedNodeSelectionStrategyChannel( - ImmutableList.of(chan1, chan2), random, clock, new DefaultTaggedMetricRegistry(), "channelName"); + ImmutableList.of(chan1, chan2), + random, + clock, + new DefaultTaggedMetricRegistry(), + "channelName", + RttSampling.DEFAULT_OFF); } @Test @@ -149,6 +155,13 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu @Test void rtt_is_measured_and_can_influence_choices() { + channel = new BalancedNodeSelectionStrategyChannel( + ImmutableList.of(chan1, chan2), + random, + clock, + new DefaultTaggedMetricRegistry(), + "channelName", + RttSampling.ENABLED); incrementClockBy(Duration.ofHours(1)); // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); @@ -198,7 +211,7 @@ void when_rtt_measurements_are_limited_dont_freak_out() { } @Test - void when_rtt_measurements_havent_returned_yet_dont_freak_out() { + void when_rtt_measurements_havent_returned_yet_consider_both_far_away() { incrementClockBy(Duration.ofHours(1)); // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); diff --git a/dialogue-jmh/src/main/java/com/palantir/dialogue/core/NodeSelectionBenchmark.java b/dialogue-jmh/src/main/java/com/palantir/dialogue/core/NodeSelectionBenchmark.java index f58380571..d407339e6 100644 --- a/dialogue-jmh/src/main/java/com/palantir/dialogue/core/NodeSelectionBenchmark.java +++ b/dialogue-jmh/src/main/java/com/palantir/dialogue/core/NodeSelectionBenchmark.java @@ -26,6 +26,7 @@ import com.palantir.dialogue.Response; import com.palantir.dialogue.TestEndpoint; import com.palantir.dialogue.TestResponse; +import com.palantir.dialogue.core.BalancedNodeSelectionStrategyChannel.RttSampling; import com.palantir.logsafe.exceptions.SafeIllegalArgumentException; import com.palantir.random.SafeThreadLocalRandom; import com.palantir.tritium.metrics.registry.DefaultTaggedMetricRegistry; @@ -123,8 +124,8 @@ public void before() { "channelName"); break; case ROUND_ROBIN: - channel = - new BalancedNodeSelectionStrategyChannel(channels, random, ticker, metrics, "channelName"); + channel = new BalancedNodeSelectionStrategyChannel( + channels, random, ticker, metrics, "channelName", RttSampling.DEFAULT_OFF); break; default: throw new SafeIllegalArgumentException("Unsupported"); From 4f73a902ea0190e67a63a3dd45fb034fe71b3ef5 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Fri, 29 May 2020 14:18:57 +0100 Subject: [PATCH 27/35] Allow servers to enable it with BALANCED_RTT --- .../core/DialogueNodeSelectionStrategy.java | 29 +++++++++++++------ .../core/NodeSelectionStrategyChannel.java | 5 ++++ .../DialogueNodeSelectionStrategyTest.java | 4 +++ 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategy.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategy.java index 5261d51aa..94218ec43 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategy.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategy.java @@ -16,6 +16,7 @@ package com.palantir.dialogue.core; +import com.google.common.annotations.Beta; import com.google.common.base.Splitter; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; @@ -35,6 +36,8 @@ enum DialogueNodeSelectionStrategy { PIN_UNTIL_ERROR, PIN_UNTIL_ERROR_WITHOUT_RESHUFFLE, BALANCED, + @Beta + BALANCED_RTT, UNKNOWN; private static final Logger log = LoggerFactory.getLogger(DialogueNodeSelectionStrategy.class); @@ -45,17 +48,25 @@ static List fromHeader(String header) { Lists.transform(SPLITTER.splitToList(header), DialogueNodeSelectionStrategy::safeValueOf)); } - private static DialogueNodeSelectionStrategy safeValueOf(String value) { - String normalizedValue = value.toUpperCase(); - if (PIN_UNTIL_ERROR.name().equals(normalizedValue)) { - return PIN_UNTIL_ERROR; - } else if (PIN_UNTIL_ERROR_WITHOUT_RESHUFFLE.name().equals(normalizedValue)) { - return PIN_UNTIL_ERROR_WITHOUT_RESHUFFLE; - } else if (BALANCED.name().equals(normalizedValue)) { - return BALANCED; + /** + * We allow server-determined headers to access some incubating dialogue-specific strategies (e.g. BALANCED_RTT) + * which users can't normally configure. + */ + private static DialogueNodeSelectionStrategy safeValueOf(String string) { + String uppercaseString = string.toUpperCase(); + + switch (uppercaseString) { + case "PIN_UNTIL_ERROR": + return PIN_UNTIL_ERROR; + case "PIN_UNTIL_ERROR_WITHOUT_RESHUFFLE": + return PIN_UNTIL_ERROR_WITHOUT_RESHUFFLE; + case "BALANCED": + return BALANCED; + case "BALANCED_RTT": + return BALANCED_RTT; } - log.info("Received unknown selection strategy {}", SafeArg.of("strategy", value)); + log.info("Received unknown selection strategy {}", SafeArg.of("strategy", string)); return UNKNOWN; } diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java index 69bf6155d..aad2ee31c 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java @@ -144,6 +144,11 @@ private NodeSelectionChannel createNodeSelectionChannel( .channel(new BalancedNodeSelectionStrategyChannel( channels, random, tick, metrics, channelName, RttSampling.DEFAULT_OFF)) .build(); + case BALANCED_RTT: + return channelBuilder + .channel(new BalancedNodeSelectionStrategyChannel( + channels, random, tick, metrics, channelName, RttSampling.ENABLED)) + .build(); case UNKNOWN: } throw new SafeRuntimeException("Unknown NodeSelectionStrategy", SafeArg.of("unknown", strategy)); diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategyTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategyTest.java index 91b11a2fb..757ebcb9e 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategyTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategyTest.java @@ -32,6 +32,10 @@ void parses_single_strategy() { void parses_multiple_strategies() { assertThat(DialogueNodeSelectionStrategy.fromHeader("BALANCED, PIN_UNTIL_ERROR")) .containsExactly(DialogueNodeSelectionStrategy.BALANCED, DialogueNodeSelectionStrategy.PIN_UNTIL_ERROR); + assertThat(DialogueNodeSelectionStrategy.fromHeader("BALANCED_RTT, BALANCED")) + .containsExactly(DialogueNodeSelectionStrategy.BALANCED_RTT, DialogueNodeSelectionStrategy.BALANCED); + assertThat(DialogueNodeSelectionStrategy.fromHeader("BALANCED_FUTURE_EXPERIMENT, BALANCED")) + .containsExactly(DialogueNodeSelectionStrategy.UNKNOWN, DialogueNodeSelectionStrategy.BALANCED); } @Test From f3642056efa12e206c381a65fa6c96c2e8371841 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Fri, 29 May 2020 14:45:49 +0100 Subject: [PATCH 28/35] Re-run simulations --- ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- .../black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- simulation/src/test/resources/report.md | 12 ++++++------ ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png | 4 ++-- ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- .../black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt | 2 +- 13 files changed, 24 insertions(+), 24 deletions(-) diff --git a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 12fe6c646..d8a479f77 100644 --- a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:516145937579fb582d7a76100b05fd7054b6357b95d560770988fc5b460e20f0 -size 115803 +oid sha256:17e4b03f7618600213e834e93b52bc594fa9e96f5cf6220030e284d6250a6176 +size 113249 diff --git a/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 94064ea2d..4f58c5438 100644 --- a/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7252511a753523b3869eea5007f48298746fab76bf097cc512767c9bb1699fc6 -size 103509 +oid sha256:301c818a4e266379a6c45dee81d725d1dad6f3c8d2abba49d38a1df8e68ac7ee +size 103151 diff --git a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png index b0f68688b..24e0252de 100644 --- a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6d4e200f4549b0c11a3bdd95af0c8e1dbafa2bb7fb7d2c5c27b48e9a9ed1d86 -size 122953 +oid sha256:e925ad02dd69d9f2a5ef8b47ad43aabd5a14106ce0937a73e9780a9a11f3c435 +size 123045 diff --git a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 437e4e988..aa3d69e55 100644 --- a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e2f1f9109ba521a3226d4d9fb745c3b7c9c440efb0ccc4d71f7ee5037199054 -size 133708 +oid sha256:d492690bce9b84925c16675a75ff55af3cfb180b8b0f753c1fd036f317c2801c +size 137045 diff --git a/simulation/src/test/resources/report.md b/simulation/src/test/resources/report.md index c773eef36..892475d24 100644 --- a/simulation/src/test/resources/report.md +++ b/simulation/src/test/resources/report.md @@ -2,10 +2,10 @@ ``` all_nodes_500[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=79.3% client_mean=PT5.81342S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1586, 500=414} - all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=73.6% client_mean=PT2.91032S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1472, 500=528} + all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=73.7% client_mean=PT3.039455S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1474, 500=526} all_nodes_500[UNLIMITED_ROUND_ROBIN].txt: success=50.0% client_mean=PT0.6S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1000, 500=1000} black_hole[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=59.2% client_mean=PT0.600655114S server_cpu=PT11M49.8S client_received=1183/2000 server_resps=1183 codes={200=1183} - black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=91.4% client_mean=PT0.6S server_cpu=PT18M16.8S client_received=1828/2000 server_resps=1828 codes={200=1828} + black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=91.5% client_mean=PT0.6S server_cpu=PT18M17.4S client_received=1829/2000 server_resps=1829 codes={200=1829} black_hole[UNLIMITED_ROUND_ROBIN].txt: success=91.4% client_mean=PT0.6S server_cpu=PT18M16.8S client_received=1828/2000 server_resps=1828 codes={200=1828} drastic_slowdown[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT2.947028083S server_cpu=PT41M8.862333314S client_received=4000/4000 server_resps=4000 codes={200=4000} drastic_slowdown[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.251969999S server_cpu=PT16M47.879999984S client_received=4000/4000 server_resps=4000 codes={200=4000} @@ -17,16 +17,16 @@ fast_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.120107163S server_cpu=PT1H30M0.0000004S client_received=45000/45000 server_resps=45040 codes={200=45000} fast_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.120107163S server_cpu=PT1H30M0.0000004S client_received=45000/45000 server_resps=45040 codes={200=45000} live_reloading[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=94.3% client_mean=PT6.987708S server_cpu=PT1H56M8.59S client_received=2500/2500 server_resps=2500 codes={200=2357, 500=143} - live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=84.0% client_mean=PT4.5915112S server_cpu=PT1H52M50.27S client_received=2500/2500 server_resps=2500 codes={200=2101, 500=399} + live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=84.0% client_mean=PT4.6409416S server_cpu=PT1H53M13.43S client_received=2500/2500 server_resps=2500 codes={200=2101, 500=399} live_reloading[UNLIMITED_ROUND_ROBIN].txt: success=86.9% client_mean=PT2.802124S server_cpu=PT1H56M45.31S client_received=2500/2500 server_resps=2500 codes={200=2173, 500=327} one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT2.569766928S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT1.521339525S server_cpu=PT2M30S client_received=1000/1000 server_resps=1000 codes={200=1000} one_big_spike[UNLIMITED_ROUND_ROBIN].txt: success=99.9% client_mean=PT1.000523583S server_cpu=PT7M37.35S client_received=1000/1000 server_resps=3049 codes={200=999, 429=1} one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=64.9% client_mean=PT8.1950896S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1623, 500=877} - one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=64.2% client_mean=PT3.9523312S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1605, 500=895} + one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=65.4% client_mean=PT4.0117872S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1636, 500=864} one_endpoint_dies_on_each_server[UNLIMITED_ROUND_ROBIN].txt: success=65.1% client_mean=PT0.6S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1628, 500=872} server_side_rate_limits[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT12M44.762591546S server_cpu=PT9H55M40.4S client_received=150000/150000 server_resps=178702 codes={200=149989, 429=11} - server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.262702651S server_cpu=PT9H5M50.6S client_received=150000/150000 server_resps=163753 codes={200=150000} + server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.290099498S server_cpu=PT9H9M39.2S client_received=150000/150000 server_resps=164896 codes={200=150000} server_side_rate_limits[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.217416858S server_cpu=PT8H45M29.2S client_received=150000/150000 server_resps=157646 codes={200=150000} simplest_possible_case[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT0.834469696S server_cpu=PT3H3M35S client_received=13200/13200 server_resps=13200 codes={200=13200} simplest_possible_case[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.785757575S server_cpu=PT2H52M52S client_received=13200/13200 server_resps=13200 codes={200=13200} @@ -35,7 +35,7 @@ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: succe slow_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.088828705S server_cpu=PT4M22.364666657S client_received=3000/3000 server_resps=3031 codes={200=3000} slow_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.088828705S server_cpu=PT4M22.364666657S client_received=3000/3000 server_resps=3031 codes={200=3000} slowdown_and_error_thresholds[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT2M28.388896928S server_cpu=PT8H48M18.546665835S client_received=10000/10000 server_resps=10899 codes={200=10000} - slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT3M18.314817484S server_cpu=PT13H50M38.666666634S client_received=10000/10000 server_resps=13148 codes={200=9995, 500=5} + slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT3M24.308654421S server_cpu=PT14H3M44.353333281S client_received=10000/10000 server_resps=13383 codes={200=9999, 500=1} slowdown_and_error_thresholds[UNLIMITED_ROUND_ROBIN].txt: success=3.6% client_mean=PT21.551691121S server_cpu=PT54H45M57.899999949S client_received=10000/10000 server_resps=49335 codes={200=360, 500=9640} uncommon_flakes[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=99.0% client_mean=PT0.000001S server_cpu=PT0.01S client_received=10000/10000 server_resps=10000 codes={200=9900, 500=100} uncommon_flakes[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=99.0% client_mean=PT0.000001S server_cpu=PT0.01S client_received=10000/10000 server_resps=10000 codes={200=9900, 500=100} diff --git a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 4a1422710..3c1b66b6a 100644 --- a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29856bd2342821f7d0c0bb19b4b1efdcaed1af8b3c36cb2533485c55e0e00bfd -size 560290 +oid sha256:33cfa2b188b84d30e139b5a8ff39a646f65588b5cf7b5755a2a61993c9d38c8e +size 606983 diff --git a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png index 91ad7456a..f5dcc152c 100644 --- a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png +++ b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc3169a632ae8fd7de4e96ba5bb2d39b99e97e12416d37c5fb6516a452a1eedf -size 119379 +oid sha256:160c566ff4a67cdd8ecddbb91c2262d1310a4925728dadb36ff81bf9a9237875 +size 120294 diff --git a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 71eaf7e5e..a0433ed68 100644 --- a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=73.6% client_mean=PT2.91032S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1472, 500=528} \ No newline at end of file +success=73.7% client_mean=PT3.039455S server_cpu=PT20M client_received=2000/2000 server_resps=2000 codes={200=1474, 500=526} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 5c1da373e..5b70ecd5f 100644 --- a/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=91.4% client_mean=PT0.6S server_cpu=PT18M16.8S client_received=1828/2000 server_resps=1828 codes={200=1828} \ No newline at end of file +success=91.5% client_mean=PT0.6S server_cpu=PT18M17.4S client_received=1829/2000 server_resps=1829 codes={200=1829} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index e253a01c0..46b26cdea 100644 --- a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=84.0% client_mean=PT4.5915112S server_cpu=PT1H52M50.27S client_received=2500/2500 server_resps=2500 codes={200=2101, 500=399} \ No newline at end of file +success=84.0% client_mean=PT4.6409416S server_cpu=PT1H53M13.43S client_received=2500/2500 server_resps=2500 codes={200=2101, 500=399} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index a4008cf4c..561f056b7 100644 --- a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=64.2% client_mean=PT3.9523312S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1605, 500=895} \ No newline at end of file +success=65.4% client_mean=PT4.0117872S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1636, 500=864} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index bfc7b44c2..ffdfaca53 100644 --- a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT0.262702651S server_cpu=PT9H5M50.6S client_received=150000/150000 server_resps=163753 codes={200=150000} \ No newline at end of file +success=100.0% client_mean=PT0.290099498S server_cpu=PT9H9M39.2S client_received=150000/150000 server_resps=164896 codes={200=150000} \ No newline at end of file diff --git a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt index 9fe47e0ea..d250cf150 100644 --- a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt +++ b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt @@ -1 +1 @@ -success=100.0% client_mean=PT3M18.314817484S server_cpu=PT13H50M38.666666634S client_received=10000/10000 server_resps=13148 codes={200=9995, 500=5} \ No newline at end of file +success=100.0% client_mean=PT3M24.308654421S server_cpu=PT14H3M44.353333281S client_received=10000/10000 server_resps=13383 codes={200=9999, 500=1} \ No newline at end of file From 6f9804ad4818432992db0cbe8859d3d1a9493671 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Fri, 29 May 2020 14:48:43 +0100 Subject: [PATCH 29/35] Fix tests --- ...ancedNodeSelectionStrategyChannelTest.java | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index 606a94e5e..c95d68ec6 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -59,6 +59,7 @@ class BalancedNodeSelectionStrategyChannelTest { private Endpoint endpoint = TestEndpoint.GET; private BalancedNodeSelectionStrategyChannel channel; + private BalancedNodeSelectionStrategyChannel rttChannel; @Mock Ticker clock; @@ -72,6 +73,13 @@ public void before() { new DefaultTaggedMetricRegistry(), "channelName", RttSampling.DEFAULT_OFF); + rttChannel = new BalancedNodeSelectionStrategyChannel( + ImmutableList.of(chan1, chan2), + random, + clock, + new DefaultTaggedMetricRegistry(), + "channelName", + RttSampling.ENABLED); } @Test @@ -133,35 +141,28 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu when(chan2.maybeExecute(any(), any())).thenReturn(http(200)); for (int i = 0; i < 11; i++) { - channel.maybeExecute(endpoint, request); - assertThat(channel.getScoresForTesting().map(c -> c.getScore())) - .describedAs("%s %s: Scores not affected yet %s", i, Duration.ofNanos(clock.read()), channel) + rttChannel.maybeExecute(endpoint, request); + assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())) + .describedAs("%s %s: Scores not affected yet %s", i, Duration.ofNanos(clock.read()), rttChannel) .containsExactly(0, 0); incrementClockBy(Duration.ofMillis(50)); } - channel.maybeExecute(endpoint, request); - assertThat(channel.getScoresForTesting().map(c -> c.getScore())) - .describedAs("%s: Constant 4xxs did move the needle %s", Duration.ofNanos(clock.read()), channel) + rttChannel.maybeExecute(endpoint, request); + assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())) + .describedAs("%s: Constant 4xxs did move the needle %s", Duration.ofNanos(clock.read()), rttChannel) .containsExactly(1, 0); incrementClockBy(Duration.ofSeconds(5)); - assertThat(channel.getScoresForTesting().map(c -> c.getScore())) + assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())) .describedAs( "%s: We quickly forget about 4xxs and go back to fair shuffling %s", - Duration.ofNanos(clock.read()), channel) + Duration.ofNanos(clock.read()), rttChannel) .containsExactly(0, 0); } @Test void rtt_is_measured_and_can_influence_choices() { - channel = new BalancedNodeSelectionStrategyChannel( - ImmutableList.of(chan1, chan2), - random, - clock, - new DefaultTaggedMetricRegistry(), - "channelName", - RttSampling.ENABLED); incrementClockBy(Duration.ofHours(1)); // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200)); @@ -173,7 +174,7 @@ void rtt_is_measured_and_can_influence_choices() { when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan1OptionsResponse)); when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan2OptionsResponse)); - channel.maybeExecute(endpoint, request); + rttChannel.maybeExecute(endpoint, request); incrementClockBy(Duration.ofNanos(123)); chan1OptionsResponse.set(new TestResponse().code(200)); @@ -181,13 +182,13 @@ void rtt_is_measured_and_can_influence_choices() { incrementClockBy(Duration.ofNanos(456)); chan2OptionsResponse.set(new TestResponse().code(200)); - assertThat(channel.getScoresForTesting().map(c -> c.getScore())) + assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())) .describedAs("The poor latency of channel2 imposes a small constant penalty in the score") .containsExactly(0, 3); for (int i = 0; i < 500; i++) { incrementClockBy(Duration.ofMillis(10)); - channel.maybeExecute(endpoint, request); + rttChannel.maybeExecute(endpoint, request); } // rate limiter ensures a sensible amount of rtt sampling verify(chan1, times(6)).maybeExecute(eq(rttEndpoint), any()); @@ -205,7 +206,7 @@ void when_rtt_measurements_are_limited_dont_freak_out() { when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty()); when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty()); - channel.maybeExecute(endpoint, request); + rttChannel.maybeExecute(endpoint, request); assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0); } @@ -222,10 +223,10 @@ void when_rtt_measurements_havent_returned_yet_consider_both_far_away() { for (int i = 0; i < 20; i++) { incrementClockBy(Duration.ofSeconds(5)); - channel.maybeExecute(endpoint, request); + rttChannel.maybeExecute(endpoint, request); } - assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0); + assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0); verify(chan1, times(1)).maybeExecute(eq(rttEndpoint), any()); verify(chan2, times(1)).maybeExecute(eq(rttEndpoint), any()); } From 0128c4eca6327b5179cdaf2c2add6b23f14f74db Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Fri, 29 May 2020 16:53:55 +0100 Subject: [PATCH 30/35] Ensure we send a good user agent with these OPTIONS requests --- .../com/palantir/dialogue/core/RttSampler.java | 18 ++++++++++++++---- .../core/UserAgentEndpointChannel.java | 9 +++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java index 9873cde9f..3c0f463ed 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java @@ -23,6 +23,9 @@ import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.MoreExecutors; +import com.palantir.conjure.java.api.config.service.UserAgent; +import com.palantir.conjure.java.api.config.service.UserAgent.Agent; +import com.palantir.conjure.java.api.config.service.UserAgents; import com.palantir.dialogue.Endpoint; import com.palantir.dialogue.HttpMethod; import com.palantir.dialogue.Request; @@ -48,6 +51,9 @@ final class RttSampler { private static final Logger log = LoggerFactory.getLogger(RttSampler.class); + private static final String USER_AGENT = + UserAgents.format(UserAgent.of(Agent.of(RttEndpoint.INSTANCE.serviceName(), RttEndpoint.INSTANCE.version())) + .addAgent(UserAgentEndpointChannel.DIALOGUE_AGENT)); private final ImmutableList channels; private final RttMeasurement[] rtts; @@ -79,12 +85,16 @@ void maybeSampleRtts() { return; } + Request rttRequest = Request.builder() + // necessary as we've already gone through the UserAgentEndpointChannel + .putHeaderParams("user-agent", USER_AGENT) + .build(); + List> futures = IntStream.range(0, channels.size()) .mapToObj(i -> { long before = clock.read(); return channels.get(i) - .maybeExecute( - RttEndpoint.INSTANCE, Request.builder().build()) + .maybeExecute(RttEndpoint.INSTANCE, rttRequest) .map(future -> Futures.transform( future, _response -> { @@ -140,7 +150,7 @@ public HttpMethod httpMethod() { @Override public String serviceName() { - return "Balanced"; + return "RttSampler"; } @Override @@ -150,7 +160,7 @@ public String endpointName() { @Override public String version() { - return "0.0.0"; + return UserAgentEndpointChannel.dialogueVersion(); } } diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/UserAgentEndpointChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/UserAgentEndpointChannel.java index 0a116dd79..46a5a5f88 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/UserAgentEndpointChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/UserAgentEndpointChannel.java @@ -31,7 +31,7 @@ * {@link Endpoint}'s target service and endpoint. */ final class UserAgentEndpointChannel implements EndpointChannel { - private static final UserAgent.Agent DIALOGUE_AGENT = extractDialogueAgent(); + static final UserAgent.Agent DIALOGUE_AGENT = extractDialogueAgent(); private final EndpointChannel delegate; private final String userAgent; @@ -62,8 +62,13 @@ private static UserAgent augmentUserAgent(UserAgent baseAgent, Endpoint endpoint } private static UserAgent.Agent extractDialogueAgent() { + String version = dialogueVersion(); + return UserAgent.Agent.of("dialogue", version); + } + + static String dialogueVersion() { String maybeDialogueVersion = Channel.class.getPackage().getImplementationVersion(); - return UserAgent.Agent.of("dialogue", maybeDialogueVersion != null ? maybeDialogueVersion : "0.0.0"); + return maybeDialogueVersion != null ? maybeDialogueVersion : "0.0.0"; } @Override From 1af6a22d361f554b52f23994254be60a34a75225 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Fri, 29 May 2020 17:00:03 +0100 Subject: [PATCH 31/35] More CR --- .../BalancedNodeSelectionStrategyChannel.java | 3 ++- .../com/palantir/dialogue/core/RttSampler.java | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 526728470..97313832d 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -49,7 +49,8 @@ /** * Chooses nodes based on stats about each channel, i.e. how many requests are currently - * being served and also how many failures have been seen in the last few seconds. + * being served, how many failures have been seen in the last few seconds and (optionally) also what the best latency + * to each node is. Use {@link RttSampling#ENABLED} to switch this on. * * This is intended to be a strict improvement over Round Robin and Random Selection which can leave fast servers * underutilized, as it sends the same number to both a slow and fast node. It is *not* appropriate for transactional diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java index 3c0f463ed..67bc07e0f 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java @@ -41,7 +41,6 @@ import java.util.OptionalLong; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicLong; import java.util.function.LongSupplier; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -80,7 +79,7 @@ RttSupplier get(int index) { * Non-blocking. */ void maybeSampleRtts() { - Optional maybePermit = rateLimiter.acquireNonBlocking(); + Optional maybePermit = rateLimiter.tryAcquire(); if (!maybePermit.isPresent()) { return; } @@ -210,22 +209,23 @@ private static final class RttMeasurementRateLimiter { private static final long BETWEEN_SAMPLES = Duration.ofSeconds(1).toNanos(); private final Ticker clock; - private final AtomicLong lastMeasured = new AtomicLong(0); private final AtomicBoolean currentlySampling = new AtomicBoolean(false); + private volatile long lastMeasured = 0; + + private final RttMeasurementPermit finishedSampling = () -> currentlySampling.set(false); private RttMeasurementRateLimiter(Ticker clock) { this.clock = clock; } - Optional acquireNonBlocking() { - long last = lastMeasured.get(); - if (last + BETWEEN_SAMPLES > clock.read()) { + Optional tryAcquire() { + if (lastMeasured + BETWEEN_SAMPLES > clock.read()) { return Optional.empty(); } - if (currentlySampling.compareAndSet(false, true)) { - lastMeasured.set(clock.read()); - return Optional.of(() -> currentlySampling.compareAndSet(true, false)); + if (!currentlySampling.get() && currentlySampling.compareAndSet(false, true)) { + lastMeasured = clock.read(); + return Optional.of(finishedSampling); } else { log.warn("Wanted to sample RTTs but an existing sample was still in progress"); return Optional.empty(); From f1566379949b45dbcb162186cc0479e7ee5ddddb Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Fri, 29 May 2020 17:23:43 +0100 Subject: [PATCH 32/35] Move logic to RttSampler --- .../BalancedNodeSelectionStrategyChannel.java | 55 ++++------------- .../palantir/dialogue/core/RttSampler.java | 61 +++++++++++++++---- 2 files changed, 59 insertions(+), 57 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index 97313832d..f0be496ef 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -38,7 +38,6 @@ import java.util.Comparator; import java.util.List; import java.util.Optional; -import java.util.OptionalLong; import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; @@ -133,50 +132,18 @@ public Optional> maybeExecute(Endpoint endpoint, Requ } private static SortableChannel[] computeScores( - @Nullable RttSampler rttSampler, List preShuffled) { - SortableChannel[] snapshotArray = new SortableChannel[preShuffled.size()]; - - if (rttSampler == null) { - - for (int i = 0; i < preShuffled.size(); i++) { - SortableChannel snapshot = preShuffled.get(i).computeScore(0); - snapshotArray[i] = snapshot; - } - return snapshotArray; - - } else { - // Latency (rtt) is measured in nanos, which is a tricky unit to include in our 'score' because adding - // it would dominate all the other data (which has the unit of 'num requests'). To avoid the need for a - // conversion fudge-factor, we instead figure out where each rtt lies on the spectrum from bestRttNanos - // to worstRttNanos, with 0 being best and 1 being worst. This ensures that if several nodes are all - // within the same AZ and can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have - // a similar rttScore (near zero). Note, this can only be computed when we have all the snapshots in - // front of us. - long bestRttNanos = Long.MAX_VALUE; - long worstRttNanos = 0; - - OptionalLong[] rtts = new OptionalLong[preShuffled.size()]; - for (int i = 0; i < preShuffled.size(); i++) { - OptionalLong rtt = rttSampler.get(i).getRttNanos(); - rtts[i] = rtt; - - if (rtt.isPresent()) { - bestRttNanos = Math.min(bestRttNanos, rtt.getAsLong()); - worstRttNanos = Math.max(worstRttNanos, rtt.getAsLong()); - } - } - long rttRange = worstRttNanos - bestRttNanos; - - for (int i = 0; i < preShuffled.size(); i++) { - OptionalLong rtt = rtts[i]; - float rttSpectrum = - (rttRange > 0 && rtt.isPresent()) ? ((float) rtt.getAsLong() - bestRttNanos) / rttRange : 0; - - SortableChannel snapshot = preShuffled.get(i).computeScore(rttSpectrum); - snapshotArray[i] = snapshot; - } - return snapshotArray; + @Nullable RttSampler rttSampler, List chans) { + // if the feature is disabled (i.e. RttSampling.DEFAULT_OFF), then we just consider every host to have a + // rttSpectrum of '0' + float[] rttSpectrums = rttSampler != null ? rttSampler.computeRttSpectrums() : new float[chans.size()]; + + SortableChannel[] snapshotArray = new SortableChannel[chans.size()]; + for (int i = 0; i < chans.size(); i++) { + MutableChannelWithStats channel = chans.get(i); + float rttSpectrum = rttSpectrums[i]; + snapshotArray[i] = channel.computeScore(rttSpectrum); } + return snapshotArray; } /** Returns a new shuffled list, without mutating the input list (which may be immutable). */ diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java index 67bc07e0f..713f15797 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java @@ -41,7 +41,6 @@ import java.util.OptionalLong; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.function.LongSupplier; import java.util.stream.Collectors; import java.util.stream.IntStream; import javax.annotation.concurrent.ThreadSafe; @@ -69,14 +68,53 @@ final class RttSampler { } /** - * The {@link LongSupplier} returns the RTT in nanoseconds, or {@link OptionalLong#empty()} if we have no data. + * Latency (rtt) is measured in nanos, which is a tricky unit to include in our 'score' because adding + * it would dominate all the other data (which has the unit of 'num requests'). To avoid the need for a + * conversion fudge-factor, we instead figure out where each rtt lies on the spectrum from bestRttNanos + * to worstRttNanos, with 0 being best and 1 being worst. This ensures that if several nodes are all + * within the same AZ and can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have + * a similar rttScore (near zero). Note, this can only be computed when we have all the snapshots in + * front of us. */ - RttSupplier get(int index) { - return rtts[index]; + float[] computeRttSpectrums() { + long bestRttNanos = Long.MAX_VALUE; + long worstRttNanos = 0; + + // first we take a snapshot of all channels' RTT + OptionalLong[] snapshots = new OptionalLong[rtts.length]; + for (int i = 0; i < rtts.length; i++) { + OptionalLong rtt = rtts[i].getRttNanos(); + snapshots[i] = rtt; + + if (rtt.isPresent()) { + bestRttNanos = Math.min(bestRttNanos, rtt.getAsLong()); + worstRttNanos = Math.max(worstRttNanos, rtt.getAsLong()); + } + } + + // given the best & worst values, we can then compute the spectrums + float[] spectrums = new float[rtts.length]; + long rttRange = worstRttNanos - bestRttNanos; + if (rttRange <= 0) { + return spectrums; + } + + for (int i = 0; i < channels.size(); i++) { + OptionalLong rtt = snapshots[i]; + float rttSpectrum = rtt.isPresent() ? ((float) rtt.getAsLong() - bestRttNanos) / rttRange : 0; + Preconditions.checkState( + 0 <= rttSpectrum && rttSpectrum <= 1, + "rttSpectrum must be between 0 and 1", + SafeArg.of("value", rttSpectrum), + SafeArg.of("hostIndex", i)); + spectrums[i] = rttSpectrum; + } + + return spectrums; } /** - * Non-blocking. + * Non-blocking - should return pretty much instantly. */ void maybeSampleRtts() { Optional maybePermit = rateLimiter.tryAcquire(); @@ -106,8 +144,6 @@ void maybeSampleRtts() { }) .collect(ImmutableList.toImmutableList()); - // the RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off - // more and more and more requests and eventually exhaust a threadpool DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback>() { @Override public void onSuccess(List result) { @@ -169,7 +205,7 @@ public String version() { */ @VisibleForTesting @ThreadSafe - static final class RttMeasurement implements RttSupplier { + static final class RttMeasurement { private static final int NUM_MEASUREMENTS = 5; private final long[] samples; @@ -180,7 +216,6 @@ static final class RttMeasurement implements RttSupplier { Arrays.fill(samples, Long.MAX_VALUE); } - @Override public OptionalLong getRttNanos() { return bestRttNanos == Long.MAX_VALUE ? OptionalLong.empty() : OptionalLong.of(bestRttNanos); } @@ -218,6 +253,10 @@ private RttMeasurementRateLimiter(Ticker clock) { this.clock = clock; } + /** + * The RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off + * more and more and more requests and eventually exhaust a threadpool. Permit is released in a future callback. + */ Optional tryAcquire() { if (lastMeasured + BETWEEN_SAMPLES > clock.read()) { return Optional.empty(); @@ -237,8 +276,4 @@ private interface RttMeasurementPermit extends Closeable { @Override void close(); } - - interface RttSupplier { - OptionalLong getRttNanos(); - } } From f62429d8e94d7d6497d9471f48d7a7b8f24cc3b1 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Fri, 29 May 2020 17:31:00 +0100 Subject: [PATCH 33/35] Appease errorprone --- .../core/BalancedNodeSelectionStrategyChannel.java | 5 ++--- .../com/palantir/dialogue/core/RttSampler.java | 3 ++- .../BalancedNodeSelectionStrategyChannelTest.java | 14 +++++++------- .../java/com/palantir/dialogue/OkHttpChannel.java | 5 ----- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index f0be496ef..b5dd0242a 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -41,7 +41,6 @@ import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; -import java.util.stream.Stream; import javax.annotation.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -154,8 +153,8 @@ private static List shuffleImmutableList(ImmutableList sourceList, Ran } @VisibleForTesting - Stream getScoresForTesting() { - return Arrays.stream(computeScores(rttSampler, channels)); + IntStream getScoresForTesting() { + return Arrays.stream(computeScores(rttSampler, channels)).mapToInt(SortableChannel::getScore); } @Override diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java index 713f15797..28aabd5e6 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java @@ -63,7 +63,7 @@ final class RttSampler { this.rateLimiter = new RttMeasurementRateLimiter(clock); this.clock = clock; this.rtts = IntStream.range(0, channels.size()) - .mapToObj(i -> new RttMeasurement()) + .mapToObj(_i -> new RttMeasurement()) .toArray(RttMeasurement[]::new); } @@ -247,6 +247,7 @@ private static final class RttMeasurementRateLimiter { private final AtomicBoolean currentlySampling = new AtomicBoolean(false); private volatile long lastMeasured = 0; + @SuppressWarnings("UnnecessaryLambda") // just let me avoid allocations private final RttMeasurementPermit finishedSampling = () -> currentlySampling.set(false); private RttMeasurementRateLimiter(Ticker clock) { diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java index c95d68ec6..e645978cc 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java @@ -126,7 +126,7 @@ void a_single_4xx_doesnt_move_the_needle() { clock.read() < start + Duration.ofSeconds(10).toNanos(); incrementClockBy(Duration.ofMillis(50))) { channel.maybeExecute(endpoint, request); - assertThat(channel.getScoresForTesting().map(c -> c.getScore())) + assertThat(channel.getScoresForTesting()) .describedAs("A single 400 at the beginning isn't enough to impact scores", channel) .containsExactly(0, 0); } @@ -142,19 +142,19 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu for (int i = 0; i < 11; i++) { rttChannel.maybeExecute(endpoint, request); - assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())) + assertThat(rttChannel.getScoresForTesting()) .describedAs("%s %s: Scores not affected yet %s", i, Duration.ofNanos(clock.read()), rttChannel) .containsExactly(0, 0); incrementClockBy(Duration.ofMillis(50)); } rttChannel.maybeExecute(endpoint, request); - assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())) + assertThat(rttChannel.getScoresForTesting()) .describedAs("%s: Constant 4xxs did move the needle %s", Duration.ofNanos(clock.read()), rttChannel) .containsExactly(1, 0); incrementClockBy(Duration.ofSeconds(5)); - assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())) + assertThat(rttChannel.getScoresForTesting()) .describedAs( "%s: We quickly forget about 4xxs and go back to fair shuffling %s", Duration.ofNanos(clock.read()), rttChannel) @@ -182,7 +182,7 @@ void rtt_is_measured_and_can_influence_choices() { incrementClockBy(Duration.ofNanos(456)); chan2OptionsResponse.set(new TestResponse().code(200)); - assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())) + assertThat(rttChannel.getScoresForTesting()) .describedAs("The poor latency of channel2 imposes a small constant penalty in the score") .containsExactly(0, 3); @@ -208,7 +208,7 @@ void when_rtt_measurements_are_limited_dont_freak_out() { rttChannel.maybeExecute(endpoint, request); - assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0); + assertThat(channel.getScoresForTesting()).containsExactly(0, 0); } @Test @@ -226,7 +226,7 @@ void when_rtt_measurements_havent_returned_yet_consider_both_far_away() { rttChannel.maybeExecute(endpoint, request); } - assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0); + assertThat(rttChannel.getScoresForTesting()).containsExactly(0, 0); verify(chan1, times(1)).maybeExecute(eq(rttEndpoint), any()); verify(chan2, times(1)).maybeExecute(eq(rttEndpoint), any()); } diff --git a/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java b/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java index eb43faa3f..593466dce 100644 --- a/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java +++ b/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java @@ -98,11 +98,6 @@ public ListenableFuture execute(Endpoint endpoint, Request request) { case PATCH: okRequest = okRequest.patch(toOkHttpBody(request.body())); break; - case OPTIONS: - okRequest = okRequest.method( - "OPTIONS", - request.body().isPresent() ? toOkHttpBody(request.body().get()) : null); - break; } // Fill headers From d873cd1f9471cfe925fe1d455b19a47c433aa9c2 Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Fri, 29 May 2020 17:35:00 +0100 Subject: [PATCH 34/35] Minimise diff --- .../BalancedNodeSelectionStrategyChannel.java | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java index b5dd0242a..d75f9a15c 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java @@ -222,12 +222,16 @@ public Optional> maybeExecute(Endpoint endpoint, Requ } SortableChannel computeScore(float rttSpectrum) { + int requestsInflight = inflight.get(); + double failureReservoir = recentFailuresReservoir.get(); + // it's important that scores are integers because if we kept the full double precision, then a single 4xx // would end up influencing host selection long beyond its intended lifespan in the absence of other data. - int score = inflight.get() - + Ints.saturatedCast(Math.round(recentFailuresReservoir.get())) + int score = requestsInflight + + Ints.saturatedCast(Math.round(failureReservoir)) + Ints.saturatedCast(Math.round(rttSpectrum * RTT_WEIGHT)); + observability.debugLogComputedScore(requestsInflight, failureReservoir, rttSpectrum, score); return new SortableChannel(score, this); } @@ -245,14 +249,13 @@ public String toString() { * A dedicated value class ensures safe sorting, as otherwise there's a risk that the inflight AtomicInteger * might change mid-sort, leading to undefined behaviour. */ - @VisibleForTesting - static final class SortableChannel { + private static final class SortableChannel { private final int score; private final MutableChannelWithStats delegate; SortableChannel(int score, MutableChannelWithStats delegate) { - this.delegate = delegate; this.score = score; + this.delegate = delegate; } int getScore() { @@ -332,6 +335,19 @@ void debugLogStatusFailure(Response response) { } } + void debugLogComputedScore(int inflight, double failures, float rttSpectrum, int score) { + if (log.isDebugEnabled()) { + log.debug( + "Computed score", + channelName, + hostIndex, + SafeArg.of("score", score), + SafeArg.of("inflight", inflight), + SafeArg.of("failures", failures), + SafeArg.of("rttSpectrum", rttSpectrum)); + } + } + static PerHostObservability create( ImmutableList channels, TaggedMetricRegistry taggedMetrics, From f6f69171fb829bab834abd73dfb4c92a06c6b15d Mon Sep 17 00:00:00 2001 From: Dan Fox Date: Mon, 1 Jun 2020 13:18:16 +0100 Subject: [PATCH 35/35] Remember to close the response! --- .../src/main/java/com/palantir/dialogue/core/RttSampler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java index 28aabd5e6..237c985b2 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java @@ -134,9 +134,10 @@ void maybeSampleRtts() { .maybeExecute(RttEndpoint.INSTANCE, rttRequest) .map(future -> Futures.transform( future, - _response -> { + response -> { long durationNanos = clock.read() - before; rtts[i].addMeasurement(durationNanos); + response.close(); return durationNanos; }, MoreExecutors.directExecutor()))