From 5c39335cb86e4f9e18183d28999b619cdba2f138 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Wed, 27 May 2020 00:47:43 +0100
Subject: [PATCH 01/35] Balanced measures RTT using OPTIONS requests

---
 .../BalancedNodeSelectionStrategyChannel.java | 78 +++++++++++++++++--
 .../com/palantir/dialogue/HttpMethod.java     |  1 +
 2 files changed, 71 insertions(+), 8 deletions(-)
diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index a0ed1c400..8fec8a82f 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -24,8 +24,10 @@
 import com.google.common.util.concurrent.FutureCallback;
 import com.google.common.util.concurrent.ListenableFuture;
 import com.palantir.dialogue.Endpoint;
+import com.palantir.dialogue.HttpMethod;
 import com.palantir.dialogue.Request;
 import com.palantir.dialogue.Response;
+import com.palantir.dialogue.UrlBuilder;
 import com.palantir.logsafe.Preconditions;
 import com.palantir.logsafe.SafeArg;
 import com.palantir.logsafe.UnsafeArg;
@@ -37,6 +39,7 @@
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
+import java.util.Map;
 import java.util.Optional;
 import java.util.Random;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -55,8 +58,10 @@
  */
 final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
     private static final Logger log = LoggerFactory.getLogger(BalancedNodeSelectionStrategyChannel.class);
+    private static final Request BLANK_REQUEST = Request.builder().build();
 
-    private static final Comparator<SortableChannel> BY_SCORE = Comparator.comparingInt(SortableChannel::getScore);
+    private static final Comparator<SortableChannel> BY_SCORE =
+            Comparator.comparingInt(SortableChannel::getScore).thenComparingLong(SortableChannel::getRtt);
     private static final Duration FAILURE_MEMORY = Duration.ofSeconds(30);
     private static final double FAILURE_WEIGHT = 10;
 
@@ -133,9 +138,12 @@ public String toString() {
     private static final class MutableChannelWithStats implements LimitedChannel {
         private final LimitedChannel delegate;
         private final FutureCallback<Response> updateStats;
+        private final Ticker clock;
         private final PerHostObservability observability;
 
         private final AtomicInteger inflight = new AtomicInteger(0);
+        private volatile long rttNanos = 0;
+        // private volatile long nextRttMeasurement = 0;
 
         /**
          * We keep track of failures within a time window to do well in scenarios where an unhealthy server returns
@@ -147,6 +155,7 @@ private static final class MutableChannelWithStats implements LimitedChannel {
         MutableChannelWithStats(LimitedChannel delegate, Ticker clock, PerHostObservability observability) {
             this.delegate = delegate;
             this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY);
+            this.clock = clock;
             this.observability = observability;
             // Saves one allocation on each network call
             this.updateStats = new FutureCallback<Response>() {
@@ -180,15 +189,35 @@ public void onFailure(Throwable throwable) {
         public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Request request) {
             inflight.incrementAndGet();
             Optional<ListenableFuture<Response>> maybe = delegate.maybeExecute(endpoint, request);
-            if (maybe.isPresent()) {
-                DialogueFutures.addDirectCallback(maybe.get(), updateStats);
-                observability.markRequestMade();
-            } else {
+            if (!maybe.isPresent()) {
                 inflight.decrementAndGet(); // quickly undo
+                return Optional.empty();
             }
+
+            DialogueFutures.addDirectCallback(maybe.get(), updateStats);
+            observability.markRequestMade();
+            maybeSampleRtt();
             return maybe;
         }
 
+        private void maybeSampleRtt() {
+            long initialNanos = clock.read();
+            // if (initialNanos < nextRttMeasurement) {
+            //     return;
+            // }
+
+            Optional<ListenableFuture<Response>> maybe = delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST);
+            if (maybe.isPresent()) {
+                // nextRttMeasurement = initialNanos + TimeUnit.SECONDS.toNanos(1);
+                DialogueFutures.addDirectListener(maybe.get(), () -> {
+                    rttNanos = clock.read() - initialNanos;
+                    if (log.isInfoEnabled()) {
+                        log.info("rttNanos {}", SafeArg.of("rtt", rttNanos), UnsafeArg.of("channel", delegate));
+                    }
+                });
+            }
+        }
+
         SortableChannel computeScore() {
             int requestsInflight = inflight.get();
             double failureReservoir = recentFailuresReservoir.get();
@@ -198,7 +227,7 @@ SortableChannel computeScore() {
             int score = requestsInflight + Ints.saturatedCast(Math.round(failureReservoir));
 
             observability.debugLogComputedScore(requestsInflight, failureReservoir, score);
-            return new SortableChannel(score, this);
+            return new SortableChannel(score, rttNanos, this);
         }
 
         @Override
@@ -217,10 +246,12 @@ public String toString() {
      */
     private static final class SortableChannel {
         private final int score;
+        private final long rtt;
         private final MutableChannelWithStats delegate;
 
-        SortableChannel(int score, MutableChannelWithStats delegate) {
+        SortableChannel(int score, long rtt, MutableChannelWithStats delegate) {
             this.score = score;
+            this.rtt = rtt;
             this.delegate = delegate;
         }
 
@@ -228,9 +259,13 @@ int getScore() {
             return score;
         }
 
+        public long getRtt() {
+            return rtt;
+        }
+
         @Override
         public String toString() {
-            return "SortableChannel{score=" + score + ", delegate=" + delegate + '}';
+            return "SortableChannel{" + "score=" + score + ", rtt=" + rtt + ", delegate=" + delegate + '}';
         }
     }
 
@@ -342,4 +377,31 @@ public void markRequestMade() {
             };
         }
     }
+
+    private enum RttEndpoint implements Endpoint {
+        INSTANCE;
+
+        @Override
+        public void renderPath(Map<String, String> params, UrlBuilder url) {}
+
+        @Override
+        public HttpMethod httpMethod() {
+            return HttpMethod.OPTIONS;
+        }
+
+        @Override
+        public String serviceName() {
+            return "Balanced";
+        }
+
+        @Override
+        public String endpointName() {
+            return "rtt";
+        }
+
+        @Override
+        public String version() {
+            return "0.0.0";
+        }
+    }
 }
diff --git a/dialogue-target/src/main/java/com/palantir/dialogue/HttpMethod.java b/dialogue-target/src/main/java/com/palantir/dialogue/HttpMethod.java
index 13ba55512..61a608a7e 100644
--- a/dialogue-target/src/main/java/com/palantir/dialogue/HttpMethod.java
+++ b/dialogue-target/src/main/java/com/palantir/dialogue/HttpMethod.java
@@ -24,4 +24,5 @@ public enum HttpMethod {
     PATCH,
     POST,
     PUT,
+    OPTIONS
 }

From bc5b31d412b56e577b2ab1186e76b3d63a57a049 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Wed, 27 May 2020 00:50:09 +0100
Subject: [PATCH 02/35] Compile

---
 .../dialogue/core/BalancedNodeSelectionStrategyChannel.java  | 2 +-
 .../java/com/palantir/dialogue/core/RetryingChannel.java     | 1 +
 .../src/main/java/com/palantir/dialogue/OkHttpChannel.java   | 5 +++++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 8fec8a82f..231ac9529 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -382,7 +382,7 @@ private enum RttEndpoint implements Endpoint {
         INSTANCE;
 
         @Override
-        public void renderPath(Map<String, String> params, UrlBuilder url) {}
+        public void renderPath(Map<String, String> _params, UrlBuilder _url) {}
 
         @Override
         public HttpMethod httpMethod() {
diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RetryingChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RetryingChannel.java
index 2d1b7a5fb..167b8f895 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RetryingChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RetryingChannel.java
@@ -381,6 +381,7 @@ private static boolean safeToRetry(HttpMethod httpMethod) {
                 // in theory PUT and DELETE should be fine to retry too, we're just being conservative for now.
             case POST:
             case PATCH:
+            case OPTIONS:
                 return false;
         }
 
diff --git a/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java b/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java
index a04f26ba9..011cddfbe 100644
--- a/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java
+++ b/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java
@@ -93,6 +93,11 @@ public ListenableFuture<Response> execute(Endpoint endpoint, Request request) {
             case PATCH:
                 okRequest = okRequest.patch(toOkHttpBody(request.body()));
                 break;
+            case OPTIONS:
+                okRequest = okRequest.method(
+                        "OPTIONS",
+                        request.body().isPresent() ? toOkHttpBody(request.body().get()) : null);
+                break;
         }
 
         // Fill headers

From 109b05d0e41e11d6addb64f67d7cb523873ec735 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Wed, 27 May 2020 00:56:55 +0100
Subject: [PATCH 03/35] Handl OPTIONS

---
 .../java/com/palantir/dialogue/core/SimulationServer.java    | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/simulation/src/main/java/com/palantir/dialogue/core/SimulationServer.java b/simulation/src/main/java/com/palantir/dialogue/core/SimulationServer.java
index 46df6fed3..c8bd29aea 100644
--- a/simulation/src/main/java/com/palantir/dialogue/core/SimulationServer.java
+++ b/simulation/src/main/java/com/palantir/dialogue/core/SimulationServer.java
@@ -25,6 +25,7 @@
 import com.google.common.util.concurrent.MoreExecutors;
 import com.palantir.dialogue.Channel;
 import com.palantir.dialogue.Endpoint;
+import com.palantir.dialogue.HttpMethod;
 import com.palantir.dialogue.Request;
 import com.palantir.dialogue.Response;
 import com.palantir.dialogue.TestResponse;
@@ -74,6 +75,10 @@ public static Builder builder() {
 
     @Override
     public ListenableFuture<Response> execute(Endpoint endpoint, Request request) {
+        if (endpoint.httpMethod() == HttpMethod.OPTIONS) {
+            return Futures.immediateFuture(new TestResponse().code(204));
+        }
+
         Meter perEndpointRequests = MetricNames.requestMeter(simulation.taggedMetrics(), serverName, endpoint);
 
         activeRequests.inc();

From 466154980e74106b158b83c8fe70054da6f120f8 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Wed, 27 May 2020 00:58:17 +0100
Subject: [PATCH 04/35] Update simulations

---
 ..._nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ..._reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ..._big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...ach_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 simulation/src/test/resources/report.md            | 14 +++++++-------
 ...ate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ..._nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ..._reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ..._big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...ach_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...ate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 15 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 870cd83ac..3879dff8e 100644
--- a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5acd44d3f56e2a84023c8cda0f6f1da0e0848dd9c3b2b7bd0df936f6166cd27b
-size 113221
+oid sha256:fa2908b6dde2dff46cd78175ea4fbf3a87b66baa0e29a495b3b6fe0daf987ef9
+size 116789
diff --git a/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 4f58c5438..94064ea2d 100644
--- a/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:301c818a4e266379a6c45dee81d725d1dad6f3c8d2abba49d38a1df8e68ac7ee
-size 103151
+oid sha256:7252511a753523b3869eea5007f48298746fab76bf097cc512767c9bb1699fc6
+size 103509
diff --git a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 24e0252de..1c2789624 100644
--- a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e925ad02dd69d9f2a5ef8b47ad43aabd5a14106ce0937a73e9780a9a11f3c435
-size 123045
+oid sha256:2aa3973eac6964c1b10c22e034cb052e97dc6e22112d1e1902619a412b9a4fb3
+size 123784
diff --git a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index a7e4baf1c..e8365be98 100644
--- a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2863197850e4d4573230ae2b4420e31baadf10421546cba011a3d271ca63834c
-size 78310
+oid sha256:26ff458ff960fbfc6f8d3801799b295a0bd382400583c561c64ab170ab922f8d
+size 82170
diff --git a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index aa3d69e55..b4a658b44 100644
--- a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d492690bce9b84925c16675a75ff55af3cfb180b8b0f753c1fd036f317c2801c
-size 137045
+oid sha256:ab06d72114810dbf8530e882ca71c3384cc9b3ff13a198b5ef8f9d8647c38b54
+size 135051
diff --git a/simulation/src/test/resources/report.md b/simulation/src/test/resources/report.md
index 892475d24..5b5f46f6a 100644
--- a/simulation/src/test/resources/report.md
+++ b/simulation/src/test/resources/report.md
@@ -2,10 +2,10 @@
 <!-- Run SimulationTest to regenerate this report. -->
 ```
                 all_nodes_500[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=79.3%	client_mean=PT5.81342S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1586, 500=414}
-                    all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=73.7%	client_mean=PT3.039455S    	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1474, 500=526}
+                    all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=73.3%	client_mean=PT2.61872S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1465, 500=535}
                               all_nodes_500[UNLIMITED_ROUND_ROBIN].txt:	success=50.0%	client_mean=PT0.6S         	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1000, 500=1000}
                    black_hole[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=59.2%	client_mean=PT0.600655114S 	server_cpu=PT11M49.8S     	client_received=1183/2000	server_resps=1183	codes={200=1183}
-                       black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=91.5%	client_mean=PT0.6S         	server_cpu=PT18M17.4S     	client_received=1829/2000	server_resps=1829	codes={200=1829}
+                       black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=91.4%	client_mean=PT0.6S         	server_cpu=PT18M16.8S     	client_received=1828/2000	server_resps=1828	codes={200=1828}
                                  black_hole[UNLIMITED_ROUND_ROBIN].txt:	success=91.4%	client_mean=PT0.6S         	server_cpu=PT18M16.8S     	client_received=1828/2000	server_resps=1828	codes={200=1828}
              drastic_slowdown[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT2.947028083S 	server_cpu=PT41M8.862333314S	client_received=4000/4000	server_resps=4000	codes={200=4000}
                  drastic_slowdown[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.251969999S 	server_cpu=PT16M47.879999984S	client_received=4000/4000	server_resps=4000	codes={200=4000}
@@ -17,16 +17,16 @@
             fast_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.120107163S 	server_cpu=PT1H30M0.0000004S	client_received=45000/45000	server_resps=45040	codes={200=45000}
                       fast_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.120107163S 	server_cpu=PT1H30M0.0000004S	client_received=45000/45000	server_resps=45040	codes={200=45000}
                live_reloading[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=94.3%	client_mean=PT6.987708S    	server_cpu=PT1H56M8.59S   	client_received=2500/2500	server_resps=2500	codes={200=2357, 500=143}
-                   live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=84.0%	client_mean=PT4.6409416S   	server_cpu=PT1H53M13.43S  	client_received=2500/2500	server_resps=2500	codes={200=2101, 500=399}
+                   live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=83.8%	client_mean=PT4.53S        	server_cpu=PT1H53M23.21S  	client_received=2500/2500	server_resps=2500	codes={200=2096, 500=404}
                              live_reloading[UNLIMITED_ROUND_ROBIN].txt:	success=86.9%	client_mean=PT2.802124S    	server_cpu=PT1H56M45.31S  	client_received=2500/2500	server_resps=2500	codes={200=2173, 500=327}
                 one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT2.569766928S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
-                    one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT1.521339525S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
+                    one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT1.516129859S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
                               one_big_spike[UNLIMITED_ROUND_ROBIN].txt:	success=99.9%	client_mean=PT1.000523583S 	server_cpu=PT7M37.35S     	client_received=1000/1000	server_resps=3049	codes={200=999, 429=1}
 one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=64.9%	client_mean=PT8.1950896S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1623, 500=877}
- one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=65.4%	client_mean=PT4.0117872S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1636, 500=864}
+ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=67.1%	client_mean=PT2.6955136S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1677, 500=823}
            one_endpoint_dies_on_each_server[UNLIMITED_ROUND_ROBIN].txt:	success=65.1%	client_mean=PT0.6S         	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1628, 500=872}
       server_side_rate_limits[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT12M44.762591546S	server_cpu=PT9H55M40.4S   	client_received=150000/150000	server_resps=178702	codes={200=149989, 429=11}
-          server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.290099498S 	server_cpu=PT9H9M39.2S    	client_received=150000/150000	server_resps=164896	codes={200=150000}
+          server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.264165097S 	server_cpu=PT9H4M39S      	client_received=150000/150000	server_resps=163395	codes={200=150000}
                     server_side_rate_limits[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.217416858S 	server_cpu=PT8H45M29.2S   	client_received=150000/150000	server_resps=157646	codes={200=150000}
        simplest_possible_case[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT0.834469696S 	server_cpu=PT3H3M35S      	client_received=13200/13200	server_resps=13200	codes={200=13200}
            simplest_possible_case[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.785757575S 	server_cpu=PT2H52M52S     	client_received=13200/13200	server_resps=13200	codes={200=13200}
@@ -35,7 +35,7 @@ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	succe
             slow_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.088828705S 	server_cpu=PT4M22.364666657S	client_received=3000/3000	server_resps=3031	codes={200=3000}
                       slow_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.088828705S 	server_cpu=PT4M22.364666657S	client_received=3000/3000	server_resps=3031	codes={200=3000}
 slowdown_and_error_thresholds[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT2M28.388896928S	server_cpu=PT8H48M18.546665835S	client_received=10000/10000	server_resps=10899	codes={200=10000}
-    slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT3M24.308654421S	server_cpu=PT14H3M44.353333281S	client_received=10000/10000	server_resps=13383	codes={200=9999, 500=1}
+    slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT3M20.776800234S	server_cpu=PT14H10M8.413333255S	client_received=10000/10000	server_resps=13515	codes={200=9998, 500=2}
               slowdown_and_error_thresholds[UNLIMITED_ROUND_ROBIN].txt:	success=3.6%	client_mean=PT21.551691121S	server_cpu=PT54H45M57.899999949S	client_received=10000/10000	server_resps=49335	codes={200=360, 500=9640}
               uncommon_flakes[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=99.0%	client_mean=PT0.000001S    	server_cpu=PT0.01S        	client_received=10000/10000	server_resps=10000	codes={200=9900, 500=100}
                   uncommon_flakes[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=99.0%	client_mean=PT0.000001S    	server_cpu=PT0.01S        	client_received=10000/10000	server_resps=10000	codes={200=9900, 500=100}
diff --git a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 3c1b66b6a..e77c67847 100644
--- a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33cfa2b188b84d30e139b5a8ff39a646f65588b5cf7b5755a2a61993c9d38c8e
-size 606983
+oid sha256:8f0cb8ff527ff2327b12b0298c073ea40037f57fb08177d263d7a1c07a04bf96
+size 587465
diff --git a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index f5dcc152c..374579fc5 100644
--- a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:160c566ff4a67cdd8ecddbb91c2262d1310a4925728dadb36ff81bf9a9237875
-size 120294
+oid sha256:76e04d90b781eade0dc4a333c8d5451cbfb9441e7c241117515e89bbb7c446f9
+size 123285
diff --git a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index a0433ed68..a99adba40 100644
--- a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=73.7%	client_mean=PT3.039455S    	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1474, 500=526}
\ No newline at end of file
+success=73.3%	client_mean=PT2.61872S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1465, 500=535}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 5b70ecd5f..5c1da373e 100644
--- a/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=91.5%	client_mean=PT0.6S         	server_cpu=PT18M17.4S     	client_received=1829/2000	server_resps=1829	codes={200=1829}
\ No newline at end of file
+success=91.4%	client_mean=PT0.6S         	server_cpu=PT18M16.8S     	client_received=1828/2000	server_resps=1828	codes={200=1828}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 46b26cdea..be188c732 100644
--- a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=84.0%	client_mean=PT4.6409416S   	server_cpu=PT1H53M13.43S  	client_received=2500/2500	server_resps=2500	codes={200=2101, 500=399}
\ No newline at end of file
+success=83.8%	client_mean=PT4.53S        	server_cpu=PT1H53M23.21S  	client_received=2500/2500	server_resps=2500	codes={200=2096, 500=404}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index b71e29339..09b358e6b 100644
--- a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT1.521339525S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
\ No newline at end of file
+success=100.0%	client_mean=PT1.516129859S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 561f056b7..0f2287fc3 100644
--- a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=65.4%	client_mean=PT4.0117872S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1636, 500=864}
\ No newline at end of file
+success=67.1%	client_mean=PT2.6955136S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1677, 500=823}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index ffdfaca53..e71d95262 100644
--- a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT0.290099498S 	server_cpu=PT9H9M39.2S    	client_received=150000/150000	server_resps=164896	codes={200=150000}
\ No newline at end of file
+success=100.0%	client_mean=PT0.264165097S 	server_cpu=PT9H4M39S      	client_received=150000/150000	server_resps=163395	codes={200=150000}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index d250cf150..fb9afe184 100644
--- a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT3M24.308654421S	server_cpu=PT14H3M44.353333281S	client_received=10000/10000	server_resps=13383	codes={200=9999, 500=1}
\ No newline at end of file
+success=100.0%	client_mean=PT3M20.776800234S	server_cpu=PT14H10M8.413333255S	client_received=10000/10000	server_resps=13515	codes={200=9998, 500=2}
\ No newline at end of file

From 17c248e52bc076d89cbaeccfbc57e2b319576ea2 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Wed, 27 May 2020 01:13:20 +0100
Subject: [PATCH 05/35] Updatee tests for rtt sampling

---
 .../BalancedNodeSelectionStrategyChannelTest.java    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 9086c290c..1e985a68e 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -76,8 +76,8 @@ void when_one_channel_is_in_use_prefer_the_other() {
         for (int i = 0; i < 200; i++) {
             channel.maybeExecute(endpoint, request);
         }
-        verify(chan1, times(199)).maybeExecute(any(), any());
-        verify(chan2, times(1)).maybeExecute(any(), any());
+        verify(chan1, times(398)).maybeExecute(any(), any());
+        verify(chan2, times(2)).maybeExecute(any(), any());
     }
 
     @Test
@@ -88,8 +88,8 @@ void when_both_channels_are_free_we_get_roughly_fair_tiebreaking() {
         for (int i = 0; i < 200; i++) {
             channel.maybeExecute(endpoint, request);
         }
-        verify(chan1, times(99)).maybeExecute(any(), any());
-        verify(chan2, times(101)).maybeExecute(any(), any());
+        verify(chan1, times(198)).maybeExecute(any(), any());
+        verify(chan2, times(202)).maybeExecute(any(), any());
     }
 
     @Test
@@ -116,8 +116,8 @@ void a_single_4xx_doesnt_move_the_needle() {
                     .containsExactly(0, 0);
         }
 
-        verify(chan1, times(99)).maybeExecute(any(), any());
-        verify(chan2, times(101)).maybeExecute(any(), any());
+        verify(chan1, times(198)).maybeExecute(any(), any());
+        verify(chan2, times(202)).maybeExecute(any(), any());
     }
 
     @Test

From a5e7531b7098b9ed49cbd44a0f58f3904e282423 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Wed, 27 May 2020 16:26:22 +0100
Subject: [PATCH 06/35] Accumulate RTT average

---
 .../BalancedNodeSelectionStrategyChannel.java | 42 ++++++++++++++-----
 ...ancedNodeSelectionStrategyChannelTest.java | 13 ++++++
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 231ac9529..85507e06f 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -44,6 +44,7 @@
 import java.util.Random;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.IntStream;
+import javax.annotation.concurrent.ThreadSafe;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -142,8 +143,7 @@ private static final class MutableChannelWithStats implements LimitedChannel {
         private final PerHostObservability observability;
 
         private final AtomicInteger inflight = new AtomicInteger(0);
-        private volatile long rttNanos = 0;
-        // private volatile long nextRttMeasurement = 0;
+        private final RoundTripTimeMeasurement rtt = new RoundTripTimeMeasurement();
 
         /**
          * We keep track of failures within a time window to do well in scenarios where an unhealthy server returns
@@ -202,17 +202,14 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
 
         private void maybeSampleRtt() {
             long initialNanos = clock.read();
-            // if (initialNanos < nextRttMeasurement) {
-            //     return;
-            // }
 
             Optional<ListenableFuture<Response>> maybe = delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST);
             if (maybe.isPresent()) {
-                // nextRttMeasurement = initialNanos + TimeUnit.SECONDS.toNanos(1);
                 DialogueFutures.addDirectListener(maybe.get(), () -> {
-                    rttNanos = clock.read() - initialNanos;
+                    long rttNanos = clock.read() - initialNanos;
+                    rtt.update(rttNanos);
                     if (log.isInfoEnabled()) {
-                        log.info("rttNanos {}", SafeArg.of("rtt", rttNanos), UnsafeArg.of("channel", delegate));
+                        log.info("rttNanos {} {}", SafeArg.of("rtt", rttNanos), UnsafeArg.of("channel", delegate));
                     }
                 });
             }
@@ -227,7 +224,7 @@ SortableChannel computeScore() {
             int score = requestsInflight + Ints.saturatedCast(Math.round(failureReservoir));
 
             observability.debugLogComputedScore(requestsInflight, failureReservoir, score);
-            return new SortableChannel(score, rttNanos, this);
+            return new SortableChannel(score, rtt.getNanos(), this);
         }
 
         @Override
@@ -259,7 +256,7 @@ int getScore() {
             return score;
         }
 
-        public long getRtt() {
+        long getRtt() {
             return rtt;
         }
 
@@ -404,4 +401,29 @@ public String version() {
             return "0.0.0";
         }
     }
+
+    @VisibleForTesting
+    @ThreadSafe
+    static class RoundTripTimeMeasurement {
+
+        // TODO(dfox): switch to some exponentially decaying thingy?
+        // TODO(dfox): can we exclude outlier measurements that probably include TLS handshakes?
+        private volatile float rttNanos = 0;
+        private volatile long numSamples = 0;
+
+        long getNanos() {
+            return (long) rttNanos;
+        }
+
+        synchronized void update(long newMeasurement) {
+            float denominator = (float) numSamples / (numSamples + 1);
+            this.rttNanos = rttNanos * denominator + (float) newMeasurement / (numSamples + 1);
+            this.numSamples = numSamples + 1;
+        }
+
+        @Override
+        public String toString() {
+            return "RoundTripTimeMeasurement{" + "rttNanos=" + rttNanos + ", numSamples=" + numSamples + '}';
+        }
+    }
 }
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 1e985a68e..977ffe2dc 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -146,6 +146,19 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu
                 .containsExactly(0, 0);
     }
 
+    @Test
+    void rtt_accumulates_avg_nicely() {
+        BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement rtt =
+                new BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement();
+        rtt.update(1);
+        rtt.update(2);
+        rtt.update(3);
+        assertThat(rtt.getNanos()).isEqualTo(2);
+
+        rtt.update(500);
+        assertThat(rtt.getNanos()).isEqualTo(126);
+    }
+
     private static void set200(LimitedChannel chan) {
         when(chan.maybeExecute(any(), any())).thenReturn(http(200));
     }

From e398af0b7afe071ffb5f1d2a42dc2c5d55d4a510 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Wed, 27 May 2020 18:02:55 +0100
Subject: [PATCH 07/35] Sample all channels, with two sequential calls

---
 .../BalancedNodeSelectionStrategyChannel.java | 85 ++++++++++++++-----
 ...ancedNodeSelectionStrategyChannelTest.java |  8 +-
 2 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 85507e06f..b96705d94 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -22,7 +22,9 @@
 import com.google.common.collect.ImmutableList;
 import com.google.common.primitives.Ints;
 import com.google.common.util.concurrent.FutureCallback;
+import com.google.common.util.concurrent.Futures;
 import com.google.common.util.concurrent.ListenableFuture;
+import com.google.common.util.concurrent.MoreExecutors;
 import com.palantir.dialogue.Endpoint;
 import com.palantir.dialogue.HttpMethod;
 import com.palantir.dialogue.Request;
@@ -42,7 +44,9 @@
 import java.util.Map;
 import java.util.Optional;
 import java.util.Random;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import javax.annotation.concurrent.ThreadSafe;
 import org.slf4j.Logger;
@@ -103,6 +107,7 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         for (SortableChannel channel : sortedChannels) {
             Optional<ListenableFuture<Response>> maybe = channel.delegate.maybeExecute(endpoint, request);
             if (maybe.isPresent()) {
+                sampleRttsForAllChannels();
                 return maybe;
             }
         }
@@ -110,6 +115,66 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         return Optional.empty();
     }
 
+    private void sampleRttsForAllChannels() {
+        // TODO(dfox): don't do this 100% of the time
+        // if (random.nextDouble() > 0.01f) {
+        //     return;
+        // }
+
+        List<ListenableFuture<Long>> results = new ArrayList<>();
+        for (MutableChannelWithStats channel : channels) {
+            long attempt1Before = clock.read();
+
+            Optional<ListenableFuture<Response>> attempt1 =
+                    channel.delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST);
+            if (!attempt1.isPresent()) {
+                results.add(Futures.immediateFuture(Long.MAX_VALUE));
+                continue;
+            }
+
+            ListenableFuture<Long> attempt1Duration = Futures.transform(
+                    attempt1.get(), response1 -> clock.read() - attempt1Before, MoreExecutors.directExecutor());
+
+            results.add(Futures.transformAsync(
+                    attempt1Duration,
+                    attempt1Nanos -> {
+                        long attempt2Before = clock.read();
+                        Optional<ListenableFuture<Response>> attempt2 =
+                                channel.delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST);
+                        if (!attempt2.isPresent()) {
+                            return Futures.immediateFuture(Long.MAX_VALUE); // signifies we couldn't get two samples
+                        }
+
+                        return Futures.transform(
+                                attempt2.get(),
+                                response2 -> {
+                                    long attempt2Nanos = clock.read() - attempt2Before;
+                                    // taking min of two attempts to exclude samples that incurred a TLS handshake
+                                    long newMeasurement = Math.min(attempt1Nanos, attempt2Nanos);
+                                    channel.rtt.addMeasurement(newMeasurement);
+                                    return newMeasurement;
+                                },
+                                MoreExecutors.directExecutor());
+                    },
+                    MoreExecutors.directExecutor()));
+        }
+        if (log.isInfoEnabled()) {
+            DialogueFutures.addDirectCallback(Futures.allAsList(results), new FutureCallback<List<Long>>() {
+                @Override
+                public void onSuccess(List<Long> result) {
+                    List<Long> millis =
+                            result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList());
+                    log.info("RTTs {} {}", SafeArg.of("nanos", result), SafeArg.of("millis", millis));
+                }
+
+                @Override
+                public void onFailure(Throwable throwable) {
+                    log.info("Throwable", throwable);
+                }
+            });
+        }
+    }
+
     private static SortableChannel[] sortByScore(List<MutableChannelWithStats> preShuffled) {
         SortableChannel[] sorted = new SortableChannel[preShuffled.size()];
         for (int i = 0; i < preShuffled.size(); i++) {
@@ -139,7 +204,6 @@ public String toString() {
     private static final class MutableChannelWithStats implements LimitedChannel {
         private final LimitedChannel delegate;
         private final FutureCallback<Response> updateStats;
-        private final Ticker clock;
         private final PerHostObservability observability;
 
         private final AtomicInteger inflight = new AtomicInteger(0);
@@ -155,7 +219,6 @@ private static final class MutableChannelWithStats implements LimitedChannel {
         MutableChannelWithStats(LimitedChannel delegate, Ticker clock, PerHostObservability observability) {
             this.delegate = delegate;
             this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY);
-            this.clock = clock;
             this.observability = observability;
             // Saves one allocation on each network call
             this.updateStats = new FutureCallback<Response>() {
@@ -196,25 +259,9 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
 
             DialogueFutures.addDirectCallback(maybe.get(), updateStats);
             observability.markRequestMade();
-            maybeSampleRtt();
             return maybe;
         }
 
-        private void maybeSampleRtt() {
-            long initialNanos = clock.read();
-
-            Optional<ListenableFuture<Response>> maybe = delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST);
-            if (maybe.isPresent()) {
-                DialogueFutures.addDirectListener(maybe.get(), () -> {
-                    long rttNanos = clock.read() - initialNanos;
-                    rtt.update(rttNanos);
-                    if (log.isInfoEnabled()) {
-                        log.info("rttNanos {} {}", SafeArg.of("rtt", rttNanos), UnsafeArg.of("channel", delegate));
-                    }
-                });
-            }
-        }
-
         SortableChannel computeScore() {
             int requestsInflight = inflight.get();
             double failureReservoir = recentFailuresReservoir.get();
@@ -415,7 +462,7 @@ long getNanos() {
             return (long) rttNanos;
         }
 
-        synchronized void update(long newMeasurement) {
+        synchronized void addMeasurement(long newMeasurement) {
             float denominator = (float) numSamples / (numSamples + 1);
             this.rttNanos = rttNanos * denominator + (float) newMeasurement / (numSamples + 1);
             this.numSamples = numSamples + 1;
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 977ffe2dc..5fcc2c8ae 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -150,12 +150,12 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu
     void rtt_accumulates_avg_nicely() {
         BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement rtt =
                 new BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement();
-        rtt.update(1);
-        rtt.update(2);
-        rtt.update(3);
+        rtt.addMeasurement(1);
+        rtt.addMeasurement(2);
+        rtt.addMeasurement(3);
         assertThat(rtt.getNanos()).isEqualTo(2);
 
-        rtt.update(500);
+        rtt.addMeasurement(500);
         assertThat(rtt.getNanos()).isEqualTo(126);
     }
 

From 0701a64e4e20e3224aa74f1ebb2bf6baf7cc6b15 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Wed, 27 May 2020 18:35:31 +0100
Subject: [PATCH 08/35] Also log accumulated ones

---
 .../core/BalancedNodeSelectionStrategyChannel.java        | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index b96705d94..387994ede 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -164,7 +164,13 @@ private void sampleRttsForAllChannels() {
                 public void onSuccess(List<Long> result) {
                     List<Long> millis =
                             result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList());
-                    log.info("RTTs {} {}", SafeArg.of("nanos", result), SafeArg.of("millis", millis));
+                    List<Long> accumulated =
+                            channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList());
+                    log.info(
+                            "RTTs {} {}",
+                            SafeArg.of("nanos", result),
+                            SafeArg.of("millis", millis),
+                            SafeArg.of("accumulated", accumulated));
                 }
 
                 @Override

From bef11a4bfc9d6338cde8aa08d8bcb1f455e517e0 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Wed, 27 May 2020 23:36:44 +0100
Subject: [PATCH 09/35] Just store the min

---
 .../BalancedNodeSelectionStrategyChannel.java | 66 +++++++------------
 ...ancedNodeSelectionStrategyChannelTest.java | 13 ++--
 2 files changed, 30 insertions(+), 49 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 387994ede..65de96f17 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -46,6 +46,7 @@
 import java.util.Random;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import javax.annotation.concurrent.ThreadSafe;
@@ -123,40 +124,22 @@ private void sampleRttsForAllChannels() {
 
         List<ListenableFuture<Long>> results = new ArrayList<>();
         for (MutableChannelWithStats channel : channels) {
-            long attempt1Before = clock.read();
+            long before = clock.read();
 
-            Optional<ListenableFuture<Response>> attempt1 =
+            Optional<ListenableFuture<Response>> future =
                     channel.delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST);
-            if (!attempt1.isPresent()) {
+            if (future.isPresent()) {
+                results.add(Futures.transform(
+                        future.get(),
+                        _response -> {
+                            long durationNanos = clock.read() - before;
+                            channel.rtt.addMeasurement(durationNanos);
+                            return durationNanos;
+                        },
+                        MoreExecutors.directExecutor()));
+            } else {
                 results.add(Futures.immediateFuture(Long.MAX_VALUE));
-                continue;
             }
-
-            ListenableFuture<Long> attempt1Duration = Futures.transform(
-                    attempt1.get(), response1 -> clock.read() - attempt1Before, MoreExecutors.directExecutor());
-
-            results.add(Futures.transformAsync(
-                    attempt1Duration,
-                    attempt1Nanos -> {
-                        long attempt2Before = clock.read();
-                        Optional<ListenableFuture<Response>> attempt2 =
-                                channel.delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST);
-                        if (!attempt2.isPresent()) {
-                            return Futures.immediateFuture(Long.MAX_VALUE); // signifies we couldn't get two samples
-                        }
-
-                        return Futures.transform(
-                                attempt2.get(),
-                                response2 -> {
-                                    long attempt2Nanos = clock.read() - attempt2Before;
-                                    // taking min of two attempts to exclude samples that incurred a TLS handshake
-                                    long newMeasurement = Math.min(attempt1Nanos, attempt2Nanos);
-                                    channel.rtt.addMeasurement(newMeasurement);
-                                    return newMeasurement;
-                                },
-                                MoreExecutors.directExecutor());
-                    },
-                    MoreExecutors.directExecutor()));
         }
         if (log.isInfoEnabled()) {
             DialogueFutures.addDirectCallback(Futures.allAsList(results), new FutureCallback<List<Long>>() {
@@ -164,18 +147,18 @@ private void sampleRttsForAllChannels() {
                 public void onSuccess(List<Long> result) {
                     List<Long> millis =
                             result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList());
-                    List<Long> accumulated =
+                    List<Long> best =
                             channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList());
                     log.info(
-                            "RTTs {} {}",
+                            "RTTs {} {} {}",
                             SafeArg.of("nanos", result),
                             SafeArg.of("millis", millis),
-                            SafeArg.of("accumulated", accumulated));
+                            SafeArg.of("best", best));
                 }
 
                 @Override
                 public void onFailure(Throwable throwable) {
-                    log.info("Throwable", throwable);
+                    log.info("Failed to sample RTT for channels", throwable);
                 }
             });
         }
@@ -460,23 +443,20 @@ public String version() {
     static class RoundTripTimeMeasurement {
 
         // TODO(dfox): switch to some exponentially decaying thingy?
-        // TODO(dfox): can we exclude outlier measurements that probably include TLS handshakes?
-        private volatile float rttNanos = 0;
-        private volatile long numSamples = 0;
+        private final AtomicLong rttNanos = new AtomicLong(Long.MAX_VALUE);
 
         long getNanos() {
-            return (long) rttNanos;
+            return rttNanos.get();
         }
 
-        synchronized void addMeasurement(long newMeasurement) {
-            float denominator = (float) numSamples / (numSamples + 1);
-            this.rttNanos = rttNanos * denominator + (float) newMeasurement / (numSamples + 1);
-            this.numSamples = numSamples + 1;
+        void addMeasurement(long newMeasurement) {
+            // Only stores the *minimum* values, so that we exclude slow calls that might include TLS handshakes.
+            rttNanos.getAndUpdate(existing -> Math.min(existing, newMeasurement));
         }
 
         @Override
         public String toString() {
-            return "RoundTripTimeMeasurement{" + "rttNanos=" + rttNanos + ", numSamples=" + numSamples + '}';
+            return "RoundTripTimeMeasurement{" + "rttNanos=" + rttNanos + '}';
         }
     }
 }
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 5fcc2c8ae..f54b58f60 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -76,8 +76,8 @@ void when_one_channel_is_in_use_prefer_the_other() {
         for (int i = 0; i < 200; i++) {
             channel.maybeExecute(endpoint, request);
         }
-        verify(chan1, times(398)).maybeExecute(any(), any());
-        verify(chan2, times(2)).maybeExecute(any(), any());
+        verify(chan1, times(399)).maybeExecute(any(), any());
+        verify(chan2, times(201)).maybeExecute(any(), any());
     }
 
     @Test
@@ -147,16 +147,17 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu
     }
 
     @Test
-    void rtt_accumulates_avg_nicely() {
+    void rtt_just_remembers_the_min() {
         BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement rtt =
                 new BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement();
+        rtt.addMeasurement(3);
+        assertThat(rtt.getNanos()).isEqualTo(3);
         rtt.addMeasurement(1);
         rtt.addMeasurement(2);
-        rtt.addMeasurement(3);
-        assertThat(rtt.getNanos()).isEqualTo(2);
+        assertThat(rtt.getNanos()).isEqualTo(1);
 
         rtt.addMeasurement(500);
-        assertThat(rtt.getNanos()).isEqualTo(126);
+        assertThat(rtt.getNanos()).isEqualTo(1);
     }
 
     private static void set200(LimitedChannel chan) {

From 2284f62794ef18047857b61ceccedaf89aa14792 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 00:49:43 +0100
Subject: [PATCH 10/35] Compute score based on the range of observed rtts

---
 .../BalancedNodeSelectionStrategyChannel.java | 121 ++++++++++--------
 ...ancedNodeSelectionStrategyChannelTest.java |  17 +--
 2 files changed, 78 insertions(+), 60 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 65de96f17..4ba468fd9 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -65,12 +65,21 @@
 final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
     private static final Logger log = LoggerFactory.getLogger(BalancedNodeSelectionStrategyChannel.class);
     private static final Request BLANK_REQUEST = Request.builder().build();
+    private static final Comparator<ChannelStats> BY_SCORE = Comparator.comparingInt(ChannelStats::getScore);
 
-    private static final Comparator<SortableChannel> BY_SCORE =
-            Comparator.comparingInt(SortableChannel::getScore).thenComparingLong(SortableChannel::getRtt);
     private static final Duration FAILURE_MEMORY = Duration.ofSeconds(30);
     private static final double FAILURE_WEIGHT = 10;
 
+    /**
+     * This determines how 'sticky' we are to our 'nearest' node (as measured by RTT). If this is set too high, then we
+     * may deliver suboptimal perf by sending all requests to a slow node that is physically nearby, when there's
+     * actually a faster one further away.
+     * If this is too low, then we may prematurely spill across AZs and pay the $ cost. Should probably
+     * keep this lower than {@link #FAILURE_WEIGHT} to ensure that a single 5xx makes a nearby node less attractive
+     * than a faraway node that exhibited zero failures.
+     */
+    private static final double RTT_WEIGHT = 3;
+
     private final ImmutableList<MutableChannelWithStats> channels;
     private final Random random;
     private final Ticker clock;
@@ -103,9 +112,9 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
 
         // TODO(dfox): P2C optimization when we have high number of nodes to save CPU?
         // http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf
-        SortableChannel[] sortedChannels = sortByScore(preShuffled);
+        ChannelStats[] sortedChannels = sortByScore(preShuffled);
 
-        for (SortableChannel channel : sortedChannels) {
+        for (ChannelStats channel : sortedChannels) {
             Optional<ListenableFuture<Response>> maybe = channel.delegate.maybeExecute(endpoint, request);
             if (maybe.isPresent()) {
                 sampleRttsForAllChannels();
@@ -141,7 +150,7 @@ private void sampleRttsForAllChannels() {
                 results.add(Futures.immediateFuture(Long.MAX_VALUE));
             }
         }
-        if (log.isInfoEnabled()) {
+        if (log.isDebugEnabled()) {
             DialogueFutures.addDirectCallback(Futures.allAsList(results), new FutureCallback<List<Long>>() {
                 @Override
                 public void onSuccess(List<Long> result) {
@@ -149,7 +158,7 @@ public void onSuccess(List<Long> result) {
                             result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList());
                     List<Long> best =
                             channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList());
-                    log.info(
+                    log.debug(
                             "RTTs {} {} {}",
                             SafeArg.of("nanos", result),
                             SafeArg.of("millis", millis),
@@ -164,13 +173,36 @@ public void onFailure(Throwable throwable) {
         }
     }
 
-    private static SortableChannel[] sortByScore(List<MutableChannelWithStats> preShuffled) {
-        SortableChannel[] sorted = new SortableChannel[preShuffled.size()];
+    private static ChannelStats[] sortByScore(List<MutableChannelWithStats> preShuffled) {
+        ChannelStats[] snapshotArray = new ChannelStats[preShuffled.size()];
+
+        long bestRttNanos = Long.MAX_VALUE;
+        long worstRttNanos = 0;
         for (int i = 0; i < preShuffled.size(); i++) {
-            sorted[i] = preShuffled.get(i).computeScore();
+            ChannelStats snapshot = preShuffled.get(i).getSnapshot();
+            snapshotArray[i] = snapshot;
+
+            if (snapshot.rttNanos != Long.MAX_VALUE) {
+                bestRttNanos = Math.min(bestRttNanos, snapshot.rttNanos);
+                worstRttNanos = Math.max(worstRttNanos, snapshot.rttNanos);
+            }
+        }
+
+        // Latency (rtt) is measured in nanos, which is a tricky unit to include in our 'score' because adding it
+        // would dominate all the other data (which has the unit of 'num requests'). To avoid the need for a conversion
+        // fudge-factor, we instead figure out where each rtt lies on the spectrum from bestRttNanos to worstRttNanos,
+        // with 0 being best and 1 being worst. This ensures that if several nodes are all within the same AZ and
+        // can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have a similar rttScore (near zero).
+        // Note, this can only be computed when we have all the snapshots in front of us.
+        if (bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0) {
+            long rttRange = worstRttNanos - bestRttNanos;
+            for (ChannelStats snapshot : snapshotArray) {
+                snapshot.rttSpectrum = ((float) snapshot.rttNanos - bestRttNanos) / rttRange;
+            }
         }
-        Arrays.sort(sorted, BY_SCORE);
-        return sorted;
+
+        Arrays.sort(snapshotArray, BY_SCORE);
+        return snapshotArray;
     }
 
     /** Returns a new shuffled list, without mutating the input list (which may be immutable). */
@@ -182,7 +214,7 @@ private static <T> List<T> shuffleImmutableList(ImmutableList<T> sourceList, Ran
 
     @VisibleForTesting
     IntStream getScores() {
-        return channels.stream().mapToInt(c -> c.computeScore().score);
+        return channels.stream().mapToInt(c -> c.getSnapshot().getScore());
     }
 
     @Override
@@ -251,22 +283,14 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
             return maybe;
         }
 
-        SortableChannel computeScore() {
-            int requestsInflight = inflight.get();
-            double failureReservoir = recentFailuresReservoir.get();
-
-            // it's important that scores are integers because if we kept the full double precision, then a single 4xx
-            // would end up influencing host selection long beyond its intended lifespan in the absence of other data.
-            int score = requestsInflight + Ints.saturatedCast(Math.round(failureReservoir));
-
-            observability.debugLogComputedScore(requestsInflight, failureReservoir, score);
-            return new SortableChannel(score, rtt.getNanos(), this);
+        ChannelStats getSnapshot() {
+            return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getNanos());
         }
 
         @Override
         public String toString() {
-            return "MutableChannelWithStats{score=" + computeScore().score
-                    + ", inflight=" + inflight
+            return "MutableChannelWithStats{"
+                    + "inflight=" + inflight
                     + ", recentFailures=" + recentFailuresReservoir
                     + ", delegate=" + delegate
                     + '}';
@@ -274,31 +298,35 @@ public String toString() {
     }
 
     /**
-     * A dedicated immutable class ensures safe sorting, as otherwise there's a risk that the inflight AtomicInteger
+     * A dedicated value class ensures safe sorting, as otherwise there's a risk that the inflight AtomicInteger
      * might change mid-sort, leading to undefined behaviour.
      */
-    private static final class SortableChannel {
-        private final int score;
-        private final long rtt;
+    private static final class ChannelStats {
         private final MutableChannelWithStats delegate;
+        private final int requestsInflight;
+        private final double recentBadness;
 
-        SortableChannel(int score, long rtt, MutableChannelWithStats delegate) {
-            this.score = score;
-            this.rtt = rtt;
+        private final long rttNanos;
+        private float rttSpectrum = 0; // assigned later once we've figured out the range of rttNanos across channels
+
+        ChannelStats(MutableChannelWithStats delegate, int requestsInflight, double recentBadness, long rttNanos) {
             this.delegate = delegate;
+            this.requestsInflight = requestsInflight;
+            this.recentBadness = recentBadness;
+            this.rttNanos = rttNanos;
         }
 
         int getScore() {
-            return score;
-        }
-
-        long getRtt() {
-            return rtt;
+            // it's important that scores are integers because if we kept the full double precision, then a single 4xx
+            // would end up influencing host selection long beyond its intended lifespan in the absence of other data.
+            return requestsInflight
+                    + Ints.saturatedCast(Math.round(recentBadness))
+                    + Ints.saturatedCast(Math.round(rttSpectrum * RTT_WEIGHT));
         }
 
         @Override
         public String toString() {
-            return "SortableChannel{" + "score=" + score + ", rtt=" + rtt + ", delegate=" + delegate + '}';
+            return "SortableChannel{" + getScore() + '}';
         }
     }
 
@@ -320,7 +348,7 @@ private static void registerGauges(
             DialogueInternalWeakReducingGauge.getOrCreate(
                     taggedMetrics,
                     metricName,
-                    c -> c.computeScore().getScore(),
+                    c -> c.getSnapshot().getScore(),
                     longStream -> {
                         long[] longs = longStream.toArray();
                         if (log.isInfoEnabled() && longs.length > 1) {
@@ -369,18 +397,6 @@ void debugLogStatusFailure(Response response) {
             }
         }
 
-        void debugLogComputedScore(int inflight, double failures, int score) {
-            if (log.isDebugEnabled()) {
-                log.debug(
-                        "Computed score",
-                        channelName,
-                        hostIndex,
-                        SafeArg.of("score", score),
-                        SafeArg.of("inflight", inflight),
-                        SafeArg.of("failures", failures));
-            }
-        }
-
         static PerHostObservability create(
                 ImmutableList<LimitedChannel> channels,
                 TaggedMetricRegistry taggedMetrics,
@@ -411,7 +427,8 @@ public void markRequestMade() {
         }
     }
 
-    private enum RttEndpoint implements Endpoint {
+    @VisibleForTesting
+    enum RttEndpoint implements Endpoint {
         INSTANCE;
 
         @Override
@@ -442,7 +459,7 @@ public String version() {
     @ThreadSafe
     static class RoundTripTimeMeasurement {
 
-        // TODO(dfox): switch to some exponentially decaying thingy?
+        // TODO(dfox): switch to some exponentially decaying thingy, so we could detect if network topology changes
         private final AtomicLong rttNanos = new AtomicLong(Long.MAX_VALUE);
 
         long getNanos() {
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index f54b58f60..5833dfaad 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -18,6 +18,7 @@
 
 import static org.assertj.core.api.Assertions.assertThat;
 import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
@@ -76,8 +77,8 @@ void when_one_channel_is_in_use_prefer_the_other() {
         for (int i = 0; i < 200; i++) {
             channel.maybeExecute(endpoint, request);
         }
-        verify(chan1, times(399)).maybeExecute(any(), any());
-        verify(chan2, times(201)).maybeExecute(any(), any());
+        verify(chan1, times(199)).maybeExecute(eq(endpoint), any());
+        verify(chan2, times(1)).maybeExecute(eq(endpoint), any());
     }
 
     @Test
@@ -88,8 +89,8 @@ void when_both_channels_are_free_we_get_roughly_fair_tiebreaking() {
         for (int i = 0; i < 200; i++) {
             channel.maybeExecute(endpoint, request);
         }
-        verify(chan1, times(198)).maybeExecute(any(), any());
-        verify(chan2, times(202)).maybeExecute(any(), any());
+        verify(chan1, times(299)).maybeExecute(eq(endpoint), any());
+        verify(chan2, times(301)).maybeExecute(eq(endpoint), any());
     }
 
     @Test
@@ -98,8 +99,8 @@ void when_channels_refuse_try_all_then_give_up() {
         when(chan2.maybeExecute(any(), any())).thenReturn(Optional.empty());
 
         assertThat(channel.maybeExecute(endpoint, request)).isNotPresent();
-        verify(chan1, times(1)).maybeExecute(any(), any());
-        verify(chan2, times(1)).maybeExecute(any(), any());
+        verify(chan1, times(1)).maybeExecute(eq(endpoint), any());
+        verify(chan2, times(1)).maybeExecute(eq(endpoint), any());
     }
 
     @Test
@@ -116,8 +117,8 @@ void a_single_4xx_doesnt_move_the_needle() {
                     .containsExactly(0, 0);
         }
 
-        verify(chan1, times(198)).maybeExecute(any(), any());
-        verify(chan2, times(202)).maybeExecute(any(), any());
+        verify(chan1, times(99)).maybeExecute(eq(endpoint), any());
+        verify(chan2, times(101)).maybeExecute(eq(endpoint), any());
     }
 
     @Test

From 961b2404806c308ce48971dd01a7278619e8de96 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 00:53:54 +0100
Subject: [PATCH 11/35] Delete misleading method

---
 .../core/BalancedNodeSelectionStrategyChannel.java |  8 +++++---
 .../BalancedNodeSelectionStrategyChannelTest.java  | 14 +++++++-------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 4ba468fd9..1f33e0d07 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -49,6 +49,7 @@
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
+import java.util.stream.Stream;
 import javax.annotation.concurrent.ThreadSafe;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -213,8 +214,8 @@ private static <T> List<T> shuffleImmutableList(ImmutableList<T> sourceList, Ran
     }
 
     @VisibleForTesting
-    IntStream getScores() {
-        return channels.stream().mapToInt(c -> c.getSnapshot().getScore());
+    Stream<ChannelStats> getScoresForTesting() {
+        return Arrays.stream(sortByScore(channels));
     }
 
     @Override
@@ -301,7 +302,8 @@ public String toString() {
      * A dedicated value class ensures safe sorting, as otherwise there's a risk that the inflight AtomicInteger
      * might change mid-sort, leading to undefined behaviour.
      */
-    private static final class ChannelStats {
+    @VisibleForTesting
+    static final class ChannelStats {
         private final MutableChannelWithStats delegate;
         private final int requestsInflight;
         private final double recentBadness;
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 5833dfaad..3a30d32d2 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -89,8 +89,8 @@ void when_both_channels_are_free_we_get_roughly_fair_tiebreaking() {
         for (int i = 0; i < 200; i++) {
             channel.maybeExecute(endpoint, request);
         }
-        verify(chan1, times(299)).maybeExecute(eq(endpoint), any());
-        verify(chan2, times(301)).maybeExecute(eq(endpoint), any());
+        verify(chan1, times(99)).maybeExecute(eq(endpoint), any());
+        verify(chan2, times(101)).maybeExecute(eq(endpoint), any());
     }
 
     @Test
@@ -112,7 +112,7 @@ void a_single_4xx_doesnt_move_the_needle() {
                 clock.read() < start + Duration.ofSeconds(10).toNanos();
                 incrementClockBy(Duration.ofMillis(50))) {
             channel.maybeExecute(endpoint, request);
-            assertThat(channel.getScores())
+            assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
                     .describedAs("A single 400 at the beginning isn't enough to impact scores", channel)
                     .containsExactly(0, 0);
         }
@@ -128,19 +128,19 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu
 
         for (int i = 0; i < 11; i++) {
             channel.maybeExecute(endpoint, request);
-            assertThat(channel.getScores())
+            assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
                     .describedAs("%s %s: Scores not affected yet %s", i, Duration.ofNanos(clock.read()), channel)
                     .containsExactly(0, 0);
             incrementClockBy(Duration.ofMillis(50));
         }
         channel.maybeExecute(endpoint, request);
-        assertThat(channel.getScores())
+        assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
                 .describedAs("%s: Constant 4xxs did move the needle %s", Duration.ofNanos(clock.read()), channel)
-                .containsExactly(1, 0);
+                .containsExactly(0, 1);
 
         incrementClockBy(Duration.ofSeconds(5));
 
-        assertThat(channel.getScores())
+        assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
                 .describedAs(
                         "%s: We quickly forget about 4xxs and go back to fair shuffling %s",
                         Duration.ofNanos(clock.read()), channel)

From 3c2dc94fb874d8cc50bb2cde2cfc52936f02990a Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 01:06:50 +0100
Subject: [PATCH 12/35] Fix unit tests

---
 .../BalancedNodeSelectionStrategyChannel.java | 13 +++--
 ...ancedNodeSelectionStrategyChannelTest.java | 55 +++++++++++++++++++
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 1f33e0d07..64e5bed44 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -195,8 +195,8 @@ private static ChannelStats[] sortByScore(List<MutableChannelWithStats> preShuff
         // with 0 being best and 1 being worst. This ensures that if several nodes are all within the same AZ and
         // can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have a similar rttScore (near zero).
         // Note, this can only be computed when we have all the snapshots in front of us.
-        if (bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0) {
-            long rttRange = worstRttNanos - bestRttNanos;
+        long rttRange = worstRttNanos - bestRttNanos;
+        if (bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0 && rttRange > 0) {
             for (ChannelStats snapshot : snapshotArray) {
                 snapshot.rttSpectrum = ((float) snapshot.rttNanos - bestRttNanos) / rttRange;
             }
@@ -291,9 +291,10 @@ ChannelStats getSnapshot() {
         @Override
         public String toString() {
             return "MutableChannelWithStats{"
-                    + "inflight=" + inflight
+                    + "delegate=" + delegate
+                    + ", inflight=" + inflight
                     + ", recentFailures=" + recentFailuresReservoir
-                    + ", delegate=" + delegate
+                    + ", rtt=" + rtt
                     + '}';
         }
     }
@@ -328,7 +329,7 @@ int getScore() {
 
         @Override
         public String toString() {
-            return "SortableChannel{" + getScore() + '}';
+            return "ChannelStats{" + "score=" + getScore() + ", delegate=" + delegate + '}';
         }
     }
 
@@ -475,7 +476,7 @@ void addMeasurement(long newMeasurement) {
 
         @Override
         public String toString() {
-            return "RoundTripTimeMeasurement{" + "rttNanos=" + rttNanos + '}';
+            return "RoundTripTimeMeasurement{" + rttNanos + '}';
         }
     }
 }
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 3a30d32d2..59bbba3ae 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -147,6 +147,61 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu
                 .containsExactly(0, 0);
     }
 
+    @Test
+    void rtt_is_measured_and_can_influence_choices() {
+        // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
+        when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
+
+        SettableFuture<Response> chan1OptionsResponse = SettableFuture.create();
+        SettableFuture<Response> chan2OptionsResponse = SettableFuture.create();
+        BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint =
+                BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE;
+        when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan1OptionsResponse));
+        when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan2OptionsResponse));
+
+        channel.maybeExecute(endpoint, request);
+
+        incrementClockBy(Duration.ofNanos(123));
+        chan1OptionsResponse.set(new TestResponse().code(200));
+
+        incrementClockBy(Duration.ofNanos(456));
+        chan2OptionsResponse.set(new TestResponse().code(200));
+
+        assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
+                .describedAs("The poor latency of channel2 imposes a small constant penalty in the score")
+                .containsExactly(0, 3);
+    }
+
+    @Test
+    void when_rtt_measurements_are_limited_dont_freak_out() {
+        // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
+        when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
+
+        BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint =
+                BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE;
+        when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty());
+        when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty());
+
+        channel.maybeExecute(endpoint, request);
+
+        assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0);
+    }
+
+    @Test
+    void when_rtt_measurements_havent_returned_yet_dont_freak_out() {
+        // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
+        when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
+
+        BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint =
+                BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE;
+        when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create()));
+        when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create()));
+
+        channel.maybeExecute(endpoint, request);
+
+        assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0);
+    }
+
     @Test
     void rtt_just_remembers_the_min() {
         BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement rtt =

From 9514f912fe9fdecb59d42b49a9f58f5ef4c3b618 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 01:08:55 +0100
Subject: [PATCH 13/35] Update simulations

---
 ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...ne_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 simulation/src/test/resources/report.md              | 12 ++++++------
 ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...ne_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 13 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 3879dff8e..dcbdc4b08 100644
--- a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fa2908b6dde2dff46cd78175ea4fbf3a87b66baa0e29a495b3b6fe0daf987ef9
-size 116789
+oid sha256:1cefd6e18f6003d8146aa97dcfef902d00185395ae1f8d162c2482dacfd57d81
+size 117492
diff --git a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 1c2789624..a011807a8 100644
--- a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2aa3973eac6964c1b10c22e034cb052e97dc6e22112d1e1902619a412b9a4fb3
-size 123784
+oid sha256:32de7fbba94d7676ad359f3228e55337b61088fcb4102fe743e9105581c94280
+size 123659
diff --git a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index e8365be98..f6d0b4252 100644
--- a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:26ff458ff960fbfc6f8d3801799b295a0bd382400583c561c64ab170ab922f8d
-size 82170
+oid sha256:dd0ba5b93d7672d48abf2aef421884968bac702a9c760e159990ef4b3c6ac7a9
+size 82748
diff --git a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index b4a658b44..459bfa31b 100644
--- a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ab06d72114810dbf8530e882ca71c3384cc9b3ff13a198b5ef8f9d8647c38b54
-size 135051
+oid sha256:1836375d4ea7584287820c4b7658ff7b62c990ffac3cf1abe5657b6746651463
+size 135128
diff --git a/simulation/src/test/resources/report.md b/simulation/src/test/resources/report.md
index 5b5f46f6a..61cf72fe9 100644
--- a/simulation/src/test/resources/report.md
+++ b/simulation/src/test/resources/report.md
@@ -2,7 +2,7 @@
 <!-- Run SimulationTest to regenerate this report. -->
 ```
                 all_nodes_500[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=79.3%	client_mean=PT5.81342S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1586, 500=414}
-                    all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=73.3%	client_mean=PT2.61872S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1465, 500=535}
+                    all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=72.8%	client_mean=PT2.57287S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1456, 500=544}
                               all_nodes_500[UNLIMITED_ROUND_ROBIN].txt:	success=50.0%	client_mean=PT0.6S         	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1000, 500=1000}
                    black_hole[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=59.2%	client_mean=PT0.600655114S 	server_cpu=PT11M49.8S     	client_received=1183/2000	server_resps=1183	codes={200=1183}
                        black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=91.4%	client_mean=PT0.6S         	server_cpu=PT18M16.8S     	client_received=1828/2000	server_resps=1828	codes={200=1828}
@@ -17,16 +17,16 @@
             fast_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.120107163S 	server_cpu=PT1H30M0.0000004S	client_received=45000/45000	server_resps=45040	codes={200=45000}
                       fast_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.120107163S 	server_cpu=PT1H30M0.0000004S	client_received=45000/45000	server_resps=45040	codes={200=45000}
                live_reloading[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=94.3%	client_mean=PT6.987708S    	server_cpu=PT1H56M8.59S   	client_received=2500/2500	server_resps=2500	codes={200=2357, 500=143}
-                   live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=83.8%	client_mean=PT4.53S        	server_cpu=PT1H53M23.21S  	client_received=2500/2500	server_resps=2500	codes={200=2096, 500=404}
+                   live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=83.8%	client_mean=PT4.529216S    	server_cpu=PT1H53M23.21S  	client_received=2500/2500	server_resps=2500	codes={200=2096, 500=404}
                              live_reloading[UNLIMITED_ROUND_ROBIN].txt:	success=86.9%	client_mean=PT2.802124S    	server_cpu=PT1H56M45.31S  	client_received=2500/2500	server_resps=2500	codes={200=2173, 500=327}
                 one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT2.569766928S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
-                    one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT1.516129859S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
+                    one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT1.516076793S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
                               one_big_spike[UNLIMITED_ROUND_ROBIN].txt:	success=99.9%	client_mean=PT1.000523583S 	server_cpu=PT7M37.35S     	client_received=1000/1000	server_resps=3049	codes={200=999, 429=1}
 one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=64.9%	client_mean=PT8.1950896S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1623, 500=877}
- one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=67.1%	client_mean=PT2.6955136S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1677, 500=823}
+ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=66.8%	client_mean=PT2.6967232S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1671, 500=829}
            one_endpoint_dies_on_each_server[UNLIMITED_ROUND_ROBIN].txt:	success=65.1%	client_mean=PT0.6S         	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1628, 500=872}
       server_side_rate_limits[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT12M44.762591546S	server_cpu=PT9H55M40.4S   	client_received=150000/150000	server_resps=178702	codes={200=149989, 429=11}
-          server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.264165097S 	server_cpu=PT9H4M39S      	client_received=150000/150000	server_resps=163395	codes={200=150000}
+          server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.244761087S 	server_cpu=PT9H1M59S      	client_received=150000/150000	server_resps=162595	codes={200=150000}
                     server_side_rate_limits[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.217416858S 	server_cpu=PT8H45M29.2S   	client_received=150000/150000	server_resps=157646	codes={200=150000}
        simplest_possible_case[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT0.834469696S 	server_cpu=PT3H3M35S      	client_received=13200/13200	server_resps=13200	codes={200=13200}
            simplest_possible_case[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.785757575S 	server_cpu=PT2H52M52S     	client_received=13200/13200	server_resps=13200	codes={200=13200}
@@ -35,7 +35,7 @@ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	succe
             slow_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.088828705S 	server_cpu=PT4M22.364666657S	client_received=3000/3000	server_resps=3031	codes={200=3000}
                       slow_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.088828705S 	server_cpu=PT4M22.364666657S	client_received=3000/3000	server_resps=3031	codes={200=3000}
 slowdown_and_error_thresholds[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT2M28.388896928S	server_cpu=PT8H48M18.546665835S	client_received=10000/10000	server_resps=10899	codes={200=10000}
-    slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT3M20.776800234S	server_cpu=PT14H10M8.413333255S	client_received=10000/10000	server_resps=13515	codes={200=9998, 500=2}
+    slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT3M26.439571686S	server_cpu=PT14H31M14.346666611S	client_received=10000/10000	server_resps=13781	codes={200=9996, 500=4}
               slowdown_and_error_thresholds[UNLIMITED_ROUND_ROBIN].txt:	success=3.6%	client_mean=PT21.551691121S	server_cpu=PT54H45M57.899999949S	client_received=10000/10000	server_resps=49335	codes={200=360, 500=9640}
               uncommon_flakes[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=99.0%	client_mean=PT0.000001S    	server_cpu=PT0.01S        	client_received=10000/10000	server_resps=10000	codes={200=9900, 500=100}
                   uncommon_flakes[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=99.0%	client_mean=PT0.000001S    	server_cpu=PT0.01S        	client_received=10000/10000	server_resps=10000	codes={200=9900, 500=100}
diff --git a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index e77c67847..69edd5eb7 100644
--- a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8f0cb8ff527ff2327b12b0298c073ea40037f57fb08177d263d7a1c07a04bf96
-size 587465
+oid sha256:c80da86e1873f4bd972a3cd3f2976c1bfb6f64e3bc28c7b9043feb08388a4c51
+size 552215
diff --git a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 374579fc5..6abe35c1c 100644
--- a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:76e04d90b781eade0dc4a333c8d5451cbfb9441e7c241117515e89bbb7c446f9
-size 123285
+oid sha256:dd941ae9716e18ce6d35cbef330f26979460c9124ed744d143a5e124d497c92b
+size 119483
diff --git a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index a99adba40..4848ec898 100644
--- a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=73.3%	client_mean=PT2.61872S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1465, 500=535}
\ No newline at end of file
+success=72.8%	client_mean=PT2.57287S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1456, 500=544}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index be188c732..eb2d8e4b8 100644
--- a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=83.8%	client_mean=PT4.53S        	server_cpu=PT1H53M23.21S  	client_received=2500/2500	server_resps=2500	codes={200=2096, 500=404}
\ No newline at end of file
+success=83.8%	client_mean=PT4.529216S    	server_cpu=PT1H53M23.21S  	client_received=2500/2500	server_resps=2500	codes={200=2096, 500=404}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 09b358e6b..b5f6ef6a5 100644
--- a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT1.516129859S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
\ No newline at end of file
+success=100.0%	client_mean=PT1.516076793S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 0f2287fc3..8bec96e81 100644
--- a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=67.1%	client_mean=PT2.6955136S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1677, 500=823}
\ No newline at end of file
+success=66.8%	client_mean=PT2.6967232S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1671, 500=829}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index e71d95262..22cc13a0e 100644
--- a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT0.264165097S 	server_cpu=PT9H4M39S      	client_received=150000/150000	server_resps=163395	codes={200=150000}
\ No newline at end of file
+success=100.0%	client_mean=PT0.244761087S 	server_cpu=PT9H1M59S      	client_received=150000/150000	server_resps=162595	codes={200=150000}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index fb9afe184..0b011e68a 100644
--- a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT3M20.776800234S	server_cpu=PT14H10M8.413333255S	client_received=10000/10000	server_resps=13515	codes={200=9998, 500=2}
\ No newline at end of file
+success=100.0%	client_mean=PT3M26.439571686S	server_cpu=PT14H31M14.346666611S	client_received=10000/10000	server_resps=13781	codes={200=9996, 500=4}
\ No newline at end of file

From cb3b440dfaf9803b7d445bc43681c4ec5a8b627d Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 01:15:26 +0100
Subject: [PATCH 14/35] streams are fine here

---
 .../BalancedNodeSelectionStrategyChannel.java | 100 +++++++++---------
 ...ancedNodeSelectionStrategyChannelTest.java |   4 +-
 2 files changed, 51 insertions(+), 53 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 64e5bed44..2111fad56 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -126,54 +126,6 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         return Optional.empty();
     }
 
-    private void sampleRttsForAllChannels() {
-        // TODO(dfox): don't do this 100% of the time
-        // if (random.nextDouble() > 0.01f) {
-        //     return;
-        // }
-
-        List<ListenableFuture<Long>> results = new ArrayList<>();
-        for (MutableChannelWithStats channel : channels) {
-            long before = clock.read();
-
-            Optional<ListenableFuture<Response>> future =
-                    channel.delegate.maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST);
-            if (future.isPresent()) {
-                results.add(Futures.transform(
-                        future.get(),
-                        _response -> {
-                            long durationNanos = clock.read() - before;
-                            channel.rtt.addMeasurement(durationNanos);
-                            return durationNanos;
-                        },
-                        MoreExecutors.directExecutor()));
-            } else {
-                results.add(Futures.immediateFuture(Long.MAX_VALUE));
-            }
-        }
-        if (log.isDebugEnabled()) {
-            DialogueFutures.addDirectCallback(Futures.allAsList(results), new FutureCallback<List<Long>>() {
-                @Override
-                public void onSuccess(List<Long> result) {
-                    List<Long> millis =
-                            result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList());
-                    List<Long> best =
-                            channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList());
-                    log.debug(
-                            "RTTs {} {} {}",
-                            SafeArg.of("nanos", result),
-                            SafeArg.of("millis", millis),
-                            SafeArg.of("best", best));
-                }
-
-                @Override
-                public void onFailure(Throwable throwable) {
-                    log.info("Failed to sample RTT for channels", throwable);
-                }
-            });
-        }
-    }
-
     private static ChannelStats[] sortByScore(List<MutableChannelWithStats> preShuffled) {
         ChannelStats[] snapshotArray = new ChannelStats[preShuffled.size()];
 
@@ -206,6 +158,52 @@ private static ChannelStats[] sortByScore(List<MutableChannelWithStats> preShuff
         return snapshotArray;
     }
 
+    private void sampleRttsForAllChannels() {
+        // TODO(dfox): don't do this 100% of the time
+        // if (random.nextDouble() > 0.01f) {
+        //     return;
+        // }
+
+        List<ListenableFuture<Long>> futures = channels.stream()
+                .map(channel -> {
+                    long before = clock.read();
+                    return channel.delegate
+                            .maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST)
+                            .map(future -> Futures.transform(
+                                    future,
+                                    _response -> {
+                                        long durationNanos = clock.read() - before;
+                                        channel.rtt.addMeasurement(durationNanos);
+                                        return durationNanos;
+                                    },
+                                    MoreExecutors.directExecutor()))
+                            .orElseGet(() -> Futures.immediateFuture(Long.MAX_VALUE));
+                })
+                .collect(ImmutableList.toImmutableList());
+
+        if (log.isDebugEnabled()) {
+            DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback<List<Long>>() {
+                @Override
+                public void onSuccess(List<Long> result) {
+                    List<Long> millis =
+                            result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList());
+                    List<Long> best =
+                            channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList());
+                    log.debug(
+                            "RTTs {} {} {}",
+                            SafeArg.of("nanos", result),
+                            SafeArg.of("millis", millis),
+                            SafeArg.of("best", best));
+                }
+
+                @Override
+                public void onFailure(Throwable throwable) {
+                    log.info("Failed to sample RTT for channels", throwable);
+                }
+            });
+        }
+    }
+
     /** Returns a new shuffled list, without mutating the input list (which may be immutable). */
     private static <T> List<T> shuffleImmutableList(ImmutableList<T> sourceList, Random random) {
         List<T> mutableList = new ArrayList<>(sourceList);
@@ -229,7 +227,7 @@ private static final class MutableChannelWithStats implements LimitedChannel {
         private final PerHostObservability observability;
 
         private final AtomicInteger inflight = new AtomicInteger(0);
-        private final RoundTripTimeMeasurement rtt = new RoundTripTimeMeasurement();
+        private final RttMeasurement rtt = new RttMeasurement();
 
         /**
          * We keep track of failures within a time window to do well in scenarios where an unhealthy server returns
@@ -460,7 +458,7 @@ public String version() {
 
     @VisibleForTesting
     @ThreadSafe
-    static class RoundTripTimeMeasurement {
+    static final class RttMeasurement {
 
         // TODO(dfox): switch to some exponentially decaying thingy, so we could detect if network topology changes
         private final AtomicLong rttNanos = new AtomicLong(Long.MAX_VALUE);
@@ -476,7 +474,7 @@ void addMeasurement(long newMeasurement) {
 
         @Override
         public String toString() {
-            return "RoundTripTimeMeasurement{" + rttNanos + '}';
+            return "RttMeasurement{" + rttNanos + '}';
         }
     }
 }
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 59bbba3ae..5d9357b4a 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -204,8 +204,8 @@ void when_rtt_measurements_havent_returned_yet_dont_freak_out() {
 
     @Test
     void rtt_just_remembers_the_min() {
-        BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement rtt =
-                new BalancedNodeSelectionStrategyChannel.RoundTripTimeMeasurement();
+        BalancedNodeSelectionStrategyChannel.RttMeasurement rtt =
+                new BalancedNodeSelectionStrategyChannel.RttMeasurement();
         rtt.addMeasurement(3);
         assertThat(rtt.getNanos()).isEqualTo(3);
         rtt.addMeasurement(1);

From 9398d9a08f94a65363bcf7a4071e67652d8141ed Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 01:33:24 +0100
Subject: [PATCH 15/35] test for rate limiter

---
 .../BalancedNodeSelectionStrategyChannel.java | 38 +++++++++++++++----
 ...ancedNodeSelectionStrategyChannelTest.java | 13 +++++++
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 2111fad56..7a449c20e 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -84,6 +84,7 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
     private final ImmutableList<MutableChannelWithStats> channels;
     private final Random random;
     private final Ticker clock;
+    private final RttMeasurementRateLimiter shouldMeasureRtts;
 
     BalancedNodeSelectionStrategyChannel(
             ImmutableList<LimitedChannel> channels,
@@ -94,6 +95,7 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
         Preconditions.checkState(channels.size() >= 2, "At least two channels required");
         this.random = random;
         this.clock = ticker;
+        this.shouldMeasureRtts = new RttMeasurementRateLimiter(clock);
         this.channels = IntStream.range(0, channels.size())
                 .mapToObj(index -> new MutableChannelWithStats(
                         channels.get(index),
@@ -118,7 +120,9 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         for (ChannelStats channel : sortedChannels) {
             Optional<ListenableFuture<Response>> maybe = channel.delegate.maybeExecute(endpoint, request);
             if (maybe.isPresent()) {
-                sampleRttsForAllChannels();
+                if (shouldMeasureRtts.tryAcquire()) {
+                    sampleAllChannelsRttsNonBlocking();
+                }
                 return maybe;
             }
         }
@@ -158,12 +162,7 @@ private static ChannelStats[] sortByScore(List<MutableChannelWithStats> preShuff
         return snapshotArray;
     }
 
-    private void sampleRttsForAllChannels() {
-        // TODO(dfox): don't do this 100% of the time
-        // if (random.nextDouble() > 0.01f) {
-        //     return;
-        // }
-
+    private void sampleAllChannelsRttsNonBlocking() {
         List<ListenableFuture<Long>> futures = channels.stream()
                 .map(channel -> {
                     long before = clock.read();
@@ -477,4 +476,29 @@ public String toString() {
             return "RttMeasurement{" + rttNanos + '}';
         }
     }
+
+    static final class RttMeasurementRateLimiter {
+        private static final long BETWEEN_SAMPLES = Duration.ofSeconds(1).toNanos();
+
+        private final Ticker clock;
+        private final AtomicLong lastMeasured = new AtomicLong(0);
+
+        private RttMeasurementRateLimiter(Ticker clock) {
+            this.clock = clock;
+        }
+
+        boolean tryAcquire() {
+            long last = lastMeasured.get();
+            long now = clock.read();
+            if (last + BETWEEN_SAMPLES > now) {
+                return false;
+            }
+
+            // TODO(dfox): if we have thousands of requests/sec, maybe we should re-evaluate every 1000 or so?
+            // TODO(dfox): do we care if multiple samples are running at the same time?
+
+            lastMeasured.compareAndSet(last, now);
+            return true;
+        }
+    }
 }
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 5d9357b4a..7fbc38dce 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -149,6 +149,8 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu
 
     @Test
     void rtt_is_measured_and_can_influence_choices() {
+        incrementClockBy(Duration.ofHours(1));
+
         // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
         when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
 
@@ -170,10 +172,20 @@ void rtt_is_measured_and_can_influence_choices() {
         assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
                 .describedAs("The poor latency of channel2 imposes a small constant penalty in the score")
                 .containsExactly(0, 3);
+
+        for (int i = 0; i < 500; i++) {
+            incrementClockBy(Duration.ofMillis(10));
+            channel.maybeExecute(endpoint, request);
+        }
+        // rate limiter ensures a sensible amount of rtt sampling
+        verify(chan1, times(6)).maybeExecute(eq(rttEndpoint), any());
+        verify(chan2, times(6)).maybeExecute(eq(rttEndpoint), any());
     }
 
     @Test
     void when_rtt_measurements_are_limited_dont_freak_out() {
+        incrementClockBy(Duration.ofHours(1));
+
         // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
         when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
 
@@ -189,6 +201,7 @@ void when_rtt_measurements_are_limited_dont_freak_out() {
 
     @Test
     void when_rtt_measurements_havent_returned_yet_dont_freak_out() {
+        incrementClockBy(Duration.ofHours(1));
         // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
         when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
 

From b17e311062c3cb702584457800d4f2eb0c2ed1a6 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 01:34:13 +0100
Subject: [PATCH 16/35] Simulations

---
 ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...ne_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 simulation/src/test/resources/report.md              | 12 ++++++------
 ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...ne_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 13 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index dcbdc4b08..12fe6c646 100644
--- a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1cefd6e18f6003d8146aa97dcfef902d00185395ae1f8d162c2482dacfd57d81
-size 117492
+oid sha256:516145937579fb582d7a76100b05fd7054b6357b95d560770988fc5b460e20f0
+size 115803
diff --git a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index a011807a8..b0f68688b 100644
--- a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:32de7fbba94d7676ad359f3228e55337b61088fcb4102fe743e9105581c94280
-size 123659
+oid sha256:f6d4e200f4549b0c11a3bdd95af0c8e1dbafa2bb7fb7d2c5c27b48e9a9ed1d86
+size 122953
diff --git a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index f6d0b4252..0972c02e4 100644
--- a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dd0ba5b93d7672d48abf2aef421884968bac702a9c760e159990ef4b3c6ac7a9
-size 82748
+oid sha256:76b09c3a5e31760626407eb8f6c1f50b2091a4d2ca7efe5e7d7a84cd099f7601
+size 83448
diff --git a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 459bfa31b..437e4e988 100644
--- a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1836375d4ea7584287820c4b7658ff7b62c990ffac3cf1abe5657b6746651463
-size 135128
+oid sha256:8e2f1f9109ba521a3226d4d9fb745c3b7c9c440efb0ccc4d71f7ee5037199054
+size 133708
diff --git a/simulation/src/test/resources/report.md b/simulation/src/test/resources/report.md
index 61cf72fe9..c773eef36 100644
--- a/simulation/src/test/resources/report.md
+++ b/simulation/src/test/resources/report.md
@@ -2,7 +2,7 @@
 <!-- Run SimulationTest to regenerate this report. -->
 ```
                 all_nodes_500[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=79.3%	client_mean=PT5.81342S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1586, 500=414}
-                    all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=72.8%	client_mean=PT2.57287S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1456, 500=544}
+                    all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=73.6%	client_mean=PT2.91032S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1472, 500=528}
                               all_nodes_500[UNLIMITED_ROUND_ROBIN].txt:	success=50.0%	client_mean=PT0.6S         	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1000, 500=1000}
                    black_hole[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=59.2%	client_mean=PT0.600655114S 	server_cpu=PT11M49.8S     	client_received=1183/2000	server_resps=1183	codes={200=1183}
                        black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=91.4%	client_mean=PT0.6S         	server_cpu=PT18M16.8S     	client_received=1828/2000	server_resps=1828	codes={200=1828}
@@ -17,16 +17,16 @@
             fast_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.120107163S 	server_cpu=PT1H30M0.0000004S	client_received=45000/45000	server_resps=45040	codes={200=45000}
                       fast_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.120107163S 	server_cpu=PT1H30M0.0000004S	client_received=45000/45000	server_resps=45040	codes={200=45000}
                live_reloading[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=94.3%	client_mean=PT6.987708S    	server_cpu=PT1H56M8.59S   	client_received=2500/2500	server_resps=2500	codes={200=2357, 500=143}
-                   live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=83.8%	client_mean=PT4.529216S    	server_cpu=PT1H53M23.21S  	client_received=2500/2500	server_resps=2500	codes={200=2096, 500=404}
+                   live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=84.0%	client_mean=PT4.5915112S   	server_cpu=PT1H52M50.27S  	client_received=2500/2500	server_resps=2500	codes={200=2101, 500=399}
                              live_reloading[UNLIMITED_ROUND_ROBIN].txt:	success=86.9%	client_mean=PT2.802124S    	server_cpu=PT1H56M45.31S  	client_received=2500/2500	server_resps=2500	codes={200=2173, 500=327}
                 one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT2.569766928S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
-                    one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT1.516076793S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
+                    one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT1.521339525S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
                               one_big_spike[UNLIMITED_ROUND_ROBIN].txt:	success=99.9%	client_mean=PT1.000523583S 	server_cpu=PT7M37.35S     	client_received=1000/1000	server_resps=3049	codes={200=999, 429=1}
 one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=64.9%	client_mean=PT8.1950896S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1623, 500=877}
- one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=66.8%	client_mean=PT2.6967232S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1671, 500=829}
+ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=64.2%	client_mean=PT3.9523312S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1605, 500=895}
            one_endpoint_dies_on_each_server[UNLIMITED_ROUND_ROBIN].txt:	success=65.1%	client_mean=PT0.6S         	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1628, 500=872}
       server_side_rate_limits[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT12M44.762591546S	server_cpu=PT9H55M40.4S   	client_received=150000/150000	server_resps=178702	codes={200=149989, 429=11}
-          server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.244761087S 	server_cpu=PT9H1M59S      	client_received=150000/150000	server_resps=162595	codes={200=150000}
+          server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.262702651S 	server_cpu=PT9H5M50.6S    	client_received=150000/150000	server_resps=163753	codes={200=150000}
                     server_side_rate_limits[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.217416858S 	server_cpu=PT8H45M29.2S   	client_received=150000/150000	server_resps=157646	codes={200=150000}
        simplest_possible_case[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT0.834469696S 	server_cpu=PT3H3M35S      	client_received=13200/13200	server_resps=13200	codes={200=13200}
            simplest_possible_case[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.785757575S 	server_cpu=PT2H52M52S     	client_received=13200/13200	server_resps=13200	codes={200=13200}
@@ -35,7 +35,7 @@ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	succe
             slow_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.088828705S 	server_cpu=PT4M22.364666657S	client_received=3000/3000	server_resps=3031	codes={200=3000}
                       slow_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.088828705S 	server_cpu=PT4M22.364666657S	client_received=3000/3000	server_resps=3031	codes={200=3000}
 slowdown_and_error_thresholds[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT2M28.388896928S	server_cpu=PT8H48M18.546665835S	client_received=10000/10000	server_resps=10899	codes={200=10000}
-    slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT3M26.439571686S	server_cpu=PT14H31M14.346666611S	client_received=10000/10000	server_resps=13781	codes={200=9996, 500=4}
+    slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT3M18.314817484S	server_cpu=PT13H50M38.666666634S	client_received=10000/10000	server_resps=13148	codes={200=9995, 500=5}
               slowdown_and_error_thresholds[UNLIMITED_ROUND_ROBIN].txt:	success=3.6%	client_mean=PT21.551691121S	server_cpu=PT54H45M57.899999949S	client_received=10000/10000	server_resps=49335	codes={200=360, 500=9640}
               uncommon_flakes[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=99.0%	client_mean=PT0.000001S    	server_cpu=PT0.01S        	client_received=10000/10000	server_resps=10000	codes={200=9900, 500=100}
                   uncommon_flakes[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=99.0%	client_mean=PT0.000001S    	server_cpu=PT0.01S        	client_received=10000/10000	server_resps=10000	codes={200=9900, 500=100}
diff --git a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 69edd5eb7..4a1422710 100644
--- a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c80da86e1873f4bd972a3cd3f2976c1bfb6f64e3bc28c7b9043feb08388a4c51
-size 552215
+oid sha256:29856bd2342821f7d0c0bb19b4b1efdcaed1af8b3c36cb2533485c55e0e00bfd
+size 560290
diff --git a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 6abe35c1c..91ad7456a 100644
--- a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dd941ae9716e18ce6d35cbef330f26979460c9124ed744d143a5e124d497c92b
-size 119483
+oid sha256:dc3169a632ae8fd7de4e96ba5bb2d39b99e97e12416d37c5fb6516a452a1eedf
+size 119379
diff --git a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 4848ec898..71eaf7e5e 100644
--- a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=72.8%	client_mean=PT2.57287S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1456, 500=544}
\ No newline at end of file
+success=73.6%	client_mean=PT2.91032S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1472, 500=528}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index eb2d8e4b8..e253a01c0 100644
--- a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=83.8%	client_mean=PT4.529216S    	server_cpu=PT1H53M23.21S  	client_received=2500/2500	server_resps=2500	codes={200=2096, 500=404}
\ No newline at end of file
+success=84.0%	client_mean=PT4.5915112S   	server_cpu=PT1H52M50.27S  	client_received=2500/2500	server_resps=2500	codes={200=2101, 500=399}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index b5f6ef6a5..b71e29339 100644
--- a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT1.516076793S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
\ No newline at end of file
+success=100.0%	client_mean=PT1.521339525S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 8bec96e81..a4008cf4c 100644
--- a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=66.8%	client_mean=PT2.6967232S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1671, 500=829}
\ No newline at end of file
+success=64.2%	client_mean=PT3.9523312S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1605, 500=895}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 22cc13a0e..bfc7b44c2 100644
--- a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT0.244761087S 	server_cpu=PT9H1M59S      	client_received=150000/150000	server_resps=162595	codes={200=150000}
\ No newline at end of file
+success=100.0%	client_mean=PT0.262702651S 	server_cpu=PT9H5M50.6S    	client_received=150000/150000	server_resps=163753	codes={200=150000}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 0b011e68a..9fe47e0ea 100644
--- a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT3M26.439571686S	server_cpu=PT14H31M14.346666611S	client_received=10000/10000	server_resps=13781	codes={200=9996, 500=4}
\ No newline at end of file
+success=100.0%	client_mean=PT3M18.314817484S	server_cpu=PT13H50M38.666666634S	client_received=10000/10000	server_resps=13148	codes={200=9995, 500=5}
\ No newline at end of file

From 3beac0f782f3b09ee86a2cb24646cd7f34fcd3bd Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 02:08:58 +0100
Subject: [PATCH 17/35] Smaller diff

---
 .../BalancedNodeSelectionStrategyChannel.java | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 7a449c20e..a0cc5fb09 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -72,12 +72,12 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
     private static final double FAILURE_WEIGHT = 10;
 
     /**
-     * This determines how 'sticky' we are to our 'nearest' node (as measured by RTT). If this is set too high, then we
-     * may deliver suboptimal perf by sending all requests to a slow node that is physically nearby, when there's
-     * actually a faster one further away.
-     * If this is too low, then we may prematurely spill across AZs and pay the $ cost. Should probably
-     * keep this lower than {@link #FAILURE_WEIGHT} to ensure that a single 5xx makes a nearby node less attractive
-     * than a faraway node that exhibited zero failures.
+     * RTT_WEIGHT determines how sticky we are to the physically nearest node (as measured by RTT). If this is set
+     * too high, then we may deliver suboptimal perf by sending all requests to a slow node that is physically nearby,
+     * when there's actually a faster one further away.
+     * If this is too low, then we may prematurely spill across AZs and pay the $ cost. Keep this lower than
+     * {@link #FAILURE_WEIGHT} to ensure that a single 5xx makes a nearby node less attractive than a faraway node
+     * that exhibited zero failures.
      */
     private static final double RTT_WEIGHT = 3;
 
@@ -226,7 +226,7 @@ private static final class MutableChannelWithStats implements LimitedChannel {
         private final PerHostObservability observability;
 
         private final AtomicInteger inflight = new AtomicInteger(0);
-        private final RttMeasurement rtt = new RttMeasurement();
+        private final RttMeasurement rtt;
 
         /**
          * We keep track of failures within a time window to do well in scenarios where an unhealthy server returns
@@ -239,6 +239,7 @@ private static final class MutableChannelWithStats implements LimitedChannel {
             this.delegate = delegate;
             this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY);
             this.observability = observability;
+            this.rtt = new RttMeasurement();
             // Saves one allocation on each network call
             this.updateStats = new FutureCallback<Response>() {
                 @Override
@@ -271,13 +272,12 @@ public void onFailure(Throwable throwable) {
         public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Request request) {
             inflight.incrementAndGet();
             Optional<ListenableFuture<Response>> maybe = delegate.maybeExecute(endpoint, request);
-            if (!maybe.isPresent()) {
+            if (maybe.isPresent()) {
+                DialogueFutures.addDirectCallback(maybe.get(), updateStats);
+                observability.markRequestMade();
+            } else {
                 inflight.decrementAndGet(); // quickly undo
-                return Optional.empty();
             }
-
-            DialogueFutures.addDirectCallback(maybe.get(), updateStats);
-            observability.markRequestMade();
             return maybe;
         }
 
@@ -477,7 +477,7 @@ public String toString() {
         }
     }
 
-    static final class RttMeasurementRateLimiter {
+    private static final class RttMeasurementRateLimiter {
         private static final long BETWEEN_SAMPLES = Duration.ofSeconds(1).toNanos();
 
         private final Ticker clock;

From c40950f3dda77294f4d7d1eaa023c04060ae8c28 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 01:08:58 +0000
Subject: [PATCH 18/35] Add generated changelog entries

---
 changelog/@unreleased/pr-794.v2.yml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 changelog/@unreleased/pr-794.v2.yml

diff --git a/changelog/@unreleased/pr-794.v2.yml b/changelog/@unreleased/pr-794.v2.yml
new file mode 100644
index 000000000..013c4ed00
--- /dev/null
+++ b/changelog/@unreleased/pr-794.v2.yml
@@ -0,0 +1,6 @@
+type: feature
+feature:
+  description: Balanced channel now biases towards whichever node has the lowest latency,
+    which should reduce AWS spend by routing requests within AZ.
+  links:
+  - https://github.com/palantir/dialogue/pull/794

From c91333487e37e284c5e0495e6456f94512b173aa Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 17:06:58 +0100
Subject: [PATCH 19/35] Don't allow multiple samples to run at the same time

---
 .../BalancedNodeSelectionStrategyChannel.java | 45 +++++++++++++------
 ...ancedNodeSelectionStrategyChannelTest.java |  7 ++-
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index a0cc5fb09..0d6a6c955 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -35,6 +35,7 @@
 import com.palantir.logsafe.UnsafeArg;
 import com.palantir.tritium.metrics.registry.MetricName;
 import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
+import java.io.Closeable;
 import java.time.Duration;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -45,6 +46,7 @@
 import java.util.Optional;
 import java.util.Random;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Collectors;
@@ -120,9 +122,7 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         for (ChannelStats channel : sortedChannels) {
             Optional<ListenableFuture<Response>> maybe = channel.delegate.maybeExecute(endpoint, request);
             if (maybe.isPresent()) {
-                if (shouldMeasureRtts.tryAcquire()) {
-                    sampleAllChannelsRttsNonBlocking();
-                }
+                maybeSampleAllChannelRttsNonBlocking();
                 return maybe;
             }
         }
@@ -162,7 +162,12 @@ private static ChannelStats[] sortByScore(List<MutableChannelWithStats> preShuff
         return snapshotArray;
     }
 
-    private void sampleAllChannelsRttsNonBlocking() {
+    private void maybeSampleAllChannelRttsNonBlocking() {
+        Optional<RttSamplePermit> maybePermit = shouldMeasureRtts.acquireNonBlocking();
+        if (!maybePermit.isPresent()) {
+            return;
+        }
+
         List<ListenableFuture<Long>> futures = channels.stream()
                 .map(channel -> {
                     long before = clock.read();
@@ -180,8 +185,13 @@ private void sampleAllChannelsRttsNonBlocking() {
                 })
                 .collect(ImmutableList.toImmutableList());
 
+        // the RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off
+        // more and more and more requests and eventually exhaust a threadpool
+        ListenableFuture<List<Long>> allAsList = Futures.allAsList(futures);
+        DialogueFutures.addDirectListener(allAsList, maybePermit.get()::close);
+
         if (log.isDebugEnabled()) {
-            DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback<List<Long>>() {
+            DialogueFutures.addDirectCallback(allAsList, new FutureCallback<List<Long>>() {
                 @Override
                 public void onSuccess(List<Long> result) {
                     List<Long> millis =
@@ -482,23 +492,30 @@ private static final class RttMeasurementRateLimiter {
 
         private final Ticker clock;
         private final AtomicLong lastMeasured = new AtomicLong(0);
+        private final AtomicBoolean currentlySampling = new AtomicBoolean(false);
 
         private RttMeasurementRateLimiter(Ticker clock) {
             this.clock = clock;
         }
 
-        boolean tryAcquire() {
+        Optional<RttSamplePermit> acquireNonBlocking() {
             long last = lastMeasured.get();
-            long now = clock.read();
-            if (last + BETWEEN_SAMPLES > now) {
-                return false;
+            if (last + BETWEEN_SAMPLES > clock.read()) {
+                return Optional.empty();
             }
 
-            // TODO(dfox): if we have thousands of requests/sec, maybe we should re-evaluate every 1000 or so?
-            // TODO(dfox): do we care if multiple samples are running at the same time?
-
-            lastMeasured.compareAndSet(last, now);
-            return true;
+            if (currentlySampling.compareAndSet(false, true)) {
+                lastMeasured.set(clock.read());
+                return Optional.of(() -> currentlySampling.compareAndSet(true, false));
+            } else {
+                log.warn("Wanted to sample RTTs but an existing sample was still in progress");
+                return Optional.empty();
+            }
         }
     }
+
+    private interface RttSamplePermit extends Closeable {
+        @Override
+        void close();
+    }
 }
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 7fbc38dce..e900b88c2 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -210,9 +210,14 @@ void when_rtt_measurements_havent_returned_yet_dont_freak_out() {
         when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create()));
         when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create()));
 
-        channel.maybeExecute(endpoint, request);
+        for (int i = 0; i < 20; i++) {
+            incrementClockBy(Duration.ofSeconds(5));
+            channel.maybeExecute(endpoint, request);
+        }
 
         assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0);
+        verify(chan1, times(1)).maybeExecute(eq(rttEndpoint), any());
+        verify(chan2, times(1)).maybeExecute(eq(rttEndpoint), any());
     }
 
     @Test

From 7005529a5540036a41e581e1ce283eebdff2213c Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 17:24:00 +0100
Subject: [PATCH 20/35] Return the min of the last 5 measurements

---
 .../BalancedNodeSelectionStrategyChannel.java | 45 +++++++++++++------
 ...ancedNodeSelectionStrategyChannelTest.java | 14 ++++--
 2 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 0d6a6c955..fa29c3073 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -67,9 +67,8 @@
  */
 final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
     private static final Logger log = LoggerFactory.getLogger(BalancedNodeSelectionStrategyChannel.class);
-    private static final Request BLANK_REQUEST = Request.builder().build();
-    private static final Comparator<ChannelStats> BY_SCORE = Comparator.comparingInt(ChannelStats::getScore);
 
+    private static final Comparator<ChannelStats> BY_SCORE = Comparator.comparingInt(ChannelStats::getScore);
     private static final Duration FAILURE_MEMORY = Duration.ofSeconds(30);
     private static final double FAILURE_WEIGHT = 10;
 
@@ -163,7 +162,7 @@ private static ChannelStats[] sortByScore(List<MutableChannelWithStats> preShuff
     }
 
     private void maybeSampleAllChannelRttsNonBlocking() {
-        Optional<RttSamplePermit> maybePermit = shouldMeasureRtts.acquireNonBlocking();
+        Optional<RttMeasurementPermit> maybePermit = shouldMeasureRtts.acquireNonBlocking();
         if (!maybePermit.isPresent()) {
             return;
         }
@@ -172,7 +171,8 @@ private void maybeSampleAllChannelRttsNonBlocking() {
                 .map(channel -> {
                     long before = clock.read();
                     return channel.delegate
-                            .maybeExecute(RttEndpoint.INSTANCE, BLANK_REQUEST)
+                            .maybeExecute(
+                                    RttEndpoint.INSTANCE, Request.builder().build())
                             .map(future -> Futures.transform(
                                     future,
                                     _response -> {
@@ -465,25 +465,44 @@ public String version() {
         }
     }
 
+    /**
+     * Always returns the *minimum* value from the last few samples, so that we exclude slow calls that might include
+     * TLS handshakes.
+     */
     @VisibleForTesting
     @ThreadSafe
     static final class RttMeasurement {
+        private static final int NUM_MEASUREMENTS = 5;
 
-        // TODO(dfox): switch to some exponentially decaying thingy, so we could detect if network topology changes
-        private final AtomicLong rttNanos = new AtomicLong(Long.MAX_VALUE);
+        private final long[] samples;
+        private volatile long bestRttNanos = Long.MAX_VALUE;
+
+        RttMeasurement() {
+            samples = new long[NUM_MEASUREMENTS];
+            Arrays.fill(samples, Long.MAX_VALUE);
+        }
 
         long getNanos() {
-            return rttNanos.get();
+            return bestRttNanos;
         }
 
-        void addMeasurement(long newMeasurement) {
-            // Only stores the *minimum* values, so that we exclude slow calls that might include TLS handshakes.
-            rttNanos.getAndUpdate(existing -> Math.min(existing, newMeasurement));
+        synchronized void addMeasurement(long newMeasurement) {
+            Preconditions.checkArgument(newMeasurement > 0, "Must be greater than zero");
+            Preconditions.checkArgument(newMeasurement < Long.MAX_VALUE, "Must be less than MAX_VALUE");
+
+            if (samples[0] == Long.MAX_VALUE) {
+                Arrays.fill(samples, newMeasurement);
+                bestRttNanos = newMeasurement;
+            } else {
+                System.arraycopy(samples, 1, samples, 0, NUM_MEASUREMENTS - 1);
+                samples[NUM_MEASUREMENTS - 1] = newMeasurement;
+                bestRttNanos = Arrays.stream(samples).min().getAsLong();
+            }
         }
 
         @Override
         public String toString() {
-            return "RttMeasurement{" + rttNanos + '}';
+            return "RttMeasurement{" + "bestRttNanos=" + bestRttNanos + ", samples=" + Arrays.toString(samples) + '}';
         }
     }
 
@@ -498,7 +517,7 @@ private RttMeasurementRateLimiter(Ticker clock) {
             this.clock = clock;
         }
 
-        Optional<RttSamplePermit> acquireNonBlocking() {
+        Optional<RttMeasurementPermit> acquireNonBlocking() {
             long last = lastMeasured.get();
             if (last + BETWEEN_SAMPLES > clock.read()) {
                 return Optional.empty();
@@ -514,7 +533,7 @@ Optional<RttSamplePermit> acquireNonBlocking() {
         }
     }
 
-    private interface RttSamplePermit extends Closeable {
+    private interface RttMeasurementPermit extends Closeable {
         @Override
         void close();
     }
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index e900b88c2..ba3e35c5b 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -221,17 +221,23 @@ void when_rtt_measurements_havent_returned_yet_dont_freak_out() {
     }
 
     @Test
-    void rtt_just_remembers_the_min() {
+    void rtt_returns_the_min_of_the_last_5_measurements() {
         BalancedNodeSelectionStrategyChannel.RttMeasurement rtt =
                 new BalancedNodeSelectionStrategyChannel.RttMeasurement();
         rtt.addMeasurement(3);
-        assertThat(rtt.getNanos()).isEqualTo(3);
+        assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(3);
         rtt.addMeasurement(1);
         rtt.addMeasurement(2);
-        assertThat(rtt.getNanos()).isEqualTo(1);
+        assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(1);
 
         rtt.addMeasurement(500);
-        assertThat(rtt.getNanos()).isEqualTo(1);
+        assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(1);
+        rtt.addMeasurement(500);
+        rtt.addMeasurement(500);
+        rtt.addMeasurement(500);
+        assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(2);
+        rtt.addMeasurement(500);
+        assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(500);
     }
 
     private static void set200(LimitedChannel chan) {

From 4c4ce9164bd0f6147ef864b6197847c389dee579 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 17:38:51 +0100
Subject: [PATCH 21/35] Move sorting

---
 .../core/BalancedNodeSelectionStrategyChannel.java   | 12 ++++++------
 .../BalancedNodeSelectionStrategyChannelTest.java    |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index fa29c3073..55e3abc9e 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -116,9 +116,10 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
 
         // TODO(dfox): P2C optimization when we have high number of nodes to save CPU?
         // http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf
-        ChannelStats[] sortedChannels = sortByScore(preShuffled);
+        ChannelStats[] sortableChannels = computeScores(preShuffled);
+        Arrays.sort(sortableChannels, BY_SCORE);
 
-        for (ChannelStats channel : sortedChannels) {
+        for (ChannelStats channel : sortableChannels) {
             Optional<ListenableFuture<Response>> maybe = channel.delegate.maybeExecute(endpoint, request);
             if (maybe.isPresent()) {
                 maybeSampleAllChannelRttsNonBlocking();
@@ -129,7 +130,7 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         return Optional.empty();
     }
 
-    private static ChannelStats[] sortByScore(List<MutableChannelWithStats> preShuffled) {
+    private static ChannelStats[] computeScores(List<MutableChannelWithStats> preShuffled) {
         ChannelStats[] snapshotArray = new ChannelStats[preShuffled.size()];
 
         long bestRttNanos = Long.MAX_VALUE;
@@ -157,7 +158,6 @@ private static ChannelStats[] sortByScore(List<MutableChannelWithStats> preShuff
             }
         }
 
-        Arrays.sort(snapshotArray, BY_SCORE);
         return snapshotArray;
     }
 
@@ -222,7 +222,7 @@ private static <T> List<T> shuffleImmutableList(ImmutableList<T> sourceList, Ran
 
     @VisibleForTesting
     Stream<ChannelStats> getScoresForTesting() {
-        return Arrays.stream(sortByScore(channels));
+        return Arrays.stream(computeScores(channels));
     }
 
     @Override
@@ -358,7 +358,7 @@ private static void registerGauges(
             DialogueInternalWeakReducingGauge.getOrCreate(
                     taggedMetrics,
                     metricName,
-                    c -> c.getSnapshot().getScore(),
+                    c -> c.getSnapshot().getScore(), // note, this doesn't include rtts
                     longStream -> {
                         long[] longs = longStream.toArray();
                         if (log.isInfoEnabled() && longs.length > 1) {
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index ba3e35c5b..59a82d131 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -136,7 +136,7 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu
         channel.maybeExecute(endpoint, request);
         assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
                 .describedAs("%s: Constant 4xxs did move the needle %s", Duration.ofNanos(clock.read()), channel)
-                .containsExactly(0, 1);
+                .containsExactly(1, 0);
 
         incrementClockBy(Duration.ofSeconds(5));
 

From 7f9db171fb005aece822211851e81198ca9dca3f Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 17:43:51 +0100
Subject: [PATCH 22/35] Refactor

---
 .../BalancedNodeSelectionStrategyChannel.java | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 55e3abc9e..81313c819 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -187,13 +187,12 @@ private void maybeSampleAllChannelRttsNonBlocking() {
 
         // the RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off
         // more and more and more requests and eventually exhaust a threadpool
-        ListenableFuture<List<Long>> allAsList = Futures.allAsList(futures);
-        DialogueFutures.addDirectListener(allAsList, maybePermit.get()::close);
+        DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback<List<Long>>() {
+            @Override
+            public void onSuccess(List<Long> result) {
+                maybePermit.get().close();
 
-        if (log.isDebugEnabled()) {
-            DialogueFutures.addDirectCallback(allAsList, new FutureCallback<List<Long>>() {
-                @Override
-                public void onSuccess(List<Long> result) {
+                if (log.isDebugEnabled()) {
                     List<Long> millis =
                             result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList());
                     List<Long> best =
@@ -204,13 +203,14 @@ public void onSuccess(List<Long> result) {
                             SafeArg.of("millis", millis),
                             SafeArg.of("best", best));
                 }
+            }
 
-                @Override
-                public void onFailure(Throwable throwable) {
-                    log.info("Failed to sample RTT for channels", throwable);
-                }
-            });
-        }
+            @Override
+            public void onFailure(Throwable throwable) {
+                maybePermit.get().close();
+                log.info("Failed to sample RTT for channels", throwable);
+            }
+        });
     }
 
     /** Returns a new shuffled list, without mutating the input list (which may be immutable). */

From 64a955b5ee11df19795679a2d3bd940dc080f1fd Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 18:08:35 +0100
Subject: [PATCH 23/35] Pull everything out to a dedicated 'RttSampler' class

---
 .../BalancedNodeSelectionStrategyChannel.java | 181 +-------------
 .../palantir/dialogue/core/RttSampler.java    | 229 ++++++++++++++++++
 ...ancedNodeSelectionStrategyChannelTest.java |  29 +--
 .../dialogue/core/RttSamplerTest.java         |  43 ++++
 4 files changed, 285 insertions(+), 197 deletions(-)
 create mode 100644 dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
 create mode 100644 dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 81313c819..a47712671 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -22,37 +22,27 @@
 import com.google.common.collect.ImmutableList;
 import com.google.common.primitives.Ints;
 import com.google.common.util.concurrent.FutureCallback;
-import com.google.common.util.concurrent.Futures;
 import com.google.common.util.concurrent.ListenableFuture;
-import com.google.common.util.concurrent.MoreExecutors;
 import com.palantir.dialogue.Endpoint;
-import com.palantir.dialogue.HttpMethod;
 import com.palantir.dialogue.Request;
 import com.palantir.dialogue.Response;
-import com.palantir.dialogue.UrlBuilder;
 import com.palantir.logsafe.Preconditions;
 import com.palantir.logsafe.SafeArg;
 import com.palantir.logsafe.UnsafeArg;
 import com.palantir.tritium.metrics.registry.MetricName;
 import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
-import java.io.Closeable;
 import java.time.Duration;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
-import java.util.Map;
 import java.util.Optional;
 import java.util.Random;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
-import java.util.stream.Collectors;
+import java.util.function.LongSupplier;
 import java.util.stream.IntStream;
 import java.util.stream.Stream;
-import javax.annotation.concurrent.ThreadSafe;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -85,7 +75,7 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
     private final ImmutableList<MutableChannelWithStats> channels;
     private final Random random;
     private final Ticker clock;
-    private final RttMeasurementRateLimiter shouldMeasureRtts;
+    private final RttSampler rttSampler;
 
     BalancedNodeSelectionStrategyChannel(
             ImmutableList<LimitedChannel> channels,
@@ -96,11 +86,12 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
         Preconditions.checkState(channels.size() >= 2, "At least two channels required");
         this.random = random;
         this.clock = ticker;
-        this.shouldMeasureRtts = new RttMeasurementRateLimiter(clock);
+        this.rttSampler = new RttSampler(channels, clock);
         this.channels = IntStream.range(0, channels.size())
                 .mapToObj(index -> new MutableChannelWithStats(
                         channels.get(index),
                         clock,
+                        rttSampler.get(index),
                         PerHostObservability.create(channels, taggedMetrics, channelName, index)))
                 .collect(ImmutableList.toImmutableList());
 
@@ -122,7 +113,7 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         for (ChannelStats channel : sortableChannels) {
             Optional<ListenableFuture<Response>> maybe = channel.delegate.maybeExecute(endpoint, request);
             if (maybe.isPresent()) {
-                maybeSampleAllChannelRttsNonBlocking();
+                rttSampler.maybeSampleRtts();
                 return maybe;
             }
         }
@@ -161,58 +152,6 @@ private static ChannelStats[] computeScores(List<MutableChannelWithStats> preShu
         return snapshotArray;
     }
 
-    private void maybeSampleAllChannelRttsNonBlocking() {
-        Optional<RttMeasurementPermit> maybePermit = shouldMeasureRtts.acquireNonBlocking();
-        if (!maybePermit.isPresent()) {
-            return;
-        }
-
-        List<ListenableFuture<Long>> futures = channels.stream()
-                .map(channel -> {
-                    long before = clock.read();
-                    return channel.delegate
-                            .maybeExecute(
-                                    RttEndpoint.INSTANCE, Request.builder().build())
-                            .map(future -> Futures.transform(
-                                    future,
-                                    _response -> {
-                                        long durationNanos = clock.read() - before;
-                                        channel.rtt.addMeasurement(durationNanos);
-                                        return durationNanos;
-                                    },
-                                    MoreExecutors.directExecutor()))
-                            .orElseGet(() -> Futures.immediateFuture(Long.MAX_VALUE));
-                })
-                .collect(ImmutableList.toImmutableList());
-
-        // the RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off
-        // more and more and more requests and eventually exhaust a threadpool
-        DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback<List<Long>>() {
-            @Override
-            public void onSuccess(List<Long> result) {
-                maybePermit.get().close();
-
-                if (log.isDebugEnabled()) {
-                    List<Long> millis =
-                            result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList());
-                    List<Long> best =
-                            channels.stream().map(ch -> ch.rtt.getNanos()).collect(Collectors.toList());
-                    log.debug(
-                            "RTTs {} {} {}",
-                            SafeArg.of("nanos", result),
-                            SafeArg.of("millis", millis),
-                            SafeArg.of("best", best));
-                }
-            }
-
-            @Override
-            public void onFailure(Throwable throwable) {
-                maybePermit.get().close();
-                log.info("Failed to sample RTT for channels", throwable);
-            }
-        });
-    }
-
     /** Returns a new shuffled list, without mutating the input list (which may be immutable). */
     private static <T> List<T> shuffleImmutableList(ImmutableList<T> sourceList, Random random) {
         List<T> mutableList = new ArrayList<>(sourceList);
@@ -233,10 +172,10 @@ public String toString() {
     private static final class MutableChannelWithStats implements LimitedChannel {
         private final LimitedChannel delegate;
         private final FutureCallback<Response> updateStats;
+        private final LongSupplier rtt;
         private final PerHostObservability observability;
 
         private final AtomicInteger inflight = new AtomicInteger(0);
-        private final RttMeasurement rtt;
 
         /**
          * We keep track of failures within a time window to do well in scenarios where an unhealthy server returns
@@ -245,11 +184,12 @@ private static final class MutableChannelWithStats implements LimitedChannel {
          */
         private final CoarseExponentialDecayReservoir recentFailuresReservoir;
 
-        MutableChannelWithStats(LimitedChannel delegate, Ticker clock, PerHostObservability observability) {
+        MutableChannelWithStats(
+                LimitedChannel delegate, Ticker clock, LongSupplier rtt, PerHostObservability observability) {
             this.delegate = delegate;
             this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY);
+            this.rtt = rtt;
             this.observability = observability;
-            this.rtt = new RttMeasurement();
             // Saves one allocation on each network call
             this.updateStats = new FutureCallback<Response>() {
                 @Override
@@ -292,7 +232,7 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         }
 
         ChannelStats getSnapshot() {
-            return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getNanos());
+            return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getAsLong());
         }
 
         @Override
@@ -436,105 +376,4 @@ public void markRequestMade() {
             };
         }
     }
-
-    @VisibleForTesting
-    enum RttEndpoint implements Endpoint {
-        INSTANCE;
-
-        @Override
-        public void renderPath(Map<String, String> _params, UrlBuilder _url) {}
-
-        @Override
-        public HttpMethod httpMethod() {
-            return HttpMethod.OPTIONS;
-        }
-
-        @Override
-        public String serviceName() {
-            return "Balanced";
-        }
-
-        @Override
-        public String endpointName() {
-            return "rtt";
-        }
-
-        @Override
-        public String version() {
-            return "0.0.0";
-        }
-    }
-
-    /**
-     * Always returns the *minimum* value from the last few samples, so that we exclude slow calls that might include
-     * TLS handshakes.
-     */
-    @VisibleForTesting
-    @ThreadSafe
-    static final class RttMeasurement {
-        private static final int NUM_MEASUREMENTS = 5;
-
-        private final long[] samples;
-        private volatile long bestRttNanos = Long.MAX_VALUE;
-
-        RttMeasurement() {
-            samples = new long[NUM_MEASUREMENTS];
-            Arrays.fill(samples, Long.MAX_VALUE);
-        }
-
-        long getNanos() {
-            return bestRttNanos;
-        }
-
-        synchronized void addMeasurement(long newMeasurement) {
-            Preconditions.checkArgument(newMeasurement > 0, "Must be greater than zero");
-            Preconditions.checkArgument(newMeasurement < Long.MAX_VALUE, "Must be less than MAX_VALUE");
-
-            if (samples[0] == Long.MAX_VALUE) {
-                Arrays.fill(samples, newMeasurement);
-                bestRttNanos = newMeasurement;
-            } else {
-                System.arraycopy(samples, 1, samples, 0, NUM_MEASUREMENTS - 1);
-                samples[NUM_MEASUREMENTS - 1] = newMeasurement;
-                bestRttNanos = Arrays.stream(samples).min().getAsLong();
-            }
-        }
-
-        @Override
-        public String toString() {
-            return "RttMeasurement{" + "bestRttNanos=" + bestRttNanos + ", samples=" + Arrays.toString(samples) + '}';
-        }
-    }
-
-    private static final class RttMeasurementRateLimiter {
-        private static final long BETWEEN_SAMPLES = Duration.ofSeconds(1).toNanos();
-
-        private final Ticker clock;
-        private final AtomicLong lastMeasured = new AtomicLong(0);
-        private final AtomicBoolean currentlySampling = new AtomicBoolean(false);
-
-        private RttMeasurementRateLimiter(Ticker clock) {
-            this.clock = clock;
-        }
-
-        Optional<RttMeasurementPermit> acquireNonBlocking() {
-            long last = lastMeasured.get();
-            if (last + BETWEEN_SAMPLES > clock.read()) {
-                return Optional.empty();
-            }
-
-            if (currentlySampling.compareAndSet(false, true)) {
-                lastMeasured.set(clock.read());
-                return Optional.of(() -> currentlySampling.compareAndSet(true, false));
-            } else {
-                log.warn("Wanted to sample RTTs but an existing sample was still in progress");
-                return Optional.empty();
-            }
-        }
-    }
-
-    private interface RttMeasurementPermit extends Closeable {
-        @Override
-        void close();
-    }
 }
diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
new file mode 100644
index 000000000..c7b3d97cf
--- /dev/null
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
@@ -0,0 +1,229 @@
+/*
+ * (c) Copyright 2020 Palantir Technologies Inc. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.palantir.dialogue.core;
+
+import com.github.benmanes.caffeine.cache.Ticker;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableList;
+import com.google.common.util.concurrent.FutureCallback;
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+import com.google.common.util.concurrent.MoreExecutors;
+import com.palantir.dialogue.Endpoint;
+import com.palantir.dialogue.HttpMethod;
+import com.palantir.dialogue.Request;
+import com.palantir.dialogue.UrlBuilder;
+import com.palantir.logsafe.Preconditions;
+import com.palantir.logsafe.SafeArg;
+import java.io.Closeable;
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.function.LongSupplier;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import javax.annotation.concurrent.ThreadSafe;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+final class RttSampler {
+    private static final Logger log = LoggerFactory.getLogger(RttSampler.class);
+
+    private final ImmutableList<LimitedChannel> channels;
+    private final RttMeasurement[] rtts;
+    private final RttMeasurementRateLimiter rateLimiter;
+    private final Ticker clock;
+
+    RttSampler(ImmutableList<LimitedChannel> channels, Ticker clock) {
+        this.channels = channels;
+        this.rateLimiter = new RttMeasurementRateLimiter(clock);
+        this.clock = clock;
+        this.rtts = IntStream.range(0, channels.size())
+                .mapToObj(i -> new RttMeasurement())
+                .toArray(RttMeasurement[]::new);
+    }
+
+    /**
+     * The {@link LongSupplier} returns the RTT in nanoseconds, or {@link java.lang.Long#MAX_VALUE} to signify unknown.
+     */
+    LongSupplier get(int index) {
+        return rtts[index];
+    }
+
+    /**
+     * Non-blocking.
+     */
+    void maybeSampleRtts() {
+        Optional<RttMeasurementPermit> maybePermit = rateLimiter.acquireNonBlocking();
+        if (!maybePermit.isPresent()) {
+            return;
+        }
+
+        List<ListenableFuture<Long>> futures = IntStream.range(0, channels.size())
+                .mapToObj(i -> {
+                    long before = clock.read();
+                    return channels.get(i)
+                            .maybeExecute(
+                                    RttEndpoint.INSTANCE, Request.builder().build())
+                            .map(future -> Futures.transform(
+                                    future,
+                                    _response -> {
+                                        long durationNanos = clock.read() - before;
+                                        rtts[i].addMeasurement(durationNanos);
+                                        return durationNanos;
+                                    },
+                                    MoreExecutors.directExecutor()))
+                            .orElseGet(() -> Futures.immediateFuture(Long.MAX_VALUE));
+                })
+                .collect(ImmutableList.toImmutableList());
+
+        // the RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off
+        // more and more and more requests and eventually exhaust a threadpool
+        DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback<List<Long>>() {
+            @Override
+            public void onSuccess(List<Long> result) {
+                maybePermit.get().close();
+
+                if (log.isDebugEnabled()) {
+                    List<Long> millis =
+                            result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList());
+                    long[] best = Arrays.stream(rtts)
+                            .mapToLong(rtt -> rtt.getAsLong())
+                            .toArray();
+                    log.debug(
+                            "RTTs {} {} {}",
+                            SafeArg.of("nanos", result),
+                            SafeArg.of("millis", millis),
+                            SafeArg.of("best", Arrays.toString(best)));
+                }
+            }
+
+            @Override
+            public void onFailure(Throwable throwable) {
+                maybePermit.get().close();
+                log.info("Failed to sample RTT for channels", throwable);
+            }
+        });
+    }
+
+    @VisibleForTesting
+    enum RttEndpoint implements Endpoint {
+        INSTANCE;
+
+        @Override
+        public void renderPath(Map<String, String> _params, UrlBuilder _url) {}
+
+        @Override
+        public HttpMethod httpMethod() {
+            return HttpMethod.OPTIONS;
+        }
+
+        @Override
+        public String serviceName() {
+            return "Balanced";
+        }
+
+        @Override
+        public String endpointName() {
+            return "rtt";
+        }
+
+        @Override
+        public String version() {
+            return "0.0.0";
+        }
+    }
+
+    /**
+     * Always returns the *minimum* value from the last few samples, so that we exclude slow calls that might include
+     * TLS handshakes.
+     */
+    @VisibleForTesting
+    @ThreadSafe
+    static final class RttMeasurement implements LongSupplier {
+        private static final int NUM_MEASUREMENTS = 5;
+
+        private final long[] samples;
+        private volatile long bestRttNanos = Long.MAX_VALUE;
+
+        RttMeasurement() {
+            samples = new long[NUM_MEASUREMENTS];
+            Arrays.fill(samples, Long.MAX_VALUE);
+        }
+
+        @Override
+        public long getAsLong() {
+            return bestRttNanos;
+        }
+
+        synchronized void addMeasurement(long newMeasurement) {
+            Preconditions.checkArgument(newMeasurement > 0, "Must be greater than zero");
+            Preconditions.checkArgument(newMeasurement < Long.MAX_VALUE, "Must be less than MAX_VALUE");
+
+            if (samples[0] == Long.MAX_VALUE) {
+                Arrays.fill(samples, newMeasurement);
+                bestRttNanos = newMeasurement;
+            } else {
+                System.arraycopy(samples, 1, samples, 0, NUM_MEASUREMENTS - 1);
+                samples[NUM_MEASUREMENTS - 1] = newMeasurement;
+                bestRttNanos = Arrays.stream(samples).min().getAsLong();
+            }
+        }
+
+        @Override
+        public String toString() {
+            return "RttMeasurement{" + "bestRttNanos=" + bestRttNanos + ", samples=" + Arrays.toString(samples) + '}';
+        }
+    }
+
+    private static final class RttMeasurementRateLimiter {
+        private static final long BETWEEN_SAMPLES = Duration.ofSeconds(1).toNanos();
+
+        private final Ticker clock;
+        private final AtomicLong lastMeasured = new AtomicLong(0);
+        private final AtomicBoolean currentlySampling = new AtomicBoolean(false);
+
+        private RttMeasurementRateLimiter(Ticker clock) {
+            this.clock = clock;
+        }
+
+        Optional<RttMeasurementPermit> acquireNonBlocking() {
+            long last = lastMeasured.get();
+            if (last + BETWEEN_SAMPLES > clock.read()) {
+                return Optional.empty();
+            }
+
+            if (currentlySampling.compareAndSet(false, true)) {
+                lastMeasured.set(clock.read());
+                return Optional.of(() -> currentlySampling.compareAndSet(true, false));
+            } else {
+                log.warn("Wanted to sample RTTs but an existing sample was still in progress");
+                return Optional.empty();
+            }
+        }
+    }
+
+    private interface RttMeasurementPermit extends Closeable {
+        @Override
+        void close();
+    }
+}
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 59a82d131..9164375e4 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -156,8 +156,7 @@ void rtt_is_measured_and_can_influence_choices() {
 
         SettableFuture<Response> chan1OptionsResponse = SettableFuture.create();
         SettableFuture<Response> chan2OptionsResponse = SettableFuture.create();
-        BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint =
-                BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE;
+        RttSampler.RttEndpoint rttEndpoint = RttSampler.RttEndpoint.INSTANCE;
         when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan1OptionsResponse));
         when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan2OptionsResponse));
 
@@ -189,8 +188,7 @@ void when_rtt_measurements_are_limited_dont_freak_out() {
         // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
         when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
 
-        BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint =
-                BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE;
+        RttSampler.RttEndpoint rttEndpoint = RttSampler.RttEndpoint.INSTANCE;
         when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty());
         when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty());
 
@@ -205,8 +203,7 @@ void when_rtt_measurements_havent_returned_yet_dont_freak_out() {
         // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
         when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
 
-        BalancedNodeSelectionStrategyChannel.RttEndpoint rttEndpoint =
-                BalancedNodeSelectionStrategyChannel.RttEndpoint.INSTANCE;
+        RttSampler.RttEndpoint rttEndpoint = RttSampler.RttEndpoint.INSTANCE;
         when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create()));
         when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(SettableFuture.create()));
 
@@ -220,26 +217,6 @@ void when_rtt_measurements_havent_returned_yet_dont_freak_out() {
         verify(chan2, times(1)).maybeExecute(eq(rttEndpoint), any());
     }
 
-    @Test
-    void rtt_returns_the_min_of_the_last_5_measurements() {
-        BalancedNodeSelectionStrategyChannel.RttMeasurement rtt =
-                new BalancedNodeSelectionStrategyChannel.RttMeasurement();
-        rtt.addMeasurement(3);
-        assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(3);
-        rtt.addMeasurement(1);
-        rtt.addMeasurement(2);
-        assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(1);
-
-        rtt.addMeasurement(500);
-        assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(1);
-        rtt.addMeasurement(500);
-        rtt.addMeasurement(500);
-        rtt.addMeasurement(500);
-        assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(2);
-        rtt.addMeasurement(500);
-        assertThat(rtt.getNanos()).describedAs("%s", rtt).isEqualTo(500);
-    }
-
     private static void set200(LimitedChannel chan) {
         when(chan.maybeExecute(any(), any())).thenReturn(http(200));
     }
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java
new file mode 100644
index 000000000..1a2b23383
--- /dev/null
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java
@@ -0,0 +1,43 @@
+/*
+ * (c) Copyright 2020 Palantir Technologies Inc. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.palantir.dialogue.core;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.junit.jupiter.api.Test;
+
+class RttSamplerTest {
+
+    @Test
+    void rtt_returns_the_min_of_the_last_5_measurements() {
+        RttSampler.RttMeasurement rtt = new RttSampler.RttMeasurement();
+        rtt.addMeasurement(3);
+        assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(3);
+        rtt.addMeasurement(1);
+        rtt.addMeasurement(2);
+        assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(1);
+
+        rtt.addMeasurement(500);
+        assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(1);
+        rtt.addMeasurement(500);
+        rtt.addMeasurement(500);
+        rtt.addMeasurement(500);
+        assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(2);
+        rtt.addMeasurement(500);
+        assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(500);
+    }
+}

From 2389a068be3014320c7b4c3b9b568c2a5fadfa6d Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 18:18:52 +0100
Subject: [PATCH 24/35] Use OptionalLong instead of Long.MAX_VALUE as special
 value

---
 .../BalancedNodeSelectionStrategyChannel.java | 30 +++++++++++--------
 .../palantir/dialogue/core/RttSampler.java    | 17 +++++++----
 .../dialogue/core/RttSamplerTest.java         | 10 +++----
 3 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index a47712671..16ad8c8c1 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -38,9 +38,9 @@
 import java.util.Comparator;
 import java.util.List;
 import java.util.Optional;
+import java.util.OptionalLong;
 import java.util.Random;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.function.LongSupplier;
 import java.util.stream.IntStream;
 import java.util.stream.Stream;
 import org.slf4j.Logger;
@@ -123,16 +123,17 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
 
     private static ChannelStats[] computeScores(List<MutableChannelWithStats> preShuffled) {
         ChannelStats[] snapshotArray = new ChannelStats[preShuffled.size()];
-
-        long bestRttNanos = Long.MAX_VALUE;
-        long worstRttNanos = 0;
         for (int i = 0; i < preShuffled.size(); i++) {
             ChannelStats snapshot = preShuffled.get(i).getSnapshot();
             snapshotArray[i] = snapshot;
+        }
 
-            if (snapshot.rttNanos != Long.MAX_VALUE) {
-                bestRttNanos = Math.min(bestRttNanos, snapshot.rttNanos);
-                worstRttNanos = Math.max(worstRttNanos, snapshot.rttNanos);
+        long bestRttNanos = Long.MAX_VALUE;
+        long worstRttNanos = 0;
+        for (ChannelStats snapshot : snapshotArray) {
+            if (snapshot.rttNanos.isPresent()) {
+                bestRttNanos = Math.min(bestRttNanos, snapshot.rttNanos.getAsLong());
+                worstRttNanos = Math.max(worstRttNanos, snapshot.rttNanos.getAsLong());
             }
         }
 
@@ -145,7 +146,9 @@ private static ChannelStats[] computeScores(List<MutableChannelWithStats> preShu
         long rttRange = worstRttNanos - bestRttNanos;
         if (bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0 && rttRange > 0) {
             for (ChannelStats snapshot : snapshotArray) {
-                snapshot.rttSpectrum = ((float) snapshot.rttNanos - bestRttNanos) / rttRange;
+                if (snapshot.rttNanos.isPresent()) {
+                    snapshot.rttSpectrum = ((float) snapshot.rttNanos.getAsLong() - bestRttNanos) / rttRange;
+                }
             }
         }
 
@@ -172,7 +175,7 @@ public String toString() {
     private static final class MutableChannelWithStats implements LimitedChannel {
         private final LimitedChannel delegate;
         private final FutureCallback<Response> updateStats;
-        private final LongSupplier rtt;
+        private final RttSampler.RttSupplier rtt;
         private final PerHostObservability observability;
 
         private final AtomicInteger inflight = new AtomicInteger(0);
@@ -185,7 +188,7 @@ private static final class MutableChannelWithStats implements LimitedChannel {
         private final CoarseExponentialDecayReservoir recentFailuresReservoir;
 
         MutableChannelWithStats(
-                LimitedChannel delegate, Ticker clock, LongSupplier rtt, PerHostObservability observability) {
+                LimitedChannel delegate, Ticker clock, RttSampler.RttSupplier rtt, PerHostObservability observability) {
             this.delegate = delegate;
             this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY);
             this.rtt = rtt;
@@ -232,7 +235,7 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         }
 
         ChannelStats getSnapshot() {
-            return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getAsLong());
+            return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getRttNanos());
         }
 
         @Override
@@ -256,10 +259,11 @@ static final class ChannelStats {
         private final int requestsInflight;
         private final double recentBadness;
 
-        private final long rttNanos;
+        private final OptionalLong rttNanos;
         private float rttSpectrum = 0; // assigned later once we've figured out the range of rttNanos across channels
 
-        ChannelStats(MutableChannelWithStats delegate, int requestsInflight, double recentBadness, long rttNanos) {
+        ChannelStats(
+                MutableChannelWithStats delegate, int requestsInflight, double recentBadness, OptionalLong rttNanos) {
             this.delegate = delegate;
             this.requestsInflight = requestsInflight;
             this.recentBadness = recentBadness;
diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
index c7b3d97cf..9873cde9f 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
@@ -35,6 +35,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
+import java.util.OptionalLong;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicLong;
@@ -63,9 +64,9 @@ final class RttSampler {
     }
 
     /**
-     * The {@link LongSupplier} returns the RTT in nanoseconds, or {@link java.lang.Long#MAX_VALUE} to signify unknown.
+     * The {@link LongSupplier} returns the RTT in nanoseconds, or {@link OptionalLong#empty()} if we have no data.
      */
-    LongSupplier get(int index) {
+    RttSupplier get(int index) {
         return rtts[index];
     }
 
@@ -107,7 +108,7 @@ public void onSuccess(List<Long> result) {
                     List<Long> millis =
                             result.stream().map(TimeUnit.NANOSECONDS::toMillis).collect(Collectors.toList());
                     long[] best = Arrays.stream(rtts)
-                            .mapToLong(rtt -> rtt.getAsLong())
+                            .mapToLong(rtt -> rtt.getRttNanos().orElse(Long.MAX_VALUE))
                             .toArray();
                     log.debug(
                             "RTTs {} {} {}",
@@ -159,7 +160,7 @@ public String version() {
      */
     @VisibleForTesting
     @ThreadSafe
-    static final class RttMeasurement implements LongSupplier {
+    static final class RttMeasurement implements RttSupplier {
         private static final int NUM_MEASUREMENTS = 5;
 
         private final long[] samples;
@@ -171,8 +172,8 @@ static final class RttMeasurement implements LongSupplier {
         }
 
         @Override
-        public long getAsLong() {
-            return bestRttNanos;
+        public OptionalLong getRttNanos() {
+            return bestRttNanos == Long.MAX_VALUE ? OptionalLong.empty() : OptionalLong.of(bestRttNanos);
         }
 
         synchronized void addMeasurement(long newMeasurement) {
@@ -226,4 +227,8 @@ private interface RttMeasurementPermit extends Closeable {
         @Override
         void close();
     }
+
+    interface RttSupplier {
+        OptionalLong getRttNanos();
+    }
 }
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java
index 1a2b23383..00ccb1be9 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/RttSamplerTest.java
@@ -26,18 +26,18 @@ class RttSamplerTest {
     void rtt_returns_the_min_of_the_last_5_measurements() {
         RttSampler.RttMeasurement rtt = new RttSampler.RttMeasurement();
         rtt.addMeasurement(3);
-        assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(3);
+        assertThat(rtt.getRttNanos()).describedAs("%s", rtt).hasValue(3);
         rtt.addMeasurement(1);
         rtt.addMeasurement(2);
-        assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(1);
+        assertThat(rtt.getRttNanos()).describedAs("%s", rtt).hasValue(1);
 
         rtt.addMeasurement(500);
-        assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(1);
+        assertThat(rtt.getRttNanos()).describedAs("%s", rtt).hasValue(1);
         rtt.addMeasurement(500);
         rtt.addMeasurement(500);
         rtt.addMeasurement(500);
-        assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(2);
+        assertThat(rtt.getRttNanos()).describedAs("%s", rtt).hasValue(2);
         rtt.addMeasurement(500);
-        assertThat(rtt.getAsLong()).describedAs("%s", rtt).isEqualTo(500);
+        assertThat(rtt.getRttNanos()).describedAs("%s", rtt).hasValue(500);
     }
 }

From 3adc16c3503bf2c303bb7ae8b409440e0f09fed6 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 18:34:19 +0100
Subject: [PATCH 25/35] be more immutable, reduce diff

---
 .../BalancedNodeSelectionStrategyChannel.java | 74 +++++++++----------
 1 file changed, 36 insertions(+), 38 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 16ad8c8c1..e642224d9 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -58,7 +58,7 @@
 final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
     private static final Logger log = LoggerFactory.getLogger(BalancedNodeSelectionStrategyChannel.class);
 
-    private static final Comparator<ChannelStats> BY_SCORE = Comparator.comparingInt(ChannelStats::getScore);
+    private static final Comparator<SortableChannel> BY_SCORE = Comparator.comparingInt(SortableChannel::getScore);
     private static final Duration FAILURE_MEMORY = Duration.ofSeconds(30);
     private static final double FAILURE_WEIGHT = 10;
 
@@ -107,10 +107,10 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
 
         // TODO(dfox): P2C optimization when we have high number of nodes to save CPU?
         // http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf
-        ChannelStats[] sortableChannels = computeScores(preShuffled);
+        SortableChannel[] sortableChannels = computeScores(preShuffled);
         Arrays.sort(sortableChannels, BY_SCORE);
 
-        for (ChannelStats channel : sortableChannels) {
+        for (SortableChannel channel : sortableChannels) {
             Optional<ListenableFuture<Response>> maybe = channel.delegate.maybeExecute(endpoint, request);
             if (maybe.isPresent()) {
                 rttSampler.maybeSampleRtts();
@@ -121,19 +121,18 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         return Optional.empty();
     }
 
-    private static ChannelStats[] computeScores(List<MutableChannelWithStats> preShuffled) {
-        ChannelStats[] snapshotArray = new ChannelStats[preShuffled.size()];
+    private static SortableChannel[] computeScores(List<MutableChannelWithStats> preShuffled) {
+        OptionalLong[] rtts = new OptionalLong[preShuffled.size()];
         for (int i = 0; i < preShuffled.size(); i++) {
-            ChannelStats snapshot = preShuffled.get(i).getSnapshot();
-            snapshotArray[i] = snapshot;
+            rtts[i] = preShuffled.get(i).rtt.getRttNanos();
         }
 
         long bestRttNanos = Long.MAX_VALUE;
         long worstRttNanos = 0;
-        for (ChannelStats snapshot : snapshotArray) {
-            if (snapshot.rttNanos.isPresent()) {
-                bestRttNanos = Math.min(bestRttNanos, snapshot.rttNanos.getAsLong());
-                worstRttNanos = Math.max(worstRttNanos, snapshot.rttNanos.getAsLong());
+        for (OptionalLong rtt : rtts) {
+            if (rtt.isPresent()) {
+                bestRttNanos = Math.min(bestRttNanos, rtt.getAsLong());
+                worstRttNanos = Math.max(worstRttNanos, rtt.getAsLong());
             }
         }
 
@@ -144,12 +143,16 @@ private static ChannelStats[] computeScores(List<MutableChannelWithStats> preShu
         // can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have a similar rttScore (near zero).
         // Note, this can only be computed when we have all the snapshots in front of us.
         long rttRange = worstRttNanos - bestRttNanos;
-        if (bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0 && rttRange > 0) {
-            for (ChannelStats snapshot : snapshotArray) {
-                if (snapshot.rttNanos.isPresent()) {
-                    snapshot.rttSpectrum = ((float) snapshot.rttNanos.getAsLong() - bestRttNanos) / rttRange;
-                }
-            }
+        boolean includeRtts = bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0 && rttRange > 0;
+
+        SortableChannel[] snapshotArray = new SortableChannel[preShuffled.size()];
+        for (int i = 0; i < preShuffled.size(); i++) {
+            OptionalLong rtt = rtts[i];
+            float rttSpectrum =
+                    (includeRtts && rtt.isPresent()) ? ((float) rtt.getAsLong() - bestRttNanos) / rttRange : 0;
+
+            SortableChannel snapshot = preShuffled.get(i).computeScore(rttSpectrum);
+            snapshotArray[i] = snapshot;
         }
 
         return snapshotArray;
@@ -163,7 +166,7 @@ private static <T> List<T> shuffleImmutableList(ImmutableList<T> sourceList, Ran
     }
 
     @VisibleForTesting
-    Stream<ChannelStats> getScoresForTesting() {
+    Stream<SortableChannel> getScoresForTesting() {
         return Arrays.stream(computeScores(channels));
     }
 
@@ -234,8 +237,14 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
             return maybe;
         }
 
-        ChannelStats getSnapshot() {
-            return new ChannelStats(this, inflight.get(), recentFailuresReservoir.get(), rtt.getRttNanos());
+        SortableChannel computeScore(float rttSpectrum) {
+            // it's important that scores are integers because if we kept the full double precision, then a single 4xx
+            // would end up influencing host selection long beyond its intended lifespan in the absence of other data.
+            int score = inflight.get()
+                    + Ints.saturatedCast(Math.round(recentFailuresReservoir.get()))
+                    + Ints.saturatedCast(Math.round(rttSpectrum * RTT_WEIGHT));
+
+            return new SortableChannel(score, this);
         }
 
         @Override
@@ -254,33 +263,22 @@ public String toString() {
      * might change mid-sort, leading to undefined behaviour.
      */
     @VisibleForTesting
-    static final class ChannelStats {
+    static final class SortableChannel {
+        private final int score;
         private final MutableChannelWithStats delegate;
-        private final int requestsInflight;
-        private final double recentBadness;
-
-        private final OptionalLong rttNanos;
-        private float rttSpectrum = 0; // assigned later once we've figured out the range of rttNanos across channels
 
-        ChannelStats(
-                MutableChannelWithStats delegate, int requestsInflight, double recentBadness, OptionalLong rttNanos) {
+        SortableChannel(int score, MutableChannelWithStats delegate) {
             this.delegate = delegate;
-            this.requestsInflight = requestsInflight;
-            this.recentBadness = recentBadness;
-            this.rttNanos = rttNanos;
+            this.score = score;
         }
 
         int getScore() {
-            // it's important that scores are integers because if we kept the full double precision, then a single 4xx
-            // would end up influencing host selection long beyond its intended lifespan in the absence of other data.
-            return requestsInflight
-                    + Ints.saturatedCast(Math.round(recentBadness))
-                    + Ints.saturatedCast(Math.round(rttSpectrum * RTT_WEIGHT));
+            return score;
         }
 
         @Override
         public String toString() {
-            return "ChannelStats{" + "score=" + getScore() + ", delegate=" + delegate + '}';
+            return "SortableChannel{score=" + score + ", delegate=" + delegate + '}';
         }
     }
 
@@ -302,7 +300,7 @@ private static void registerGauges(
             DialogueInternalWeakReducingGauge.getOrCreate(
                     taggedMetrics,
                     metricName,
-                    c -> c.getSnapshot().getScore(), // note, this doesn't include rtts
+                    c -> c.computeScore(0).getScore(),
                     longStream -> {
                         long[] longs = longStream.toArray();
                         if (log.isInfoEnabled() && longs.length > 1) {

From eb76abb96d3858f0788610c16ef61452b98e72b8 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Thu, 28 May 2020 19:01:35 +0100
Subject: [PATCH 26/35] Feature flag it off

---
 .../BalancedNodeSelectionStrategyChannel.java | 98 +++++++++++--------
 .../core/NodeSelectionStrategyChannel.java    |  4 +-
 ...ancedNodeSelectionStrategyChannelTest.java | 17 +++-
 .../dialogue/core/NodeSelectionBenchmark.java |  5 +-
 4 files changed, 78 insertions(+), 46 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index e642224d9..526728470 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -43,6 +43,7 @@
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.IntStream;
 import java.util.stream.Stream;
+import javax.annotation.Nullable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -75,6 +76,8 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
     private final ImmutableList<MutableChannelWithStats> channels;
     private final Random random;
     private final Ticker clock;
+
+    @Nullable
     private final RttSampler rttSampler;
 
     BalancedNodeSelectionStrategyChannel(
@@ -82,16 +85,16 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
             Random random,
             Ticker ticker,
             TaggedMetricRegistry taggedMetrics,
-            String channelName) {
+            String channelName,
+            RttSampling samplingEnabled) {
         Preconditions.checkState(channels.size() >= 2, "At least two channels required");
         this.random = random;
         this.clock = ticker;
-        this.rttSampler = new RttSampler(channels, clock);
+        this.rttSampler = samplingEnabled == RttSampling.DEFAULT_OFF ? null : new RttSampler(channels, clock);
         this.channels = IntStream.range(0, channels.size())
                 .mapToObj(index -> new MutableChannelWithStats(
                         channels.get(index),
                         clock,
-                        rttSampler.get(index),
                         PerHostObservability.create(channels, taggedMetrics, channelName, index)))
                 .collect(ImmutableList.toImmutableList());
 
@@ -99,6 +102,11 @@ final class BalancedNodeSelectionStrategyChannel implements LimitedChannel {
         log.debug("Initialized", SafeArg.of("count", channels.size()), UnsafeArg.of("channels", channels));
     }
 
+    enum RttSampling {
+        DEFAULT_OFF,
+        ENABLED
+    }
+
     @Override
     public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Request request) {
         // pre-shuffling is pretty important here, otherwise when there are no requests in flight, we'd
@@ -107,13 +115,15 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
 
         // TODO(dfox): P2C optimization when we have high number of nodes to save CPU?
         // http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf
-        SortableChannel[] sortableChannels = computeScores(preShuffled);
+        SortableChannel[] sortableChannels = computeScores(rttSampler, preShuffled);
         Arrays.sort(sortableChannels, BY_SCORE);
 
         for (SortableChannel channel : sortableChannels) {
             Optional<ListenableFuture<Response>> maybe = channel.delegate.maybeExecute(endpoint, request);
             if (maybe.isPresent()) {
-                rttSampler.maybeSampleRtts();
+                if (rttSampler != null) {
+                    rttSampler.maybeSampleRtts();
+                }
                 return maybe;
             }
         }
@@ -121,41 +131,51 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         return Optional.empty();
     }
 
-    private static SortableChannel[] computeScores(List<MutableChannelWithStats> preShuffled) {
-        OptionalLong[] rtts = new OptionalLong[preShuffled.size()];
-        for (int i = 0; i < preShuffled.size(); i++) {
-            rtts[i] = preShuffled.get(i).rtt.getRttNanos();
-        }
+    private static SortableChannel[] computeScores(
+            @Nullable RttSampler rttSampler, List<MutableChannelWithStats> preShuffled) {
+        SortableChannel[] snapshotArray = new SortableChannel[preShuffled.size()];
 
-        long bestRttNanos = Long.MAX_VALUE;
-        long worstRttNanos = 0;
-        for (OptionalLong rtt : rtts) {
-            if (rtt.isPresent()) {
-                bestRttNanos = Math.min(bestRttNanos, rtt.getAsLong());
-                worstRttNanos = Math.max(worstRttNanos, rtt.getAsLong());
-            }
-        }
+        if (rttSampler == null) {
 
-        // Latency (rtt) is measured in nanos, which is a tricky unit to include in our 'score' because adding it
-        // would dominate all the other data (which has the unit of 'num requests'). To avoid the need for a conversion
-        // fudge-factor, we instead figure out where each rtt lies on the spectrum from bestRttNanos to worstRttNanos,
-        // with 0 being best and 1 being worst. This ensures that if several nodes are all within the same AZ and
-        // can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have a similar rttScore (near zero).
-        // Note, this can only be computed when we have all the snapshots in front of us.
-        long rttRange = worstRttNanos - bestRttNanos;
-        boolean includeRtts = bestRttNanos != Long.MAX_VALUE && worstRttNanos != 0 && rttRange > 0;
+            for (int i = 0; i < preShuffled.size(); i++) {
+                SortableChannel snapshot = preShuffled.get(i).computeScore(0);
+                snapshotArray[i] = snapshot;
+            }
+            return snapshotArray;
+
+        } else {
+            // Latency (rtt) is measured in nanos, which is a tricky unit to include in our 'score' because adding
+            // it would dominate all the other data (which has the unit of 'num requests'). To avoid the need for a
+            // conversion fudge-factor, we instead figure out where each rtt lies on the spectrum from bestRttNanos
+            // to worstRttNanos, with 0 being best and 1 being worst. This ensures that if several nodes are all
+            // within the same AZ and can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have
+            // a similar rttScore (near zero). Note, this can only be computed when we have all the snapshots in
+            // front of us.
+            long bestRttNanos = Long.MAX_VALUE;
+            long worstRttNanos = 0;
+
+            OptionalLong[] rtts = new OptionalLong[preShuffled.size()];
+            for (int i = 0; i < preShuffled.size(); i++) {
+                OptionalLong rtt = rttSampler.get(i).getRttNanos();
+                rtts[i] = rtt;
+
+                if (rtt.isPresent()) {
+                    bestRttNanos = Math.min(bestRttNanos, rtt.getAsLong());
+                    worstRttNanos = Math.max(worstRttNanos, rtt.getAsLong());
+                }
+            }
+            long rttRange = worstRttNanos - bestRttNanos;
 
-        SortableChannel[] snapshotArray = new SortableChannel[preShuffled.size()];
-        for (int i = 0; i < preShuffled.size(); i++) {
-            OptionalLong rtt = rtts[i];
-            float rttSpectrum =
-                    (includeRtts && rtt.isPresent()) ? ((float) rtt.getAsLong() - bestRttNanos) / rttRange : 0;
+            for (int i = 0; i < preShuffled.size(); i++) {
+                OptionalLong rtt = rtts[i];
+                float rttSpectrum =
+                        (rttRange > 0 && rtt.isPresent()) ? ((float) rtt.getAsLong() - bestRttNanos) / rttRange : 0;
 
-            SortableChannel snapshot = preShuffled.get(i).computeScore(rttSpectrum);
-            snapshotArray[i] = snapshot;
+                SortableChannel snapshot = preShuffled.get(i).computeScore(rttSpectrum);
+                snapshotArray[i] = snapshot;
+            }
+            return snapshotArray;
         }
-
-        return snapshotArray;
     }
 
     /** Returns a new shuffled list, without mutating the input list (which may be immutable). */
@@ -167,7 +187,7 @@ private static <T> List<T> shuffleImmutableList(ImmutableList<T> sourceList, Ran
 
     @VisibleForTesting
     Stream<SortableChannel> getScoresForTesting() {
-        return Arrays.stream(computeScores(channels));
+        return Arrays.stream(computeScores(rttSampler, channels));
     }
 
     @Override
@@ -178,7 +198,6 @@ public String toString() {
     private static final class MutableChannelWithStats implements LimitedChannel {
         private final LimitedChannel delegate;
         private final FutureCallback<Response> updateStats;
-        private final RttSampler.RttSupplier rtt;
         private final PerHostObservability observability;
 
         private final AtomicInteger inflight = new AtomicInteger(0);
@@ -190,11 +209,9 @@ private static final class MutableChannelWithStats implements LimitedChannel {
          */
         private final CoarseExponentialDecayReservoir recentFailuresReservoir;
 
-        MutableChannelWithStats(
-                LimitedChannel delegate, Ticker clock, RttSampler.RttSupplier rtt, PerHostObservability observability) {
+        MutableChannelWithStats(LimitedChannel delegate, Ticker clock, PerHostObservability observability) {
             this.delegate = delegate;
             this.recentFailuresReservoir = new CoarseExponentialDecayReservoir(clock::read, FAILURE_MEMORY);
-            this.rtt = rtt;
             this.observability = observability;
             // Saves one allocation on each network call
             this.updateStats = new FutureCallback<Response>() {
@@ -253,7 +270,6 @@ public String toString() {
                     + "delegate=" + delegate
                     + ", inflight=" + inflight
                     + ", recentFailures=" + recentFailuresReservoir
-                    + ", rtt=" + rtt
                     + '}';
         }
     }
diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java
index 461e56c39..69bf6155d 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java
@@ -24,6 +24,7 @@
 import com.palantir.dialogue.Endpoint;
 import com.palantir.dialogue.Request;
 import com.palantir.dialogue.Response;
+import com.palantir.dialogue.core.BalancedNodeSelectionStrategyChannel.RttSampling;
 import com.palantir.logsafe.SafeArg;
 import com.palantir.logsafe.exceptions.SafeRuntimeException;
 import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
@@ -140,7 +141,8 @@ private NodeSelectionChannel createNodeSelectionChannel(
                 // When people ask for 'ROUND_ROBIN', they usually just want something to load balance better.
                 // We used to have a naive RoundRobinChannel, then tried RandomSelection and now use this heuristic:
                 return channelBuilder
-                        .channel(new BalancedNodeSelectionStrategyChannel(channels, random, tick, metrics, channelName))
+                        .channel(new BalancedNodeSelectionStrategyChannel(
+                                channels, random, tick, metrics, channelName, RttSampling.DEFAULT_OFF))
                         .build();
             case UNKNOWN:
         }
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 9164375e4..606a94e5e 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -33,6 +33,7 @@
 import com.palantir.dialogue.Response;
 import com.palantir.dialogue.TestEndpoint;
 import com.palantir.dialogue.TestResponse;
+import com.palantir.dialogue.core.BalancedNodeSelectionStrategyChannel.RttSampling;
 import com.palantir.tritium.metrics.registry.DefaultTaggedMetricRegistry;
 import java.time.Duration;
 import java.util.Optional;
@@ -65,7 +66,12 @@ class BalancedNodeSelectionStrategyChannelTest {
     @BeforeEach
     public void before() {
         channel = new BalancedNodeSelectionStrategyChannel(
-                ImmutableList.of(chan1, chan2), random, clock, new DefaultTaggedMetricRegistry(), "channelName");
+                ImmutableList.of(chan1, chan2),
+                random,
+                clock,
+                new DefaultTaggedMetricRegistry(),
+                "channelName",
+                RttSampling.DEFAULT_OFF);
     }
 
     @Test
@@ -149,6 +155,13 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu
 
     @Test
     void rtt_is_measured_and_can_influence_choices() {
+        channel = new BalancedNodeSelectionStrategyChannel(
+                ImmutableList.of(chan1, chan2),
+                random,
+                clock,
+                new DefaultTaggedMetricRegistry(),
+                "channelName",
+                RttSampling.ENABLED);
         incrementClockBy(Duration.ofHours(1));
 
         // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
@@ -198,7 +211,7 @@ void when_rtt_measurements_are_limited_dont_freak_out() {
     }
 
     @Test
-    void when_rtt_measurements_havent_returned_yet_dont_freak_out() {
+    void when_rtt_measurements_havent_returned_yet_consider_both_far_away() {
         incrementClockBy(Duration.ofHours(1));
         // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
         when(chan2.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
diff --git a/dialogue-jmh/src/main/java/com/palantir/dialogue/core/NodeSelectionBenchmark.java b/dialogue-jmh/src/main/java/com/palantir/dialogue/core/NodeSelectionBenchmark.java
index f58380571..d407339e6 100644
--- a/dialogue-jmh/src/main/java/com/palantir/dialogue/core/NodeSelectionBenchmark.java
+++ b/dialogue-jmh/src/main/java/com/palantir/dialogue/core/NodeSelectionBenchmark.java
@@ -26,6 +26,7 @@
 import com.palantir.dialogue.Response;
 import com.palantir.dialogue.TestEndpoint;
 import com.palantir.dialogue.TestResponse;
+import com.palantir.dialogue.core.BalancedNodeSelectionStrategyChannel.RttSampling;
 import com.palantir.logsafe.exceptions.SafeIllegalArgumentException;
 import com.palantir.random.SafeThreadLocalRandom;
 import com.palantir.tritium.metrics.registry.DefaultTaggedMetricRegistry;
@@ -123,8 +124,8 @@ public void before() {
                             "channelName");
                     break;
                 case ROUND_ROBIN:
-                    channel =
-                            new BalancedNodeSelectionStrategyChannel(channels, random, ticker, metrics, "channelName");
+                    channel = new BalancedNodeSelectionStrategyChannel(
+                            channels, random, ticker, metrics, "channelName", RttSampling.DEFAULT_OFF);
                     break;
                 default:
                     throw new SafeIllegalArgumentException("Unsupported");

From 4f73a902ea0190e67a63a3dd45fb034fe71b3ef5 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Fri, 29 May 2020 14:18:57 +0100
Subject: [PATCH 27/35] Allow servers to enable it with BALANCED_RTT

---
 .../core/DialogueNodeSelectionStrategy.java   | 29 +++++++++++++------
 .../core/NodeSelectionStrategyChannel.java    |  5 ++++
 .../DialogueNodeSelectionStrategyTest.java    |  4 +++
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategy.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategy.java
index 5261d51aa..94218ec43 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategy.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategy.java
@@ -16,6 +16,7 @@
 
 package com.palantir.dialogue.core;
 
+import com.google.common.annotations.Beta;
 import com.google.common.base.Splitter;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
@@ -35,6 +36,8 @@ enum DialogueNodeSelectionStrategy {
     PIN_UNTIL_ERROR,
     PIN_UNTIL_ERROR_WITHOUT_RESHUFFLE,
     BALANCED,
+    @Beta
+    BALANCED_RTT,
     UNKNOWN;
 
     private static final Logger log = LoggerFactory.getLogger(DialogueNodeSelectionStrategy.class);
@@ -45,17 +48,25 @@ static List<DialogueNodeSelectionStrategy> fromHeader(String header) {
                 Lists.transform(SPLITTER.splitToList(header), DialogueNodeSelectionStrategy::safeValueOf));
     }
 
-    private static DialogueNodeSelectionStrategy safeValueOf(String value) {
-        String normalizedValue = value.toUpperCase();
-        if (PIN_UNTIL_ERROR.name().equals(normalizedValue)) {
-            return PIN_UNTIL_ERROR;
-        } else if (PIN_UNTIL_ERROR_WITHOUT_RESHUFFLE.name().equals(normalizedValue)) {
-            return PIN_UNTIL_ERROR_WITHOUT_RESHUFFLE;
-        } else if (BALANCED.name().equals(normalizedValue)) {
-            return BALANCED;
+    /**
+     * We allow server-determined headers to access some incubating dialogue-specific strategies (e.g. BALANCED_RTT)
+     * which users can't normally configure.
+     */
+    private static DialogueNodeSelectionStrategy safeValueOf(String string) {
+        String uppercaseString = string.toUpperCase();
+
+        switch (uppercaseString) {
+            case "PIN_UNTIL_ERROR":
+                return PIN_UNTIL_ERROR;
+            case "PIN_UNTIL_ERROR_WITHOUT_RESHUFFLE":
+                return PIN_UNTIL_ERROR_WITHOUT_RESHUFFLE;
+            case "BALANCED":
+                return BALANCED;
+            case "BALANCED_RTT":
+                return BALANCED_RTT;
         }
 
-        log.info("Received unknown selection strategy {}", SafeArg.of("strategy", value));
+        log.info("Received unknown selection strategy {}", SafeArg.of("strategy", string));
         return UNKNOWN;
     }
 
diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java
index 69bf6155d..aad2ee31c 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/NodeSelectionStrategyChannel.java
@@ -144,6 +144,11 @@ private NodeSelectionChannel createNodeSelectionChannel(
                         .channel(new BalancedNodeSelectionStrategyChannel(
                                 channels, random, tick, metrics, channelName, RttSampling.DEFAULT_OFF))
                         .build();
+            case BALANCED_RTT:
+                return channelBuilder
+                        .channel(new BalancedNodeSelectionStrategyChannel(
+                                channels, random, tick, metrics, channelName, RttSampling.ENABLED))
+                        .build();
             case UNKNOWN:
         }
         throw new SafeRuntimeException("Unknown NodeSelectionStrategy", SafeArg.of("unknown", strategy));
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategyTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategyTest.java
index 91b11a2fb..757ebcb9e 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategyTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/DialogueNodeSelectionStrategyTest.java
@@ -32,6 +32,10 @@ void parses_single_strategy() {
     void parses_multiple_strategies() {
         assertThat(DialogueNodeSelectionStrategy.fromHeader("BALANCED, PIN_UNTIL_ERROR"))
                 .containsExactly(DialogueNodeSelectionStrategy.BALANCED, DialogueNodeSelectionStrategy.PIN_UNTIL_ERROR);
+        assertThat(DialogueNodeSelectionStrategy.fromHeader("BALANCED_RTT, BALANCED"))
+                .containsExactly(DialogueNodeSelectionStrategy.BALANCED_RTT, DialogueNodeSelectionStrategy.BALANCED);
+        assertThat(DialogueNodeSelectionStrategy.fromHeader("BALANCED_FUTURE_EXPERIMENT, BALANCED"))
+                .containsExactly(DialogueNodeSelectionStrategy.UNKNOWN, DialogueNodeSelectionStrategy.BALANCED);
     }
 
     @Test

From f3642056efa12e206c381a65fa6c96c2e8371841 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Fri, 29 May 2020 14:45:49 +0100
Subject: [PATCH 28/35] Re-run simulations

---
 ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 .../black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png  |  4 ++--
 ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 simulation/src/test/resources/report.md              | 12 ++++++------
 ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png |  4 ++--
 ...ll_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 .../black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt  |  2 +-
 ...ve_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ..._each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ..._rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 ...r_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt |  2 +-
 13 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 12fe6c646..d8a479f77 100644
--- a/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:516145937579fb582d7a76100b05fd7054b6357b95d560770988fc5b460e20f0
-size 115803
+oid sha256:17e4b03f7618600213e834e93b52bc594fa9e96f5cf6220030e284d6250a6176
+size 113249
diff --git a/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 94064ea2d..4f58c5438 100644
--- a/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7252511a753523b3869eea5007f48298746fab76bf097cc512767c9bb1699fc6
-size 103509
+oid sha256:301c818a4e266379a6c45dee81d725d1dad6f3c8d2abba49d38a1df8e68ac7ee
+size 103151
diff --git a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index b0f68688b..24e0252de 100644
--- a/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f6d4e200f4549b0c11a3bdd95af0c8e1dbafa2bb7fb7d2c5c27b48e9a9ed1d86
-size 122953
+oid sha256:e925ad02dd69d9f2a5ef8b47ad43aabd5a14106ce0937a73e9780a9a11f3c435
+size 123045
diff --git a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 437e4e988..aa3d69e55 100644
--- a/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e2f1f9109ba521a3226d4d9fb745c3b7c9c440efb0ccc4d71f7ee5037199054
-size 133708
+oid sha256:d492690bce9b84925c16675a75ff55af3cfb180b8b0f753c1fd036f317c2801c
+size 137045
diff --git a/simulation/src/test/resources/report.md b/simulation/src/test/resources/report.md
index c773eef36..892475d24 100644
--- a/simulation/src/test/resources/report.md
+++ b/simulation/src/test/resources/report.md
@@ -2,10 +2,10 @@
 <!-- Run SimulationTest to regenerate this report. -->
 ```
                 all_nodes_500[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=79.3%	client_mean=PT5.81342S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1586, 500=414}
-                    all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=73.6%	client_mean=PT2.91032S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1472, 500=528}
+                    all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=73.7%	client_mean=PT3.039455S    	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1474, 500=526}
                               all_nodes_500[UNLIMITED_ROUND_ROBIN].txt:	success=50.0%	client_mean=PT0.6S         	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1000, 500=1000}
                    black_hole[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=59.2%	client_mean=PT0.600655114S 	server_cpu=PT11M49.8S     	client_received=1183/2000	server_resps=1183	codes={200=1183}
-                       black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=91.4%	client_mean=PT0.6S         	server_cpu=PT18M16.8S     	client_received=1828/2000	server_resps=1828	codes={200=1828}
+                       black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=91.5%	client_mean=PT0.6S         	server_cpu=PT18M17.4S     	client_received=1829/2000	server_resps=1829	codes={200=1829}
                                  black_hole[UNLIMITED_ROUND_ROBIN].txt:	success=91.4%	client_mean=PT0.6S         	server_cpu=PT18M16.8S     	client_received=1828/2000	server_resps=1828	codes={200=1828}
              drastic_slowdown[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT2.947028083S 	server_cpu=PT41M8.862333314S	client_received=4000/4000	server_resps=4000	codes={200=4000}
                  drastic_slowdown[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.251969999S 	server_cpu=PT16M47.879999984S	client_received=4000/4000	server_resps=4000	codes={200=4000}
@@ -17,16 +17,16 @@
             fast_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.120107163S 	server_cpu=PT1H30M0.0000004S	client_received=45000/45000	server_resps=45040	codes={200=45000}
                       fast_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.120107163S 	server_cpu=PT1H30M0.0000004S	client_received=45000/45000	server_resps=45040	codes={200=45000}
                live_reloading[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=94.3%	client_mean=PT6.987708S    	server_cpu=PT1H56M8.59S   	client_received=2500/2500	server_resps=2500	codes={200=2357, 500=143}
-                   live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=84.0%	client_mean=PT4.5915112S   	server_cpu=PT1H52M50.27S  	client_received=2500/2500	server_resps=2500	codes={200=2101, 500=399}
+                   live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=84.0%	client_mean=PT4.6409416S   	server_cpu=PT1H53M13.43S  	client_received=2500/2500	server_resps=2500	codes={200=2101, 500=399}
                              live_reloading[UNLIMITED_ROUND_ROBIN].txt:	success=86.9%	client_mean=PT2.802124S    	server_cpu=PT1H56M45.31S  	client_received=2500/2500	server_resps=2500	codes={200=2173, 500=327}
                 one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT2.569766928S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
                     one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT1.521339525S 	server_cpu=PT2M30S        	client_received=1000/1000	server_resps=1000	codes={200=1000}
                               one_big_spike[UNLIMITED_ROUND_ROBIN].txt:	success=99.9%	client_mean=PT1.000523583S 	server_cpu=PT7M37.35S     	client_received=1000/1000	server_resps=3049	codes={200=999, 429=1}
 one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=64.9%	client_mean=PT8.1950896S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1623, 500=877}
- one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=64.2%	client_mean=PT3.9523312S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1605, 500=895}
+ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=65.4%	client_mean=PT4.0117872S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1636, 500=864}
            one_endpoint_dies_on_each_server[UNLIMITED_ROUND_ROBIN].txt:	success=65.1%	client_mean=PT0.6S         	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1628, 500=872}
       server_side_rate_limits[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT12M44.762591546S	server_cpu=PT9H55M40.4S   	client_received=150000/150000	server_resps=178702	codes={200=149989, 429=11}
-          server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.262702651S 	server_cpu=PT9H5M50.6S    	client_received=150000/150000	server_resps=163753	codes={200=150000}
+          server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.290099498S 	server_cpu=PT9H9M39.2S    	client_received=150000/150000	server_resps=164896	codes={200=150000}
                     server_side_rate_limits[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.217416858S 	server_cpu=PT8H45M29.2S   	client_received=150000/150000	server_resps=157646	codes={200=150000}
        simplest_possible_case[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT0.834469696S 	server_cpu=PT3H3M35S      	client_received=13200/13200	server_resps=13200	codes={200=13200}
            simplest_possible_case[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.785757575S 	server_cpu=PT2H52M52S     	client_received=13200/13200	server_resps=13200	codes={200=13200}
@@ -35,7 +35,7 @@ one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	succe
             slow_503s_then_revert[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.088828705S 	server_cpu=PT4M22.364666657S	client_received=3000/3000	server_resps=3031	codes={200=3000}
                       slow_503s_then_revert[UNLIMITED_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT0.088828705S 	server_cpu=PT4M22.364666657S	client_received=3000/3000	server_resps=3031	codes={200=3000}
 slowdown_and_error_thresholds[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=100.0%	client_mean=PT2M28.388896928S	server_cpu=PT8H48M18.546665835S	client_received=10000/10000	server_resps=10899	codes={200=10000}
-    slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT3M18.314817484S	server_cpu=PT13H50M38.666666634S	client_received=10000/10000	server_resps=13148	codes={200=9995, 500=5}
+    slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=100.0%	client_mean=PT3M24.308654421S	server_cpu=PT14H3M44.353333281S	client_received=10000/10000	server_resps=13383	codes={200=9999, 500=1}
               slowdown_and_error_thresholds[UNLIMITED_ROUND_ROBIN].txt:	success=3.6%	client_mean=PT21.551691121S	server_cpu=PT54H45M57.899999949S	client_received=10000/10000	server_resps=49335	codes={200=360, 500=9640}
               uncommon_flakes[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt:	success=99.0%	client_mean=PT0.000001S    	server_cpu=PT0.01S        	client_received=10000/10000	server_resps=10000	codes={200=9900, 500=100}
                   uncommon_flakes[CONCURRENCY_LIMITER_ROUND_ROBIN].txt:	success=99.0%	client_mean=PT0.000001S    	server_cpu=PT0.01S        	client_received=10000/10000	server_resps=10000	codes={200=9900, 500=100}
diff --git a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 4a1422710..3c1b66b6a 100644
--- a/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:29856bd2342821f7d0c0bb19b4b1efdcaed1af8b3c36cb2533485c55e0e00bfd
-size 560290
+oid sha256:33cfa2b188b84d30e139b5a8ff39a646f65588b5cf7b5755a2a61993c9d38c8e
+size 606983
diff --git a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
index 91ad7456a..f5dcc152c 100644
--- a/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
+++ b/simulation/src/test/resources/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dc3169a632ae8fd7de4e96ba5bb2d39b99e97e12416d37c5fb6516a452a1eedf
-size 119379
+oid sha256:160c566ff4a67cdd8ecddbb91c2262d1310a4925728dadb36ff81bf9a9237875
+size 120294
diff --git a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 71eaf7e5e..a0433ed68 100644
--- a/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/all_nodes_500[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=73.6%	client_mean=PT2.91032S     	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1472, 500=528}
\ No newline at end of file
+success=73.7%	client_mean=PT3.039455S    	server_cpu=PT20M          	client_received=2000/2000	server_resps=2000	codes={200=1474, 500=526}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 5c1da373e..5b70ecd5f 100644
--- a/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/black_hole[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=91.4%	client_mean=PT0.6S         	server_cpu=PT18M16.8S     	client_received=1828/2000	server_resps=1828	codes={200=1828}
\ No newline at end of file
+success=91.5%	client_mean=PT0.6S         	server_cpu=PT18M17.4S     	client_received=1829/2000	server_resps=1829	codes={200=1829}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index e253a01c0..46b26cdea 100644
--- a/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=84.0%	client_mean=PT4.5915112S   	server_cpu=PT1H52M50.27S  	client_received=2500/2500	server_resps=2500	codes={200=2101, 500=399}
\ No newline at end of file
+success=84.0%	client_mean=PT4.6409416S   	server_cpu=PT1H53M13.43S  	client_received=2500/2500	server_resps=2500	codes={200=2101, 500=399}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index a4008cf4c..561f056b7 100644
--- a/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=64.2%	client_mean=PT3.9523312S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1605, 500=895}
\ No newline at end of file
+success=65.4%	client_mean=PT4.0117872S   	server_cpu=PT25M          	client_received=2500/2500	server_resps=2500	codes={200=1636, 500=864}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index bfc7b44c2..ffdfaca53 100644
--- a/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/server_side_rate_limits[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT0.262702651S 	server_cpu=PT9H5M50.6S    	client_received=150000/150000	server_resps=163753	codes={200=150000}
\ No newline at end of file
+success=100.0%	client_mean=PT0.290099498S 	server_cpu=PT9H9M39.2S    	client_received=150000/150000	server_resps=164896	codes={200=150000}
\ No newline at end of file
diff --git a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
index 9fe47e0ea..d250cf150 100644
--- a/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
+++ b/simulation/src/test/resources/txt/slowdown_and_error_thresholds[CONCURRENCY_LIMITER_ROUND_ROBIN].txt
@@ -1 +1 @@
-success=100.0%	client_mean=PT3M18.314817484S	server_cpu=PT13H50M38.666666634S	client_received=10000/10000	server_resps=13148	codes={200=9995, 500=5}
\ No newline at end of file
+success=100.0%	client_mean=PT3M24.308654421S	server_cpu=PT14H3M44.353333281S	client_received=10000/10000	server_resps=13383	codes={200=9999, 500=1}
\ No newline at end of file

From 6f9804ad4818432992db0cbe8859d3d1a9493671 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Fri, 29 May 2020 14:48:43 +0100
Subject: [PATCH 29/35] Fix tests

---
 ...ancedNodeSelectionStrategyChannelTest.java | 43 ++++++++++---------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index 606a94e5e..c95d68ec6 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -59,6 +59,7 @@ class BalancedNodeSelectionStrategyChannelTest {
 
     private Endpoint endpoint = TestEndpoint.GET;
     private BalancedNodeSelectionStrategyChannel channel;
+    private BalancedNodeSelectionStrategyChannel rttChannel;
 
     @Mock
     Ticker clock;
@@ -72,6 +73,13 @@ public void before() {
                 new DefaultTaggedMetricRegistry(),
                 "channelName",
                 RttSampling.DEFAULT_OFF);
+        rttChannel = new BalancedNodeSelectionStrategyChannel(
+                ImmutableList.of(chan1, chan2),
+                random,
+                clock,
+                new DefaultTaggedMetricRegistry(),
+                "channelName",
+                RttSampling.ENABLED);
     }
 
     @Test
@@ -133,35 +141,28 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu
         when(chan2.maybeExecute(any(), any())).thenReturn(http(200));
 
         for (int i = 0; i < 11; i++) {
-            channel.maybeExecute(endpoint, request);
-            assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
-                    .describedAs("%s %s: Scores not affected yet %s", i, Duration.ofNanos(clock.read()), channel)
+            rttChannel.maybeExecute(endpoint, request);
+            assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore()))
+                    .describedAs("%s %s: Scores not affected yet %s", i, Duration.ofNanos(clock.read()), rttChannel)
                     .containsExactly(0, 0);
             incrementClockBy(Duration.ofMillis(50));
         }
-        channel.maybeExecute(endpoint, request);
-        assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
-                .describedAs("%s: Constant 4xxs did move the needle %s", Duration.ofNanos(clock.read()), channel)
+        rttChannel.maybeExecute(endpoint, request);
+        assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore()))
+                .describedAs("%s: Constant 4xxs did move the needle %s", Duration.ofNanos(clock.read()), rttChannel)
                 .containsExactly(1, 0);
 
         incrementClockBy(Duration.ofSeconds(5));
 
-        assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
+        assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore()))
                 .describedAs(
                         "%s: We quickly forget about 4xxs and go back to fair shuffling %s",
-                        Duration.ofNanos(clock.read()), channel)
+                        Duration.ofNanos(clock.read()), rttChannel)
                 .containsExactly(0, 0);
     }
 
     @Test
     void rtt_is_measured_and_can_influence_choices() {
-        channel = new BalancedNodeSelectionStrategyChannel(
-                ImmutableList.of(chan1, chan2),
-                random,
-                clock,
-                new DefaultTaggedMetricRegistry(),
-                "channelName",
-                RttSampling.ENABLED);
         incrementClockBy(Duration.ofHours(1));
 
         // when(chan1.maybeExecute(eq(endpoint), any())).thenReturn(http(200));
@@ -173,7 +174,7 @@ void rtt_is_measured_and_can_influence_choices() {
         when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan1OptionsResponse));
         when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.of(chan2OptionsResponse));
 
-        channel.maybeExecute(endpoint, request);
+        rttChannel.maybeExecute(endpoint, request);
 
         incrementClockBy(Duration.ofNanos(123));
         chan1OptionsResponse.set(new TestResponse().code(200));
@@ -181,13 +182,13 @@ void rtt_is_measured_and_can_influence_choices() {
         incrementClockBy(Duration.ofNanos(456));
         chan2OptionsResponse.set(new TestResponse().code(200));
 
-        assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
+        assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore()))
                 .describedAs("The poor latency of channel2 imposes a small constant penalty in the score")
                 .containsExactly(0, 3);
 
         for (int i = 0; i < 500; i++) {
             incrementClockBy(Duration.ofMillis(10));
-            channel.maybeExecute(endpoint, request);
+            rttChannel.maybeExecute(endpoint, request);
         }
         // rate limiter ensures a sensible amount of rtt sampling
         verify(chan1, times(6)).maybeExecute(eq(rttEndpoint), any());
@@ -205,7 +206,7 @@ void when_rtt_measurements_are_limited_dont_freak_out() {
         when(chan1.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty());
         when(chan2.maybeExecute(eq(rttEndpoint), any())).thenReturn(Optional.empty());
 
-        channel.maybeExecute(endpoint, request);
+        rttChannel.maybeExecute(endpoint, request);
 
         assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0);
     }
@@ -222,10 +223,10 @@ void when_rtt_measurements_havent_returned_yet_consider_both_far_away() {
 
         for (int i = 0; i < 20; i++) {
             incrementClockBy(Duration.ofSeconds(5));
-            channel.maybeExecute(endpoint, request);
+            rttChannel.maybeExecute(endpoint, request);
         }
 
-        assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0);
+        assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0);
         verify(chan1, times(1)).maybeExecute(eq(rttEndpoint), any());
         verify(chan2, times(1)).maybeExecute(eq(rttEndpoint), any());
     }

From 0128c4eca6327b5179cdaf2c2add6b23f14f74db Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Fri, 29 May 2020 16:53:55 +0100
Subject: [PATCH 30/35] Ensure we send a good user agent with these OPTIONS
 requests

---
 .../com/palantir/dialogue/core/RttSampler.java | 18 ++++++++++++++----
 .../core/UserAgentEndpointChannel.java         |  9 +++++++--
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
index 9873cde9f..3c0f463ed 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
@@ -23,6 +23,9 @@
 import com.google.common.util.concurrent.Futures;
 import com.google.common.util.concurrent.ListenableFuture;
 import com.google.common.util.concurrent.MoreExecutors;
+import com.palantir.conjure.java.api.config.service.UserAgent;
+import com.palantir.conjure.java.api.config.service.UserAgent.Agent;
+import com.palantir.conjure.java.api.config.service.UserAgents;
 import com.palantir.dialogue.Endpoint;
 import com.palantir.dialogue.HttpMethod;
 import com.palantir.dialogue.Request;
@@ -48,6 +51,9 @@
 
 final class RttSampler {
     private static final Logger log = LoggerFactory.getLogger(RttSampler.class);
+    private static final String USER_AGENT =
+            UserAgents.format(UserAgent.of(Agent.of(RttEndpoint.INSTANCE.serviceName(), RttEndpoint.INSTANCE.version()))
+                    .addAgent(UserAgentEndpointChannel.DIALOGUE_AGENT));
 
     private final ImmutableList<LimitedChannel> channels;
     private final RttMeasurement[] rtts;
@@ -79,12 +85,16 @@ void maybeSampleRtts() {
             return;
         }
 
+        Request rttRequest = Request.builder()
+                // necessary as we've already gone through the UserAgentEndpointChannel
+                .putHeaderParams("user-agent", USER_AGENT)
+                .build();
+
         List<ListenableFuture<Long>> futures = IntStream.range(0, channels.size())
                 .mapToObj(i -> {
                     long before = clock.read();
                     return channels.get(i)
-                            .maybeExecute(
-                                    RttEndpoint.INSTANCE, Request.builder().build())
+                            .maybeExecute(RttEndpoint.INSTANCE, rttRequest)
                             .map(future -> Futures.transform(
                                     future,
                                     _response -> {
@@ -140,7 +150,7 @@ public HttpMethod httpMethod() {
 
         @Override
         public String serviceName() {
-            return "Balanced";
+            return "RttSampler";
         }
 
         @Override
@@ -150,7 +160,7 @@ public String endpointName() {
 
         @Override
         public String version() {
-            return "0.0.0";
+            return UserAgentEndpointChannel.dialogueVersion();
         }
     }
 
diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/UserAgentEndpointChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/UserAgentEndpointChannel.java
index 0a116dd79..46a5a5f88 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/UserAgentEndpointChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/UserAgentEndpointChannel.java
@@ -31,7 +31,7 @@
  * {@link Endpoint}'s target service and endpoint.
  */
 final class UserAgentEndpointChannel implements EndpointChannel {
-    private static final UserAgent.Agent DIALOGUE_AGENT = extractDialogueAgent();
+    static final UserAgent.Agent DIALOGUE_AGENT = extractDialogueAgent();
 
     private final EndpointChannel delegate;
     private final String userAgent;
@@ -62,8 +62,13 @@ private static UserAgent augmentUserAgent(UserAgent baseAgent, Endpoint endpoint
     }
 
     private static UserAgent.Agent extractDialogueAgent() {
+        String version = dialogueVersion();
+        return UserAgent.Agent.of("dialogue", version);
+    }
+
+    static String dialogueVersion() {
         String maybeDialogueVersion = Channel.class.getPackage().getImplementationVersion();
-        return UserAgent.Agent.of("dialogue", maybeDialogueVersion != null ? maybeDialogueVersion : "0.0.0");
+        return maybeDialogueVersion != null ? maybeDialogueVersion : "0.0.0";
     }
 
     @Override

From 1af6a22d361f554b52f23994254be60a34a75225 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Fri, 29 May 2020 17:00:03 +0100
Subject: [PATCH 31/35] More CR

---
 .../BalancedNodeSelectionStrategyChannel.java  |  3 ++-
 .../com/palantir/dialogue/core/RttSampler.java | 18 +++++++++---------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 526728470..97313832d 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -49,7 +49,8 @@
 
 /**
  * Chooses nodes based on stats about each channel, i.e. how many requests are currently
- * being served and also how many failures have been seen in the last few seconds.
+ * being served, how many failures have been seen in the last few seconds and (optionally) also what the best latency
+ * to each node is. Use {@link RttSampling#ENABLED} to switch this on.
  *
  * This is intended to be a strict improvement over Round Robin and Random Selection which can leave fast servers
  * underutilized, as it sends the same number to both a slow and fast node. It is *not* appropriate for transactional
diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
index 3c0f463ed..67bc07e0f 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
@@ -41,7 +41,6 @@
 import java.util.OptionalLong;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.LongSupplier;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
@@ -80,7 +79,7 @@ RttSupplier get(int index) {
      * Non-blocking.
      */
     void maybeSampleRtts() {
-        Optional<RttMeasurementPermit> maybePermit = rateLimiter.acquireNonBlocking();
+        Optional<RttMeasurementPermit> maybePermit = rateLimiter.tryAcquire();
         if (!maybePermit.isPresent()) {
             return;
         }
@@ -210,22 +209,23 @@ private static final class RttMeasurementRateLimiter {
         private static final long BETWEEN_SAMPLES = Duration.ofSeconds(1).toNanos();
 
         private final Ticker clock;
-        private final AtomicLong lastMeasured = new AtomicLong(0);
         private final AtomicBoolean currentlySampling = new AtomicBoolean(false);
+        private volatile long lastMeasured = 0;
+
+        private final RttMeasurementPermit finishedSampling = () -> currentlySampling.set(false);
 
         private RttMeasurementRateLimiter(Ticker clock) {
             this.clock = clock;
         }
 
-        Optional<RttMeasurementPermit> acquireNonBlocking() {
-            long last = lastMeasured.get();
-            if (last + BETWEEN_SAMPLES > clock.read()) {
+        Optional<RttMeasurementPermit> tryAcquire() {
+            if (lastMeasured + BETWEEN_SAMPLES > clock.read()) {
                 return Optional.empty();
             }
 
-            if (currentlySampling.compareAndSet(false, true)) {
-                lastMeasured.set(clock.read());
-                return Optional.of(() -> currentlySampling.compareAndSet(true, false));
+            if (!currentlySampling.get() && currentlySampling.compareAndSet(false, true)) {
+                lastMeasured = clock.read();
+                return Optional.of(finishedSampling);
             } else {
                 log.warn("Wanted to sample RTTs but an existing sample was still in progress");
                 return Optional.empty();

From f1566379949b45dbcb162186cc0479e7ee5ddddb Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Fri, 29 May 2020 17:23:43 +0100
Subject: [PATCH 32/35] Move logic to RttSampler

---
 .../BalancedNodeSelectionStrategyChannel.java | 55 ++++-------------
 .../palantir/dialogue/core/RttSampler.java    | 61 +++++++++++++++----
 2 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index 97313832d..f0be496ef 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -38,7 +38,6 @@
 import java.util.Comparator;
 import java.util.List;
 import java.util.Optional;
-import java.util.OptionalLong;
 import java.util.Random;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.IntStream;
@@ -133,50 +132,18 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
     }
 
     private static SortableChannel[] computeScores(
-            @Nullable RttSampler rttSampler, List<MutableChannelWithStats> preShuffled) {
-        SortableChannel[] snapshotArray = new SortableChannel[preShuffled.size()];
-
-        if (rttSampler == null) {
-
-            for (int i = 0; i < preShuffled.size(); i++) {
-                SortableChannel snapshot = preShuffled.get(i).computeScore(0);
-                snapshotArray[i] = snapshot;
-            }
-            return snapshotArray;
-
-        } else {
-            // Latency (rtt) is measured in nanos, which is a tricky unit to include in our 'score' because adding
-            // it would dominate all the other data (which has the unit of 'num requests'). To avoid the need for a
-            // conversion fudge-factor, we instead figure out where each rtt lies on the spectrum from bestRttNanos
-            // to worstRttNanos, with 0 being best and 1 being worst. This ensures that if several nodes are all
-            // within the same AZ and can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have
-            // a similar rttScore (near zero). Note, this can only be computed when we have all the snapshots in
-            // front of us.
-            long bestRttNanos = Long.MAX_VALUE;
-            long worstRttNanos = 0;
-
-            OptionalLong[] rtts = new OptionalLong[preShuffled.size()];
-            for (int i = 0; i < preShuffled.size(); i++) {
-                OptionalLong rtt = rttSampler.get(i).getRttNanos();
-                rtts[i] = rtt;
-
-                if (rtt.isPresent()) {
-                    bestRttNanos = Math.min(bestRttNanos, rtt.getAsLong());
-                    worstRttNanos = Math.max(worstRttNanos, rtt.getAsLong());
-                }
-            }
-            long rttRange = worstRttNanos - bestRttNanos;
-
-            for (int i = 0; i < preShuffled.size(); i++) {
-                OptionalLong rtt = rtts[i];
-                float rttSpectrum =
-                        (rttRange > 0 && rtt.isPresent()) ? ((float) rtt.getAsLong() - bestRttNanos) / rttRange : 0;
-
-                SortableChannel snapshot = preShuffled.get(i).computeScore(rttSpectrum);
-                snapshotArray[i] = snapshot;
-            }
-            return snapshotArray;
+            @Nullable RttSampler rttSampler, List<MutableChannelWithStats> chans) {
+        // if the feature is disabled (i.e. RttSampling.DEFAULT_OFF), then we just consider every host to have a
+        // rttSpectrum of '0'
+        float[] rttSpectrums = rttSampler != null ? rttSampler.computeRttSpectrums() : new float[chans.size()];
+
+        SortableChannel[] snapshotArray = new SortableChannel[chans.size()];
+        for (int i = 0; i < chans.size(); i++) {
+            MutableChannelWithStats channel = chans.get(i);
+            float rttSpectrum = rttSpectrums[i];
+            snapshotArray[i] = channel.computeScore(rttSpectrum);
         }
+        return snapshotArray;
     }
 
     /** Returns a new shuffled list, without mutating the input list (which may be immutable). */
diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
index 67bc07e0f..713f15797 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
@@ -41,7 +41,6 @@
 import java.util.OptionalLong;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.function.LongSupplier;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import javax.annotation.concurrent.ThreadSafe;
@@ -69,14 +68,53 @@ final class RttSampler {
     }
 
     /**
-     * The {@link LongSupplier} returns the RTT in nanoseconds, or {@link OptionalLong#empty()} if we have no data.
+     * Latency (rtt) is measured in nanos, which is a tricky unit to include in our 'score' because adding
+     * it would dominate all the other data (which has the unit of 'num requests'). To avoid the need for a
+     * conversion fudge-factor, we instead figure out where each rtt lies on the spectrum from bestRttNanos
+     * to worstRttNanos, with 0 being best and 1 being worst. This ensures that if several nodes are all
+     * within the same AZ and can return in ~1 ms but others return in ~5ms, the 1ms nodes will all have
+     * a similar rttScore (near zero). Note, this can only be computed when we have all the snapshots in
+     * front of us.
      */
-    RttSupplier get(int index) {
-        return rtts[index];
+    float[] computeRttSpectrums() {
+        long bestRttNanos = Long.MAX_VALUE;
+        long worstRttNanos = 0;
+
+        // first we take a snapshot of all channels' RTT
+        OptionalLong[] snapshots = new OptionalLong[rtts.length];
+        for (int i = 0; i < rtts.length; i++) {
+            OptionalLong rtt = rtts[i].getRttNanos();
+            snapshots[i] = rtt;
+
+            if (rtt.isPresent()) {
+                bestRttNanos = Math.min(bestRttNanos, rtt.getAsLong());
+                worstRttNanos = Math.max(worstRttNanos, rtt.getAsLong());
+            }
+        }
+
+        // given the best & worst values, we can then compute the spectrums
+        float[] spectrums = new float[rtts.length];
+        long rttRange = worstRttNanos - bestRttNanos;
+        if (rttRange <= 0) {
+            return spectrums;
+        }
+
+        for (int i = 0; i < channels.size(); i++) {
+            OptionalLong rtt = snapshots[i];
+            float rttSpectrum = rtt.isPresent() ? ((float) rtt.getAsLong() - bestRttNanos) / rttRange : 0;
+            Preconditions.checkState(
+                    0 <= rttSpectrum && rttSpectrum <= 1,
+                    "rttSpectrum must be between 0 and 1",
+                    SafeArg.of("value", rttSpectrum),
+                    SafeArg.of("hostIndex", i));
+            spectrums[i] = rttSpectrum;
+        }
+
+        return spectrums;
     }
 
     /**
-     * Non-blocking.
+     * Non-blocking - should return pretty much instantly.
      */
     void maybeSampleRtts() {
         Optional<RttMeasurementPermit> maybePermit = rateLimiter.tryAcquire();
@@ -106,8 +144,6 @@ void maybeSampleRtts() {
                 })
                 .collect(ImmutableList.toImmutableList());
 
-        // the RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off
-        // more and more and more requests and eventually exhaust a threadpool
         DialogueFutures.addDirectCallback(Futures.allAsList(futures), new FutureCallback<List<Long>>() {
             @Override
             public void onSuccess(List<Long> result) {
@@ -169,7 +205,7 @@ public String version() {
      */
     @VisibleForTesting
     @ThreadSafe
-    static final class RttMeasurement implements RttSupplier {
+    static final class RttMeasurement {
         private static final int NUM_MEASUREMENTS = 5;
 
         private final long[] samples;
@@ -180,7 +216,6 @@ static final class RttMeasurement implements RttSupplier {
             Arrays.fill(samples, Long.MAX_VALUE);
         }
 
-        @Override
         public OptionalLong getRttNanos() {
             return bestRttNanos == Long.MAX_VALUE ? OptionalLong.empty() : OptionalLong.of(bestRttNanos);
         }
@@ -218,6 +253,10 @@ private RttMeasurementRateLimiter(Ticker clock) {
             this.clock = clock;
         }
 
+        /**
+         * The RttSamplePermit ensures that if a server black-holes one of our OPTIONS requests, we don't kick off
+         * more and more and more requests and eventually exhaust a threadpool. Permit is released in a future callback.
+         */
         Optional<RttMeasurementPermit> tryAcquire() {
             if (lastMeasured + BETWEEN_SAMPLES > clock.read()) {
                 return Optional.empty();
@@ -237,8 +276,4 @@ private interface RttMeasurementPermit extends Closeable {
         @Override
         void close();
     }
-
-    interface RttSupplier {
-        OptionalLong getRttNanos();
-    }
 }

From f62429d8e94d7d6497d9471f48d7a7b8f24cc3b1 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Fri, 29 May 2020 17:31:00 +0100
Subject: [PATCH 33/35] Appease errorprone

---
 .../core/BalancedNodeSelectionStrategyChannel.java |  5 ++---
 .../com/palantir/dialogue/core/RttSampler.java     |  3 ++-
 .../BalancedNodeSelectionStrategyChannelTest.java  | 14 +++++++-------
 .../java/com/palantir/dialogue/OkHttpChannel.java  |  5 -----
 4 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index f0be496ef..b5dd0242a 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -41,7 +41,6 @@
 import java.util.Random;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.IntStream;
-import java.util.stream.Stream;
 import javax.annotation.Nullable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -154,8 +153,8 @@ private static <T> List<T> shuffleImmutableList(ImmutableList<T> sourceList, Ran
     }
 
     @VisibleForTesting
-    Stream<SortableChannel> getScoresForTesting() {
-        return Arrays.stream(computeScores(rttSampler, channels));
+    IntStream getScoresForTesting() {
+        return Arrays.stream(computeScores(rttSampler, channels)).mapToInt(SortableChannel::getScore);
     }
 
     @Override
diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
index 713f15797..28aabd5e6 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
@@ -63,7 +63,7 @@ final class RttSampler {
         this.rateLimiter = new RttMeasurementRateLimiter(clock);
         this.clock = clock;
         this.rtts = IntStream.range(0, channels.size())
-                .mapToObj(i -> new RttMeasurement())
+                .mapToObj(_i -> new RttMeasurement())
                 .toArray(RttMeasurement[]::new);
     }
 
@@ -247,6 +247,7 @@ private static final class RttMeasurementRateLimiter {
         private final AtomicBoolean currentlySampling = new AtomicBoolean(false);
         private volatile long lastMeasured = 0;
 
+        @SuppressWarnings("UnnecessaryLambda") // just let me avoid allocations
         private final RttMeasurementPermit finishedSampling = () -> currentlySampling.set(false);
 
         private RttMeasurementRateLimiter(Ticker clock) {
diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
index c95d68ec6..e645978cc 100644
--- a/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
+++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannelTest.java
@@ -126,7 +126,7 @@ void a_single_4xx_doesnt_move_the_needle() {
                 clock.read() < start + Duration.ofSeconds(10).toNanos();
                 incrementClockBy(Duration.ofMillis(50))) {
             channel.maybeExecute(endpoint, request);
-            assertThat(channel.getScoresForTesting().map(c -> c.getScore()))
+            assertThat(channel.getScoresForTesting())
                     .describedAs("A single 400 at the beginning isn't enough to impact scores", channel)
                     .containsExactly(0, 0);
         }
@@ -142,19 +142,19 @@ void constant_4xxs_do_eventually_move_the_needle_but_we_go_back_to_fair_distribu
 
         for (int i = 0; i < 11; i++) {
             rttChannel.maybeExecute(endpoint, request);
-            assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore()))
+            assertThat(rttChannel.getScoresForTesting())
                     .describedAs("%s %s: Scores not affected yet %s", i, Duration.ofNanos(clock.read()), rttChannel)
                     .containsExactly(0, 0);
             incrementClockBy(Duration.ofMillis(50));
         }
         rttChannel.maybeExecute(endpoint, request);
-        assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore()))
+        assertThat(rttChannel.getScoresForTesting())
                 .describedAs("%s: Constant 4xxs did move the needle %s", Duration.ofNanos(clock.read()), rttChannel)
                 .containsExactly(1, 0);
 
         incrementClockBy(Duration.ofSeconds(5));
 
-        assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore()))
+        assertThat(rttChannel.getScoresForTesting())
                 .describedAs(
                         "%s: We quickly forget about 4xxs and go back to fair shuffling %s",
                         Duration.ofNanos(clock.read()), rttChannel)
@@ -182,7 +182,7 @@ void rtt_is_measured_and_can_influence_choices() {
         incrementClockBy(Duration.ofNanos(456));
         chan2OptionsResponse.set(new TestResponse().code(200));
 
-        assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore()))
+        assertThat(rttChannel.getScoresForTesting())
                 .describedAs("The poor latency of channel2 imposes a small constant penalty in the score")
                 .containsExactly(0, 3);
 
@@ -208,7 +208,7 @@ void when_rtt_measurements_are_limited_dont_freak_out() {
 
         rttChannel.maybeExecute(endpoint, request);
 
-        assertThat(channel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0);
+        assertThat(channel.getScoresForTesting()).containsExactly(0, 0);
     }
 
     @Test
@@ -226,7 +226,7 @@ void when_rtt_measurements_havent_returned_yet_consider_both_far_away() {
             rttChannel.maybeExecute(endpoint, request);
         }
 
-        assertThat(rttChannel.getScoresForTesting().map(c -> c.getScore())).containsExactly(0, 0);
+        assertThat(rttChannel.getScoresForTesting()).containsExactly(0, 0);
         verify(chan1, times(1)).maybeExecute(eq(rttEndpoint), any());
         verify(chan2, times(1)).maybeExecute(eq(rttEndpoint), any());
     }
diff --git a/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java b/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java
index eb43faa3f..593466dce 100644
--- a/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java
+++ b/dialogue-okhttp-client/src/main/java/com/palantir/dialogue/OkHttpChannel.java
@@ -98,11 +98,6 @@ public ListenableFuture<Response> execute(Endpoint endpoint, Request request) {
             case PATCH:
                 okRequest = okRequest.patch(toOkHttpBody(request.body()));
                 break;
-            case OPTIONS:
-                okRequest = okRequest.method(
-                        "OPTIONS",
-                        request.body().isPresent() ? toOkHttpBody(request.body().get()) : null);
-                break;
         }
 
         // Fill headers

From d873cd1f9471cfe925fe1d455b19a47c433aa9c2 Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Fri, 29 May 2020 17:35:00 +0100
Subject: [PATCH 34/35] Minimise diff

---
 .../BalancedNodeSelectionStrategyChannel.java | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
index b5dd0242a..d75f9a15c 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/BalancedNodeSelectionStrategyChannel.java
@@ -222,12 +222,16 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
         }
 
         SortableChannel computeScore(float rttSpectrum) {
+            int requestsInflight = inflight.get();
+            double failureReservoir = recentFailuresReservoir.get();
+
             // it's important that scores are integers because if we kept the full double precision, then a single 4xx
             // would end up influencing host selection long beyond its intended lifespan in the absence of other data.
-            int score = inflight.get()
-                    + Ints.saturatedCast(Math.round(recentFailuresReservoir.get()))
+            int score = requestsInflight
+                    + Ints.saturatedCast(Math.round(failureReservoir))
                     + Ints.saturatedCast(Math.round(rttSpectrum * RTT_WEIGHT));
 
+            observability.debugLogComputedScore(requestsInflight, failureReservoir, rttSpectrum, score);
             return new SortableChannel(score, this);
         }
 
@@ -245,14 +249,13 @@ public String toString() {
      * A dedicated value class ensures safe sorting, as otherwise there's a risk that the inflight AtomicInteger
      * might change mid-sort, leading to undefined behaviour.
      */
-    @VisibleForTesting
-    static final class SortableChannel {
+    private static final class SortableChannel {
         private final int score;
         private final MutableChannelWithStats delegate;
 
         SortableChannel(int score, MutableChannelWithStats delegate) {
-            this.delegate = delegate;
             this.score = score;
+            this.delegate = delegate;
         }
 
         int getScore() {
@@ -332,6 +335,19 @@ void debugLogStatusFailure(Response response) {
             }
         }
 
+        void debugLogComputedScore(int inflight, double failures, float rttSpectrum, int score) {
+            if (log.isDebugEnabled()) {
+                log.debug(
+                        "Computed score",
+                        channelName,
+                        hostIndex,
+                        SafeArg.of("score", score),
+                        SafeArg.of("inflight", inflight),
+                        SafeArg.of("failures", failures),
+                        SafeArg.of("rttSpectrum", rttSpectrum));
+            }
+        }
+
         static PerHostObservability create(
                 ImmutableList<LimitedChannel> channels,
                 TaggedMetricRegistry taggedMetrics,

From f6f69171fb829bab834abd73dfb4c92a06c6b15d Mon Sep 17 00:00:00 2001
From: Dan Fox <dfox@palantir.com>
Date: Mon, 1 Jun 2020 13:18:16 +0100
Subject: [PATCH 35/35] Remember to close the response!

---
 .../src/main/java/com/palantir/dialogue/core/RttSampler.java   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
index 28aabd5e6..237c985b2 100644
--- a/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
+++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/RttSampler.java
@@ -134,9 +134,10 @@ void maybeSampleRtts() {
                             .maybeExecute(RttEndpoint.INSTANCE, rttRequest)
                             .map(future -> Futures.transform(
                                     future,
-                                    _response -> {
+                                    response -> {
                                         long durationNanos = clock.read() - before;
                                         rtts[i].addMeasurement(durationNanos);
+                                        response.close();
                                         return durationNanos;
                                     },
                                     MoreExecutors.directExecutor()))