Implement first version of JMH microbenchmarks #18

Merged: 5 commits, May 3, 2024
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -13,6 +13,26 @@ The project is configured to be built via Gradle (Gradle 8.7). It also targets J
* To skip tests: `./gradlew build -x test -x integrationTest`
* To apply formatting `./gradlew spotlessApply`

# Microbenchmarks

We have a basic set of micro-benchmarks covering full sequential reads, forward seeks, backward seeks, and a
Parquet-like ("jumping around") read pattern.

## Data Generation

Our JMH micro-benchmarks run against S3, so before running them you must generate benchmark data. First tell the
generator where to put this random data by setting two environment variables: `BENCHMARK_BUCKET` (the bucket to use)
and `BENCHMARK_DATA_PREFIX` (a common prefix under which all micro-benchmark data will live). Then run `main` in
`SequentialReadDataGenerator` to generate the data.
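As a sketch (the bucket and prefix below are placeholders, not real values), setting this up from a shell might look like:

```shell
# Placeholders: substitute your own bucket and prefix.
export BENCHMARK_BUCKET="my-benchmark-bucket"
export BENCHMARK_DATA_PREFIX="s3-connector-benchmarks/"

# With the variables set, run `main` in SequentialReadDataGenerator (from your
# IDE, or via a Gradle JavaExec task if the project defines one -- an assumption).
```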

## Running the Benchmarks

Just run `./gradlew jmh --rerun`. (The `--rerun` flag works around a Gradle quirk: you may want to re-run benchmarks
even when the project source has not changed, and `--rerun` disables the up-to-date check that would otherwise skip
the task.)
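Since the build configures `resultFormat = "JSON"`, the plugin also writes machine-readable results (under
`build/results/jmh/`, which is an assumption about the plugin's default output path). One quick way to summarise
scores is with `jq`, shown here against a minimal sample file in JMH's JSON result shape:

```shell
# A minimal sample in JMH's JSON result shape (illustrative values, not real measurements).
cat > /tmp/jmh-sample.json <<'EOF'
[
  {
    "benchmark": "com.amazon.connector.s3.benchmark.SequentialReadBenchmark.testSequentialRead__withSeekableStream",
    "primaryMetric": { "score": 1.234, "scoreUnit": "s/op" }
  }
]
EOF

# Print "<benchmark> <score> <unit>" for each entry; the same command works on
# the real results file produced by `./gradlew jmh`.
jq -r '.[] | "\(.benchmark) \(.primaryMetric.score) \(.primaryMetric.scoreUnit)"' /tmp/jmh-sample.json
```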

**Contributor:** Can we have some script which takes the bucket name/prefix as a command-line argument, creates the bucket and generates the data if it does not exist, and runs the benchmark as well?

**Author:** That's a great suggestion. It will be very useful for new contributors too once this gets open sourced. For now, I will try not to block on it, so I created a backlog item for this.

## Security

See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
10 changes: 10 additions & 0 deletions input-stream/build.gradle.kts
@@ -5,6 +5,7 @@
plugins {
id("buildlogic.java-library-conventions")
id("io.freefair.lombok") version "8.6"
id("me.champeau.jmh") version "0.7.2"
}

dependencies {
@@ -43,3 +44,12 @@ tasks.test {
languageVersion = JavaLanguageVersion.of(17)
}
}

jmh {
jmhVersion = "1.37"
failOnError = true
forceGC = true
includeTests = false
resultFormat = "JSON"
zip64 = true
}
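During development it can help to run only a subset of the benchmarks. The `me.champeau.jmh` plugin supports regex-based filters for this; a sketch of optional additions to the `jmh` block (the class and method names come from this PR, but the filter values are just examples):

```kotlin
// build.gradle.kts -- optional additions to the jmh { } block above
jmh {
    includes = listOf("SequentialReadBenchmark")   // only run matching benchmarks
    excludes = listOf(".*withStandardAsyncClient") // skip the baseline variants
}
```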
@@ -0,0 +1,150 @@
package com.amazon.connector.s3.benchmark;

import com.amazon.connector.s3.S3SeekableInputStream;
import com.amazon.connector.s3.datagen.BenchmarkData;
import com.amazon.connector.s3.datagen.BenchmarkData.Read;
import com.amazon.connector.s3.datagen.Constants;
import com.amazon.connector.s3.util.S3URI;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.CompletableFuture;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import software.amazon.awssdk.core.ResponseInputStream;
import software.amazon.awssdk.core.async.AsyncResponseTransformer;
import software.amazon.awssdk.services.s3.S3AsyncClient;
import software.amazon.awssdk.services.s3.model.GetObjectRequest;
import software.amazon.awssdk.services.s3.model.GetObjectResponse;
import software.amazon.awssdk.utils.IoUtils;

/**
* Benchmarks following a read pattern that jumps around in the stream. This is useful to catch
* regressions in column-oriented read patterns. The class also covers forward and backward seeks.
*/
@Fork(1)
@State(Scope.Benchmark)
@Warmup(iterations = 3)
@Measurement(iterations = 15)
@BenchmarkMode(Mode.SingleShotTime)
public class SeekingReadBenchmarks {

private static final S3AsyncClient client = S3AsyncClient.create();

@Param(
value = {
"random-1mb.txt",
"random-4mb.txt",
"random-16mb.txt",
// TODO: Extend this parameter to bigger objects once we improve performance
// https://app.asana.com/0/1206885953994785/1207212328457565/f
// "random-64mb.txt",
// "random-128mb.txt",
// "random-256mb.txt"
})
private String key;

/** Test backward seeks with S3 client */
@Benchmark
public void testBackwardSeeks__withStandardAsyncClient() {
BenchmarkData.getBenchMarkObjectByName(key)
.getBackwardSeekReadPattern()
.forEach(range -> doReadWithAsyncClient(client, range));
}

/** Test backward seeks with SeekableStream */
@Benchmark
public void testBackwardSeeks__withSeekableStream() {
S3SeekableInputStream stream = getStreamForKey(key);

BenchmarkData.getBenchMarkObjectByName(key)
.getBackwardSeekReadPattern()
.forEach(range -> doReadWithStream(stream, range));
}

/** Test forward seeks with S3 client */
@Benchmark
public void testForwardSeeks__withStandardAsyncClient() {
BenchmarkData.getBenchMarkObjectByName(key)
.getForwardSeekReadPattern()
.forEach(range -> doReadWithAsyncClient(client, range));
}

/** Test forward seeks with Seekable Stream */
@Benchmark
public void testForwardSeeks__withSeekableStream() {
S3SeekableInputStream stream = getStreamForKey(key);

BenchmarkData.getBenchMarkObjectByName(key)
.getForwardSeekReadPattern()
.forEach(range -> doReadWithStream(stream, range));
}

/** Test parquet-like reads with S3 client */
@Benchmark
public void testParquetLikeRead__withStandardAsyncClient() {
BenchmarkData.getBenchMarkObjectByName(key)
.getParquetLikeReadPattern()
.forEach(range -> doReadWithAsyncClient(client, range));
}

/** Test parquet-like reads with Seekable Stream */
@Benchmark
public void testParquetLikeRead__withSeekableStream() {
S3SeekableInputStream stream = getStreamForKey(key);

BenchmarkData.getBenchMarkObjectByName(key)
.getParquetLikeReadPattern()
.forEach(range -> doReadWithStream(stream, range));
}

private void doReadWithAsyncClient(S3AsyncClient client, Read read) {
CompletableFuture<ResponseInputStream<GetObjectResponse>> response =
client.getObject(
GetObjectRequest.builder()
.bucket(Constants.BENCHMARK_BUCKET)
.key(Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key)
.range(rangeOf(read.getStart(), read.getStart() + read.getLength() - 1))
.build(),
AsyncResponseTransformer.toBlockingInputStream());

try {
System.out.println(IoUtils.toUtf8String(response.get()).hashCode());
} catch (Exception e) {
throw new RuntimeException("Could not finish read", e);
}
}

private void doReadWithStream(S3SeekableInputStream stream, Read range) {
try {
stream.seek(range.getStart());

int len = (int) range.getLength();
byte[] buf = new byte[len];
// read() may return fewer bytes than requested, so loop until the range is fully read
int totalRead = 0;
while (totalRead < len) {
int bytesRead = stream.read(buf, totalRead, len - totalRead);
if (bytesRead < 0) {
break; // end of stream
}
totalRead += bytesRead;
}
String content = new String(buf, StandardCharsets.UTF_8);
System.out.println(content.hashCode());
} catch (IOException e) {
throw new RuntimeException(
String.format(
"Could not fully read range %s-%s with SeekableStream",
range.getStart(), range.getStart() + range.getLength()),
e);
}
}

private String rangeOf(long start, long end) {
return String.format("bytes=%s-%s", start, end);
}

private S3SeekableInputStream getStreamForKey(String key) {
return new S3SeekableInputStream(
S3URI.of(Constants.BENCHMARK_BUCKET, Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key));
}
}
@@ -0,0 +1,77 @@
package com.amazon.connector.s3.benchmark;

import com.amazon.connector.s3.S3SeekableInputStream;
import com.amazon.connector.s3.datagen.Constants;
import com.amazon.connector.s3.util.S3URI;
import java.io.IOException;
import java.util.concurrent.CompletableFuture;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import software.amazon.awssdk.core.ResponseInputStream;
import software.amazon.awssdk.core.async.AsyncResponseTransformer;
import software.amazon.awssdk.services.s3.S3AsyncClient;
import software.amazon.awssdk.services.s3.model.GetObjectRequest;
import software.amazon.awssdk.services.s3.model.GetObjectResponse;
import software.amazon.awssdk.utils.IoUtils;

/**
* Benchmarks which just read data sequentially. Useful for catching regressions in prefetching
* and in how much we utilise CRT.
*/
@Fork(1)
@State(Scope.Benchmark)
@Warmup(iterations = 3)
@Measurement(iterations = 15)
@BenchmarkMode(Mode.SingleShotTime)
public class SequentialReadBenchmark {

@Param(
value = {
"random-1mb.txt",
"random-4mb.txt",
"random-16mb.txt",
// TODO: Extend this parameter to bigger objects once we improve performance
// https://app.asana.com/0/1206885953994785/1207212328457565/f
// "random-64mb.txt",
// "random-128mb.txt",
// "random-256mb.txt"
})
private String key;

/**
* Not a perfect baseline but will do for now. Use the standard S3Async client for sequential
* reads and compare its performance to seekable stream.
*/
@Benchmark
public void testSequentialRead__withStandardAsyncClient() throws IOException {
S3AsyncClient client = S3AsyncClient.create();
CompletableFuture<ResponseInputStream<GetObjectResponse>> response =
client.getObject(
GetObjectRequest.builder()
.bucket(Constants.BENCHMARK_BUCKET)
.key(Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key)
.build(),
AsyncResponseTransformer.toBlockingInputStream());

String content = IoUtils.toUtf8String(response.join());
System.out.println(content.hashCode());
}

/** Test sequential reads with seekable streams. */
@Benchmark
public void testSequentialRead__withSeekableStream() throws IOException {
S3SeekableInputStream stream =
new S3SeekableInputStream(
S3URI.of(Constants.BENCHMARK_BUCKET, Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key));

String content = IoUtils.toUtf8String(stream);
System.out.println(content.hashCode());
}
}
@@ -0,0 +1,117 @@
package com.amazon.connector.s3.datagen;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import java.util.List;
import lombok.Builder;
import lombok.Data;

/**
* Class defining objects used by microbenchmarks and defining the shapes of some read patterns we
* want to test for.
*/
public class BenchmarkData {

/**
* Class describing a benchmark object and helper methods defining read patterns (forward seeking,
* backward seeking, parquet-like jumps) on the object.
*/
@Data
@Builder
public static class BenchmarkObject {
private final String keyName;
private final long size;

/**
* Returns a read pattern that jumps forward through the object in 20% increments, reading 10%
* of the object each time. This results in roughly half of the content being read and half
* being skipped.
*
* <p>Illustration of the pattern (the digit denotes the order of the reads):
*
* <pre>
* 1111111111----2222222222----3333333333----4444444444----5555555555----
* 0%       10%  20%      30%  40%      50%  60%      70%  80%      90%
* </pre>
*/
public List<Read> getForwardSeekReadPattern() {
return ImmutableList.of(
Read.builder().start(0).length(percent(10)).build(),
Read.builder().start(percent(20)).length(percent(10)).build(),
Read.builder().start(percent(40)).length(percent(10)).build(),
Read.builder().start(percent(60)).length(percent(10)).build(),
Read.builder().start(percent(80)).length(percent(10)).build());
}

/** Just reverse the forward pattern */
public List<Read> getBackwardSeekReadPattern() {
return Lists.reverse(getForwardSeekReadPattern());
}

/** A Parquet-like pattern: a "tail dance" (footer reads) followed by reading 50% of the object */
public List<Read> getParquetLikeReadPattern() {
return ImmutableList.of(
// Tail dance
Read.builder().start(size - 1 - 4).length(4).build(),
Read.builder()
.start(size - 8 * Constants.ONE_KB_IN_BYTES)
.length(8 * Constants.ONE_KB_IN_BYTES)
.build(),

// Read some contiguous chunks
Read.builder().start(percent(50)).length(percent(30)).build(),
Read.builder().start(0).length(percent(20)).build());
}

/**
* Returns x% of the object size (truncated to an int by integer division). Used above to seek
* to relative positions in the object when defining read patterns.
*/
private int percent(int x) {
return (int) ((size / 100) * x);
}
}

/** Object representing a read. Has a start and a length. */
@Data
@Builder
public static class Read {
long start;
long length;
}

/**
* The list of benchmark objects, looked up by key name. We need this because JMH only allows
* parameterising benchmarks with Strings, so benchmarks map the String parameter back to an
* object via {@link #getBenchMarkObjectByName}.
*/
public static final List<BenchmarkObject> BENCHMARK_OBJECTS =
ImmutableList.of(
BenchmarkObject.builder()
.keyName("random-1mb.txt")
**Contributor:** NIT: can we have these key names defined as constants? They seem to be accessed from multiple files

**Author:** Yep, I don't know how I didn't notice this :( I will address this in a next PR.
.size(1 * Constants.ONE_MB_IN_BYTES)
.build(),
BenchmarkObject.builder()
.keyName("random-4mb.txt")
.size(4 * Constants.ONE_MB_IN_BYTES)
.build(),
BenchmarkObject.builder()
.keyName("random-16mb.txt")
.size(16 * Constants.ONE_MB_IN_BYTES)
.build(),
BenchmarkObject.builder()
.keyName("random-64mb.txt")
.size(64 * Constants.ONE_MB_IN_BYTES)
.build(),
BenchmarkObject.builder()
.keyName("random-128mb.txt")
.size(128 * Constants.ONE_MB_IN_BYTES)
.build(),
BenchmarkObject.builder()
.keyName("random-256mb.txt")
.size(256 * Constants.ONE_MB_IN_BYTES)
.build());

/** Returns a benchmark object by name. */
public static BenchmarkObject getBenchMarkObjectByName(String name) {
return BENCHMARK_OBJECTS.stream()
.filter(o -> o.getKeyName().equals(name))
.findFirst()
.orElseThrow(() -> new RuntimeException("Cannot find benchmark object with name " + name));
}
}
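Because `percent` uses integer division, the computed offsets are only approximately x% of the object size. The following standalone sketch (it mirrors the logic above but is not part of the PR) prints the forward-seek offsets for the 1 MB object:

```java
// Standalone sketch mirroring BenchmarkData.BenchmarkObject#percent for a 1 MB object.
public class ForwardPatternSketch {
  static final long SIZE = 1_048_576L; // 1 MB, like random-1mb.txt

  // Same truncating arithmetic as the percent() helper above: (SIZE / 100) drops
  // the remainder, so offsets fall slightly short of exact percentages.
  static int percent(int x) {
    return (int) ((SIZE / 100) * x);
  }

  public static void main(String[] args) {
    int[] starts = {0, percent(20), percent(40), percent(60), percent(80)};
    int length = percent(10); // each read covers roughly 10% of the object
    for (int start : starts) {
      System.out.println("read " + length + " bytes at offset " + start);
    }
    // percent(100) is 1048500, slightly short of 1048576 due to truncation.
  }
}
```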