From acc2b49c407106c0f761ec44eda9da4a13cd96be Mon Sep 17 00:00:00 2001 From: Geza Csenger Date: Tue, 30 Apr 2024 19:54:11 +0000 Subject: [PATCH 1/5] Add sequential read microbenchmarks --- input-stream/build.gradle.kts | 10 +++ .../s3/benchmark/SequentialReadBenchmark.java | 76 +++++++++++++++++++ .../connector/s3/datagen/Constants.java | 8 ++ .../datagen/SequentialReadDataGenerator.java | 72 ++++++++++++++++++ 4 files changed, 166 insertions(+) create mode 100644 input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SequentialReadBenchmark.java create mode 100644 input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java create mode 100644 input-stream/src/jmh/java/com/amazon/connector/s3/datagen/SequentialReadDataGenerator.java diff --git a/input-stream/build.gradle.kts b/input-stream/build.gradle.kts index 1af0d3c8..2478026d 100644 --- a/input-stream/build.gradle.kts +++ b/input-stream/build.gradle.kts @@ -5,6 +5,7 @@ plugins { id("buildlogic.java-library-conventions") id("io.freefair.lombok") version "8.6" + id("me.champeau.jmh") version "0.7.2" } dependencies { @@ -43,3 +44,12 @@ tasks.test { languageVersion = JavaLanguageVersion.of(17) } } + +jmh { + jmhVersion = "1.37" + failOnError = true + forceGC = true + includeTests = false + resultFormat = "JSON" + zip64 = true +} diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SequentialReadBenchmark.java b/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SequentialReadBenchmark.java new file mode 100644 index 00000000..44acf5d6 --- /dev/null +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SequentialReadBenchmark.java @@ -0,0 +1,76 @@ +package com.amazon.connector.s3.benchmark; + +import com.amazon.connector.s3.S3SeekableInputStream; +import com.amazon.connector.s3.datagen.Constants; +import com.amazon.connector.s3.util.S3URI; +import java.io.IOException; +import java.util.concurrent.CompletableFuture; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import software.amazon.awssdk.core.ResponseInputStream; +import software.amazon.awssdk.core.async.AsyncResponseTransformer; +import software.amazon.awssdk.services.s3.S3AsyncClient; +import software.amazon.awssdk.services.s3.model.GetObjectRequest; +import software.amazon.awssdk.services.s3.model.GetObjectResponse; +import software.amazon.awssdk.utils.IoUtils; + +/** + * Benchmarks which just read data sequentially. Useful for catching regressions in prefetching and + * regressions in how much we utilise CRT. + */ +@Fork(1) +@State(Scope.Benchmark) +@Warmup(iterations = 3) +@Measurement(iterations = 15) +@BenchmarkMode(Mode.SingleShotTime) +public class SequentialReadBenchmark { + + @Param( + value = { + "random-1mb.txt", + "random-4mb.txt", + "random-16mb.txt", + // For now, these objects are not feasible to fetch with S3SeekableStream + // "random-64mb.txt", + // "random-128mb.txt", + // "random-256mb.txt" + }) + private String key; + + /** + * Not a perfect baseline but will do for now. Use the standard S3Async client for sequential + * reads and compare its performance to seekable stream. + */ + @Benchmark + public void testSequentialRead__withStandardAsyncClient() throws IOException { + S3AsyncClient client = S3AsyncClient.create(); + CompletableFuture> response = + client.getObject( + GetObjectRequest.builder() + .bucket(Constants.BENCHMARK_BUCKET) + .key(Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key) + .build(), + AsyncResponseTransformer.toBlockingInputStream()); + + String content = IoUtils.toUtf8String(response.join()); + System.out.println(content.hashCode()); + } + + /** Test sequential reads with seekable streams. */ + @Benchmark + public void testSequentialRead__withSeekableStream() throws IOException { + S3SeekableInputStream stream = + new S3SeekableInputStream( + S3URI.of(Constants.BENCHMARK_BUCKET, Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key)); + + String content = IoUtils.toUtf8String(stream); + System.out.println(content.hashCode()); + } +} diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java new file mode 100644 index 00000000..1967dfde --- /dev/null +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java @@ -0,0 +1,8 @@ +package com.amazon.connector.s3.datagen; + +/** Constants related to microbenchmark data generation */ +public class Constants { + public static final String BENCHMARK_BUCKET = "gccsenge-microbenchmarks-dub"; + public static final String BENCHMARK_DATA_PREFIX_SEQUENTIAL = + "s3-connector-framework-benchmark/sequential/"; +} diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/SequentialReadDataGenerator.java b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/SequentialReadDataGenerator.java new file mode 100644 index 00000000..82460db3 --- /dev/null +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/SequentialReadDataGenerator.java @@ -0,0 +1,72 @@ +package com.amazon.connector.s3.datagen; + +import com.google.common.collect.ImmutableMap; +import java.util.Map; +import java.util.Random; +import software.amazon.awssdk.core.async.AsyncRequestBody; +import software.amazon.awssdk.services.s3.S3AsyncClient; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; + +/** + * This class implements data generation for the sequential read micro-benchmarks. This allows for + * deterministic data generation which in turn allows to run reproducible micro-benchmarks. The + * results of microbenchmarks are not to be compared across different computers (Mac of engineer A + * and DevDesk of engineer B will have different results), but runs on the same computer should be + * comparable. + */ +public class SequentialReadDataGenerator { + + private static final int ONE_MB_IN_BYTES = 1 * 1024 * 1024; + + /** (object-name, size) pairs */ + private static final Map OBJECTS = + ImmutableMap.of( + "random-1mb.txt", ONE_MB_IN_BYTES, + "random-4mb.txt", 4 * ONE_MB_IN_BYTES, + "random-16mb.txt", 16 * ONE_MB_IN_BYTES, + "random-64mb.txt", 64 * ONE_MB_IN_BYTES, + "random-128mb.txt", 128 * ONE_MB_IN_BYTES, + "random-256mb.txt", 256 * ONE_MB_IN_BYTES); + + /** + * Entry point: set bucket name and prefix in {@link Constants} to generate a dataset of random + * objects + */ + public static void main(String[] args) { + OBJECTS.forEach(SequentialReadDataGenerator::generateObject); + } + + private static void generateObject(String key, Integer size) { + String fullKeyName = + String.format( + "s3://%s/%s", + Constants.BENCHMARK_BUCKET, Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key); + System.out.println("Generating " + fullKeyName + " and uploading it to S3..."); + + S3AsyncClient s3AsyncClient = S3AsyncClient.create(); + s3AsyncClient + .putObject( + PutObjectRequest.builder() + .bucket(Constants.BENCHMARK_BUCKET) + .key(Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key) + .build(), + AsyncRequestBody.fromBytes(generateBytes(size))) + .join(); + } + + private static byte[] generateBytes(int len) { + byte[] buf = new byte[len]; + Random random = new Random(); + random.nextBytes(buf); + return buf; + } + + /** + * PRBS31-style pseudo-random number generator for producing deterministic 'random' data of given + * size. https://en.wikipedia.org/wiki/Pseudorandom_binary_sequence + */ + private static int prbs31(int state) { + int feedback = ((state >> 30) ^ (state >> 27)) & 1; + return ((state << 1) | feedback) & 0xffffffff; + } +} From b01f8cf21e3d230041b456c014b107c996009320 Mon Sep 17 00:00:00 2001 From: Geza Csenger Date: Wed, 1 May 2024 10:08:28 +0000 Subject: [PATCH 2/5] Make the shape of objects and access patterns precise --- .../s3/benchmark/SeekingReadBenchmarks.java | 83 +++++++++++++ .../connector/s3/datagen/BenchmarkData.java | 117 ++++++++++++++++++ .../connector/s3/datagen/Constants.java | 2 + .../datagen/SequentialReadDataGenerator.java | 45 ++----- 4 files changed, 211 insertions(+), 36 deletions(-) create mode 100644 input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SeekingReadBenchmarks.java create mode 100644 input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SeekingReadBenchmarks.java b/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SeekingReadBenchmarks.java new file mode 100644 index 00000000..9ca09fdb --- /dev/null +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SeekingReadBenchmarks.java @@ -0,0 +1,83 @@ +package com.amazon.connector.s3.benchmark; + +import com.amazon.connector.s3.datagen.BenchmarkData; +import com.amazon.connector.s3.datagen.BenchmarkData.Read; +import com.amazon.connector.s3.datagen.Constants; +import java.util.concurrent.CompletableFuture; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import software.amazon.awssdk.core.ResponseInputStream; +import software.amazon.awssdk.core.async.AsyncResponseTransformer; +import software.amazon.awssdk.services.s3.S3AsyncClient; +import software.amazon.awssdk.services.s3.model.GetObjectRequest; +import software.amazon.awssdk.services.s3.model.GetObjectResponse; +import software.amazon.awssdk.utils.IoUtils; + +/** + * Benchmarks following a read pattern which jumps around in the stream. This is useful to catch + * regressions in column-oriented read patterns. We also have tests for backwards seeks. + */ +@Fork(1) +@State(Scope.Benchmark) +@Warmup(iterations = 3) +@Measurement(iterations = 15) +@BenchmarkMode(Mode.SingleShotTime) +public class SeekingReadBenchmarks { + + private static final S3AsyncClient client = S3AsyncClient.create(); + + @Param(value = {"random-1mb.txt", "random-4mb.txt", "random-16mb.txt"}) + private String key; + + /** Test backwards seek */ + @Benchmark + public void testBackwardsSeeks__withStandardAsyncClient() { + BenchmarkData.getBenchMarkObjectByName(key) + .getBackwardSeekReadPattern() + .forEach(range -> doReadWithAsyncClient(client, range)); + } + + /** Test forwards seek */ + @Benchmark + public void testForwardSeeks__withStandardAsyncClient() { + BenchmarkData.getBenchMarkObjectByName(key) + .getForwardSeekReadPattern() + .forEach(range -> doReadWithAsyncClient(client, range)); + } + + /** Test parquet-like reads */ + @Benchmark + public void testParquetLikeRead__withStandardAsyncClient() { + BenchmarkData.getBenchMarkObjectByName(key) + .getParquetLikeReadPattern() + .forEach(range -> doReadWithAsyncClient(client, range)); + } + + private void doReadWithAsyncClient(S3AsyncClient client, Read read) { + CompletableFuture> response = + client.getObject( + GetObjectRequest.builder() + .bucket(Constants.BENCHMARK_BUCKET) + .key(Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key) + .range(rangeOf(read.getStart(), read.getStart() + read.getLength())) + .build(), + AsyncResponseTransformer.toBlockingInputStream()); + + try { + IoUtils.toUtf8String(response.get()); + } catch (Exception e) { + throw new RuntimeException("Could not finish read", e); + } + } + + private String rangeOf(long start, long end) { + return String.format("bytes=%s-%s", start, end); + } +} diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java new file mode 100644 index 00000000..27e4ce44 --- /dev/null +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java @@ -0,0 +1,117 @@ +package com.amazon.connector.s3.datagen; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import java.util.List; +import lombok.Builder; +import lombok.Data; + +/** + * Class defining objects used by microbenchmarks and defining the shapes of some read patterns we + * want to test for. + */ +public class BenchmarkData { + + /** + * Class describing a benchmark object and helper methods defining read patterns (forward seeking, + * backward seeking, parquet-like jumps) on the object. + */ + @Data + @Builder + public static class BenchmarkObject { + private final String keyName; + private final long size; + + /** + * Returns a read pattern that 'jumps through the object' forward by 20% increments and reads + * 10% of the object each time. Essentially, this results half of the content being read and + * half of the content being ignored. + * + *

Illustration of the pattern (number denotes order of read): + * 1111111111---2222222222---3333333333--- ... --- 5555555555--- | 0% | 20% | 40 % | 80% | 100% + */ + public List getForwardSeekReadPattern() { + return ImmutableList.of( + Read.builder().start(0).length(percent(10)).build(), + Read.builder().start(percent(20)).length(percent(10)).build(), + Read.builder().start(percent(40)).length(percent(10)).build(), + Read.builder().start(percent(60)).length(percent(10)).build(), + Read.builder().start(percent(80)).length(percent(10)).build()); + } + + /** Just reverse the forward pattern */ + public List getBackwardSeekReadPattern() { + return Lists.reverse(getForwardSeekReadPattern()); + } + + /** Define a tail dance + read 50% of the object */ + public List getParquetLikeReadPattern() { + return ImmutableList.of( + // Tail dance + Read.builder().start(size - 1 - 4).length(4).build(), + Read.builder() + .start(size - 8 * Constants.ONE_KB_IN_BYTES) + .length(8 * Constants.ONE_KB_IN_BYTES) + .build(), + + // Read some contiguous chunks + Read.builder().start(percent(50)).length(percent(30)).build(), + Read.builder().start(0).length(percent(20)).build()); + } + + /** + * Returns x% of an integer. Used above to seek into specific relative positions in the object + * when defining read patterns. + */ + private int percent(int x) { + return (int) ((size / 100) * x); + } + } + + /** Object representing a read. Has a start and a length. */ + @Data + @Builder + public static class Read { + long start; + long length; + } + + /** + * (object-name, object) pairs -- we need this due to the limitation that JMH only allows us to + * parameterise with Strings. So we use this Map under the hood to implement a keymap. + */ + public static final List BENCHMARK_OBJECTS = + ImmutableList.of( + BenchmarkObject.builder() + .keyName("random-1mb.txt") + .size(1 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-4mb.txt") + .size(4 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-16mb.txt") + .size(16 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-64mb.txt") + .size(64 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-128mb.txt") + .size(128 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-256mb.txt") + .size(256 * Constants.ONE_MB_IN_BYTES) + .build()); + + /** Returns a benchmark object by name. */ + public static BenchmarkObject getBenchMarkObjectByName(String name) { + return BENCHMARK_OBJECTS.stream() + .filter(o -> o.getKeyName().equals(name)) + .findFirst() + .orElseThrow(() -> new RuntimeException("Cannot find benchmark object")); + } +} diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java index 1967dfde..62f9ca4d 100644 --- a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java @@ -2,6 +2,8 @@ /** Constants related to microbenchmark data generation */ public class Constants { + public static final int ONE_KB_IN_BYTES = 1024; + public static final int ONE_MB_IN_BYTES = 1024 * ONE_KB_IN_BYTES; public static final String BENCHMARK_BUCKET = "gccsenge-microbenchmarks-dub"; public static final String BENCHMARK_DATA_PREFIX_SEQUENTIAL = "s3-connector-framework-benchmark/sequential/"; diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/SequentialReadDataGenerator.java b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/SequentialReadDataGenerator.java index 82460db3..3c59e55c 100644 --- a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/SequentialReadDataGenerator.java +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/SequentialReadDataGenerator.java @@ -1,7 +1,6 @@ package com.amazon.connector.s3.datagen; -import com.google.common.collect.ImmutableMap; -import java.util.Map; +import com.amazon.connector.s3.datagen.BenchmarkData.BenchmarkObject; import java.util.Random; import software.amazon.awssdk.core.async.AsyncRequestBody; import software.amazon.awssdk.services.s3.S3AsyncClient; @@ -16,57 +15,31 @@ */ public class SequentialReadDataGenerator { - private static final int ONE_MB_IN_BYTES = 1 * 1024 * 1024; - - /** (object-name, size) pairs */ - private static final Map OBJECTS = - ImmutableMap.of( - "random-1mb.txt", ONE_MB_IN_BYTES, - "random-4mb.txt", 4 * ONE_MB_IN_BYTES, - "random-16mb.txt", 16 * ONE_MB_IN_BYTES, - "random-64mb.txt", 64 * ONE_MB_IN_BYTES, - "random-128mb.txt", 128 * ONE_MB_IN_BYTES, - "random-256mb.txt", 256 * ONE_MB_IN_BYTES); - /** * Entry point: set bucket name and prefix in {@link Constants} to generate a dataset of random * objects */ public static void main(String[] args) { - OBJECTS.forEach(SequentialReadDataGenerator::generateObject); + BenchmarkData.BENCHMARK_OBJECTS.forEach(SequentialReadDataGenerator::generateObject); } - private static void generateObject(String key, Integer size) { - String fullKeyName = - String.format( - "s3://%s/%s", - Constants.BENCHMARK_BUCKET, Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key); + private static void generateObject(BenchmarkObject benchmarkObject) { + String key = Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + benchmarkObject.getKeyName(); + String fullKeyName = String.format("s3://%s/%s", Constants.BENCHMARK_BUCKET, key); System.out.println("Generating " + fullKeyName + " and uploading it to S3..."); S3AsyncClient s3AsyncClient = S3AsyncClient.create(); s3AsyncClient .putObject( - PutObjectRequest.builder() - .bucket(Constants.BENCHMARK_BUCKET) - .key(Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key) - .build(), - AsyncRequestBody.fromBytes(generateBytes(size))) + PutObjectRequest.builder().bucket(Constants.BENCHMARK_BUCKET).key(key).build(), + AsyncRequestBody.fromBytes(generateBytes(benchmarkObject.getSize()))) .join(); } - private static byte[] generateBytes(int len) { - byte[] buf = new byte[len]; + private static byte[] generateBytes(long len) { + byte[] buf = new byte[(int) len]; Random random = new Random(); random.nextBytes(buf); return buf; } - - /** - * PRBS31-style pseudo-random number generator for producing deterministic 'random' data of given - * size. https://en.wikipedia.org/wiki/Pseudorandom_binary_sequence - */ - private static int prbs31(int state) { - int feedback = ((state >> 30) ^ (state >> 27)) & 1; - return ((state << 1) | feedback) & 0xffffffff; - } } From e8dbda70cace1f7bb3b710da984d0c6aa3559b98 Mon Sep 17 00:00:00 2001 From: Geza Csenger Date: Wed, 1 May 2024 10:59:48 +0000 Subject: [PATCH 3/5] wrap up first version of microbenchmarks --- .../s3/benchmark/SeekingReadBenchmarks.java | 81 +++++++++++++++++-- .../s3/benchmark/SequentialReadBenchmark.java | 3 +- .../connector/s3/datagen/BenchmarkData.java | 42 +++++----- .../connector/s3/datagen/Constants.java | 34 +++++++- 4 files changed, 129 insertions(+), 31 deletions(-) diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SeekingReadBenchmarks.java b/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SeekingReadBenchmarks.java index 9ca09fdb..0dc34635 100644 --- a/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SeekingReadBenchmarks.java +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SeekingReadBenchmarks.java @@ -1,8 +1,12 @@ package com.amazon.connector.s3.benchmark; +import com.amazon.connector.s3.S3SeekableInputStream; import com.amazon.connector.s3.datagen.BenchmarkData; import com.amazon.connector.s3.datagen.BenchmarkData.Read; import com.amazon.connector.s3.datagen.Constants; +import com.amazon.connector.s3.util.S3URI; +import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.concurrent.CompletableFuture; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -33,18 +37,38 @@ public class SeekingReadBenchmarks { private static final S3AsyncClient client = S3AsyncClient.create(); - @Param(value = {"random-1mb.txt", "random-4mb.txt", "random-16mb.txt"}) + @Param( + value = { + "random-1mb.txt", + "random-4mb.txt", + "random-16mb.txt", + // TODO: Extend this parameter to bigger objects once we improve performance + // https://app.asana.com/0/1206885953994785/1207212328457565/f + // "random-64mb.txt", + // "random-128mb.txt", + // "random-256mb.txt" + }) private String key; - /** Test backwards seek */ + /** Test backward seeks with S3 client */ @Benchmark - public void testBackwardsSeeks__withStandardAsyncClient() { + public void testBackwardSeeks__withStandardAsyncClient() { BenchmarkData.getBenchMarkObjectByName(key) .getBackwardSeekReadPattern() .forEach(range -> doReadWithAsyncClient(client, range)); } - /** Test forwards seek */ + /** Test backward seeks with SeekableStream */ + @Benchmark + public void testBackwardSeeks__withSeekableStream() { + S3SeekableInputStream stream = getStreamForKey(key); + + BenchmarkData.getBenchMarkObjectByName(key) + .getBackwardSeekReadPattern() + .forEach(range -> doReadWithStream(stream, range)); + } + + /** Test forward seeks with S3 client */ @Benchmark public void testForwardSeeks__withStandardAsyncClient() { BenchmarkData.getBenchMarkObjectByName(key) @@ -52,7 +76,17 @@ public void testForwardSeeks__withStandardAsyncClient() { .forEach(range -> doReadWithAsyncClient(client, range)); } - /** Test parquet-like reads */ + /** Test forward seeks with Seekable Stream */ + @Benchmark + public void testForwardSeeks__withSeekableStream() { + S3SeekableInputStream stream = getStreamForKey(key); + + BenchmarkData.getBenchMarkObjectByName(key) + .getForwardSeekReadPattern() + .forEach(range -> doReadWithStream(stream, range)); + } + + /** Test parquet-like reads with S3 client */ @Benchmark public void testParquetLikeRead__withStandardAsyncClient() { BenchmarkData.getBenchMarkObjectByName(key) @@ -60,24 +94,57 @@ public void testParquetLikeRead__withStandardAsyncClient() { .forEach(range -> doReadWithAsyncClient(client, range)); } + /** Test parquet-like reads with Seekable Stream */ + @Benchmark + public void testParquetLikeRead__withSeekableStream() { + S3SeekableInputStream stream = getStreamForKey(key); + + BenchmarkData.getBenchMarkObjectByName(key) + .getParquetLikeReadPattern() + .forEach(range -> doReadWithStream(stream, range)); + } + private void doReadWithAsyncClient(S3AsyncClient client, Read read) { CompletableFuture> response = client.getObject( GetObjectRequest.builder() .bucket(Constants.BENCHMARK_BUCKET) .key(Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key) - .range(rangeOf(read.getStart(), read.getStart() + read.getLength())) + .range(rangeOf(read.getStart(), read.getStart() + read.getLength() - 1)) .build(), AsyncResponseTransformer.toBlockingInputStream()); try { - IoUtils.toUtf8String(response.get()); + System.out.println(IoUtils.toUtf8String(response.get()).hashCode()); } catch (Exception e) { throw new RuntimeException("Could not finish read", e); } } + private void doReadWithStream(S3SeekableInputStream stream, Read range) { + try { + stream.seek(range.getStart()); + + int len = (int) range.getLength(); + byte[] buf = new byte[len]; + stream.read(buf, 0, len); + String content = new String(buf, StandardCharsets.UTF_8); + System.out.println(content.hashCode()); + } catch (IOException e) { + new RuntimeException( + String.format( + "Could not fully read range %s-%s with SeekableStream", + range.getStart(), range.getStart() + range.getLength()), + e); + } + } + private String rangeOf(long start, long end) { return String.format("bytes=%s-%s", start, end); } + + private S3SeekableInputStream getStreamForKey(String key) { + return new S3SeekableInputStream( + S3URI.of(Constants.BENCHMARK_BUCKET, Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key)); + } } diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SequentialReadBenchmark.java b/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SequentialReadBenchmark.java index 44acf5d6..43a5d35b 100644 --- a/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SequentialReadBenchmark.java +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/benchmark/SequentialReadBenchmark.java @@ -37,7 +37,8 @@ public class SequentialReadBenchmark { "random-1mb.txt", "random-4mb.txt", "random-16mb.txt", - // For now, these objects are not feasible to fetch with S3SeekableStream + // TODO: Extend this parameter to bigger objects once we improve performance + // https://app.asana.com/0/1206885953994785/1207212328457565/f // "random-64mb.txt", // "random-128mb.txt", // "random-256mb.txt" diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java index 27e4ce44..e9181ac6 100644 --- a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java @@ -86,32 +86,32 @@ public static class Read { .keyName("random-1mb.txt") .size(1 * Constants.ONE_MB_IN_BYTES) .build(), - BenchmarkObject.builder() - .keyName("random-4mb.txt") - .size(4 * Constants.ONE_MB_IN_BYTES) - .build(), - BenchmarkObject.builder() - .keyName("random-16mb.txt") - .size(16 * Constants.ONE_MB_IN_BYTES) - .build(), - BenchmarkObject.builder() - .keyName("random-64mb.txt") - .size(64 * Constants.ONE_MB_IN_BYTES) - .build(), - BenchmarkObject.builder() - .keyName("random-128mb.txt") - .size(128 * Constants.ONE_MB_IN_BYTES) - .build(), - BenchmarkObject.builder() - .keyName("random-256mb.txt") - .size(256 * Constants.ONE_MB_IN_BYTES) - .build()); + BenchmarkObject.builder() + .keyName("random-4mb.txt") + .size(4 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-16mb.txt") + .size(16 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-64mb.txt") + .size(64 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-128mb.txt") + .size(128 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-256mb.txt") + .size(256 * Constants.ONE_MB_IN_BYTES) + .build()); /** Returns a benchmark object by name. */ public static BenchmarkObject getBenchMarkObjectByName(String name) { return BENCHMARK_OBJECTS.stream() .filter(o -> o.getKeyName().equals(name)) .findFirst() - .orElseThrow(() -> new RuntimeException("Cannot find benchmark object")); + .orElseThrow(() -> new RuntimeException("Cannot find benchmark object with name " + name)); } } diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java index 62f9ca4d..9da3d7fe 100644 --- a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/Constants.java @@ -1,10 +1,40 @@ package com.amazon.connector.s3.datagen; +import software.amazon.awssdk.services.s3.S3AsyncClient; +import software.amazon.awssdk.services.s3.model.HeadBucketRequest; + /** Constants related to microbenchmark data generation */ public class Constants { + + public static final String BENCHMARK_BUCKET; + private static final String BENCHMARK_BUCKET_ENV_VAR = "BENCHMARK_BUCKET"; + + public static final String BENCHMARK_DATA_PREFIX; + private static final String BENCHMARK_DATA_PREFIX_ENV_VAR = "BENCHMARK_DATA_PREFIX"; + + static { + BENCHMARK_BUCKET = System.getenv(BENCHMARK_BUCKET_ENV_VAR); + BENCHMARK_DATA_PREFIX = System.getenv(BENCHMARK_DATA_PREFIX_ENV_VAR); + + // Sanity check setup (credentials + does the bucket exist) by sending a HeadBucket to the + // bucket + try { + S3AsyncClient client = S3AsyncClient.create(); + client.headBucket(HeadBucketRequest.builder().bucket(BENCHMARK_BUCKET).build()).join(); + } catch (Exception e) { + throw new RuntimeException( + String.format( + "Could not initialise micro-benchmarks because the environment setup is likely incorrect. We could not " + + "do a HeadBucket against your bucket. Please ensure you correctly populate environment variables %s and " + + "%s before running ./gradlew jmh", + BENCHMARK_BUCKET_ENV_VAR, BENCHMARK_DATA_PREFIX_ENV_VAR), + e); + } + } + public static final int ONE_KB_IN_BYTES = 1024; public static final int ONE_MB_IN_BYTES = 1024 * ONE_KB_IN_BYTES; - public static final String BENCHMARK_BUCKET = "gccsenge-microbenchmarks-dub"; + public static final String BENCHMARK_DATA_PREFIX_SEQUENTIAL = - "s3-connector-framework-benchmark/sequential/"; + BENCHMARK_DATA_PREFIX + "/sequential/"; } From 9f62de4f7d33d3cf5440d7a925cc71aa54841bce Mon Sep 17 00:00:00 2001 From: Geza Csenger Date: Wed, 1 May 2024 11:13:42 +0000 Subject: [PATCH 4/5] update README --- README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/README.md b/README.md index 1994b439..dde9749a 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,26 @@ The project is configured to be built via Gradle (Gradle 8.7). It also targets J * To skip tests: `./gradlew build -x test -x integrationTest` * To apply formatting `./gradlew spotlessApply` +# Microbenchmarks + +We have a basic set of micro-benchmarks which test full sequential read, forward seeks, backward seeks and a +Parquet-like ("jumping around") read pattern. + +## Data Generation + +Our JMH micro-benchmarks run against S3. To run the micro-benchmarks, you have to first generate data. To generate data +you first have to tell us where to put this random data. You can do this by populating the following two environment +variables: `BENCHMARK_BUCKET` (the bucket you want to use) and `BENCHMARK_DATA_PREFIX` (you might want to use a common +prefix in your bucket for all micro-benchmark related stuff). Now, to generate some random data, you can run `main` in +`SequentialReadDataGenerator`. + +## Running the Benchmarks + +Just run `./gradlew jmh --rerun`. (The reason for re-run is a Gradle-quirk. You may want to re-run benchmarks even when +you did not actually change the source of your project: `--rerun` turns off the Gradle optimisation that falls through +build steps when nothing changed.) + + ## Security See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. From ad0d96f9a98ed2f320cc731249953fb49879a8a0 Mon Sep 17 00:00:00 2001 From: Geza Csenger Date: Wed, 1 May 2024 11:14:38 +0000 Subject: [PATCH 5/5] apply spotless --- .../connector/s3/datagen/BenchmarkData.java | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java index e9181ac6..a14bcad6 100644 --- a/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java +++ b/input-stream/src/jmh/java/com/amazon/connector/s3/datagen/BenchmarkData.java @@ -86,26 +86,26 @@ public static class Read { .keyName("random-1mb.txt") .size(1 * Constants.ONE_MB_IN_BYTES) .build(), - BenchmarkObject.builder() - .keyName("random-4mb.txt") - .size(4 * Constants.ONE_MB_IN_BYTES) - .build(), - BenchmarkObject.builder() - .keyName("random-16mb.txt") - .size(16 * Constants.ONE_MB_IN_BYTES) - .build(), - BenchmarkObject.builder() - .keyName("random-64mb.txt") - .size(64 * Constants.ONE_MB_IN_BYTES) - .build(), - BenchmarkObject.builder() - .keyName("random-128mb.txt") - .size(128 * Constants.ONE_MB_IN_BYTES) - .build(), - BenchmarkObject.builder() - .keyName("random-256mb.txt") - .size(256 * Constants.ONE_MB_IN_BYTES) - .build()); + BenchmarkObject.builder() + .keyName("random-4mb.txt") + .size(4 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-16mb.txt") + .size(16 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-64mb.txt") + .size(64 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-128mb.txt") + .size(128 * Constants.ONE_MB_IN_BYTES) + .build(), + BenchmarkObject.builder() + .keyName("random-256mb.txt") + .size(256 * Constants.ONE_MB_IN_BYTES) + .build()); /** Returns a benchmark object by name. */ public static BenchmarkObject getBenchMarkObjectByName(String name) {