-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement first version of JMH microbenchmarks #18
Changes from all commits
acc2b49
b01f8cf
e8dbda7
9f62de4
ad0d96f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
package com.amazon.connector.s3.benchmark; | ||
|
||
import com.amazon.connector.s3.S3SeekableInputStream; | ||
import com.amazon.connector.s3.datagen.BenchmarkData; | ||
import com.amazon.connector.s3.datagen.BenchmarkData.Read; | ||
import com.amazon.connector.s3.datagen.Constants; | ||
import com.amazon.connector.s3.util.S3URI; | ||
import java.io.IOException; | ||
import java.nio.charset.StandardCharsets; | ||
import java.util.concurrent.CompletableFuture; | ||
import org.openjdk.jmh.annotations.Benchmark; | ||
import org.openjdk.jmh.annotations.BenchmarkMode; | ||
import org.openjdk.jmh.annotations.Fork; | ||
import org.openjdk.jmh.annotations.Measurement; | ||
import org.openjdk.jmh.annotations.Mode; | ||
import org.openjdk.jmh.annotations.Param; | ||
import org.openjdk.jmh.annotations.Scope; | ||
import org.openjdk.jmh.annotations.State; | ||
import org.openjdk.jmh.annotations.Warmup; | ||
import software.amazon.awssdk.core.ResponseInputStream; | ||
import software.amazon.awssdk.core.async.AsyncResponseTransformer; | ||
import software.amazon.awssdk.services.s3.S3AsyncClient; | ||
import software.amazon.awssdk.services.s3.model.GetObjectRequest; | ||
import software.amazon.awssdk.services.s3.model.GetObjectResponse; | ||
import software.amazon.awssdk.utils.IoUtils; | ||
|
||
/** | ||
* Benchmarks following a read pattern which jumps around in the stream. This is useful to catch | ||
* regressions in column-oriented read patterns. We also have tests for backwards seeks. | ||
*/ | ||
@Fork(1) | ||
@State(Scope.Benchmark) | ||
@Warmup(iterations = 3) | ||
@Measurement(iterations = 15) | ||
@BenchmarkMode(Mode.SingleShotTime) | ||
public class SeekingReadBenchmarks { | ||
|
||
private static final S3AsyncClient client = S3AsyncClient.create(); | ||
|
||
@Param( | ||
value = { | ||
"random-1mb.txt", | ||
"random-4mb.txt", | ||
"random-16mb.txt", | ||
// TODO: Extend this parameter to bigger objects once we improve performance | ||
// https://app.asana.com/0/1206885953994785/1207212328457565/f | ||
// "random-64mb.txt", | ||
// "random-128mb.txt", | ||
// "random-256mb.txt" | ||
}) | ||
private String key; | ||
|
||
/** Test backward seeks with S3 client */ | ||
@Benchmark | ||
public void testBackwardSeeks__withStandardAsyncClient() { | ||
BenchmarkData.getBenchMarkObjectByName(key) | ||
.getBackwardSeekReadPattern() | ||
.forEach(range -> doReadWithAsyncClient(client, range)); | ||
} | ||
|
||
/** Test backward seeks with SeekableStream */ | ||
@Benchmark | ||
public void testBackwardSeeks__withSeekableStream() { | ||
S3SeekableInputStream stream = getStreamForKey(key); | ||
|
||
BenchmarkData.getBenchMarkObjectByName(key) | ||
.getBackwardSeekReadPattern() | ||
.forEach(range -> doReadWithStream(stream, range)); | ||
} | ||
|
||
/** Test forward seeks with S3 client */ | ||
@Benchmark | ||
public void testForwardSeeks__withStandardAsyncClient() { | ||
BenchmarkData.getBenchMarkObjectByName(key) | ||
.getForwardSeekReadPattern() | ||
.forEach(range -> doReadWithAsyncClient(client, range)); | ||
} | ||
|
||
/** Test forward seeks with Seekable Stream */ | ||
@Benchmark | ||
public void testForwardSeeks__withSeekableStream() { | ||
S3SeekableInputStream stream = getStreamForKey(key); | ||
|
||
BenchmarkData.getBenchMarkObjectByName(key) | ||
.getForwardSeekReadPattern() | ||
.forEach(range -> doReadWithStream(stream, range)); | ||
} | ||
|
||
/** Test parquet-like reads with S3 client */ | ||
@Benchmark | ||
public void testParquetLikeRead__withStandardAsyncClient() { | ||
BenchmarkData.getBenchMarkObjectByName(key) | ||
.getParquetLikeReadPattern() | ||
.forEach(range -> doReadWithAsyncClient(client, range)); | ||
} | ||
|
||
/** Test parquet-like reads with Seekable Stream */ | ||
@Benchmark | ||
public void testParquetLikeRead__withSeekableStream() { | ||
S3SeekableInputStream stream = getStreamForKey(key); | ||
|
||
BenchmarkData.getBenchMarkObjectByName(key) | ||
.getParquetLikeReadPattern() | ||
.forEach(range -> doReadWithStream(stream, range)); | ||
} | ||
|
||
private void doReadWithAsyncClient(S3AsyncClient client, Read read) { | ||
CompletableFuture<ResponseInputStream<GetObjectResponse>> response = | ||
client.getObject( | ||
GetObjectRequest.builder() | ||
.bucket(Constants.BENCHMARK_BUCKET) | ||
.key(Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key) | ||
.range(rangeOf(read.getStart(), read.getStart() + read.getLength() - 1)) | ||
.build(), | ||
AsyncResponseTransformer.toBlockingInputStream()); | ||
|
||
try { | ||
System.out.println(IoUtils.toUtf8String(response.get()).hashCode()); | ||
} catch (Exception e) { | ||
throw new RuntimeException("Could not finish read", e); | ||
} | ||
} | ||
|
||
private void doReadWithStream(S3SeekableInputStream stream, Read range) { | ||
try { | ||
stream.seek(range.getStart()); | ||
|
||
int len = (int) range.getLength(); | ||
byte[] buf = new byte[len]; | ||
stream.read(buf, 0, len); | ||
String content = new String(buf, StandardCharsets.UTF_8); | ||
System.out.println(content.hashCode()); | ||
} catch (IOException e) { | ||
new RuntimeException( | ||
String.format( | ||
"Could not fully read range %s-%s with SeekableStream", | ||
range.getStart(), range.getStart() + range.getLength()), | ||
e); | ||
} | ||
} | ||
|
||
private String rangeOf(long start, long end) { | ||
return String.format("bytes=%s-%s", start, end); | ||
} | ||
|
||
private S3SeekableInputStream getStreamForKey(String key) { | ||
return new S3SeekableInputStream( | ||
S3URI.of(Constants.BENCHMARK_BUCKET, Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key)); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
package com.amazon.connector.s3.benchmark; | ||
|
||
import com.amazon.connector.s3.S3SeekableInputStream; | ||
import com.amazon.connector.s3.datagen.Constants; | ||
import com.amazon.connector.s3.util.S3URI; | ||
import java.io.IOException; | ||
import java.util.concurrent.CompletableFuture; | ||
import org.openjdk.jmh.annotations.Benchmark; | ||
import org.openjdk.jmh.annotations.BenchmarkMode; | ||
import org.openjdk.jmh.annotations.Fork; | ||
import org.openjdk.jmh.annotations.Measurement; | ||
import org.openjdk.jmh.annotations.Mode; | ||
import org.openjdk.jmh.annotations.Param; | ||
import org.openjdk.jmh.annotations.Scope; | ||
import org.openjdk.jmh.annotations.State; | ||
import org.openjdk.jmh.annotations.Warmup; | ||
import software.amazon.awssdk.core.ResponseInputStream; | ||
import software.amazon.awssdk.core.async.AsyncResponseTransformer; | ||
import software.amazon.awssdk.services.s3.S3AsyncClient; | ||
import software.amazon.awssdk.services.s3.model.GetObjectRequest; | ||
import software.amazon.awssdk.services.s3.model.GetObjectResponse; | ||
import software.amazon.awssdk.utils.IoUtils; | ||
|
||
/** | ||
* Benchmarks which just read data sequentially. Useful for catching regressions in prefetching and | ||
* regressions in how much we utilise CRT. | ||
*/ | ||
@Fork(1) | ||
@State(Scope.Benchmark) | ||
@Warmup(iterations = 3) | ||
@Measurement(iterations = 15) | ||
@BenchmarkMode(Mode.SingleShotTime) | ||
public class SequentialReadBenchmark { | ||
|
||
@Param( | ||
value = { | ||
"random-1mb.txt", | ||
"random-4mb.txt", | ||
"random-16mb.txt", | ||
// TODO: Extend this parameter to bigger objects once we improve performance | ||
// https://app.asana.com/0/1206885953994785/1207212328457565/f | ||
// "random-64mb.txt", | ||
// "random-128mb.txt", | ||
// "random-256mb.txt" | ||
}) | ||
private String key; | ||
|
||
/** | ||
* Not a perfect baseline but will do for now. Use the standard S3Async client for sequential | ||
* reads and compare its performance to seekable stream. | ||
*/ | ||
@Benchmark | ||
public void testSequentialRead__withStandardAsyncClient() throws IOException { | ||
S3AsyncClient client = S3AsyncClient.create(); | ||
CompletableFuture<ResponseInputStream<GetObjectResponse>> response = | ||
client.getObject( | ||
GetObjectRequest.builder() | ||
.bucket(Constants.BENCHMARK_BUCKET) | ||
.key(Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key) | ||
.build(), | ||
AsyncResponseTransformer.toBlockingInputStream()); | ||
|
||
String content = IoUtils.toUtf8String(response.join()); | ||
System.out.println(content.hashCode()); | ||
} | ||
|
||
/** Test sequential reads with seekable streams. */ | ||
@Benchmark | ||
public void testSequentialRead__withSeekableStream() throws IOException { | ||
S3SeekableInputStream stream = | ||
new S3SeekableInputStream( | ||
S3URI.of(Constants.BENCHMARK_BUCKET, Constants.BENCHMARK_DATA_PREFIX_SEQUENTIAL + key)); | ||
|
||
String content = IoUtils.toUtf8String(stream); | ||
System.out.println(content.hashCode()); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
package com.amazon.connector.s3.datagen;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import java.util.List;
import lombok.Builder;
import lombok.Data;

/**
 * Class defining objects used by microbenchmarks and defining the shapes of some read patterns we
 * want to test for.
 */
public class BenchmarkData {

  /**
   * Class describing a benchmark object and helper methods defining read patterns (forward seeking,
   * backward seeking, parquet-like jumps) on the object.
   */
  @Data
  @Builder
  public static class BenchmarkObject {
    private final String keyName;
    private final long size;

    /**
     * Returns a read pattern that jumps forward through the object in 20% increments, reading 10%
     * of the object at each stop. Half of the content ends up being read, the other half skipped.
     *
     * <p>Illustration of the pattern (number denotes order of read):
     * 1111111111---2222222222---3333333333--- ... --- 5555555555--- | 0% | 20% | 40 % | 80% | 100%
     */
    public List<Read> getForwardSeekReadPattern() {
      ImmutableList.Builder<Read> pattern = ImmutableList.builder();
      for (int startPercent : new int[] {0, 20, 40, 60, 80}) {
        pattern.add(Read.builder().start(percent(startPercent)).length(percent(10)).build());
      }
      return pattern.build();
    }

    /** The forward seek pattern, replayed in reverse order. */
    public List<Read> getBackwardSeekReadPattern() {
      return Lists.reverse(getForwardSeekReadPattern());
    }

    /** A tail dance (footer-style reads at the end of the object) followed by reading 50%. */
    public List<Read> getParquetLikeReadPattern() {
      Read tailMagic = Read.builder().start(size - 1 - 4).length(4).build();
      Read tailFooter =
          Read.builder()
              .start(size - 8 * Constants.ONE_KB_IN_BYTES)
              .length(8 * Constants.ONE_KB_IN_BYTES)
              .build();

      // Two contiguous chunks covering half of the object
      Read secondHalfChunk = Read.builder().start(percent(50)).length(percent(30)).build();
      Read firstChunk = Read.builder().start(0).length(percent(20)).build();

      return ImmutableList.of(tailMagic, tailFooter, secondHalfChunk, firstChunk);
    }

    /**
     * Returns x% of the object size. Used above to seek to relative positions in the object when
     * defining read patterns.
     */
    private int percent(int x) {
      return (int) (x * (size / 100));
    }
  }

  /** Object representing a read. Has a start and a length. */
  @Data
  @Builder
  public static class Read {
    long start;
    long length;
  }

  /**
   * The full set of benchmark objects. Needed because JMH only lets us parameterise benchmarks
   * with Strings, so benchmarks look objects up here by key name.
   */
  public static final List<BenchmarkObject> BENCHMARK_OBJECTS =
      ImmutableList.of(
          benchmarkObject("random-1mb.txt", 1),
          benchmarkObject("random-4mb.txt", 4),
          benchmarkObject("random-16mb.txt", 16),
          benchmarkObject("random-64mb.txt", 64),
          benchmarkObject("random-128mb.txt", 128),
          benchmarkObject("random-256mb.txt", 256));

  /** Builds a benchmark object definition for a key of the given size in megabytes. */
  private static BenchmarkObject benchmarkObject(String keyName, int sizeInMb) {
    return BenchmarkObject.builder()
        .keyName(keyName)
        .size(sizeInMb * Constants.ONE_MB_IN_BYTES)
        .build();
  }

  /** Returns a benchmark object by name. */
  public static BenchmarkObject getBenchMarkObjectByName(String name) {
    for (BenchmarkObject object : BENCHMARK_OBJECTS) {
      if (object.getKeyName().equals(name)) {
        return object;
      }
    }
    throw new RuntimeException("Cannot find benchmark object with name " + name);
  }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we have some script which can take the bucket name/prefix as a command line argument, create the bucket and generate the data if they do not exist, and then run the benchmark as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's a great suggestion. It will be very useful for new contributors too once this gets open sourced. For now, I will try to not block on it so created a backlog item for this.