Skip to content

Commit

Permalink
Kyligence#6 Configurable record counts for block size checks
Browse files Browse the repository at this point in the history
  • Loading branch information
7mming7 committed Aug 27, 2019
1 parent dc3d6b2 commit 1ee2128
Show file tree
Hide file tree
Showing 36 changed files with 194 additions and 72 deletions.
2 changes: 1 addition & 1 deletion parquet-arrow/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
2 changes: 1 addition & 1 deletion parquet-avro/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
2 changes: 1 addition & 1 deletion parquet-benchmarks/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
2 changes: 1 addition & 1 deletion parquet-cascading/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
2 changes: 1 addition & 1 deletion parquet-cascading3/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
2 changes: 1 addition & 1 deletion parquet-cli/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
2 changes: 1 addition & 1 deletion parquet-column/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ public class ParquetProperties {
public static final int DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK = 10000;
public static final int DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH = 64;
public static final int DEFAULT_PAGE_ROW_COUNT_LIMIT = 20_000;
// Initial value of Builder.pageSizeCheckEnable. Note the naming mismatch: the constant says
// "row count check" while the field and getter it seeds say "page size check".
public static final boolean DEFAULT_PAGE_ROW_COUNT_CHECK_ENABLE = true;
// Initial value of Builder.pageSizeCheckInterval.
public static final int DEFAULT_PAGE_ROW_COUNT_CHECK_INTERVAL = 1;
// Initial value of Builder.singleRowSizeLimit; 512 * 1024 = 524288
// (presumably a size in bytes -- TODO confirm the unit against the code that reads it).
public static final long DEFAULT_SINGLE_ROW_SIZE_LIMIT = 512 * 1024;

public static final ValuesWriterFactory DEFAULT_VALUES_WRITER_FACTORY = new DefaultValuesWriterFactory();

Expand Down Expand Up @@ -87,10 +90,14 @@ public static WriterVersion fromString(String name) {
private final ValuesWriterFactory valuesWriterFactory;
private final int columnIndexTruncateLength;
private final int pageRowCountLimit;
// Assigned once in the constructor from the builder-supplied value; read via isPageSizeCheckEnable().
private final boolean pageSizeCheckEnable;
// Assigned once in the constructor from the builder-supplied value; read via getPageSizeCheckInterval().
private final int pageSizeCheckInterval;
// Assigned once in the constructor from the builder-supplied value; read via getSingleRowSizeLimit().
private final long singleRowSizeLimit;

private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPageSize, boolean enableDict, int minRowCountForPageSizeCheck,
int maxRowCountForPageSizeCheck, boolean estimateNextSizeCheck, ByteBufferAllocator allocator,
ValuesWriterFactory writerFactory, int columnIndexMinMaxTruncateLength, int pageRowCountLimit) {
ValuesWriterFactory writerFactory, int columnIndexMinMaxTruncateLength, int pageRowCountLimit,
boolean pageSizeCheckEnable, int pageSizeCheckInterval, long singleRowSizeLimit) {
this.pageSizeThreshold = pageSize;
this.initialSlabSize = CapacityByteArrayOutputStream
.initialSlabSizeHeuristic(MIN_SLAB_SIZE, pageSizeThreshold, 10);
Expand All @@ -105,6 +112,9 @@ private ParquetProperties(WriterVersion writerVersion, int pageSize, int dictPag
this.valuesWriterFactory = writerFactory;
this.columnIndexTruncateLength = columnIndexMinMaxTruncateLength;
this.pageRowCountLimit = pageRowCountLimit;
this.pageSizeCheckEnable = pageSizeCheckEnable;
this.pageSizeCheckInterval = pageSizeCheckInterval;
this.singleRowSizeLimit = singleRowSizeLimit;
}

public ValuesWriter newRepetitionLevelWriter(ColumnDescriptor path) {
Expand Down Expand Up @@ -145,6 +155,18 @@ public int getPageSizeThreshold() {
return pageSizeThreshold;
}

/**
 * Returns the compile-time default {@link #DEFAULT_PAGE_ROW_COUNT_CHECK_ENABLE}.
 * This is the class-level default, not the value configured on any particular instance.
 *
 * @return the default enable flag
 */
public static boolean isDefaultPageRowCountCheckEnable() {
return DEFAULT_PAGE_ROW_COUNT_CHECK_ENABLE;
}

/**
 * Returns the compile-time default {@link #DEFAULT_PAGE_ROW_COUNT_CHECK_INTERVAL}.
 * This is the class-level default, not the value configured on any particular instance.
 *
 * @return the default check interval
 */
public static int getDefaultPageRowCountCheckInterval() {
return DEFAULT_PAGE_ROW_COUNT_CHECK_INTERVAL;
}

/**
 * Returns the compile-time default {@link #DEFAULT_SINGLE_ROW_SIZE_LIMIT}.
 * This is the class-level default, not the value configured on any particular instance.
 *
 * @return the default single-row size limit
 */
public static long getDefaultSingleRowSizeLimit() {
return DEFAULT_SINGLE_ROW_SIZE_LIMIT;
}

/**
 * Returns the initial slab size held by this instance. The constructor derives it from
 * {@code CapacityByteArrayOutputStream.initialSlabSizeHeuristic(MIN_SLAB_SIZE, pageSizeThreshold, 10)}.
 *
 * @return the initial slab size
 */
public int getInitialSlabSize() {
  return this.initialSlabSize;
}
Expand Down Expand Up @@ -181,6 +203,14 @@ public int getMinRowCountForPageSizeCheck() {
return minRowCountForPageSizeCheck;
}

/**
 * Returns the {@code pageSizeCheckEnable} flag this instance was constructed with.
 *
 * @return the configured enable flag
 */
public boolean isPageSizeCheckEnable() {
  return this.pageSizeCheckEnable;
}

/**
 * Returns the {@code pageSizeCheckInterval} value this instance was constructed with.
 *
 * @return the configured check interval
 */
public int getPageSizeCheckInterval() {
  return this.pageSizeCheckInterval;
}

/**
 * Returns the value of the {@code maxRowCountForPageSizeCheck} field of this instance.
 *
 * @return the stored maximum row count for page size checks
 */
public int getMaxRowCountForPageSizeCheck() {
  return this.maxRowCountForPageSizeCheck;
}
Expand All @@ -201,6 +231,10 @@ public int getPageRowCountLimit() {
return pageRowCountLimit;
}

/**
 * Returns the {@code singleRowSizeLimit} value this instance was constructed with.
 *
 * @return the configured single-row size limit
 */
public long getSingleRowSizeLimit() {
  return this.singleRowSizeLimit;
}

/**
 * Creates a new {@link Builder} whose fields start at their declared initial values.
 *
 * @return a fresh builder
 */
public static Builder builder() {
return new Builder();
}
Expand All @@ -221,6 +255,9 @@ public static class Builder {
private ValuesWriterFactory valuesWriterFactory = DEFAULT_VALUES_WRITER_FACTORY;
private int columnIndexTruncateLength = DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH;
private int pageRowCountLimit = DEFAULT_PAGE_ROW_COUNT_LIMIT;
// Set via withPageRowCheckInterval(int); passed to the ParquetProperties constructor by build().
// NOTE(review): the copy-constructor lines visible in this diff copy singleRowSizeLimit but not
// pageSizeCheckInterval or pageSizeCheckEnable -- confirm whether a copied builder is meant to
// fall back to the defaults for these two.
private int pageSizeCheckInterval = DEFAULT_PAGE_ROW_COUNT_CHECK_INTERVAL;
// Set via withPageRowCheckEnable(boolean); passed to the ParquetProperties constructor by build().
private boolean pageSizeCheckEnable = DEFAULT_PAGE_ROW_COUNT_CHECK_ENABLE;
// Set via withSingleRowSizeLimit(long); passed to the ParquetProperties constructor by build().
private long singleRowSizeLimit = DEFAULT_SINGLE_ROW_SIZE_LIMIT;

// Private no-op constructor: every field keeps its declared initializer.
// Instances are obtained through ParquetProperties.builder().
private Builder() {
}
Expand All @@ -236,6 +273,7 @@ private Builder(ParquetProperties toCopy) {
this.valuesWriterFactory = toCopy.valuesWriterFactory;
this.allocator = toCopy.allocator;
this.pageRowCountLimit = toCopy.pageRowCountLimit;
this.singleRowSizeLimit = toCopy.singleRowSizeLimit;
}

/**
Expand Down Expand Up @@ -330,11 +368,27 @@ public Builder withPageRowCountLimit(int rowCount) {
return this;
}

/**
 * Stores the flag that {@code build()} forwards to the {@code ParquetProperties}
 * constructor as {@code pageSizeCheckEnable}.
 *
 * @param enable the flag value to store
 * @return this builder, for call chaining
 */
public Builder withPageRowCheckEnable(boolean enable) {
  this.pageSizeCheckEnable = enable;
  return this;
}

/**
 * Stores the value that {@code build()} forwards to the {@code ParquetProperties}
 * constructor as {@code pageSizeCheckInterval}. The value is stored as given; no
 * range validation is performed here.
 *
 * @param interval the interval value to store
 * @return this builder, for call chaining
 */
public Builder withPageRowCheckInterval(int interval) {
  this.pageSizeCheckInterval = interval;
  return this;
}

/**
 * Stores the value that {@code build()} forwards to the {@code ParquetProperties}
 * constructor as {@code singleRowSizeLimit}. The value is stored as given; no
 * range validation is performed here.
 *
 * @param size the limit value to store
 * @return this builder, for call chaining
 */
public Builder withSingleRowSizeLimit(long size) {
  this.singleRowSizeLimit = size;
  return this;
}

public ParquetProperties build() {
ParquetProperties properties =
new ParquetProperties(writerVersion, pageSize, dictPageSize,
enableDict, minRowCountForPageSizeCheck, maxRowCountForPageSizeCheck,
estimateNextSizeCheck, allocator, valuesWriterFactory, columnIndexTruncateLength, pageRowCountLimit);
estimateNextSizeCheck, allocator, valuesWriterFactory, columnIndexTruncateLength,
pageRowCountLimit, pageSizeCheckEnable, pageSizeCheckInterval, singleRowSizeLimit);
// we pass a constructed but uninitialized factory to ParquetProperties above as currently
// creation of ValuesWriters is invoked from within ParquetProperties. In the future
// we'd like to decouple that and won't need to pass an object to properties and then pass the
Expand Down
2 changes: 1 addition & 1 deletion parquet-common/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
2 changes: 1 addition & 1 deletion parquet-encoding/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
2 changes: 1 addition & 1 deletion parquet-format-structures/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<artifactId>parquet-format-structures</artifactId>
Expand Down
2 changes: 1 addition & 1 deletion parquet-generator/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
2 changes: 1 addition & 1 deletion parquet-hadoop-bundle/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
2 changes: 1 addition & 1 deletion parquet-hadoop/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
<relativePath>../pom.xml</relativePath>
<version>1.12.0-kylin-r2</version>
<version>1.12.0-kylin-r4</version>
</parent>

<modelVersion>4.0.0</modelVersion>
Expand Down
Loading

0 comments on commit 1ee2128

Please sign in to comment.