Skip to content

Commit

Permalink
Enable bloom filters and adapt row group size.
Browse files Browse the repository at this point in the history
  • Loading branch information
kenwenzel committed Jul 18, 2024
1 parent 1501728 commit 1fe5e46
Showing 1 changed file with 5 additions and 5 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package io.github.linkedfactory.core.kvin.parquet;

import io.github.linkedfactory.core.kvin.partitioned.KvinPartitioned;
import net.enilink.commons.util.Pair;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
Expand All @@ -16,18 +15,17 @@

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class ParquetHelpers {
static final Logger log = LoggerFactory.getLogger(ParquetHelpers.class);
static final ReflectData reflectData = new ReflectData(ParquetHelpers.class.getClassLoader());

// parquet file writer config
static final long ROW_GROUP_SIZE = 1048576; // 1 MB
static final long ROW_GROUP_SIZE_MAPPINGS = 1048576L; // 1 MB
static final long ROW_GROUP_SIZE = 134217728L; // 128 MB (Parquet Java default)
static final int PAGE_SIZE = 8192; // 8 KB
static final int DICT_PAGE_SIZE = 1048576; // 1 MB
static final int ZSTD_COMPRESSION_LEVEL = 12; // 1 - 22
Expand Down Expand Up @@ -64,6 +62,7 @@ static ParquetWriter<KvinTupleInternal> getParquetDataWriter(Path dataFile) thro
.withPageSize(PAGE_SIZE)
.withDictionaryPageSize(DICT_PAGE_SIZE)
.withDataModel(reflectData)
.withBloomFilterEnabled("id", true)
.build();
}

Expand All @@ -79,9 +78,10 @@ static ParquetWriter<Object> getParquetMappingWriter(Path dataFile) throws IOExc
.withDictionaryEncoding(true)
//.withCompressionCodec(CompressionCodecName.ZSTD)
.withCompressionCodec(CompressionCodecName.SNAPPY)
.withRowGroupSize(ROW_GROUP_SIZE)
.withRowGroupSize(ROW_GROUP_SIZE_MAPPINGS)
.withPageSize(PAGE_SIZE)
.withDictionaryPageSize(DICT_PAGE_SIZE)
.withBloomFilterEnabled("value", true)
.withDataModel(reflectData)
.build();
}
Expand Down

0 comments on commit 1fe5e46

Please sign in to comment.