From 2cb7a16a2f9c917b90a3749e732aed42ed998515 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 23 Oct 2024 16:33:31 +0900 Subject: [PATCH] [SPARK-50080][SQL][TESTS] Add benchmark cases for parquet adaptive bloom filter in BloomFilterBenchmark ### What changes were proposed in this pull request? Parquet's AdaptiveBlockSplitBloomFilter is a technique for generating a bloom filter with the optimal bit size according to the number of distinct real data values. It is not free, however, because it maintains multiple BloomFilter candidates at runtime, which could increase CPU usage or write time. This pull request adds benchmark cases that compare adaptive bloom filters against the case that uses the default BloomFilter size. ### Why are the changes needed? To improve benchmark coverage of common user-oriented features of the Parquet data source. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? The regenerated benchmark golden files are attached. ### Was this patch authored or co-authored using generative AI tooling? no Closes #48609 from yaooqinn/SPARK-50080. 
Authored-by: Kent Yao Signed-off-by: Hyukjin Kwon --- .../BloomFilterBenchmark-jdk21-results.txt | 104 +++++++++--------- .../BloomFilterBenchmark-results.txt | 104 +++++++++--------- .../benchmark/BloomFilterBenchmark.scala | 10 ++ 3 files changed, 118 insertions(+), 100 deletions(-) diff --git a/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt index b24a9ad0bf023..5cf56352fa761 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-jdk21-results.txt @@ -2,191 +2,195 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 7996 8147 214 12.5 80.0 1.0X -With bloom filter 9835 9843 13 10.2 98.3 0.8X +Without bloom filter 8070 8132 88 12.4 80.7 1.0X +With bloom filter 10025 10082 81 10.0 100.2 0.8X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 857 882 27 116.7 8.6 1.0X -With bloom filter, blocksize: 2097152 578 599 18 173.1 5.8 1.5X 
+Without bloom filter, blocksize: 2097152 882 890 7 113.4 8.8 1.0X +With bloom filter, blocksize: 2097152 567 577 10 176.4 5.7 1.6X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 844 851 9 118.5 8.4 1.0X -With bloom filter, blocksize: 4194304 551 588 27 181.4 5.5 1.5X +Without bloom filter, blocksize: 4194304 810 836 22 123.4 8.1 1.0X +With bloom filter, blocksize: 4194304 550 568 22 181.8 5.5 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 837 861 23 119.4 8.4 1.0X -With bloom filter, blocksize: 6291456 555 591 54 180.2 5.5 1.5X +Without bloom filter, blocksize: 6291456 823 836 11 121.5 8.2 1.0X +With bloom filter, blocksize: 6291456 540 563 17 185.3 5.4 1.5X ================================================================================================ ORC Read 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 828 847 16 120.7 8.3 1.0X -With bloom filter, blocksize: 8388608 529 560 39 189.0 5.3 1.6X +Without bloom filter, blocksize: 8388608 797 821 21 125.5 8.0 1.0X +With bloom filter, blocksize: 8388608 533 553 23 187.5 5.3 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 845 851 7 118.4 8.4 1.0X -With bloom filter, blocksize: 12582912 547 578 44 182.7 5.5 1.5X +Without bloom filter, blocksize: 12582912 859 876 15 116.4 8.6 1.0X +With bloom filter, blocksize: 12582912 545 576 22 183.4 5.5 1.6X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 815 832 15 122.7 8.1 1.0X -With bloom filter, blocksize: 16777216 534 559 26 187.1 5.3 1.5X +Without bloom filter, blocksize: 16777216 810 841 26 123.4 8.1 1.0X +With bloom filter, blocksize: 16777216 554 575 15 180.5 5.5 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 801 817 23 124.8 8.0 1.0X -With bloom filter, blocksize: 33554432 528 538 11 189.4 5.3 1.5X +Without bloom filter, blocksize: 33554432 845 852 7 118.4 8.4 1.0X +With bloom filter, blocksize: 33554432 545 564 16 183.4 5.5 1.5X ================================================================================================ Parquet Write ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor -Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter 12129 12161 46 8.2 121.3 1.0X -With bloom filter 20231 20267 50 4.9 202.3 0.6X +Write 100M 
rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +Without bloom filter 12141 12156 21 8.2 121.4 1.0X +With bloom filter 21175 21296 172 4.7 211.7 0.6X +With adaptive bloom filter & 3 candidates 20846 20897 71 4.8 208.5 0.6X +With adaptive bloom filter & 5 candidates 20731 20989 365 4.8 207.3 0.6X +With adaptive bloom filter & 9 candidates 23208 23264 79 4.3 232.1 0.5X +With adaptive bloom filter & 15 candidates 23293 23349 78 4.3 232.9 0.5X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 422 461 41 237.1 4.2 1.0X -With bloom filter, blocksize: 2097152 170 179 6 589.5 1.7 2.5X +Without bloom filter, blocksize: 2097152 451 502 37 221.9 4.5 1.0X +With bloom filter, blocksize: 2097152 174 186 12 573.8 1.7 2.6X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 397 421 17 251.6 4.0 1.0X -With bloom filter, blocksize: 4194304 126 140 11 791.4 1.3 3.1X +Without bloom filter, blocksize: 4194304 404 409 4 247.6 4.0 1.0X +With bloom filter, blocksize: 4194304 139 150 7 719.2 1.4 2.9X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 388 397 5 257.8 3.9 1.0X -With bloom filter, blocksize: 6291456 150 159 9 667.1 1.5 2.6X +Without bloom filter, blocksize: 6291456 416 423 7 240.5 4.2 1.0X +With bloom filter, blocksize: 6291456 141 152 10 709.9 1.4 3.0X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 380 387 5 263.1 3.8 1.0X -With bloom filter, blocksize: 8388608 170 183 9 587.9 1.7 2.2X +Without bloom filter, blocksize: 8388608 419 
432 10 238.6 4.2 1.0X +With bloom filter, blocksize: 8388608 210 223 7 476.2 2.1 2.0X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 396 401 5 252.2 4.0 1.0X -With bloom filter, blocksize: 12582912 301 335 20 332.0 3.0 1.3X +Without bloom filter, blocksize: 12582912 422 430 9 236.8 4.2 1.0X +With bloom filter, blocksize: 12582912 325 330 4 307.2 3.3 1.3X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 404 414 7 247.7 4.0 1.0X -With bloom filter, blocksize: 16777216 357 361 5 280.1 3.6 1.1X +Without bloom filter, blocksize: 16777216 420 436 22 238.3 4.2 1.0X +With bloom filter, blocksize: 16777216 398 428 29 251.2 4.0 1.1X ================================================================================================ Parquet Read 
================================================================================================ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 408 419 19 244.8 4.1 1.0X -With bloom filter, blocksize: 33554432 410 419 9 244.1 4.1 1.0X +Without bloom filter, blocksize: 33554432 428 439 9 233.5 4.3 1.0X +With bloom filter, blocksize: 33554432 430 441 15 232.4 4.3 1.0X diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt index 1f77c27604fab..286df98479f97 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -2,191 +2,195 @@ ORC Write ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter 8051 8351 425 12.4 80.5 1.0X -With bloom filter 9881 9902 29 10.1 98.8 0.8X +Without bloom filter 8021 8137 165 12.5 80.2 1.0X +With bloom filter 10132 10186 76 9.9 101.3 0.8X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 
64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 902 927 26 110.8 9.0 1.0X -With bloom filter, blocksize: 2097152 549 594 38 182.0 5.5 1.6X +Without bloom filter, blocksize: 2097152 876 940 61 114.2 8.8 1.0X +With bloom filter, blocksize: 2097152 588 618 21 169.9 5.9 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 804 807 2 124.4 8.0 1.0X -With bloom filter, blocksize: 4194304 522 537 22 191.6 5.2 1.5X +Without bloom filter, blocksize: 4194304 837 839 2 119.4 8.4 1.0X +With bloom filter, blocksize: 4194304 579 601 34 172.7 5.8 1.4X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 776 789 11 128.9 7.8 1.0X -With bloom filter, blocksize: 6291456 514 526 11 194.7 5.1 1.5X +Without bloom filter, blocksize: 6291456 787 797 9 127.0 7.9 1.0X +With bloom filter, blocksize: 6291456 532 548 12 188.1 5.3 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 784 790 7 127.5 7.8 1.0X -With bloom filter, blocksize: 8388608 512 522 15 195.2 5.1 1.5X +Without bloom filter, blocksize: 8388608 796 799 4 125.7 8.0 1.0X +With bloom filter, blocksize: 8388608 534 548 10 187.1 5.3 1.5X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 826 832 9 121.0 8.3 1.0X -With bloom filter, blocksize: 12582912 500 506 6 200.0 5.0 1.7X +Without bloom filter, blocksize: 12582912 836 
839 3 119.7 8.4 1.0X +With bloom filter, blocksize: 12582912 517 544 19 193.4 5.2 1.6X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 16777216 773 775 2 129.3 7.7 1.0X -With bloom filter, blocksize: 16777216 521 528 7 192.0 5.2 1.5X +Without bloom filter, blocksize: 16777216 793 796 4 126.1 7.9 1.0X +With bloom filter, blocksize: 16777216 570 574 5 175.3 5.7 1.4X ================================================================================================ ORC Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 765 772 7 130.8 7.6 1.0X -With bloom filter, blocksize: 33554432 499 503 4 200.4 5.0 1.5X +Without bloom filter, blocksize: 33554432 784 794 12 127.5 7.8 1.0X +With bloom filter, blocksize: 33554432 565 587 27 177.1 5.6 1.4X ================================================================================================ Parquet Write ================================================================================================ 
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor -Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter 11138 11369 327 9.0 111.4 1.0X -With bloom filter 18980 19055 106 5.3 189.8 0.6X +Write 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +Without bloom filter 11173 11180 11 9.0 111.7 1.0X +With bloom filter 19387 19485 138 5.2 193.9 0.6X +With adaptive bloom filter & 3 candidates 19252 19395 202 5.2 192.5 0.6X +With adaptive bloom filter & 5 candidates 19204 19337 188 5.2 192.0 0.6X +With adaptive bloom filter & 9 candidates 19267 19380 160 5.2 192.7 0.6X +With adaptive bloom filter & 15 candidates 19144 19184 57 5.2 191.4 0.6X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 2097152 715 743 28 139.9 7.2 1.0X -With bloom filter, blocksize: 2097152 170 182 7 589.1 1.7 4.2X +Without bloom filter, blocksize: 2097152 447 476 24 223.6 4.5 1.0X +With bloom filter, blocksize: 2097152 177 185 5 565.6 1.8 2.5X 
================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 4194304 665 671 7 150.4 6.6 1.0X -With bloom filter, blocksize: 4194304 127 136 8 788.1 1.3 5.2X +Without bloom filter, blocksize: 4194304 424 440 14 236.0 4.2 1.0X +With bloom filter, blocksize: 4194304 127 135 7 790.4 1.3 3.3X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 6291456 658 665 7 151.9 6.6 1.0X -With bloom filter, blocksize: 6291456 226 236 6 443.2 2.3 2.9X +Without bloom filter, blocksize: 6291456 423 439 16 236.2 4.2 1.0X +With bloom filter, blocksize: 6291456 130 139 9 768.6 1.3 3.3X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server 
VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Without bloom filter, blocksize: 8388608 662 668 7 151.2 6.6 1.0X -With bloom filter, blocksize: 8388608 293 303 6 341.8 2.9 2.3X +Without bloom filter, blocksize: 8388608 426 435 7 235.0 4.3 1.0X +With bloom filter, blocksize: 8388608 204 214 6 489.3 2.0 2.1X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 12582912 671 674 5 149.1 6.7 1.0X -With bloom filter, blocksize: 12582912 550 564 11 181.8 5.5 1.2X +Without bloom filter, blocksize: 12582912 426 447 23 234.5 4.3 1.0X +With bloom filter, blocksize: 12582912 295 306 8 339.2 2.9 1.4X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- 
-Without bloom filter, blocksize: 16777216 671 676 4 149.0 6.7 1.0X -With bloom filter, blocksize: 16777216 571 578 7 175.1 5.7 1.2X +Without bloom filter, blocksize: 16777216 427 441 9 234.0 4.3 1.0X +With bloom filter, blocksize: 16777216 372 392 12 268.5 3.7 1.1X ================================================================================================ Parquet Read ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Without bloom filter, blocksize: 33554432 769 785 17 130.0 7.7 1.0X -With bloom filter, blocksize: 33554432 704 766 54 142.1 7.0 1.1X +Without bloom filter, blocksize: 33554432 508 524 14 197.0 5.1 1.0X +With bloom filter, blocksize: 33554432 439 463 31 227.7 4.4 1.2X diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index 523da0d606346..c34dbdcfcde60 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -98,8 +98,18 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { benchmark.addCase("With bloom filter") { _ => df.write.mode("overwrite") .option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + .option(ParquetOutputFormat.ADAPTIVE_BLOOM_FILTER_ENABLED + "#value", false) .parquet(path + "/withBF") } + Seq(3, 5, 9, 15).foreach { candidates => + benchmark.addCase(s"With adaptive bloom filter & $candidates 
candidates ") { _ => + df.write.mode("overwrite") + .option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + .option(ParquetOutputFormat.ADAPTIVE_BLOOM_FILTER_ENABLED + "#value", true) + .option(ParquetOutputFormat.BLOOM_FILTER_CANDIDATES_NUMBER + "#value", candidates) + .parquet(s"$path/withBF$candidates") + } + } benchmark.run() } }