diff --git a/Cargo.lock b/Cargo.lock index a2757ff..fcb331c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5fb1d8e4442bd405fdfd1dacb42792696b0cf9cb15882e5d097b742a676d375" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] @@ -130,9 +130,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05048a8932648b63f21c37d88b552ccc8a65afb6dfe9fc9f30ce79174c2e7a85" +checksum = "a9ba0d7248932f4e2a12fb37f0a2e3ec82b3bdedbac2a1dce186e036843b8f8c" dependencies = [ "arrow-arith", "arrow-array", @@ -152,9 +152,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d8a57966e43bfe9a3277984a14c24ec617ad874e4c0e1d2a1b083a39cfbf22c" +checksum = "d60afcdc004841a5c8d8da4f4fa22d64eb19c0c01ef4bcedd77f175a7cf6e38f" dependencies = [ "arrow-array", "arrow-buffer", @@ -167,9 +167,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" +checksum = "7f16835e8599dbbb1659fd869d865254c4cf32c6c2bb60b6942ac9fc36bfa5da" dependencies = [ "ahash", "arrow-buffer", @@ -178,15 +178,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown", + "hashbrown 0.14.5", "num", ] [[package]] name = "arrow-buffer" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c975484888fc95ec4a632cdc98be39c085b1bb518531b0c80c5d462063e5daa1" +checksum = 
"1a1f34f0faae77da6b142db61deba2cb6d60167592b178be317b341440acba80" dependencies = [ "bytes", "half", @@ -195,9 +195,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da26719e76b81d8bc3faad1d4dbdc1bcc10d14704e63dc17fc9f3e7e1e567c8e" +checksum = "450e4abb5775bca0740bec0bcf1b1a5ae07eff43bd625661c4436d8e8e4540c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -216,9 +216,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c13c36dc5ddf8c128df19bab27898eea64bf9da2b555ec1cd17a8ff57fba9ec2" +checksum = "d3a4e4d63830a341713e35d9a42452fbc6241d5f42fa5cf6a4681b8ad91370c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -235,9 +235,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" +checksum = "2b1e618bbf714c7a9e8d97203c806734f012ff71ae3adc8ad1b075689f540634" dependencies = [ "arrow-buffer", "arrow-schema", @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f" +checksum = "f98e983549259a2b97049af7edfb8f28b8911682040e99a94e4ceb1196bd65c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -262,9 +262,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb22284c5a2a01d73cebfd88a33511a3234ab45d66086b2ca2d1228c3498e445" +checksum = "b198b9c6fcf086501730efbbcb483317b39330a116125af7bb06467d04b352a3" dependencies = [ "arrow-array", "arrow-buffer", 
@@ -282,9 +282,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42745f86b1ab99ef96d1c0bcf49180848a64fe2c7a7a0d945bc64fa2b21ba9bc" +checksum = "2427f37b4459a4b9e533045abe87a5183a5e0995a3fc2c2fd45027ae2cc4ef3f" dependencies = [ "arrow-array", "arrow-buffer", @@ -297,9 +297,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd09a518c602a55bd406bcc291a967b284cfa7a63edfbf8b897ea4748aad23c" +checksum = "15959657d92e2261a7a323517640af87f5afd9fd8a6492e424ebee2203c567f6" dependencies = [ "ahash", "arrow-array", @@ -311,18 +311,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" +checksum = "fbf0388a18fd7f7f3fe3de01852d30f54ed5182f9004db700fbe3ba843ed2794" dependencies = [ "bitflags 2.6.0", ] [[package]] name = "arrow-select" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3" +checksum = "b83e5723d307a38bf00ecd2972cd078d1339c7fd3eb044f609958a9a24463f3a" dependencies = [ "ahash", "arrow-array", @@ -334,9 +334,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dc1985b67cb45f6606a248ac2b4a288849f196bab8c657ea5589f47cdd55e6" +checksum = "7ab3db7c09dd826e74079661d84ed01ed06547cf75d52c2818ef776d0d852305" dependencies = [ "arrow-array", "arrow-buffer", @@ -351,9 +351,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.12" +version = "0.4.13" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa" +checksum = "7e614738943d3f68c628ae3dbce7c3daffb196665f82f8c8ea6b65de73c79429" dependencies = [ "bzip2", "flate2", @@ -369,13 +369,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.82" +version = "0.1.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" +checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -395,9 +395,9 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "backtrace" @@ -498,9 +498,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.7.1" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" +checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3" [[package]] name = "bzip2" @@ -525,9 +525,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.19" +version = "1.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d74707dde2ba56f86ae90effb3b43ddd369504387e718014de010cec7959800" +checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1" dependencies = [ "jobserver", "libc", @@ -555,9 +555,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.9.0" +version = "0.10.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb" +checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" dependencies = [ "chrono", "chrono-tz-build", @@ -566,12 +566,11 @@ dependencies = [ [[package]] name = "chrono-tz-build" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c088aee841df9c3041febbb73934cfc39708749bf96dc827e3359cd39ef11b1" +checksum = "e94fea34d77a245229e7746bd2beb786cd2a896f306ff491fb8cecb3074b10a7" dependencies = [ "parse-zoneinfo", - "phf", "phf_codegen", ] @@ -712,7 +711,7 @@ checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", "crossbeam-utils", - "hashbrown", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core", @@ -720,9 +719,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4fd4a99fc70d40ef7e52b243b4a399c3f8d353a40d5ecb200deee05e49c61bb" +checksum = "ee907b081e45e1d14e1f327e89ef134f91fcebad0bfc2dc229fa9f6044379682" dependencies = [ "ahash", "apache-avro", @@ -744,6 +743,7 @@ dependencies = [ "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", + "datafusion-functions-window", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -754,9 +754,9 @@ dependencies = [ "futures", "glob", "half", - "hashbrown", + "hashbrown 0.14.5", "indexmap", - "itertools 0.12.1", + "itertools 0.13.0", "log", "num-traits", "num_cpus", @@ -778,9 +778,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13b3cfbd84c6003594ae1972314e3df303a27ce8ce755fcea3240c90f4c0529" +checksum = 
"6c2b914f6e33c429af7d8696c72a47ed9225d7e2b82c747ebdfa2408ed53579f" dependencies = [ "arrow-schema", "async-trait", @@ -788,13 +788,14 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", + "parking_lot", ] [[package]] name = "datafusion-common" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44fdbc877e3e40dcf88cc8f283d9f5c8851f0a3aa07fee657b1b75ac1ad49b9c" +checksum = "3a84f8e76330c582a6b8ada0b2c599ca46cfe46b7585e458fc3f4092bc722a18" dependencies = [ "ahash", "apache-avro", @@ -804,30 +805,33 @@ dependencies = [ "arrow-schema", "chrono", "half", - "hashbrown", + "hashbrown 0.14.5", "instant", "libc", "num_cpus", "object_store", "parquet", + "paste", "pyo3", "sqlparser", + "tokio", ] [[package]] name = "datafusion-common-runtime" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a7496d1f664179f6ce3a5cbef6566056ccaf3ea4aa72cc455f80e62c1dd86b1" +checksum = "cf08cc30d92720d557df13bd5a5696213bd5ea0f38a866d8d85055d866fba774" dependencies = [ + "log", "tokio", ] [[package]] name = "datafusion-execution" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799e70968c815b611116951e3dd876aef04bf217da31b72eec01ee6a959336a1" +checksum = "86bc4183d5c45b9f068a6f351678a0d1eb1225181424542bb75db18ec280b822" dependencies = [ "arrow", "chrono", @@ -835,7 +839,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "futures", - "hashbrown", + "hashbrown 0.14.5", "log", "object_store", "parking_lot", @@ -846,9 +850,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c1841c409d9518c17971d15c9bae62e629eb937e6fb6c68cd32e9186f8b30d2" +checksum = "202119ce58e4d103e37ae64aab40d4e574c97bdd2bea994bf307b175fcbfa74d" 
dependencies = [ "ahash", "arrow", @@ -856,6 +860,9 @@ dependencies = [ "arrow-buffer", "chrono", "datafusion-common", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", "paste", "serde_json", "sqlparser", @@ -863,11 +870,22 @@ dependencies = [ "strum_macros 0.26.4", ] +[[package]] +name = "datafusion-expr-common" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8b181ce8569216abb01ef3294aa16c0a40d7d39350c2ff01ede00f167a535f2" +dependencies = [ + "arrow", + "datafusion-common", + "paste", +] + [[package]] name = "datafusion-functions" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8e481cf34d2a444bd8fa09b65945f0ce83dc92df8665b761505b3d9f351bebb" +checksum = "6e4124b8066444e05a24472f852e94cf56546c0f4d92d00f018f207216902712" dependencies = [ "arrow", "arrow-buffer", @@ -878,9 +896,9 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", - "hashbrown", + "hashbrown 0.14.5", "hex", - "itertools 0.12.1", + "itertools 0.13.0", "log", "md-5", "rand", @@ -892,9 +910,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b4ece19f73c02727e5e8654d79cd5652de371352c1df3c4ac3e419ecd6943fb" +checksum = "b94acdac235ea21810150a89751617ef2db7e32eba27f54be48a81bde2bfe119" dependencies = [ "ahash", "arrow", @@ -902,17 +920,34 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", "datafusion-physical-expr-common", + "half", "log", "paste", "sqlparser", ] +[[package]] +name = "datafusion-functions-aggregate-common" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5c9ea085bbf900bf16e2ca0f56fc56236b2e4f2e1a2cccb67bcd83c5ab4ad0ef" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", + "rand", +] + [[package]] name = "datafusion-functions-nested" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1474552cc824e8c9c88177d454db5781d4b66757d4aca75719306b8343a5e8d" +checksum = "6c882e61665ed60c5ce9b061c1e587aeb8ae5ae4bcb5e5f2465139ab25328e0f" dependencies = [ "arrow", "arrow-array", @@ -924,17 +959,30 @@ dependencies = [ "datafusion-expr", "datafusion-functions", "datafusion-functions-aggregate", - "itertools 0.12.1", + "datafusion-physical-expr-common", + "itertools 0.13.0", "log", "paste", "rand", ] +[[package]] +name = "datafusion-functions-window" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98a354ce96df3ca6d025093adac9fd55ca09931c9b6f2630140721a95873fde4" +dependencies = [ + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr-common", + "log", +] + [[package]] name = "datafusion-optimizer" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "791ff56f55608bc542d1ea7a68a64bdc86a9413f5a381d06a39fd49c2a3ab906" +checksum = "baf677c74fb7b5a1899ef52709e4a70fff3ed80bdfb4bbe495909810e83d5f39" dependencies = [ "arrow", "async-trait", @@ -942,9 +990,9 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown", + "hashbrown 0.14.5", "indexmap", - "itertools 0.12.1", + "itertools 0.13.0", "log", "paste", "regex-syntax", @@ -952,9 +1000,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a223962b3041304a3e20ed07a21d5de3d88d7e4e71ca192135db6d24e3365a4" +checksum = 
"30b077999f6eb6c43d6b25bc66332a3be2f693c382840f008dd763b8540f9530" dependencies = [ "ahash", "arrow", @@ -968,12 +1016,14 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown", + "hashbrown 0.14.5", "hex", "indexmap", - "itertools 0.12.1", + "itertools 0.13.0", "log", "paste", "petgraph", @@ -982,35 +1032,37 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db5e7d8532a1601cd916881db87a70b0a599900d23f3db2897d389032da53bc6" +checksum = "dce847f885c2b13bbe29f5c8b7948797131aa470af6e16d2a94f4428b4f4f1bd" dependencies = [ "ahash", "arrow", "datafusion-common", - "datafusion-expr", - "hashbrown", + "datafusion-expr-common", + "hashbrown 0.14.5", "rand", ] [[package]] name = "datafusion-physical-optimizer" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb9c78f308e050f5004671039786a925c3fee83b90004e9fcfd328d7febdcc0" +checksum = "d13238e3b9fdd62a4c18760bfef714bb990d1e1d3430e9f416aae4b3cfaa71af" dependencies = [ + "arrow-schema", "datafusion-common", "datafusion-execution", "datafusion-physical-expr", "datafusion-physical-plan", + "itertools 0.13.0", ] [[package]] name = "datafusion-physical-plan" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d1116949432eb2d30f6362707e2846d942e491052a206f2ddcb42d08aea1ffe" +checksum = "faba6f55a7eaf0241d07d12c2640de52742646b10f754485d5192bdfe2c9ceae" dependencies = [ "ahash", "arrow", @@ -1025,13 +1077,14 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", "datafusion-physical-expr", "datafusion-physical-expr-common", 
"futures", "half", - "hashbrown", + "hashbrown 0.14.5", "indexmap", - "itertools 0.12.1", + "itertools 0.13.0", "log", "once_cell", "parking_lot", @@ -1042,9 +1095,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf1d25864c18178d0e51438648f5e0fa08417dbbc39b642c1752cbbb1013abf0" +checksum = "585357d621fa03ea85a7fefca79ebc5ef0ee13a7f82be0762a414879a4d190a7" dependencies = [ "arrow", "chrono", @@ -1053,42 +1106,42 @@ dependencies = [ "datafusion-expr", "datafusion-proto-common", "object_store", - "prost 0.12.6", + "prost 0.13.3", ] [[package]] name = "datafusion-proto-common" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96a683253732334526b1cc5314a73a0f786803831f7e189ed3fe387ac50d7222" +checksum = "4db6534382f92f528bdb5d925b4214c31ffd84fa7fe1eff3ed0d2f1286851ab8" dependencies = [ "arrow", "chrono", "datafusion-common", "object_store", - "prost 0.12.6", + "prost 0.13.3", ] [[package]] name = "datafusion-python" version = "41.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f984077524d65d5ab574aab8ac8503f54fb2756d29278ede471d9298d8c87c8d" +source = "git+https://github.com/apache/datafusion-python#ec8246da3b45e766fe6fb515ade01e0bae73af98" dependencies = [ "arrow", "async-trait", "datafusion", + "datafusion-proto", "futures", "mimalloc", "object_store", "parking_lot", - "prost 0.12.6", - "prost-types 0.12.6", + "prost 0.13.3", + "prost-types 0.13.3", "pyo3", "pyo3-build-config", "rand", "regex-syntax", - "syn 2.0.77", + "syn 2.0.79", "tokio", "url", "uuid", @@ -1096,9 +1149,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45d0180711165fe94015d7c4123eb3e1cf5fb60b1506453200b8d1ce666bef0" +checksum = 
"dad8d96a9b52e1aa24f9373696a815be828193efce7cb0bbd2140b6bb67d1819" dependencies = [ "arrow", "arrow-array", @@ -1122,8 +1175,8 @@ dependencies = [ "futures", "log", "pretty_assertions", - "prost 0.12.6", - "prost-types 0.12.6", + "prost 0.13.3", + "prost-types 0.13.3", "pyo3", "regex", "rustc_version", @@ -1148,12 +1201,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "doc-comment" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" - [[package]] name = "either" version = "1.13.0" @@ -1200,9 +1247,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.33" +version = "1.0.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" +checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" dependencies = [ "crc32fast", "miniz_oxide", @@ -1225,9 +1272,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -1240,9 +1287,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -1250,15 +1297,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -1267,38 +1314,38 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -1335,9 +1382,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.31.0" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" @@ -1385,6 +1432,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" + [[package]] name = "heck" version = "0.4.1" @@ -1454,9 +1507,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.9.4" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" +checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" [[package]] name = "humantime" @@ -1495,7 +1548,7 @@ dependencies = [ "hyper", "hyper-util", "rustls", - "rustls-native-certs 0.8.0", + "rustls-native-certs", "rustls-pki-types", "tokio", "tokio-rustls", @@ -1504,9 +1557,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da62f120a8a37763efb0cf8fdf264b884c7b8b9ac8660b900c8661030c00e6ba" +checksum = "41296eb09f183ac68eec06e03cdbea2e759633d4067b2f6552fc2e009bcad08b" dependencies = [ "bytes", "futures-channel", @@ -1517,16 +1570,15 @@ dependencies = [ "pin-project-lite", "socket2", "tokio", - "tower", "tower-service", "tracing", ] [[package]] name = "iana-time-zone" -version = "0.1.60" +version = "0.1.61" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -1557,12 +1609,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.15.0", ] [[package]] @@ -1591,9 +1643,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.10.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "itertools" @@ -1604,15 +1656,6 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.13.0" @@ -1654,9 +1697,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -1667,9 +1710,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" 
-version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -1678,9 +1721,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "0.8.6" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" dependencies = [ "lexical-util", "static_assertions", @@ -1688,18 +1731,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "0.8.5" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" dependencies = [ "lexical-util", "lexical-write-integer", @@ -1708,9 +1751,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" dependencies = [ "lexical-util", "static_assertions", @@ -1718,9 +1761,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.158" +version = "0.2.159" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" [[package]] name = "libflate" @@ -1742,7 +1785,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" dependencies = [ "core2", - "hashbrown", + "hashbrown 0.14.5", "rle-decode-fast", ] @@ -1957,18 +2000,18 @@ dependencies = [ [[package]] name = "object" -version = "0.36.4" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.10.2" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6da452820c715ce78221e8202ccc599b4a52f3e1eb3eedb487b680c81a8e3f3" +checksum = "25a0c4b3a0e31f8b66f71ad8064521efa773910196e2cde791436f13409f3b45" dependencies = [ "async-trait", "base64", @@ -1997,9 +2040,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33ea5043e58958ee56f3e15a90aee535795cd7dfd319846288d93c5b57d85cbe" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "openssl-probe" @@ -2041,9 +2084,9 @@ dependencies = [ [[package]] name = "parquet" -version = "52.2.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e977b9066b4d3b03555c22bdc442f3fadebd96a39111249113087d0edb2691cd" +checksum = "310c46a70a3ba90d98fec39fa2da6d9d731e544191da6fb56c9d199484d0dd3e" dependencies = [ "ahash", "arrow-array", @@ -2060,7 +2103,7 @@ dependencies = [ "flate2", "futures", 
"half", - "hashbrown", + "hashbrown 0.14.5", "lz4_flex", "num", "num-bigint", @@ -2144,26 +2187,6 @@ dependencies = [ "siphasher", ] -[[package]] -name = "pin-project" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.77", -] - [[package]] name = "pin-project-lite" version = "0.2.14" @@ -2178,15 +2201,15 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "portable-atomic" -version = "1.7.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265" +checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" [[package]] name = "ppv-lite86" @@ -2238,12 +2261,12 @@ dependencies = [ [[package]] name = "prost" -version = "0.12.6" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" dependencies = [ "bytes", - "prost-derive 0.12.6", + "prost-derive 0.13.3", ] [[package]] @@ -2283,15 +2306,15 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.12.6" +version = "0.13.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", - "itertools 0.12.1", + "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -2305,24 +2328,24 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.12.6" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" dependencies = [ - "prost 0.12.6", + "prost 0.13.3", ] [[package]] name = "pyo3" -version = "0.21.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8" +checksum = "15ee168e30649f7f234c3d49ef5a7a6cbf5134289bc46c29ff3155fa3221c225" dependencies = [ "cfg-if", "indoc", "libc", "memoffset", - "parking_lot", + "once_cell", "portable-atomic", "pyo3-build-config", "pyo3-ffi", @@ -2332,9 +2355,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.21.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50" +checksum = "e61cef80755fe9e46bb8a0b8f20752ca7676dcc07a5277d8b7768c6172e529b3" dependencies = [ "once_cell", "target-lexicon", @@ -2342,9 +2365,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.21.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403" +checksum = "67ce096073ec5405f5ee2b8b31f03a68e02aa10d5d4f565eca04acc41931fa1c" dependencies = [ "libc", "pyo3-build-config", @@ -2352,27 +2375,27 @@ dependencies = [ [[package]] 
name = "pyo3-macros" -version = "0.21.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c" +checksum = "2440c6d12bc8f3ae39f1e775266fa5122fd0c8891ce7520fa6048e683ad3de28" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] name = "pyo3-macros-backend" -version = "0.21.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c" +checksum = "1be962f0e06da8f8465729ea2cb71a416d2257dff56cbe40a70d3e62a93ae5d1" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -2383,9 +2406,9 @@ checksum = "b76f1009795ca44bb5aaae8fd3f18953e209259c33d9b059b1f53d58ab7511db" [[package]] name = "quick-xml" -version = "0.36.1" +version = "0.36.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96a05e2e8efddfa51a84ca47cec303fac86c8541b686d37cac5efc0e094417bc" +checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe" dependencies = [ "memchr", "serde", @@ -2480,9 +2503,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.4" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0884ad60e090bf1345b93da0a5de8923c93884cd03f40dfcfddd3b4bee661853" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" dependencies = [ "bitflags 2.6.0", ] @@ -2524,9 +2547,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "reqwest" -version = "0.12.7" +version = "0.12.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" +checksum = 
"f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b" dependencies = [ "base64", "bytes", @@ -2548,7 +2571,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls", - "rustls-native-certs 0.7.3", + "rustls-native-certs", "rustls-pemfile", "rustls-pki-types", "serde", @@ -2624,9 +2647,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.13" +version = "0.23.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" +checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8" dependencies = [ "once_cell", "ring", @@ -2636,19 +2659,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rustls-native-certs" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" -dependencies = [ - "openssl-probe", - "rustls-pemfile", - "rustls-pki-types", - "schannel", - "security-framework", -] - [[package]] name = "rustls-native-certs" version = "0.8.0" @@ -2664,19 +2674,18 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.3" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" dependencies = [ - "base64", "rustls-pki-types", ] [[package]] name = "rustls-pki-types" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" +checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" [[package]] name = "rustls-webpki" @@ -2740,9 +2749,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.11.1" +version = "2.12.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "75da29fe9b9b08fe9d6b22b5b4bcbc75d8db3aa31e639aa56bb62e9d46bfceaf" +checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" dependencies = [ "core-foundation-sys", "libc", @@ -2777,7 +2786,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -2844,24 +2853,23 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "snafu" -version = "0.7.5" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" dependencies = [ - "doc-comment", "snafu-derive", ] [[package]] name = "snafu-derive" -version = "0.7.5" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.79", ] [[package]] @@ -2888,9 +2896,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.49.0" +version = "0.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a404d0e14905361b918cb8afdb73605e25c1d5029312bd9785142dcb3aa49e" +checksum = "b2e5b515a2bd5168426033e9efbfd05500114833916f1d5c268f938b4ee130ac" dependencies = [ "log", "sqlparser_derive", @@ -2904,7 +2912,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -2938,7 +2946,7 @@ dependencies = [ "proc-macro2", "quote", 
"rustversion", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -2951,7 +2959,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -2973,9 +2981,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.77" +version = "2.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" +checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" dependencies = [ "proc-macro2", "quote", @@ -2999,9 +3007,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.12.0" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" +checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" dependencies = [ "cfg-if", "fastrand", @@ -3012,22 +3020,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.63" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" +checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.63" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" +checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -3089,7 +3097,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -3129,27 +3137,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] 
-name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "pin-project", - "pin-project-lite", - "tokio", - "tower-layer", - "tower-service", -] - -[[package]] -name = "tower-layer" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" - [[package]] name = "tower-service" version = "0.3.3" @@ -3175,7 +3162,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -3220,7 +3207,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -3231,9 +3218,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.15" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" +checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" [[package]] name = "unicode-ident" @@ -3243,9 +3230,9 @@ checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] @@ -3258,9 +3245,9 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.13" +version = "0.1.14" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "unindent" @@ -3348,7 +3335,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", "wasm-bindgen-shared", ] @@ -3382,7 +3369,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3395,9 +3382,9 @@ checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" [[package]] name = "wasm-streams" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +checksum = "4e072d4e72f700fb3443d8fe94a39315df013eef1104903cdb0a2abd322bbecd" dependencies = [ "futures-util", "js-sys", @@ -3591,7 +3578,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.79", ] [[package]] @@ -3639,9 +3626,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.12+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index 2f024f8..eb91943 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,17 +29,20 @@ rust-version = "1.62" build = "build.rs" [dependencies] -datafusion = { version = "41.0.0", features = ["pyarrow", "avro"] } -datafusion-proto = "41.0.0" -datafusion-python = "41.0.0" +datafusion = { version = "42.0.0", features = ["pyarrow", "avro"] } 
+datafusion-proto = "42.0.0" + +# temporarily point to revision until version 42 is released +datafusion-python = { git = "https://github.com/apache/datafusion-python" } + futures = "0.3" log = "0.4" -prost = "0.12" -pyo3 = { version = "0.21", features = ["extension-module", "abi3", "abi3-py38"] } +prost = "0.13" +pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } tokio = { version = "1.40", features = ["macros", "rt", "rt-multi-thread", "sync"] } [build-dependencies] -prost-types = "0.12" +prost-types = "0.13" rustc_version = "0.4.0" tonic-build = { version = "0.8", default-features = false, features = ["transport", "prost"] } diff --git a/README.md b/README.md index 33d8ab8..d69ee68 100644 --- a/README.md +++ b/README.md @@ -19,23 +19,34 @@ # DataFusion on Ray -> This was originally a research project donated from [ray-sql](https://github.com/datafusion-contrib/ray-sql) to evaluate performing distributed SQL queries from Python, using -> [Ray](https://www.ray.io/) and [DataFusion](https://github.com/apache/arrow-datafusion). +> This was originally a research project donated from [ray-sql] to evaluate performing distributed SQL queries from +> Python, using [Ray] and [Apache DataFusion] -DataFusion Ray is a distributed SQL query engine powered by the Rust implementation of [Apache Arrow](https://arrow.apache.org/), [Apache DataFusion](https://datafusion.apache.org/) and [Ray](https://www.ray.io/). +[ray-sql]: https://github.com/datafusion-contrib/ray-sql -## Goals +DataFusion Ray is a distributed Python DataFrame and SQL query engine powered by the Rust implementation +of [Apache Arrow], [Apache DataFusion], and [Ray]. -- Demonstrate how easily new systems can be built on top of DataFusion. See the [design documentation](./docs/README.md) - to understand how RaySQL works. -- Drive requirements for DataFusion's [Python bindings](https://github.com/apache/arrow-datafusion-python). 
-- Create content for an interesting blog post or conference talk. +[Ray]: https://www.ray.io/ +[Apache Arrow]: https://arrow.apache.org/ +[Apache DataFusion]: https://datafusion.apache.org/ -## Non Goals +## Comparison to other DataFusion projects -- Re-build the cluster scheduling systems like what [Ballista](https://datafusion.apache.org/ballista/) did. - - Ballista is extremely complex and utilizing Ray feels like it abstracts some of that complexity away. - - Datafusion Ray is delegating cluster management to Ray. +### Comparison to DataFusion Ballista + +- Unlike [DataFusion Ballista], DataFusion Ray does not provide its own distributed scheduler and instead relies on + Ray for this functionality. As a result of this design choice, DataFusion Ray is a much smaller and simpler project. +- DataFusion Ray is Python-first, and DataFusion Ballista is Rust-first + +[DataFusion Ballista]: https://github.com/apache/datafusion-ballista + +### Comparison to DataFusion Python + +- [DataFusion Python] provides a Python DataFrame and SQL API for in-process execution. DataFusion Ray extends + DataFusion Python to provide scalability across multiple nodes. 
+ +[DataFusion Python]: https://github.com/apache/datafusion-python ## Example @@ -43,7 +54,6 @@ Run the following example live in your browser using a Google Colab [notebook](h ```python import os -import pandas as pd import ray from datafusion_ray import DatafusionRayContext @@ -54,7 +64,7 @@ SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) ray.init(resources={"worker": 1}) # Create a context and register a table -ctx = DatafusionRayContext(2, use_ray_shuffle=True) +ctx = DatafusionRayContext(2) # Register either a CSV or Parquet file # ctx.register_csv("tips", f"{SCRIPT_DIR}/tips.csv", True) ctx.register_parquet("tips", f"{SCRIPT_DIR}/tips.parquet") @@ -75,36 +85,6 @@ for record_batch in result_set: - Mature SQL support (CTEs, joins, subqueries, etc) thanks to DataFusion - Support for CSV and Parquet files -## Limitations - -- Requires a shared file system currently. Check details [here](./docs/README.md#distributed-shuffle). - -## Performance - -This chart shows the performance of DataFusion Ray compared to Apache Spark for -[SQLBench-H](https://sqlbenchmarks.io/sqlbench-h/) at a very small data set (10GB), running on a desktop (Threadripper -with 24 physical cores). Both DataFusion Ray and Spark are configured with 24 executors. - -### Overall Time - -DataFusion Ray is ~1.9x faster overall for this scale factor and environment with disk-based shuffle. - -![SQLBench-H Total](./docs/sqlbench-h-total.png) - -### Per Query Time - -Spark is much faster on some queries, likely due to broadcast exchanges, which DataFusion Ray hasn't implemented yet. 
- -![SQLBench-H Per Query](./docs/sqlbench-h-per-query.png) - -### Performance Plan - -Plans on experimenting with the following changes to improve performance: - -- Make better use of Ray futures to run more tasks in parallel -- Use Ray object store for shuffle data transfer to reduce disk I/O cost -- Keep upgrading to newer versions of DataFusion to pick up the latest optimizations - ## Building ```bash diff --git a/src/context.rs b/src/context.rs index 560f506..2702e92 100644 --- a/src/context.rs +++ b/src/context.rs @@ -174,7 +174,7 @@ pub fn deserialize_execution_plan(bytes: Vec) -> PyResult { /// Iterate down an ExecutionPlan and set the input objects for RayShuffleReaderExec. fn _set_inputs_for_ray_shuffle_reader( plan: Arc, - input_partitions: &PyList, + input_partitions: &Bound<'_, PyList>, ) -> Result<()> { if let Some(reader_exec) = plan.as_any().downcast_ref::() { let exec_stage_id = reader_exec.stage_id; @@ -200,8 +200,8 @@ fn _set_inputs_for_ray_shuffle_reader( .map_err(|e| DataFusionError::Execution(format!("{}", e)))? 
.extract::() .map_err(|e| DataFusionError::Execution(format!("{}", e)))?; - let batch = RecordBatch::from_pyarrow( - pytuple + let batch = RecordBatch::from_pyarrow_bound( + &pytuple .get_item(2) .map_err(|e| DataFusionError::Execution(format!("{}", e)))?, ) @@ -235,7 +235,7 @@ fn _execute_partition( )); Python::with_gil(|py| { let input_partitions = inputs - .as_ref(py) + .bind(py) .downcast::() .map_err(|e| DataFusionError::Execution(format!("{}", e)))?; _set_inputs_for_ray_shuffle_reader(plan.plan.clone(), input_partitions) diff --git a/src/proto/datafusion.proto b/src/proto/datafusion.proto index 8a9a8c0..800632c 100644 --- a/src/proto/datafusion.proto +++ b/src/proto/datafusion.proto @@ -24,24 +24,7 @@ option java_multiple_files = true; option java_package = "org.apache.arrow.datafusion.protobuf"; option java_outer_classname = "DatafusionProto"; -message ColumnRelation { - string relation = 1; -} - -message Column { - string name = 1; - ColumnRelation relation = 2; -} - -message DfField{ - Field field = 1; - ColumnRelation qualifier = 2; -} - -message DfSchema { - repeated DfField columns = 1; - map metadata = 2; -} +import "datafusion_common.proto"; // logical plan // LogicalPlan is a nested type @@ -72,6 +55,10 @@ message LogicalPlanNode { ViewTableScanNode view_scan = 24; CustomTableScanNode custom_scan = 25; PrepareNode prepare = 26; + DropViewNode drop_view = 27; + DistinctOnNode distinct_on = 28; + CopyToNode copy_to = 29; + UnnestNode unnest = 30; } } @@ -84,49 +71,49 @@ message ProjectionColumns { repeated string columns = 1; } -message CsvFormat { - bool has_header = 1; - string delimiter = 2; +message LogicalExprNodeCollection { + repeated LogicalExprNode logical_expr_nodes = 1; } -message ParquetFormat { - // Used to be bool enable_pruning = 1; - reserved 1; +message SortExprNodeCollection { + repeated SortExprNode sort_expr_nodes = 1; } -message AvroFormat {} - message ListingTableScanNode { - string table_name = 1; + reserved 1; // was string 
table_name + TableReference table_name = 14; repeated string paths = 2; string file_extension = 3; ProjectionColumns projection = 4; - Schema schema = 5; + datafusion_common.Schema schema = 5; repeated LogicalExprNode filters = 6; repeated string table_partition_cols = 7; bool collect_stat = 8; uint32 target_partitions = 9; oneof FileFormatType { - CsvFormat csv = 10; - ParquetFormat parquet = 11; - AvroFormat avro = 12; + datafusion_common.CsvFormat csv = 10; + datafusion_common.ParquetFormat parquet = 11; + datafusion_common.AvroFormat avro = 12; + datafusion_common.NdJsonFormat json = 15; } - repeated LogicalExprNode file_sort_order = 13; + repeated SortExprNodeCollection file_sort_order = 13; } message ViewTableScanNode { - string table_name = 1; + reserved 1; // was string table_name + TableReference table_name = 6; LogicalPlanNode input = 2; - Schema schema = 3; + datafusion_common.Schema schema = 3; ProjectionColumns projection = 4; string definition = 5; } // Logical Plan to Scan a CustomTableProvider registered at runtime message CustomTableScanNode { - string table_name = 1; + reserved 1; // was string table_name + TableReference table_name = 6; ProjectionColumns projection = 2; - Schema schema = 3; + datafusion_common.Schema schema = 3; repeated LogicalExprNode filters = 4; bytes custom_table_data = 5; } @@ -146,7 +133,7 @@ message SelectionNode { message SortNode { LogicalPlanNode input = 1; - repeated LogicalExprNode expr = 2; + repeated SortExprNode expr = 2; // Maximum number of highest/lowest rows to fetch; negative means no limit int64 fetch = 3; } @@ -170,40 +157,47 @@ message EmptyRelationNode { message CreateExternalTableNode { reserved 1; // was string name - OwnedTableReference name = 12; + TableReference name = 9; string location = 2; string file_type = 3; - bool has_header = 4; - DfSchema schema = 5; - repeated string table_partition_cols = 6; - bool if_not_exists = 7; - string delimiter = 8; - string definition = 9; - string 
file_compression_type = 10; - map options = 11; + datafusion_common.DfSchema schema = 4; + repeated string table_partition_cols = 5; + bool if_not_exists = 6; + string definition = 7; + repeated SortExprNodeCollection order_exprs = 10; + bool unbounded = 11; + map options = 8; + datafusion_common.Constraints constraints = 12; + map column_defaults = 13; } message PrepareNode { string name = 1; - repeated ArrowType data_types = 2; + repeated datafusion_common.ArrowType data_types = 2; LogicalPlanNode input = 3; } message CreateCatalogSchemaNode { string schema_name = 1; bool if_not_exists = 2; - DfSchema schema = 3; + datafusion_common.DfSchema schema = 3; } message CreateCatalogNode { string catalog_name = 1; bool if_not_exists = 2; - DfSchema schema = 3; + datafusion_common.DfSchema schema = 3; +} + +message DropViewNode { + TableReference name = 1; + bool if_exists = 2; + datafusion_common.DfSchema schema = 3; } message CreateViewNode { reserved 1; // was string name - OwnedTableReference name = 5; + TableReference name = 5; LogicalPlanNode input = 2; bool or_replace = 3; string definition = 4; @@ -237,27 +231,11 @@ message WindowNode { repeated LogicalExprNode window_expr = 2; } -enum JoinType { - INNER = 0; - LEFT = 1; - RIGHT = 2; - FULL = 3; - LEFTSEMI = 4; - LEFTANTI = 5; - RIGHTSEMI = 6; - RIGHTANTI = 7; -} - -enum JoinConstraint { - ON = 0; - USING = 1; -} - message JoinNode { LogicalPlanNode left = 1; LogicalPlanNode right = 2; - JoinType join_type = 3; - JoinConstraint join_constraint = 4; + datafusion_common.JoinType join_type = 3; + datafusion_common.JoinConstraint join_constraint = 4; repeated LogicalExprNode left_join_key = 5; repeated LogicalExprNode right_join_key = 6; bool null_equals_null = 7; @@ -268,6 +246,34 @@ message DistinctNode { LogicalPlanNode input = 1; } +message DistinctOnNode { + repeated LogicalExprNode on_expr = 1; + repeated LogicalExprNode select_expr = 2; + repeated SortExprNode sort_expr = 3; + LogicalPlanNode input = 4; +} + 
+message CopyToNode { + LogicalPlanNode input = 1; + string output_url = 2; + bytes file_type = 3; + repeated string partition_by = 7; +} + +message UnnestNode { + LogicalPlanNode input = 1; + repeated datafusion_common.Column exec_columns = 2; + repeated uint64 list_type_columns = 3; + repeated uint64 struct_type_columns = 4; + repeated uint64 dependency_indices = 5; + datafusion_common.DfSchema schema = 6; + UnnestOptions options = 7; +} + +message UnnestOptions { + bool preserve_nulls = 1; +} + message UnionNode { repeated LogicalPlanNode inputs = 1; } @@ -290,26 +296,25 @@ message SelectionExecNode { } message SubqueryAliasNode { + reserved 2; // Was string alias LogicalPlanNode input = 1; - string alias = 2; + TableReference alias = 3; } // logical expressions message LogicalExprNode { oneof ExprType { // column references - Column column = 1; + datafusion_common.Column column = 1; // alias AliasNode alias = 2; - ScalarValue literal = 3; + datafusion_common.ScalarValue literal = 3; // binary expressions BinaryExprNode binary_expr = 4; - // aggregate expressions - AggregateExprNode aggregate_expr = 5; // null checks IsNull is_null_expr = 6; @@ -319,11 +324,10 @@ message LogicalExprNode { BetweenNode between = 9; CaseNode case_ = 10; CastNode cast = 11; - SortExprNode sort = 12; NegativeNode negative = 13; InListNode in_list = 14; - bool wildcard = 15; - ScalarFunctionNode scalar_function = 16; + Wildcard wildcard = 15; + // was ScalarFunctionNode scalar_function = 16; TryCastNode try_cast = 17; // window expressions @@ -335,7 +339,7 @@ message LogicalExprNode { // Scalar UDF expressions ScalarUDFExprNode scalar_udf_expr = 20; - GetIndexedField get_indexed_field = 21; + // GetIndexedField get_indexed_field = 21; GroupingSetNode grouping_set = 22; @@ -355,12 +359,18 @@ message LogicalExprNode { PlaceholderNode placeholder = 34; + Unnest unnest = 35; + } } +message Wildcard { + TableReference qualifier = 1; +} + message PlaceholderNode { string id = 1; - ArrowType 
data_type = 2; + datafusion_common.ArrowType data_type = 2; } message LogicalExprList { @@ -379,11 +389,18 @@ message RollupNode { repeated LogicalExprNode expr = 1; } +message NamedStructField { + datafusion_common.ScalarValue name = 1; +} +message ListIndex { + LogicalExprNode key = 1; +} -message GetIndexedField { - LogicalExprNode expr = 1; - ScalarValue key = 2; +message ListRange { + LogicalExprNode start = 1; + LogicalExprNode stop = 2; + LogicalExprNode stride = 3; } message IsNull { @@ -425,6 +442,7 @@ message Not { message AliasNode { LogicalExprNode expr = 1; string alias = 2; + repeated TableReference relation = 3; } message BinaryExprNode { @@ -439,135 +457,35 @@ message NegativeNode { LogicalExprNode expr = 1; } +message Unnest { + repeated LogicalExprNode exprs = 1; +} + message InListNode { LogicalExprNode expr = 1; repeated LogicalExprNode list = 2; bool negated = 3; } -enum ScalarFunction { - Abs = 0; - Acos = 1; - Asin = 2; - Atan = 3; - Ascii = 4; - Ceil = 5; - Cos = 6; - Digest = 7; - Exp = 8; - Floor = 9; - Ln = 10; - Log = 11; - Log10 = 12; - Log2 = 13; - Round = 14; - Signum = 15; - Sin = 16; - Sqrt = 17; - Tan = 18; - Trunc = 19; - Array = 20; - RegexpMatch = 21; - BitLength = 22; - Btrim = 23; - CharacterLength = 24; - Chr = 25; - Concat = 26; - ConcatWithSeparator = 27; - DatePart = 28; - DateTrunc = 29; - InitCap = 30; - Left = 31; - Lpad = 32; - Lower = 33; - Ltrim = 34; - MD5 = 35; - NullIf = 36; - OctetLength = 37; - Random = 38; - RegexpReplace = 39; - Repeat = 40; - Replace = 41; - Reverse = 42; - Right = 43; - Rpad = 44; - Rtrim = 45; - SHA224 = 46; - SHA256 = 47; - SHA384 = 48; - SHA512 = 49; - SplitPart = 50; - StartsWith = 51; - Strpos = 52; - Substr = 53; - ToHex = 54; - ToTimestamp = 55; - ToTimestampMillis = 56; - ToTimestampMicros = 57; - ToTimestampSeconds = 58; - Now = 59; - Translate = 60; - Trim = 61; - Upper = 62; - Coalesce = 63; - Power = 64; - StructFun = 65; - FromUnixtime = 66; - Atan2 = 67; - DateBin = 68; - 
ArrowTypeof = 69; - CurrentDate = 70; - CurrentTime = 71; - Uuid = 72; -} - -message ScalarFunctionNode { - ScalarFunction fun = 1; - repeated LogicalExprNode args = 2; -} - -enum AggregateFunction { - MIN = 0; - MAX = 1; - SUM = 2; - AVG = 3; - COUNT = 4; - APPROX_DISTINCT = 5; - ARRAY_AGG = 6; - VARIANCE = 7; - VARIANCE_POP = 8; - COVARIANCE = 9; - COVARIANCE_POP = 10; - STDDEV = 11; - STDDEV_POP = 12; - CORRELATION = 13; - APPROX_PERCENTILE_CONT = 14; - APPROX_MEDIAN = 15; - APPROX_PERCENTILE_CONT_WITH_WEIGHT = 16; - GROUPING = 17; - MEDIAN = 18; -} - -message AggregateExprNode { - AggregateFunction aggr_function = 1; - repeated LogicalExprNode expr = 2; - bool distinct = 3; - LogicalExprNode filter = 4; -} message AggregateUDFExprNode { string fun_name = 1; repeated LogicalExprNode args = 2; + bool distinct = 5; LogicalExprNode filter = 3; + repeated SortExprNode order_by = 4; + optional bytes fun_definition = 6; } message ScalarUDFExprNode { string fun_name = 1; repeated LogicalExprNode args = 2; + optional bytes fun_definition = 3; } enum BuiltInWindowFunction { - ROW_NUMBER = 0; + UNSPECIFIED = 0; // https://protobuf.dev/programming-guides/dos-donts/#unspecified-enum + // ROW_NUMBER = 0; RANK = 1; DENSE_RANK = 2; PERCENT_RANK = 3; @@ -582,15 +500,16 @@ enum BuiltInWindowFunction { message WindowExprNode { oneof window_function { - AggregateFunction aggr_function = 1; BuiltInWindowFunction built_in_function = 2; - // udaf = 3 + string udaf = 3; + string udwf = 9; } LogicalExprNode expr = 4; repeated LogicalExprNode partition_by = 5; - repeated LogicalExprNode order_by = 6; + repeated SortExprNode order_by = 6; // repeated LogicalExprNode filter = 7; WindowFrame window_frame = 8; + optional bytes fun_definition = 10; } message BetweenNode { @@ -634,12 +553,12 @@ message WhenThen { message CastNode { LogicalExprNode expr = 1; - ArrowType arrow_type = 2; + datafusion_common.ArrowType arrow_type = 2; } message TryCastNode { LogicalExprNode expr = 1; - ArrowType 
arrow_type = 2; + datafusion_common.ArrowType arrow_type = 2; } message SortExprNode { @@ -672,242 +591,26 @@ enum WindowFrameBoundType { message WindowFrameBound { WindowFrameBoundType window_frame_bound_type = 1; - ScalarValue bound_value = 2; + datafusion_common.ScalarValue bound_value = 2; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Arrow Data Types /////////////////////////////////////////////////////////////////////////////////////////////////// -message Schema { - repeated Field columns = 1; -} - -message Field { - // name of the field - string name = 1; - ArrowType arrow_type = 2; - bool nullable = 3; - // for complex data types like structs, unions - repeated Field children = 4; -} - message FixedSizeBinary{ int32 length = 1; } -message Timestamp{ - TimeUnit time_unit = 1; - string timezone = 2; -} - enum DateUnit{ Day = 0; DateMillisecond = 1; } -enum TimeUnit{ - Second = 0; - Millisecond = 1; - Microsecond = 2; - Nanosecond = 3; -} - -enum IntervalUnit{ - YearMonth = 0; - DayTime = 1; - MonthDayNano = 2; -} - -message Decimal{ - reserved 1, 2; - uint32 precision = 3; - int32 scale = 4; -} - -message List{ - Field field_type = 1; -} - -message FixedSizeList{ - Field field_type = 1; - int32 list_size = 2; -} - -message Dictionary{ - ArrowType key = 1; - ArrowType value = 2; -} - -message Struct{ - repeated Field sub_field_types = 1; -} - -enum UnionMode{ - sparse = 0; - dense = 1; -} - -message Union{ - repeated Field union_types = 1; - UnionMode union_mode = 2; - repeated int32 type_ids = 3; -} - -message ScalarListValue{ - // encode null explicitly to distinguish a list with a null value - // from a list with no values) - bool is_null = 3; - Field field = 1; - repeated ScalarValue values = 2; -} - -message ScalarTime32Value { - oneof value { - int32 time32_second_value = 1; - int32 time32_millisecond_value = 2; - }; -} - -message ScalarTime64Value { - oneof value { - int64 
time64_microsecond_value = 1; - int64 time64_nanosecond_value = 2; - }; -} - -message ScalarTimestampValue { - oneof value { - int64 time_microsecond_value = 1; - int64 time_nanosecond_value = 2; - int64 time_second_value = 3; - int64 time_millisecond_value = 4; - }; - string timezone = 5; -} - -message ScalarDictionaryValue { - ArrowType index_type = 1; - ScalarValue value = 2; -} - -message IntervalMonthDayNanoValue { - int32 months = 1; - int32 days = 2; - int64 nanos = 3; -} - -message StructValue { - // Note that a null struct value must have one or more fields, so we - // encode a null StructValue as one witth an empty field_values - // list. - repeated ScalarValue field_values = 2; - repeated Field fields = 3; -} - -message ScalarFixedSizeBinary{ - bytes values = 1; - int32 length = 2; -} - -message ScalarValue{ - // was PrimitiveScalarType null_value = 19; - reserved 19; - - oneof value { - // was PrimitiveScalarType null_value = 19; - // Null value of any type - ArrowType null_value = 33; - - bool bool_value = 1; - string utf8_value = 2; - string large_utf8_value = 3; - int32 int8_value = 4; - int32 int16_value = 5; - int32 int32_value = 6; - int64 int64_value = 7; - uint32 uint8_value = 8; - uint32 uint16_value = 9; - uint32 uint32_value = 10; - uint64 uint64_value = 11; - float float32_value = 12; - double float64_value = 13; - // Literal Date32 value always has a unit of day - int32 date_32_value = 14; - ScalarTime32Value time32_value = 15; - ScalarListValue list_value = 17; - //WAS: ScalarType null_list_value = 18; - - Decimal128 decimal128_value = 20; - int64 date_64_value = 21; - int32 interval_yearmonth_value = 24; - int64 interval_daytime_value = 25; - ScalarTimestampValue timestamp_value = 26; - ScalarDictionaryValue dictionary_value = 27; - bytes binary_value = 28; - bytes large_binary_value = 29; - ScalarTime64Value time64_value = 30; - IntervalMonthDayNanoValue interval_month_day_nano = 31; - StructValue struct_value = 32; - 
ScalarFixedSizeBinary fixed_size_binary_value = 34; - } -} - -message Decimal128{ - bytes value = 1; - int64 p = 2; - int64 s = 3; -} - -// Serialized data type -message ArrowType{ - oneof arrow_type_enum { - EmptyMessage NONE = 1; // arrow::Type::NA - EmptyMessage BOOL = 2; // arrow::Type::BOOL - EmptyMessage UINT8 = 3; // arrow::Type::UINT8 - EmptyMessage INT8 = 4; // arrow::Type::INT8 - EmptyMessage UINT16 = 5; // represents arrow::Type fields in src/arrow/type.h - EmptyMessage INT16 = 6; - EmptyMessage UINT32 = 7; - EmptyMessage INT32 = 8; - EmptyMessage UINT64 = 9; - EmptyMessage INT64 = 10 ; - EmptyMessage FLOAT16 = 11 ; - EmptyMessage FLOAT32 = 12 ; - EmptyMessage FLOAT64 = 13 ; - EmptyMessage UTF8 = 14 ; - EmptyMessage LARGE_UTF8 = 32; - EmptyMessage BINARY = 15 ; - int32 FIXED_SIZE_BINARY = 16 ; - EmptyMessage LARGE_BINARY = 31; - EmptyMessage DATE32 = 17 ; - EmptyMessage DATE64 = 18 ; - TimeUnit DURATION = 19; - Timestamp TIMESTAMP = 20 ; - TimeUnit TIME32 = 21 ; - TimeUnit TIME64 = 22 ; - IntervalUnit INTERVAL = 23 ; - Decimal DECIMAL = 24 ; - List LIST = 25; - List LARGE_LIST = 26; - FixedSizeList FIXED_SIZE_LIST = 27; - Struct STRUCT = 28; - Union UNION = 29; - Dictionary DICTIONARY = 30; - } +message AnalyzedLogicalPlanType { + string analyzer_name = 1; } -//Useful for representing an empty enum variant in rust -// E.G. 
enum example{One, Two(i32)} -// maps to -// message example{ -// oneof{ -// EmptyMessage One = 1; -// i32 Two = 2; -// } -//} -message EmptyMessage{} - message OptimizedLogicalPlanType { string optimizer_name = 1; } @@ -918,12 +621,18 @@ message OptimizedPhysicalPlanType { message PlanType { oneof plan_type_enum { - EmptyMessage InitialLogicalPlan = 1; + datafusion_common.EmptyMessage InitialLogicalPlan = 1; + AnalyzedLogicalPlanType AnalyzedLogicalPlan = 7; + datafusion_common.EmptyMessage FinalAnalyzedLogicalPlan = 8; OptimizedLogicalPlanType OptimizedLogicalPlan = 2; - EmptyMessage FinalLogicalPlan = 3; - EmptyMessage InitialPhysicalPlan = 4; + datafusion_common.EmptyMessage FinalLogicalPlan = 3; + datafusion_common.EmptyMessage InitialPhysicalPlan = 4; + datafusion_common.EmptyMessage InitialPhysicalPlanWithStats = 9; + datafusion_common.EmptyMessage InitialPhysicalPlanWithSchema = 11; OptimizedPhysicalPlanType OptimizedPhysicalPlan = 5; - EmptyMessage FinalPhysicalPlan = 6; + datafusion_common.EmptyMessage FinalPhysicalPlan = 6; + datafusion_common.EmptyMessage FinalPhysicalPlanWithStats = 10; + datafusion_common.EmptyMessage FinalPhysicalPlanWithSchema = 12; } } @@ -947,7 +656,7 @@ message FullTableReference { string table = 3; } -message OwnedTableReference { +message TableReference { oneof table_reference_enum { BareTableReference bare = 1; PartialTableReference partial = 2; @@ -980,9 +689,80 @@ message PhysicalPlanNode { UnionExecNode union = 19; ExplainExecNode explain = 20; SortPreservingMergeExecNode sort_preserving_merge = 21; + NestedLoopJoinExecNode nested_loop_join = 22; + AnalyzeExecNode analyze = 23; + JsonSinkExecNode json_sink = 24; + SymmetricHashJoinExecNode symmetric_hash_join = 25; + InterleaveExecNode interleave = 26; + PlaceholderRowExecNode placeholder_row = 27; + CsvSinkExecNode csv_sink = 28; + ParquetSinkExecNode parquet_sink = 29; + UnnestExecNode unnest = 30; } } +message PartitionColumn { + string name = 1; + 
datafusion_common.ArrowType arrow_type = 2; +} + + +message FileSinkConfig { + reserved 6; // writer_mode + + string object_store_url = 1; + repeated PartitionedFile file_groups = 2; + repeated string table_paths = 3; + datafusion_common.Schema output_schema = 4; + repeated PartitionColumn table_partition_cols = 5; + bool overwrite = 8; + bool keep_partition_by_columns = 9; +} + +message JsonSink { + FileSinkConfig config = 1; + datafusion_common.JsonWriterOptions writer_options = 2; +} + +message JsonSinkExecNode { + PhysicalPlanNode input = 1; + JsonSink sink = 2; + datafusion_common.Schema sink_schema = 3; + PhysicalSortExprNodeCollection sort_order = 4; +} + +message CsvSink { + FileSinkConfig config = 1; + datafusion_common.CsvWriterOptions writer_options = 2; +} + +message CsvSinkExecNode { + PhysicalPlanNode input = 1; + CsvSink sink = 2; + datafusion_common.Schema sink_schema = 3; + PhysicalSortExprNodeCollection sort_order = 4; +} + +message ParquetSink { + FileSinkConfig config = 1; + datafusion_common.TableParquetOptions parquet_options = 2; +} + +message ParquetSinkExecNode { + PhysicalPlanNode input = 1; + ParquetSink sink = 2; + datafusion_common.Schema sink_schema = 3; + PhysicalSortExprNodeCollection sort_order = 4; +} + +message UnnestExecNode { + PhysicalPlanNode input = 1; + datafusion_common.Schema schema = 2; + repeated uint64 list_type_columns = 3; + repeated uint64 struct_type_columns = 4; + UnnestOptions options = 5; +} + message PhysicalExtensionNode { bytes node = 1; repeated PhysicalPlanNode inputs = 2; @@ -990,11 +770,14 @@ message PhysicalExtensionNode { // physical expressions message PhysicalExprNode { + // Was date_time_interval_expr + reserved 17; + oneof ExprType { // column references PhysicalColumn column = 1; - ScalarValue literal = 2; + datafusion_common.ScalarValue literal = 2; // binary expressions PhysicalBinaryExprNode binary_expr = 3; @@ -1012,39 +795,49 @@ message PhysicalExprNode { PhysicalSortExprNode sort = 10; 
PhysicalNegativeNode negative = 11; PhysicalInListNode in_list = 12; - PhysicalScalarFunctionNode scalar_function = 13; + // was PhysicalScalarFunctionNode scalar_function = 13; PhysicalTryCastNode try_cast = 14; - // window expressions PhysicalWindowExprNode window_expr = 15; PhysicalScalarUdfNode scalar_udf = 16; - - PhysicalDateTimeIntervalExprNode date_time_interval_expr = 17; + // was PhysicalDateTimeIntervalExprNode date_time_interval_expr = 17; PhysicalLikeExprNode like_expr = 18; + + PhysicalExtensionExprNode extension = 19; } } message PhysicalScalarUdfNode { string name = 1; repeated PhysicalExprNode args = 2; - ArrowType return_type = 4; + optional bytes fun_definition = 3; + datafusion_common.ArrowType return_type = 4; } message PhysicalAggregateExprNode { - AggregateFunction aggr_function = 1; + oneof AggregateFunction { + string user_defined_aggr_function = 4; + } repeated PhysicalExprNode expr = 2; + repeated PhysicalSortExprNode ordering_req = 5; bool distinct = 3; + bool ignore_nulls = 6; + optional bytes fun_definition = 7; } message PhysicalWindowExprNode { oneof window_function { - AggregateFunction aggr_function = 1; BuiltInWindowFunction built_in_function = 2; - // udaf = 3 + string user_defined_aggr_function = 3; } - PhysicalExprNode expr = 4; + repeated PhysicalExprNode args = 4; + repeated PhysicalExprNode partition_by = 5; + repeated PhysicalSortExprNode order_by = 6; + WindowFrame window_frame = 7; + string name = 8; + optional bytes fun_definition = 9; } message PhysicalIsNull { @@ -1106,30 +899,30 @@ message PhysicalCaseNode { PhysicalExprNode else_expr = 3; } -message PhysicalScalarFunctionNode { - string name = 1; - ScalarFunction fun = 2; - repeated PhysicalExprNode args = 3; - ArrowType return_type = 4; -} - message PhysicalTryCastNode { PhysicalExprNode expr = 1; - ArrowType arrow_type = 2; + datafusion_common.ArrowType arrow_type = 2; } message PhysicalCastNode { PhysicalExprNode expr = 1; - ArrowType arrow_type = 2; + 
datafusion_common.ArrowType arrow_type = 2; } message PhysicalNegativeNode { PhysicalExprNode expr = 1; } +message PhysicalExtensionExprNode { + bytes expr = 1; + repeated PhysicalExprNode inputs = 2; +} + message FilterExecNode { PhysicalPlanNode input = 1; PhysicalExprNode expr = 2; + uint32 default_filter_selectivity = 3; + repeated uint32 projection = 9; } message FileGroup { @@ -1141,29 +934,45 @@ message ScanLimit { uint32 limit = 1; } +message PhysicalSortExprNodeCollection { + repeated PhysicalSortExprNode physical_sort_expr_nodes = 1; +} + message FileScanExecConf { // Was repeated ConfigOption options = 10; reserved 10; repeated FileGroup file_groups = 1; - Schema schema = 2; + datafusion_common.Schema schema = 2; repeated uint32 projection = 4; ScanLimit limit = 5; - Statistics statistics = 6; + datafusion_common.Statistics statistics = 6; repeated string table_partition_cols = 7; string object_store_url = 8; - repeated PhysicalSortExprNode output_ordering = 9; + repeated PhysicalSortExprNodeCollection output_ordering = 9; } message ParquetScanExecNode { FileScanExecConf base_conf = 1; - LogicalExprNode pruning_predicate = 2; + + // Was pruning predicate based on a logical expr. 
+ reserved 2; + + PhysicalExprNode predicate = 3; } message CsvScanExecNode { FileScanExecConf base_conf = 1; bool has_header = 2; string delimiter = 3; + string quote = 4; + oneof optional_escape { + string escape = 5; + } + oneof optional_comment { + string comment = 6; + } + bool newlines_in_values = 7; } message AvroScanExecNode { @@ -1180,10 +989,32 @@ message HashJoinExecNode { PhysicalPlanNode left = 1; PhysicalPlanNode right = 2; repeated JoinOn on = 3; - JoinType join_type = 4; + datafusion_common.JoinType join_type = 4; PartitionMode partition_mode = 6; bool null_equals_null = 7; JoinFilter filter = 8; + repeated uint32 projection = 9; +} + +enum StreamPartitionMode { + SINGLE_PARTITION = 0; + PARTITIONED_EXEC = 1; +} + +message SymmetricHashJoinExecNode { + PhysicalPlanNode left = 1; + PhysicalPlanNode right = 2; + repeated JoinOn on = 3; + datafusion_common.JoinType join_type = 4; + StreamPartitionMode partition_mode = 6; + bool null_equals_null = 7; + JoinFilter filter = 8; + repeated PhysicalSortExprNode left_sort_exprs = 9; + repeated PhysicalSortExprNode right_sort_exprs = 10; +} + +message InterleaveExecNode { + repeated PhysicalPlanNode inputs = 1; } message UnionExecNode { @@ -1191,11 +1022,18 @@ message UnionExecNode { } message ExplainExecNode { - Schema schema = 1; + datafusion_common.Schema schema = 1; repeated StringifiedPlan stringified_plans = 2; bool verbose = 3; } +message AnalyzeExecNode { + bool verbose = 1; + bool show_statistics = 2; + PhysicalPlanNode input = 3; + datafusion_common.Schema schema = 4; +} + message CrossJoinExecNode { PhysicalPlanNode left = 1; PhysicalPlanNode right = 2; @@ -1207,13 +1045,16 @@ message PhysicalColumn { } message JoinOn { - PhysicalColumn left = 1; - PhysicalColumn right = 2; + PhysicalExprNode left = 1; + PhysicalExprNode right = 2; } message EmptyExecNode { - bool produce_one_row = 1; - Schema schema = 2; + datafusion_common.Schema schema = 1; +} + +message PlaceholderRowExecNode { + 
datafusion_common.Schema schema = 1; } message ProjectionExecNode { @@ -1226,13 +1067,37 @@ enum AggregateMode { PARTIAL = 0; FINAL = 1; FINAL_PARTITIONED = 2; + SINGLE = 3; + SINGLE_PARTITIONED = 4; +} + +message PartiallySortedInputOrderMode { + repeated uint64 columns = 6; } message WindowAggExecNode { PhysicalPlanNode input = 1; - repeated PhysicalExprNode window_expr = 2; - repeated string window_expr_name = 3; - Schema input_schema = 4; + repeated PhysicalWindowExprNode window_expr = 2; + repeated PhysicalExprNode partition_keys = 5; + // Set optional to `None` for `BoundedWindowAggExec`. + oneof input_order_mode { + datafusion_common.EmptyMessage linear = 7; + PartiallySortedInputOrderMode partially_sorted = 8; + datafusion_common.EmptyMessage sorted = 9; + } +} + +message MaybeFilter { + PhysicalExprNode expr = 1; +} + +message MaybePhysicalSortExprs { + repeated PhysicalSortExprNode sort_expr = 1; +} + +message AggLimit { + // wrap into a message to make it optional + uint64 limit = 1; } message AggregateExecNode { @@ -1243,9 +1108,11 @@ message AggregateExecNode { repeated string group_expr_name = 5; repeated string aggr_expr_name = 6; // we need the input schema to the partial aggregate to pass to the final aggregate - Schema input_schema = 7; + datafusion_common.Schema input_schema = 7; repeated PhysicalExprNode null_expr = 8; repeated bool groups = 9; + repeated MaybeFilter filter_expr = 10; + AggLimit limit = 11; } message GlobalLimitExecNode { @@ -1266,16 +1133,27 @@ message SortExecNode { repeated PhysicalExprNode expr = 2; // Maximum number of highest/lowest rows to fetch; negative means no limit int64 fetch = 3; + bool preserve_partitioning = 4; } message SortPreservingMergeExecNode { PhysicalPlanNode input = 1; repeated PhysicalExprNode expr = 2; + // Maximum number of highest/lowest rows to fetch; negative means no limit + int64 fetch = 3; +} + +message NestedLoopJoinExecNode { + PhysicalPlanNode left = 1; + PhysicalPlanNode right = 2; + 
datafusion_common.JoinType join_type = 3; + JoinFilter filter = 4; } message CoalesceBatchesExecNode { PhysicalPlanNode input = 1; uint32 target_batch_size = 2; + optional uint32 fetch = 3; } message CoalescePartitionsExecNode { @@ -1289,35 +1167,40 @@ message PhysicalHashRepartition { message RepartitionExecNode{ PhysicalPlanNode input = 1; + // oneof partition_method { + // uint64 round_robin = 2; + // PhysicalHashRepartition hash = 3; + // uint64 unknown = 4; + // } + Partitioning partitioning = 5; +} + +message Partitioning { oneof partition_method { - uint64 round_robin = 2; - PhysicalHashRepartition hash = 3; - uint64 unknown = 4; + uint64 round_robin = 1; + PhysicalHashRepartition hash = 2; + uint64 unknown = 3; } } message JoinFilter{ PhysicalExprNode expression = 1; repeated ColumnIndex column_indices = 2; - Schema schema = 3; + datafusion_common.Schema schema = 3; } message ColumnIndex{ uint32 index = 1; - JoinSide side = 2; -} - -enum JoinSide{ - LEFT_SIDE = 0; - RIGHT_SIDE = 1; + datafusion_common.JoinSide side = 2; } message PartitionedFile { string path = 1; uint64 size = 2; uint64 last_modified_ns = 3; - repeated ScalarValue partition_values = 4; + repeated datafusion_common.ScalarValue partition_values = 4; FileRange range = 5; + datafusion_common.Statistics statistics = 6; } message FileRange { @@ -1329,19 +1212,5 @@ message PartitionStats { int64 num_rows = 1; int64 num_batches = 2; int64 num_bytes = 3; - repeated ColumnStats column_stats = 4; + repeated datafusion_common.ColumnStats column_stats = 4; } - -message Statistics { - int64 num_rows = 1; - int64 total_byte_size = 2; - repeated ColumnStats column_stats = 3; - bool is_exact = 4; -} - -message ColumnStats { - ScalarValue min_value = 1; - ScalarValue max_value = 2; - uint32 null_count = 3; - uint32 distinct_count = 4; -} \ No newline at end of file diff --git a/src/proto/datafusion_common.proto b/src/proto/datafusion_common.proto new file mode 100644 index 0000000..d1506fc --- /dev/null +++ 
b/src/proto/datafusion_common.proto @@ -0,0 +1,570 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto3"; + +package datafusion_common; + +message ColumnRelation { + string relation = 1; +} + +message Column { + string name = 1; + ColumnRelation relation = 2; +} + +message DfField{ + Field field = 1; + ColumnRelation qualifier = 2; +} + +message DfSchema { + repeated DfField columns = 1; + map metadata = 2; +} + +message CsvFormat { + CsvOptions options = 5; +} + +message ParquetFormat { + // Used to be bool enable_pruning = 1; + reserved 1; + TableParquetOptions options = 2; +} + +message AvroFormat {} + +message NdJsonFormat { + JsonOptions options = 1; +} + + +message PrimaryKeyConstraint{ + repeated uint64 indices = 1; +} + +message UniqueConstraint{ + repeated uint64 indices = 1; +} + +message Constraint{ + oneof constraint_mode{ + PrimaryKeyConstraint primary_key = 1; + UniqueConstraint unique = 2; + } +} + +message Constraints{ + repeated Constraint constraints = 1; +} + +enum JoinType { + INNER = 0; + LEFT = 1; + RIGHT = 2; + FULL = 3; + LEFTSEMI = 4; + LEFTANTI = 5; + RIGHTSEMI = 6; + RIGHTANTI = 7; +} + +enum JoinConstraint { + ON = 0; + USING = 1; +} + +message AvroOptions {} +message ArrowOptions {} + +message Schema { + repeated Field columns = 1; + map metadata = 2; +} + +message Field { + // name of the field + string name = 1; + ArrowType arrow_type = 2; + bool nullable = 3; + // for complex data types like structs, unions + repeated Field children = 4; + map metadata = 5; + int64 dict_id = 6; + bool dict_ordered = 7; +} + +message Timestamp{ + TimeUnit time_unit = 1; + string timezone = 2; +} + +enum TimeUnit{ + Second = 0; + Millisecond = 1; + Microsecond = 2; + Nanosecond = 3; +} + +enum IntervalUnit{ + 
YearMonth = 0; + DayTime = 1; + MonthDayNano = 2; +} + +message Decimal{ + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; +} + +message Decimal256Type{ + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; +} + +message List{ + Field field_type = 1; +} + +message FixedSizeList{ + Field field_type = 1; + int32 list_size = 2; +} + +message Dictionary{ + ArrowType key = 1; + ArrowType value = 2; +} + +message Struct{ + repeated Field sub_field_types = 1; +} + +message Map { + Field field_type = 1; + bool keys_sorted = 2; +} + +enum UnionMode{ + sparse = 0; + dense = 1; +} + +message Union{ + repeated Field union_types = 1; + UnionMode union_mode = 2; + repeated int32 type_ids = 3; +} + +// Used for List/FixedSizeList/LargeList/Struct/Map +message ScalarNestedValue { + message Dictionary { + bytes ipc_message = 1; + bytes arrow_data = 2; + } + + bytes ipc_message = 1; + bytes arrow_data = 2; + Schema schema = 3; + repeated Dictionary dictionaries = 4; +} + +message ScalarTime32Value { + oneof value { + int32 time32_second_value = 1; + int32 time32_millisecond_value = 2; + }; +} + +message ScalarTime64Value { + oneof value { + int64 time64_microsecond_value = 1; + int64 time64_nanosecond_value = 2; + }; +} + +message ScalarTimestampValue { + oneof value { + int64 time_microsecond_value = 1; + int64 time_nanosecond_value = 2; + int64 time_second_value = 3; + int64 time_millisecond_value = 4; + }; + string timezone = 5; +} + +message ScalarDictionaryValue { + ArrowType index_type = 1; + ScalarValue value = 2; +} + +message IntervalDayTimeValue { + int32 days = 1; + int32 milliseconds = 2; +} + +message IntervalMonthDayNanoValue { + int32 months = 1; + int32 days = 2; + int64 nanos = 3; +} + +message UnionField { + int32 field_id = 1; + Field field = 2; +} + +message UnionValue { + // Note that a null union value must have one or more fields, so we + // encode a null UnionValue as one with value_id == 128 + int32 value_id = 1; + ScalarValue value = 2; + 
repeated UnionField fields = 3; + UnionMode mode = 4; +} + +message ScalarFixedSizeBinary{ + bytes values = 1; + int32 length = 2; +} + +message ScalarValue{ + // was PrimitiveScalarType null_value = 19; + reserved 19; + + oneof value { + // was PrimitiveScalarType null_value = 19; + // Null value of any type + ArrowType null_value = 33; + + bool bool_value = 1; + string utf8_value = 2; + string large_utf8_value = 3; + string utf8_view_value = 23; + int32 int8_value = 4; + int32 int16_value = 5; + int32 int32_value = 6; + int64 int64_value = 7; + uint32 uint8_value = 8; + uint32 uint16_value = 9; + uint32 uint32_value = 10; + uint64 uint64_value = 11; + float float32_value = 12; + double float64_value = 13; + // Literal Date32 value always has a unit of day + int32 date_32_value = 14; + ScalarTime32Value time32_value = 15; + ScalarNestedValue large_list_value = 16; + ScalarNestedValue list_value = 17; + ScalarNestedValue fixed_size_list_value = 18; + ScalarNestedValue struct_value = 32; + ScalarNestedValue map_value = 41; + + Decimal128 decimal128_value = 20; + Decimal256 decimal256_value = 39; + + int64 date_64_value = 21; + int32 interval_yearmonth_value = 24; + + int64 duration_second_value = 35; + int64 duration_millisecond_value = 36; + int64 duration_microsecond_value = 37; + int64 duration_nanosecond_value = 38; + + ScalarTimestampValue timestamp_value = 26; + ScalarDictionaryValue dictionary_value = 27; + bytes binary_value = 28; + bytes large_binary_value = 29; + bytes binary_view_value = 22; + ScalarTime64Value time64_value = 30; + IntervalDayTimeValue interval_daytime_value = 25; + IntervalMonthDayNanoValue interval_month_day_nano = 31; + ScalarFixedSizeBinary fixed_size_binary_value = 34; + UnionValue union_value = 42; + } +} + +message Decimal128{ + bytes value = 1; + int64 p = 2; + int64 s = 3; +} + +message Decimal256{ + bytes value = 1; + int64 p = 2; + int64 s = 3; +} + +// Serialized data type +message ArrowType{ + oneof arrow_type_enum { + 
EmptyMessage NONE = 1; // arrow::Type::NA + EmptyMessage BOOL = 2; // arrow::Type::BOOL + EmptyMessage UINT8 = 3; // arrow::Type::UINT8 + EmptyMessage INT8 = 4; // arrow::Type::INT8 + EmptyMessage UINT16 = 5; // represents arrow::Type fields in src/arrow/type.h + EmptyMessage INT16 = 6; + EmptyMessage UINT32 = 7; + EmptyMessage INT32 = 8; + EmptyMessage UINT64 = 9; + EmptyMessage INT64 = 10 ; + EmptyMessage FLOAT16 = 11 ; + EmptyMessage FLOAT32 = 12 ; + EmptyMessage FLOAT64 = 13 ; + EmptyMessage UTF8 = 14 ; + EmptyMessage UTF8_VIEW = 35; + EmptyMessage LARGE_UTF8 = 32; + EmptyMessage BINARY = 15 ; + EmptyMessage BINARY_VIEW = 34; + int32 FIXED_SIZE_BINARY = 16 ; + EmptyMessage LARGE_BINARY = 31; + EmptyMessage DATE32 = 17 ; + EmptyMessage DATE64 = 18 ; + TimeUnit DURATION = 19; + Timestamp TIMESTAMP = 20 ; + TimeUnit TIME32 = 21 ; + TimeUnit TIME64 = 22 ; + IntervalUnit INTERVAL = 23 ; + Decimal DECIMAL = 24 ; + Decimal256Type DECIMAL256 = 36; + List LIST = 25; + List LARGE_LIST = 26; + FixedSizeList FIXED_SIZE_LIST = 27; + Struct STRUCT = 28; + Union UNION = 29; + Dictionary DICTIONARY = 30; + Map MAP = 33; + } +} + +//Useful for representing an empty enum variant in rust +// E.G. enum example{One, Two(i32)} +// maps to +// message example{ +// oneof{ +// EmptyMessage One = 1; +// i32 Two = 2; +// } +//} +message EmptyMessage{} + +enum CompressionTypeVariant { + GZIP = 0; + BZIP2 = 1; + XZ = 2; + ZSTD = 3; + UNCOMPRESSED = 4; +} + +message JsonWriterOptions { + CompressionTypeVariant compression = 1; +} + + +message CsvWriterOptions { + // Compression type + CompressionTypeVariant compression = 1; + // Optional column delimiter. Defaults to `b','` + string delimiter = 2; + // Whether to write column names as file headers. 
Defaults to `true` + bool has_header = 3; + // Optional date format for date arrays + string date_format = 4; + // Optional datetime format for datetime arrays + string datetime_format = 5; + // Optional timestamp format for timestamp arrays + string timestamp_format = 6; + // Optional time format for time arrays + string time_format = 7; + // Optional value to represent null + string null_value = 8; + // Optional quote. Defaults to `b'"'` + string quote = 9; + // Optional escape. Defaults to `'\\'` + string escape = 10; + // Optional flag whether to double quotes, instead of escaping. Defaults to `true` + bool double_quote = 11; +} + +// Options controlling CSV format +message CsvOptions { + bytes has_header = 1; // Indicates if the CSV has a header row + bytes delimiter = 2; // Delimiter character as a byte + bytes quote = 3; // Quote character as a byte + bytes escape = 4; // Optional escape character as a byte + CompressionTypeVariant compression = 5; // Compression type + uint64 schema_infer_max_rec = 6; // Max records for schema inference + string date_format = 7; // Optional date format + string datetime_format = 8; // Optional datetime format + string timestamp_format = 9; // Optional timestamp format + string timestamp_tz_format = 10; // Optional timestamp with timezone format + string time_format = 11; // Optional time format + string null_value = 12; // Optional representation of null value + bytes comment = 13; // Optional comment character as a byte + bytes double_quote = 14; // Indicates if quotes are doubled + bytes newlines_in_values = 15; // Indicates if newlines are supported in values + bytes terminator = 16; // Optional terminator character as a byte +} + +// Options controlling CSV format +message JsonOptions { + CompressionTypeVariant compression = 1; // Compression type + uint64 schema_infer_max_rec = 2; // Max records for schema inference +} + +message TableParquetOptions { + ParquetOptions global = 1; + repeated ParquetColumnSpecificOptions 
column_specific_options = 2; + map key_value_metadata = 3; +} + +message ParquetColumnSpecificOptions { + string column_name = 1; + ParquetColumnOptions options = 2; +} + +message ParquetColumnOptions { + oneof bloom_filter_enabled_opt { + bool bloom_filter_enabled = 1; + } + + oneof encoding_opt { + string encoding = 2; + } + + oneof dictionary_enabled_opt { + bool dictionary_enabled = 3; + } + + oneof compression_opt { + string compression = 4; + } + + oneof statistics_enabled_opt { + string statistics_enabled = 5; + } + + oneof bloom_filter_fpp_opt { + double bloom_filter_fpp = 6; + } + + oneof bloom_filter_ndv_opt { + uint64 bloom_filter_ndv = 7; + } + + oneof max_statistics_size_opt { + uint32 max_statistics_size = 8; + } +} + +message ParquetOptions { + // Regular fields + bool enable_page_index = 1; // default = true + bool pruning = 2; // default = true + bool skip_metadata = 3; // default = true + bool pushdown_filters = 5; // default = false + bool reorder_filters = 6; // default = false + uint64 data_pagesize_limit = 7; // default = 1024 * 1024 + uint64 write_batch_size = 8; // default = 1024 + string writer_version = 9; // default = "1.0" + // bool bloom_filter_enabled = 20; // default = false + bool allow_single_file_parallelism = 23; // default = true + uint64 maximum_parallel_row_group_writers = 24; // default = 1 + uint64 maximum_buffered_record_batches_per_stream = 25; // default = 2 + bool bloom_filter_on_read = 26; // default = true + bool bloom_filter_on_write = 27; // default = false + bool schema_force_view_types = 28; // default = false + + oneof metadata_size_hint_opt { + uint64 metadata_size_hint = 4; + } + + oneof compression_opt { + string compression = 10; + } + + oneof dictionary_enabled_opt { + bool dictionary_enabled = 11; + } + + oneof statistics_enabled_opt { + string statistics_enabled = 13; + } + + oneof max_statistics_size_opt { + uint64 max_statistics_size = 14; + } + + oneof column_index_truncate_length_opt { + uint64 
column_index_truncate_length = 17; + } + + oneof encoding_opt { + string encoding = 19; + } + + oneof bloom_filter_fpp_opt { + double bloom_filter_fpp = 21; + } + + oneof bloom_filter_ndv_opt { + uint64 bloom_filter_ndv = 22; + } + + uint64 dictionary_page_size_limit = 12; + + uint64 data_page_row_count_limit = 18; + + uint64 max_row_group_size = 15; + + string created_by = 16; +} + +enum JoinSide{ + LEFT_SIDE = 0; + RIGHT_SIDE = 1; +} + +message Precision{ + PrecisionInfo precision_info = 1; + ScalarValue val = 2; +} + +enum PrecisionInfo { + EXACT = 0; + INEXACT = 1; + ABSENT = 2; +} + +message Statistics { + Precision num_rows = 1; + Precision total_byte_size = 2; + repeated ColumnStats column_stats = 3; +} + +message ColumnStats { + Precision min_value = 1; + Precision max_value = 2; + Precision null_count = 3; + Precision distinct_count = 4; +} diff --git a/src/shuffle/codec.rs b/src/shuffle/codec.rs index ca20e92..a59ed85 100644 --- a/src/shuffle/codec.rs +++ b/src/shuffle/codec.rs @@ -121,9 +121,7 @@ fn encode_partitioning_scheme(partitioning: &Partitioning) -> Result Ok(protobuf::PhysicalHashRepartition { hash_expr: expr .iter() - .map(|expr| { - serialize_physical_expr(expr.clone(), &DefaultPhysicalExtensionCodec {}) - }) + .map(|expr| serialize_physical_expr(expr, &DefaultPhysicalExtensionCodec {})) .collect::, DataFusionError>>()?, partition_count: *partition_count as u64, }), diff --git a/testdata/expected-plans/q1.txt b/testdata/expected-plans/q1.txt index 4dc185b..856c066 100644 --- a/testdata/expected-plans/q1.txt +++ b/testdata/expected-plans/q1.txt @@ -20,7 +20,7 @@ SortPreservingMergeExec: [l_returnflag@0 ASC NULLS LAST,l_linestatus@1 ASC NULLS AggregateExec: mode=Partial, gby=[l_returnflag@5 as l_returnflag, l_linestatus@6 as l_linestatus], aggr=[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + 
lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(*)] ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_2, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus] CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@6 <= 1998-09-24 + FilterExec: l_shipdate@6 <= 1998-09-24, projection=[l_quantity@0, l_extendedprice@1, l_discount@2, l_tax@3, l_returnflag@4, l_linestatus@5] ParquetExec: file_groups={ ... }, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[] DataFusion Ray Distributed Plan @@ -31,7 +31,7 @@ RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "l_ret AggregateExec: mode=Partial, gby=[l_returnflag@5 as l_returnflag, l_linestatus@6 as l_linestatus], aggr=[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(*)] ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_2, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus] CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@6 <= 1998-09-24 + FilterExec: l_shipdate@6 <= 1998-09-24, projection=[l_quantity@0, l_extendedprice@1, l_discount@2, l_tax@3, l_returnflag@4, l_linestatus@5] ParquetExec: file_groups={ ... 
}, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[] Query Stage #1 (2 -> 2): diff --git a/testdata/expected-plans/q10.txt b/testdata/expected-plans/q10.txt index b853fb6..b3e8033 100644 --- a/testdata/expected-plans/q10.txt +++ b/testdata/expected-plans/q10.txt @@ -1,65 +1,61 @@ DataFusion Logical Plan ======================= -Limit: skip=0, fetch=20 - Sort: revenue DESC NULLS FIRST, fetch=20 - Projection: customer.c_custkey, customer.c_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue, customer.c_acctbal, nation.n_name, customer.c_address, customer.c_phone, customer.c_comment - Aggregate: groupBy=[[customer.c_custkey, customer.c_name, customer.c_acctbal, customer.c_phone, nation.n_name, customer.c_address, customer.c_comment]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] - Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_phone, customer.c_acctbal, customer.c_comment, lineitem.l_extendedprice, lineitem.l_discount, nation.n_name - Inner Join: customer.c_nationkey = nation.n_nationkey - Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_nationkey, customer.c_phone, customer.c_acctbal, customer.c_comment, lineitem.l_extendedprice, lineitem.l_discount - Inner Join: orders.o_orderkey = lineitem.l_orderkey - Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_nationkey, customer.c_phone, customer.c_acctbal, customer.c_comment, orders.o_orderkey - Inner Join: customer.c_custkey = orders.o_custkey - TableScan: customer projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment] - 
Projection: orders.o_orderkey, orders.o_custkey - Filter: orders.o_orderdate >= Date32("1993-07-01") AND orders.o_orderdate < Date32("1993-10-01") - TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("1993-07-01"), orders.o_orderdate < Date32("1993-10-01")] - Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount - Filter: lineitem.l_returnflag = Utf8("R") - TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], partial_filters=[lineitem.l_returnflag = Utf8("R")] - TableScan: nation projection=[n_nationkey, n_name] +Sort: revenue DESC NULLS FIRST, fetch=20 + Projection: customer.c_custkey, customer.c_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue, customer.c_acctbal, nation.n_name, customer.c_address, customer.c_phone, customer.c_comment + Aggregate: groupBy=[[customer.c_custkey, customer.c_name, customer.c_acctbal, customer.c_phone, nation.n_name, customer.c_address, customer.c_comment]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] + Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_phone, customer.c_acctbal, customer.c_comment, lineitem.l_extendedprice, lineitem.l_discount, nation.n_name + Inner Join: customer.c_nationkey = nation.n_nationkey + Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_nationkey, customer.c_phone, customer.c_acctbal, customer.c_comment, lineitem.l_extendedprice, lineitem.l_discount + Inner Join: orders.o_orderkey = lineitem.l_orderkey + Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_nationkey, customer.c_phone, customer.c_acctbal, customer.c_comment, orders.o_orderkey + Inner Join: customer.c_custkey = orders.o_custkey + TableScan: customer projection=[c_custkey, c_name, c_address, 
c_nationkey, c_phone, c_acctbal, c_comment] + Projection: orders.o_orderkey, orders.o_custkey + Filter: orders.o_orderdate >= Date32("1993-07-01") AND orders.o_orderdate < Date32("1993-10-01") + TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("1993-07-01"), orders.o_orderdate < Date32("1993-10-01")] + Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount + Filter: lineitem.l_returnflag = Utf8("R") + TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], partial_filters=[lineitem.l_returnflag = Utf8("R")] + TableScan: nation projection=[n_nationkey, n_name] DataFusion Physical Plan ======================== -GlobalLimitExec: skip=0, fetch=20 - SortPreservingMergeExec: [revenue@2 DESC], fetch=20 - SortExec: TopK(fetch=20), expr=[revenue@2 DESC], preserve_partitioning=[true] - ProjectionExec: expr=[c_custkey@0 as c_custkey, c_name@1 as c_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@7 as revenue, c_acctbal@2 as c_acctbal, n_name@4 as n_name, c_address@5 as c_address, c_phone@3 as c_phone, c_comment@6 as c_comment] - AggregateExec: mode=FinalPartitioned, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@2 as c_acctbal, c_phone@3 as c_phone, n_name@4 as n_name, c_address@5 as c_address, c_comment@6 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0, c_name@1, c_acctbal@2, c_phone@3, n_name@4, c_address@5, c_comment@6], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@4 as c_acctbal, c_phone@3 as c_phone, n_name@8 as n_name, c_address@2 as c_address, c_comment@5 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - ProjectionExec: expr=[c_custkey@1 as c_custkey, c_name@2 as c_name, c_address@3 
as c_address, c_phone@4 as c_phone, c_acctbal@5 as c_acctbal, c_comment@6 as c_comment, l_extendedprice@7 as l_extendedprice, l_discount@8 as l_discount, n_name@0 as n_name] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], projection=[n_name@1, c_custkey@2, c_name@3, c_address@4, c_phone@6, c_acctbal@7, c_comment@8, l_extendedprice@9, l_discount@10] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_nationkey@3], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@7], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 - ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_custkey@1 as o_custkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 >= 1993-07-01 AND o_orderdate@2 < 1993-10-01 - ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] +SortPreservingMergeExec: [revenue@2 DESC], fetch=20 + SortExec: TopK(fetch=20), expr=[revenue@2 DESC], preserve_partitioning=[true] + ProjectionExec: expr=[c_custkey@0 as c_custkey, c_name@1 as c_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@7 as revenue, c_acctbal@2 as c_acctbal, n_name@4 as n_name, c_address@5 as c_address, c_phone@3 as c_phone, c_comment@6 as c_comment] + AggregateExec: mode=FinalPartitioned, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@2 as c_acctbal, c_phone@3 as c_phone, n_name@4 as n_name, c_address@5 as c_address, c_comment@6 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([c_custkey@0, c_name@1, c_acctbal@2, c_phone@3, n_name@4, c_address@5, c_comment@6], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@4 as c_acctbal, c_phone@3 as c_phone, n_name@8 as n_name, c_address@2 as c_address, c_comment@5 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] + ProjectionExec: expr=[c_custkey@1 as c_custkey, c_name@2 as c_name, c_address@3 as c_address, c_phone@4 as c_phone, c_acctbal@5 as c_acctbal, c_comment@6 as c_comment, l_extendedprice@7 as l_extendedprice, l_discount@8 
as l_discount, n_name@0 as n_name] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], projection=[n_name@1, c_custkey@2, c_name@3, c_address@4, c_phone@6, c_acctbal@7, c_comment@8, l_extendedprice@9, l_discount@10] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 + ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([c_nationkey@3], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([o_orderkey@7], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 + ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment] CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_returnflag@3 = R - ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)] + RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_orderdate@2 >= 1993-07-01 AND o_orderdate@2 < 1993-10-01, projection=[o_orderkey@0, o_custkey@1] + ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_returnflag@3 = R, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] + ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)] DataFusion Ray Distributed Plan =========== @@ -74,10 +70,9 @@ RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "c_cus Query Stage #2 (2 -> 2): RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_custkey@1 as o_custkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 >= 1993-07-01 AND o_orderdate@2 < 1993-10-01 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_orderdate@2 >= 1993-07-01 AND o_orderdate@2 < 1993-10-01, projection=[o_orderkey@0, o_custkey@1] + ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[] Query Stage #3 (2 -> 2): RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "o_orderkey", index: 7 }], 2)) @@ -90,10 +85,9 @@ RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "o_ord Query Stage #4 (2 -> 2): RayShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_returnflag@3 = R - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_returnflag@3 = R, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] + ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)] Query Stage #5 (2 -> 2): RayShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "c_nationkey", index: 3 }], 2)) @@ -123,8 +117,7 @@ RayShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "c_cus CoalesceBatchesExec: target_batch_size=8192 RayShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }, Column { name: "c_name", index: 1 }, Column { name: "c_acctbal", index: 2 }, Column { name: "c_phone", index: 3 }, Column { name: "n_name", index: 4 }, Column { name: "c_address", index: 5 }, Column { name: "c_comment", index: 6 }], 2)) -Query Stage #8 (1 -> 1): -GlobalLimitExec: skip=0, fetch=20 - SortPreservingMergeExec: [revenue@2 DESC], fetch=20 - RayShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }, Column { name: "c_name", index: 1 }, Column { name: "c_acctbal", index: 3 }, Column { name: "c_phone", index: 6 }, Column { name: "n_name", index: 4 }, Column { name: "c_address", index: 5 }, Column { name: "c_comment", index: 7 }], 2)) +Query Stage #8 (2 -> 1): +SortPreservingMergeExec: [revenue@2 DESC], fetch=20 + RayShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }, Column { name: "c_name", index: 1 }, Column { name: "c_acctbal", index: 3 }, Column { name: "c_phone", index: 6 }, Column { name: "n_name", index: 4 }, Column { name: "c_address", index: 5 }, Column { name: "c_comment", index: 7 }], 2)) diff --git a/testdata/expected-plans/q11.txt b/testdata/expected-plans/q11.txt index a335871..eb29f9f 100644 --- a/testdata/expected-plans/q11.txt +++ b/testdata/expected-plans/q11.txt @@ -43,10 +43,9 @@ 
SortPreservingMergeExec: [value@1 DESC] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ProjectionExec: expr=[n_nationkey@0 as n_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ALGERIA - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = ALGERIA, projection=[n_nationkey@0] + ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([s_nationkey@2], 2), input_partitions=2 ProjectionExec: expr=[ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost, s_nationkey@0 as s_nationkey] @@ -68,10 +67,9 @@ SortPreservingMergeExec: [value@1 DESC] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ProjectionExec: expr=[n_nationkey@0 as n_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ALGERIA - ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = ALGERIA, projection=[n_nationkey@0] + ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([s_nationkey@3], 2), input_partitions=2 ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_availqty@2 as ps_availqty, ps_supplycost@3 as ps_supplycost, s_nationkey@0 as s_nationkey] @@ -90,10 +88,9 @@ DataFusion Ray Distributed Plan Query Stage #0 (1 -> 2): RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ProjectionExec: expr=[n_nationkey@0 as n_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ALGERIA - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = ALGERIA, projection=[n_nationkey@0] + ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] Query Stage #1 (1 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) @@ -125,10 +122,9 @@ RayShuffleWriterExec(stage_id=4, output_partitioning=Hash([], 2)) Query Stage #5 (1 -> 2): RayShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ProjectionExec: expr=[n_nationkey@0 as n_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ALGERIA - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = ALGERIA, projection=[n_nationkey@0] + ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] Query Stage #6 (1 -> 2): RayShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) diff --git a/testdata/expected-plans/q12.txt b/testdata/expected-plans/q12.txt new file mode 100644 index 0000000..3761e77 --- /dev/null +++ b/testdata/expected-plans/q12.txt @@ -0,0 +1,71 @@ +DataFusion Logical Plan +======================= + +Sort: lineitem.l_shipmode ASC NULLS LAST + Projection: lineitem.l_shipmode, sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END) AS high_line_count, sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END) AS low_line_count + Aggregate: groupBy=[[lineitem.l_shipmode]], aggr=[[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)]] + Projection: orders.o_orderpriority, lineitem.l_shipmode + Inner Join: orders.o_orderkey = lineitem.l_orderkey + TableScan: orders projection=[o_orderkey, o_orderpriority] + Projection: lineitem.l_orderkey, lineitem.l_shipmode + Filter: (lineitem.l_shipmode = Utf8("FOB") OR lineitem.l_shipmode = Utf8("SHIP")) AND lineitem.l_receiptdate > lineitem.l_commitdate AND lineitem.l_shipdate < lineitem.l_commitdate AND lineitem.l_receiptdate >= Date32("1995-01-01") AND lineitem.l_receiptdate < Date32("1996-01-01") + TableScan: lineitem projection=[l_orderkey, l_shipdate, l_commitdate, l_receiptdate, l_shipmode], partial_filters=[lineitem.l_shipmode = Utf8("FOB") OR 
lineitem.l_shipmode = Utf8("SHIP"), lineitem.l_receiptdate > lineitem.l_commitdate, lineitem.l_shipdate < lineitem.l_commitdate, lineitem.l_receiptdate >= Date32("1995-01-01"), lineitem.l_receiptdate < Date32("1996-01-01")] + +DataFusion Physical Plan +======================== + +SortPreservingMergeExec: [l_shipmode@0 ASC NULLS LAST] + SortExec: expr=[l_shipmode@0 ASC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[l_shipmode@0 as l_shipmode, sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@1 as high_line_count, sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@2 as low_line_count] + AggregateExec: mode=FinalPartitioned, gby=[l_shipmode@0 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_shipmode@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[l_shipmode@1 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)] + ProjectionExec: expr=[o_orderpriority@1 as o_orderpriority, l_shipmode@0 as l_shipmode] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_shipmode@1, o_orderpriority@3] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 + CoalesceBatchesExec: 
target_batch_size=8192 + FilterExec: (l_shipmode@4 = FOB OR l_shipmode@4 = SHIP) AND l_receiptdate@3 > l_commitdate@2 AND l_shipdate@1 < l_commitdate@2 AND l_receiptdate@3 >= 1995-01-01 AND l_receiptdate@3 < 1996-01-01, projection=[l_orderkey@0, l_shipmode@4] + ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_shipdate, l_commitdate, l_receiptdate, l_shipmode], predicate=(l_shipmode@14 = FOB OR l_shipmode@14 = SHIP) AND l_receiptdate@12 > l_commitdate@11 AND l_shipdate@10 < l_commitdate@11 AND l_receiptdate@12 >= 1995-01-01 AND l_receiptdate@12 < 1996-01-01, pruning_predicate=(CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= FOB AND FOB <= l_shipmode_max@1 END OR CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= SHIP AND SHIP <= l_shipmode_max@1 END) AND CASE WHEN l_receiptdate_null_count@5 = l_receiptdate_row_count@6 THEN false ELSE l_receiptdate_max@4 >= 1995-01-01 END AND CASE WHEN l_receiptdate_null_count@5 = l_receiptdate_row_count@6 THEN false ELSE l_receiptdate_min@7 < 1996-01-01 END, required_guarantees=[l_shipmode in (SHIP, FOB)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 + ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderpriority] + +DataFusion Ray Distributed Plan +=========== + +Query Stage #0 (2 -> 2): +RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: (l_shipmode@4 = FOB OR l_shipmode@4 = SHIP) AND l_receiptdate@3 > l_commitdate@2 AND l_shipdate@1 < l_commitdate@2 AND l_receiptdate@3 >= 1995-01-01 AND l_receiptdate@3 < 1996-01-01, projection=[l_orderkey@0, l_shipmode@4] + ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_shipdate, l_commitdate, l_receiptdate, l_shipmode], predicate=(l_shipmode@14 = FOB OR l_shipmode@14 = SHIP) AND l_receiptdate@12 > l_commitdate@11 AND l_shipdate@10 < l_commitdate@11 AND l_receiptdate@12 >= 1995-01-01 AND l_receiptdate@12 < 1996-01-01, pruning_predicate=(CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= FOB AND FOB <= l_shipmode_max@1 END OR CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= SHIP AND SHIP <= l_shipmode_max@1 END) AND CASE WHEN l_receiptdate_null_count@5 = l_receiptdate_row_count@6 THEN false ELSE l_receiptdate_max@4 >= 1995-01-01 END AND CASE WHEN l_receiptdate_null_count@5 = l_receiptdate_row_count@6 THEN false ELSE l_receiptdate_min@7 < 1996-01-01 END, required_guarantees=[l_shipmode in (SHIP, FOB)] + +Query Stage #1 (2 -> 2): +RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) + ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_orderpriority] + +Query Stage #2 (2 -> 2): +RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "l_shipmode", index: 0 }], 2)) + AggregateExec: mode=Partial, gby=[l_shipmode@1 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)] + ProjectionExec: expr=[o_orderpriority@1 as o_orderpriority, l_shipmode@0 as l_shipmode] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_shipmode@1, o_orderpriority@3] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) + +Query Stage #3 (2 -> 2): +RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "l_shipmode", index: 0 }], 2)) + SortExec: expr=[l_shipmode@0 ASC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[l_shipmode@0 as l_shipmode, sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@1 as high_line_count, sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@2 as low_line_count] + AggregateExec: mode=FinalPartitioned, gby=[l_shipmode@0 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE 
Int64(0) END)] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "l_shipmode", index: 0 }], 2)) + +Query Stage #4 (2 -> 1): +SortPreservingMergeExec: [l_shipmode@0 ASC NULLS LAST] + RayShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "l_shipmode", index: 0 }], 2)) + diff --git a/testdata/expected-plans/q13.txt b/testdata/expected-plans/q13.txt index ddb7050..79d18c3 100644 --- a/testdata/expected-plans/q13.txt +++ b/testdata/expected-plans/q13.txt @@ -33,10 +33,9 @@ SortPreservingMergeExec: [custdist@1 DESC,c_count@0 DESC] ParquetExec: file_groups={ ... }, projection=[c_custkey] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 - ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_custkey@1 as o_custkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_comment@2 NOT LIKE %express%requests% - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_comment], predicate=o_comment@8 NOT LIKE %express%requests% + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_comment@2 NOT LIKE %express%requests%, projection=[o_orderkey@0, o_custkey@1] + ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_comment], predicate=o_comment@8 NOT LIKE %express%requests% DataFusion Ray Distributed Plan =========== @@ -47,10 +46,9 @@ RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "c_cus Query Stage #1 (2 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_custkey@1 as o_custkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_comment@2 NOT LIKE %express%requests% - ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_comment], predicate=o_comment@8 NOT LIKE %express%requests% + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_comment@2 NOT LIKE %express%requests%, projection=[o_orderkey@0, o_custkey@1] + ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_comment], predicate=o_comment@8 NOT LIKE %express%requests% Query Stage #2 (2 -> 2): RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "c_count", index: 0 }], 2)) diff --git a/testdata/expected-plans/q14.txt b/testdata/expected-plans/q14.txt index f022fdc..b706aed 100644 --- a/testdata/expected-plans/q14.txt +++ b/testdata/expected-plans/q14.txt @@ -26,10 +26,9 @@ ProjectionExec: expr=[100 * CAST(sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") T ParquetExec: file_groups={ ... }, projection=[p_partkey, p_type] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([l_partkey@0], 2), input_partitions=2 - ProjectionExec: expr=[l_partkey@0 as l_partkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1995-02-01 AND l_shipdate@3 < 1995-03-01 - ParquetExec: file_groups={ ... }, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1995-02-01 AND l_shipdate@10 < 1995-03-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1995-02-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-03-01 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_shipdate@3 >= 1995-02-01 AND l_shipdate@3 < 1995-03-01, projection=[l_partkey@0, l_extendedprice@1, l_discount@2] + ParquetExec: file_groups={ ... 
}, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1995-02-01 AND l_shipdate@10 < 1995-03-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1995-02-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-03-01 END, required_guarantees=[] DataFusion Ray Distributed Plan =========== @@ -40,10 +39,9 @@ RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "p_par Query Stage #1 (2 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) - ProjectionExec: expr=[l_partkey@0 as l_partkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1995-02-01 AND l_shipdate@3 < 1995-03-01 - ParquetExec: file_groups={ ... }, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1995-02-01 AND l_shipdate@10 < 1995-03-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1995-02-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-03-01 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_shipdate@3 >= 1995-02-01 AND l_shipdate@3 < 1995-03-01, projection=[l_partkey@0, l_extendedprice@1, l_discount@2] + ParquetExec: file_groups={ ... 
}, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1995-02-01 AND l_shipdate@10 < 1995-03-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1995-02-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-03-01 END, required_guarantees=[] Query Stage #2 (2 -> 1): RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([], 2)) diff --git a/testdata/expected-plans/q16.txt b/testdata/expected-plans/q16.txt new file mode 100644 index 0000000..e9be1bf --- /dev/null +++ b/testdata/expected-plans/q16.txt @@ -0,0 +1,113 @@ +DataFusion Logical Plan +======================= + +Sort: supplier_cnt DESC NULLS FIRST, part.p_brand ASC NULLS LAST, part.p_type ASC NULLS LAST, part.p_size ASC NULLS LAST + Projection: part.p_brand, part.p_type, part.p_size, count(alias1) AS supplier_cnt + Aggregate: groupBy=[[part.p_brand, part.p_type, part.p_size]], aggr=[[count(alias1)]] + Aggregate: groupBy=[[part.p_brand, part.p_type, part.p_size, partsupp.ps_suppkey AS alias1]], aggr=[[]] + LeftAnti Join: partsupp.ps_suppkey = __correlated_sq_1.s_suppkey + Projection: partsupp.ps_suppkey, part.p_brand, part.p_type, part.p_size + Inner Join: partsupp.ps_partkey = part.p_partkey + TableScan: partsupp projection=[ps_partkey, ps_suppkey] + Filter: part.p_brand != Utf8("Brand#14") AND part.p_type NOT LIKE Utf8("SMALL PLATED%") AND part.p_size IN ([Int32(14), Int32(6), Int32(5), Int32(31), Int32(49), Int32(15), Int32(41), Int32(47)]) + TableScan: part projection=[p_partkey, p_brand, p_type, p_size], partial_filters=[part.p_brand != Utf8("Brand#14"), part.p_type NOT LIKE Utf8("SMALL PLATED%"), part.p_size IN ([Int32(14), Int32(6), Int32(5), Int32(31), Int32(49), Int32(15), Int32(41), Int32(47)])] + SubqueryAlias: __correlated_sq_1 + Projection: supplier.s_suppkey + Filter: supplier.s_comment LIKE Utf8("%Customer%Complaints%") + TableScan: 
supplier projection=[s_suppkey, s_comment], partial_filters=[supplier.s_comment LIKE Utf8("%Customer%Complaints%")] + +DataFusion Physical Plan +======================== + +SortPreservingMergeExec: [supplier_cnt@3 DESC,p_brand@0 ASC NULLS LAST,p_type@1 ASC NULLS LAST,p_size@2 ASC NULLS LAST] + SortExec: expr=[supplier_cnt@3 DESC,p_brand@0 ASC NULLS LAST,p_type@1 ASC NULLS LAST,p_size@2 ASC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, count(alias1)@3 as supplier_cnt] + AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([p_brand@0, p_type@1, p_size@2], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)] + AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, alias1@3 as alias1], aggr=[] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([p_brand@0, p_type@1, p_size@2, alias1@3], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[p_brand@1 as p_brand, p_type@2 as p_type, p_size@3 as p_size, ps_suppkey@0 as alias1], aggr=[] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=RightAnti, on=[(s_suppkey@0, ps_suppkey@0)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: s_comment@1 LIKE %Customer%Complaints%, projection=[s_suppkey@0] + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... 
}, projection=[s_suppkey, s_comment], predicate=s_comment@6 LIKE %Customer%Complaints% + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([ps_suppkey@0], 2), input_partitions=2 + ProjectionExec: expr=[ps_suppkey@3 as ps_suppkey, p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_brand@1, p_type@2, p_size@3, ps_suppkey@5] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_brand@1 != Brand#14 AND p_type@2 NOT LIKE SMALL PLATED% AND Use p_size@3 IN (SET) ([Literal { value: Int32(14) }, Literal { value: Int32(6) }, Literal { value: Int32(5) }, Literal { value: Int32(31) }, Literal { value: Int32(49) }, Literal { value: Int32(15) }, Literal { value: Int32(41) }, Literal { value: Int32(47) }]) + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... 
}]), pruning_predicate=CASE WHEN p_brand_null_count@2 = p_brand_row_count@3 THEN false ELSE p_brand_min@0 != Brand#14 OR Brand#14 != p_brand_max@1 END AND (CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 14 AND 14 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 6 AND 6 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 5 AND 5 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 31 AND 31 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 49 AND 49 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 15 AND 15 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 41 AND 41 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 47 AND 47 <= p_size_max@5 END), required_guarantees=[p_brand not in (Brand#14), p_size in (5, 41, 49, 15, 6, 31, 47, 14)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 + ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey] + +DataFusion Ray Distributed Plan +=========== + +Query Stage #0 (1 -> 2): +RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: s_comment@1 LIKE %Customer%Complaints%, projection=[s_suppkey@0] + ParquetExec: file_groups={ ... 
}, projection=[s_suppkey, s_comment], predicate=s_comment@6 LIKE %Customer%Complaints% + +Query Stage #1 (1 -> 2): +RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_brand@1 != Brand#14 AND p_type@2 NOT LIKE SMALL PLATED% AND Use p_size@3 IN (SET) ([Literal { value: Int32(14) }, Literal { value: Int32(6) }, Literal { value: Int32(5) }, Literal { value: Int32(31) }, Literal { value: Int32(49) }, Literal { value: Int32(15) }, Literal { value: Int32(41) }, Literal { value: Int32(47) }]) + ParquetExec: file_groups={ ... }]), pruning_predicate=CASE WHEN p_brand_null_count@2 = p_brand_row_count@3 THEN false ELSE p_brand_min@0 != Brand#14 OR Brand#14 != p_brand_max@1 END AND (CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 14 AND 14 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 6 AND 6 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 5 AND 5 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 31 AND 31 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 49 AND 49 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 15 AND 15 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 41 AND 41 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 47 AND 47 <= p_size_max@5 END), required_guarantees=[p_brand not in (Brand#14), p_size in (5, 41, 49, 15, 6, 31, 47, 14)] + +Query Stage #2 (2 -> 2): +RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) + ParquetExec: file_groups={ ... 
}, projection=[ps_partkey, ps_suppkey] + +Query Stage #3 (2 -> 2): +RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2)) + ProjectionExec: expr=[ps_suppkey@3 as ps_suppkey, p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_brand@1, p_type@2, p_size@3, ps_suppkey@5] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) + +Query Stage #4 (2 -> 2): +RayShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }, Column { name: "alias1", index: 3 }], 2)) + AggregateExec: mode=Partial, gby=[p_brand@1 as p_brand, p_type@2 as p_type, p_size@3 as p_size, ps_suppkey@0 as alias1], aggr=[] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=RightAnti, on=[(s_suppkey@0, ps_suppkey@0)] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2)) + +Query Stage #5 (2 -> 2): +RayShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }], 2)) + AggregateExec: mode=Partial, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)] + AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, 
p_size@2 as p_size, alias1@3 as alias1], aggr=[] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }, Column { name: "alias1", index: 3 }], 2)) + +Query Stage #6 (2 -> 2): +RayShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }], 2)) + SortExec: expr=[supplier_cnt@3 DESC,p_brand@0 ASC NULLS LAST,p_type@1 ASC NULLS LAST,p_size@2 ASC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, count(alias1)@3 as supplier_cnt] + AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }], 2)) + +Query Stage #7 (2 -> 1): +SortPreservingMergeExec: [supplier_cnt@3 DESC,p_brand@0 ASC NULLS LAST,p_type@1 ASC NULLS LAST,p_size@2 ASC NULLS LAST] + RayShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }], 2)) + diff --git a/testdata/expected-plans/q17.txt b/testdata/expected-plans/q17.txt index 7b5aac9..f2cca4d 100644 --- a/testdata/expected-plans/q17.txt +++ b/testdata/expected-plans/q17.txt @@ -30,11 +30,10 @@ ProjectionExec: expr=[CAST(sum(lineitem.l_extendedprice)@0 AS Float64) / 7 as av HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@0)], projection=[p_partkey@0, l_quantity@2, l_extendedprice@3] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - 
ProjectionExec: expr=[p_partkey@0 as p_partkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_brand@1 = Brand#42 AND p_container@2 = LG BAG - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_brand, p_container], predicate=p_brand@3 = Brand#42 AND p_container@6 = LG BAG, pruning_predicate=CASE WHEN p_brand_null_count@2 = p_brand_row_count@3 THEN false ELSE p_brand_min@0 <= Brand#42 AND Brand#42 <= p_brand_max@1 END AND CASE WHEN p_container_null_count@6 = p_container_row_count@7 THEN false ELSE p_container_min@4 <= LG BAG AND LG BAG <= p_container_max@5 END, required_guarantees=[p_brand in (Brand#42), p_container in (LG BAG)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_brand@1 = Brand#42 AND p_container@2 = LG BAG, projection=[p_partkey@0] + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... }, projection=[p_partkey, p_brand, p_container], predicate=p_brand@3 = Brand#42 AND p_container@6 = LG BAG, pruning_predicate=CASE WHEN p_brand_null_count@2 = p_brand_row_count@3 THEN false ELSE p_brand_min@0 <= Brand#42 AND Brand#42 <= p_brand_max@1 END AND CASE WHEN p_container_null_count@6 = p_container_row_count@7 THEN false ELSE p_container_min@4 <= LG BAG AND LG BAG <= p_container_max@5 END, required_guarantees=[p_brand in (Brand#42), p_container in (LG BAG)] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([l_partkey@0], 2), input_partitions=2 ParquetExec: file_groups={ ... 
}, projection=[l_partkey, l_quantity, l_extendedprice] @@ -50,10 +49,9 @@ DataFusion Ray Distributed Plan Query Stage #0 (1 -> 2): RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - ProjectionExec: expr=[p_partkey@0 as p_partkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_brand@1 = Brand#42 AND p_container@2 = LG BAG - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_brand, p_container], predicate=p_brand@3 = Brand#42 AND p_container@6 = LG BAG, pruning_predicate=CASE WHEN p_brand_null_count@2 = p_brand_row_count@3 THEN false ELSE p_brand_min@0 <= Brand#42 AND Brand#42 <= p_brand_max@1 END AND CASE WHEN p_container_null_count@6 = p_container_row_count@7 THEN false ELSE p_container_min@4 <= LG BAG AND LG BAG <= p_container_max@5 END, required_guarantees=[p_brand in (Brand#42), p_container in (LG BAG)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_brand@1 = Brand#42 AND p_container@2 = LG BAG, projection=[p_partkey@0] + ParquetExec: file_groups={ ... 
}, projection=[p_partkey, p_brand, p_container], predicate=p_brand@3 = Brand#42 AND p_container@6 = LG BAG, pruning_predicate=CASE WHEN p_brand_null_count@2 = p_brand_row_count@3 THEN false ELSE p_brand_min@0 <= Brand#42 AND Brand#42 <= p_brand_max@1 END AND CASE WHEN p_container_null_count@6 = p_container_row_count@7 THEN false ELSE p_container_min@4 <= LG BAG AND LG BAG <= p_container_max@5 END, required_guarantees=[p_brand in (Brand#42), p_container in (LG BAG)] Query Stage #1 (2 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) diff --git a/testdata/expected-plans/q18.txt b/testdata/expected-plans/q18.txt index a9c18bf..a1da264 100644 --- a/testdata/expected-plans/q18.txt +++ b/testdata/expected-plans/q18.txt @@ -1,58 +1,55 @@ DataFusion Logical Plan ======================= -Limit: skip=0, fetch=100 - Sort: orders.o_totalprice DESC NULLS FIRST, orders.o_orderdate ASC NULLS LAST, fetch=100 - Aggregate: groupBy=[[customer.c_name, customer.c_custkey, orders.o_orderkey, orders.o_orderdate, orders.o_totalprice]], aggr=[[sum(lineitem.l_quantity)]] - LeftSemi Join: orders.o_orderkey = __correlated_sq_1.l_orderkey - Projection: customer.c_custkey, customer.c_name, orders.o_orderkey, orders.o_totalprice, orders.o_orderdate, lineitem.l_quantity - Inner Join: orders.o_orderkey = lineitem.l_orderkey - Projection: customer.c_custkey, customer.c_name, orders.o_orderkey, orders.o_totalprice, orders.o_orderdate - Inner Join: customer.c_custkey = orders.o_custkey - TableScan: customer projection=[c_custkey, c_name] - TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate] - TableScan: lineitem projection=[l_orderkey, l_quantity] - SubqueryAlias: __correlated_sq_1 - Projection: lineitem.l_orderkey - Filter: sum(lineitem.l_quantity) > Decimal128(Some(31300),21,2) - Aggregate: groupBy=[[lineitem.l_orderkey]], aggr=[[sum(lineitem.l_quantity)]] - TableScan: lineitem projection=[l_orderkey, 
l_quantity] +Sort: orders.o_totalprice DESC NULLS FIRST, orders.o_orderdate ASC NULLS LAST, fetch=100 + Aggregate: groupBy=[[customer.c_name, customer.c_custkey, orders.o_orderkey, orders.o_orderdate, orders.o_totalprice]], aggr=[[sum(lineitem.l_quantity)]] + LeftSemi Join: orders.o_orderkey = __correlated_sq_1.l_orderkey + Projection: customer.c_custkey, customer.c_name, orders.o_orderkey, orders.o_totalprice, orders.o_orderdate, lineitem.l_quantity + Inner Join: orders.o_orderkey = lineitem.l_orderkey + Projection: customer.c_custkey, customer.c_name, orders.o_orderkey, orders.o_totalprice, orders.o_orderdate + Inner Join: customer.c_custkey = orders.o_custkey + TableScan: customer projection=[c_custkey, c_name] + TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate] + TableScan: lineitem projection=[l_orderkey, l_quantity] + SubqueryAlias: __correlated_sq_1 + Projection: lineitem.l_orderkey + Filter: sum(lineitem.l_quantity) > Decimal128(Some(31300),21,2) + Aggregate: groupBy=[[lineitem.l_orderkey]], aggr=[[sum(lineitem.l_quantity)]] + TableScan: lineitem projection=[l_orderkey, l_quantity] DataFusion Physical Plan ======================== -GlobalLimitExec: skip=0, fetch=100 - SortPreservingMergeExec: [o_totalprice@4 DESC,o_orderdate@3 ASC NULLS LAST], fetch=100 - SortExec: TopK(fetch=100), expr=[o_totalprice@4 DESC,o_orderdate@3 ASC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=FinalPartitioned, gby=[c_name@0 as c_name, c_custkey@1 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@3 as o_orderdate, o_totalprice@4 as o_totalprice], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_name@0, c_custkey@1, o_orderkey@2, o_orderdate@3, o_totalprice@4], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[c_name@1 as c_name, c_custkey@0 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@4 as o_orderdate, o_totalprice@3 as o_totalprice], 
aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(l_orderkey@0, o_orderkey@2)] - ProjectionExec: expr=[l_orderkey@0 as l_orderkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: sum(lineitem.l_quantity)@1 > Some(31300),21,2 - AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_quantity] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@2, l_orderkey@0)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@3, o_orderdate@4, l_quantity@6] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@2], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@4, o_orderdate@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 - ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate] +SortPreservingMergeExec: [o_totalprice@4 DESC,o_orderdate@3 ASC NULLS LAST], fetch=100 + SortExec: TopK(fetch=100), expr=[o_totalprice@4 DESC,o_orderdate@3 ASC NULLS LAST], preserve_partitioning=[true] + AggregateExec: mode=FinalPartitioned, gby=[c_name@0 as c_name, c_custkey@1 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@3 as o_orderdate, o_totalprice@4 as o_totalprice], aggr=[sum(lineitem.l_quantity)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([c_name@0, c_custkey@1, o_orderkey@2, o_orderdate@3, o_totalprice@4], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[c_name@1 as c_name, c_custkey@0 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@4 as o_orderdate, o_totalprice@3 as o_totalprice], aggr=[sum(lineitem.l_quantity)] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(l_orderkey@0, o_orderkey@2)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: sum(lineitem.l_quantity)@1 > Some(31300),21,2, projection=[l_orderkey@0] + AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_quantity] + AggregateExec: mode=Partial, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] + ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_quantity] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@2, l_orderkey@0)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@3, o_orderdate@4, l_quantity@6] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([o_orderkey@2], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@4, o_orderdate@5] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 + ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 + ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 + ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_quantity] DataFusion Ray Distributed Plan =========== @@ -88,12 +85,11 @@ RayShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "c_nam AggregateExec: mode=Partial, gby=[c_name@1 as c_name, c_custkey@0 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@4 as o_orderdate, o_totalprice@3 as o_totalprice], aggr=[sum(lineitem.l_quantity)] CoalesceBatchesExec: target_batch_size=8192 HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(l_orderkey@0, o_orderkey@2)] - ProjectionExec: expr=[l_orderkey@0 as l_orderkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: sum(lineitem.l_quantity)@1 > Some(31300),21,2 - AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - RayShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: sum(lineitem.l_quantity)@1 > Some(31300),21,2, projection=[l_orderkey@0] + AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) CoalesceBatchesExec: target_batch_size=8192 HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@2, l_orderkey@0)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@3, o_orderdate@4, l_quantity@6] CoalesceBatchesExec: target_batch_size=8192 @@ -108,8 +104,7 @@ RayShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "c_nam CoalesceBatchesExec: target_batch_size=8192 RayShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "c_name", index: 0 }, Column { name: "c_custkey", index: 1 }, Column { name: "o_orderkey", index: 2 }, Column { name: "o_orderdate", index: 3 }, Column { name: "o_totalprice", index: 4 }], 
2)) -Query Stage #7 (1 -> 1): -GlobalLimitExec: skip=0, fetch=100 - SortPreservingMergeExec: [o_totalprice@4 DESC,o_orderdate@3 ASC NULLS LAST], fetch=100 - RayShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "c_name", index: 0 }, Column { name: "c_custkey", index: 1 }, Column { name: "o_orderkey", index: 2 }, Column { name: "o_orderdate", index: 3 }, Column { name: "o_totalprice", index: 4 }], 2)) +Query Stage #7 (2 -> 1): +SortPreservingMergeExec: [o_totalprice@4 DESC,o_orderdate@3 ASC NULLS LAST], fetch=100 + RayShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "c_name", index: 0 }, Column { name: "c_custkey", index: 1 }, Column { name: "o_orderkey", index: 2 }, Column { name: "o_orderdate", index: 3 }, Column { name: "o_totalprice", index: 4 }], 2)) diff --git a/testdata/expected-plans/q19.txt b/testdata/expected-plans/q19.txt new file mode 100644 index 0000000..a03d657 --- /dev/null +++ b/testdata/expected-plans/q19.txt @@ -0,0 +1,65 @@ +DataFusion Logical Plan +======================= + +Projection: sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue + Aggregate: groupBy=[[]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] + Projection: lineitem.l_extendedprice, lineitem.l_discount + Inner Join: lineitem.l_partkey = part.p_partkey Filter: part.p_brand = Utf8("Brand#21") AND part.p_container IN ([Utf8("SM CASE"), Utf8("SM BOX"), Utf8("SM PACK"), Utf8("SM PKG")]) AND lineitem.l_quantity >= Decimal128(Some(800),11,2) AND lineitem.l_quantity <= Decimal128(Some(1800),11,2) AND part.p_size <= Int32(5) OR part.p_brand = Utf8("Brand#13") AND part.p_container IN ([Utf8("MED BAG"), Utf8("MED BOX"), Utf8("MED PKG"), Utf8("MED PACK")]) AND lineitem.l_quantity >= Decimal128(Some(2000),11,2) AND lineitem.l_quantity <= Decimal128(Some(3000),11,2) AND part.p_size <= Int32(10) OR part.p_brand = 
Utf8("Brand#52") AND part.p_container IN ([Utf8("LG CASE"), Utf8("LG BOX"), Utf8("LG PACK"), Utf8("LG PKG")]) AND lineitem.l_quantity >= Decimal128(Some(3000),11,2) AND lineitem.l_quantity <= Decimal128(Some(4000),11,2) AND part.p_size <= Int32(15) + Projection: lineitem.l_partkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount + Filter: (lineitem.l_quantity >= Decimal128(Some(800),11,2) AND lineitem.l_quantity <= Decimal128(Some(1800),11,2) OR lineitem.l_quantity >= Decimal128(Some(2000),11,2) AND lineitem.l_quantity <= Decimal128(Some(3000),11,2) OR lineitem.l_quantity >= Decimal128(Some(3000),11,2) AND lineitem.l_quantity <= Decimal128(Some(4000),11,2)) AND (lineitem.l_shipmode = Utf8("AIR") OR lineitem.l_shipmode = Utf8("AIR REG")) AND lineitem.l_shipinstruct = Utf8("DELIVER IN PERSON") + TableScan: lineitem projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], partial_filters=[lineitem.l_shipmode = Utf8("AIR") OR lineitem.l_shipmode = Utf8("AIR REG"), lineitem.l_shipinstruct = Utf8("DELIVER IN PERSON"), lineitem.l_quantity >= Decimal128(Some(800),11,2) AND lineitem.l_quantity <= Decimal128(Some(1800),11,2) OR lineitem.l_quantity >= Decimal128(Some(2000),11,2) AND lineitem.l_quantity <= Decimal128(Some(3000),11,2) OR lineitem.l_quantity >= Decimal128(Some(3000),11,2) AND lineitem.l_quantity <= Decimal128(Some(4000),11,2)] + Filter: (part.p_brand = Utf8("Brand#21") AND part.p_container IN ([Utf8("SM CASE"), Utf8("SM BOX"), Utf8("SM PACK"), Utf8("SM PKG")]) AND part.p_size <= Int32(5) OR part.p_brand = Utf8("Brand#13") AND part.p_container IN ([Utf8("MED BAG"), Utf8("MED BOX"), Utf8("MED PKG"), Utf8("MED PACK")]) AND part.p_size <= Int32(10) OR part.p_brand = Utf8("Brand#52") AND part.p_container IN ([Utf8("LG CASE"), Utf8("LG BOX"), Utf8("LG PACK"), Utf8("LG PKG")]) AND part.p_size <= Int32(15)) AND part.p_size >= Int32(1) + TableScan: part projection=[p_partkey, p_brand, p_size, p_container], 
partial_filters=[part.p_size >= Int32(1), part.p_brand = Utf8("Brand#21") AND part.p_container IN ([Utf8("SM CASE"), Utf8("SM BOX"), Utf8("SM PACK"), Utf8("SM PKG")]) AND part.p_size <= Int32(5) OR part.p_brand = Utf8("Brand#13") AND part.p_container IN ([Utf8("MED BAG"), Utf8("MED BOX"), Utf8("MED PKG"), Utf8("MED PACK")]) AND part.p_size <= Int32(10) OR part.p_brand = Utf8("Brand#52") AND part.p_container IN ([Utf8("LG CASE"), Utf8("LG BOX"), Utf8("LG PACK"), Utf8("LG PKG")]) AND part.p_size <= Int32(15)] + +DataFusion Physical Plan +======================== + +ProjectionExec: expr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@0 as revenue] + AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] + CoalescePartitionsExec + AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@0)], filter=p_brand@1 = Brand#21 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("SM CASE") }, Literal { value: Utf8("SM BOX") }, Literal { value: Utf8("SM PACK") }, Literal { value: Utf8("SM PKG") }]) AND l_quantity@0 >= Some(800),11,2 AND l_quantity@0 <= Some(1800),11,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#13 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("MED BAG") }, Literal { value: Utf8("MED BOX") }, Literal { value: Utf8("MED PKG") }, Literal { value: Utf8("MED PACK") }]) AND l_quantity@0 >= Some(2000),11,2 AND l_quantity@0 <= Some(3000),11,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#52 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("LG CASE") }, Literal { value: Utf8("LG BOX") }, Literal { value: Utf8("LG PACK") }, Literal { value: Utf8("LG PKG") }]) AND l_quantity@0 >= Some(3000),11,2 AND l_quantity@0 <= Some(4000),11,2 AND p_size@2 <= 15, projection=[l_extendedprice@6, l_discount@7] + CoalesceBatchesExec: 
target_batch_size=8192 + RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: (p_brand@1 = Brand#21 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("SM CASE") }, Literal { value: Utf8("SM BOX") }, Literal { value: Utf8("SM PACK") }, Literal { value: Utf8("SM PKG") }]) AND p_size@2 <= 5 OR p_brand@1 = Brand#13 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("MED BAG") }, Literal { value: Utf8("MED BOX") }, Literal { value: Utf8("MED PKG") }, Literal { value: Utf8("MED PACK") }]) AND p_size@2 <= 10 OR p_brand@1 = Brand#52 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("LG CASE") }, Literal { value: Utf8("LG BOX") }, Literal { value: Utf8("LG PACK") }, Literal { value: Utf8("LG PKG") }]) AND p_size@2 <= 15) AND p_size@2 >= 1 + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... }]) AND p_size@5 <= 15), pruning_predicate=CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_max@0 >= 1 END AND (CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#21 AND Brand#21 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM CASE AND SM CASE <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM BOX AND SM BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM PACK AND SM PACK <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM PKG AND SM PKG <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 5 END OR CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#13 AND Brand#13 <= 
p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED BAG AND MED BAG <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED BOX AND MED BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED PKG AND MED PKG <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED PACK AND MED PACK <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 10 END OR CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#52 AND Brand#52 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG CASE AND LG CASE <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG BOX AND LG BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG PACK AND LG PACK <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG PKG AND LG PKG <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 15 END), required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_partkey@0], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: (l_quantity@1 >= Some(800),11,2 AND l_quantity@1 <= Some(1800),11,2 OR l_quantity@1 >= Some(2000),11,2 AND l_quantity@1 <= Some(3000),11,2 OR l_quantity@1 >= Some(3000),11,2 AND l_quantity@1 <= Some(4000),11,2) AND (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND 
l_shipinstruct@4 = DELIVER IN PERSON, projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3] + ParquetExec: file_groups={ ... }, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], predicate=(l_shipmode@14 = AIR OR l_shipmode@14 = AIR REG) AND l_shipinstruct@13 = DELIVER IN PERSON AND (l_quantity@4 >= Some(800),11,2 AND l_quantity@4 <= Some(1800),11,2 OR l_quantity@4 >= Some(2000),11,2 AND l_quantity@4 <= Some(3000),11,2 OR l_quantity@4 >= Some(3000),11,2 AND l_quantity@4 <= Some(4000),11,2), pruning_predicate=(CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= AIR AND AIR <= l_shipmode_max@1 END OR CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= AIR REG AND AIR REG <= l_shipmode_max@1 END) AND CASE WHEN l_shipinstruct_null_count@6 = l_shipinstruct_row_count@7 THEN false ELSE l_shipinstruct_min@4 <= DELIVER IN PERSON AND DELIVER IN PERSON <= l_shipinstruct_max@5 END AND (CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(800),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(1800),11,2 END OR CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(2000),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(3000),11,2 END OR CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(3000),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(4000),11,2 END), required_guarantees=[l_shipmode in (AIR REG, AIR), l_shipinstruct in (DELIVER IN PERSON)] + +DataFusion Ray Distributed Plan +=========== + +Query Stage #0 (1 -> 2): +RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: 
"p_partkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: (p_brand@1 = Brand#21 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("SM CASE") }, Literal { value: Utf8("SM BOX") }, Literal { value: Utf8("SM PACK") }, Literal { value: Utf8("SM PKG") }]) AND p_size@2 <= 5 OR p_brand@1 = Brand#13 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("MED BAG") }, Literal { value: Utf8("MED BOX") }, Literal { value: Utf8("MED PKG") }, Literal { value: Utf8("MED PACK") }]) AND p_size@2 <= 10 OR p_brand@1 = Brand#52 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("LG CASE") }, Literal { value: Utf8("LG BOX") }, Literal { value: Utf8("LG PACK") }, Literal { value: Utf8("LG PKG") }]) AND p_size@2 <= 15) AND p_size@2 >= 1 + ParquetExec: file_groups={ ... }]) AND p_size@5 <= 15), pruning_predicate=CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_max@0 >= 1 END AND (CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#21 AND Brand#21 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM CASE AND SM CASE <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM BOX AND SM BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM PACK AND SM PACK <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM PKG AND SM PKG <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 5 END OR CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#13 AND Brand#13 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED BAG AND MED 
BAG <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED BOX AND MED BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED PKG AND MED PKG <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED PACK AND MED PACK <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 10 END OR CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#52 AND Brand#52 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG CASE AND LG CASE <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG BOX AND LG BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG PACK AND LG PACK <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG PKG AND LG PKG <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 15 END), required_guarantees=[] + +Query Stage #1 (2 -> 2): +RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: (l_quantity@1 >= Some(800),11,2 AND l_quantity@1 <= Some(1800),11,2 OR l_quantity@1 >= Some(2000),11,2 AND l_quantity@1 <= Some(3000),11,2 OR l_quantity@1 >= Some(3000),11,2 AND l_quantity@1 <= Some(4000),11,2) AND (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON, projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3] + ParquetExec: 
file_groups={ ... }, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], predicate=(l_shipmode@14 = AIR OR l_shipmode@14 = AIR REG) AND l_shipinstruct@13 = DELIVER IN PERSON AND (l_quantity@4 >= Some(800),11,2 AND l_quantity@4 <= Some(1800),11,2 OR l_quantity@4 >= Some(2000),11,2 AND l_quantity@4 <= Some(3000),11,2 OR l_quantity@4 >= Some(3000),11,2 AND l_quantity@4 <= Some(4000),11,2), pruning_predicate=(CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= AIR AND AIR <= l_shipmode_max@1 END OR CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= AIR REG AND AIR REG <= l_shipmode_max@1 END) AND CASE WHEN l_shipinstruct_null_count@6 = l_shipinstruct_row_count@7 THEN false ELSE l_shipinstruct_min@4 <= DELIVER IN PERSON AND DELIVER IN PERSON <= l_shipinstruct_max@5 END AND (CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(800),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(1800),11,2 END OR CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(2000),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(3000),11,2 END OR CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(3000),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(4000),11,2 END), required_guarantees=[l_shipmode in (AIR REG, AIR), l_shipinstruct in (DELIVER IN PERSON)] + +Query Stage #2 (2 -> 1): +RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([], 2)) + AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, 
join_type=Inner, on=[(p_partkey@0, l_partkey@0)], filter=p_brand@1 = Brand#21 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("SM CASE") }, Literal { value: Utf8("SM BOX") }, Literal { value: Utf8("SM PACK") }, Literal { value: Utf8("SM PKG") }]) AND l_quantity@0 >= Some(800),11,2 AND l_quantity@0 <= Some(1800),11,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#13 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("MED BAG") }, Literal { value: Utf8("MED BOX") }, Literal { value: Utf8("MED PKG") }, Literal { value: Utf8("MED PACK") }]) AND l_quantity@0 >= Some(2000),11,2 AND l_quantity@0 <= Some(3000),11,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#52 AND Use p_container@3 IN (SET) ([Literal { value: Utf8("LG CASE") }, Literal { value: Utf8("LG BOX") }, Literal { value: Utf8("LG PACK") }, Literal { value: Utf8("LG PKG") }]) AND l_quantity@0 >= Some(3000),11,2 AND l_quantity@0 <= Some(4000),11,2 AND p_size@2 <= 15, projection=[l_extendedprice@6, l_discount@7] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) + +Query Stage #3 (1 -> 1): +ProjectionExec: expr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@0 as revenue] + AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] + CoalescePartitionsExec + RayShuffleReaderExec(stage_id=2, input_partitioning=Hash([], 2)) + diff --git a/testdata/expected-plans/q2.txt b/testdata/expected-plans/q2.txt index 456c8da..bca84ad 100644 --- a/testdata/expected-plans/q2.txt +++ b/testdata/expected-plans/q2.txt @@ -1,140 +1,134 @@ DataFusion Logical Plan ======================= -Limit: skip=0, fetch=100 - Sort: supplier.s_acctbal DESC NULLS FIRST, nation.n_name ASC NULLS LAST, supplier.s_name ASC NULLS LAST, 
part.p_partkey ASC NULLS LAST, fetch=100 - Projection: supplier.s_acctbal, supplier.s_name, nation.n_name, part.p_partkey, part.p_mfgr, supplier.s_address, supplier.s_phone, supplier.s_comment - Inner Join: part.p_partkey = __scalar_sq_1.ps_partkey, partsupp.ps_supplycost = __scalar_sq_1.min(partsupp.ps_supplycost) - Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost, nation.n_name - Inner Join: nation.n_regionkey = region.r_regionkey - Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost, nation.n_name, nation.n_regionkey - Inner Join: supplier.s_nationkey = nation.n_nationkey - Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_nationkey, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost - Inner Join: partsupp.ps_suppkey = supplier.s_suppkey - Projection: part.p_partkey, part.p_mfgr, partsupp.ps_suppkey, partsupp.ps_supplycost - Inner Join: part.p_partkey = partsupp.ps_partkey - Projection: part.p_partkey, part.p_mfgr - Filter: part.p_size = Int32(48) AND part.p_type LIKE Utf8("%TIN") - TableScan: part projection=[p_partkey, p_mfgr, p_type, p_size], partial_filters=[part.p_size = Int32(48), part.p_type LIKE Utf8("%TIN")] +Sort: supplier.s_acctbal DESC NULLS FIRST, nation.n_name ASC NULLS LAST, supplier.s_name ASC NULLS LAST, part.p_partkey ASC NULLS LAST, fetch=100 + Projection: supplier.s_acctbal, supplier.s_name, nation.n_name, part.p_partkey, part.p_mfgr, supplier.s_address, supplier.s_phone, supplier.s_comment + Inner Join: part.p_partkey = __scalar_sq_1.ps_partkey, partsupp.ps_supplycost = __scalar_sq_1.min(partsupp.ps_supplycost) + Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, 
partsupp.ps_supplycost, nation.n_name + Inner Join: nation.n_regionkey = region.r_regionkey + Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost, nation.n_name, nation.n_regionkey + Inner Join: supplier.s_nationkey = nation.n_nationkey + Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_nationkey, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost + Inner Join: partsupp.ps_suppkey = supplier.s_suppkey + Projection: part.p_partkey, part.p_mfgr, partsupp.ps_suppkey, partsupp.ps_supplycost + Inner Join: part.p_partkey = partsupp.ps_partkey + Projection: part.p_partkey, part.p_mfgr + Filter: part.p_size = Int32(48) AND part.p_type LIKE Utf8("%TIN") + TableScan: part projection=[p_partkey, p_mfgr, p_type, p_size], partial_filters=[part.p_size = Int32(48), part.p_type LIKE Utf8("%TIN")] + TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] + TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] + TableScan: nation projection=[n_nationkey, n_name, n_regionkey] + Projection: region.r_regionkey + Filter: region.r_name = Utf8("ASIA") + TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("ASIA")] + SubqueryAlias: __scalar_sq_1 + Projection: min(partsupp.ps_supplycost), partsupp.ps_partkey + Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[min(partsupp.ps_supplycost)]] + Projection: partsupp.ps_partkey, partsupp.ps_supplycost + Inner Join: nation.n_regionkey = region.r_regionkey + Projection: partsupp.ps_partkey, partsupp.ps_supplycost, nation.n_regionkey + Inner Join: supplier.s_nationkey = nation.n_nationkey + Projection: partsupp.ps_partkey, partsupp.ps_supplycost, supplier.s_nationkey + Inner Join: partsupp.ps_suppkey = supplier.s_suppkey TableScan: partsupp projection=[ps_partkey, 
ps_suppkey, ps_supplycost] - TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] - TableScan: nation projection=[n_nationkey, n_name, n_regionkey] - Projection: region.r_regionkey - Filter: region.r_name = Utf8("ASIA") - TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("ASIA")] - SubqueryAlias: __scalar_sq_1 - Projection: min(partsupp.ps_supplycost), partsupp.ps_partkey - Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[min(partsupp.ps_supplycost)]] - Projection: partsupp.ps_partkey, partsupp.ps_supplycost - Inner Join: nation.n_regionkey = region.r_regionkey - Projection: partsupp.ps_partkey, partsupp.ps_supplycost, nation.n_regionkey - Inner Join: supplier.s_nationkey = nation.n_nationkey - Projection: partsupp.ps_partkey, partsupp.ps_supplycost, supplier.s_nationkey - Inner Join: partsupp.ps_suppkey = supplier.s_suppkey - TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] - TableScan: supplier projection=[s_suppkey, s_nationkey] - TableScan: nation projection=[n_nationkey, n_regionkey] - Projection: region.r_regionkey - Filter: region.r_name = Utf8("ASIA") - TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("ASIA")] + TableScan: supplier projection=[s_suppkey, s_nationkey] + TableScan: nation projection=[n_nationkey, n_regionkey] + Projection: region.r_regionkey + Filter: region.r_name = Utf8("ASIA") + TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("ASIA")] DataFusion Physical Plan ======================== -GlobalLimitExec: skip=0, fetch=100 - SortPreservingMergeExec: [s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST], fetch=100 - SortExec: TopK(fetch=100), expr=[s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[s_acctbal@5 as 
s_acctbal, s_name@2 as s_name, n_name@7 as n_name, p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_address@3 as s_address, s_phone@4 as s_phone, s_comment@6 as s_comment] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@1), (ps_supplycost@7, min(partsupp.ps_supplycost)@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@4, s_acctbal@5, s_comment@6, n_name@8] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0, ps_supplycost@7], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@9)], projection=[p_partkey@1, p_mfgr@2, s_name@3, s_address@4, s_phone@5, s_acctbal@6, s_comment@7, ps_supplycost@8, n_name@9] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([r_regionkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ProjectionExec: expr=[r_regionkey@0 as r_regionkey] +SortPreservingMergeExec: [s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST], fetch=100 + SortExec: TopK(fetch=100), expr=[s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[s_acctbal@5 as s_acctbal, s_name@2 as s_name, n_name@7 as n_name, p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_address@3 as s_address, s_phone@4 as s_phone, s_comment@6 as s_comment] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@1), (ps_supplycost@7, min(partsupp.ps_supplycost)@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@4, s_acctbal@5, s_comment@6, n_name@8] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([p_partkey@0, ps_supplycost@7], 
2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@9)], projection=[p_partkey@1, p_mfgr@2, s_name@3, s_address@4, s_phone@5, s_acctbal@6, s_comment@7, ps_supplycost@8, n_name@9] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([r_regionkey@0], 2), input_partitions=2 + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0] + ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([n_regionkey@9], 2), input_partitions=2 + ProjectionExec: expr=[p_partkey@2 as p_partkey, p_mfgr@3 as p_mfgr, s_name@4 as s_name, s_address@5 as s_address, s_phone@6 as s_phone, s_acctbal@7 as s_acctbal, s_comment@8 as s_comment, ps_supplycost@9 as ps_supplycost, n_name@0 as n_name, n_regionkey@1 as n_regionkey] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@4)], projection=[n_name@1, n_regionkey@2, p_partkey@3, p_mfgr@4, s_name@5, s_address@6, s_phone@8, s_acctbal@9, s_comment@10, ps_supplycost@11] CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = ASIA - ParquetExec: file_groups={ ... 
}, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_regionkey@9], 2), input_partitions=2 - ProjectionExec: expr=[p_partkey@2 as p_partkey, p_mfgr@3 as p_mfgr, s_name@4 as s_name, s_address@5 as s_address, s_phone@6 as s_phone, s_acctbal@7 as s_acctbal, s_comment@8 as s_comment, ps_supplycost@9 as ps_supplycost, n_name@0 as n_name, n_regionkey@1 as n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@4)], projection=[n_name@1, n_regionkey@2, p_partkey@3, p_mfgr@4, s_name@5, s_address@6, s_phone@8, s_acctbal@9, s_comment@10, ps_supplycost@11] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name, n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@4], 2), input_partitions=2 - ProjectionExec: expr=[p_partkey@6 as p_partkey, p_mfgr@7 as p_mfgr, s_name@0 as s_name, s_address@1 as s_address, s_nationkey@2 as s_nationkey, s_phone@3 as s_phone, s_acctbal@4 as s_acctbal, s_comment@5 as s_comment, ps_supplycost@8 as ps_supplycost] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@2)], projection=[s_name@1, s_address@2, s_nationkey@3, s_phone@4, s_acctbal@5, s_comment@6, p_partkey@7, p_mfgr@8, ps_supplycost@10] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... 
}, projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_suppkey@2], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_partkey@0, p_mfgr@1, ps_suppkey@3, ps_supplycost@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - ProjectionExec: expr=[p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_size@3 = 48 AND p_type@2 LIKE %TIN - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_mfgr, p_type, p_size], predicate=p_size@5 = 48 AND p_type@4 LIKE %TIN, pruning_predicate=CASE WHEN p_size_null_count@2 = p_size_row_count@3 THEN false ELSE p_size_min@0 <= 48 AND 48 <= p_size_max@1 END, required_guarantees=[p_size in (48)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... 
}, projection=[ps_partkey, ps_suppkey, ps_supplycost] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_partkey@1, min(partsupp.ps_supplycost)@0], 2), input_partitions=2 - ProjectionExec: expr=[min(partsupp.ps_supplycost)@1 as min(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey] - AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@2)], projection=[ps_partkey@1, ps_supplycost@2] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([r_regionkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ProjectionExec: expr=[r_regionkey@0 as r_regionkey] + RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 + ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_name, n_regionkey] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([s_nationkey@4], 2), input_partitions=2 + ProjectionExec: expr=[p_partkey@6 as p_partkey, p_mfgr@7 as p_mfgr, s_name@0 as s_name, s_address@1 as s_address, s_nationkey@2 as s_nationkey, s_phone@3 as s_phone, s_acctbal@4 as s_acctbal, s_comment@5 as s_comment, ps_supplycost@8 as ps_supplycost] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@2)], projection=[s_name@1, s_address@2, s_nationkey@3, s_phone@4, s_acctbal@5, s_comment@6, p_partkey@7, p_mfgr@8, ps_supplycost@10] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = ASIA - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_regionkey@2], 2), input_partitions=2 - ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_supplycost@2 as ps_supplycost, n_regionkey@0 as n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@2)], projection=[n_regionkey@1, ps_partkey@2, ps_supplycost@3] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 - ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@2], 2), input_partitions=2 - ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_supplycost@2 as ps_supplycost, s_nationkey@0 as s_nationkey] + RepartitionExec: partitioning=Hash([ps_suppkey@2], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_partkey@0, p_mfgr@1, ps_suppkey@3, ps_supplycost@4] CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@1)], projection=[s_nationkey@1, ps_partkey@2, ps_supplycost@4] + RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 + FilterExec: p_size@3 = 48 AND p_type@2 LIKE %TIN, projection=[p_partkey@0, p_mfgr@1] RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_suppkey@1], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey, ps_supplycost] + ParquetExec: file_groups={ ... }, projection=[p_partkey, p_mfgr, p_type, p_size], predicate=p_size@5 = 48 AND p_type@4 LIKE %TIN, pruning_predicate=CASE WHEN p_size_null_count@2 = p_size_row_count@3 THEN false ELSE p_size_min@0 <= 48 AND 48 <= p_size_max@1 END, required_guarantees=[p_size in (48)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 + ParquetExec: file_groups={ ... 
}, projection=[ps_partkey, ps_suppkey, ps_supplycost] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([ps_partkey@1, min(partsupp.ps_supplycost)@0], 2), input_partitions=2 + ProjectionExec: expr=[min(partsupp.ps_supplycost)@1 as min(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey] + AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@2)], projection=[ps_partkey@1, ps_supplycost@2] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([r_regionkey@0], 2), input_partitions=2 + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0] + ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([n_regionkey@2], 2), input_partitions=2 + ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_supplycost@2 as ps_supplycost, n_regionkey@0 as n_regionkey] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@2)], projection=[n_regionkey@1, ps_partkey@2, ps_supplycost@3] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 + ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_regionkey] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([s_nationkey@2], 2), input_partitions=2 + ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_supplycost@2 as ps_supplycost, s_nationkey@0 as s_nationkey] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@1)], projection=[s_nationkey@1, ps_partkey@2, ps_supplycost@4] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([ps_suppkey@1], 2), input_partitions=2 + ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey, ps_supplycost] DataFusion Ray Distributed Plan =========== Query Stage #0 (1 -> 2): RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - ProjectionExec: expr=[r_regionkey@0 as r_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = ASIA - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0] + ParquetExec: file_groups={ ... 
}, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] Query Stage #1 (1 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) @@ -146,10 +140,9 @@ RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "s_sup Query Stage #3 (1 -> 2): RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - ProjectionExec: expr=[p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_size@3 = 48 AND p_type@2 LIKE %TIN - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_mfgr, p_type, p_size], predicate=p_size@5 = 48 AND p_type@4 LIKE %TIN, pruning_predicate=CASE WHEN p_size_null_count@2 = p_size_row_count@3 THEN false ELSE p_size_min@0 <= 48 AND 48 <= p_size_max@1 END, required_guarantees=[p_size in (48)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_size@3 = 48 AND p_type@2 LIKE %TIN, projection=[p_partkey@0, p_mfgr@1] + ParquetExec: file_groups={ ... 
}, projection=[p_partkey, p_mfgr, p_type, p_size], predicate=p_size@5 = 48 AND p_type@4 LIKE %TIN, pruning_predicate=CASE WHEN p_size_null_count@2 = p_size_row_count@3 THEN false ELSE p_size_min@0 <= 48 AND 48 <= p_size_max@1 END, required_guarantees=[p_size in (48)] Query Stage #4 (2 -> 2): RayShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) @@ -195,10 +188,9 @@ RayShuffleWriterExec(stage_id=8, output_partitioning=Hash([Column { name: "p_par Query Stage #9 (1 -> 2): RayShuffleWriterExec(stage_id=9, output_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - ProjectionExec: expr=[r_regionkey@0 as r_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = ASIA - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0] + ParquetExec: file_groups={ ... 
}, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] Query Stage #10 (1 -> 2): RayShuffleWriterExec(stage_id=10, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) @@ -260,8 +252,7 @@ RayShuffleWriterExec(stage_id=17, output_partitioning=Hash([Column { name: "p_pa CoalesceBatchesExec: target_batch_size=8192 RayShuffleReaderExec(stage_id=16, input_partitioning=Hash([Column { name: "ps_partkey", index: 1 }, Column { name: "min(partsupp.ps_supplycost)", index: 0 }], 2)) -Query Stage #18 (1 -> 1): -GlobalLimitExec: skip=0, fetch=100 - SortPreservingMergeExec: [s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST], fetch=100 - RayShuffleReaderExec(stage_id=17, input_partitioning=Hash([Column { name: "p_partkey", index: 3 }], 2)) +Query Stage #18 (2 -> 1): +SortPreservingMergeExec: [s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST], fetch=100 + RayShuffleReaderExec(stage_id=17, input_partitioning=Hash([Column { name: "p_partkey", index: 3 }], 2)) diff --git a/testdata/expected-plans/q20.txt b/testdata/expected-plans/q20.txt index 946f7f3..42bddd2 100644 --- a/testdata/expected-plans/q20.txt +++ b/testdata/expected-plans/q20.txt @@ -40,10 +40,9 @@ SortPreservingMergeExec: [s_name@0 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ProjectionExec: expr=[n_nationkey@0 as n_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = KENYA - ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_name], predicate=n_name@1 = KENYA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= KENYA AND KENYA <= n_name_max@1 END, required_guarantees=[n_name in (KENYA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = KENYA, projection=[n_nationkey@0] + ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = KENYA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= KENYA AND KENYA <= n_name_max@1 END, required_guarantees=[n_name in (KENYA)] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([s_nationkey@3], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -58,11 +57,10 @@ SortPreservingMergeExec: [s_name@0 ASC NULLS LAST] HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(p_partkey@0, ps_partkey@0)] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - ProjectionExec: expr=[p_partkey@0 as p_partkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_name@1 LIKE blanched% - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE blanched% + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_name@1 LIKE blanched%, projection=[p_partkey@0] + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE blanched% CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 ParquetExec: file_groups={ ... 
}, projection=[ps_partkey, ps_suppkey, ps_availqty] @@ -71,20 +69,18 @@ SortPreservingMergeExec: [s_name@0 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([l_partkey@0, l_suppkey@1], 2), input_partitions=2 AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)] - ProjectionExec: expr=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey, l_quantity@2 as l_quantity] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1993-01-01 AND l_shipdate@3 < 1994-01-01 - ParquetExec: file_groups={ ... }, projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], predicate=l_shipdate@10 >= 1993-01-01 AND l_shipdate@10 < 1994-01-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1993-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1994-01-01 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_shipdate@3 >= 1993-01-01 AND l_shipdate@3 < 1994-01-01, projection=[l_partkey@0, l_suppkey@1, l_quantity@2] + ParquetExec: file_groups={ ... }, projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], predicate=l_shipdate@10 >= 1993-01-01 AND l_shipdate@10 < 1994-01-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1993-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1994-01-01 END, required_guarantees=[] DataFusion Ray Distributed Plan =========== Query Stage #0 (1 -> 2): RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ProjectionExec: expr=[n_nationkey@0 as n_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = KENYA - ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_name], predicate=n_name@1 = KENYA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= KENYA AND KENYA <= n_name_max@1 END, required_guarantees=[n_name in (KENYA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = KENYA, projection=[n_nationkey@0] + ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = KENYA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= KENYA AND KENYA <= n_name_max@1 END, required_guarantees=[n_name in (KENYA)] Query Stage #1 (1 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2)) @@ -101,10 +97,9 @@ RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "s_sup Query Stage #3 (1 -> 2): RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - ProjectionExec: expr=[p_partkey@0 as p_partkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_name@1 LIKE blanched% - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE blanched% + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_name@1 LIKE blanched%, projection=[p_partkey@0] + ParquetExec: file_groups={ ... 
}, projection=[p_partkey, p_name], predicate=p_name@1 LIKE blanched% Query Stage #4 (2 -> 2): RayShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) @@ -122,10 +117,9 @@ RayShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "ps_pa Query Stage #6 (2 -> 2): RayShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "l_partkey", index: 0 }, Column { name: "l_suppkey", index: 1 }], 2)) AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)] - ProjectionExec: expr=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey, l_quantity@2 as l_quantity] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1993-01-01 AND l_shipdate@3 < 1994-01-01 - ParquetExec: file_groups={ ... }, projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], predicate=l_shipdate@10 >= 1993-01-01 AND l_shipdate@10 < 1994-01-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1993-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1994-01-01 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_shipdate@3 >= 1993-01-01 AND l_shipdate@3 < 1994-01-01, projection=[l_partkey@0, l_suppkey@1, l_quantity@2] + ParquetExec: file_groups={ ... 
}, projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], predicate=l_shipdate@10 >= 1993-01-01 AND l_shipdate@10 < 1994-01-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1993-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1994-01-01 END, required_guarantees=[] Query Stage #7 (2 -> 2): RayShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2)) diff --git a/testdata/expected-plans/q21.txt b/testdata/expected-plans/q21.txt index 6121007..6a09362 100644 --- a/testdata/expected-plans/q21.txt +++ b/testdata/expected-plans/q21.txt @@ -1,115 +1,107 @@ DataFusion Logical Plan ======================= -Limit: skip=0, fetch=100 - Sort: numwait DESC NULLS FIRST, supplier.s_name ASC NULLS LAST, fetch=100 - Projection: supplier.s_name, count(*) AS numwait - Aggregate: groupBy=[[supplier.s_name]], aggr=[[count(Int64(1)) AS count(*)]] - Projection: supplier.s_name - LeftAnti Join: l1.l_orderkey = __correlated_sq_2.l_orderkey Filter: __correlated_sq_2.l_suppkey != l1.l_suppkey - LeftSemi Join: l1.l_orderkey = __correlated_sq_1.l_orderkey Filter: __correlated_sq_1.l_suppkey != l1.l_suppkey - Projection: supplier.s_name, l1.l_orderkey, l1.l_suppkey - Inner Join: supplier.s_nationkey = nation.n_nationkey - Projection: supplier.s_name, supplier.s_nationkey, l1.l_orderkey, l1.l_suppkey - Inner Join: l1.l_orderkey = orders.o_orderkey - Projection: supplier.s_name, supplier.s_nationkey, l1.l_orderkey, l1.l_suppkey - Inner Join: supplier.s_suppkey = l1.l_suppkey - TableScan: supplier projection=[s_suppkey, s_name, s_nationkey] - SubqueryAlias: l1 - Projection: lineitem.l_orderkey, lineitem.l_suppkey - Filter: lineitem.l_receiptdate > lineitem.l_commitdate - TableScan: lineitem projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], partial_filters=[lineitem.l_receiptdate > lineitem.l_commitdate] - 
Projection: orders.o_orderkey - Filter: orders.o_orderstatus = Utf8("F") - TableScan: orders projection=[o_orderkey, o_orderstatus], partial_filters=[orders.o_orderstatus = Utf8("F")] - Projection: nation.n_nationkey - Filter: nation.n_name = Utf8("ARGENTINA") - TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8("ARGENTINA")] - SubqueryAlias: __correlated_sq_1 - SubqueryAlias: l2 - TableScan: lineitem projection=[l_orderkey, l_suppkey] - SubqueryAlias: __correlated_sq_2 - SubqueryAlias: l3 - Projection: lineitem.l_orderkey, lineitem.l_suppkey - Filter: lineitem.l_receiptdate > lineitem.l_commitdate - TableScan: lineitem projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], partial_filters=[lineitem.l_receiptdate > lineitem.l_commitdate] +Sort: numwait DESC NULLS FIRST, supplier.s_name ASC NULLS LAST, fetch=100 + Projection: supplier.s_name, count(*) AS numwait + Aggregate: groupBy=[[supplier.s_name]], aggr=[[count(Int64(1)) AS count(*)]] + Projection: supplier.s_name + LeftAnti Join: l1.l_orderkey = __correlated_sq_2.l_orderkey Filter: __correlated_sq_2.l_suppkey != l1.l_suppkey + LeftSemi Join: l1.l_orderkey = __correlated_sq_1.l_orderkey Filter: __correlated_sq_1.l_suppkey != l1.l_suppkey + Projection: supplier.s_name, l1.l_orderkey, l1.l_suppkey + Inner Join: supplier.s_nationkey = nation.n_nationkey + Projection: supplier.s_name, supplier.s_nationkey, l1.l_orderkey, l1.l_suppkey + Inner Join: l1.l_orderkey = orders.o_orderkey + Projection: supplier.s_name, supplier.s_nationkey, l1.l_orderkey, l1.l_suppkey + Inner Join: supplier.s_suppkey = l1.l_suppkey + TableScan: supplier projection=[s_suppkey, s_name, s_nationkey] + SubqueryAlias: l1 + Projection: lineitem.l_orderkey, lineitem.l_suppkey + Filter: lineitem.l_receiptdate > lineitem.l_commitdate + TableScan: lineitem projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], partial_filters=[lineitem.l_receiptdate > lineitem.l_commitdate] + Projection: 
orders.o_orderkey + Filter: orders.o_orderstatus = Utf8("F") + TableScan: orders projection=[o_orderkey, o_orderstatus], partial_filters=[orders.o_orderstatus = Utf8("F")] + Projection: nation.n_nationkey + Filter: nation.n_name = Utf8("ARGENTINA") + TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8("ARGENTINA")] + SubqueryAlias: __correlated_sq_1 + SubqueryAlias: l2 + TableScan: lineitem projection=[l_orderkey, l_suppkey] + SubqueryAlias: __correlated_sq_2 + SubqueryAlias: l3 + Projection: lineitem.l_orderkey, lineitem.l_suppkey + Filter: lineitem.l_receiptdate > lineitem.l_commitdate + TableScan: lineitem projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], partial_filters=[lineitem.l_receiptdate > lineitem.l_commitdate] DataFusion Physical Plan ======================== -GlobalLimitExec: skip=0, fetch=100 - SortPreservingMergeExec: [numwait@1 DESC,s_name@0 ASC NULLS LAST], fetch=100 - SortExec: TopK(fetch=100), expr=[numwait@1 DESC,s_name@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[s_name@0 as s_name, count(*)@1 as numwait] - AggregateExec: mode=FinalPartitioned, gby=[s_name@0 as s_name], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_name@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[s_name@0 as s_name], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0, projection=[s_name@0] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0 - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, 
s_nationkey@1)], projection=[s_name@1, l_orderkey@3, l_suppkey@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ProjectionExec: expr=[n_nationkey@0 as n_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ARGENTINA - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ARGENTINA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ARGENTINA AND ARGENTINA <= n_name_max@1 END, required_guarantees=[n_name in (ARGENTINA)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@1], 2), input_partitions=2 +SortPreservingMergeExec: [numwait@1 DESC,s_name@0 ASC NULLS LAST], fetch=100 + SortExec: TopK(fetch=100), expr=[numwait@1 DESC,s_name@0 ASC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[s_name@0 as s_name, count(*)@1 as numwait] + AggregateExec: mode=FinalPartitioned, gby=[s_name@0 as s_name], aggr=[count(*)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([s_name@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[s_name@0 as s_name], aggr=[count(*)] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0, projection=[s_name@0] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0 + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_orderkey@1], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@1)], projection=[s_name@1, l_orderkey@3, l_suppkey@4] 
+ CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@2)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 - ProjectionExec: expr=[o_orderkey@0 as o_orderkey] + FilterExec: n_name@1 = ARGENTINA, projection=[n_nationkey@0] + ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ARGENTINA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ARGENTINA AND ARGENTINA <= n_name_max@1 END, required_guarantees=[n_name in (ARGENTINA)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([s_nationkey@1], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@2)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_orderstatus@1 = F, projection=[o_orderkey@0] + ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_orderstatus], predicate=o_orderstatus@2 = F, pruning_predicate=CASE WHEN o_orderstatus_null_count@2 = o_orderstatus_row_count@3 THEN false ELSE o_orderstatus_min@0 <= F AND F <= o_orderstatus_max@1 END, required_guarantees=[o_orderstatus in (F)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_orderkey@2], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4] CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderstatus@1 = F - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderstatus], predicate=o_orderstatus@2 = F, pruning_predicate=CASE WHEN o_orderstatus_null_count@2 = o_orderstatus_row_count@3 THEN false ELSE o_orderstatus_min@0 <= F AND F <= o_orderstatus_max@1 END, required_guarantees=[o_orderstatus in (F)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@2], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_name, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_suppkey@1], 2), input_partitions=2 - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_suppkey@1 as l_suppkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@3 > l_commitdate@2 - ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_suppkey@1 as l_suppkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@3 > l_commitdate@2 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 + RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_name, s_nationkey] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_suppkey@1], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] + ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 + ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] + ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 DataFusion Ray Distributed Plan =========== Query Stage #0 (1 -> 2): RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ProjectionExec: expr=[n_nationkey@0 as n_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ARGENTINA - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ARGENTINA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ARGENTINA AND ARGENTINA <= n_name_max@1 END, required_guarantees=[n_name in (ARGENTINA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = ARGENTINA, projection=[n_nationkey@0] + ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ARGENTINA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ARGENTINA AND ARGENTINA <= n_name_max@1 END, required_guarantees=[n_name in (ARGENTINA)] Query Stage #1 (2 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - ProjectionExec: expr=[o_orderkey@0 as o_orderkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderstatus@1 = F - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderstatus], predicate=o_orderstatus@2 = F, pruning_predicate=CASE WHEN o_orderstatus_null_count@2 = o_orderstatus_row_count@3 THEN false ELSE o_orderstatus_min@0 <= F AND F <= o_orderstatus_max@1 END, required_guarantees=[o_orderstatus in (F)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_orderstatus@1 = F, projection=[o_orderkey@0] + ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_orderstatus], predicate=o_orderstatus@2 = F, pruning_predicate=CASE WHEN o_orderstatus_null_count@2 = o_orderstatus_row_count@3 THEN false ELSE o_orderstatus_min@0 <= F AND F <= o_orderstatus_max@1 END, required_guarantees=[o_orderstatus in (F)] Query Stage #2 (1 -> 2): RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) @@ -117,10 +109,9 @@ RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "s_sup Query Stage #3 (2 -> 2): RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "l_suppkey", index: 1 }], 2)) - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_suppkey@1 as l_suppkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@3 > l_commitdate@2 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] + ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 Query Stage #4 (2 -> 2): RayShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "l_orderkey", index: 2 }], 2)) @@ -155,10 +146,9 @@ RayShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "l_ord Query Stage #8 (2 -> 2): RayShuffleWriterExec(stage_id=8, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_suppkey@1 as l_suppkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@3 > l_commitdate@2 - ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] + ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 Query Stage #9 (2 -> 2): RayShuffleWriterExec(stage_id=9, output_partitioning=Hash([Column { name: "s_name", index: 0 }], 2)) @@ -182,8 +172,7 @@ RayShuffleWriterExec(stage_id=10, output_partitioning=Hash([Column { name: "s_na CoalesceBatchesExec: target_batch_size=8192 RayShuffleReaderExec(stage_id=9, input_partitioning=Hash([Column { name: "s_name", index: 0 }], 2)) -Query Stage #11 (1 -> 1): -GlobalLimitExec: skip=0, fetch=100 - SortPreservingMergeExec: [numwait@1 DESC,s_name@0 ASC NULLS LAST], fetch=100 - RayShuffleReaderExec(stage_id=10, input_partitioning=Hash([Column { name: "s_name", index: 0 }], 2)) +Query Stage #11 (2 -> 1): +SortPreservingMergeExec: [numwait@1 DESC,s_name@0 ASC NULLS LAST], fetch=100 + RayShuffleReaderExec(stage_id=10, input_partitioning=Hash([Column { name: "s_name", index: 0 }], 2)) diff --git a/testdata/expected-plans/q22.txt b/testdata/expected-plans/q22.txt index afebeaf..80da0ec 100644 --- a/testdata/expected-plans/q22.txt +++ b/testdata/expected-plans/q22.txt @@ -34,10 +34,9 @@ SortPreservingMergeExec: [cntrycode@0 ASC NULLS LAST] AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)] CoalescePartitionsExec AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)] - ProjectionExec: expr=[c_acctbal@1 as c_acctbal] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c_acctbal@1 > Some(0),11,2 AND Use substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("24") }, Literal { value: Utf8("34") }, Literal { value: Utf8("16") }, Literal { value: Utf8("30") }, Literal { value: Utf8("33") }, Literal { value: Utf8("14") }, 
Literal { value: Utf8("13") }]) - ParquetExec: file_groups={ ... }]) AND c_acctbal@5 > Some(0),11,2, pruning_predicate=CASE WHEN c_acctbal_null_count@1 = c_acctbal_row_count@2 THEN false ELSE c_acctbal_max@0 > Some(0),11,2 END AND CASE WHEN c_acctbal_null_count@1 = c_acctbal_row_count@2 THEN false ELSE c_acctbal_max@0 > Some(0),11,2 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: c_acctbal@1 > Some(0),11,2 AND Use substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("24") }, Literal { value: Utf8("34") }, Literal { value: Utf8("16") }, Literal { value: Utf8("30") }, Literal { value: Utf8("33") }, Literal { value: Utf8("14") }, Literal { value: Utf8("13") }]), projection=[c_acctbal@1] + ParquetExec: file_groups={ ... }]) AND c_acctbal@5 > Some(0),11,2, pruning_predicate=CASE WHEN c_acctbal_null_count@1 = c_acctbal_row_count@2 THEN false ELSE c_acctbal_max@0 > Some(0),11,2 END AND CASE WHEN c_acctbal_null_count@1 = c_acctbal_row_count@2 THEN false ELSE c_acctbal_max@0 > Some(0),11,2 END, required_guarantees=[] CoalesceBatchesExec: target_batch_size=8192 HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(c_custkey@0, o_custkey@0)], projection=[c_phone@1, c_acctbal@2] CoalesceBatchesExec: target_batch_size=8192 @@ -55,10 +54,9 @@ DataFusion Ray Distributed Plan Query Stage #0 (2 -> 1): RayShuffleWriterExec(stage_id=0, output_partitioning=UnknownPartitioning(2)) AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)] - ProjectionExec: expr=[c_acctbal@1 as c_acctbal] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c_acctbal@1 > Some(0),11,2 AND Use substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("24") }, Literal { value: Utf8("34") }, Literal { value: Utf8("16") }, Literal { value: Utf8("30") }, Literal { value: Utf8("33") }, Literal { value: Utf8("14") }, Literal { value: Utf8("13") }]) - ParquetExec: file_groups={ ... 
}]) AND c_acctbal@5 > Some(0),11,2, pruning_predicate=CASE WHEN c_acctbal_null_count@1 = c_acctbal_row_count@2 THEN false ELSE c_acctbal_max@0 > Some(0),11,2 END AND CASE WHEN c_acctbal_null_count@1 = c_acctbal_row_count@2 THEN false ELSE c_acctbal_max@0 > Some(0),11,2 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: c_acctbal@1 > Some(0),11,2 AND Use substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("24") }, Literal { value: Utf8("34") }, Literal { value: Utf8("16") }, Literal { value: Utf8("30") }, Literal { value: Utf8("33") }, Literal { value: Utf8("14") }, Literal { value: Utf8("13") }]), projection=[c_acctbal@1] + ParquetExec: file_groups={ ... }]) AND c_acctbal@5 > Some(0),11,2, pruning_predicate=CASE WHEN c_acctbal_null_count@1 = c_acctbal_row_count@2 THEN false ELSE c_acctbal_max@0 > Some(0),11,2 END AND CASE WHEN c_acctbal_null_count@1 = c_acctbal_row_count@2 THEN false ELSE c_acctbal_max@0 > Some(0),11,2 END, required_guarantees=[] Query Stage #1 (2 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) diff --git a/testdata/expected-plans/q3.txt b/testdata/expected-plans/q3.txt index 2a1026c..e37300d 100644 --- a/testdata/expected-plans/q3.txt +++ b/testdata/expected-plans/q3.txt @@ -1,67 +1,62 @@ DataFusion Logical Plan ======================= -Limit: skip=0, fetch=10 - Sort: revenue DESC NULLS FIRST, orders.o_orderdate ASC NULLS LAST, fetch=10 - Projection: lineitem.l_orderkey, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue, orders.o_orderdate, orders.o_shippriority - Aggregate: groupBy=[[lineitem.l_orderkey, orders.o_orderdate, orders.o_shippriority]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] - Projection: orders.o_orderdate, orders.o_shippriority, lineitem.l_orderkey, lineitem.l_extendedprice, 
lineitem.l_discount - Inner Join: orders.o_orderkey = lineitem.l_orderkey - Projection: orders.o_orderkey, orders.o_orderdate, orders.o_shippriority - Inner Join: customer.c_custkey = orders.o_custkey - Projection: customer.c_custkey - Filter: customer.c_mktsegment = Utf8("BUILDING") - TableScan: customer projection=[c_custkey, c_mktsegment], partial_filters=[customer.c_mktsegment = Utf8("BUILDING")] - Filter: orders.o_orderdate < Date32("1995-03-15") - TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate, o_shippriority], partial_filters=[orders.o_orderdate < Date32("1995-03-15")] - Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount - Filter: lineitem.l_shipdate > Date32("1995-03-15") - TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate > Date32("1995-03-15")] +Sort: revenue DESC NULLS FIRST, orders.o_orderdate ASC NULLS LAST, fetch=10 + Projection: lineitem.l_orderkey, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue, orders.o_orderdate, orders.o_shippriority + Aggregate: groupBy=[[lineitem.l_orderkey, orders.o_orderdate, orders.o_shippriority]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] + Projection: orders.o_orderdate, orders.o_shippriority, lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount + Inner Join: orders.o_orderkey = lineitem.l_orderkey + Projection: orders.o_orderkey, orders.o_orderdate, orders.o_shippriority + Inner Join: customer.c_custkey = orders.o_custkey + Projection: customer.c_custkey + Filter: customer.c_mktsegment = Utf8("BUILDING") + TableScan: customer projection=[c_custkey, c_mktsegment], partial_filters=[customer.c_mktsegment = Utf8("BUILDING")] + Filter: orders.o_orderdate < Date32("1995-03-15") + TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate, 
o_shippriority], partial_filters=[orders.o_orderdate < Date32("1995-03-15")] + Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount + Filter: lineitem.l_shipdate > Date32("1995-03-15") + TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate > Date32("1995-03-15")] DataFusion Physical Plan ======================== -GlobalLimitExec: skip=0, fetch=10 - SortPreservingMergeExec: [revenue@1 DESC,o_orderdate@2 ASC NULLS LAST], fetch=10 - SortExec: TopK(fetch=10), expr=[revenue@1 DESC,o_orderdate@2 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@3 as revenue, o_orderdate@1 as o_orderdate, o_shippriority@2 as o_shippriority] - AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey, o_orderdate@1 as o_orderdate, o_shippriority@2 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0, o_orderdate@1, o_shippriority@2], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[l_orderkey@2 as l_orderkey, o_orderdate@0 as o_orderdate, o_shippriority@1 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderdate@1, o_shippriority@2, l_orderkey@3, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[o_orderkey@1, o_orderdate@3, o_shippriority@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: 
partitioning=Hash([c_custkey@0], 2), input_partitions=2 - ProjectionExec: expr=[c_custkey@0 as c_custkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c_mktsegment@1 = BUILDING - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_mktsegment], predicate=c_mktsegment@6 = BUILDING, pruning_predicate=CASE WHEN c_mktsegment_null_count@2 = c_mktsegment_row_count@3 THEN false ELSE c_mktsegment_min@0 <= BUILDING AND BUILDING <= c_mktsegment_max@1 END, required_guarantees=[c_mktsegment in (BUILDING)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 < 1995-03-15 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate, o_shippriority], predicate=o_orderdate@4 < 1995-03-15, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@0 < 1995-03-15 END, required_guarantees=[] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] +SortPreservingMergeExec: [revenue@1 DESC,o_orderdate@2 ASC NULLS LAST], fetch=10 + SortExec: TopK(fetch=10), expr=[revenue@1 DESC,o_orderdate@2 ASC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[l_orderkey@0 as l_orderkey, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@3 as revenue, o_orderdate@1 as o_orderdate, o_shippriority@2 as o_shippriority] + AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey, o_orderdate@1 as o_orderdate, o_shippriority@2 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_orderkey@0, o_orderdate@1, o_shippriority@2], 2), 
input_partitions=2 + AggregateExec: mode=Partial, gby=[l_orderkey@2 as l_orderkey, o_orderdate@0 as o_orderdate, o_shippriority@1 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderdate@1, o_shippriority@2, l_orderkey@3, l_extendedprice@4, l_discount@5] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[o_orderkey@1, o_orderdate@3, o_shippriority@4] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: c_mktsegment@1 = BUILDING, projection=[c_custkey@0] + ParquetExec: file_groups={ ... }, projection=[c_custkey, c_mktsegment], predicate=c_mktsegment@6 = BUILDING, pruning_predicate=CASE WHEN c_mktsegment_null_count@2 = c_mktsegment_row_count@3 THEN false ELSE c_mktsegment_min@0 <= BUILDING AND BUILDING <= c_mktsegment_max@1 END, required_guarantees=[c_mktsegment in (BUILDING)] CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 > 1995-03-15 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 > 1995-03-15, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 > 1995-03-15 END, required_guarantees=[] + RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_orderdate@2 < 1995-03-15 + ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_orderdate, o_shippriority], predicate=o_orderdate@4 < 1995-03-15, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@0 < 1995-03-15 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_shipdate@3 > 1995-03-15, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] + ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 > 1995-03-15, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 > 1995-03-15 END, required_guarantees=[] DataFusion Ray Distributed Plan =========== Query Stage #0 (2 -> 2): RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - ProjectionExec: expr=[c_custkey@0 as c_custkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c_mktsegment@1 = BUILDING - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_mktsegment], predicate=c_mktsegment@6 = BUILDING, pruning_predicate=CASE WHEN c_mktsegment_null_count@2 = c_mktsegment_row_count@3 THEN false ELSE c_mktsegment_min@0 <= BUILDING AND BUILDING <= c_mktsegment_max@1 END, required_guarantees=[c_mktsegment in (BUILDING)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: c_mktsegment@1 = BUILDING, projection=[c_custkey@0] + ParquetExec: file_groups={ ... 
}, projection=[c_custkey, c_mktsegment], predicate=c_mktsegment@6 = BUILDING, pruning_predicate=CASE WHEN c_mktsegment_null_count@2 = c_mktsegment_row_count@3 THEN false ELSE c_mktsegment_min@0 <= BUILDING AND BUILDING <= c_mktsegment_max@1 END, required_guarantees=[c_mktsegment in (BUILDING)] Query Stage #1 (2 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) @@ -80,10 +75,9 @@ RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "o_ord Query Stage #3 (2 -> 2): RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 > 1995-03-15 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 > 1995-03-15, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 > 1995-03-15 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_shipdate@3 > 1995-03-15, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] + ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 > 1995-03-15, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 > 1995-03-15 END, required_guarantees=[] Query Stage #4 (2 -> 2): RayShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }, Column { name: "o_orderdate", index: 1 }, Column { name: "o_shippriority", index: 2 }], 2)) @@ -103,8 +97,7 @@ RayShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "l_ord CoalesceBatchesExec: target_batch_size=8192 RayShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }, Column { name: "o_orderdate", index: 1 }, Column { name: "o_shippriority", index: 2 }], 2)) -Query Stage #6 (1 -> 1): -GlobalLimitExec: skip=0, fetch=10 - SortPreservingMergeExec: [revenue@1 DESC,o_orderdate@2 ASC NULLS LAST], fetch=10 - RayShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }, Column { name: "o_orderdate", index: 2 }, Column { name: "o_shippriority", index: 3 }], 2)) +Query Stage #6 (2 -> 1): +SortPreservingMergeExec: [revenue@1 DESC,o_orderdate@2 ASC NULLS LAST], fetch=10 + RayShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }, Column { name: "o_orderdate", index: 2 }, Column { name: "o_shippriority", index: 3 }], 2)) diff --git a/testdata/expected-plans/q4.txt b/testdata/expected-plans/q4.txt index cb2cc48..93f517c 100644 --- a/testdata/expected-plans/q4.txt +++ b/testdata/expected-plans/q4.txt @@ -28,33 +28,29 @@ SortPreservingMergeExec: [o_orderpriority@0 ASC NULLS LAST] HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderpriority@1] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 - ProjectionExec: expr=[o_orderkey@0 as o_orderkey, 
o_orderpriority@2 as o_orderpriority] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@1 >= 1995-04-01 AND o_orderdate@1 < 1995-07-01 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderdate, o_orderpriority], predicate=o_orderdate@4 >= 1995-04-01 AND o_orderdate@4 < 1995-07-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1995-04-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-07-01 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_orderdate@1 >= 1995-04-01 AND o_orderdate@1 < 1995-07-01, projection=[o_orderkey@0, o_orderpriority@2] + ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderdate, o_orderpriority], predicate=o_orderdate@4 >= 1995-04-01 AND o_orderdate@4 < 1995-07-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1995-04-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-07-01 END, required_guarantees=[] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ProjectionExec: expr=[l_orderkey@0 as l_orderkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@2 > l_commitdate@1 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_receiptdate@2 > l_commitdate@1, projection=[l_orderkey@0] + ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 DataFusion Ray Distributed Plan =========== Query Stage #0 (2 -> 2): RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_orderpriority@2 as o_orderpriority] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@1 >= 1995-04-01 AND o_orderdate@1 < 1995-07-01 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderdate, o_orderpriority], predicate=o_orderdate@4 >= 1995-04-01 AND o_orderdate@4 < 1995-07-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1995-04-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-07-01 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_orderdate@1 >= 1995-04-01 AND o_orderdate@1 < 1995-07-01, projection=[o_orderkey@0, o_orderpriority@2] + ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderdate, o_orderpriority], predicate=o_orderdate@4 >= 1995-04-01 AND o_orderdate@4 < 1995-07-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1995-04-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-07-01 END, required_guarantees=[] Query Stage #1 (2 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - ProjectionExec: expr=[l_orderkey@0 as l_orderkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@2 > l_commitdate@1 - ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_receiptdate@2 > l_commitdate@1, projection=[l_orderkey@0] + ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 Query Stage #2 (2 -> 2): RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "o_orderpriority", index: 0 }], 2)) diff --git a/testdata/expected-plans/q5.txt b/testdata/expected-plans/q5.txt index f5ef8e3..2678abb 100644 --- a/testdata/expected-plans/q5.txt +++ b/testdata/expected-plans/q5.txt @@ -40,10 +40,9 @@ SortPreservingMergeExec: [revenue@1 DESC] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([r_regionkey@0], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ProjectionExec: expr=[r_regionkey@0 as r_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = AFRICA - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = AFRICA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= AFRICA AND AFRICA <= r_name_max@1 END, required_guarantees=[r_name in (AFRICA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: r_name@1 = AFRICA, projection=[r_regionkey@0] + ParquetExec: file_groups={ ... 
}, projection=[r_regionkey, r_name], predicate=r_name@1 = AFRICA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= AFRICA AND AFRICA <= r_name_max@1 END, required_guarantees=[r_name in (AFRICA)] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([n_regionkey@3], 2), input_partitions=2 ProjectionExec: expr=[l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, n_name@0 as n_name, n_regionkey@1 as n_regionkey] @@ -74,10 +73,9 @@ SortPreservingMergeExec: [revenue@1 DESC] ParquetExec: file_groups={ ... }, projection=[c_custkey, c_nationkey] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 - ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_custkey@1 as o_custkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 >= 1994-01-01 AND o_orderdate@2 < 1995-01-01 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1994-01-01 AND o_orderdate@4 < 1995-01-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1994-01-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-01-01 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_orderdate@2 >= 1994-01-01 AND o_orderdate@2 < 1995-01-01, projection=[o_orderkey@0, o_custkey@1] + ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1994-01-01 AND o_orderdate@4 < 1995-01-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1994-01-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-01-01 END, required_guarantees=[] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount] @@ -87,10 +85,9 @@ DataFusion Ray Distributed Plan Query Stage #0 (1 -> 2): RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - ProjectionExec: expr=[r_regionkey@0 as r_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = AFRICA - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = AFRICA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= AFRICA AND AFRICA <= r_name_max@1 END, required_guarantees=[r_name in (AFRICA)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: r_name@1 = AFRICA, projection=[r_regionkey@0] + ParquetExec: file_groups={ ... 
}, projection=[r_regionkey, r_name], predicate=r_name@1 = AFRICA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= AFRICA AND AFRICA <= r_name_max@1 END, required_guarantees=[r_name in (AFRICA)] Query Stage #1 (1 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) @@ -106,10 +103,9 @@ RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "c_cus Query Stage #4 (2 -> 2): RayShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_custkey@1 as o_custkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 >= 1994-01-01 AND o_orderdate@2 < 1995-01-01 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1994-01-01 AND o_orderdate@4 < 1995-01-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1994-01-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-01-01 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: o_orderdate@2 >= 1994-01-01 AND o_orderdate@2 < 1995-01-01, projection=[o_orderkey@0, o_custkey@1] + ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1994-01-01 AND o_orderdate@4 < 1995-01-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1994-01-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-01-01 END, required_guarantees=[] Query Stage #5 (2 -> 2): RayShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "o_orderkey", index: 1 }], 2)) diff --git a/testdata/expected-plans/q6.txt b/testdata/expected-plans/q6.txt index 70264bf..1b07162 100644 --- a/testdata/expected-plans/q6.txt +++ b/testdata/expected-plans/q6.txt @@ -14,10 +14,9 @@ ProjectionExec: expr=[sum(lineitem.l_extendedprice * lineitem.l_discount)@0 as r AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice * lineitem.l_discount)] CoalescePartitionsExec AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * lineitem.l_discount)] - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01 AND l_discount@2 >= Some(3),11,2 AND l_discount@2 <= Some(5),11,2 AND l_quantity@0 < Some(2400),11,2 - ParquetExec: file_groups={ ... 
}, projection=[l_quantity, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1994-01-01 AND l_shipdate@10 < 1995-01-01 AND l_discount@6 >= Some(3),11,2 AND l_discount@6 <= Some(5),11,2 AND l_quantity@4 < Some(2400),11,2, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1994-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-01-01 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_max@4 >= Some(3),11,2 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_min@7 <= Some(5),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@8 < Some(2400),11,2 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01 AND l_discount@2 >= Some(3),11,2 AND l_discount@2 <= Some(5),11,2 AND l_quantity@0 < Some(2400),11,2, projection=[l_extendedprice@1, l_discount@2] + ParquetExec: file_groups={ ... 
}, projection=[l_quantity, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1994-01-01 AND l_shipdate@10 < 1995-01-01 AND l_discount@6 >= Some(3),11,2 AND l_discount@6 <= Some(5),11,2 AND l_quantity@4 < Some(2400),11,2, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1994-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-01-01 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_max@4 >= Some(3),11,2 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_min@7 <= Some(5),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@8 < Some(2400),11,2 END, required_guarantees=[] DataFusion Ray Distributed Plan =========== @@ -25,10 +24,9 @@ DataFusion Ray Distributed Plan Query Stage #0 (2 -> 1): RayShuffleWriterExec(stage_id=0, output_partitioning=UnknownPartitioning(2)) AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * lineitem.l_discount)] - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01 AND l_discount@2 >= Some(3),11,2 AND l_discount@2 <= Some(5),11,2 AND l_quantity@0 < Some(2400),11,2 - ParquetExec: file_groups={ ... 
}, projection=[l_quantity, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1994-01-01 AND l_shipdate@10 < 1995-01-01 AND l_discount@6 >= Some(3),11,2 AND l_discount@6 <= Some(5),11,2 AND l_quantity@4 < Some(2400),11,2, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1994-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-01-01 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_max@4 >= Some(3),11,2 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_min@7 <= Some(5),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@8 < Some(2400),11,2 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01 AND l_discount@2 >= Some(3),11,2 AND l_discount@2 <= Some(5),11,2 AND l_quantity@0 < Some(2400),11,2, projection=[l_extendedprice@1, l_discount@2] + ParquetExec: file_groups={ ... 
}, projection=[l_quantity, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1994-01-01 AND l_shipdate@10 < 1995-01-01 AND l_discount@6 >= Some(3),11,2 AND l_discount@6 <= Some(5),11,2 AND l_quantity@4 < Some(2400),11,2, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1994-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-01-01 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_max@4 >= Some(3),11,2 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_min@7 <= Some(5),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@8 < Some(2400),11,2 END, required_guarantees=[] Query Stage #1 (1 -> 1): ProjectionExec: expr=[sum(lineitem.l_extendedprice * lineitem.l_discount)@0 as revenue] diff --git a/testdata/expected-plans/q7.txt b/testdata/expected-plans/q7.txt new file mode 100644 index 0000000..d05c96f --- /dev/null +++ b/testdata/expected-plans/q7.txt @@ -0,0 +1,182 @@ +DataFusion Logical Plan +======================= + +Sort: shipping.supp_nation ASC NULLS LAST, shipping.cust_nation ASC NULLS LAST, shipping.l_year ASC NULLS LAST + Projection: shipping.supp_nation, shipping.cust_nation, shipping.l_year, sum(shipping.volume) AS revenue + Aggregate: groupBy=[[shipping.supp_nation, shipping.cust_nation, shipping.l_year]], aggr=[[sum(shipping.volume)]] + SubqueryAlias: shipping + Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, date_part(Utf8("YEAR"), lineitem.l_shipdate) AS l_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume + Inner Join: customer.c_nationkey = n2.n_nationkey Filter: n1.n_name = Utf8("GERMANY") AND n2.n_name = Utf8("IRAQ") OR n1.n_name = Utf8("IRAQ") AND n2.n_name = Utf8("GERMANY") + Projection: lineitem.l_extendedprice, 
lineitem.l_discount, lineitem.l_shipdate, customer.c_nationkey, n1.n_name + Inner Join: supplier.s_nationkey = n1.n_nationkey + Projection: supplier.s_nationkey, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate, customer.c_nationkey + Inner Join: orders.o_custkey = customer.c_custkey + Projection: supplier.s_nationkey, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate, orders.o_custkey + Inner Join: lineitem.l_orderkey = orders.o_orderkey + Projection: supplier.s_nationkey, lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate + Inner Join: supplier.s_suppkey = lineitem.l_suppkey + TableScan: supplier projection=[s_suppkey, s_nationkey] + Filter: lineitem.l_shipdate >= Date32("1995-01-01") AND lineitem.l_shipdate <= Date32("1996-12-31") + TableScan: lineitem projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("1995-01-01"), lineitem.l_shipdate <= Date32("1996-12-31")] + TableScan: orders projection=[o_orderkey, o_custkey] + TableScan: customer projection=[c_custkey, c_nationkey] + SubqueryAlias: n1 + Filter: nation.n_name = Utf8("GERMANY") OR nation.n_name = Utf8("IRAQ") + TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8("GERMANY") OR nation.n_name = Utf8("IRAQ")] + SubqueryAlias: n2 + Filter: nation.n_name = Utf8("IRAQ") OR nation.n_name = Utf8("GERMANY") + TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8("IRAQ") OR nation.n_name = Utf8("GERMANY")] + +DataFusion Physical Plan +======================== + +SortPreservingMergeExec: [supp_nation@0 ASC NULLS LAST,cust_nation@1 ASC NULLS LAST,l_year@2 ASC NULLS LAST] + SortExec: expr=[supp_nation@0 ASC NULLS LAST,cust_nation@1 ASC NULLS LAST,l_year@2 ASC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year, 
sum(shipping.volume)@3 as revenue] + AggregateExec: mode=FinalPartitioned, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([supp_nation@0, cust_nation@1, l_year@2], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)] + ProjectionExec: expr=[n_name@4 as supp_nation, n_name@0 as cust_nation, date_part(YEAR, l_shipdate@3) as l_year, l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as volume] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], filter=n_name@0 = GERMANY AND n_name@1 = IRAQ OR n_name@0 = IRAQ AND n_name@1 = GERMANY, projection=[n_name@1, l_extendedprice@2, l_discount@3, l_shipdate@4, n_name@6] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = IRAQ OR n_name@1 = GERMANY + ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_name], predicate=n_name@1 = IRAQ OR n_name@1 = GERMANY, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= IRAQ AND IRAQ <= n_name_max@1 END OR CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= GERMANY AND GERMANY <= n_name_max@1 END, required_guarantees=[n_name in (GERMANY, IRAQ)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([c_nationkey@3], 2), input_partitions=2 + ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_shipdate@3 as l_shipdate, c_nationkey@4 as c_nationkey, n_name@0 as n_name] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@0)], projection=[n_name@1, l_extendedprice@3, l_discount@4, l_shipdate@5, c_nationkey@6] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = GERMANY OR n_name@1 = IRAQ + ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_name], predicate=n_name@1 = GERMANY OR n_name@1 = IRAQ, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= GERMANY AND GERMANY <= n_name_max@1 END OR CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= IRAQ AND IRAQ <= n_name_max@1 END, required_guarantees=[n_name in (GERMANY, IRAQ)] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([s_nationkey@0], 2), input_partitions=2 + ProjectionExec: expr=[s_nationkey@1 as s_nationkey, l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, l_shipdate@4 as l_shipdate, c_nationkey@0 as c_nationkey] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@4)], projection=[c_nationkey@1, s_nationkey@2, l_extendedprice@3, l_discount@4, l_shipdate@5] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 + ParquetExec: file_groups={ ... 
}, projection=[c_custkey, c_nationkey] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([o_custkey@4], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@1, o_orderkey@0)], projection=[s_nationkey@0, l_extendedprice@2, l_discount@3, l_shipdate@4, o_custkey@6] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_orderkey@1], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_nationkey@1, l_orderkey@2, l_extendedprice@4, l_discount@5, l_shipdate@6] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([l_suppkey@1], 2), input_partitions=2 + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_shipdate@4 >= 1995-01-01 AND l_shipdate@4 <= 1996-12-31 + ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1995-01-01 AND l_shipdate@10 <= 1996-12-31, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1995-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 <= 1996-12-31 END, required_guarantees=[] + CoalesceBatchesExec: target_batch_size=8192 + RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 + ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey] + +DataFusion Ray Distributed Plan +=========== + +Query Stage #0 (1 -> 2): +RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = IRAQ OR n_name@1 = GERMANY + ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = IRAQ OR n_name@1 = GERMANY, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= IRAQ AND IRAQ <= n_name_max@1 END OR CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= GERMANY AND GERMANY <= n_name_max@1 END, required_guarantees=[n_name in (GERMANY, IRAQ)] + +Query Stage #1 (1 -> 2): +RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: n_name@1 = GERMANY OR n_name@1 = IRAQ + ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = GERMANY OR n_name@1 = IRAQ, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= GERMANY AND GERMANY <= n_name_max@1 END OR CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= IRAQ AND IRAQ <= n_name_max@1 END, required_guarantees=[n_name in (GERMANY, IRAQ)] + +Query Stage #2 (2 -> 2): +RayShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) + ParquetExec: file_groups={ ... }, projection=[c_custkey, c_nationkey] + +Query Stage #3 (1 -> 2): +RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) + ParquetExec: file_groups={ ... 
}, projection=[s_suppkey, s_nationkey] + +Query Stage #4 (2 -> 2): +RayShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "l_suppkey", index: 1 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: l_shipdate@4 >= 1995-01-01 AND l_shipdate@4 <= 1996-12-31 + ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1995-01-01 AND l_shipdate@10 <= 1996-12-31, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1995-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 <= 1996-12-31 END, required_guarantees=[] + +Query Stage #5 (2 -> 2): +RayShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "l_orderkey", index: 1 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_nationkey@1, l_orderkey@2, l_extendedprice@4, l_discount@5, l_shipdate@6] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "l_suppkey", index: 1 }], 2)) + +Query Stage #6 (2 -> 2): +RayShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) + ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey] + +Query Stage #7 (2 -> 2): +RayShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "o_custkey", index: 4 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@1, o_orderkey@0)], projection=[s_nationkey@0, l_extendedprice@2, l_discount@3, l_shipdate@4, o_custkey@6] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "l_orderkey", index: 1 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) + +Query Stage #8 (2 -> 2): +RayShuffleWriterExec(stage_id=8, output_partitioning=Hash([Column { name: "s_nationkey", index: 0 }], 2)) + ProjectionExec: expr=[s_nationkey@1 as s_nationkey, l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, l_shipdate@4 as l_shipdate, c_nationkey@0 as c_nationkey] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@4)], projection=[c_nationkey@1, s_nationkey@2, l_extendedprice@3, l_discount@4, l_shipdate@5] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "o_custkey", index: 4 }], 2)) + +Query Stage #9 (2 -> 2): +RayShuffleWriterExec(stage_id=9, output_partitioning=Hash([Column { name: "c_nationkey", index: 3 }], 2)) + ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_shipdate@3 as l_shipdate, c_nationkey@4 as c_nationkey, n_name@0 as n_name] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@0)], projection=[n_name@1, 
l_extendedprice@3, l_discount@4, l_shipdate@5, c_nationkey@6] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=8, input_partitioning=Hash([Column { name: "s_nationkey", index: 0 }], 2)) + +Query Stage #10 (2 -> 2): +RayShuffleWriterExec(stage_id=10, output_partitioning=Hash([Column { name: "supp_nation", index: 0 }, Column { name: "cust_nation", index: 1 }, Column { name: "l_year", index: 2 }], 2)) + AggregateExec: mode=Partial, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)] + ProjectionExec: expr=[n_name@4 as supp_nation, n_name@0 as cust_nation, date_part(YEAR, l_shipdate@3) as l_year, l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as volume] + CoalesceBatchesExec: target_batch_size=8192 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], filter=n_name@0 = GERMANY AND n_name@1 = IRAQ OR n_name@0 = IRAQ AND n_name@1 = GERMANY, projection=[n_name@1, l_extendedprice@2, l_discount@3, l_shipdate@4, n_name@6] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=9, input_partitioning=Hash([Column { name: "c_nationkey", index: 3 }], 2)) + +Query Stage #11 (2 -> 2): +RayShuffleWriterExec(stage_id=11, output_partitioning=Hash([Column { name: "supp_nation", index: 0 }, Column { name: "cust_nation", index: 1 }, Column { name: "l_year", index: 2 }], 2)) + SortExec: expr=[supp_nation@0 ASC NULLS LAST,cust_nation@1 ASC NULLS LAST,l_year@2 ASC NULLS LAST], preserve_partitioning=[true] + ProjectionExec: expr=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year, sum(shipping.volume)@3 as revenue] 
+ AggregateExec: mode=FinalPartitioned, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)] + CoalesceBatchesExec: target_batch_size=8192 + RayShuffleReaderExec(stage_id=10, input_partitioning=Hash([Column { name: "supp_nation", index: 0 }, Column { name: "cust_nation", index: 1 }, Column { name: "l_year", index: 2 }], 2)) + +Query Stage #12 (2 -> 1): +SortPreservingMergeExec: [supp_nation@0 ASC NULLS LAST,cust_nation@1 ASC NULLS LAST,l_year@2 ASC NULLS LAST] + RayShuffleReaderExec(stage_id=11, input_partitioning=Hash([Column { name: "supp_nation", index: 0 }, Column { name: "cust_nation", index: 1 }, Column { name: "l_year", index: 2 }], 2)) + diff --git a/testdata/expected-plans/q8.txt b/testdata/expected-plans/q8.txt index 7b24fe1..656e691 100644 --- a/testdata/expected-plans/q8.txt +++ b/testdata/expected-plans/q8.txt @@ -51,10 +51,9 @@ SortPreservingMergeExec: [o_year@0 ASC NULLS LAST] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([r_regionkey@0], 2), input_partitions=2 RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ProjectionExec: expr=[r_regionkey@0 as r_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = MIDDLE EAST - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = MIDDLE EAST, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= MIDDLE EAST AND MIDDLE EAST <= r_name_max@1 END, required_guarantees=[r_name in (MIDDLE EAST)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: r_name@1 = MIDDLE EAST, projection=[r_regionkey@0] + ParquetExec: file_groups={ ... 
}, projection=[r_regionkey, r_name], predicate=r_name@1 = MIDDLE EAST, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= MIDDLE EAST AND MIDDLE EAST <= r_name_max@1 END, required_guarantees=[r_name in (MIDDLE EAST)] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([n_regionkey@3], 2), input_partitions=2 ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, o_orderdate@3 as o_orderdate, n_regionkey@4 as n_regionkey, n_name@0 as n_name] @@ -104,11 +103,10 @@ SortPreservingMergeExec: [o_year@0 ASC NULLS LAST] HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@1, l_suppkey@3, l_extendedprice@4, l_discount@5] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - ProjectionExec: expr=[p_partkey@0 as p_partkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_type@1 = LARGE PLATED STEEL - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_type], predicate=p_type@4 = LARGE PLATED STEEL, pruning_predicate=CASE WHEN p_type_null_count@2 = p_type_row_count@3 THEN false ELSE p_type_min@0 <= LARGE PLATED STEEL AND LARGE PLATED STEEL <= p_type_max@1 END, required_guarantees=[p_type in (LARGE PLATED STEEL)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_type@1 = LARGE PLATED STEEL, projection=[p_partkey@0] + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... 
}, projection=[p_partkey, p_type], predicate=p_type@4 = LARGE PLATED STEEL, pruning_predicate=CASE WHEN p_type_null_count@2 = p_type_row_count@3 THEN false ELSE p_type_min@0 <= LARGE PLATED STEEL AND LARGE PLATED STEEL <= p_type_max@1 END, required_guarantees=[p_type in (LARGE PLATED STEEL)] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([l_partkey@1], 2), input_partitions=2 ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] @@ -118,10 +116,9 @@ DataFusion Ray Distributed Plan Query Stage #0 (1 -> 2): RayShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - ProjectionExec: expr=[r_regionkey@0 as r_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = MIDDLE EAST - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = MIDDLE EAST, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= MIDDLE EAST AND MIDDLE EAST <= r_name_max@1 END, required_guarantees=[r_name in (MIDDLE EAST)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: r_name@1 = MIDDLE EAST, projection=[r_regionkey@0] + ParquetExec: file_groups={ ... 
}, projection=[r_regionkey, r_name], predicate=r_name@1 = MIDDLE EAST, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= MIDDLE EAST AND MIDDLE EAST <= r_name_max@1 END, required_guarantees=[r_name in (MIDDLE EAST)] Query Stage #1 (1 -> 2): RayShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) @@ -147,10 +144,9 @@ RayShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "s_sup Query Stage #6 (1 -> 2): RayShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - ProjectionExec: expr=[p_partkey@0 as p_partkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_type@1 = LARGE PLATED STEEL - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_type], predicate=p_type@4 = LARGE PLATED STEEL, pruning_predicate=CASE WHEN p_type_null_count@2 = p_type_row_count@3 THEN false ELSE p_type_min@0 <= LARGE PLATED STEEL AND LARGE PLATED STEEL <= p_type_max@1 END, required_guarantees=[p_type in (LARGE PLATED STEEL)] + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_type@1 = LARGE PLATED STEEL, projection=[p_partkey@0] + ParquetExec: file_groups={ ... 
}, projection=[p_partkey, p_type], predicate=p_type@4 = LARGE PLATED STEEL, pruning_predicate=CASE WHEN p_type_null_count@2 = p_type_row_count@3 THEN false ELSE p_type_min@0 <= LARGE PLATED STEEL AND LARGE PLATED STEEL <= p_type_max@1 END, required_guarantees=[p_type in (LARGE PLATED STEEL)] Query Stage #7 (2 -> 2): RayShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "l_partkey", index: 1 }], 2)) diff --git a/testdata/expected-plans/q9.txt b/testdata/expected-plans/q9.txt index 25c45b5..5b49def 100644 --- a/testdata/expected-plans/q9.txt +++ b/testdata/expected-plans/q9.txt @@ -71,11 +71,10 @@ SortPreservingMergeExec: [nation@0 ASC NULLS LAST,o_year@1 DESC] HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@1, l_partkey@2, l_suppkey@3, l_quantity@4, l_extendedprice@5, l_discount@6] CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - ProjectionExec: expr=[p_partkey@0 as p_partkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_name@1 LIKE %moccasin% - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE %moccasin% + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_name@1 LIKE %moccasin%, projection=[p_partkey@0] + RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 + ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE %moccasin% CoalesceBatchesExec: target_batch_size=8192 RepartitionExec: partitioning=Hash([l_partkey@1], 2), input_partitions=2 ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] @@ -101,10 +100,9 @@ RayShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "s_sup Query Stage #4 (1 -> 2): RayShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - ProjectionExec: expr=[p_partkey@0 as p_partkey] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_name@1 LIKE %moccasin% - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE %moccasin% + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: p_name@1 LIKE %moccasin%, projection=[p_partkey@0] + ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE %moccasin% Query Stage #5 (2 -> 2): RayShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "l_partkey", index: 1 }], 2)) diff --git a/tpch/README.md b/tpch/README.md new file mode 100644 index 0000000..4852fbc --- /dev/null +++ b/tpch/README.md @@ -0,0 +1,28 @@ + + +# TPC-H + +## Running Benchmarks + +Data and queries must be available on all nodes of the Ray cluster. + +```shell + RAY_ADDRESS='http://ray-cluster-ip-address:8265' ray job submit --working-dir `pwd` -- python3 tpcbench.py --benchmark tpch --data /path/to/data --queries /path/to/tpch/queries --concurrency 4 +``` \ No newline at end of file diff --git a/tpch/tpcbench.py b/tpch/tpcbench.py new file mode 100644 index 0000000..1e65ee0 --- /dev/null +++ b/tpch/tpcbench.py @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import argparse
import json
import time
from datetime import datetime

import ray
from datafusion_ray import DatafusionRayContext

# Supported benchmarks: name -> (number of query files, tables to register).
_BENCHMARKS = {
    "tpch": (
        22,
        ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"],
    ),
    "tpcds": (
        99,
        ["call_center", "catalog_page", "catalog_returns", "catalog_sales", "customer",
         "customer_address", "customer_demographics", "date_dim", "time_dim", "household_demographics",
         "income_band", "inventory", "item", "promotion", "reason", "ship_mode", "store", "store_returns",
         "store_sales", "warehouse", "web_page", "web_returns", "web_sales", "web_site"],
    ),
}


def main(benchmark: str, data_path: str, query_path: str, concurrency: int) -> None:
    """Run every query of a benchmark and write the timings to a JSON file.

    Each table is registered from ``{data_path}/{table}.parquet`` and each
    query is read from ``{query_path}/q{N}.sql``. A query file may contain
    several ``;``-separated statements; they are executed in order and timed
    together. Results are written to
    ``datafusion-ray-{benchmark}-{millis}.json`` in the current directory.

    Args:
        benchmark: Benchmark to run, ``"tpch"`` or ``"tpcds"``.
        data_path: Directory containing one ``.parquet`` path per table.
        query_path: Directory containing the ``q{N}.sql`` query files.
        concurrency: Value passed to ``DatafusionRayContext``.

    Raises:
        ValueError: If ``benchmark`` is not a supported benchmark name.
    """
    # Validate up front, before connecting to Ray.  (The original raised a
    # bare string, which itself fails with TypeError in Python 3.)
    if benchmark not in _BENCHMARKS:
        raise ValueError(f"invalid benchmark: {benchmark!r}")
    num_queries, table_names = _BENCHMARKS[benchmark]

    # Connect to a cluster
    # use ray job submit
    ray.init()

    ctx = DatafusionRayContext(concurrency)

    # Register the tables
    for table in table_names:
        path = f"{data_path}/{table}.parquet"
        print(f"Registering table {table} using path {path}")
        ctx.register_parquet(table, path)

    results = {
        'engine': 'datafusion-python',
        'benchmark': benchmark,
        'data_path': data_path,
        'query_path': query_path,
        'concurrency': concurrency,
    }

    for query in range(1, num_queries + 1):
        # read text file
        path = f"{query_path}/q{query}.sql"
        print(f"Reading query {query} using path {path}")
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        # each file can contain multiple queries
        statements = [stmt.strip() for stmt in text.split(";")]

        # Holds the result of the last executed statement; starts empty so a
        # file without any statement reports 0 instead of raising NameError.
        rows = []

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        start_time = time.perf_counter()
        for sql in statements:
            if sql:
                print(f"Executing: {sql}")
                rows = ctx.sql(sql)

        print(f"Query {query} returned {len(rows)} rows")
        elapsed = time.perf_counter() - start_time
        print(f"Query {query} took {elapsed} seconds")

        # store timings in list and later add option to run > 1 iterations
        results[query] = [elapsed]

    results_json = json.dumps(results, indent=4)
    current_time_millis = int(datetime.now().timestamp() * 1000)
    results_path = f"datafusion-ray-{benchmark}-{current_time_millis}.json"
    print(f"Writing results to {results_path}")
    with open(results_path, "w") as f:
        f.write(results_json)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="DataFusion benchmark derived from TPC-H / TPC-DS")
    parser.add_argument("--benchmark", required=True, choices=sorted(_BENCHMARKS),
                        help="Benchmark to run (tpch or tpcds)")
    parser.add_argument("--data", required=True, help="Path to data files")
    parser.add_argument("--queries", required=True, help="Path to query files")
    parser.add_argument("--concurrency", required=True, type=int, help="Number of concurrent tasks")
    args = parser.parse_args()

    main(args.benchmark, args.data, args.queries, args.concurrency)