diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 232f79fb8947..531add1428e6 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -96,9 +96,13 @@ jobs: with: python-version: '3.12' - - name: Create virtual environment + - name: Install uv run: | curl -LsSf https://astral.sh/uv/install.sh | sh + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + - name: Create virtual environment + run: | uv venv echo "$GITHUB_WORKSPACE/.venv/bin" >> $GITHUB_PATH echo "VIRTUAL_ENV=$GITHUB_WORKSPACE/.venv" >> $GITHUB_ENV @@ -165,7 +169,7 @@ jobs: runs-on: ubuntu-latest steps: - # Needed to fetch the Codecov config file + # Needed to fetch the Codecov config file - uses: actions/checkout@v4 - name: Download coverage reports diff --git a/Cargo.lock b/Cargo.lock index 5176bd831139..d00cfa7ff0a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -62,9 +62,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.18" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" +checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" [[package]] name = "android-tzdata" @@ -89,15 +89,15 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anyhow" -version = "1.0.92" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74f37166d7d48a0284b99dd824694c26119c700b53bf0d1540cdb147dbdaaf13" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" [[package]] name = "apache-avro" @@ -120,7 +120,7 @@ dependencies = [ "snap", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "typed-builder", "uuid", ] @@ -206,7 +206,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -217,7 +217,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -265,9 +265,9 @@ dependencies = [ [[package]] name = "aws-config" -version = "1.5.9" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d6448cfb224dd6a9b9ac734f58622dd0d4751f3589f3b777345745f46b2eb14" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" dependencies = [ "aws-credential-types", "aws-runtime", @@ -368,9 +368,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.48.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded855583fa1d22e88fe39fd6062b062376e50a8211989e07cf5e38d52eb3453" +checksum = "09677244a9da92172c8dc60109b4a9658597d4d298b188dd0018b6a66b410ca4" dependencies = [ "aws-credential-types", "aws-runtime", @@ -390,9 +390,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.49.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9177ea1192e6601ae16c7273385690d88a7ed386a00b74a6bc894d12103cd933" +checksum = 
"81fea2f3a8bb3bd10932ae7ad59cc59f65f270fc9183a7e91f501dc5efbef7ee" dependencies = [ "aws-credential-types", "aws-runtime", @@ -412,9 +412,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.48.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "823ef553cf36713c97453e2ddff1eb8f62be7f4523544e2a5db64caf80100f0a" +checksum = "53dcf5e7d9bd1517b8b998e170e650047cea8a2b85fe1835abe3210713e541b7" dependencies = [ "aws-credential-types", "aws-runtime", @@ -574,9 +574,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.7.2" +version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96" +checksum = "92165296a47a812b267b4f41032ff8069ab7ff783696d217f0994a0d7ab585cd" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -591,9 +591,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.8" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07c9cdc179e6afbf5d391ab08c85eac817b51c87e1892a5edb5f7bbdc64314b4" +checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" dependencies = [ "base64-simd", "bytes", @@ -794,7 +794,7 @@ checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -845,9 +845,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.31" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" +checksum = "1aeb932158bd710538c73702db6945cb68a8fb08c519e6e12706b94263b36db8" dependencies = [ "jobserver", "libc", @@ -1037,9 +1037,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +checksum = "0ca741a962e1b0bff6d724a1a0958b686406e853bb14061f218562e1896f95e6" dependencies = [ "libc", ] @@ -1252,6 +1252,17 @@ dependencies = [ "subtle", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "doc-comment" version = "0.3.3" @@ -1314,7 +1325,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -1369,9 +1380,9 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "fastrand" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "ff" @@ -1432,9 +1443,9 @@ dependencies = [ [[package]] name = "fs4" -version = "0.11.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adc91b3da7f1a7968b00f9f65a4971252f6a927d3cb9eec05d91cbeaff678f9a" +checksum = "e871a4cfa68bb224863b53149d973df1ac8d1ed2fa1d1bfc37ac1bb65dd37207" dependencies = [ "rustix", "windows-sys 0.52.0", @@ -1496,7 +1507,7 @@ checksum = 
"162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -1663,9 +1674,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.0" +version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" dependencies = [ "allocator-api2", "equivalent", @@ -1916,14 +1927,143 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "idna" -version = 
"0.5.0" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "icu_normalizer", + "icu_properties", ] [[package]] @@ -1933,7 +2073,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "serde", ] @@ -2053,9 +2193,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.161" +version = "0.2.162" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" [[package]] name = "libflate" @@ -2143,6 +2283,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "litemap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" + [[package]] name = "lock_api" version = "0.4.12" @@ -2165,7 +2311,7 @@ version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.15.0", + "hashbrown 0.15.1", ] [[package]] @@ -2754,7 +2900,7 @@ dependencies = [ "flate2", "futures", "getrandom", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "hex", "indexmap", "itoa", @@ -2826,7 +2972,7 @@ dependencies = [ "comfy-table", "either", "hashbrown 0.14.5", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "indexmap", "ndarray", "num-traits", @@ -2844,7 +2990,7 @@ dependencies = [ "serde", "serde_json", "strum_macros", - "thiserror", + "thiserror 1.0.69", "version_check", "xxhash-rust", ] @@ -2863,6 +3009,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "polars-dylib" +version = "0.44.2" +dependencies = [ + "polars", + "polars-arrow", + "polars-core", + "polars-expr", + "polars-lazy", + "polars-mem-engine", + "polars-plan", + "polars-python", +] + [[package]] name = "polars-error" version = "0.44.2" @@ -2872,7 +3032,7 @@ dependencies = [ "polars-arrow-format", "regex", "simdutf8", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -2881,7 +3041,7 @@ version = "0.44.2" dependencies = [ "ahash", "bitflags", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "num-traits", "once_cell", "polars-arrow", @@ -2922,7 +3082,7 @@ dependencies = [ "fs4", "futures", "glob", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "home", "itoa", "memchr", @@ -2963,7 +3123,7 @@ dependencies = [ "chrono", "chrono-tz", "fallible-streaming-iterator", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "indexmap", "itoa", "num-traits", @@ -3036,7 +3196,7 @@ dependencies = [ "chrono", "chrono-tz", "either", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "hex", "indexmap", "jsonpath_lib_polars_vendor", @@ 
-3074,7 +3234,7 @@ dependencies = [ "fallible-streaming-iterator", "flate2", "futures", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "lz4", "lz4_flex", "num-traits", @@ -3110,7 +3270,7 @@ dependencies = [ "crossbeam-queue", "enum_dispatch", "futures", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "num-traits", "polars-arrow", "polars-compute", @@ -3140,7 +3300,7 @@ dependencies = [ "ciborium", "either", "futures", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "libloading", "memmap2", "num-traits", @@ -3196,7 +3356,7 @@ dependencies = [ "pyo3", "recursive", "serde_json", - "thiserror", + "thiserror 1.0.69", "version_check", ] @@ -3257,6 +3417,7 @@ dependencies = [ "polars-expr", "polars-io", "polars-mem-engine", + "polars-ops", "polars-parquet", "polars-plan", "polars-utils", @@ -3296,7 +3457,7 @@ dependencies = [ "bytemuck", "bytes", "compact_str", - "hashbrown 0.15.0", + "hashbrown 0.15.1", "indexmap", "libc", "memmap2", @@ -3384,16 +3545,16 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa37f80ca58604976033fae9515a8a2989fc13797d953f7c04fb8fa36a11f205" +checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" dependencies = [ "cc", ] [[package]] name = "py-polars" -version = "1.12.0" +version = "1.13.1" dependencies = [ "jemallocator", "libc", @@ -3452,7 +3613,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -3465,14 +3626,14 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] name = "quad-rand" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b76f1009795ca44bb5aaae8fd3f18953e209259c33d9b059b1f53d58ab7511db" +checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" @@ -3497,9 +3658,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" +checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" dependencies = [ "bytes", "pin-project-lite", @@ -3508,33 +3669,36 @@ dependencies = [ "rustc-hash 2.0.0", "rustls 0.23.16", "socket2", - "thiserror", + "thiserror 2.0.3", "tokio", "tracing", ] [[package]] name = "quinn-proto" -version = "0.11.8" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" +checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", + "getrandom", "rand", "ring", "rustc-hash 2.0.0", "rustls 0.23.16", + "rustls-pki-types", "slab", - "thiserror", + "thiserror 2.0.3", "tinyvec", "tracing", + "web-time", ] [[package]] name = "quinn-udp" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e346e016eacfff12233c243718197ca12f148c84e1e84268a896699b41c71780" +checksum = "7d5a626c6807713b15cac82a6acaccd6043c9a5408c24baae07611fec3f243da" dependencies = [ "cfg_aliases", "libc", @@ -3664,7 +3828,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.86", + "syn 2.0.87", ] 
[[package]] @@ -3693,7 +3857,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -3710,9 +3874,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -3844,9 +4008,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.38" +version = "0.38.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa260229e6538e52293eeb577aabd09945a09d6d9cc0fc550ed7529056c2e32a" +checksum = "99e4ea3e1cdc4b559b8e5650f9c8e5998e3e5c1343b4eaf034565f32318d63c0" dependencies = [ "bitflags", "errno", @@ -3929,6 +4093,9 @@ name = "rustls-pki-types" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +dependencies = [ + "web-time", +] [[package]] name = "rustls-webpki" @@ -4072,9 +4239,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.0" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" +checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" dependencies = [ "core-foundation-sys", "libc", @@ -4088,9 +4255,9 @@ checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "serde" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] @@ -4106,13 +4273,13 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4303,6 +4470,12 @@ dependencies = [ "log", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "stacker" version = "0.1.17" @@ -4359,7 +4532,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4381,9 +4554,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.86" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89275301d38033efb81a6e60e3497e734dfcc62571f2854bf4b16690398824c" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", @@ -4399,6 +4572,17 @@ dependencies = [ "futures-core", ] +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + 
"proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "sysinfo" version = "0.32.0" @@ -4426,9 +4610,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.13.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", "fastrand", @@ -4439,22 +4623,42 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.66" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" +dependencies = [ + "thiserror-impl 2.0.3", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d171f59dbaa811dbbb1aee1e73db92ec2b122911a48e1390dfe327a821ddede" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ - "thiserror-impl", + "proc-macro2", + "quote", + "syn 2.0.87", ] [[package]] name = "thiserror-impl" -version = "1.0.66" +version = "2.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b08be0f17bd307950653ce45db00cd31200d82b624b36e181337d9c7d92765b5" +checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4487,6 +4691,16 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -4514,9 +4728,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.41.0" +version = "1.41.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" +checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" dependencies = [ "backtrace", "bytes", @@ -4537,7 +4751,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4600,7 +4814,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4645,7 +4859,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4660,27 +4874,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" -[[package]] -name = "unicode-bidi" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" - [[package]] name = "unicode-ident" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" -[[package]] -name = "unicode-normalization" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" -dependencies = [ - "tinyvec", -] - [[package]] name = "unicode-reverse" version = "1.0.9" @@ -4716,9 +4915,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.2" +version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" dependencies = [ "form_urlencoded", "idna", @@ -4731,6 +4930,18 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "uuid" version = "1.11.0" @@ -4812,7 +5023,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", "wasm-bindgen-shared", ] @@ -4846,7 +5057,7 @@ checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4880,6 +5091,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "winapi" version = "0.3.9" @@ -4950,7 +5171,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -4961,7 +5182,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", ] [[package]] @@ -5142,6 +5363,18 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "x11rb" version = "0.13.1" @@ -5171,6 +5404,30 @@ version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a5cbf750400958819fb6178eaa83bee5cd9c29a26a40cc241df8c70fdd46984" +[[package]] +name = "yoke" 
+version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.35" @@ -5189,7 +5446,28 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.86", + "syn 2.0.87", +] + +[[package]] +name = "zerofrom" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "synstructure", ] [[package]] @@ -5198,6 +5476,28 @@ version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "zstd" version = "0.13.2" diff --git a/Cargo.toml b/Cargo.toml index 35595086f981..34502bb5e9ee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,6 +94,7 @@ zstd = "0.13" polars = { version = "0.44.2", path = "crates/polars", default-features = false } polars-compute = { version = "0.44.2", path = "crates/polars-compute", default-features = false } polars-core = { version = "0.44.2", path = "crates/polars-core", default-features = false } +polars-dylib = { version = "0.44.2", path = "crates/polars-dyn", default-features = false } polars-error = { version = "0.44.2", path = "crates/polars-error", default-features = false } polars-expr = { version = "0.44.2", path = "crates/polars-expr", default-features = false } polars-ffi = { version = "0.44.2", path = "crates/polars-ffi", default-features = false } diff --git a/crates/Makefile b/crates/Makefile index 28622ee061f5..6b3cf3372149 100644 --- a/crates/Makefile +++ b/crates/Makefile @@ -152,5 +152,6 @@ check-wasm: ## Check wasm build without supported features --exclude-features parquet \ --exclude-features performant \ --exclude-features streaming \ - --exclude-features http \ + --exclude-features http \ + --exclude-features full \ --exclude-features test diff --git a/crates/polars-arrow/src/io/ipc/read/common.rs b/crates/polars-arrow/src/io/ipc/read/common.rs index 6b893c0e8ce3..0a1297bf1184 100644 --- a/crates/polars-arrow/src/io/ipc/read/common.rs +++ b/crates/polars-arrow/src/io/ipc/read/common.rs @@ -318,10 +318,14 @@ pub fn read_dictionary( 
     Ok(())
 }
 
-pub fn prepare_projection(
-    schema: &ArrowSchema,
-    mut projection: Vec<usize>,
-) -> (Vec<usize>, PlHashMap<usize, usize>, ArrowSchema) {
+#[derive(Clone)]
+pub struct ProjectionInfo {
+    pub columns: Vec<usize>,
+    pub map: PlHashMap<usize, usize>,
+    pub schema: ArrowSchema,
+}
+
+pub fn prepare_projection(schema: &ArrowSchema, mut projection: Vec<usize>) -> ProjectionInfo {
     let schema = projection
         .iter()
         .map(|x| {
@@ -355,7 +359,11 @@ pub fn prepare_projection(
         }
     }
 
-    (projection, map, schema)
+    ProjectionInfo {
+        columns: projection,
+        map,
+        schema,
+    }
 }
 
 pub fn apply_projection(
diff --git a/crates/polars-arrow/src/io/ipc/read/file.rs b/crates/polars-arrow/src/io/ipc/read/file.rs
index a83e1b758d80..e75fae36730e 100644
--- a/crates/polars-arrow/src/io/ipc/read/file.rs
+++ b/crates/polars-arrow/src/io/ipc/read/file.rs
@@ -305,7 +305,7 @@ fn get_message_from_block_offset<'a, R: Read + Seek>(
         .map_err(|err| polars_err!(oos = OutOfSpecKind::InvalidFlatbufferMessage(err)))
 }
 
-fn get_message_from_block<'a, R: Read + Seek>(
+pub(super) fn get_message_from_block<'a, R: Read + Seek>(
     reader: &mut R,
     block: &arrow_format::ipc::Block,
     message_scratch: &'a mut Vec<u8>,
diff --git a/crates/polars-arrow/src/io/ipc/read/mod.rs b/crates/polars-arrow/src/io/ipc/read/mod.rs
index 88411f9b905f..f4430db7dea2 100644
--- a/crates/polars-arrow/src/io/ipc/read/mod.rs
+++ b/crates/polars-arrow/src/io/ipc/read/mod.rs
@@ -19,6 +19,7 @@ mod schema;
 mod stream;
 
 pub(crate) use common::first_dict_field;
+pub use common::{prepare_projection, ProjectionInfo};
 pub use error::OutOfSpecKind;
 pub use file::{
     deserialize_footer, get_row_count, read_batch, read_file_dictionaries, read_file_metadata,
diff --git a/crates/polars-arrow/src/io/ipc/read/reader.rs b/crates/polars-arrow/src/io/ipc/read/reader.rs
index 8369d2960233..e9523477fe39 100644
--- a/crates/polars-arrow/src/io/ipc/read/reader.rs
+++ b/crates/polars-arrow/src/io/ipc/read/reader.rs
@@ -1,9 +1,9 @@
 use std::io::{Read, Seek};
 
 use polars_error::PolarsResult;
-use polars_utils::aliases::PlHashMap;
 
 use super::common::*;
+use super::file::{get_message_from_block, get_record_batch};
 use super::{read_batch, read_file_dictionaries, Dictionaries, FileMetadata};
 use crate::array::Array;
 use crate::datatypes::ArrowSchema;
@@ -16,7 +16,7 @@ pub struct FileReader<R: Read + Seek> {
     // the dictionaries are going to be read
     dictionaries: Option<Dictionaries>,
     current_block: usize,
-    projection: Option<(Vec<usize>, PlHashMap<usize, usize>, ArrowSchema)>,
+    projection: Option<ProjectionInfo>,
     remaining: usize,
     data_scratch: Vec<u8>,
     message_scratch: Vec<u8>,
@@ -32,10 +32,29 @@ impl<R: Read + Seek> FileReader<R> {
         projection: Option<Vec<usize>>,
         limit: Option<usize>,
     ) -> Self {
-        let projection = projection.map(|projection| {
-            let (p, h, schema) = prepare_projection(&metadata.schema, projection);
-            (p, h, schema)
-        });
+        let projection =
+            projection.map(|projection| prepare_projection(&metadata.schema, projection));
+        Self {
+            reader,
+            metadata,
+            dictionaries: Default::default(),
+            projection,
+            remaining: limit.unwrap_or(usize::MAX),
+            current_block: 0,
+            data_scratch: Default::default(),
+            message_scratch: Default::default(),
+        }
+    }
+
+    /// Creates a new [`FileReader`]. Use `projection` to only take certain columns.
+    /// # Panic
+    /// Panics iff the projection is not in increasing order (e.g. neither `[1, 0]` nor `[0, 1, 1]` is valid)
+    pub fn new_with_projection_info(
+        reader: R,
+        metadata: FileMetadata,
+        projection: Option<ProjectionInfo>,
+        limit: Option<usize>,
+    ) -> Self {
         Self {
             reader,
             metadata,
@@ -52,7 +71,7 @@ impl<R: Read + Seek> FileReader<R> {
     pub fn schema(&self) -> &ArrowSchema {
         self.projection
             .as_ref()
-            .map(|x| &x.2)
+            .map(|x| &x.schema)
             .unwrap_or(&self.metadata.schema)
     }
 
@@ -66,9 +85,23 @@ impl<R: Read + Seek> FileReader<R> {
         self.reader
     }
 
+    pub fn set_current_block(&mut self, idx: usize) {
+        self.current_block = idx;
+    }
+
+    pub fn get_current_block(&self) -> usize {
+        self.current_block
+    }
+
+    /// Take the projection information out of the reader so it can be
+    /// reused in a new reader.
+    pub fn take_projection_info(&mut self) -> Option<ProjectionInfo> {
+        std::mem::take(&mut self.projection)
+    }
+
     /// Get the inner memory scratches so they can be reused in a new writer.
     /// This can be utilized to save memory allocations for performance reasons.
-    pub fn get_scratches(&mut self) -> (Vec<u8>, Vec<u8>) {
+    pub fn take_scratches(&mut self) -> (Vec<u8>, Vec<u8>) {
         (
             std::mem::take(&mut self.data_scratch),
             std::mem::take(&mut self.message_scratch),
@@ -91,6 +124,43 @@ impl<R: Read + Seek> FileReader<R> {
         };
         Ok(())
     }
+
+    /// Skip over blocks until we have seen at most `offset` rows, returning how many rows we
+    /// still have to skip.
+    ///
+    /// This will never go over the `offset`, meaning that if `offset < current_block.len()`,
+    /// the block will not be skipped.
+    pub fn skip_blocks_till_limit(&mut self, offset: u64) -> PolarsResult<u64> {
+        let mut remaining_offset = offset;
+
+        for (i, block) in self.metadata.blocks.iter().enumerate() {
+            let message =
+                get_message_from_block(&mut self.reader, block, &mut self.message_scratch)?;
+            let record_batch = get_record_batch(message)?;
+
+            let length = record_batch.length()?;
+            let length = length as u64;
+
+            if length > remaining_offset {
+                self.current_block = i;
+                return Ok(remaining_offset);
+            }
+
+            remaining_offset -= length;
+        }
+
+        self.current_block = self.metadata.blocks.len();
+        Ok(remaining_offset)
+    }
+
+    pub fn next_record_batch(
+        &mut self,
+    ) -> Option<PolarsResult<arrow_format::ipc::RecordBatchRef<'_>>> {
+        let block = self.metadata.blocks.get(self.current_block)?;
+        self.current_block += 1;
+        let message = get_message_from_block(&mut self.reader, block, &mut self.message_scratch);
+        Some(message.and_then(|m| get_record_batch(m)))
+    }
 }
 
 impl<R: Read + Seek> Iterator for FileReader<R> {
@@ -114,7 +184,7 @@ impl<R: Read + Seek> Iterator for FileReader<R> {
                 &mut self.reader,
                 self.dictionaries.as_ref().unwrap(),
                 &self.metadata,
-                self.projection.as_ref().map(|x| x.0.as_ref()),
+                self.projection.as_ref().map(|x| x.columns.as_ref()),
                 Some(self.remaining),
                 block,
                 &mut self.message_scratch,
             );
             self.remaining -= chunk.as_ref().map(|x| x.len()).unwrap_or_default();
 
-            let chunk = if let Some((_, map, _)) = &self.projection {
+            let chunk = if let Some(ProjectionInfo { map, .. }) = &self.projection {
                 // re-order according to projection
                 chunk.map(|chunk| apply_projection(chunk, map))
             } else {
diff --git a/crates/polars-arrow/src/io/ipc/read/stream.rs b/crates/polars-arrow/src/io/ipc/read/stream.rs
index 87241596cdbe..b2cfb727b385 100644
--- a/crates/polars-arrow/src/io/ipc/read/stream.rs
+++ b/crates/polars-arrow/src/io/ipc/read/stream.rs
@@ -2,7 +2,6 @@ use std::io::Read;
 
 use arrow_format::ipc::planus::ReadAsRoot;
 use polars_error::{polars_bail, polars_err, PolarsError, PolarsResult};
-use polars_utils::aliases::PlHashMap;
 
 use super::super::CONTINUATION_MARKER;
 use super::common::*;
@@ -93,7 +92,7 @@ fn read_next<R: Read>(
     dictionaries: &mut Dictionaries,
     message_buffer: &mut Vec<u8>,
     data_buffer: &mut Vec<u8>,
-    projection: &Option<(Vec<usize>, PlHashMap<usize, usize>, ArrowSchema)>,
+    projection: &Option<ProjectionInfo>,
     scratch: &mut Vec<u8>,
 ) -> PolarsResult<Option<StreamState>> {
     // determine metadata length
@@ -169,7 +168,7 @@ fn read_next<R: Read>(
             batch,
             &metadata.schema,
             &metadata.ipc_schema,
-            projection.as_ref().map(|x| x.0.as_ref()),
+            projection.as_ref().map(|x| x.columns.as_ref()),
             None,
             dictionaries,
             metadata.version,
@@ -179,7 +178,7 @@ fn read_next<R: Read>(
            scratch,
         );
 
-        if let Some((_, map, _)) = projection {
+        if let Some(ProjectionInfo { map, .. }) = projection {
             // re-order according to projection
             chunk
                 .map(|chunk| apply_projection(chunk, map))
@@ -238,7 +237,7 @@ pub struct StreamReader<R: Read> {
     finished: bool,
     data_buffer: Vec<u8>,
     message_buffer: Vec<u8>,
-    projection: Option<(Vec<usize>, PlHashMap<usize, usize>, ArrowSchema)>,
+    projection: Option<ProjectionInfo>,
     scratch: Vec<u8>,
 }
 
@@ -249,10 +248,8 @@ impl<R: Read> StreamReader<R> {
     /// encounter a schema.
     /// To check if the reader is done, use `is_finished(self)`
    pub fn new(reader: R, metadata: StreamMetadata, projection: Option<Vec<usize>>) -> Self {
-        let projection = projection.map(|projection| {
-            let (p, h, schema) = prepare_projection(&metadata.schema, projection);
-            (p, h, schema)
-        });
+        let projection =
+            projection.map(|projection| prepare_projection(&metadata.schema, projection));
 
         Self {
             reader,
@@ -275,7 +272,7 @@ impl<R: Read> StreamReader<R> {
     pub fn schema(&self) -> &ArrowSchema {
         self.projection
             .as_ref()
-            .map(|x| &x.2)
+            .map(|x| &x.schema)
             .unwrap_or(&self.metadata.schema)
     }
 
diff --git a/crates/polars-arrow/src/record_batch.rs b/crates/polars-arrow/src/record_batch.rs
index f58d129831f1..2b0b8112ea9e 100644
--- a/crates/polars-arrow/src/record_batch.rs
+++ b/crates/polars-arrow/src/record_batch.rs
@@ -9,7 +9,7 @@ use crate::array::{Array, ArrayRef};
 /// the same length, [`RecordBatchT::len`].
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RecordBatchT<A: AsRef<dyn Array>> {
-    length: usize,
+    height: usize,
     arrays: Vec<A>,
 }
 
@@ -29,14 +29,14 @@ impl<A: AsRef<dyn Array>> RecordBatchT<A> {
     ///
     /// # Error
     ///
-    /// I.f.f. the length does not match the length of any of the arrays
-    pub fn try_new(length: usize, arrays: Vec<A>) -> PolarsResult<Self> {
+    /// I.f.f. the height does not match the length of any of the arrays
+    pub fn try_new(height: usize, arrays: Vec<A>) -> PolarsResult<Self> {
         polars_ensure!(
-            arrays.iter().all(|arr| arr.as_ref().len() == length),
+            arrays.iter().all(|arr| arr.as_ref().len() == height),
             ComputeError: "RecordBatch requires all its arrays to have an equal number of rows",
         );
 
-        Ok(Self { length, arrays })
+        Ok(Self { height, arrays })
     }
 
     /// returns the [`Array`]s in [`RecordBatchT`]
@@ -51,7 +51,17 @@ impl<A: AsRef<dyn Array>> RecordBatchT<A> {
     /// returns the number of rows of every array
     pub fn len(&self) -> usize {
-        self.length
+        self.height
+    }
+
+    /// returns the number of rows of every array
+    pub fn height(&self) -> usize {
+        self.height
+    }
+
+    /// returns the number of arrays
+    pub fn width(&self) -> usize {
+        self.arrays.len()
     }
 
     /// returns whether the columns have any rows
diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs
index 7f257f23f59e..7787ef28076f 100644
--- a/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs
+++ b/crates/polars-core/src/chunked_array/ops/sort/arg_bottom_k.rs
@@ -39,6 +39,11 @@ pub fn _arg_bottom_k(
     _broadcast_bools(by_column.len(), &mut sort_options.descending);
     _broadcast_bools(by_column.len(), &mut sort_options.nulls_last);
 
+    // Don't go into row encoding.
+    if by_column.len() == 1 && sort_options.limit.is_some() && !sort_options.maintain_order {
+        return Ok(NoNull::new(by_column[0].arg_sort((&*sort_options).into())));
+    }
+
     let encoded = _get_rows_encoded(
         by_column,
         &sort_options.descending,
diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs
index ca34d37318a7..4f9a1ff9e9b3 100644
--- a/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs
+++ b/crates/polars-core/src/chunked_array/ops/sort/arg_sort.rs
@@ -18,7 +18,7 @@ pub(super) fn arg_sort(
     iters: I,
     options: SortOptions,
     null_count: usize,
-    len: usize,
+    mut len: usize,
 ) -> IdxCa
 where
     I: IntoIterator<Item = J>,
@@ -49,14 +49,46 @@ where
         vals.extend(iter);
     }
 
-    sort_impl(vals.as_mut_slice(), options);
+    let vals = if let Some((limit, desc)) = options.limit {
+        let limit = limit as usize;
+        // Overwrite output len.
+        len = limit;
+        let out = if limit >= vals.len() {
+            vals.as_mut_slice()
+        } else if desc {
+            let (lower, _el, _upper) = vals
+                .as_mut_slice()
+                .select_nth_unstable_by(limit, |a, b| b.1.tot_cmp(&a.1));
+            lower
+        } else {
+            let (lower, _el, _upper) = vals
+                .as_mut_slice()
+                .select_nth_unstable_by(limit, |a, b| a.1.tot_cmp(&b.1));
+            lower
+        };
+
+        sort_impl(out, options);
+        out
+    } else {
+        sort_impl(vals.as_mut_slice(), options);
+        vals.as_slice()
+    };
 
-    let iter = vals.into_iter().map(|(idx, _v)| idx);
+    let iter = vals.iter().map(|(idx, _v)| idx).copied();
     let idx = if nulls_last {
         let mut idx = Vec::with_capacity(len);
         idx.extend(iter);
-        idx.extend(nulls_idx);
+
+        let nulls_idx = if options.limit.is_some() {
+            &nulls_idx[..len - idx.len()]
+        } else {
+            &nulls_idx
+        };
+        idx.extend_from_slice(nulls_idx);
         idx
+    } else if options.limit.is_some() {
+        nulls_idx.extend(iter.take(len - nulls_idx.len()));
+        nulls_idx
     } else {
         let ptr = nulls_idx.as_ptr() as usize;
         nulls_idx.extend(iter);
@@ -90,9 +122,29 @@ where
         }));
     }
 
-    sort_impl(vals.as_mut_slice(), options);
+    let vals = if let Some((limit, desc)) = options.limit {
+        let limit = limit as usize;
+        let out = if limit >= vals.len() {
+            vals.as_mut_slice()
+        } else if desc {
+            let (lower, _el, _upper) = vals
+                .as_mut_slice()
+                .select_nth_unstable_by(limit, |a, b| b.1.tot_cmp(&a.1));
+            lower
+        } else {
+            let (lower, _el, _upper) = vals
+                .as_mut_slice()
+                .select_nth_unstable_by(limit, |a, b| a.1.tot_cmp(&b.1));
+            lower
+        };
+        sort_impl(out, options);
+        out
+    } else {
+        sort_impl(vals.as_mut_slice(), options);
+        vals.as_slice()
+    };
 
-    let iter = vals.into_iter().map(|(idx, _v)| idx);
+    let iter = vals.iter().map(|(idx, _v)| idx).copied();
     let idx: Vec<_> = iter.collect_trusted();
 
     ChunkedArray::with_chunk(name, IdxArr::from_data_default(Buffer::from(idx), None))
diff --git a/crates/polars-core/src/chunked_array/ops/sort/categorical.rs b/crates/polars-core/src/chunked_array/ops/sort/categorical.rs
index 5dd71a7b1eb8..c89e0790f251 100644
--- a/crates/polars-core/src/chunked_array/ops/sort/categorical.rs
+++ b/crates/polars-core/src/chunked_array/ops/sort/categorical.rs
@@ -53,6 +53,7 @@ impl CategoricalChunked {
             descending,
             multithreaded: true,
             maintain_order: false,
+            limit: None,
         })
     }
 
diff --git a/crates/polars-core/src/chunked_array/ops/sort/mod.rs b/crates/polars-core/src/chunked_array/ops/sort/mod.rs
index 727f2ace15a8..add7e8b696a4 100644
--- a/crates/polars-core/src/chunked_array/ops/sort/mod.rs
+++ b/crates/polars-core/src/chunked_array/ops/sort/mod.rs
@@ -335,6 +335,7 @@ impl ChunkSort<StringType> for StringChunked {
             nulls_last: false,
             multithreaded: true,
             maintain_order: false,
+            limit: None,
         })
     }
 
@@ -406,6 +407,7 @@ impl ChunkSort<BinaryType> for BinaryChunked {
             nulls_last: false,
             multithreaded: true,
             maintain_order: false,
+            limit: None,
         })
     }
 
@@ -536,6 +538,7 @@ impl ChunkSort<BinaryOffsetType> for BinaryOffsetChunked {
             nulls_last: false,
             multithreaded: true,
             maintain_order: false,
+            limit: None,
         })
     }
 
@@ -672,6 +675,7 @@ impl ChunkSort<BooleanType> for BooleanChunked {
             nulls_last: false,
             multithreaded: true,
             maintain_order: false,
+            limit: None,
         })
     }
 
@@ -797,6 +801,7 @@ mod test {
             nulls_last: false,
             multithreaded: true,
             maintain_order: false,
+            limit: None,
         });
         assert_eq!(
             Vec::from(&out),
@@ -816,6 +821,7 @@ mod test {
            nulls_last: true,
            multithreaded: true,
            maintain_order: false,
+           limit: None,
         });
         assert_eq!(
             Vec::from(&out),
@@ -925,6 +931,7 @@ mod test {
             nulls_last: false,
             multithreaded: true,
             maintain_order: false,
+            limit: None,
         });
         let expected = &[None, None, Some("a"), Some("b"), Some("c")];
Some("b"), Some("c")]; assert_eq!(Vec::from(&out), expected); @@ -934,6 +941,7 @@ mod test { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }); let expected = &[None, None, Some("c"), Some("b"), Some("a")]; @@ -944,6 +952,7 @@ mod test { nulls_last: true, multithreaded: true, maintain_order: false, + limit: None, }); let expected = &[Some("a"), Some("b"), Some("c"), None, None]; assert_eq!(Vec::from(&out), expected); @@ -953,6 +962,7 @@ mod test { nulls_last: true, multithreaded: true, maintain_order: false, + limit: None, }); let expected = &[Some("c"), Some("b"), Some("a"), None, None]; assert_eq!(Vec::from(&out), expected); diff --git a/crates/polars-core/src/chunked_array/ops/sort/options.rs b/crates/polars-core/src/chunked_array/ops/sort/options.rs index 046d0b251b04..95bff0b1b47a 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/options.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/options.rs @@ -41,6 +41,10 @@ pub struct SortOptions { /// If true maintain the order of equal elements. /// Default `false`. pub maintain_order: bool, + /// Limit a sort output, this is for optimization purposes and might be ignored. + /// - Len + /// - Descending + pub limit: Option<(IdxSize, bool)>, } /// Sort options for multi-series sorting. @@ -96,6 +100,10 @@ pub struct SortMultipleOptions { pub multithreaded: bool, /// Whether maintain the order of equal elements. Default `false`. pub maintain_order: bool, + /// Limit a sort output, this is for optimization purposes and might be ignored. + /// - Len + /// - Descending + pub limit: Option<(IdxSize, bool)>, } impl Default for SortOptions { @@ -105,6 +113,7 @@ impl Default for SortOptions { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, } } } @@ -116,6 +125,7 @@ impl Default for SortMultipleOptions { nulls_last: vec![false], multithreaded: true, maintain_order: false, + limit: None, } } } @@ -224,6 +234,7 @@ impl From<&SortOptions> for SortMultipleOptions { nulls_last: vec![value.nulls_last], multithreaded: value.multithreaded, maintain_order: value.maintain_order, + limit: value.limit, } } } @@ -235,6 +246,7 @@ impl From<&SortMultipleOptions> for SortOptions { nulls_last: value.nulls_last.first().copied().unwrap_or(false), multithreaded: value.multithreaded, maintain_order: value.maintain_order, + limit: value.limit, } } } diff --git a/crates/polars-core/src/datatypes/dtype.rs b/crates/polars-core/src/datatypes/dtype.rs index 30d96649762f..f4c87ce0ad22 100644 --- a/crates/polars-core/src/datatypes/dtype.rs +++ b/crates/polars-core/src/datatypes/dtype.rs @@ -572,6 +572,52 @@ impl DataType { } } + /// Try to get the maximum value for this datatype. + pub fn max(&self) -> PolarsResult { + use DataType::*; + let v = match self { + #[cfg(feature = "dtype-i8")] + Int8 => Scalar::from(i8::MAX), + #[cfg(feature = "dtype-i16")] + Int16 => Scalar::from(i16::MAX), + Int32 => Scalar::from(i32::MAX), + Int64 => Scalar::from(i64::MAX), + #[cfg(feature = "dtype-u8")] + UInt8 => Scalar::from(u8::MAX), + #[cfg(feature = "dtype-u16")] + UInt16 => Scalar::from(u16::MAX), + UInt32 => Scalar::from(u32::MAX), + UInt64 => Scalar::from(u64::MAX), + Float32 => Scalar::from(f32::INFINITY), + Float64 => Scalar::from(f64::INFINITY), + dt => polars_bail!(ComputeError: "cannot determine upper bound for dtype `{}`", dt), + }; + Ok(v) + } + + /// Try to get the minimum value for this datatype. 
+    pub fn min(&self) -> PolarsResult<Scalar> {
+        use DataType::*;
+        let v = match self {
+            #[cfg(feature = "dtype-i8")]
+            Int8 => Scalar::from(i8::MIN),
+            #[cfg(feature = "dtype-i16")]
+            Int16 => Scalar::from(i16::MIN),
+            Int32 => Scalar::from(i32::MIN),
+            Int64 => Scalar::from(i64::MIN),
+            #[cfg(feature = "dtype-u8")]
+            UInt8 => Scalar::from(u8::MIN),
+            #[cfg(feature = "dtype-u16")]
+            UInt16 => Scalar::from(u16::MIN),
+            UInt32 => Scalar::from(u32::MIN),
+            UInt64 => Scalar::from(u64::MIN),
+            Float32 => Scalar::from(f32::NEG_INFINITY),
+            Float64 => Scalar::from(f64::NEG_INFINITY),
+            dt => polars_bail!(ComputeError: "cannot determine lower bound for dtype `{}`", dt),
+        };
+        Ok(v)
+    }
+
     /// Convert to an Arrow data type.
     #[inline]
     pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowDataType {
diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs
index e3b969a81756..0d8fef7f4c4a 100644
--- a/crates/polars-core/src/frame/mod.rs
+++ b/crates/polars-core/src/frame/mod.rs
@@ -3,6 +3,7 @@
 use std::borrow::Cow;
 use std::{mem, ops};
 
+use polars_row::ArrayRef;
 use polars_utils::itertools::Itertools;
 use rayon::prelude::*;
 
@@ -1989,6 +1990,12 @@ impl DataFrame {
             return Ok(out);
         }
         if let Some((0, k)) = slice {
+            let desc = if sort_options.descending.len() == 1 {
+                sort_options.descending[0]
+            } else {
+                false
+            };
+            sort_options.limit = Some((k as IdxSize, desc));
             return self.bottom_k_impl(k, by_column, sort_options);
         }
 
@@ -2012,6 +2019,7 @@ impl DataFrame {
                 nulls_last: sort_options.nulls_last[0],
                 multithreaded: sort_options.multithreaded,
                 maintain_order: sort_options.maintain_order,
+                limit: sort_options.limit,
             };
             // fast path for a frame with a single series
             // no need to compute the sort indices and then take by these indices
@@ -3327,6 +3335,31 @@ impl DataFrame {
     pub(crate) fn infer_height(cols: &[Column]) -> usize {
         cols.first().map_or(0, Column::len)
     }
+
+    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
+        polars_ensure!(
+            rb.arrays().len() == self.width(),
+            InvalidOperation: "attempt to extend dataframe of width {} with record batch of width {}",
+            self.width(),
+            rb.arrays().len(),
+        );
+
+        if rb.height() == 0 {
+            return Ok(());
+        }
+
+        // SAFETY:
+        // - we don't adjust the names of the columns
+        // - each column gets appended the same number of rows, which is an invariant of
+        //   record_batch.
+        let columns = unsafe { self.get_columns_mut() };
+        for (col, arr) in columns.iter_mut().zip(rb.into_arrays()) {
+            let arr_series = Series::from_arrow_chunks(PlSmallStr::EMPTY, vec![arr])?.into_column();
+            col.append(&arr_series)?;
+        }
+
+        Ok(())
+    }
 }
 
 pub struct RecordBatchIter<'a> {
diff --git a/crates/polars-core/src/frame/upstream_traits.rs b/crates/polars-core/src/frame/upstream_traits.rs
index 38b346ace652..1392f87c052f 100644
--- a/crates/polars-core/src/frame/upstream_traits.rs
+++ b/crates/polars-core/src/frame/upstream_traits.rs
@@ -1,5 +1,7 @@
 use std::ops::{Index, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive};
 
+use arrow::record_batch::RecordBatchT;
+
 use crate::prelude::*;
 
 impl FromIterator<Series> for DataFrame {
@@ -22,6 +24,32 @@ impl FromIterator<Series> for DataFrame {
     }
 }
 
+impl TryExtend<RecordBatchT<ArrayRef>> for DataFrame {
+    fn try_extend<I: IntoIterator<Item = RecordBatchT<ArrayRef>>>(
+        &mut self,
+        iter: I,
+    ) -> PolarsResult<()> {
+        for record_batch in iter {
+            self.append_record_batch(record_batch)?;
+        }
+
+        Ok(())
+    }
+}
+
+impl TryExtend<PolarsResult<RecordBatchT<ArrayRef>>> for DataFrame {
+    fn try_extend<I: IntoIterator<Item = PolarsResult<RecordBatchT<ArrayRef>>>>(
+        &mut self,
+        iter: I,
+    ) -> PolarsResult<()> {
+        for record_batch in iter {
+            self.append_record_batch(record_batch?)?;
+        }
+
+        Ok(())
+    }
+}
+
 impl Index<usize> for DataFrame {
     type Output = Column;
 
diff --git a/crates/polars-core/src/scalar/from.rs b/crates/polars-core/src/scalar/from.rs
index 3af8671dadd1..c104c2ea8573 100644
--- a/crates/polars-core/src/scalar/from.rs
+++ b/crates/polars-core/src/scalar/from.rs
@@ -1,3 +1,5 @@
+use polars_utils::pl_str::PlSmallStr;
+
 use super::{AnyValue, DataType, Scalar};
 
 macro_rules! impl_from {
@@ -25,4 +27,5 @@ impl_from! {
     (u64, UInt64, UInt64)
     (f32, Float32, Float32)
     (f64, Float64, Float64)
+    (PlSmallStr, StringOwned, String)
 }
diff --git a/crates/polars-dylib/Cargo.toml b/crates/polars-dylib/Cargo.toml
new file mode 100644
index 000000000000..5cc963f2d701
--- /dev/null
+++ b/crates/polars-dylib/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "polars-dylib"
+version.workspace = true
+authors.workspace = true
+edition.workspace = true
+homepage.workspace = true
+license.workspace = true
+repository.workspace = true
+
+[lib]
+crate-type = ["dylib", "rlib"]
+
+[dependencies]
+arrow = { workspace = true, optional = true, features = ["io_flight"] }
+polars = { workspace = true, features = ["full"] }
+polars-core = { workspace = true, optional = true }
+polars-expr = { workspace = true, optional = true }
+polars-lazy = { workspace = true, optional = true }
+polars-mem-engine = { workspace = true, optional = true }
+polars-plan = { workspace = true, optional = true }
+polars-python = { workspace = true, optional = true, default-features = true }
+
+[features]
+private = ["polars-plan", "arrow", "polars-core", "polars-lazy", "polars-expr", "polars-mem-engine"]
+python = ["polars-plan?/python", "polars-python", "polars-lazy?/python"]
diff --git a/crates/polars-dylib/README.md b/crates/polars-dylib/README.md
new file mode 100644
index 000000000000..3fd4b30de8f7
--- /dev/null
+++ b/crates/polars-dylib/README.md
@@ -0,0 +1,16 @@
+# Polars dynamic library
+
+```toml
+# Cargo.toml
+[workspace.dependencies.polars]
+package = "polars-dylib"
+```
+
+```toml
+# .cargo/config.toml
+[build]
+rustflags = [
+    "-C",
+    "prefer-dynamic",
+]
+```
diff --git a/crates/polars-dylib/src/lib.rs b/crates/polars-dylib/src/lib.rs
new file mode 100644
index 000000000000..907ce175aec8
--- /dev/null
+++ b/crates/polars-dylib/src/lib.rs
@@ -0,0 +1,15 @@
+#[cfg(feature = "private")]
+pub use arrow as _arrow;
+pub use polars::*;
+#[cfg(feature = "private")]
+pub use polars_core as _core;
+#[cfg(feature = "private")]
+pub use polars_expr as _expr;
+#[cfg(feature = "private")]
+pub use polars_lazy as _lazy;
+#[cfg(feature = "private")]
+pub use polars_mem_engine as _mem_engine;
+#[cfg(feature = "private")]
+pub use polars_plan as _plan;
+#[cfg(feature = "python")]
+pub use polars_python as _python;
diff --git a/crates/polars-expr/src/expressions/sortby.rs b/crates/polars-expr/src/expressions/sortby.rs
index 1624d7c9bcd6..fad081cb49ed 100644
--- a/crates/polars-expr/src/expressions/sortby.rs
+++ b/crates/polars-expr/src/expressions/sortby.rs
@@ -160,6 +160,7 @@ fn sort_by_groups_multiple_by(
         nulls_last: nulls_last.to_owned(),
         multithreaded,
         maintain_order,
+        limit: None,
     };
 
     let sorted_idx = groups[0]
@@ -180,6 +181,7 @@ fn sort_by_groups_multiple_by(
         nulls_last: nulls_last.to_owned(),
         multithreaded,
         maintain_order,
+        limit: None,
     };
     let sorted_idx = groups[0]
         .as_materialized_series()
diff --git a/crates/polars-io/src/cloud/polars_object_store.rs b/crates/polars-io/src/cloud/polars_object_store.rs
index 9738e0cbdbe4..084408e8bc41 100644
--- a/crates/polars-io/src/cloud/polars_object_store.rs
+++ b/crates/polars-io/src/cloud/polars_object_store.rs
@@ -2,14 +2,16 @@ use std::ops::Range;
 use std::sync::Arc;
 
 use bytes::Bytes;
-use futures::StreamExt;
+use futures::{StreamExt, TryStreamExt};
 use object_store::path::Path;
 use object_store::{ObjectMeta, ObjectStore};
-use polars_error::{to_compute_err, PolarsResult};
+use polars_core::prelude::{InitHashMaps, PlHashMap};
+use polars_error::{to_compute_err, PolarsError, PolarsResult};
 use tokio::io::AsyncWriteExt;
 
 use crate::pl_async::{
-    self, tune_with_concurrency_budget, with_concurrency_budget, MAX_BUDGET_PER_REQUEST,
+    self, get_concurrency_limit, get_download_chunk_size, tune_with_concurrency_budget,
+    with_concurrency_budget, MAX_BUDGET_PER_REQUEST,
 };
 
 /// Polars specific wrapper for `Arc<dyn ObjectStore>` that limits the number of
@@ -23,63 +25,184 @@ impl PolarsObjectStore {
         Self(store)
     }
 
-    pub async fn get(&self, path: &Path) -> PolarsResult<Bytes> {
-        tune_with_concurrency_budget(1, || async {
-            self.0
-                .get(path)
-                .await
-                .map_err(to_compute_err)?
-                .bytes()
-                .await
-                .map_err(to_compute_err)
-        })
-        .await
+    /// Returns a buffered stream that downloads concurrently up to the concurrency limit.
+    fn get_buffered_ranges_stream<'a, T: Iterator<Item = Range<usize>>>(
+        &'a self,
+        path: &'a Path,
+        ranges: T,
+    ) -> impl StreamExt<Item = PolarsResult<Bytes>>
+    + TryStreamExt<Ok = Bytes, Error = PolarsError, Item = PolarsResult<Bytes>>
+    + use<'a, T> {
+        futures::stream::iter(
+            ranges
+                .map(|range| async { self.0.get_range(path, range).await.map_err(to_compute_err) }),
+        )
+        // Add a limit locally as this gets run inside a single `tune_with_concurrency_budget`.
+        .buffered(get_concurrency_limit() as usize)
+    }
 
     pub async fn get_range(&self, path: &Path, range: Range<usize>) -> PolarsResult<Bytes> {
-        tune_with_concurrency_budget(1, || self.0.get_range(path, range))
-            .await
-            .map_err(to_compute_err)
+        let parts = split_range(range.clone());
+
+        if parts.len() == 1 {
+            tune_with_concurrency_budget(1, || self.0.get_range(path, range))
+                .await
+                .map_err(to_compute_err)
+        } else {
+            let parts = tune_with_concurrency_budget(
+                parts.len().clamp(0, MAX_BUDGET_PER_REQUEST) as u32,
+                || {
+                    self.get_buffered_ranges_stream(path, parts)
+                        .try_collect::<Vec<Bytes>>()
+                },
+            )
+            .await?;
+
+            let mut combined = Vec::with_capacity(range.len());
+
+            for part in parts {
+                combined.extend_from_slice(&part)
+            }
+
+            assert_eq!(combined.len(), range.len());
+
+            PolarsResult::Ok(Bytes::from(combined))
+        }
     }
 
-    pub async fn get_ranges(
+    /// Fetch byte ranges into a HashMap keyed by the range start. This will mutably sort the
+    /// `ranges` slice for coalescing.
+    ///
+    /// # Panics
+    /// Panics if the same range start is used by more than 1 range.
+    pub async fn get_ranges_sort<
+        K: TryFrom<usize, Error: std::fmt::Debug> + std::hash::Hash + Eq,
+        T: From<Bytes>,
+    >(
         &self,
         path: &Path,
-        ranges: &[Range<usize>],
-    ) -> PolarsResult<Vec<Bytes>> {
+        ranges: &mut [Range<usize>],
+    ) -> PolarsResult<PlHashMap<K, T>> {
+        if ranges.is_empty() {
+            return Ok(Default::default());
+        }
+
+        let mut out = PlHashMap::with_capacity(ranges.len());
+
+        ranges.sort_unstable_by_key(|x| x.start);
+
+        let (merged_ranges, merged_ends): (Vec<_>, Vec<_>) = merge_ranges(ranges).unzip();
+
+        let mut stream = self.get_buffered_ranges_stream(path, merged_ranges.iter().cloned());
+
         tune_with_concurrency_budget(
-            (ranges.len() as u32).clamp(0, MAX_BUDGET_PER_REQUEST as u32),
-            || self.0.get_ranges(path, ranges),
+            merged_ranges.len().clamp(0, MAX_BUDGET_PER_REQUEST) as u32,
+            || async {
+                let mut len = 0;
+                let mut current_offset = 0;
+                let mut ends_iter = merged_ends.iter();
+
+                let mut splitted_parts = vec![];
+
+                while let Some(bytes) = stream.try_next().await? {
+                    len += bytes.len();
+                    let end = *ends_iter.next().unwrap();
+
+                    if end == 0 {
+                        splitted_parts.push(bytes);
+                        continue;
+                    }
+
+                    let full_range = ranges[current_offset..end]
+                        .iter()
+                        .cloned()
+                        .reduce(|l, r| l.start.min(r.start)..l.end.max(r.end))
+                        .unwrap();
+
+                    let bytes = if splitted_parts.is_empty() {
+                        bytes
+                    } else {
+                        let mut out = Vec::with_capacity(full_range.len());
+
+                        for x in splitted_parts.drain(..) {
+                            out.extend_from_slice(&x);
+                        }
+
+                        out.extend_from_slice(&bytes);
+                        Bytes::from(out)
+                    };
+
+                    assert_eq!(bytes.len(), full_range.len());
+
+                    for range in &ranges[current_offset..end] {
+                        let v = out.insert(
+                            K::try_from(range.start).unwrap(),
+                            T::from(bytes.slice(
+                                range.start - full_range.start..range.end - full_range.start,
+                            )),
+                        );
+
+                        assert!(v.is_none()); // duplicate range start
+                    }
+
+                    current_offset = end;
+                }
+
+                assert!(splitted_parts.is_empty());
+
+                PolarsResult::Ok(pl_async::Size::from(len as u64))
+            },
         )
-        .await
-        .map_err(to_compute_err)
+        .await?;
+
+        Ok(out)
     }
 
-    pub async fn download<F: tokio::io::AsyncWrite + std::marker::Unpin>(
-        &self,
-        path: &Path,
-        file: &mut F,
-    ) -> PolarsResult<()> {
-        tune_with_concurrency_budget(1, || async {
-            let mut stream = self
-                .0
-                .get(path)
-                .await
-                .map_err(to_compute_err)?
-                .into_stream();
-
-            let mut len = 0;
-            while let Some(bytes) = stream.next().await {
-                let bytes = bytes.map_err(to_compute_err)?;
-                len += bytes.len();
-                file.write_all(bytes.as_ref())
+    pub async fn download(&self, path: &Path, file: &mut tokio::fs::File) -> PolarsResult<()> {
+        let opt_size = self.head(path).await.ok().map(|x| x.size);
+        let parts = opt_size.map(|x| split_range(0..x)).filter(|x| x.len() > 1);
+
+        if let Some(parts) = parts {
+            tune_with_concurrency_budget(
+                parts.len().clamp(0, MAX_BUDGET_PER_REQUEST) as u32,
+                || async {
+                    let mut stream = self.get_buffered_ranges_stream(path, parts);
+                    let mut len = 0;
+                    while let Some(bytes) = stream.try_next().await? {
+                        len += bytes.len();
+                        file.write_all(&bytes).await.map_err(to_compute_err)?;
+                    }
+
+                    assert_eq!(len, opt_size.unwrap());
+
+                    PolarsResult::Ok(pl_async::Size::from(len as u64))
+                },
+            )
+            .await?
+        } else {
+            tune_with_concurrency_budget(1, || async {
+                let mut stream = self
+                    .0
+                    .get(path)
                     .await
-                    .map_err(to_compute_err)?;
-            }
+                    .map_err(to_compute_err)?
+                    .into_stream();
+
+                let mut len = 0;
+                while let Some(bytes) = stream.try_next().await? {
+                    len += bytes.len();
+                    file.write_all(&bytes).await.map_err(to_compute_err)?;
+                }
+
+                PolarsResult::Ok(pl_async::Size::from(len as u64))
+            })
+            .await?
+        };
+
+        // Dropping is delayed for tokio async files so we need to explicitly
+        // flush here (https://github.com/tokio-rs/tokio/issues/2307#issuecomment-596336451).
+        file.sync_all().await.map_err(PolarsError::from)?;
 
-            PolarsResult::Ok(pl_async::Size::from(len as u64))
-        })
-        .await?;
         Ok(())
     }
 
@@ -113,3 +236,229 @@ impl PolarsObjectStore {
             .map_err(to_compute_err)
     }
 }
+
+/// Splits a single range into multiple smaller ranges, which can be downloaded concurrently for
+/// much higher throughput.
+fn split_range(range: Range<usize>) -> impl ExactSizeIterator<Item = Range<usize>> {
+    let chunk_size = get_download_chunk_size();
+
+    // Calculate n_parts such that we are as close as possible to the `chunk_size`.
+    let n_parts = [
+        (range.len().div_ceil(chunk_size)).max(1),
+        (range.len() / chunk_size).max(1),
+    ]
+    .into_iter()
+    .min_by_key(|x| (range.len() / *x).abs_diff(chunk_size))
+    .unwrap();
+
+    let chunk_size = (range.len() / n_parts).max(1);
+
+    assert_eq!(n_parts, (range.len() / chunk_size).max(1));
+    let bytes_rem = range.len() % chunk_size;
+
+    (0..n_parts).map(move |part_no| {
+        let (start, end) = if part_no == 0 {
+            // Download remainder length in the first chunk since it starts downloading first.
+            let end = range.start + chunk_size + bytes_rem;
+            let end = if end > range.end { range.end } else { end };
+            (range.start, end)
+        } else {
+            let start = bytes_rem + range.start + part_no * chunk_size;
+            (start, start + chunk_size)
+        };
+
+        start..end
+    })
+}
+
+/// Note: For optimal performance, `ranges` should be sorted. More generally,
+/// ranges placed next to each other should also be close in range value.
+///
+/// # Returns
+/// `[(range1, end1), (range2, end2)]`, where:
+/// * `range1` contains bytes for the ranges from `ranges[0..end1]`
+/// * `range2` contains bytes for the ranges from `ranges[end1..end2]`
+/// * etc..
+///
+/// Note that if an end value is 0, it means the range is a splitted part and should be combined.
+fn merge_ranges(ranges: &[Range<usize>]) -> impl Iterator<Item = (Range<usize>, usize)> + '_ {
+    let chunk_size = get_download_chunk_size();
+
+    let mut current_merged_range = ranges.first().map_or(0..0, Clone::clone);
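+    // A neighbouring range is absorbed into `current_merged_range` when it overlaps,
+    // or when the gap to it is at most ~12.5% of the larger of the bytes gathered so
+    // far and the incoming range (clamped to 1..=8 MiB) and merging keeps the merged
+    // length at least as close to `chunk_size`; otherwise the current range is flushed.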
+    // Number of fetched bytes excluding excess.
+    let mut current_n_bytes = current_merged_range.len();
+
+    (0..ranges.len())
+        .filter_map(move |current_idx| {
+            let current_idx = 1 + current_idx;
+
+            if current_idx == ranges.len() {
+                // No more items - flush current state.
+                Some((current_merged_range.clone(), current_idx))
+            } else {
+                let range = ranges[current_idx].clone();
+
+                let new_merged = current_merged_range.start.min(range.start)
+                    ..current_merged_range.end.max(range.end);
+
+                // E.g.:
+                // |--------|
+                //  oo          // range1
+                //       oo     // range2
+                //    ^^^       // distance = 3, is_overlapping = false
+                // E.g.:
+                // |--------|
+                //  ooooo       // range1
+                //    ooooo     // range2
+                //    ^^        // distance = 2, is_overlapping = true
+                let (distance, is_overlapping) = {
+                    let l = current_merged_range.end.min(range.end);
+                    let r = current_merged_range.start.max(range.start);
+
+                    (r.abs_diff(l), r < l)
+                };
+
+                let should_merge = is_overlapping || {
+                    let leq_current_len_dist_to_chunk_size = new_merged.len().abs_diff(chunk_size)
+                        <= current_merged_range.len().abs_diff(chunk_size);
+                    let gap_tolerance =
+                        (current_n_bytes.max(range.len()) / 8).clamp(1024 * 1024, 8 * 1024 * 1024);
+
+                    leq_current_len_dist_to_chunk_size && distance <= gap_tolerance
+                };
+
+                if should_merge {
+                    // Merge to existing range
+                    current_merged_range = new_merged;
+                    current_n_bytes += if is_overlapping {
+                        range.len() - distance
+                    } else {
+                        range.len()
+                    };
+                    None
+                } else {
+                    let out = (current_merged_range.clone(), current_idx);
+                    current_merged_range = range;
+                    current_n_bytes = current_merged_range.len();
+                    Some(out)
+                }
+            }
+        })
+        .flat_map(|x| {
+            // Split large individual ranges within the list of ranges.
+            let (range, end) = x;
+            let split = split_range(range.clone());
+            let len = split.len();
+
+            split
+                .enumerate()
+                .map(move |(i, range)| (range, if 1 + i == len { end } else { 0 }))
+        })
+}
+
+#[cfg(test)]
+mod tests {
+
+    #[test]
+    fn test_split_range() {
+        use super::{get_download_chunk_size, split_range};
+
+        let chunk_size = get_download_chunk_size();
+
+        assert_eq!(chunk_size, 64 * 1024 * 1024);
+
+        #[allow(clippy::single_range_in_vec_init)]
+        {
+            // Round-trip empty ranges.
+            assert_eq!(split_range(0..0).collect::<Vec<_>>(), [0..0]);
+            assert_eq!(split_range(3..3).collect::<Vec<_>>(), [3..3]);
+        }
+
+        // Threshold to start splitting to 2 ranges
+        //
+        // n - chunk_size == chunk_size - n / 2
+        // n + n / 2 == 2 * chunk_size
+        // 3 * n == 4 * chunk_size
+        // n = 4 * chunk_size / 3
+        let n = 4 * chunk_size / 3;
+
+        #[allow(clippy::single_range_in_vec_init)]
+        {
+            assert_eq!(split_range(0..n).collect::<Vec<_>>(), [0..89478485]);
+        }
+
+        assert_eq!(
+            split_range(0..n + 1).collect::<Vec<_>>(),
+            [0..44739243, 44739243..89478486]
+        );
+
+        // Threshold to start splitting to 3 ranges
+        //
+        // n / 2 - chunk_size == chunk_size - n / 3
+        // n / 2 + n / 3 == 2 * chunk_size
+        // 5 * n == 12 * chunk_size
+        // n == 12 * chunk_size / 5
+        let n = 12 * chunk_size / 5;
+
+        assert_eq!(
+            split_range(0..n).collect::<Vec<_>>(),
+            [0..80530637, 80530637..161061273]
+        );
+
+        assert_eq!(
+            split_range(0..n + 1).collect::<Vec<_>>(),
+            [0..53687092, 53687092..107374183, 107374183..161061274]
+        );
+    }
+
+    #[test]
+    fn test_merge_ranges() {
+        use super::{get_download_chunk_size, merge_ranges};
+
+        let chunk_size = get_download_chunk_size();
+
+        assert_eq!(chunk_size, 64 * 1024 * 1024);
+
+        // Round-trip empty slice
+        assert_eq!(merge_ranges(&[]).collect::<Vec<_>>(), []);
+
+        // We have 1 tiny request followed by 1 huge request. They are combined as it reduces the
+        // `abs_diff()` to the `chunk_size`, but afterwards they are split to 2 evenly sized
+        // requests.
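+        // (With 64 MiB chunks: 0..1 and 1..133169152 first merge into 0..133169152,
+        // which is then split into two 66584576-byte halves; the `0` end-marker on
+        // the first half tells `get_ranges_sort` to recombine it with the next part.)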
+        assert_eq!(
+            merge_ranges(&[0..1, 1..127 * 1024 * 1024]).collect::<Vec<_>>(),
+            [(0..66584576, 0), (66584576..133169152, 2)]
+        );
+
+        // <= 1MiB gap, merge
+        assert_eq!(
+            merge_ranges(&[0..1, 1024 * 1024 + 1..1024 * 1024 + 2]).collect::<Vec<_>>(),
+            [(0..1048578, 2)]
+        );
+
+        // > 1MiB gap, do not merge
+        assert_eq!(
+            merge_ranges(&[0..1, 1024 * 1024 + 2..1024 * 1024 + 3]).collect::<Vec<_>>(),
+            [(0..1, 1), (1048578..1048579, 2)]
+        );
+
+        // <= 12.5% gap, merge
+        assert_eq!(
+            merge_ranges(&[0..8, 10..11]).collect::<Vec<_>>(),
+            [(0..11, 2)]
+        );
+
+        // <= 12.5% gap relative to RHS, merge
+        assert_eq!(
+            merge_ranges(&[0..1, 3..11]).collect::<Vec<_>>(),
+            [(0..11, 2)]
+        );
+
+        // Overlapping range, merge
+        assert_eq!(
+            merge_ranges(&[0..80 * 1024 * 1024, 10 * 1024 * 1024..70 * 1024 * 1024])
+                .collect::<Vec<_>>(),
+            [(0..80 * 1024 * 1024, 2)]
+        );
+    }
+}
diff --git a/crates/polars-io/src/file_cache/file_fetcher.rs b/crates/polars-io/src/file_cache/file_fetcher.rs
index bd16dff7fda4..3d712ba955fc 100644
--- a/crates/polars-io/src/file_cache/file_fetcher.rs
+++ b/crates/polars-io/src/file_cache/file_fetcher.rs
@@ -116,12 +116,7 @@ impl FileFetcher for CloudFileFetcher {
                 .await
                 .map_err(PolarsError::from)?;
 
-            self.object_store.download(&self.cloud_path, file).await?;
-            // Dropping is delayed for tokio async files so we need to explicitly
-            // flush here (https://github.com/tokio-rs/tokio/issues/2307#issuecomment-596336451).
-            file.sync_all().await.map_err(PolarsError::from)?;
-            PolarsResult::Ok(())
-        })?;
-        Ok(())
+            self.object_store.download(&self.cloud_path, file).await
+        })
     }
 }
diff --git a/crates/polars-io/src/parquet/read/async_impl.rs b/crates/polars-io/src/parquet/read/async_impl.rs
index da50364855da..053aad67464a 100644
--- a/crates/polars-io/src/parquet/read/async_impl.rs
+++ b/crates/polars-io/src/parquet/read/async_impl.rs
@@ -21,7 +21,7 @@ use crate::parquet::metadata::FileMetadataRef;
 use crate::pl_async::get_runtime;
 use crate::predicates::PhysicalIoExpr;
 
-type DownloadedRowGroup = Vec<(u64, Bytes)>;
+type DownloadedRowGroup = PlHashMap<u64, Bytes>;
 type QueuePayload = (usize, DownloadedRowGroup);
 type QueueSend = Arc<tokio::sync::mpsc::Sender<PolarsResult<QueuePayload>>>;
 
@@ -49,14 +49,8 @@ impl ParquetObjectStore {
         })
     }
 
-    async fn get_range(&self, start: usize, length: usize) -> PolarsResult<Bytes> {
-        self.store
-            .get_range(&self.path, start..start + length)
-            .await
-    }
-
-    async fn get_ranges(&self, ranges: &[Range<usize>]) -> PolarsResult<Vec<Bytes>> {
-        self.store.get_ranges(&self.path, ranges).await
+    async fn get_ranges(&self, ranges: &mut [Range<usize>]) -> PolarsResult<PlHashMap<u64, Bytes>> {
+        self.store.get_ranges_sort(&self.path, ranges).await
     }
 
     /// Initialize the length property of the object, unless it has already been fetched.
@@ -194,16 +188,10 @@ async fn download_projection(
         }
     });
 
-    let result = async_reader.get_ranges(&ranges).await.map(|bytes| {
-        (
-            rg_index,
-            bytes
-                .into_iter()
-                .zip(offsets)
-                .map(|(bytes, offset)| (offset, bytes))
-                .collect::<Vec<_>>(),
-        )
-    });
+    let result = async_reader
+        .get_ranges(&mut ranges)
+        .await
+        .map(|bytes_map| (rg_index, bytes_map));
 
     sender.send(result).await.is_ok()
 }
@@ -217,33 +205,20 @@ async fn download_row_group(
         return true;
     }
 
-    let full_byte_range = rg.full_byte_range();
-    let full_byte_range = full_byte_range.start as usize..full_byte_range.end as usize;
-
-    let result = async_reader
-        .get_range(
-            full_byte_range.start,
-            full_byte_range.end - full_byte_range.start,
+    let mut ranges = rg
+        .byte_ranges_iter()
+        .map(|x| x.start as usize..x.end as usize)
+        .collect::<Vec<_>>();
+
+    sender
+        .send(
+            async_reader
+                .get_ranges(&mut ranges)
+                .await
+                .map(|bytes_map| (rg_index, bytes_map)),
         )
         .await
-        .map(|bytes| {
-            (
-                rg_index,
-                rg.byte_ranges_iter()
-                    .map(|range| {
-                        (
-                            range.start,
-                            bytes.slice(
-                                range.start as usize - full_byte_range.start
-                                    ..range.end as usize - full_byte_range.start,
-                            ),
-                        )
-                    })
-                    .collect::<DownloadedRowGroup>(),
-            )
-        });
-
-    sender.send(result).await.is_ok()
+        .is_ok()
 }
 
 pub struct FetchRowGroupsFromObjectStore {
diff --git a/crates/polars-io/src/pl_async.rs b/crates/polars-io/src/pl_async.rs
index cc43a908cda3..4c95c96f7733 100644
--- a/crates/polars-io/src/pl_async.rs
+++ b/crates/polars-io/src/pl_async.rs
@@ -4,7 +4,7 @@ use std::ops::Deref;
 use std::sync::atomic::{AtomicBool, AtomicU64, AtomicU8, Ordering};
 
 use once_cell::sync::Lazy;
-use polars_core::config::verbose;
+use polars_core::config::{self, verbose};
 use polars_core::POOL;
 use tokio::runtime::{Builder, Runtime};
 use tokio::sync::Semaphore;
@@ -12,6 +12,25 @@ use tokio::sync::Semaphore;
 static CONCURRENCY_BUDGET: std::sync::OnceLock<(Semaphore, u32)> = std::sync::OnceLock::new();
 pub(super) const MAX_BUDGET_PER_REQUEST: usize = 10;
 
+/// Used to determine chunks when splitting large ranges, or combining small
+/// ranges.
+pub(super) static DOWNLOAD_CHUNK_SIZE: Lazy<usize> = Lazy::new(|| {
+    let v: usize = std::env::var("POLARS_DOWNLOAD_CHUNK_SIZE")
+        .as_deref()
+        .map(|x| x.parse().expect("integer"))
+        .unwrap_or(64 * 1024 * 1024);
+
+    if config::verbose() {
+        eprintln!("async download_chunk_size: {}", v)
+    }
+
+    v
+});
+
+pub(super) fn get_download_chunk_size() -> usize {
+    *DOWNLOAD_CHUNK_SIZE
+}
+
 pub trait GetSize {
     fn size(&self) -> u64;
 }
@@ -158,6 +177,10 @@ fn get_semaphore() -> &'static (Semaphore, u32) {
     })
 }
 
+pub(crate) fn get_concurrency_limit() -> u32 {
+    get_semaphore().1
+}
+
 pub async fn tune_with_concurrency_budget<F, Fut>(requested_budget: u32, callable: F) -> Fut::Output
 where
     F: FnOnce() -> Fut,
diff --git a/crates/polars-io/src/utils/byte_source.rs b/crates/polars-io/src/utils/byte_source.rs
index e2dd3e876c2a..af37d32b36da 100644
--- a/crates/polars-io/src/utils/byte_source.rs
+++ b/crates/polars-io/src/utils/byte_source.rs
@@ -1,6 +1,7 @@
 use std::ops::Range;
 use std::sync::Arc;
 
+use polars_core::prelude::PlHashMap;
 use polars_error::PolarsResult;
 use polars_utils::_limit_path_len_io_err;
 use polars_utils::mmap::MemSlice;
@@ -16,7 +17,11 @@ pub trait ByteSource: Send + Sync {
     /// # Panics
     /// Panics if `range` is not in bounds.
     async fn get_range(&self, range: Range<usize>) -> PolarsResult<MemSlice>;
-    async fn get_ranges(&self, ranges: &[Range<usize>]) -> PolarsResult<Vec<MemSlice>>;
+    /// Note: This will mutably sort ranges for coalescing.
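+    /// Returns a map from each requested range's start offset to its bytes.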
+    async fn get_ranges(
+        &self,
+        ranges: &mut [Range<usize>],
+    ) -> PolarsResult<PlHashMap<usize, MemSlice>>;
 }
 
 /// Byte source backed by a `MemSlice`, which can potentially be memory-mapped.
@@ -49,11 +54,14 @@ impl ByteSource for MemSliceByteSource {
         Ok(out)
     }
 
-    async fn get_ranges(&self, ranges: &[Range<usize>]) -> PolarsResult<Vec<MemSlice>> {
+    async fn get_ranges(
+        &self,
+        ranges: &mut [Range<usize>],
+    ) -> PolarsResult<PlHashMap<usize, MemSlice>> {
         Ok(ranges
             .iter()
-            .map(|x| self.0.slice(x.clone()))
-            .collect::<Vec<_>>())
+            .map(|x| (x.start, self.0.slice(x.clone())))
+            .collect())
     }
 }
 
@@ -88,9 +96,11 @@ impl ByteSource for ObjectStoreByteSource {
         Ok(mem_slice)
     }
 
-    async fn get_ranges(&self, ranges: &[Range<usize>]) -> PolarsResult<Vec<MemSlice>> {
-        let ranges = self.store.get_ranges(&self.path, ranges).await?;
-        Ok(ranges.into_iter().map(MemSlice::from_bytes).collect())
+    async fn get_ranges(
+        &self,
+        ranges: &mut [Range<usize>],
+    ) -> PolarsResult<PlHashMap<usize, MemSlice>> {
+        self.store.get_ranges_sort(&self.path, ranges).await
     }
 }
 
@@ -130,7 +140,10 @@ impl ByteSource for DynByteSource {
         }
     }
 
-    async fn get_ranges(&self, ranges: &[Range<usize>]) -> PolarsResult<Vec<MemSlice>> {
+    async fn get_ranges(
+        &self,
+        ranges: &mut [Range<usize>],
+    ) -> PolarsResult<PlHashMap<usize, MemSlice>> {
         match self {
             Self::MemSlice(v) => v.get_ranges(ranges).await,
             Self::Cloud(v) => v.get_ranges(ranges).await,
diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs
index 4e039124933f..f4ef629821a9 100644
--- a/crates/polars-io/src/utils/other.rs
+++ b/crates/polars-io/src/utils/other.rs
@@ -45,7 +45,7 @@ pub fn get_reader_bytes(
     feature = "parquet",
     feature = "avro"
 ))]
-pub(crate) fn apply_projection(schema: &ArrowSchema, projection: &[usize]) -> ArrowSchema {
+pub fn apply_projection(schema: &ArrowSchema, projection: &[usize]) -> ArrowSchema {
     projection
         .iter()
         .map(|idx| schema.get_at_index(*idx).unwrap())
@@ -59,14 +59,14 @@ pub(crate) fn apply_projection(schema: &ArrowSchema, projection: &[usize]) -> Ar
     feature = "avro",
     feature = "parquet"
 ))]
-pub(crate) fn columns_to_projection(
-    columns: &[String],
+pub fn columns_to_projection<T: AsRef<str>>(
+    columns: &[T],
     schema: &ArrowSchema,
 ) -> PolarsResult<Vec<usize>> {
     let mut prj = Vec::with_capacity(columns.len());
 
     for column in columns {
-        let i = schema.try_index_of(column)?;
+        let i = schema.try_index_of(column.as_ref())?;
         prj.push(i);
     }
 
diff --git a/crates/polars-lazy/src/tests/aggregations.rs b/crates/polars-lazy/src/tests/aggregations.rs
index 6b2d8cb05da0..2ab337ef51e9 100644
--- a/crates/polars-lazy/src/tests/aggregations.rs
+++ b/crates/polars-lazy/src/tests/aggregations.rs
@@ -450,6 +450,7 @@ fn take_aggregations() -> PolarsResult<()> {
                     nulls_last: false,
                     multithreaded: true,
                     maintain_order: false,
+                    limit: None,
                 })
                 .head(Some(2)),
         )
@@ -489,6 +490,7 @@ fn test_take_consistency() -> PolarsResult<()> {
             nulls_last: false,
             multithreaded: true,
             maintain_order: false,
+            limit: None,
         })
         .get(lit(0))])
        .collect()?;
@@ -507,6 +509,7 @@ fn test_take_consistency() -> PolarsResult<()> {
             nulls_last: false,
             multithreaded: true,
             maintain_order: false,
+            limit: None,
         })
         .get(lit(0))])
         .collect()?;
@@ -526,6 +529,7 @@ fn test_take_consistency() -> PolarsResult<()> {
                 nulls_last: false,
                 multithreaded: true,
                 maintain_order: false,
+                limit: None,
             })
             .get(lit(0))
             .alias("1"),
@@ -537,6 +541,7 @@ fn test_take_consistency() -> PolarsResult<()> {
                     nulls_last: false,
                     multithreaded: true,
                     maintain_order: false,
+                    limit: None,
                 })
                 .get(lit(0)),
         )
diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs
index 95cbf586be67..0c4e518b5042 100644
--- a/crates/polars-lazy/src/tests/queries.rs
+++ 
b/crates/polars-lazy/src/tests/queries.rs @@ -1666,6 +1666,7 @@ fn test_single_group_result() -> PolarsResult<()> { nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }) .over([col("a")])]) .collect()?; diff --git a/crates/polars-ops/src/chunked_array/top_k.rs b/crates/polars-ops/src/chunked_array/top_k.rs index 9caf861b6cd9..ef37e267c10f 100644 --- a/crates/polars-ops/src/chunked_array/top_k.rs +++ b/crates/polars-ops/src/chunked_array/top_k.rs @@ -285,6 +285,7 @@ fn top_k_by_impl( nulls_last: vec![true; by.len()], multithreaded, maintain_order: false, + limit: None, }; let idx = _arg_bottom_k(k, by, &mut sort_options)?; diff --git a/crates/polars-ops/src/frame/join/args.rs b/crates/polars-ops/src/frame/join/args.rs index d34c37e7ff67..def36b76a677 100644 --- a/crates/polars-ops/src/frame/join/args.rs +++ b/crates/polars-ops/src/frame/join/args.rs @@ -237,6 +237,7 @@ impl JoinValidation { s_left: &Series, s_right: &Series, build_shortest_table: bool, + join_nulls: bool, ) -> PolarsResult<()> { // In default, probe is the left series. // @@ -253,7 +254,13 @@ impl JoinValidation { // Only check the `build` side. // The other side use `validate_build` to check ManyToMany | ManyToOne => true, - OneToMany | OneToOne => probe.n_unique()? == probe.len(), + OneToMany | OneToOne => { + if !join_nulls && probe.null_count() > 0 { + probe.n_unique()? - 1 == probe.len() - probe.null_count() + } else { + probe.n_unique()? == probe.len() + } + }, }; polars_ensure!(valid, ComputeError: "join keys did not fulfill {} validation", self); Ok(()) diff --git a/crates/polars-ops/src/frame/join/hash_join/single_keys_dispatch.rs b/crates/polars-ops/src/frame/join/hash_join/single_keys_dispatch.rs index f79e8759d9e8..7c365210b208 100644 --- a/crates/polars-ops/src/frame/join/hash_join/single_keys_dispatch.rs +++ b/crates/polars-ops/src/frame/join/hash_join/single_keys_dispatch.rs @@ -20,7 +20,7 @@ pub trait SeriesJoin: SeriesSealed + Sized { ) -> PolarsResult { let s_self = self.as_series(); let (lhs, rhs) = (s_self.to_physical_repr(), other.to_physical_repr()); - validate.validate_probe(&lhs, &rhs, false)?; + validate.validate_probe(&lhs, &rhs, false, join_nulls)?; let lhs_dtype = lhs.dtype(); let rhs_dtype = rhs.dtype(); @@ -35,7 +35,8 @@ pub trait SeriesJoin: SeriesSealed + Sized { let (lhs, rhs, _, _) = prepare_binary::(lhs, rhs, false); let lhs = lhs.iter().map(|v| v.as_slice()).collect::>(); let rhs = rhs.iter().map(|v| v.as_slice()).collect::>(); - hash_join_tuples_left(lhs, rhs, None, None, validate, join_nulls) + let build_null_count = other.null_count(); + hash_join_tuples_left(lhs, rhs, None, None, validate, join_nulls, build_null_count) }, T::BinaryOffset => { let lhs = lhs.binary_offset().unwrap(); @@ -44,7 +45,8 @@ pub trait SeriesJoin: SeriesSealed + Sized { // Take slices so that vecs are not copied let lhs = lhs.iter().map(|k| k.as_slice()).collect::>(); let rhs = rhs.iter().map(|k| k.as_slice()).collect::>(); - hash_join_tuples_left(lhs, rhs, None, None, validate, join_nulls) + let build_null_count = other.null_count(); + hash_join_tuples_left(lhs, rhs, None, None, validate, join_nulls, build_null_count) }, x if x.is_float() => { with_match_physical_float_polars_type!(lhs.dtype(), |$T| { @@ -168,7 +170,7 @@ pub trait SeriesJoin: SeriesSealed + Sized { ) -> PolarsResult<(InnerJoinIds, bool)> { let s_self = self.as_series(); let (lhs, rhs) = (s_self.to_physical_repr(), other.to_physical_repr()); - validate.validate_probe(&lhs, &rhs, true)?; + validate.validate_probe(&lhs, 
&rhs, true, join_nulls)?; let lhs_dtype = lhs.dtype(); let rhs_dtype = rhs.dtype(); @@ -184,8 +186,20 @@ pub trait SeriesJoin: SeriesSealed + Sized { // Take slices so that vecs are not copied let lhs = lhs.iter().map(|k| k.as_slice()).collect::>(); let rhs = rhs.iter().map(|k| k.as_slice()).collect::>(); + let build_null_count = if swapped { + s_self.null_count() + } else { + other.null_count() + }; Ok(( - hash_join_tuples_inner(lhs, rhs, swapped, validate, join_nulls)?, + hash_join_tuples_inner( + lhs, + rhs, + swapped, + validate, + join_nulls, + build_null_count, + )?, !swapped, )) }, @@ -196,8 +210,20 @@ pub trait SeriesJoin: SeriesSealed + Sized { // Take slices so that vecs are not copied let lhs = lhs.iter().map(|k| k.as_slice()).collect::>(); let rhs = rhs.iter().map(|k| k.as_slice()).collect::>(); + let build_null_count = if swapped { + s_self.null_count() + } else { + other.null_count() + }; Ok(( - hash_join_tuples_inner(lhs, rhs, swapped, validate, join_nulls)?, + hash_join_tuples_inner( + lhs, + rhs, + swapped, + validate, + join_nulls, + build_null_count, + )?, !swapped, )) }, @@ -244,7 +270,7 @@ pub trait SeriesJoin: SeriesSealed + Sized { ) -> PolarsResult<(PrimitiveArray, PrimitiveArray)> { let s_self = self.as_series(); let (lhs, rhs) = (s_self.to_physical_repr(), other.to_physical_repr()); - validate.validate_probe(&lhs, &rhs, true)?; + validate.validate_probe(&lhs, &rhs, true, join_nulls)?; let lhs_dtype = lhs.dtype(); let rhs_dtype = rhs.dtype(); @@ -352,20 +378,38 @@ where .map(|arr| arr.as_slice().unwrap()) .collect::>(); Ok(( - hash_join_tuples_inner(splitted_a, splitted_b, swapped, validate, join_nulls)?, + hash_join_tuples_inner( + splitted_a, splitted_b, swapped, validate, join_nulls, 0, + )?, !swapped, )) } else { Ok(( - hash_join_tuples_inner(splitted_a, splitted_b, swapped, validate, join_nulls)?, + hash_join_tuples_inner( + splitted_a, splitted_b, swapped, validate, join_nulls, 0, + )?, !swapped, )) } }, - _ => Ok(( - hash_join_tuples_inner(splitted_a, splitted_b, swapped, validate, join_nulls)?, - !swapped, - )), + _ => { + let build_null_count = if swapped { + left.null_count() + } else { + right.null_count() + }; + Ok(( + hash_join_tuples_inner( + splitted_a, + splitted_b, + swapped, + validate, + join_nulls, + build_null_count, + )?, + !swapped, + )) + }, } } @@ -430,7 +474,7 @@ where (0, 0, 1, 1) => { let keys_a = chunks_as_slices(&splitted_a); let keys_b = chunks_as_slices(&splitted_b); - hash_join_tuples_left(keys_a, keys_b, None, None, validate, join_nulls) + hash_join_tuples_left(keys_a, keys_b, None, None, validate, join_nulls, 0) }, (0, 0, _, _) => { let keys_a = chunks_as_slices(&splitted_a); @@ -445,6 +489,7 @@ where mapping_right.as_deref(), validate, join_nulls, + 0, ) }, _ => { @@ -452,6 +497,7 @@ where let keys_b = get_arrays(&splitted_b); let (mapping_left, mapping_right) = create_mappings(left.chunks(), right.chunks(), left.len(), right.len()); + let build_null_count = right.null_count(); hash_join_tuples_left( keys_a, keys_b, @@ -459,6 +505,7 @@ where mapping_right.as_deref(), validate, join_nulls, + build_null_count, ) }, } diff --git a/crates/polars-ops/src/frame/join/hash_join/single_keys_inner.rs b/crates/polars-ops/src/frame/join/hash_join/single_keys_inner.rs index f01c99529aea..aeca8bb32546 100644 --- a/crates/polars-ops/src/frame/join/hash_join/single_keys_inner.rs +++ b/crates/polars-ops/src/frame/join/hash_join/single_keys_inner.rs @@ -44,6 +44,8 @@ pub(super) fn hash_join_tuples_inner( swapped: bool, validate: JoinValidation, 
join_nulls: bool, + // Null count is required for join validation + build_null_count: usize, ) -> PolarsResult<(Vec, Vec)> where I: IntoIterator + Send + Sync + Clone, @@ -53,10 +55,13 @@ where // NOTE: see the left join for more elaborate comments // first we hash one relation let hash_tbls = if validate.needs_checks() { - let expected_size = build + let mut expected_size = build .iter() .map(|v| v.clone().into_iter().size_hint().1.unwrap()) .sum(); + if !join_nulls { + expected_size -= build_null_count; + } let hash_tbls = build_tables(build, join_nulls); let build_size = hash_tbls.iter().map(|m| m.len()).sum(); validate.validate_build(build_size, expected_size, swapped)?; diff --git a/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs b/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs index 91c4f0cd1008..b23d9de1776f 100644 --- a/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs +++ b/crates/polars-ops/src/frame/join/hash_join/single_keys_left.rs @@ -112,6 +112,8 @@ pub(super) fn hash_join_tuples_left( chunk_mapping_right: Option<&[ChunkId]>, validate: JoinValidation, join_nulls: bool, + // We should know the number of nulls to avoid extra calculation + build_null_count: usize, ) -> PolarsResult where I: IntoIterator, @@ -123,7 +125,10 @@ where let build = build.into_iter().map(|i| i.into_iter()).collect::>(); // first we hash one relation let hash_tbls = if validate.needs_checks() { - let expected_size = build.iter().map(|v| v.size_hint().1.unwrap()).sum(); + let mut expected_size = build.iter().map(|v| v.size_hint().1.unwrap()).sum(); + if !join_nulls { + expected_size -= build_null_count; + } let hash_tbls = build_tables(build, join_nulls); let build_size = hash_tbls.iter().map(|m| m.len()).sum(); validate.validate_build(build_size, expected_size, false)?; diff --git a/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs b/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs index fce2f2bf6cf0..95cde8387733 100644 --- a/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs +++ b/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs @@ -225,6 +225,7 @@ pub(crate) fn _sort_or_hash_inner( nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }); let s_right = unsafe { s_right.take_unchecked(&sort_idx) }; let ids = par_sorted_merge_inner_no_nulls(s_left, &s_right); @@ -252,6 +253,7 @@ pub(crate) fn _sort_or_hash_inner( nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }); let s_left = unsafe { s_left.take_unchecked(&sort_idx) }; let ids = par_sorted_merge_inner_no_nulls(&s_left, s_right); @@ -323,6 +325,7 @@ pub(crate) fn sort_or_hash_left( nulls_last: false, multithreaded: true, maintain_order: false, + limit: None, }); let s_right = unsafe { s_right.take_unchecked(&sort_idx) }; diff --git a/crates/polars-pipe/src/executors/sinks/sort/source.rs b/crates/polars-pipe/src/executors/sinks/sort/source.rs index 1c1fa2984a0e..6f544e8e6ef1 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/source.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/source.rs @@ -101,6 +101,7 @@ impl SortSource { nulls_last: self.nulls_last, multithreaded: true, maintain_order: false, + limit: None, }, ), Some((offset, len)) => { @@ -119,6 +120,7 @@ impl SortSource { nulls_last: self.nulls_last, multithreaded: true, maintain_order: false, + limit: None, }, ); *len = len.saturating_sub(df_len); diff --git a/crates/polars-plan/src/dsl/function_expr/bounds.rs 
b/crates/polars-plan/src/dsl/function_expr/bounds.rs index 77c8a6f3ef5f..ae0f36a0956e 100644 --- a/crates/polars-plan/src/dsl/function_expr/bounds.rs +++ b/crates/polars-plan/src/dsl/function_expr/bounds.rs @@ -2,50 +2,12 @@ use super::*; pub(super) fn upper_bound(s: &Column) -> PolarsResult { let name = s.name().clone(); - use DataType::*; - let s = match s.dtype().to_physical() { - #[cfg(feature = "dtype-i8")] - Int8 => Column::new_scalar(name, Scalar::from(i8::MAX), 1), - #[cfg(feature = "dtype-i16")] - Int16 => Column::new_scalar(name, Scalar::from(i16::MAX), 1), - Int32 => Column::new_scalar(name, Scalar::from(i32::MAX), 1), - Int64 => Column::new_scalar(name, Scalar::from(i64::MAX), 1), - #[cfg(feature = "dtype-u8")] - UInt8 => Column::new_scalar(name, Scalar::from(u8::MAX), 1), - #[cfg(feature = "dtype-u16")] - UInt16 => Column::new_scalar(name, Scalar::from(u16::MAX), 1), - UInt32 => Column::new_scalar(name, Scalar::from(u32::MAX), 1), - UInt64 => Column::new_scalar(name, Scalar::from(u64::MAX), 1), - Float32 => Column::new_scalar(name, Scalar::from(f32::INFINITY), 1), - Float64 => Column::new_scalar(name, Scalar::from(f64::INFINITY), 1), - dt => polars_bail!( - ComputeError: "cannot determine upper bound for dtype `{}`", dt, - ), - }; - Ok(s) + let scalar = s.dtype().to_physical().max()?; + Ok(Column::new_scalar(name, scalar, 1)) } pub(super) fn lower_bound(s: &Column) -> PolarsResult { let name = s.name().clone(); - use DataType::*; - let s = match s.dtype().to_physical() { - #[cfg(feature = "dtype-i8")] - Int8 => Column::new_scalar(name, Scalar::from(i8::MIN), 1), - #[cfg(feature = "dtype-i16")] - Int16 => Column::new_scalar(name, Scalar::from(i16::MIN), 1), - Int32 => Column::new_scalar(name, Scalar::from(i32::MIN), 1), - Int64 => Column::new_scalar(name, Scalar::from(i64::MIN), 1), - #[cfg(feature = "dtype-u8")] - UInt8 => Column::new_scalar(name, Scalar::from(u8::MIN), 1), - #[cfg(feature = "dtype-u16")] - UInt16 => Column::new_scalar(name, Scalar::from(u16::MIN), 1), - UInt32 => Column::new_scalar(name, Scalar::from(u32::MIN), 1), - UInt64 => Column::new_scalar(name, Scalar::from(u64::MIN), 1), - Float32 => Column::new_scalar(name, Scalar::from(f32::NEG_INFINITY), 1), - Float64 => Column::new_scalar(name, Scalar::from(f64::NEG_INFINITY), 1), - dt => polars_bail!( - ComputeError: "cannot determine lower bound for dtype `{}`", dt, - ), - }; - Ok(s) + let scalar = s.dtype().to_physical().min()?; + Ok(Column::new_scalar(name, scalar, 1)) } diff --git a/crates/polars-plan/src/plans/aexpr/schema.rs b/crates/polars-plan/src/plans/aexpr/schema.rs index 7105855636c5..6c1b675b2bd8 100644 --- a/crates/polars-plan/src/plans/aexpr/schema.rs +++ b/crates/polars-plan/src/plans/aexpr/schema.rs @@ -32,50 +32,57 @@ impl AExpr { ctx: Context, arena: &Arena, ) -> PolarsResult { - // During aggregation a column that isn't aggregated gets an extra nesting level - // col(foo: i64) -> list[i64] - // But not if we do an aggregation: - // col(foo: i64).sum() -> i64 - // The `nested` keeps track of the nesting we need to add. - let mut nested = matches!(ctx, Context::Aggregation) as u8; - let mut field = self.to_field_impl(schema, ctx, arena, &mut nested)?; + // Indicates whether we should auto-implode the result. This is initialized to true if we are + // in an aggregation context, so functions that return scalars should explicitly set this + // to false in `to_field_impl`. 
+ let mut agg_list = matches!(ctx, Context::Aggregation); + let mut field = self.to_field_impl(schema, ctx, arena, &mut agg_list)?; - if nested >= 1 { + if agg_list { field.coerce(field.dtype().clone().implode()); } + Ok(field) } /// Get Field result of the expression. The schema is the input data. + /// + /// This is taken as `&mut bool` as for some expressions this is determined by the upper node + /// (e.g. `alias`, `cast`). #[recursive] pub fn to_field_impl( &self, schema: &Schema, ctx: Context, arena: &Arena, - nested: &mut u8, + agg_list: &mut bool, ) -> PolarsResult { use AExpr::*; use DataType::*; match self { Len => { - *nested = 0; + *agg_list = false; Ok(Field::new(PlSmallStr::from_static(LEN), IDX_DTYPE)) }, Window { function, options, .. } => { - if let WindowType::Over(mapping) = options { - *nested += matches!(mapping, WindowMapping::Join) as u8; + if let WindowType::Over(WindowMapping::Join) = options { + // expr.over(..), defaults to agg-list unless explicitly unset + // by the `to_field_impl` of the `expr` + *agg_list = true; } + let e = arena.get(*function); - e.to_field_impl(schema, ctx, arena, nested) + e.to_field_impl(schema, ctx, arena, agg_list) }, Explode(expr) => { // `Explode` is a "flatten" operation, which is not the same as returning a scalar. // Namely, it should be auto-imploded in the aggregation context, so we don't update - // the `nested` state here. - let field = arena.get(*expr).to_field_impl(schema, ctx, arena, &mut 0)?; + // the `agg_list` state here. + let field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; if let List(inner) = field.dtype() { Ok(Field::new(field.name().clone(), *inner.clone())) @@ -87,14 +94,14 @@ impl AExpr { name.clone(), arena .get(*expr) - .to_field_impl(schema, ctx, arena, nested)? + .to_field_impl(schema, ctx, arena, agg_list)? .dtype, )), Column(name) => schema .get_field(name) .ok_or_else(|| PolarsError::ColumnNotFound(name.to_string().into())), Literal(sv) => { - *nested = 0; + *agg_list = false; Ok(match sv { LiteralValue::Series(s) => s.field().into_owned(), _ => Field::new(sv.output_name().clone(), sv.get_datatype()), @@ -116,35 +123,42 @@ impl AExpr { | Operator::LogicalOr => { let out_field; let out_name = { - out_field = - arena.get(*left).to_field_impl(schema, ctx, arena, nested)?; + out_field = arena + .get(*left) + .to_field_impl(schema, ctx, arena, agg_list)?; out_field.name() }; Field::new(out_name.clone(), Boolean) }, Operator::TrueDivide => { - return get_truediv_field(*left, *right, arena, ctx, schema, nested) + return get_truediv_field(*left, *right, arena, ctx, schema, agg_list) }, _ => { - return get_arithmetic_field(*left, *right, arena, *op, ctx, schema, nested) + return get_arithmetic_field( + *left, *right, arena, *op, ctx, schema, agg_list, + ) }, }; Ok(field) }, - Sort { expr, .. } => arena.get(*expr).to_field_impl(schema, ctx, arena, nested), + Sort { expr, .. } => arena.get(*expr).to_field_impl(schema, ctx, arena, agg_list), Gather { expr, returns_scalar, .. } => { if *returns_scalar { - *nested = nested.saturating_sub(1); + *agg_list = false; } - arena.get(*expr).to_field_impl(schema, ctx, arena, nested) + arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false) }, - SortBy { expr, .. } => arena.get(*expr).to_field_impl(schema, ctx, arena, nested), - Filter { input, .. } => arena.get(*input).to_field_impl(schema, ctx, arena, nested), + SortBy { expr, .. } => arena.get(*expr).to_field_impl(schema, ctx, arena, agg_list), + Filter { input, .. 
} => arena + .get(*input) + .to_field_impl(schema, ctx, arena, agg_list), Agg(agg) => { use IRAggExpr::*; match agg { @@ -152,13 +166,16 @@ impl AExpr { | Min { input: expr, .. } | First(expr) | Last(expr) => { - *nested = nested.saturating_sub(1); - arena.get(*expr).to_field_impl(schema, ctx, arena, nested) + *agg_list = false; + arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false) }, Sum(expr) => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; let dt = match field.dtype() { Boolean => Some(IDX_DTYPE), UInt8 | Int8 | Int16 | UInt16 => Some(Int64), @@ -170,9 +187,10 @@ impl AExpr { Ok(field) }, Median(expr) => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; match field.dtype { Date => field.coerce(Datetime(TimeUnit::Milliseconds, None)), _ => float_type(&mut field), @@ -180,9 +198,10 @@ impl AExpr { Ok(field) }, Mean(expr) => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; match field.dtype { Date => field.coerce(Datetime(TimeUnit::Milliseconds, None)), _ => float_type(&mut field), @@ -190,69 +209,80 @@ impl AExpr { Ok(field) }, Implode(expr) => { - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; field.coerce(DataType::List(field.dtype().clone().into())); Ok(field) }, Std(expr, _) => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; float_type(&mut field); Ok(field) }, Var(expr, _) => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; float_type(&mut field); Ok(field) }, NUnique(expr) => { - *nested = 0; - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; field.coerce(IDX_DTYPE); Ok(field) }, Count(expr, _) => { - *nested = 0; - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; field.coerce(IDX_DTYPE); Ok(field) }, AggGroups(expr) => { - *nested = 1; - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = true; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; field.coerce(List(IDX_DTYPE.into())); Ok(field) }, Quantile { expr, .. 
} => { - *nested = nested.saturating_sub(1); - let mut field = - arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let mut field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; float_type(&mut field); Ok(field) }, #[cfg(feature = "bitwise")] Bitwise(expr, _) => { - *nested = nested.saturating_sub(1); - let field = arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + *agg_list = false; + let field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, &mut false)?; // @Q? Do we need to coerce here? Ok(field) }, } }, Cast { expr, dtype, .. } => { - let field = arena.get(*expr).to_field_impl(schema, ctx, arena, nested)?; + let field = arena + .get(*expr) + .to_field_impl(schema, ctx, arena, agg_list)?; Ok(Field::new(field.name().clone(), dtype.clone())) }, Ternary { truthy, falsy, .. } => { - let mut nested_truthy = *nested; - let mut nested_falsy = *nested; + let mut agg_list_truthy = *agg_list; + let mut agg_list_falsy = *agg_list; // During aggregation: // left: col(foo): list nesting: 1 @@ -261,11 +291,11 @@ impl AExpr { let mut truthy = arena .get(*truthy) - .to_field_impl(schema, ctx, arena, &mut nested_truthy)?; + .to_field_impl(schema, ctx, arena, &mut agg_list_truthy)?; let falsy = arena .get(*falsy) - .to_field_impl(schema, ctx, arena, &mut nested_falsy)?; + .to_field_impl(schema, ctx, arena, &mut agg_list_falsy)?; let st = if let DataType::Null = *truthy.dtype() { falsy.dtype().clone() @@ -273,7 +303,7 @@ impl AExpr { try_get_supertype(truthy.dtype(), falsy.dtype())? }; - *nested = std::cmp::max(nested_truthy, nested_falsy); + *agg_list = agg_list_truthy | agg_list_falsy; truthy.coerce(st); Ok(truthy) @@ -284,14 +314,14 @@ impl AExpr { options, .. } => { - let fields = func_args_to_fields(input, ctx, schema, arena, nested)?; + let fields = func_args_to_fields(input, ctx, schema, arena, agg_list)?; polars_ensure!(!fields.is_empty(), ComputeError: "expression: '{}' didn't get any inputs", options.fmt_str); let out = output_type.get_field(schema, ctx, &fields)?; if options.flags.contains(FunctionFlags::RETURNS_SCALAR) { - *nested = 0; + *agg_list = false; } else if matches!(ctx, Context::Aggregation) { - *nested += 1; + *agg_list = true; } Ok(out) @@ -301,19 +331,21 @@ impl AExpr { input, options, } => { - let fields = func_args_to_fields(input, ctx, schema, arena, nested)?; + let fields = func_args_to_fields(input, ctx, schema, arena, agg_list)?; polars_ensure!(!fields.is_empty(), ComputeError: "expression: '{}' didn't get any inputs", function); let out = function.get_field(schema, ctx, &fields)?; if options.flags.contains(FunctionFlags::RETURNS_SCALAR) { - *nested = 0; + *agg_list = false; } else if matches!(ctx, Context::Aggregation) { - *nested += 1; + *agg_list = true; } Ok(out) }, - Slice { input, .. } => arena.get(*input).to_field_impl(schema, ctx, arena, nested), + Slice { input, .. } => arena + .get(*input) + .to_field_impl(schema, ctx, arena, agg_list), } } } @@ -323,25 +355,28 @@ fn func_args_to_fields( ctx: Context, schema: &Schema, arena: &Arena, - nested: &mut u8, + agg_list: &mut bool, ) -> PolarsResult> { - let mut first = true; input .iter() + .enumerate() // Default context because `col()` would return a list in aggregation context - .map(|e| { - // Only mutate first nested as that is the dtype of the function. 
- let mut nested_tmp = *nested; - let nested = if first { - first = false; - &mut *nested - } else { - &mut nested_tmp - }; + .map(|(i, e)| { + let tmp = &mut false; arena .get(e.node()) - .to_field_impl(schema, ctx, arena, nested) + .to_field_impl( + schema, + ctx, + arena, + if i == 0 { + // Only mutate first agg_list as that is the dtype of the function. + agg_list + } else { + tmp + }, + ) .map(|mut field| { field.name = e.output_name().clone(); field @@ -357,7 +392,7 @@ fn get_arithmetic_field( op: Operator, ctx: Context, schema: &Schema, - nested: &mut u8, + agg_list: &mut bool, ) -> PolarsResult { use DataType::*; let left_ae = arena.get(left); @@ -371,11 +406,11 @@ fn get_arithmetic_field( // leading to quadratic behavior. # 4736 // // further right_type is only determined when needed. - let mut left_field = left_ae.to_field_impl(schema, ctx, arena, nested)?; + let mut left_field = left_ae.to_field_impl(schema, ctx, arena, agg_list)?; let super_type = match op { Operator::Minus => { - let right_type = right_ae.to_field_impl(schema, ctx, arena, nested)?.dtype; + let right_type = right_ae.to_field_impl(schema, ctx, arena, agg_list)?.dtype; match (&left_field.dtype, &right_type) { #[cfg(feature = "dtype-struct")] (Struct(_), Struct(_)) => { @@ -430,7 +465,7 @@ fn get_arithmetic_field( } }, Operator::Plus => { - let right_type = right_ae.to_field_impl(schema, ctx, arena, nested)?.dtype; + let right_type = right_ae.to_field_impl(schema, ctx, arena, agg_list)?.dtype; match (&left_field.dtype, &right_type) { (Duration(_), Datetime(_, _)) | (Datetime(_, _), Duration(_)) @@ -472,7 +507,7 @@ fn get_arithmetic_field( } }, _ => { - let right_type = right_ae.to_field_impl(schema, ctx, arena, nested)?.dtype; + let right_type = right_ae.to_field_impl(schema, ctx, arena, agg_list)?.dtype; match (&left_field.dtype, &right_type) { #[cfg(feature = "dtype-struct")] @@ -558,10 +593,14 @@ fn get_truediv_field( arena: &Arena, ctx: Context, schema: &Schema, - nested: &mut u8, + agg_list: &mut bool, ) -> PolarsResult { - let mut left_field = arena.get(left).to_field_impl(schema, ctx, arena, nested)?; - let right_field = arena.get(right).to_field_impl(schema, ctx, arena, nested)?; + let mut left_field = arena + .get(left) + .to_field_impl(schema, ctx, arena, agg_list)?; + let right_field = arena + .get(right) + .to_field_impl(schema, ctx, arena, agg_list)?; use DataType::*; // TODO: Re-investigate this. A lot of "_" is being used on the RHS match because this code diff --git a/crates/polars-plan/src/plans/optimizer/mod.rs b/crates/polars-plan/src/plans/optimizer/mod.rs index 70880ca78359..dc0d330d8b86 100644 --- a/crates/polars-plan/src/plans/optimizer/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/mod.rs @@ -89,6 +89,7 @@ pub fn optimize( let simplify_expr = opt_state.contains(OptFlags::SIMPLIFY_EXPR); let slice_pushdown = opt_state.contains(OptFlags::SLICE_PUSHDOWN); let streaming = opt_state.contains(OptFlags::STREAMING); + let new_streaming = opt_state.contains(OptFlags::NEW_STREAMING); let fast_projection = opt_state.contains(OptFlags::FAST_PROJECTION); // Don't run optimizations that don't make sense on a single node. 
@@ -181,7 +182,7 @@ pub fn optimize( } if slice_pushdown { - let slice_pushdown_opt = SlicePushDown::new(streaming); + let slice_pushdown_opt = SlicePushDown::new(streaming, new_streaming); let alp = lp_arena.take(lp_top); let alp = slice_pushdown_opt.optimize(alp, lp_arena, expr_arena)?; diff --git a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs index 9c2f8497fac8..a5ff806abae9 100644 --- a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs +++ b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs @@ -5,6 +5,7 @@ use crate::prelude::*; pub(super) struct SlicePushDown { streaming: bool, + new_streaming: bool, pub scratch: Vec, } @@ -59,9 +60,10 @@ fn can_pushdown_slice_past_projections(exprs: &[ExprIR], arena: &Arena) - } impl SlicePushDown { - pub(super) fn new(streaming: bool) -> Self { + pub(super) fn new(streaming: bool, new_streaming: bool) -> Self { Self { streaming, + new_streaming, scratch: vec![], } } @@ -211,6 +213,32 @@ impl SlicePushDown { Ok(lp) }, + + #[cfg(feature = "ipc")] + (Scan { + sources, + file_info, + hive_parts, + output_schema, + mut file_options, + predicate, + scan_type: scan_type @ FileScan::Ipc { .. }, + }, Some(state)) if self.new_streaming && predicate.is_none() => { + file_options.slice = Some((state.offset, state.len as usize)); + + let lp = Scan { + sources, + file_info, + hive_parts, + output_schema, + scan_type, + file_options, + predicate, + }; + + Ok(lp) + }, + // TODO! we currently skip slice pushdown if there is a predicate. (Scan { sources, diff --git a/crates/polars-python/src/datatypes.rs b/crates/polars-python/src/datatypes.rs index a31a2301f866..ea7686a29ec6 100644 --- a/crates/polars-python/src/datatypes.rs +++ b/crates/polars-python/src/datatypes.rs @@ -1,10 +1,12 @@ use polars::prelude::*; use polars_core::utils::arrow::array::Utf8ViewArray; +use polars_lazy::dsl; use pyo3::prelude::*; +use crate::error::PyPolarsErr; #[cfg(feature = "object")] use crate::object::OBJECT_NAME; -use crate::Wrap; +use crate::{PyExpr, Wrap}; // Don't change the order of these! 
#[repr(u8)] @@ -117,3 +119,15 @@ impl<'py> FromPyObject<'py> for PyDataType { Ok(dt.0.into()) } } + +#[pyfunction] +pub fn _get_dtype_max(dt: Wrap) -> PyResult { + let v = dt.0.max().map_err(PyPolarsErr::from)?; + Ok(dsl::lit(v).into()) +} + +#[pyfunction] +pub fn _get_dtype_min(dt: Wrap) -> PyResult { + let v = dt.0.min().map_err(PyPolarsErr::from)?; + Ok(dsl::lit(v).into()) +} diff --git a/crates/polars-python/src/expr/general.rs b/crates/polars-python/src/expr/general.rs index 7125388e88cd..fe5fdafdbbb8 100644 --- a/crates/polars-python/src/expr/general.rs +++ b/crates/polars-python/src/expr/general.rs @@ -260,6 +260,7 @@ impl PyExpr { nulls_last, multithreaded: true, maintain_order: false, + limit: None, }) .into() } @@ -272,6 +273,7 @@ impl PyExpr { nulls_last, multithreaded: true, maintain_order: false, + limit: None, }) .into() } @@ -349,6 +351,7 @@ impl PyExpr { nulls_last, multithreaded, maintain_order, + limit: None, }, ) .into() diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index d3ebb376d10f..1c4e738ea69c 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ b/crates/polars-python/src/functions/lazy.rs @@ -75,6 +75,7 @@ pub fn arg_sort_by( nulls_last, multithreaded, maintain_order, + limit: None, }, ) .into() diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index fd89884ece82..f9fb740d4cae 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -539,6 +539,7 @@ impl PyLazyFrame { nulls_last: vec![nulls_last], multithreaded, maintain_order, + limit: None, }, ) .into() @@ -561,6 +562,7 @@ impl PyLazyFrame { nulls_last, maintain_order, multithreaded, + limit: None, }, ) .into() diff --git a/crates/polars-python/src/series/aggregation.rs b/crates/polars-python/src/series/aggregation.rs index 5aa8ee16639e..c4fe8d3447ec 100644 --- a/crates/polars-python/src/series/aggregation.rs +++ b/crates/polars-python/src/series/aggregation.rs @@ -8,37 +8,39 @@ use crate::error::PyPolarsErr; #[pymethods] impl PySeries { - fn any(&self, ignore_nulls: bool) -> PyResult> { - let s = self.series.bool().map_err(PyPolarsErr::from)?; - Ok(if ignore_nulls { - Some(s.any()) - } else { - s.any_kleene() + fn any(&self, py: Python, ignore_nulls: bool) -> PyResult> { + py.allow_threads(|| { + let s = self.series.bool().map_err(PyPolarsErr::from)?; + Ok(if ignore_nulls { + Some(s.any()) + } else { + s.any_kleene() + }) }) } - fn all(&self, ignore_nulls: bool) -> PyResult> { - let s = self.series.bool().map_err(PyPolarsErr::from)?; - Ok(if ignore_nulls { - Some(s.all()) - } else { - s.all_kleene() + fn all(&self, py: Python, ignore_nulls: bool) -> PyResult> { + py.allow_threads(|| { + let s = self.series.bool().map_err(PyPolarsErr::from)?; + Ok(if ignore_nulls { + Some(s.all()) + } else { + s.all_kleene() + }) }) } - fn arg_max(&self) -> Option { - self.series.arg_max() + fn arg_max(&self, py: Python) -> Option { + py.allow_threads(|| self.series.arg_max()) } - fn arg_min(&self) -> Option { - self.series.arg_min() + fn arg_min(&self, py: Python) -> Option { + py.allow_threads(|| self.series.arg_min()) } fn max(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .max_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.max_reduce().map_err(PyPolarsErr::from))? 
.as_any_value(), ) .into_py(py)) @@ -47,49 +49,42 @@ impl PySeries { fn mean(&self, py: Python) -> PyResult { match self.series.dtype() { Boolean => Ok(Wrap( - self.series - .cast(&DataType::UInt8) - .unwrap() - .mean_reduce() + py.allow_threads(|| self.series.cast(&DataType::UInt8).unwrap().mean_reduce()) .as_any_value(), ) .into_py(py)), // For non-numeric output types we require mean_reduce. - dt if dt.is_temporal() => { - Ok(Wrap(self.series.mean_reduce().as_any_value()).into_py(py)) - }, - _ => Ok(self.series.mean().into_py(py)), + dt if dt.is_temporal() => Ok(Wrap( + py.allow_threads(|| self.series.mean_reduce()) + .as_any_value(), + ) + .into_py(py)), + _ => Ok(py.allow_threads(|| self.series.mean()).into_py(py)), } } fn median(&self, py: Python) -> PyResult { match self.series.dtype() { Boolean => Ok(Wrap( - self.series - .cast(&DataType::UInt8) - .unwrap() - .median_reduce() + py.allow_threads(|| self.series.cast(&DataType::UInt8).unwrap().median_reduce()) .map_err(PyPolarsErr::from)? .as_any_value(), ) .into_py(py)), // For non-numeric output types we require median_reduce. dt if dt.is_temporal() => Ok(Wrap( - self.series - .median_reduce() + py.allow_threads(|| self.series.median_reduce()) .map_err(PyPolarsErr::from)? .as_any_value(), ) .into_py(py)), - _ => Ok(self.series.median().into_py(py)), + _ => Ok(py.allow_threads(|| self.series.median()).into_py(py)), } } fn min(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .min_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.min_reduce().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -97,26 +92,27 @@ impl PySeries { fn product(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .product() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.product().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) } - fn quantile(&self, quantile: f64, interpolation: Wrap) -> PyResult { - let bind = self.series.quantile_reduce(quantile, interpolation.0); + fn quantile( + &self, + py: Python, + quantile: f64, + interpolation: Wrap, + ) -> PyResult { + let bind = py.allow_threads(|| self.series.quantile_reduce(quantile, interpolation.0)); let sc = bind.map_err(PyPolarsErr::from)?; - Ok(Python::with_gil(|py| Wrap(sc.as_any_value()).into_py(py))) + Ok(Wrap(sc.as_any_value()).into_py(py)) } fn std(&self, py: Python, ddof: u8) -> PyResult { Ok(Wrap( - self.series - .std_reduce(ddof) - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.std_reduce(ddof).map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -124,9 +120,7 @@ impl PySeries { fn var(&self, py: Python, ddof: u8) -> PyResult { Ok(Wrap( - self.series - .var_reduce(ddof) - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.var_reduce(ddof).map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -134,37 +128,31 @@ impl PySeries { fn sum(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .sum_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.sum_reduce().map_err(PyPolarsErr::from))? 
.as_any_value(), ) .into_py(py)) } fn first(&self, py: Python) -> PyObject { - Wrap(self.series.first().as_any_value()).into_py(py) + Wrap(py.allow_threads(|| self.series.first()).as_any_value()).into_py(py) } fn last(&self, py: Python) -> PyObject { - Wrap(self.series.last().as_any_value()).into_py(py) + Wrap(py.allow_threads(|| self.series.last()).as_any_value()).into_py(py) } #[cfg(feature = "approx_unique")] fn approx_n_unique(&self, py: Python) -> PyResult { - Ok(self - .series - .approx_n_unique() - .map_err(PyPolarsErr::from)? + Ok(py + .allow_threads(|| self.series.approx_n_unique().map_err(PyPolarsErr::from))? .into_py(py)) } #[cfg(feature = "bitwise")] fn bitwise_and(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .and_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.and_reduce().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -173,9 +161,7 @@ impl PySeries { #[cfg(feature = "bitwise")] fn bitwise_or(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .or_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.or_reduce().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) @@ -184,9 +170,7 @@ impl PySeries { #[cfg(feature = "bitwise")] fn bitwise_xor(&self, py: Python) -> PyResult { Ok(Wrap( - self.series - .xor_reduce() - .map_err(PyPolarsErr::from)? + py.allow_threads(|| self.series.xor_reduce().map_err(PyPolarsErr::from))? .as_any_value(), ) .into_py(py)) diff --git a/crates/polars-python/src/series/arithmetic.rs b/crates/polars-python/src/series/arithmetic.rs index c5483aced1e7..62edd00a7656 100644 --- a/crates/polars-python/src/series/arithmetic.rs +++ b/crates/polars-python/src/series/arithmetic.rs @@ -6,28 +6,33 @@ use crate::error::PyPolarsErr; #[pymethods] impl PySeries { - fn add(&self, other: &PySeries) -> PyResult { - Ok((&self.series + &other.series) + fn add(&self, py: Python, other: &PySeries) -> PyResult { + Ok(py + .allow_threads(|| &self.series + &other.series) .map(Into::into) .map_err(PyPolarsErr::from)?) } - fn sub(&self, other: &PySeries) -> PyResult { - Ok((&self.series - &other.series) + fn sub(&self, py: Python, other: &PySeries) -> PyResult { + Ok(py + .allow_threads(|| &self.series - &other.series) .map(Into::into) .map_err(PyPolarsErr::from)?) } - fn div(&self, other: &PySeries) -> PyResult { - Ok((&self.series / &other.series) + fn div(&self, py: Python, other: &PySeries) -> PyResult { + Ok(py + .allow_threads(|| &self.series / &other.series) .map(Into::into) .map_err(PyPolarsErr::from)?) } - fn mul(&self, other: &PySeries) -> PyResult { - Ok((&self.series * &other.series) + fn mul(&self, py: Python, other: &PySeries) -> PyResult { + Ok(py + .allow_threads(|| &self.series * &other.series) .map(Into::into) .map_err(PyPolarsErr::from)?) } - fn rem(&self, other: &PySeries) -> PyResult { - Ok((&self.series % &other.series) + fn rem(&self, py: Python, other: &PySeries) -> PyResult { + Ok(py + .allow_threads(|| &self.series % &other.series) .map(Into::into) .map_err(PyPolarsErr::from)?) } @@ -37,8 +42,8 @@ macro_rules! impl_arithmetic { ($name:ident, $type:ty, $operand:tt) => { #[pymethods] impl PySeries { - fn $name(&self, other: $type) -> PyResult { - Ok((&self.series $operand other).into()) + fn $name(&self, py: Python, other: $type) -> PyResult { + Ok(py.allow_threads(|| {&self.series $operand other}).into()) } } }; @@ -103,8 +108,8 @@ macro_rules! 
impl_rhs_arithmetic { ($name:ident, $type:ty, $operand:ident) => { #[pymethods] impl PySeries { - fn $name(&self, other: $type) -> PyResult { - Ok(other.$operand(&self.series).into()) + fn $name(&self, py: Python, other: $type) -> PyResult { + Ok(py.allow_threads(|| other.$operand(&self.series)).into()) } } }; diff --git a/crates/polars-python/src/series/buffers.rs b/crates/polars-python/src/series/buffers.rs index 939159220277..e3b9402d4d47 100644 --- a/crates/polars-python/src/series/buffers.rs +++ b/crates/polars-python/src/series/buffers.rs @@ -82,9 +82,9 @@ impl PySeries { } /// Return the underlying values, validity, and offsets buffers as Series. - fn _get_buffers(&self) -> PyResult<(Self, Option, Option)> { + fn _get_buffers(&self, py: Python) -> PyResult<(Self, Option, Option)> { let s = &self.series; - match s.dtype().to_physical() { + py.allow_threads(|| match s.dtype().to_physical() { dt if dt.is_numeric() => get_buffers_from_primitive(s), DataType::Boolean => get_buffers_from_primitive(s), DataType::String => get_buffers_from_string(s), @@ -92,7 +92,7 @@ impl PySeries { let msg = format!("`_get_buffers` not implemented for `dtype` {dt}"); Err(PyTypeError::new_err(msg)) }, - } + }) } } @@ -253,6 +253,7 @@ impl PySeries { #[staticmethod] #[pyo3(signature = (dtype, data, validity=None))] unsafe fn _from_buffers( + py: Python, dtype: Wrap, data: Vec, validity: Option, @@ -320,7 +321,7 @@ impl PySeries { )), }; let values = series_to_buffer::(values); - from_buffers_string_impl(values, validity, offsets)? + py.allow_threads(|| from_buffers_string_impl(values, validity, offsets))? }, dt => { let msg = format!("`_from_buffers` not implemented for `dtype` {dt}"); diff --git a/crates/polars-python/src/series/comparison.rs b/crates/polars-python/src/series/comparison.rs index 7064edb7698a..2b7de37931f9 100644 --- a/crates/polars-python/src/series/comparison.rs +++ b/crates/polars-python/src/series/comparison.rs @@ -6,36 +6,45 @@ use crate::PySeries; #[pymethods] impl PySeries { - fn eq(&self, rhs: &PySeries) -> PyResult { - let s = self.series.equal(&rhs.series).map_err(PyPolarsErr::from)?; + fn eq(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.equal(&rhs.series)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } - fn neq(&self, rhs: &PySeries) -> PyResult { - let s = self - .series - .not_equal(&rhs.series) + fn neq(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.not_equal(&rhs.series)) .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } - fn gt(&self, rhs: &PySeries) -> PyResult { - let s = self.series.gt(&rhs.series).map_err(PyPolarsErr::from)?; + fn gt(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.gt(&rhs.series)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } - fn gt_eq(&self, rhs: &PySeries) -> PyResult { - let s = self.series.gt_eq(&rhs.series).map_err(PyPolarsErr::from)?; + fn gt_eq(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.gt_eq(&rhs.series)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } - fn lt(&self, rhs: &PySeries) -> PyResult { - let s = self.series.lt(&rhs.series).map_err(PyPolarsErr::from)?; + fn lt(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.lt(&rhs.series)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } - fn lt_eq(&self, rhs: &PySeries) -> PyResult { - let s = 
self.series.lt_eq(&rhs.series).map_err(PyPolarsErr::from)?; + fn lt_eq(&self, py: Python, rhs: &PySeries) -> PyResult { + let s = py + .allow_threads(|| self.series.lt_eq(&rhs.series)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -44,8 +53,10 @@ macro_rules! impl_eq_num { ($name:ident, $type:ty) => { #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.equal(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.equal(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -69,8 +80,10 @@ macro_rules! impl_neq_num { #[allow(clippy::nonstandard_macro_braces)] #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.not_equal(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.not_equal(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -93,8 +106,10 @@ macro_rules! impl_gt_num { ($name:ident, $type:ty) => { #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.gt(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.gt(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -117,8 +132,10 @@ macro_rules! impl_gt_eq_num { ($name:ident, $type:ty) => { #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.gt_eq(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.gt_eq(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -142,8 +159,10 @@ macro_rules! impl_lt_num { #[allow(clippy::nonstandard_macro_braces)] #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.lt(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.lt(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -166,8 +185,10 @@ macro_rules! impl_lt_eq_num { ($name:ident, $type:ty) => { #[pymethods] impl PySeries { - fn $name(&self, rhs: $type) -> PyResult { - let s = self.series.lt_eq(rhs).map_err(PyPolarsErr::from)?; + fn $name(&self, py: Python, rhs: $type) -> PyResult { + let s = py + .allow_threads(|| self.series.lt_eq(rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } @@ -226,12 +247,14 @@ macro_rules! 
impl_decimal { ($name:ident, $method:ident) => { #[pymethods] impl PySeries { - fn $name(&self, rhs: PyDecimal) -> PyResult { + fn $name(&self, py: Python, rhs: PyDecimal) -> PyResult { let rhs = Series::new( PlSmallStr::from_static("decimal"), &[AnyValue::Decimal(rhs.0, rhs.1)], ); - let s = self.series.$method(&rhs).map_err(PyPolarsErr::from)?; + let s = py + .allow_threads(|| self.series.$method(&rhs)) + .map_err(PyPolarsErr::from)?; Ok(s.into_series().into()) } } diff --git a/crates/polars-python/src/series/construction.rs b/crates/polars-python/src/series/construction.rs index 5935f1e7b0ce..e9dbdf264d8c 100644 --- a/crates/polars-python/src/series/construction.rs +++ b/crates/polars-python/src/series/construction.rs @@ -71,10 +71,11 @@ impl PySeries { if nan_is_null { let array = array.readonly(); let vals = array.as_slice().unwrap(); - let ca: Float32Chunked = vals - .iter() - .map(|&val| if f32::is_nan(val) { None } else { Some(val) }) - .collect_trusted(); + let ca: Float32Chunked = py.allow_threads(|| { + vals.iter() + .map(|&val| if f32::is_nan(val) { None } else { Some(val) }) + .collect_trusted() + }); ca.with_name(name.into()).into_series().into() } else { mmap_numpy_array(py, name, array) @@ -86,10 +87,11 @@ impl PySeries { if nan_is_null { let array = array.readonly(); let vals = array.as_slice().unwrap(); - let ca: Float64Chunked = vals - .iter() - .map(|&val| if f64::is_nan(val) { None } else { Some(val) }) - .collect_trusted(); + let ca: Float64Chunked = py.allow_threads(|| { + vals.iter() + .map(|&val| if f64::is_nan(val) { None } else { Some(val) }) + .collect_trusted() + }); ca.with_name(name.into()).into_series().into() } else { mmap_numpy_array(py, name, array) diff --git a/crates/polars-python/src/series/export.rs b/crates/polars-python/src/series/export.rs index 886b6114427a..959b2dd47293 100644 --- a/crates/polars-python/src/series/export.rs +++ b/crates/polars-python/src/series/export.rs @@ -147,17 +147,11 @@ impl PySeries { /// Return the underlying Arrow array. 
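/// The Series is rechunked in place first so that chunk 0 spans all values.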
#[allow(clippy::wrong_self_convention)] - fn to_arrow(&mut self, compat_level: PyCompatLevel) -> PyResult { - self.rechunk(true); - Python::with_gil(|py| { - let pyarrow = py.import_bound("pyarrow")?; + fn to_arrow(&mut self, py: Python, compat_level: PyCompatLevel) -> PyResult { + self.rechunk(py, true); + let pyarrow = py.import_bound("pyarrow")?; - interop::arrow::to_py::to_py_array( - self.series.to_arrow(0, compat_level.0), - py, - &pyarrow, - ) - }) + interop::arrow::to_py::to_py_array(self.series.to_arrow(0, compat_level.0), py, &pyarrow) } #[allow(unused_variables)] diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs index b14285e77aa0..3134f5354f09 100644 --- a/crates/polars-python/src/series/general.rs +++ b/crates/polars-python/src/series/general.rs @@ -16,9 +16,9 @@ use crate::py_modules::POLARS; #[pymethods] impl PySeries { - fn struct_unnest(&self) -> PyResult { + fn struct_unnest(&self, py: Python) -> PyResult { let ca = self.series.struct_().map_err(PyPolarsErr::from)?; - let df: DataFrame = ca.clone().unnest(); + let df: DataFrame = py.allow_threads(|| ca.clone().unnest()); Ok(df.into()) } @@ -56,9 +56,9 @@ impl PySeries { Ok(ca.get_rev_map().is_local()) } - pub fn cat_to_local(&self) -> PyResult { + pub fn cat_to_local(&self, py: Python) -> PyResult { let ca = self.series.categorical().map_err(PyPolarsErr::from)?; - Ok(ca.to_local().into_series().into()) + Ok(py.allow_threads(|| ca.to_local().into_series().into())) } fn estimated_size(&self) -> usize { @@ -78,15 +78,14 @@ impl PySeries { } #[cfg(feature = "dtype-array")] - fn reshape(&self, dims: Vec) -> PyResult { + fn reshape(&self, py: Python, dims: Vec) -> PyResult { let dims = dims .into_iter() .map(ReshapeDimension::new) .collect::>(); - let out = self - .series - .reshape_array(&dims) + let out = py + .allow_threads(|| self.series.reshape_array(&dims)) .map_err(PyPolarsErr::from)?; Ok(out.into()) } @@ -114,8 +113,8 @@ impl PySeries { } } - pub fn rechunk(&mut self, in_place: bool) -> Option { - let series = self.series.rechunk(); + pub fn rechunk(&mut self, py: Python, in_place: bool) -> Option { + let series = py.allow_threads(|| self.series.rechunk()); if in_place { self.series = series; None @@ -167,16 +166,23 @@ impl PySeries { self.get_index(py, index) } - fn bitand(&self, other: &PySeries) -> PyResult { - let out = (&self.series & &other.series).map_err(PyPolarsErr::from)?; + fn bitand(&self, py: Python, other: &PySeries) -> PyResult { + let out = py + .allow_threads(|| &self.series & &other.series) + .map_err(PyPolarsErr::from)?; Ok(out.into()) } - fn bitor(&self, other: &PySeries) -> PyResult { - let out = (&self.series | &other.series).map_err(PyPolarsErr::from)?; + + fn bitor(&self, py: Python, other: &PySeries) -> PyResult { + let out = py + .allow_threads(|| &self.series | &other.series) + .map_err(PyPolarsErr::from)?; Ok(out.into()) } - fn bitxor(&self, other: &PySeries) -> PyResult { - let out = (&self.series ^ &other.series).map_err(PyPolarsErr::from)?; + fn bitxor(&self, py: Python, other: &PySeries) -> PyResult { + let out = py + .allow_threads(|| &self.series ^ &other.series) + .map_err(PyPolarsErr::from)?; Ok(out.into()) } @@ -217,48 +223,58 @@ impl PySeries { Ok(()) } - fn extend(&mut self, other: &PySeries) -> PyResult<()> { - self.series - .extend(&other.series) + fn extend(&mut self, py: Python, other: &PySeries) -> PyResult<()> { + py.allow_threads(|| self.series.extend(&other.series)) .map_err(PyPolarsErr::from)?; Ok(()) } - fn 
new_from_index(&self, index: usize, length: usize) -> PyResult { + fn new_from_index(&self, py: Python, index: usize, length: usize) -> PyResult { if index >= self.series.len() { Err(PyValueError::new_err("index is out of bounds")) } else { - Ok(self.series.new_from_index(index, length).into()) + Ok(py.allow_threads(|| self.series.new_from_index(index, length).into())) } } - fn filter(&self, filter: &PySeries) -> PyResult { + fn filter(&self, py: Python, filter: &PySeries) -> PyResult { let filter_series = &filter.series; if let Ok(ca) = filter_series.bool() { - let series = self.series.filter(ca).map_err(PyPolarsErr::from)?; + let series = py + .allow_threads(|| self.series.filter(ca)) + .map_err(PyPolarsErr::from)?; Ok(PySeries { series }) } else { Err(PyRuntimeError::new_err("Expected a boolean mask")) } } - fn sort(&mut self, descending: bool, nulls_last: bool, multithreaded: bool) -> PyResult { - Ok(self - .series - .sort( - SortOptions::default() - .with_order_descending(descending) - .with_nulls_last(nulls_last) - .with_multithreaded(multithreaded), - ) + fn sort( + &mut self, + py: Python, + descending: bool, + nulls_last: bool, + multithreaded: bool, + ) -> PyResult { + Ok(py + .allow_threads(|| { + self.series.sort( + SortOptions::default() + .with_order_descending(descending) + .with_nulls_last(nulls_last) + .with_multithreaded(multithreaded), + ) + }) .map_err(PyPolarsErr::from)? .into()) } - fn gather_with_series(&self, indices: &PySeries) -> PyResult { - let indices = indices.series.idx().map_err(PyPolarsErr::from)?; - let s = self.series.take(indices).map_err(PyPolarsErr::from)?; - Ok(s.into()) + fn gather_with_series(&self, py: Python, indices: &PySeries) -> PyResult { + py.allow_threads(|| { + let indices = indices.series.idx().map_err(PyPolarsErr::from)?; + let s = self.series.take(indices).map_err(PyPolarsErr::from)?; + Ok(s.into()) + }) } fn null_count(&self) -> PyResult { @@ -271,6 +287,7 @@ impl PySeries { fn equals( &self, + py: Python, other: &PySeries, check_dtypes: bool, check_names: bool, @@ -283,9 +300,9 @@ impl PySeries { return false; } if null_equal { - self.series.equals_missing(&other.series) + py.allow_threads(|| self.series.equals_missing(&other.series)) } else { - self.series.equals(&other.series) + py.allow_threads(|| self.series.equals(&other.series)) } } @@ -300,8 +317,10 @@ impl PySeries { /// Rechunk and return a pointer to the start of the Series. 
/// Only implemented for numeric types - fn as_single_ptr(&mut self) -> PyResult { - let ptr = self.series.as_single_ptr().map_err(PyPolarsErr::from)?; + fn as_single_ptr(&mut self, py: Python) -> PyResult { + let ptr = py + .allow_threads(|| self.series.as_single_ptr()) + .map_err(PyPolarsErr::from)?; Ok(ptr) } @@ -309,20 +328,23 @@ impl PySeries { self.series.clone().into() } - fn zip_with(&self, mask: &PySeries, other: &PySeries) -> PyResult { + fn zip_with(&self, py: Python, mask: &PySeries, other: &PySeries) -> PyResult { let mask = mask.series.bool().map_err(PyPolarsErr::from)?; - let s = self - .series - .zip_with(mask, &other.series) + let s = py + .allow_threads(|| self.series.zip_with(mask, &other.series)) .map_err(PyPolarsErr::from)?; Ok(s.into()) } #[pyo3(signature = (separator, drop_first=false))] - fn to_dummies(&self, separator: Option<&str>, drop_first: bool) -> PyResult { - let df = self - .series - .to_dummies(separator, drop_first) + fn to_dummies( + &self, + py: Python, + separator: Option<&str>, + drop_first: bool, + ) -> PyResult { + let df = py + .allow_threads(|| self.series.to_dummies(separator, drop_first)) .map_err(PyPolarsErr::from)?; Ok(df.into()) } @@ -332,18 +354,22 @@ impl PySeries { Some(ca.get_as_series(index)?.into()) } - fn n_unique(&self) -> PyResult { - let n = self.series.n_unique().map_err(PyPolarsErr::from)?; + fn n_unique(&self, py: Python) -> PyResult { + let n = py + .allow_threads(|| self.series.n_unique()) + .map_err(PyPolarsErr::from)?; Ok(n) } - fn floor(&self) -> PyResult { - let s = self.series.floor().map_err(PyPolarsErr::from)?; + fn floor(&self, py: Python) -> PyResult { + let s = py + .allow_threads(|| self.series.floor()) + .map_err(PyPolarsErr::from)?; Ok(s.into()) } - fn shrink_to_fit(&mut self) { - self.series.shrink_to_fit(); + fn shrink_to_fit(&mut self, py: Python) { + py.allow_threads(|| self.series.shrink_to_fit()); } fn dot(&self, other: &PySeries, py: Python) -> PyResult { @@ -358,15 +384,11 @@ impl PySeries { } let result: AnyValue = if lhs_dtype.is_float() || rhs_dtype.is_float() { - (&self.series * &other.series) - .map_err(PyPolarsErr::from)? - .sum::() + py.allow_threads(|| (&self.series * &other.series)?.sum::()) .map_err(PyPolarsErr::from)? .into() } else { - (&self.series * &other.series) - .map_err(PyPolarsErr::from)? - .sum::() + py.allow_threads(|| (&self.series * &other.series)?.sum::()) .map_err(PyPolarsErr::from)? 
.into() }; @@ -413,20 +435,27 @@ impl PySeries { } } - fn skew(&self, bias: bool) -> PyResult> { - let out = self.series.skew(bias).map_err(PyPolarsErr::from)?; + fn skew(&self, py: Python, bias: bool) -> PyResult> { + let out = py + .allow_threads(|| self.series.skew(bias)) + .map_err(PyPolarsErr::from)?; Ok(out) } - fn kurtosis(&self, fisher: bool, bias: bool) -> PyResult> { - let out = self - .series - .kurtosis(fisher, bias) + fn kurtosis(&self, py: Python, fisher: bool, bias: bool) -> PyResult> { + let out = py + .allow_threads(|| self.series.kurtosis(fisher, bias)) .map_err(PyPolarsErr::from)?; Ok(out) } - fn cast(&self, dtype: Wrap, strict: bool, wrap_numerical: bool) -> PyResult { + fn cast( + &self, + py: Python, + dtype: Wrap, + strict: bool, + wrap_numerical: bool, + ) -> PyResult { let options = if wrap_numerical { CastOptions::Overflowing } else if strict { @@ -436,7 +465,7 @@ impl PySeries { }; let dtype = dtype.0; - let out = self.series.cast_with_options(&dtype, options); + let out = py.allow_threads(|| self.series.cast_with_options(&dtype, options)); let out = out.map_err(PyPolarsErr::from)?; Ok(out.into()) } @@ -451,38 +480,44 @@ impl PySeries { }) } - fn is_sorted(&self, descending: bool, nulls_last: bool) -> PyResult { + fn is_sorted(&self, py: Python, descending: bool, nulls_last: bool) -> PyResult { let options = SortOptions { descending, nulls_last, multithreaded: true, maintain_order: false, + limit: None, }; - Ok(self.series.is_sorted(options).map_err(PyPolarsErr::from)?) + Ok(py + .allow_threads(|| self.series.is_sorted(options)) + .map_err(PyPolarsErr::from)?) } fn clear(&self) -> Self { self.series.clear().into() } - fn head(&self, n: usize) -> Self { - self.series.head(Some(n)).into() + fn head(&self, py: Python, n: usize) -> Self { + py.allow_threads(|| self.series.head(Some(n))).into() } - fn tail(&self, n: usize) -> Self { - self.series.tail(Some(n)).into() + fn tail(&self, py: Python, n: usize) -> Self { + py.allow_threads(|| self.series.tail(Some(n))).into() } fn value_counts( &self, + py: Python, sort: bool, parallel: bool, name: String, normalize: bool, ) -> PyResult { - let out = self - .series - .value_counts(sort, parallel, name.into(), normalize) + let out = py + .allow_threads(|| { + self.series + .value_counts(sort, parallel, name.into(), normalize) + }) .map_err(PyPolarsErr::from)?; Ok(out.into()) } @@ -493,8 +528,10 @@ impl PySeries { self.series.slice(offset, length).into() } - pub fn not_(&self) -> PyResult { - let out = polars_ops::series::negate_bitwise(&self.series).map_err(PyPolarsErr::from)?; + pub fn not_(&self, py: Python) -> PyResult { + let out = py + .allow_threads(|| polars_ops::series::negate_bitwise(&self.series)) + .map_err(PyPolarsErr::from)?; Ok(out.into()) } } @@ -515,8 +552,15 @@ macro_rules! 
impl_set_with_mask { #[pymethods] impl PySeries { #[pyo3(signature = (filter, value))] - fn $name(&self, filter: &PySeries, value: Option<$native>) -> PyResult { - let series = $name(&self.series, filter, value).map_err(PyPolarsErr::from)?; + fn $name( + &self, + py: Python, + filter: &PySeries, + value: Option<$native>, + ) -> PyResult { + let series = py + .allow_threads(|| $name(&self.series, filter, value)) + .map_err(PyPolarsErr::from)?; Ok(Self::new(series)) } } diff --git a/crates/polars-python/src/series/scatter.rs b/crates/polars-python/src/series/scatter.rs index 97df60ef205b..798cd189a9b6 100644 --- a/crates/polars-python/src/series/scatter.rs +++ b/crates/polars-python/src/series/scatter.rs @@ -7,11 +7,12 @@ use crate::error::PyPolarsErr; #[pymethods] impl PySeries { - fn scatter(&mut self, idx: PySeries, values: PySeries) -> PyResult<()> { + fn scatter(&mut self, py: Python, idx: PySeries, values: PySeries) -> PyResult<()> { // we take the value because we want a ref count of 1 so that we can // have mutable access cheaply via _get_inner_mut(). let s = std::mem::take(&mut self.series); - match scatter(s, &idx.series, &values.series) { + let result = py.allow_threads(|| scatter(s, &idx.series, &values.series)); + match result { Ok(out) => { self.series = out; Ok(()) diff --git a/crates/polars-sql/src/sql_expr.rs b/crates/polars-sql/src/sql_expr.rs index 9e068efb6064..a2ada46e1c68 100644 --- a/crates/polars-sql/src/sql_expr.rs +++ b/crates/polars-sql/src/sql_expr.rs @@ -919,7 +919,7 @@ impl SQLExprVisitor<'_> { } let else_res = match else_result { Some(else_res) => self.visit_expr(else_res)?, - None => polars_bail!(SQLSyntax: "ELSE expression is required"), + None => lit(Null), // ELSE clause is optional; when omitted, it is implicitly NULL }; if let Some(operand_expr) = operand { let first_operand_expr = self.visit_expr(operand_expr)?; diff --git a/crates/polars-stream/Cargo.toml b/crates/polars-stream/Cargo.toml index c40f477ff741..f0b3b1c30e35 100644 --- a/crates/polars-stream/Cargo.toml +++ b/crates/polars-stream/Cargo.toml @@ -28,6 +28,7 @@ polars-core = { workspace = true } polars-error = { workspace = true } polars-expr = { workspace = true } polars-mem-engine = { workspace = true } +polars-ops = { workspace = true } polars-parquet = { workspace = true } polars-plan = { workspace = true } diff --git a/crates/polars-stream/src/nodes/io_sources/ipc.rs b/crates/polars-stream/src/nodes/io_sources/ipc.rs new file mode 100644 index 000000000000..3a83c8e3132c --- /dev/null +++ b/crates/polars-stream/src/nodes/io_sources/ipc.rs @@ -0,0 +1,557 @@ +use std::cmp::Reverse; +use std::io::Cursor; +use std::ops::Range; +use std::sync::Arc; + +use polars_core::config; +use polars_core::frame::DataFrame; +use polars_core::prelude::{Column, DataType}; +use polars_core::scalar::Scalar; +use polars_core::utils::arrow::array::TryExtend; +use polars_core::utils::arrow::io::ipc::read::{ + prepare_projection, read_file_metadata, FileMetadata, FileReader, ProjectionInfo, +}; +use polars_error::{ErrString, PolarsError, PolarsResult}; +use polars_expr::prelude::PhysicalExpr; +use polars_expr::state::ExecutionState; +use polars_io::cloud::CloudOptions; +use polars_io::ipc::IpcScanOptions; +use polars_io::utils::columns_to_projection; +use polars_io::RowIndex; +use polars_plan::plans::hive::HivePartitions; +use polars_plan::plans::{FileInfo, ScanSources}; +use polars_plan::prelude::FileScanOptions; +use polars_utils::mmap::MemSlice; +use polars_utils::pl_str::PlSmallStr; +use 
polars_utils::priority::Priority;
+use polars_utils::IdxSize;
+
+use crate::async_primitives::distributor_channel::distributor_channel;
+use crate::async_primitives::linearizer::Linearizer;
+use crate::morsel::{get_ideal_morsel_size, SourceToken};
+use crate::nodes::{
+    ComputeNode, JoinHandle, Morsel, MorselSeq, PortState, TaskPriority, TaskScope,
+};
+use crate::pipe::{RecvPort, SendPort};
+use crate::{DEFAULT_DISTRIBUTOR_BUFFER_SIZE, DEFAULT_LINEARIZER_BUFFER_SIZE};
+
+const ROW_COUNT_OVERFLOW_ERR: PolarsError = PolarsError::ComputeError(ErrString::new_static(
+    "\
+IPC file produces more than 2^32 rows; \
+consider compiling with polars-bigidx feature (polars-u64-idx package on python)",
+));
+
+pub struct IpcSourceNode {
+    sources: ScanSources,
+
+    config: IpcSourceNodeConfig,
+    num_pipelines: usize,
+
+    /// At every phase we need to be able to continue from where we left off, so we save the state
+    /// of the Walker task.
+    state: IpcSourceNodeState,
+}
+
+pub struct IpcSourceNodeConfig {
+    row_index: Option<RowIndex>,
+    projection_info: Option<ProjectionInfo>,
+
+    rechunk: bool,
+    include_file_paths: Option<PlSmallStr>,
+
+    first_metadata: FileMetadata,
+}
+
+pub struct IpcSourceNodeState {
+    morsel_seq: u64,
+    row_idx_offset: IdxSize,
+
+    slice: Range<usize>,
+
+    source_idx: usize,
+    source: Option<Source>,
+}
+
+pub struct Source {
+    file_path: Option<Arc<str>>,
+
+    memslice: Arc<MemSlice>,
+    metadata: Arc<FileMetadata>,
+
+    block_offset: usize,
+}
+
+impl IpcSourceNode {
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        sources: ScanSources,
+        _file_info: FileInfo,
+        _hive_parts: Option<Arc<Vec<HivePartitions>>>, // @TODO
+        predicate: Option<Arc<dyn PhysicalExpr>>,
+        options: IpcScanOptions,
+        _cloud_options: Option<CloudOptions>,
+        file_options: FileScanOptions,
+        mut first_metadata: Option<FileMetadata>,
+    ) -> PolarsResult<Self> {
+        // These should have all been removed during lower_ir
+        assert!(predicate.is_none());
+        assert!(!sources.is_empty());
+
+        let IpcScanOptions = options;
+
+        let FileScanOptions {
+            slice,
+            with_columns,
+            cache: _, // @TODO
+            row_index,
+            rechunk,
+            file_counter: _, // @TODO
+            hive_options: _, // @TODO
+            glob: _, // @TODO
+            include_file_paths,
+            allow_missing_columns: _, // @TODO
+        } = file_options;
+
+        let first_metadata = match first_metadata.take() {
+            Some(md) => md,
+            None => {
+                let source = sources.iter().next().unwrap();
+                let source = source.to_memslice()?;
+                read_file_metadata(&mut std::io::Cursor::new(&*source))?
+            },
+        };
+
+        let projection = with_columns
+            .as_ref()
+            .map(|cols| columns_to_projection(cols, &first_metadata.schema))
+            .transpose()?;
+        let projection_info = projection
+            .as_ref()
+            .map(|p| prepare_projection(&first_metadata.schema, p.clone()));
+
+        let state = IpcSourceNodeState {
+            morsel_seq: 0,
+            row_idx_offset: row_index.as_ref().map_or(0, |ri| ri.offset),
+
+            // Always create a slice. If no slice was given, just make the biggest slice possible.
+            slice: slice.map_or(0..usize::MAX, |(offset, length)| {
+                let offset = offset as usize;
+                offset..offset + length
+            }),
+
+            source_idx: 0,
+            source: None,
+        };
+
+        Ok(IpcSourceNode {
+            sources,
+
+            config: IpcSourceNodeConfig {
+                row_index,
+                projection_info,
+
+                rechunk,
+                include_file_paths,
+
+                first_metadata,
+            },
+
+            num_pipelines: 0,
+
+            state,
+        })
+    }
+}
+
+/// Move `slice` forward by `n` and return the slice until then.
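+/// The walker task uses this to carve per-batch row ranges out of the remaining global slice.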
+fn slice_take(slice: &mut Range<usize>, n: usize) -> Range<usize> {
+    let offset = slice.start;
+    let length = slice.len();
+
+    assert!(offset < n);
+
+    let chunk_length = (n - offset).min(length);
+    let rng = offset..offset + chunk_length;
+    *slice = 0..length - chunk_length;
+
+    rng
+}
+
+fn get_max_morsel_size() -> usize {
+    std::env::var("POLARS_STREAMING_IPC_SOURCE_MAX_MORSEL_SIZE")
+        .map_or_else(
+            |_| get_ideal_morsel_size(),
+            |v| {
+                v.parse::<usize>().expect(
+                    "POLARS_STREAMING_IPC_SOURCE_MAX_MORSEL_SIZE does not contain valid size",
+                )
+            },
+        )
+        .max(1)
+}
+
+impl ComputeNode for IpcSourceNode {
+    fn name(&self) -> &str {
+        "ipc_source"
+    }
+
+    fn initialize(&mut self, num_pipelines: usize) {
+        self.num_pipelines = num_pipelines;
+    }
+
+    fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> {
+        assert!(recv.is_empty());
+        assert_eq!(send.len(), 1);
+
+        if self.state.slice.is_empty() || self.state.source_idx >= self.sources.len() {
+            send[0] = PortState::Done;
+        }
+
+        if send[0] != PortState::Done {
+            send[0] = PortState::Ready;
+        }
+
+        Ok(())
+    }
+
+    fn spawn<'env, 's>(
+        &'env mut self,
+        scope: &'s TaskScope<'s, 'env>,
+        recv_ports: &mut [Option<RecvPort<'_>>],
+        send_ports: &mut [Option<SendPort<'_>>],
+        _state: &'s ExecutionState,
+        join_handles: &mut Vec<JoinHandle<PolarsResult<()>>>,
+    ) {
+        assert!(recv_ports.is_empty());
+        assert_eq!(send_ports.len(), 1);
+
+        // Split size for morsels.
+        let max_morsel_size = get_max_morsel_size();
+        let source_token = SourceToken::new();
+
+        let num_pipelines = self.num_pipelines;
+        let config = &self.config;
+        let sources = &self.sources;
+        let state = &mut self.state;
+
+        /// Messages sent from Walker task to Decoder tasks.
+        struct BatchMessage {
+            memslice: Arc<MemSlice>,
+            metadata: Arc<FileMetadata>,
+            file_path: Option<Arc<str>>,
+            row_idx_offset: IdxSize,
+            slice: Range<usize>,
+            block_range: Range<usize>,
+            morsel_seq_base: u64,
+        }
+
+        // Walker task -> Decoder tasks.
+        let (mut batch_tx, batch_rxs) =
+            distributor_channel::<BatchMessage>(num_pipelines, DEFAULT_DISTRIBUTOR_BUFFER_SIZE);
+        // Decoder tasks -> Distributor task.
+        let (mut decoded_rx, decoded_tx) = Linearizer::<Priority<Reverse<MorselSeq>, Morsel>>::new(
+            num_pipelines,
+            DEFAULT_LINEARIZER_BUFFER_SIZE,
+        );
+        // Distributor task -> output.
+        let mut sender = send_ports[0].take().unwrap().serial();
+
+        // Distributor task.
+        //
+        // Shuffles morsels from `n` producers amongst `n` consumers.
+        //
+        // If record batches in the source IPC file are large, one decoder might produce many
+        // morsels at the same time. At the same time, other decoders might not produce anything.
+        // Therefore, we would like to distribute the output of a single decoder task over the
+        // available output pipelines.
+        join_handles.push(scope.spawn_task(TaskPriority::High, async move {
+            while let Some(morsel) = decoded_rx.get().await {
+                if sender.send(morsel.1).await.is_err() {
+                    break;
+                }
+            }
+            PolarsResult::Ok(())
+        }));
+
+        // Decoder tasks.
+        //
+        // Each task takes an IPC file and a certain number of blocks and decodes each block as a
+        // record batch. Then, all record batches are concatenated into a DataFrame. If the
+        // resulting DataFrame is too large, which happens when we have one very large block, the
+        // DataFrame is split into smaller pieces and spread among the pipelines.
+        let decoder_tasks = decoded_tx.into_iter().zip(batch_rxs)
+            .map(|(mut send, mut rx)| {
+                let source_token = source_token.clone();
+                scope.spawn_task(TaskPriority::Low, async move {
+                    // Amortize allocations.
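+                    // (These scratch buffers and the projection info are handed to each
+                    // `FileReader` and reclaimed afterwards via `take_scratches` and
+                    // `take_projection_info`, so allocations are reused across record batches.)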
+                    let mut data_scratch = Vec::new();
+                    let mut message_scratch = Vec::new();
+                    let mut projection_info = config.projection_info.clone();
+
+                    let schema = projection_info.as_ref().map_or(config.first_metadata.schema.as_ref(), |ProjectionInfo { schema, .. }| schema);
+                    let pl_schema = schema
+                        .iter()
+                        .map(|(n, f)| (n.clone(), DataType::from_arrow(&f.dtype, true)))
+                        .collect();
+
+                    while let Ok(m) = rx.recv().await {
+                        let BatchMessage {
+                            memslice: source,
+                            metadata,
+                            file_path,
+                            row_idx_offset,
+                            slice,
+                            morsel_seq_base,
+                            block_range,
+                        } = m;
+
+                        let mut reader = FileReader::new_with_projection_info(
+                            Cursor::new(source.as_ref()),
+                            metadata.as_ref().clone(),
+                            std::mem::take(&mut projection_info),
+                            None,
+                        );
+                        reader.set_current_block(block_range.start);
+                        reader.set_scratches((
+                            std::mem::take(&mut data_scratch),
+                            std::mem::take(&mut message_scratch),
+                        ));
+
+                        // Create the DataFrame with the appropriate schema and append all the record
+                        // batches to it. This will perform schema validation as well.
+                        let mut df = DataFrame::empty_with_schema(&pl_schema);
+                        df.try_extend(reader.by_ref().take(block_range.len()))?;
+
+                        df = df.slice(slice.start as i64, slice.len());
+
+                        if config.rechunk {
+                            df.rechunk_mut();
+                        }
+
+                        if let Some(RowIndex { name, offset: _ }) = &config.row_index {
+                            let offset = row_idx_offset + slice.start as IdxSize;
+                            df = df.with_row_index(name.clone(), Some(offset))?;
+                        }
+
+                        if let Some(col) = config.include_file_paths.as_ref() {
+                            let file_path = file_path.unwrap();
+                            let file_path = Scalar::from(PlSmallStr::from(file_path.as_ref()));
+                            df.with_column(Column::new_scalar(
+                                col.clone(),
+                                file_path,
+                                df.height(),
+                            ))?;
+                        }
+
+                        // If the block is very large, we want to split the block amongst the
+                        // pipelines. That will at least allow some parallelism.
+                        if df.height() > max_morsel_size && config::verbose() {
+                            eprintln!("IPC source encountered a (too) large record batch of {} rows. Splitting and continuing.", df.height());
+                        }
+                        for i in 0..df.height().div_ceil(max_morsel_size) {
+                            let morsel = df.slice((i * max_morsel_size) as i64, max_morsel_size);
+                            let seq = MorselSeq::new(morsel_seq_base + i as u64);
+                            let morsel = Morsel::new(
+                                morsel,
+                                seq,
+                                source_token.clone(),
+                            );
+                            if send.insert(Priority(Reverse(seq), morsel)).await.is_err() {
+                                break;
+                            }
+                        }
+
+                        (data_scratch, message_scratch) = reader.take_scratches();
+                        projection_info = reader.take_projection_info();
+                    }
+
+                    PolarsResult::Ok(())
+                })
+            })
+            .collect::<Vec<_>>();
+
+        // Walker task.
+        //
+        // Walks all the sources and supplies block ranges to the decoder tasks.
+        join_handles.push(scope.spawn_task(TaskPriority::Low, async move {
+            struct Batch {
+                row_idx_offset: IdxSize,
+                block_start: usize,
+                num_rows: usize,
+            }
+
+            // Batch completion parameters
+            let batch_size_limit = get_ideal_morsel_size();
+            let sliced_batch_size_limit = state.slice.len().div_ceil(num_pipelines);
+            let batch_block_limit = if sources.len() >= num_pipelines {
+                // If there are more files than decoder tasks, try to subdivide the files instead
+                // of the blocks.
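+                // (`usize::MAX` effectively disables block-based batch splitting here;
+                // handing whole files to different decoders then provides the parallelism.)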
+                usize::MAX
+            } else {
+                config.first_metadata.blocks.len().div_ceil(num_pipelines)
+            };
+
+            // Amortize allocations
+            let mut data_scratch = Vec::new();
+            let mut message_scratch = Vec::new();
+            let mut projection_info = config.projection_info.clone();
+
+            'source_loop: while !state.slice.is_empty() {
+                let source = match state.source {
+                    Some(ref mut source) => source,
+                    None => {
+                        let Some(source) = sources.get(state.source_idx) else {
+                            break;
+                        };
+
+                        let file_path: Option<Arc<str>> = config
+                            .include_file_paths
+                            .as_ref()
+                            .map(|_| source.to_include_path_name().into());
+                        let memslice = source.to_memslice()?;
+                        let metadata = if state.source_idx == 0 {
+                            config.first_metadata.clone()
+                        } else {
+                            read_file_metadata(&mut std::io::Cursor::new(memslice.as_ref()))?
+                        };
+
+                        state.source.insert(Source {
+                            file_path,
+                            memslice: Arc::new(memslice),
+                            metadata: Arc::new(metadata),
+                            block_offset: 0,
+                        })
+                    },
+                };
+
+                let mut reader = FileReader::new_with_projection_info(
+                    Cursor::new(source.memslice.as_ref()),
+                    source.metadata.as_ref().clone(),
+                    std::mem::take(&mut projection_info),
+                    None,
+                );
+                reader.set_current_block(source.block_offset);
+                reader.set_scratches((
+                    std::mem::take(&mut data_scratch),
+                    std::mem::take(&mut message_scratch),
+                ));
+
+                if state.slice.start > 0 {
+                    // Skip over all blocks that the slice would skip anyway.
+                    let new_offset = reader.skip_blocks_till_limit(state.slice.start as u64)?;
+
+                    state.row_idx_offset += (state.slice.start as u64 - new_offset) as IdxSize;
+                    state.slice = new_offset as usize..new_offset as usize + state.slice.len();
+
+                    // If we skip the entire file, don't even try to read from it.
+                    if reader.get_current_block() == reader.metadata().blocks.len() {
+                        (data_scratch, message_scratch) = reader.take_scratches();
+                        projection_info = reader.take_projection_info();
+                        state.source.take();
+                        state.source_idx += 1;
+                        continue;
+                    }
+                }
+
+                let mut batch = Batch {
+                    row_idx_offset: state.row_idx_offset,
+                    block_start: reader.get_current_block(),
+                    num_rows: 0,
+                };
+
+                // We don't yet want to commit these values to the state in case this batch gets
+                // cancelled.
+                let mut uncommitted_slice = state.slice.clone();
+                let mut uncommitted_row_idx_offset = state.row_idx_offset;
+                while !state.slice.is_empty() {
+                    let mut is_batch_complete = false;
+
+                    match reader.next_record_batch() {
+                        None if batch.num_rows == 0 => break,
+
+                        // If we have no more record batches available, we want to send what is
+                        // left.
+                        None => is_batch_complete = true,
+                        Some(record_batch) => {
+                            let rb_num_rows = record_batch?.length()? as usize;
+                            batch.num_rows += rb_num_rows;
+
+                            // We need to ensure that we are not overflowing the IdxSize maximum
+                            // capacity.
+                            let rb_num_rows = IdxSize::try_from(rb_num_rows)
+                                .map_err(|_| ROW_COUNT_OVERFLOW_ERR)?;
+                            uncommitted_row_idx_offset = uncommitted_row_idx_offset
+                                .checked_add(rb_num_rows)
+                                .ok_or(ROW_COUNT_OVERFLOW_ERR)?;
+                        },
+                    }
+
+                    let current_block = reader.get_current_block();
+
+                    // Subdivide into batches for large files.
+                    is_batch_complete |= batch.num_rows >= batch_size_limit;
+                    // Subdivide into batches if the file is sliced.
+                    is_batch_complete |= batch.num_rows >= sliced_batch_size_limit;
+                    // Subdivide into batches for small files.
+                    is_batch_complete |= current_block - batch.block_start >= batch_block_limit;
+
+                    // Batch blocks such that we send appropriately sized morsels. We guarantee a
+                    // lower bound here, but not an upper bound.
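+                    // (The upper bound is enforced by the decoder tasks, which split any
+                    // oversized DataFrame into `max_morsel_size` pieces.)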
+                    if is_batch_complete {
+                        let batch_slice = slice_take(&mut uncommitted_slice, batch.num_rows);
+                        let batch_slice_len = batch_slice.len();
+                        let block_range = batch.block_start..current_block;
+
+                        let message = BatchMessage {
+                            memslice: source.memslice.clone(),
+                            metadata: source.metadata.clone(),
+                            file_path: source.file_path.clone(),
+                            row_idx_offset: batch.row_idx_offset,
+                            slice: batch_slice,
+                            morsel_seq_base: state.morsel_seq,
+                            block_range,
+                        };
+
+                        if source_token.stop_requested() {
+                            break 'source_loop;
+                        }
+
+                        if batch_tx.send(message).await.is_err() {
+                            // This should only happen if the receiver of the decoder
+                            // has broken off, meaning no further input will be needed.
+                            break 'source_loop;
+                        }
+
+                        // Commit the changes to the state.
+                        // Now we know that a decoder will process it.
+                        //
+                        // This might generate several morsels if the record batch is very large.
+                        state.morsel_seq += batch_slice_len.div_ceil(max_morsel_size) as u64;
+                        state.slice = uncommitted_slice.clone();
+                        state.row_idx_offset = uncommitted_row_idx_offset;
+                        source.block_offset = current_block;
+
+                        batch = Batch {
+                            row_idx_offset: state.row_idx_offset,
+                            block_start: current_block,
+                            num_rows: 0,
+                        };
+                    }
+                }
+
+                (data_scratch, message_scratch) = reader.take_scratches();
+                projection_info = reader.take_projection_info();
+
+                state.source.take();
+                state.source_idx += 1;
+            }
+
+            drop(batch_tx); // Inform decoder tasks to stop.
+            for decoder_task in decoder_tasks {
+                decoder_task.await?;
+            }
+
+            PolarsResult::Ok(())
+        }));
+    }
+}
diff --git a/crates/polars-stream/src/nodes/io_sources/mod.rs b/crates/polars-stream/src/nodes/io_sources/mod.rs
new file mode 100644
index 000000000000..ce14ad3b0f7a
--- /dev/null
+++ b/crates/polars-stream/src/nodes/io_sources/mod.rs
@@ -0,0 +1 @@
+pub mod ipc;
diff --git a/crates/polars-stream/src/nodes/joins/in_memory.rs b/crates/polars-stream/src/nodes/joins/in_memory.rs
new file mode 100644
index 000000000000..a98c23a435b0
--- /dev/null
+++ b/crates/polars-stream/src/nodes/joins/in_memory.rs
@@ -0,0 +1,119 @@
+use std::sync::Arc;
+
+use polars_core::schema::Schema;
+
+use crate::nodes::compute_node_prelude::*;
+use crate::nodes::in_memory_sink::InMemorySinkNode;
+use crate::nodes::in_memory_source::InMemorySourceNode;
+
+enum InMemoryJoinState {
+    Sink {
+        left: InMemorySinkNode,
+        right: InMemorySinkNode,
+    },
+    Source(InMemorySourceNode),
+    Done,
+}
+
+pub struct InMemoryJoinNode {
+    state: InMemoryJoinState,
+    num_pipelines: usize,
+    joiner: Arc<dyn Fn(DataFrame, DataFrame) -> PolarsResult<DataFrame> + Send + Sync>,
+}
+
+impl InMemoryJoinNode {
+    pub fn new(
+        left_input_schema: Arc<Schema>,
+        right_input_schema: Arc<Schema>,
+        joiner: Arc<dyn Fn(DataFrame, DataFrame) -> PolarsResult<DataFrame> + Send + Sync>,
+    ) -> Self {
+        Self {
+            state: InMemoryJoinState::Sink {
+                left: InMemorySinkNode::new(left_input_schema),
+                right: InMemorySinkNode::new(right_input_schema),
+            },
+            num_pipelines: 0,
+            joiner,
+        }
+    }
+}
+
+impl ComputeNode for InMemoryJoinNode {
+    fn name(&self) -> &str {
+        "in_memory_join"
+    }
+
+    fn initialize(&mut self, num_pipelines: usize) {
+        self.num_pipelines = num_pipelines;
+    }
+
+    fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> {
+        assert!(recv.len() == 2 && send.len() == 1);
+
+        // If the output doesn't want any more data, transition to being done.
+        if send[0] == PortState::Done && !matches!(self.state, InMemoryJoinState::Done) {
+            self.state = InMemoryJoinState::Done;
+        }
+
+        // If the input is done, transition to being a source.
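+        // Both inputs must be fully sunk before we can materialize the join: the
+        // in-memory joiner needs the complete left and right DataFrames.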
+ if let InMemoryJoinState::Sink { left, right } = &mut self.state { + if recv[0] == PortState::Done && recv[1] == PortState::Done { + let left_df = left.get_output()?.unwrap(); + let right_df = right.get_output()?.unwrap(); + let mut source_node = + InMemorySourceNode::new(Arc::new((self.joiner)(left_df, right_df)?)); + source_node.initialize(self.num_pipelines); + self.state = InMemoryJoinState::Source(source_node); + } + } + + match &mut self.state { + InMemoryJoinState::Sink { left, right, .. } => { + left.update_state(&mut recv[0..1], &mut [])?; + right.update_state(&mut recv[1..2], &mut [])?; + send[0] = PortState::Blocked; + }, + InMemoryJoinState::Source(source_node) => { + recv[0] = PortState::Done; + recv[1] = PortState::Done; + source_node.update_state(&mut [], send)?; + }, + InMemoryJoinState::Done => { + recv[0] = PortState::Done; + recv[1] = PortState::Done; + send[0] = PortState::Done; + }, + } + Ok(()) + } + + fn is_memory_intensive_pipeline_blocker(&self) -> bool { + matches!(self.state, InMemoryJoinState::Sink { .. }) + } + + fn spawn<'env, 's>( + &'env mut self, + scope: &'s TaskScope<'s, 'env>, + recv_ports: &mut [Option>], + send_ports: &mut [Option>], + state: &'s ExecutionState, + join_handles: &mut Vec>>, + ) { + assert!(recv_ports.len() == 2); + assert!(send_ports.len() == 1); + match &mut self.state { + InMemoryJoinState::Sink { left, right, .. } => { + if recv_ports[0].is_some() { + left.spawn(scope, &mut recv_ports[0..1], &mut [], state, join_handles); + } + if recv_ports[1].is_some() { + right.spawn(scope, &mut recv_ports[1..2], &mut [], state, join_handles); + } + }, + InMemoryJoinState::Source(source) => { + source.spawn(scope, &mut [], send_ports, state, join_handles) + }, + InMemoryJoinState::Done => unreachable!(), + } + } +} diff --git a/crates/polars-stream/src/nodes/joins/mod.rs b/crates/polars-stream/src/nodes/joins/mod.rs new file mode 100644 index 000000000000..fa2e12699f5e --- /dev/null +++ b/crates/polars-stream/src/nodes/joins/mod.rs @@ -0,0 +1 @@ +pub mod in_memory; diff --git a/crates/polars-stream/src/nodes/mod.rs b/crates/polars-stream/src/nodes/mod.rs index 4fb42daddd6b..effebe67c34b 100644 --- a/crates/polars-stream/src/nodes/mod.rs +++ b/crates/polars-stream/src/nodes/mod.rs @@ -5,6 +5,8 @@ pub mod in_memory_sink; pub mod in_memory_source; pub mod input_independent_select; pub mod io_sinks; +pub mod io_sources; +pub mod joins; pub mod map; pub mod multiplexer; pub mod ordered_union; diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs index 9a87f0f91b7c..bf2e7e60ea6e 100644 --- a/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs +++ b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs @@ -1,7 +1,7 @@ use std::future::Future; use std::sync::Arc; -use polars_core::prelude::{ArrowSchema, InitHashMaps, PlHashMap}; +use polars_core::prelude::{ArrowSchema, PlHashMap}; use polars_core::series::IsSorted; use polars_core::utils::operation_exceeded_idxsize_msg; use polars_error::{polars_err, PolarsResult}; @@ -197,46 +197,37 @@ impl RowGroupDataFetcher { mem_slice, } } else if let Some(columns) = projection.as_ref() { - let ranges = get_row_group_byte_ranges_for_projection( + let mut ranges = get_row_group_byte_ranges_for_projection( &row_group_metadata, columns.as_ref(), ) .collect::>(); - let bytes = current_byte_source.get_ranges(ranges.as_ref()).await?; + let n_ranges = ranges.len(); - 
assert_eq!(bytes.len(), ranges.len()); + let bytes_map = current_byte_source.get_ranges(&mut ranges).await?; - let mut bytes_map = PlHashMap::with_capacity(ranges.len()); - - for (range, bytes) in ranges.iter().zip(bytes) { - memory_prefetch_func(bytes.as_ref()); - let v = bytes_map.insert(range.start, bytes); - debug_assert!(v.is_none(), "duplicate range start {}", range.start); - } + assert_eq!(bytes_map.len(), n_ranges); FetchedBytes::BytesMap(bytes_map) } else { - // We have a dedicated code-path for a full projection that performs a - // single range request for the entire row group. During testing this - // provided much higher throughput from cloud than making multiple range - // request with `get_ranges()`. - let full_range = row_group_metadata.full_byte_range(); - let full_range = full_range.start as usize..full_range.end as usize; - - let mem_slice = { - let full_range_2 = full_range.clone(); - task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move { - current_byte_source.get_range(full_range_2).await - })) - .await - .unwrap()? - }; + // We still prefer `get_ranges()` over a single `get_range()` for downloading + // the entire row group, as it can have less memory-copying. A single `get_range()` + // would naively concatenate the memory blocks of the entire row group, while + // `get_ranges()` can skip concatenation since the downloaded blocks are + // aligned to the columns. + let mut ranges = row_group_metadata + .byte_ranges_iter() + .map(|x| x.start as usize..x.end as usize) + .collect::>(); - FetchedBytes::MemSlice { - offset: full_range.start, - mem_slice, - } + let n_ranges = ranges.len(); + + let bytes_map = current_byte_source.get_ranges(&mut ranges).await?; + + assert_eq!(bytes_map.len(), n_ranges); + + FetchedBytes::BytesMap(bytes_map) }; PolarsResult::Ok(RowGroupData { diff --git a/crates/polars-stream/src/physical_plan/fmt.rs b/crates/polars-stream/src/physical_plan/fmt.rs index e0735144da79..7ef74d5b0ad9 100644 --- a/crates/polars-stream/src/physical_plan/fmt.rs +++ b/crates/polars-stream/src/physical_plan/fmt.rs @@ -200,16 +200,34 @@ fn visualize_plan_rec( (out, &[][..]) }, - PhysNodeKind::GroupBy { input, key, aggs } => { - let label = "group-by"; - ( - format!( - "{label}\\nkey:\\n{}\\naggs:\\n{}", - fmt_exprs(key, expr_arena), - fmt_exprs(aggs, expr_arena) - ), - from_ref(input), + PhysNodeKind::GroupBy { input, key, aggs } => ( + format!( + "group-by\\nkey:\\n{}\\naggs:\\n{}", + fmt_exprs(key, expr_arena), + fmt_exprs(aggs, expr_arena) + ), + from_ref(input), + ), + PhysNodeKind::InMemoryJoin { + input_left, + input_right, + left_on, + right_on, + args, + } => { + let mut label = "in-memory-join".to_string(); + write!(label, r"\nleft_on:\n{}", fmt_exprs(left_on, expr_arena)).unwrap(); + write!(label, r"\nright_on:\n{}", fmt_exprs(right_on, expr_arena)).unwrap(); + write!( + label, + r"\nhow: {}", + escape_graphviz(&format!("{:?}", args.how)) ) + .unwrap(); + if args.join_nulls { + write!(label, r"\njoin-nulls").unwrap(); + } + (label, &[*input_left, *input_right][..]) }, }; diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index d57a8667c479..063c94081dbc 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -1,10 +1,11 @@ use std::sync::Arc; +use polars_core::frame::DataFrame; use polars_core::prelude::{InitHashMaps, PlHashMap, PlIndexMap}; use polars_core::schema::Schema; use polars_error::{polars_ensure, 
PolarsResult}; use polars_plan::plans::expr_ir::{ExprIR, OutputName}; -use polars_plan::plans::{AExpr, FunctionIR, IRAggExpr, IR}; +use polars_plan::plans::{AExpr, FileScan, FunctionIR, IRAggExpr, IR}; use polars_plan::prelude::{FileType, SinkType}; use polars_utils::arena::{Arena, Node}; use polars_utils::itertools::Itertools; @@ -314,23 +315,67 @@ pub fn lower_ir( sources: scan_sources, file_info, hive_parts, - output_schema, + output_schema: scan_output_schema, scan_type, - predicate, + mut predicate, file_options, } = v.clone() else { unreachable!(); }; - PhysNodeKind::FileScan { - scan_sources, - file_info, - hive_parts, - output_schema, - scan_type, - predicate, - file_options, + if scan_sources.is_empty() { + // If there are no sources, just provide an empty in-memory source with the right + // schema. + PhysNodeKind::InMemorySource { + df: Arc::new(DataFrame::empty_with_schema(output_schema.as_ref())), + } + } else { + if matches!(scan_type, FileScan::Ipc { .. }) { + // @TODO: All the things the IPC source does not support yet. + if hive_parts.is_some() + || scan_sources.is_cloud_url() + || file_options.allow_missing_columns + || file_options.slice.is_some_and(|(offset, _)| offset < 0) + { + todo!(); + } + } + + // If the node itself would just filter on the whole output then there is no real + // reason to do it in the source node itself. + let do_filter_in_separate_node = + predicate.is_some() && matches!(scan_type, FileScan::Ipc { .. }); + + if do_filter_in_separate_node { + assert!(file_options.slice.is_none()); // Invariant of the scan + let predicate = predicate.take().unwrap(); + + let input = phys_sm.insert(PhysNode::new( + output_schema.clone(), + PhysNodeKind::FileScan { + scan_sources, + file_info, + hive_parts, + output_schema: scan_output_schema, + scan_type, + predicate: None, + file_options, + }, + )); + + PhysNodeKind::Filter { input, predicate } + } else { + PhysNodeKind::FileScan { + scan_sources, + file_info, + hive_parts, + output_schema: scan_output_schema, + scan_type, + predicate, + file_options, + } + } } }, @@ -415,7 +460,29 @@ pub fn lower_ir( } return Ok(node); }, - IR::Join { .. } => todo!(), + IR::Join { + input_left, + input_right, + schema: _, + left_on, + right_on, + options, + } => { + let input_left = *input_left; + let input_right = *input_right; + let left_on = left_on.clone(); + let right_on = right_on.clone(); + let args = options.args.clone(); + let phys_left = lower_ir!(input_left)?; + let phys_right = lower_ir!(input_right)?; + PhysNodeKind::InMemoryJoin { + input_left: phys_left, + input_right: phys_right, + left_on, + right_on, + args, + } + }, IR::Distinct { .. } => todo!(), IR::ExtContext { .. } => todo!(), IR::Invalid => unreachable!(), diff --git a/crates/polars-stream/src/physical_plan/mod.rs b/crates/polars-stream/src/physical_plan/mod.rs index 3b4643100249..707c2a53dec2 100644 --- a/crates/polars-stream/src/physical_plan/mod.rs +++ b/crates/polars-stream/src/physical_plan/mod.rs @@ -5,6 +5,7 @@ use polars_core::frame::DataFrame; use polars_core::prelude::{IdxSize, InitHashMaps, PlHashMap, SortMultipleOptions}; use polars_core::schema::{Schema, SchemaRef}; use polars_error::PolarsResult; +use polars_ops::frame::JoinArgs; use polars_plan::plans::hive::HivePartitions; use polars_plan::plans::{AExpr, DataFrameUdf, FileInfo, FileScan, ScanSources, IR}; use polars_plan::prelude::expr_ir::ExprIR; @@ -100,6 +101,9 @@ pub enum PhysNodeKind { input: PhysNodeKey, }, + /// Generic fallback for (as-of-yet) unsupported streaming mappings. 
+ /// Fully sinks all data to an in-memory data frame and uses the in-memory + /// engine to perform the map. InMemoryMap { input: PhysNodeKey, map: Arc, @@ -149,6 +153,17 @@ pub enum PhysNodeKind { key: Vec, aggs: Vec, }, + + /// Generic fallback for (as-of-yet) unsupported streaming joins. + /// Fully sinks all data to in-memory data frames and uses the in-memory + /// engine to perform the join. + InMemoryJoin { + input_left: PhysNodeKey, + input_right: PhysNodeKey, + left_on: Vec, + right_on: Vec, + args: JoinArgs, + }, } #[recursive::recursive] @@ -198,6 +213,16 @@ fn insert_multiplexers( insert_multiplexers(*input, phys_sm, referenced); }, + PhysNodeKind::InMemoryJoin { + input_left, + input_right, + .. + } => { + let input_right = *input_right; + insert_multiplexers(*input_left, phys_sm, referenced); + insert_multiplexers(input_right, phys_sm, referenced); + }, + PhysNodeKind::OrderedUnion { inputs } | PhysNodeKind::Zip { inputs, .. } => { for input in inputs.clone() { insert_multiplexers(input, phys_sm, referenced); diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs index 472cf982a253..b701696972a9 100644 --- a/crates/polars-stream/src/physical_plan/to_graph.rs +++ b/crates/polars-stream/src/physical_plan/to_graph.rs @@ -8,6 +8,7 @@ use polars_expr::planner::{create_physical_expr, get_expr_depth_limit, Expressio use polars_expr::reduce::into_reduction; use polars_expr::state::ExecutionState; use polars_mem_engine::create_physical_plan; +use polars_plan::dsl::JoinOptions; use polars_plan::global::_set_n_rows_for_scan; use polars_plan::plans::expr_ir::ExprIR; use polars_plan::plans::{AExpr, ArenaExprIter, Context, IR}; @@ -366,6 +367,23 @@ fn to_graph_rec<'a>( todo!() } }, + FileScan::Ipc { + options, + cloud_options, + metadata: first_metadata, + } => ctx.graph.add_node( + nodes::io_sources::ipc::IpcSourceNode::new( + scan_sources, + file_info, + hive_parts, + predicate, + options, + cloud_options, + file_options, + first_metadata, + )?, + [], + ), _ => todo!(), } } @@ -410,6 +428,61 @@ fn to_graph_rec<'a>( [input_key], ) }, + + InMemoryJoin { + input_left, + input_right, + left_on, + right_on, + args, + } => { + let left_input_key = to_graph_rec(*input_left, ctx)?; + let right_input_key = to_graph_rec(*input_right, ctx)?; + let left_input_schema = ctx.phys_sm[*input_left].output_schema.clone(); + let right_input_schema = ctx.phys_sm[*input_right].output_schema.clone(); + + let mut lp_arena = Arena::default(); + let left_lmdf = Arc::new(LateMaterializedDataFrame::default()); + let right_lmdf = Arc::new(LateMaterializedDataFrame::default()); + + let left_node = lp_arena.add(left_lmdf.clone().as_ir_node(left_input_schema.clone())); + let right_node = + lp_arena.add(right_lmdf.clone().as_ir_node(right_input_schema.clone())); + let join_node = lp_arena.add(IR::Join { + input_left: left_node, + input_right: right_node, + schema: node.output_schema.clone(), + left_on: left_on.clone(), + right_on: right_on.clone(), + options: Arc::new(JoinOptions { + allow_parallel: true, + force_parallel: false, + args: args.clone(), + rows_left: (None, 0), + rows_right: (None, 0), + }), + }); + + let executor = Mutex::new(create_physical_plan( + join_node, + &mut lp_arena, + ctx.expr_arena, + )?); + + ctx.graph.add_node( + nodes::joins::in_memory::InMemoryJoinNode::new( + left_input_schema, + right_input_schema, + Arc::new(move |left, right| { + left_lmdf.set_materialized_dataframe(left); + right_lmdf.set_materialized_dataframe(right); + 
let mut state = ExecutionState::new(); + executor.lock().execute(&mut state) + }), + ), + [left_input_key, right_input_key], + ) + }, }; ctx.phys_to_graph.insert(phys_node_key, graph_key); diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs index 0ac1a643d93d..ef07714d591f 100644 --- a/crates/polars-utils/src/mmap.rs +++ b/crates/polars-utils/src/mmap.rs @@ -130,6 +130,12 @@ mod private { out } } + + impl From<bytes::Bytes> for MemSlice { + fn from(value: bytes::Bytes) -> Self { + Self::from_bytes(value) + } + } } use memmap::MmapOptions; diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 9ff45610a3c7..7c054c21f59b 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -417,12 +417,21 @@ docs-selection = [ "replace", "approx_unique", "unique_counts", + "polars_cloud", + "serde", + "ir_serde", + "cloud", + "async", + "cloud_write", ] bench = [ "lazy", ] +# All features except python +full = ["docs-selection", "performant", "fmt"] + [package.metadata.docs.rs] # all-features = true features = ["docs-selection"] diff --git a/docs/source/user-guide/expressions/index.md b/docs/source/user-guide/expressions/index.md index 7e4b6f0a8b1a..b4442d6f4289 100644 --- a/docs/source/user-guide/expressions/index.md +++ b/docs/source/user-guide/expressions/index.md @@ -4,19 +4,21 @@ We [introduced the concept of “expressions” in a previous section](../concep In this section we will focus on exploring the types of expressions that Polars offers. Each section gives an overview of what they do and provides additional examples. + - Essentials: - - [Basic operations](basic-operations.md) – how to do basic operations on dataframe columns, like arithmetic calculations, comparisons, and other common, general-purpose operations - - [Expression expansion](expression-expansion.md) – what is expression expansion and how to use it - - [Casting](casting.md) – how to convert / cast values to different data types + - [Basic operations](basic-operations.md) – how to do basic operations on dataframe columns, like arithmetic calculations, comparisons, and other common, general-purpose operations + - [Expression expansion](expression-expansion.md) – what is expression expansion and how to use it + - [Casting](casting.md) – how to convert / cast values to different data types - How to work with specific types of data or data type namespaces: - - [Strings](strings.md) – how to work with strings and the namespace `str` - - [Lists and arrays](lists-and-arrays.md) – the differences between the data types `List` and `Array`, when to use them, and how to use them - - [Categorical data and enums](categorical-data-and-enums.md) – the differences between the data types `Categorical` and `Enum`, when to use them, and how to use them - - [Structs](structs.md) – when to use the data type `Struct` and how to use it - - [Missing data](missing-data.md) – how to work with missing data and how to fill missing data + - [Strings](strings.md) – how to work with strings and the namespace `str` + - [Lists and arrays](lists-and-arrays.md) – the differences between the data types `List` and `Array`, when to use them, and how to use them + - [Categorical data and enums](categorical-data-and-enums.md) – the differences between the data types `Categorical` and `Enum`, when to use them, and how to use them + - [Structs](structs.md) – when to use the data type `Struct` and how to use it + - [Missing data](missing-data.md) – how to work with missing data and how to fill missing data - Types of operations: - - 
[Aggregation](aggregation.md) – how to work with aggregating contexts like `group_by` - - [Window functions](window-functions.md) – how to apply window functions over columns in a dataframe - - [Folds](folds.md) – how to perform arbitrary computations horizontally across columns + - [Aggregation](aggregation.md) – how to work with aggregating contexts like `group_by` + - [Window functions](window-functions.md) – how to apply window functions over columns in a dataframe + - [Folds](folds.md) – how to perform arbitrary computations horizontally across columns - [User-defined Python functions](user-defined-python-functions.md) – how to apply user-defined Python functions to dataframe columns or to column values - [Numpy functions](numpy-functions.md) – how to use NumPy native functions on Polars dataframes and series + diff --git a/docs/source/user-guide/transformations/joins.md b/docs/source/user-guide/transformations/joins.md index b135a45f53d3..5b55386b70f0 100644 --- a/docs/source/user-guide/transformations/joins.md +++ b/docs/source/user-guide/transformations/joins.md @@ -15,14 +15,14 @@ If you want to learn about joins in general and how to work with them in Polars, === ":fontawesome-brands-python: Python" [:material-api: `join`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html) - [:material-api: `join_where`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html) - [:material-api: `join_asof`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html) + [:material-api: `join_where`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_where.html) + [:material-api: `join_asof`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join_asof.html) === ":fontawesome-brands-rust: Rust" [:material-api: `join`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join) ([:material-flag-plus: semi_anti_join](/user-guide/installation/#feature-flags "Enable the feature flag semi_anti_join for semi and for anti joins"){.feature-flag} needed for some options.) - [:material-api: `join_asof_by`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoin.html#method.join_asof) + [:material-api: `join_asof_by`](https://docs.pola.rs/api/rust/dev/polars/prelude/trait.AsofJoinBy.html#method.join_asof_by) [:material-flag-plus: Available on feature asof_join](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag asof_join"){.feature-flag} [:material-api: `join_where`](https://docs.rs/polars/latest/polars/prelude/struct.JoinBuilder.html#method.join_where) [:material-flag-plus: Available on feature iejoin](/user-guide/installation/#feature-flags "To use this functionality enable the feature flag iejoin"){.feature-flag} diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index fc3e520e5ecc..a2ff3d9882da 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-polars" -version = "1.12.0" +version = "1.13.1" edition = "2021" [lib] diff --git a/py-polars/docs/source/reference/expressions/meta.rst b/py-polars/docs/source/reference/expressions/meta.rst index e70283c4c9b4..514067e0166f 100644 --- a/py-polars/docs/source/reference/expressions/meta.rst +++ b/py-polars/docs/source/reference/expressions/meta.rst @@ -11,6 +11,7 @@ The following methods are available under the `expr.meta` attribute. 
Expr.meta.eq Expr.meta.has_multiple_outputs + Expr.meta.is_column Expr.meta.is_column_selection Expr.meta.is_regex_projection Expr.meta.ne diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 4ff2752fdfb5..49c2e2470534 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -9236,7 +9236,7 @@ def n_chunks(self, strategy: Literal["first"] = ...) -> int: ... @overload def n_chunks(self, strategy: Literal["all"]) -> list[int]: ... - def n_chunks(self, strategy: str = "first") -> int | list[int]: + def n_chunks(self, strategy: Literal["first", "all"] = "first") -> int | list[int]: """ Get number of chunks used by the ChunkedArrays of this DataFrame. diff --git a/py-polars/polars/datatypes/classes.py b/py-polars/polars/datatypes/classes.py index bb538b4f01e8..5543f629a620 100644 --- a/py-polars/polars/datatypes/classes.py +++ b/py-polars/polars/datatypes/classes.py @@ -12,6 +12,7 @@ import polars.functions as F with contextlib.suppress(ImportError): # Module not available when building docs + import polars.polars as plr from polars.polars import dtype_str_repr as _dtype_str_repr if TYPE_CHECKING: @@ -91,7 +92,7 @@ def from_python(cls, py_type: PythonDataType) -> PolarsDataType: # noqa: D102 ... @classmethod - def to_python(self) -> PythonDataType: # noqa: D102 + def to_python(cls) -> PythonDataType: # noqa: D102 ... @@ -238,6 +239,44 @@ def to_python(self) -> PythonDataType: class NumericType(DataType): """Base class for numeric data types.""" + @classmethod + def max(cls) -> pl.Expr: + """ + Return a literal expression representing the maximum value of this data type. + + Examples + -------- + >>> pl.select(pl.Int8.max() == 127) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ bool │ + ╞═════════╡ + │ true │ + └─────────┘ + """ + return pl.Expr._from_pyexpr(plr._get_dtype_max(cls)) + + @classmethod + def min(cls) -> pl.Expr: + """ + Return a literal expression representing the minimum value of this data type. + + Examples + -------- + >>> pl.select(pl.Int8.min() == -128) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ bool │ + ╞═════════╡ + │ true │ + └─────────┘ + """ + return pl.Expr._from_pyexpr(plr._get_dtype_min(cls)) + class IntegerType(NumericType): """Base class for integer data types.""" diff --git a/py-polars/polars/io/database/_executor.py b/py-polars/polars/io/database/_executor.py index 1cbaf4679db9..85401d582fe7 100644 --- a/py-polars/polars/io/database/_executor.py +++ b/py-polars/polars/io/database/_executor.py @@ -511,7 +511,7 @@ def execute( result = cursor_execute(query, *positional_options) # note: some cursors execute in-place, some access results via a property - result = self.cursor if result is None else result + result = self.cursor if (result is None or result is True) else result if self.driver_name == "duckdb": result = result.cursor diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index 4cb11506b3f6..9d3cedb47e85 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -385,10 +385,6 @@ def __and__(self, other: Any) -> Expr: ... 
     def __and__(self, other: Any) -> SelectorType | Expr:
         if is_column(other):
             colname = other.meta.output_name()
-            if self._attrs["name"] == "by_name" and (
-                params := self._attrs["params"]
-            ).get("require_all", True):
-                return by_name(*params["*names"], colname)
             other = by_name(colname)
         if is_selector(other):
             return _selector_proxy_(
@@ -399,6 +395,12 @@ def __and__(self, other: Any) -> SelectorType | Expr:
         else:
             return self.as_expr().__and__(other)

+    def __rand__(self, other: Any) -> Expr:
+        if is_column(other):
+            colname = other.meta.output_name()
+            return by_name(colname) & self
+        return self.as_expr().__rand__(other)
+
     @overload
     def __or__(self, other: SelectorType) -> SelectorType: ...

@@ -417,6 +419,11 @@ def __or__(self, other: Any) -> SelectorType | Expr:
         else:
             return self.as_expr().__or__(other)

+    def __ror__(self, other: Any) -> Expr:
+        if is_column(other):
+            other = by_name(other.meta.output_name())
+        return self.as_expr().__ror__(other)
+
     @overload
     def __xor__(self, other: SelectorType) -> SelectorType: ...

@@ -435,21 +442,6 @@ def __xor__(self, other: Any) -> SelectorType | Expr:
         else:
             return self.as_expr().__or__(other)

-    def __rand__(self, other: Any) -> Expr:
-        if is_column(other):
-            colname = other.meta.output_name()
-            if self._attrs["name"] == "by_name" and (
-                params := self._attrs["params"]
-            ).get("require_all", True):
-                return by_name(colname, *params["*names"])
-            other = by_name(colname)
-        return self.as_expr().__rand__(other)
-
-    def __ror__(self, other: Any) -> Expr:
-        if is_column(other):
-            other = by_name(other.meta.output_name())
-        return self.as_expr().__ror__(other)
-
     def __rxor__(self, other: Any) -> Expr:
         if is_column(other):
             other = by_name(other.meta.output_name())
diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs
index 859609828d19..f73577319545 100644
--- a/py-polars/src/lib.rs
+++ b/py-polars/src/lib.rs
@@ -20,7 +20,7 @@ use polars_python::lazygroupby::PyLazyGroupBy;
 use polars_python::series::PySeries;
 #[cfg(feature = "sql")]
 use polars_python::sql::PySQLContext;
-use polars_python::{exceptions, functions};
+use polars_python::{datatypes, exceptions, functions};
 use pyo3::prelude::*;
 use pyo3::{wrap_pyfunction, wrap_pymodule};

@@ -279,6 +279,12 @@ fn polars(py: Python, m: &Bound<PyModule>) -> PyResult<()> {
     m.add_wrapped(wrap_pyfunction!(functions::escape_regex))
         .unwrap();

+    // Dtype helpers
+    m.add_wrapped(wrap_pyfunction!(datatypes::_get_dtype_max))
+        .unwrap();
+    m.add_wrapped(wrap_pyfunction!(datatypes::_get_dtype_min))
+        .unwrap();
+
     // Exceptions - Errors
     m.add(
         "PolarsError",
diff --git a/py-polars/tests/unit/io/test_lazy_ipc.py b/py-polars/tests/unit/io/test_lazy_ipc.py
index 0d67b6b06f89..ec75d495ce8d 100644
--- a/py-polars/tests/unit/io/test_lazy_ipc.py
+++ b/py-polars/tests/unit/io/test_lazy_ipc.py
@@ -88,6 +88,7 @@ def test_ipc_list_arg(io_files_path: Path) -> None:
     assert df.row(0) == ("vegetables", 45, 0.5, 2)


+@pytest.mark.may_fail_auto_streaming
 def test_scan_ipc_local_with_async(
     capfd: Any,
     monkeypatch: Any,
diff --git a/py-polars/tests/unit/operations/test_is_sorted.py b/py-polars/tests/unit/operations/test_is_sorted.py
index f81076ced502..093dae47bfbf 100644
--- a/py-polars/tests/unit/operations/test_is_sorted.py
+++ b/py-polars/tests/unit/operations/test_is_sorted.py
@@ -384,12 +384,16 @@ def test_with_pd(
     test_with_pd(dfbpd, dfapd, "b", "left", joined)

     joined = dfb.join(dfa, on="b", how="inner")
-    assert not joined["a"].flags["SORTED_ASC"]
+    if (joined["a"] != sorted(joined["a"])).any():
+        assert not joined["a"].flags["SORTED_ASC"]
joined = dfb.join(dfa, on="b", how="semi") - assert not joined["a"].flags["SORTED_ASC"] + if (joined["a"] != sorted(joined["a"])).any(): + assert not joined["a"].flags["SORTED_ASC"] + joined = dfb.join(dfa, on="b", how="anti") - assert not joined["a"].flags["SORTED_ASC"] + if (joined["a"] != sorted(joined["a"])).any(): + assert not joined["a"].flags["SORTED_ASC"] def test_sorted_flag_group_by_dynamic() -> None: diff --git a/py-polars/tests/unit/operations/test_top_k.py b/py-polars/tests/unit/operations/test_top_k.py index debb3e729274..866ef88e6e10 100644 --- a/py-polars/tests/unit/operations/test_top_k.py +++ b/py-polars/tests/unit/operations/test_top_k.py @@ -397,3 +397,37 @@ def test_bottom_k_nulls(s: pl.Series, should_sort: bool) -> None: def test_top_k_descending_deprecated() -> None: with pytest.deprecated_call(): pl.col("a").top_k_by("b", descending=True) # type: ignore[call-arg] + + +def test_top_k_df() -> None: + df = pl.LazyFrame({"a": [3, 4, 1, 2, 5]}) + expected = [5, 4, 3] + assert df.sort("a", descending=True).limit(3).collect()["a"].to_list() == expected + assert df.top_k(3, by="a").collect()["a"].to_list() == expected + expected = [1, 2, 3] + assert df.sort("a", descending=False).limit(3).collect()["a"].to_list() == expected + assert df.bottom_k(3, by="a").collect()["a"].to_list() == expected + + df = pl.LazyFrame({"a": [1, None, None, 4, 5]}) + expected2 = [5, 4, 1, None] + assert ( + df.sort("a", descending=True, nulls_last=True).limit(4).collect()["a"].to_list() + == expected2 + ) + assert df.top_k(4, by="a").collect()["a"].to_list() == expected2 + expected2 = [1, 4, 5, None] + assert ( + df.sort("a", descending=False, nulls_last=True) + .limit(4) + .collect()["a"] + .to_list() + == expected2 + ) + assert df.bottom_k(4, by="a").collect()["a"].to_list() == expected2 + + assert df.sort("a", descending=False, nulls_last=False).limit(4).collect()[ + "a" + ].to_list() == [None, None, 1, 4] + assert df.sort("a", descending=True, nulls_last=False).limit(4).collect()[ + "a" + ].to_list() == [None, None, 5, 4] diff --git a/py-polars/tests/unit/sql/test_conditional.py b/py-polars/tests/unit/sql/test_conditional.py index b2000ebe37b1..3a80c1234aff 100644 --- a/py-polars/tests/unit/sql/test_conditional.py +++ b/py-polars/tests/unit/sql/test_conditional.py @@ -36,6 +36,24 @@ def test_case_when() -> None: } +@pytest.mark.parametrize("else_clause", ["ELSE NULL ", ""]) +def test_case_when_optional_else(else_clause: str) -> None: + df = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [7, 6, 5, 4, 3, 2, 1], + "c": [3, 4, 0, 3, 4, 1, 1], + } + ) + query = f""" + SELECT + AVG(CASE WHEN a <= b THEN c {else_clause}END) AS conditional_mean + FROM self + """ + res = df.sql(query) + assert res.to_dict(as_series=False) == {"conditional_mean": [2.5]} + + def test_control_flow(foods_ipc_path: Path) -> None: nums = pl.LazyFrame( { diff --git a/py-polars/tests/unit/sql/test_joins.py b/py-polars/tests/unit/sql/test_joins.py index d25610eb6763..c423fc4c45f4 100644 --- a/py-polars/tests/unit/sql/test_joins.py +++ b/py-polars/tests/unit/sql/test_joins.py @@ -663,3 +663,26 @@ def test_nested_join(join_clause: str) -> None: "Species": "Human", }, ] + + +def test_join_nulls_19624() -> None: + df1 = pl.DataFrame({"a": [1, 2, None, None]}) + df2 = pl.DataFrame({"a": [1, 1, 2, 2, None], "b": [0, 1, 2, 3, 4]}) + + # left join + result_df = df1.join(df2, how="left", on="a", join_nulls=False, validate="1:m") + expected_df = pl.DataFrame( + {"a": [1, 1, 2, 2, None, None], "b": [0, 1, 2, 3, None, None]} + ) 
+ assert_frame_equal(result_df, expected_df) + result_df = df2.join(df1, how="left", on="a", join_nulls=False, validate="m:1") + expected_df = pl.DataFrame({"a": [1, 1, 2, 2, None], "b": [0, 1, 2, 3, 4]}) + assert_frame_equal(result_df, expected_df) + + # inner join + result_df = df1.join(df2, how="inner", on="a", join_nulls=False, validate="1:m") + expected_df = pl.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3]}) + assert_frame_equal(result_df, expected_df) + result_df = df2.join(df1, how="inner", on="a", join_nulls=False, validate="m:1") + expected_df = pl.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3]}) + assert_frame_equal(result_df, expected_df) diff --git a/py-polars/tests/unit/test_datatypes.py b/py-polars/tests/unit/test_datatypes.py index 4d604f2964e9..ed4b8cd1dd61 100644 --- a/py-polars/tests/unit/test_datatypes.py +++ b/py-polars/tests/unit/test_datatypes.py @@ -202,3 +202,28 @@ def test_struct_field_iter() -> None: def test_raise_invalid_namespace() -> None: with pytest.raises(pl.exceptions.InvalidOperationError): pl.select(pl.lit(1.5).str.replace("1", "2")) + + +@pytest.mark.parametrize( + ("dtype", "lower", "upper"), + [ + (pl.Int8, -128, 127), + (pl.UInt8, 0, 255), + (pl.Int16, -32768, 32767), + (pl.UInt16, 0, 65535), + (pl.Int32, -2147483648, 2147483647), + (pl.UInt32, 0, 4294967295), + (pl.Int64, -9223372036854775808, 9223372036854775807), + (pl.UInt64, 0, 18446744073709551615), + (pl.Float32, float("-inf"), float("inf")), + (pl.Float64, float("-inf"), float("inf")), + ], +) +def test_max_min( + dtype: datatypes.IntegerType | datatypes.Float32 | datatypes.Float64, + upper: int | float, + lower: int | float, +) -> None: + df = pl.select(min=dtype.min(), max=dtype.max()) + assert df.to_series(0).item() == lower + assert df.to_series(1).item() == upper diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index 78a277a3662f..a8f9e43d84c0 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -246,3 +246,35 @@ def test_lf_agg_lit_explode() -> None: schema = {"k": pl.Int64, "o": pl.List(pl.Int64)} assert q.collect_schema() == schema assert_frame_equal(q.collect(), pl.DataFrame({"k": 1, "o": [[1]]}, schema=schema)) # type: ignore[arg-type] + + +@pytest.mark.parametrize("expr_op", [ + "approx_n_unique", "arg_max", "arg_min", "bitwise_and", "bitwise_or", + "bitwise_xor", "count", "entropy", "first", "has_nulls", "implode", "kurtosis", + "last", "len", "lower_bound", "max", "mean", "median", "min", "n_unique", "nan_max", + "nan_min", "null_count", "product", "sample", "skew", "std", "sum", "upper_bound", + "var" +]) # fmt: skip +def test_lf_agg_auto_agg_list_19752(expr_op: str) -> None: + op = getattr(pl.Expr, expr_op) + + lf = pl.LazyFrame({"a": 1, "b": 1}) + + q = lf.group_by("a").agg(pl.col("b").reverse().pipe(op)) + assert q.collect_schema() == q.collect().collect_schema() + + q = lf.group_by("a").agg(pl.col("b").shuffle().reverse().pipe(op)) + + assert q.collect_schema() == q.collect().collect_schema() + + +@pytest.mark.parametrize( + "expr", [pl.col("b"), pl.col("b").sum(), pl.col("b").reverse()] +) +@pytest.mark.parametrize("mapping_strategy", ["explode", "join", "group_to_rows"]) +def test_lf_window_schema(expr: pl.Expr, mapping_strategy: str) -> None: + q = pl.LazyFrame({"a": 1, "b": 1}).select( + expr.over("a", mapping_strategy=mapping_strategy) # type: ignore[arg-type] + ) + + assert q.collect_schema() == q.collect().collect_schema() diff --git a/py-polars/tests/unit/test_selectors.py 
b/py-polars/tests/unit/test_selectors.py index dd2c415c9a13..f4e29e9194c6 100644 --- a/py-polars/tests/unit/test_selectors.py +++ b/py-polars/tests/unit/test_selectors.py @@ -182,11 +182,17 @@ def test_selector_by_name(df: pl.DataFrame) -> None: # check "by_name & col" for selector_expr, expected in ( - (cs.by_name("abc", "cde") & pl.col("ghi"), ["abc", "cde", "ghi"]), - (pl.col("ghi") & cs.by_name("cde", "abc"), ["ghi", "cde", "abc"]), + (cs.by_name("abc", "cde") & pl.col("ghi"), []), + (cs.by_name("abc", "cde") & pl.col("cde"), ["cde"]), + (pl.col("cde") & cs.by_name("cde", "abc"), ["cde"]), ): assert df.select(selector_expr).columns == expected + # check "by_name & by_name" + assert df.select( + cs.by_name("abc", "cde", "def", "eee") & cs.by_name("cde", "eee", "fgg") + ).columns == ["cde", "eee"] + # expected errors with pytest.raises(ColumnNotFoundError, match="xxx"): df.select(cs.by_name("xxx", "fgg", "!!!"))
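Note on the selectors.py change above: `&` between a `by_name` selector and a plain column is now a true set intersection in both operand orders (via the new `__rand__`), rather than appending the column to the selection. A minimal sketch of the resulting behaviour, mirroring the updated expectations in test_selectors.py; the frame contents are illustrative:

import polars as pl
import polars.selectors as cs

df = pl.DataFrame({"abc": [1], "cde": [2], "ghi": [3]})

# A column outside the `by_name` set now drops out of the selection...
assert df.select(cs.by_name("abc", "cde") & pl.col("ghi")).columns == []

# ...while an overlapping column is kept, in either operand order
# (the right-hand form exercises the new `__rand__`).
assert df.select(cs.by_name("abc", "cde") & pl.col("cde")).columns == ["cde"]
assert df.select(pl.col("cde") & cs.by_name("cde", "abc")).columns == ["cde"]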
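Note on the new dtype bound helpers: per their docstrings, `NumericType.min()`/`.max()` (backed by `_get_dtype_min`/`_get_dtype_max`) return literal expressions, so they compose with other expressions instead of only materialising scalars. A small sketch under that assumption; the column name and the use of `Expr.clip` are illustrative, not part of this diff:

import polars as pl

df = pl.DataFrame({"x": [-300, 0, 300]})

# The bounds are expressions, so they can parameterise other expressions;
# here they clamp a wider integer column to the Int8 value range.
out = df.select(pl.col("x").clip(pl.Int8.min(), pl.Int8.max()))
print(out["x"].to_list())  # expected: [-128, 0, 127]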