From a5ac93c50532e19e265df77ae0f916997e5c35c3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 26 May 2026 07:28:41 -0600 Subject: [PATCH 1/6] chore(deps): upgrade to DataFusion 54 Bumps the DataFusion stack to 54.0.0 (pinned to apache/datafusion branch-54 commit 1321d60 until 54.0.0 is published to crates.io), and adapts ballista to the breaking API changes: * `arrow`/`arrow-flight`/`parquet` -> 58.3, `object_store` -> 0.13.2. * `rustyline` -> 18.0.0 in ballista-cli to match datafusion-cli. * Drop the `fn as_any(&self) -> &dyn Any` method from every `ExecutionPlan`/`TableProvider` impl. The trait method was removed in DataFusion 54; downcasting now uses the `dyn ExecutionPlan::is`/`downcast_ref` helpers introduced in the same release. * Update `ExecutionPlan::partition_statistics` to return `Result<Arc<Statistics>>` instead of `Result<Statistics>`. * Adapt to the new `PhysicalPlanDecodeContext` parameter on `parse_protobuf_partitioning` / `parse_protobuf_hash_partitioning`. * `BatchPartitioner::new_hash_partitioner` now returns `Result`; propagate the error. * `TaskContext::new` gained a `higher_order_functions` argument and `FunctionRegistry` gained `higher_order_function`/`higher_order_function_names`; wire both with empty defaults in `BallistaFunctionRegistry`. 
Closes #1776 --- Cargo.lock | 514 ++++++++---------- Cargo.toml | 21 +- ballista-cli/Cargo.toml | 2 +- ballista/core/src/diagram.rs | 40 +- .../distributed_explain_analyze.rs | 7 - .../src/execution_plans/distributed_query.rs | 6 - .../src/execution_plans/shuffle_reader.rs | 13 +- .../src/execution_plans/shuffle_writer.rs | 9 +- .../execution_plans/sort_shuffle/writer.rs | 10 +- .../src/execution_plans/unresolved_shuffle.rs | 5 - ballista/core/src/planner.rs | 2 - ballista/core/src/registry.rs | 15 +- ballista/core/src/serde/mod.rs | 30 +- .../core/src/serde/scheduler/from_proto.rs | 2 + ballista/executor/src/collect.rs | 8 +- ballista/executor/src/execution_engine.rs | 6 +- ballista/executor/src/execution_loop.rs | 2 + ballista/executor/src/executor.rs | 5 - ballista/executor/src/executor_server.rs | 1 + .../src/physical_optimizer/join_selection.rs | 109 ++-- ballista/scheduler/src/planner.rs | 61 +-- ballista/scheduler/src/state/aqe/adapter.rs | 7 +- .../src/state/aqe/execution_plan/adaptive.rs | 5 - .../src/state/aqe/execution_plan/exchange.rs | 13 +- .../aqe/optimizer_rule/coalesce_partitions.rs | 9 +- .../optimizer_rule/distributed_exchange.rs | 127 +---- .../aqe/optimizer_rule/propagate_empty.rs | 20 +- ballista/scheduler/src/state/aqe/planner.rs | 17 +- .../src/state/aqe/test/alter_stages.rs | 9 +- .../src/state/aqe/test/plan_to_stages.rs | 6 +- .../src/state/distributed_explain.rs | 2 +- .../scheduler/src/state/execution_graph.rs | 16 +- .../src/state/execution_graph_dot.rs | 61 +-- .../scheduler/src/state/execution_stage.rs | 4 +- ballista/scheduler/src/test_utils.rs | 5 - 35 files changed, 468 insertions(+), 701 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4c8b3767ad..78595b5303 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -132,35 +132,6 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" -[[package]] -name = "apache-avro" -version = 
"0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36fa98bc79671c7981272d91a8753a928ff6a1cd8e4f20a44c45bd5d313840bf" -dependencies = [ - "bigdecimal", - "bon", - "bzip2", - "crc32fast", - "digest 0.10.7", - "liblzma", - "log", - "miniz_oxide", - "num-bigint", - "quad-rand", - "rand 0.9.4", - "regex-lite", - "serde", - "serde_bytes", - "serde_json", - "snap", - "strum", - "strum_macros", - "thiserror 2.0.18", - "uuid", - "zstd", -] - [[package]] name = "ar_archive_writer" version = "0.5.1" @@ -190,9 +161,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d441fdda254b65f3e9025910eb2c2066b6295d9c8ed409522b8d2ace1ff8574c" +checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" dependencies = [ "arrow-arith", "arrow-array", @@ -211,9 +182,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59" +checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" dependencies = [ "arrow-array", "arrow-buffer", @@ -225,9 +196,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d" +checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash", "arrow-buffer", @@ -236,17 +207,41 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.16.1", + "hashbrown 0.17.1", "num-complex", "num-integer", "num-traits", ] +[[package]] +name = "arrow-avro" +version = "58.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "049230728cd6e093088c8d231b4beede184e35cad7777c1505c0d5a8571f4376" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "bytes", + "bzip2", + "crc", + "flate2", + "indexmap 2.14.0", + "liblzma", + "rand 0.9.4", + "serde", + "serde_json", + "snap", + "strum_macros 0.28.0", + "uuid", + "zstd", +] + [[package]] name = "arrow-buffer" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "898f4cf1e9598fdb77f356fdf2134feedfd0ee8d5a4e0a5f573e7d0aec16baa4" +checksum = "0c6cd424c2693bcdbc150d843dc9d4d137dd2de4782ce6df491ad11a3a0416c0" dependencies = [ "bytes", "half", @@ -256,9 +251,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d" +checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" dependencies = [ "arrow-array", "arrow-buffer", @@ -278,9 +273,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca025bd0f38eeecb57c2153c0123b960494138e6a957bbda10da2b25415209fe" +checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" dependencies = [ "arrow-array", "arrow-cast", @@ -293,9 +288,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4" +checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ "arrow-buffer", "arrow-schema", @@ -306,9 +301,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "58.1.0" +version = "58.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "302b2e036335f3f04d65dad3f74ff1f2aae6dc671d6aa04dc6b61193761e16fb" +checksum = "28abfe8bf9f124e5fc83b334af4fa58f8d0323ad25312ccb2d1da50178415704" dependencies = [ "arrow-arith", "arrow-array", @@ -334,9 +329,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0" +checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ "arrow-array", "arrow-buffer", @@ -350,15 +345,16 @@ dependencies = [ [[package]] name = "arrow-json" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ead0914e4861a531be48fe05858265cf854a4880b9ed12618b1d08cba9bebc8" +checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" dependencies = [ "arrow-array", "arrow-buffer", "arrow-cast", - "arrow-data", + "arrow-ord", "arrow-schema", + "arrow-select", "chrono", "half", "indexmap 2.14.0", @@ -374,9 +370,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5" +checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" dependencies = [ "arrow-array", "arrow-buffer", @@ -387,9 +383,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14fe367802f16d7668163ff647830258e6e0aeea9a4d79aaedf273af3bdcd3e" +checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" dependencies = [ "arrow-array", "arrow-buffer", @@ -400,9 +396,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "58.1.0" +version = 
"58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743" +checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" dependencies = [ "serde_core", "serde_json", @@ -410,9 +406,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe" +checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash", "arrow-array", @@ -424,9 +420,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e" +checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" dependencies = [ "arrow-array", "arrow-buffer", @@ -564,7 +560,7 @@ dependencies = [ "fastrand", "hex", "http 1.4.0", - "sha1", + "sha1 0.10.6", "time", "tokio", "tracing", @@ -1212,7 +1208,6 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", - "serde", ] [[package]] @@ -1357,31 +1352,6 @@ dependencies = [ "time", ] -[[package]] -name = "bon" -version = "3.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" -dependencies = [ - "bon-macros", - "rustversion", -] - -[[package]] -name = "bon-macros" -version = "3.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" -dependencies = [ - "darling 0.23.0", - "ident_case", - "prettyplease", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.117", -] - [[package]] name = "brotli" version = "8.0.2" @@ -1801,6 +1771,21 @@ dependencies = [ "libc", ] 
+[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853" + [[package]] name = "crc32fast" version = "1.5.0" @@ -2067,14 +2052,12 @@ dependencies = [ [[package]] name = "datafusion" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "arrow-schema", "async-trait", - "bytes", "bzip2", "chrono", "datafusion-catalog", @@ -2105,14 +2088,13 @@ dependencies = [ "datafusion-sql", "flate2", "futures", + "indexmap 2.14.0", "itertools 0.14.0", "liblzma", "log", "object_store", "parking_lot", "parquet", - "rand 0.9.4", - "regex", "sqlparser", "tempfile", "tokio", @@ -2123,9 +2105,8 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "async-trait", @@ -2148,9 +2129,8 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" +version = "54.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "async-trait", @@ -2171,9 +2151,8 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84a22c001ad1ac11cda09dab69b151eef5b1a992e23bc524ab0d1e63e5dea327" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "async-trait", @@ -2199,17 +2178,16 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ - "ahash", - "apache-avro", "arrow", "arrow-ipc", + "arrow-schema", "chrono", + "foldhash 0.2.0", "half", - "hashbrown 0.16.1", + "hashbrown 0.17.1", "hex", "indexmap 2.14.0", "itertools 0.14.0", @@ -2217,18 +2195,17 @@ dependencies = [ "log", "object_store", "parquet", - "paste", "recursive", "sqlparser", "tokio", + "uuid", "web-time", ] [[package]] name = "datafusion-common-runtime" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "futures", "log", @@ -2237,9 +2214,8 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "async-compression", @@ -2263,6 +2239,7 @@ dependencies = [ "liblzma", "log", "object_store", + "parking_lot", "rand 0.9.4", "tokio", "tokio-util", @@ -2272,9 +2249,8 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "arrow-ipc", @@ -2296,29 +2272,26 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a579c3bd290c66ea4b269493e75e8a3ed42c9c895a651f10210a29538aee50c4" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ - "apache-avro", "arrow", + "arrow-avro", "async-trait", "bytes", "datafusion-common", "datafusion-datasource", - "datafusion-physical-expr-common", + "datafusion-physical-expr-adapter", "datafusion-physical-plan", "datafusion-session", "futures", - "num-traits", "object_store", ] [[package]] name = "datafusion-datasource-csv" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "async-trait", @@ 
-2339,9 +2312,8 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "async-trait", @@ -2356,16 +2328,14 @@ dependencies = [ "datafusion-session", "futures", "object_store", - "serde_json", "tokio", "tokio-stream", ] [[package]] name = "datafusion-datasource-parquet" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "async-trait", @@ -2375,6 +2345,7 @@ dependencies = [ "datafusion-datasource", "datafusion-execution", "datafusion-expr", + "datafusion-functions", "datafusion-functions-aggregate-common", "datafusion-physical-expr", "datafusion-physical-expr-adapter", @@ -2393,20 +2364,17 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" [[package]] name = "datafusion-execution" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" +version = "54.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "arrow-buffer", "async-trait", - "chrono", "dashmap", "datafusion-common", "datafusion-expr", @@ -2423,11 +2391,11 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", + "arrow-schema", "async-trait", "chrono", "datafusion-common", @@ -2438,7 +2406,6 @@ dependencies = [ "datafusion-physical-expr-common", "indexmap 2.14.0", "itertools 0.14.0", - "paste", "recursive", "serde_json", "sqlparser", @@ -2446,22 +2413,19 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "datafusion-common", "indexmap 2.14.0", "itertools 0.14.0", - "paste", ] [[package]] name = "datafusion-functions" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "arrow-buffer", @@ -2476,26 +2440,24 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", + "datafusion-physical-expr-common", "hex", "itertools 
0.14.0", "log", - "md-5 0.10.6", + "md-5 0.11.0", "memchr", "num-traits", "rand 0.9.4", "regex", - "sha2 0.10.9", - "unicode-segmentation", + "sha2 0.11.0", "uuid", ] [[package]] name = "datafusion-functions-aggregate" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ - "ahash", "arrow", "datafusion-common", "datafusion-doc", @@ -2505,19 +2467,17 @@ dependencies = [ "datafusion-macros", "datafusion-physical-expr", "datafusion-physical-expr-common", + "foldhash 0.2.0", "half", "log", "num-traits", - "paste", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ - "ahash", "arrow", "datafusion-common", "datafusion-expr-common", @@ -2526,9 +2486,8 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "arrow-ord", @@ -2542,34 +2501,32 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-macros", "datafusion-physical-expr-common", - "hashbrown 0.16.1", + "hashbrown 0.17.1", "itertools 0.14.0", "itoa", "log", - "paste", + "memchr", ] [[package]] name = 
"datafusion-functions-table" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "async-trait", "datafusion-catalog", "datafusion-common", "datafusion-expr", + "datafusion-physical-expr", "datafusion-physical-plan", "parking_lot", - "paste", ] [[package]] name = "datafusion-functions-window" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "datafusion-common", @@ -2580,14 +2537,12 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "log", - "paste", ] [[package]] name = "datafusion-functions-window-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2595,9 +2550,8 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "datafusion-doc", 
"quote", @@ -2606,9 +2560,8 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "chrono", @@ -2626,11 +2579,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ - "ahash", "arrow", "datafusion-common", "datafusion-expr", @@ -2638,11 +2589,10 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown 0.16.1", + "hashbrown 0.17.1", "indexmap 2.14.0", "itertools 0.14.0", "parking_lot", - "paste", "petgraph", "recursive", "tokio", @@ -2650,9 +2600,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "datafusion-common", @@ -2665,26 +2614,24 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" +version = "54.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ - "ahash", "arrow", "chrono", "datafusion-common", "datafusion-expr-common", - "hashbrown 0.16.1", + "hashbrown 0.17.1", "indexmap 2.14.0", "itertools 0.14.0", "parking_lot", + "pin-project", ] [[package]] name = "datafusion-physical-optimizer" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "datafusion-common", @@ -2701,12 +2648,12 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ - "ahash", "arrow", + "arrow-data", + "arrow-ipc", "arrow-ord", "arrow-schema", "async-trait", @@ -2721,7 +2668,7 @@ dependencies = [ "datafusion-physical-expr-common", "futures", "half", - "hashbrown 0.16.1", + "hashbrown 0.17.1", "indexmap 2.14.0", "itertools 0.14.0", "log", @@ -2733,9 +2680,8 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a387aaef949dc16bb6abc81bd1af850ec7449183aef011214f9724957495738" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "chrono", @@ -2756,14 +2702,12 @@ dependencies = [ "datafusion-proto-common", 
"object_store", "prost", - "rand 0.9.4", ] [[package]] name = "datafusion-proto-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16e614c7c53a9c304c6a850b821010bb492e57300311835f1180613f9d2c63d9" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "datafusion-common", @@ -2772,9 +2716,8 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "datafusion-common", @@ -2783,15 +2726,13 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "itertools 0.14.0", "log", ] [[package]] name = "datafusion-session" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "async-trait", "datafusion-common", @@ -2803,9 +2744,8 @@ dependencies = [ [[package]] name = "datafusion-spark" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e059dcf8544da0d6598d0235be3cc29c209094a5976b2e4822e4a2cf91c2b5c5" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "bigdecimal", @@ -2818,21 +2758,23 @@ dependencies = [ 
"datafusion-expr", "datafusion-functions", "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", "datafusion-functions-nested", "log", + "num-traits", "percent-encoding", "rand 0.9.4", "serde_json", - "sha1", - "sha2 0.10.9", + "sha1 0.11.0", + "sha2 0.11.0", + "twox-hash", "url", ] [[package]] name = "datafusion-sql" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "arrow", "bigdecimal", @@ -2849,9 +2791,8 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98494539a5468979cc42d86c7bc5f0f8cb71ee5c742694c26fc34efdd29dd2e5" +version = "54.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1321d60cc37ee487d1e7ce7f501357c3236b2542#1321d60cc37ee487d1e7ce7f501357c3236b2542" dependencies = [ "async-recursion", "async-trait", @@ -3072,9 +3013,9 @@ dependencies = [ [[package]] name = "endian-type" -version = "0.1.2" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" +checksum = "869b0adbda23651a9c5c0c3d270aac9fcb52e8622a8f2b17e57802d7791962f2" [[package]] name = "env_filter" @@ -3156,17 +3097,6 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" -[[package]] -name = "fd-lock" -version = "4.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" -dependencies = [ - "cfg-if", - "rustix 1.1.4", - "windows-sys 0.59.0", -] - 
[[package]] name = "ferroid" version = "2.0.0" @@ -3516,9 +3446,14 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.17.0" +version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] name = "hashlink" @@ -3913,7 +3848,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.17.0", + "hashbrown 0.17.1", "serde", "serde_core", ] @@ -4503,9 +4438,9 @@ dependencies = [ [[package]] name = "nix" -version = "0.30.1" +version = "0.31.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" +checksum = "cf20d2fde8ff38632c426f1165ed7436270b44f199fc55284c38276f9db47c3d" dependencies = [ "bitflags 2.11.1", "cfg-if", @@ -4563,7 +4498,6 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", - "serde", ] [[package]] @@ -4798,9 +4732,9 @@ dependencies = [ [[package]] name = "parquet" -version = "58.1.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877" +checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash", "arrow-array", @@ -4816,7 +4750,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.16.1", + "hashbrown 0.17.1", "lz4_flex", "num-bigint", "num-integer", @@ -5355,12 +5289,6 @@ dependencies = [ "pulldown-cmark", ] -[[package]] -name = "quad-rand" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" - [[package]] name = "quick-xml" version = "0.39.2" @@ -5450,9 +5378,9 @@ checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name = "radix_trie" -version = "0.2.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" +checksum = "3b4431027dcd37fc2a73ef740b5f233aa805897935b8bce0195e41bbf9a3289a" dependencies = [ "endian-type", "nibble_vec", @@ -5995,24 +5923,23 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "rustyline" -version = "17.0.2" +version = "18.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" +checksum = "4a990b25f351b25139ddc7f21ee3f6f56f86d6846b74ac8fad3a719a287cd4a0" dependencies = [ "bitflags 2.11.1", "cfg-if", "clipboard-win", - "fd-lock", "home", "libc", "log", "memchr", - "nix 0.30.1", + "nix 0.31.3", "radix_trie", "unicode-segmentation", "unicode-width 0.2.2", "utf8parse", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -6142,16 +6069,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde_bytes" -version = "0.11.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" -dependencies = [ - "serde", - "serde_core", -] - [[package]] name = "serde_core" version = "1.0.228" @@ -6189,6 +6106,7 @@ version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" dependencies = [ + "indexmap 2.14.0", "itoa", "memchr", "serde", @@ -6297,6 +6215,17 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "sha1" +version = "0.11.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "aacc4cc499359472b4abe1bf11d0b12e688af9a805fa5e3016f9a386dc2d0214" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.2", +] + [[package]] name = "sha2" version = "0.10.9" @@ -6419,9 +6348,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.61.0" +version = "0.62.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" +checksum = "13c6d1b651dc4edf07eead2a0c6c78016ce971bc2c10da5266861b13f25e7cec" dependencies = [ "log", "recursive", @@ -6523,7 +6452,7 @@ version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" dependencies = [ - "strum_macros", + "strum_macros 0.27.2", ] [[package]] @@ -6538,13 +6467,26 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "strum_macros" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "substrait" -version = "0.62.2" +version = "0.63.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62fc4b483a129b9772ccb9c3f7945a472112fdd9140da87f8a4e7f1d44e045d0" +checksum = "e620ff4d5c02fd6f7752931aa74b16a26af66a63022cc1ad412c77edbe0bab47" dependencies = [ "heck 0.5.0", + "indexmap 2.14.0", "pbjson", "pbjson-build", "pbjson-types", @@ -7224,6 +7166,9 @@ name = "twox-hash" version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" +dependencies = [ + "rand 0.9.4", +] [[package]] name = "typenum" @@ -7416,7 +7361,6 @@ dependencies = [ "atomic", "getrandom 0.4.2", "js-sys", - "serde_core", "wasm-bindgen", ] 
diff --git a/Cargo.toml b/Cargo.toml index 99d2b85932..c778a192b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,19 +26,22 @@ edition = "2024" rust-version = "1.88.0" [workspace.dependencies] -arrow = { version = "58", features = ["ipc_compression"] } -arrow-flight = { version = "58", features = ["flight-sql-experimental"] } +arrow = { version = "58.3", features = ["ipc_compression"] } +arrow-flight = { version = "58.3", features = ["flight-sql-experimental"] } clap = { version = "4.5", features = ["derive", "cargo"] } -datafusion = "53" -datafusion-cli = "53" -datafusion-proto = "53" -datafusion-proto-common = "53" -datafusion-spark = "53" -datafusion-substrait = "53" +# DataFusion 54 has not been published to crates.io yet, so we pin to a commit +# on apache/datafusion `branch-54`. Switch back to the published `"54"` version +# once it is released. +datafusion = { git = "https://github.com/apache/datafusion.git", rev = "1321d60cc37ee487d1e7ce7f501357c3236b2542" } +datafusion-cli = { git = "https://github.com/apache/datafusion.git", rev = "1321d60cc37ee487d1e7ce7f501357c3236b2542" } +datafusion-proto = { git = "https://github.com/apache/datafusion.git", rev = "1321d60cc37ee487d1e7ce7f501357c3236b2542" } +datafusion-proto-common = { git = "https://github.com/apache/datafusion.git", rev = "1321d60cc37ee487d1e7ce7f501357c3236b2542" } +datafusion-spark = { git = "https://github.com/apache/datafusion.git", rev = "1321d60cc37ee487d1e7ce7f501357c3236b2542" } +datafusion-substrait = { git = "https://github.com/apache/datafusion.git", rev = "1321d60cc37ee487d1e7ce7f501357c3236b2542" } insta = "1.47" -object_store = "0.13" +object_store = "0.13.2" prost = "0.14" prost-types = "0.14" rstest = { version = "0.26" } diff --git a/ballista-cli/Cargo.toml b/ballista-cli/Cargo.toml index cdfe436b98..385ab9ac68 100644 --- a/ballista-cli/Cargo.toml +++ b/ballista-cli/Cargo.toml @@ -44,7 +44,7 @@ percent-encoding = { version = "2.3.2", optional = true } prometheus-parse = { version = 
"0.2", optional = true } ratatui = { version = "0.30.0", optional = true } reqwest = { version = "0.13.3", features = ["json"], optional = true } -rustyline = "17.0.1" +rustyline = "18.0.0" serde = { version = "1", features = ["derive"], optional = true } serde_json = { version = "1", optional = true } tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread", "sync", "time", "parking_lot"] } diff --git a/ballista/core/src/diagram.rs b/ballista/core/src/diagram.rs index 5498c1161c..cbd148c85b 100644 --- a/ballista/core/src/diagram.rs +++ b/ballista/core/src/diagram.rs @@ -84,43 +84,27 @@ fn build_exec_plan_diagram( id: &mut AtomicUsize, draw_entity: bool, ) -> Result { - let operator_str = if plan.as_any().downcast_ref::().is_some() { + let operator_str = if plan.is::() { "AggregateExec" - } else if plan.as_any().downcast_ref::().is_some() { + } else if plan.is::() { "SortExec" - } else if plan.as_any().downcast_ref::().is_some() { + } else if plan.is::() { "ProjectionExec" - } else if plan.as_any().downcast_ref::().is_some() { + } else if plan.is::() { "HashJoinExec" - } else if plan.as_any().downcast_ref::().is_some() { + } else if plan.is::() { "DataSourceExec" - } else if plan.as_any().downcast_ref::().is_some() { + } else if plan.is::() { "FilterExec" - } else if plan.as_any().downcast_ref::().is_some() { + } else if plan.is::() { "ShuffleWriterExec" - } else if plan - .as_any() - .downcast_ref::() - .is_some() - { + } else if plan.is::() { "SortShuffleWriterExec" - } else if plan - .as_any() - .downcast_ref::() - .is_some() - { + } else if plan.is::() { "UnresolvedShuffleExec" - } else if plan - .as_any() - .downcast_ref::() - .is_some() - { + } else if plan.is::() { "CoalesceBatchesExec" - } else if plan - .as_any() - .downcast_ref::() - .is_some() - { + } else if plan.is::() { "CoalescePartitionsExec" } else { warn!("Unknown: {plan:?}"); @@ -137,7 +121,7 @@ fn build_exec_plan_diagram( )?; } for child in plan.children() { - if let 
Some(shuffle) = child.as_any().downcast_ref::() { + if let Some(shuffle) = child.downcast_ref::() { if !draw_entity { writeln!( w, diff --git a/ballista/core/src/execution_plans/distributed_explain_analyze.rs b/ballista/core/src/execution_plans/distributed_explain_analyze.rs index 0a7ddabb36..7508847a15 100644 --- a/ballista/core/src/execution_plans/distributed_explain_analyze.rs +++ b/ballista/core/src/execution_plans/distributed_explain_analyze.rs @@ -36,7 +36,6 @@ use datafusion::physical_plan::{ }; use datafusion_proto::logical_plan::AsLogicalPlan; use futures::StreamExt; -use std::any::Any; use std::convert::TryInto; use std::marker::PhantomData; use std::sync::Arc; @@ -107,10 +106,6 @@ impl ExecutionPlan for DistributedExplainAnalyzeExec "DistributedExplainAnalyzeExec" } - fn as_any(&self) -> &dyn Any { - self - } - fn properties(&self) -> &Arc { &self.properties } @@ -132,7 +127,6 @@ impl ExecutionPlan for DistributedExplainAnalyzeExec let query_exec = children.pop().unwrap(); if query_exec - .as_any() .downcast_ref::>() .is_some() { @@ -172,7 +166,6 @@ impl ExecutionPlan for DistributedExplainAnalyzeExec } let job_id = query_exec - .as_any() .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal( diff --git a/ballista/core/src/execution_plans/distributed_query.rs b/ballista/core/src/execution_plans/distributed_query.rs index 2ef4521f67..223e6cfa00 100644 --- a/ballista/core/src/execution_plans/distributed_query.rs +++ b/ballista/core/src/execution_plans/distributed_query.rs @@ -48,7 +48,6 @@ use datafusion_proto::logical_plan::{ use futures::{Stream, StreamExt, TryFutureExt, TryStreamExt}; use log::{debug, error, info}; use parking_lot::Mutex; -use std::any::Any; use std::fmt::Debug; use std::marker::PhantomData; use std::sync::Arc; @@ -176,10 +175,6 @@ impl ExecutionPlan for DistributedQueryExec { "DistributedQueryExec" } - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { self.plan.schema().as_arrow().clone().into() } @@ 
-839,7 +834,6 @@ mod test { let new_exec = exec.clone().with_new_children(vec![]).unwrap(); let new_exec = new_exec - .as_any() .downcast_ref::>() .unwrap(); diff --git a/ballista/core/src/execution_plans/shuffle_reader.rs b/ballista/core/src/execution_plans/shuffle_reader.rs index 8311c2a6a0..5b80c186dd 100644 --- a/ballista/core/src/execution_plans/shuffle_reader.rs +++ b/ballista/core/src/execution_plans/shuffle_reader.rs @@ -47,7 +47,6 @@ use itertools::Itertools; use log::{debug, error, trace}; use rand::prelude::SliceRandom; use rand::rng; -use std::any::Any; use std::collections::HashMap; use std::fmt::Debug; use std::fs::File; @@ -311,10 +310,6 @@ impl ExecutionPlan for ShuffleReaderExec { "ShuffleReaderExec" } - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { self.schema.clone() } @@ -426,7 +421,7 @@ impl ExecutionPlan for ShuffleReaderExec { Some(self.metrics.clone_inner()) } - fn partition_statistics(&self, partition: Option) -> Result { + fn partition_statistics(&self, partition: Option) -> Result> { if self.broadcast { if let Some(idx) = partition && idx != 0 @@ -445,7 +440,7 @@ impl ExecutionPlan for ShuffleReaderExec { "broadcast shuffle reader at stage {} returned aggregated statistics: {:?}", self.stage_id, stats ); - return Ok(stats); + return Ok(Arc::new(stats)); } if let Some(idx) = partition { let partition_count = self.properties().partitioning.partition_count(); @@ -474,7 +469,7 @@ impl ExecutionPlan for ShuffleReaderExec { "shuffle reader at stage: {} and partition {} returned statistics: {:?}", self.stage_id, idx, stat_for_partition ); - stat_for_partition + stat_for_partition.map(Arc::new) } else { let stats_for_partitions = stats_for_partitions( self.schema.fields().len(), @@ -487,7 +482,7 @@ impl ExecutionPlan for ShuffleReaderExec { "shuffle reader at stage: {} returned statistics for all partitions: {:?}", self.stage_id, stats_for_partitions ); - Ok(stats_for_partitions) + Ok(Arc::new(stats_for_partitions)) 
} } } diff --git a/ballista/core/src/execution_plans/shuffle_writer.rs b/ballista/core/src/execution_plans/shuffle_writer.rs index a852b91fc5..0e68c42393 100644 --- a/ballista/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/core/src/execution_plans/shuffle_writer.rs @@ -24,7 +24,6 @@ use datafusion::arrow::ipc::CompressionType; use datafusion::arrow::ipc::writer::IpcWriteOptions; use datafusion::arrow::ipc::writer::StreamWriter; -use std::any::Any; use std::fmt::Debug; use std::fs; use std::fs::File; @@ -285,7 +284,7 @@ impl ShuffleWriterExec { exprs, num_output_partitions, repart_time, - ); + )?; while let Some(input_batch) = rx.blocking_recv() { partitioner.partition( @@ -437,10 +436,6 @@ impl ExecutionPlan for ShuffleWriterExec { "ShuffleWriterExec" } - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { self.plan.schema() } @@ -560,7 +555,7 @@ impl ExecutionPlan for ShuffleWriterExec { Some(self.metrics.clone_inner()) } - fn partition_statistics(&self, partition: Option) -> Result { + fn partition_statistics(&self, partition: Option) -> Result> { self.plan.partition_statistics(partition) } } diff --git a/ballista/core/src/execution_plans/sort_shuffle/writer.rs b/ballista/core/src/execution_plans/sort_shuffle/writer.rs index 8d88b7562d..7990d642cd 100644 --- a/ballista/core/src/execution_plans/sort_shuffle/writer.rs +++ b/ballista/core/src/execution_plans/sort_shuffle/writer.rs @@ -21,7 +21,6 @@ //! per input partition, along with an index file mapping partition IDs to //! byte offsets. 
-use std::any::Any; use std::fs::File; use std::future::Future; use std::io::{BufWriter, Seek, Write}; @@ -538,10 +537,6 @@ impl ExecutionPlan for SortShuffleWriterExec { "SortShuffleWriterExec" } - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { self.plan.schema() } @@ -662,7 +657,7 @@ impl ExecutionPlan for SortShuffleWriterExec { Some(self.metrics.clone_inner()) } - fn partition_statistics(&self, partition: Option) -> Result { + fn partition_statistics(&self, partition: Option) -> Result> { self.plan.partition_statistics(partition) } } @@ -1163,7 +1158,8 @@ mod tests { // Reference: DataFusion's BatchPartitioner::new_hash_partitioner let mut ref_partitioner = - BatchPartitioner::new_hash_partitioner(exprs.clone(), 4, Time::default()); + BatchPartitioner::new_hash_partitioner(exprs.clone(), 4, Time::default()) + .unwrap(); let mut ref_assignments = [usize::MAX; 10]; ref_partitioner .partition(batch.clone(), |partition, sub_batch| { diff --git a/ballista/core/src/execution_plans/unresolved_shuffle.rs b/ballista/core/src/execution_plans/unresolved_shuffle.rs index 3f3567b6a9..64bfd8fec9 100644 --- a/ballista/core/src/execution_plans/unresolved_shuffle.rs +++ b/ballista/core/src/execution_plans/unresolved_shuffle.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-use std::any::Any; use std::sync::Arc; use datafusion::arrow::datatypes::SchemaRef; @@ -192,10 +191,6 @@ impl ExecutionPlan for UnresolvedShuffleExec { "UnresolvedShuffleExec" } - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { self.schema.clone() } diff --git a/ballista/core/src/planner.rs b/ballista/core/src/planner.rs index 54e3f605f0..5ceb66f17b 100644 --- a/ballista/core/src/planner.rs +++ b/ballista/core/src/planner.rs @@ -308,12 +308,10 @@ mod test { assert!(matches!(analyze_df.logical_plan(), LogicalPlan::Analyze(_))); let explain = plan - .as_any() .downcast_ref::>() .unwrap(); assert!( explain.children()[0] - .as_any() .downcast_ref::>() .is_some() ); diff --git a/ballista/core/src/registry.rs b/ballista/core/src/registry.rs index 1d45e7b58a..0d291ed00e 100644 --- a/ballista/core/src/registry.rs +++ b/ballista/core/src/registry.rs @@ -21,7 +21,7 @@ use datafusion::functions::all_default_functions; use datafusion::functions_aggregate::all_default_aggregate_functions; use datafusion::functions_window::all_default_window_functions; use datafusion::logical_expr::planner::ExprPlanner; -use datafusion::logical_expr::{AggregateUDF, ScalarUDF, WindowUDF}; +use datafusion::logical_expr::{AggregateUDF, HigherOrderUDF, ScalarUDF, WindowUDF}; use std::collections::{HashMap, HashSet}; use std::sync::Arc; @@ -135,6 +135,19 @@ impl FunctionRegistry for BallistaFunctionRegistry { )) }) } + + fn higher_order_function_names(&self) -> HashSet { + HashSet::new() + } + + fn higher_order_function( + &self, + name: &str, + ) -> datafusion::common::Result> { + Err(DataFusionError::Internal(format!( + "There is no higher order function named \"{name}\" in the TaskContext" + ))) + } } impl From<&SessionState> for BallistaFunctionRegistry { diff --git a/ballista/core/src/serde/mod.rs b/ballista/core/src/serde/mod.rs index b908674337..6c2a6e8c4d 100644 --- a/ballista/core/src/serde/mod.rs +++ b/ballista/core/src/serde/mod.rs @@ -36,6 +36,7 @@ use 
datafusion_proto::physical_plan::from_proto::parse_protobuf_partitioning; use datafusion_proto::physical_plan::to_proto::serialize_partitioning; use datafusion_proto::physical_plan::{ DefaultPhysicalExtensionCodec, DefaultPhysicalProtoConverter, + PhysicalPlanDecodeContext, }; use datafusion_proto::protobuf::proto_error; use datafusion_proto::protobuf::{LogicalPlanNode, PhysicalPlanNode}; @@ -381,15 +382,15 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec { ) })?; let converter = DefaultPhysicalProtoConverter {}; + let decode_ctx = PhysicalPlanDecodeContext::new(ctx, self.default_codec.as_ref()); match ballista_plan { PhysicalPlanType::ShuffleWriter(shuffle_writer) => { let input = inputs[0].clone(); let shuffle_output_partitioning = parse_protobuf_hash_partitioning( shuffle_writer.output_partitioning.as_ref(), - ctx, + &decode_ctx, input.schema().as_ref(), - self.default_codec.as_ref(), &converter, )?; @@ -406,9 +407,8 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec { let shuffle_output_partitioning = parse_protobuf_hash_partitioning( sort_shuffle_writer.output_partitioning.as_ref(), - ctx, + &decode_ctx, input.schema().as_ref(), - self.default_codec.as_ref(), &converter, )?; @@ -460,9 +460,8 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec { .collect::, DataFusionError>>()?; let partitioning = parse_protobuf_partitioning( shuffle_reader.partitioning.as_ref(), - ctx, + &decode_ctx, schema.as_ref(), - self.default_codec.as_ref(), &converter, )?; let partitioning = partitioning @@ -503,9 +502,8 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec { Arc::new(convert_required!(unresolved_shuffle.schema)?); let partitioning = parse_protobuf_partitioning( unresolved_shuffle.partitioning.as_ref(), - ctx, + &decode_ctx, schema.as_ref(), - self.default_codec.as_ref(), &converter, )?; let partitioning = partitioning @@ -540,7 +538,7 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec { node: Arc, 
buf: &mut Vec, ) -> Result<(), DataFusionError> { - if let Some(exec) = node.as_any().downcast_ref::() { + if let Some(exec) = node.downcast_ref::() { // note that we use shuffle_output_partitioning() rather than output_partitioning() // to get the true output partitioning let output_partitioning = match exec.shuffle_output_partitioning() { @@ -579,7 +577,7 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec { })?; Ok(()) - } else if let Some(exec) = node.as_any().downcast_ref::() { + } else if let Some(exec) = node.downcast_ref::() { let output_partitioning = match exec.shuffle_output_partitioning() { Partitioning::Hash(exprs, partition_count) => { Some(datafusion_proto::protobuf::PhysicalHashRepartition { @@ -622,7 +620,7 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec { })?; Ok(()) - } else if let Some(exec) = node.as_any().downcast_ref::() { + } else if let Some(exec) = node.downcast_ref::() { let stage_id = exec.stage_id as u32; let mut partition = vec![]; for location in &exec.partition { @@ -665,7 +663,7 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec { })?; Ok(()) - } else if let Some(exec) = node.as_any().downcast_ref::() { + } else if let Some(exec) = node.downcast_ref::() { let converter = DefaultPhysicalProtoConverter {}; let partitioning = serialize_partitioning( &exec.properties().partitioning, @@ -800,7 +798,6 @@ mod test { let decoded_plan = codec.try_decode(&buf, &[], &ctx).unwrap(); let decoded_exec = decoded_plan - .as_any() .downcast_ref::() .expect("Expected UnresolvedShuffleExec"); @@ -837,7 +834,6 @@ mod test { let decoded_plan = codec.try_decode(&buf, &[], &ctx).unwrap(); let decoded_exec = decoded_plan - .as_any() .downcast_ref::() .expect("Expected ShuffleReaderExec"); @@ -880,7 +876,6 @@ mod test { let ctx = SessionContext::new().task_ctx(); let decoded_plan = codec.try_decode(&buf, &[], &ctx).unwrap(); let decoded_exec = decoded_plan - .as_any() .downcast_ref::() .expect("Expected 
ShuffleReaderExec"); @@ -934,7 +929,6 @@ mod test { let ctx = SessionContext::new().task_ctx(); let decoded_plan = codec.try_decode(&buf, &[], &ctx).unwrap(); let decoded_exec = decoded_plan - .as_any() .downcast_ref::() .expect("Expected ShuffleReaderExec"); @@ -980,7 +974,6 @@ mod test { let ctx = SessionContext::new().task_ctx(); let decoded_plan = codec.try_decode(&buf, &[], &ctx).unwrap(); let decoded_exec = decoded_plan - .as_any() .downcast_ref::() .expect("Expected UnresolvedShuffleExec"); @@ -1031,7 +1024,6 @@ mod test { let ctx = SessionContext::new().task_ctx(); let decoded_plan = codec.try_decode(&buf, &[], &ctx).unwrap(); let decoded_exec = decoded_plan - .as_any() .downcast_ref::() .expect("Expected ShuffleReaderExec"); @@ -1129,7 +1121,6 @@ mod test { let decoded_plan = codec.try_decode(&buf, &[], &ctx).unwrap(); let decoded_exec = decoded_plan - .as_any() .downcast_ref::() .expect("Expected UnresolvedShuffleExec"); @@ -1156,7 +1147,6 @@ mod test { let decoded_plan = codec.try_decode(&buf, &[], &ctx).unwrap(); let decoded_exec = decoded_plan - .as_any() .downcast_ref::() .expect("Expected ShuffleReaderExec"); diff --git a/ballista/core/src/serde/scheduler/from_proto.rs b/ballista/core/src/serde/scheduler/from_proto.rs index b99cb274fb..4b2601cbb5 100644 --- a/ballista/core/src/serde/scheduler/from_proto.rs +++ b/ballista/core/src/serde/scheduler/from_proto.rs @@ -361,6 +361,7 @@ pub fn get_task_definition &dyn Any { - self - } - fn schema(&self) -> SchemaRef { self.plan.schema() } @@ -125,7 +121,7 @@ impl ExecutionPlan for CollectExec { })) } - fn partition_statistics(&self, partition: Option) -> Result { + fn partition_statistics(&self, partition: Option) -> Result> { self.plan.partition_statistics(partition) } } diff --git a/ballista/executor/src/execution_engine.rs b/ballista/executor/src/execution_engine.rs index c57d104227..a54971b5e7 100644 --- a/ballista/executor/src/execution_engine.rs +++ b/ballista/executor/src/execution_engine.rs @@ -114,7 
+114,7 @@ impl ExecutionEngine for DefaultExecutionEngine { ) -> Result> { let plan = plan .transform(|p| { - if let Some(reader) = p.as_any().downcast_ref::() { + if let Some(reader) = p.downcast_ref::() { match &self.client_pool { Some(client_pool) => Ok(Transformed::yes(Arc::new( reader @@ -133,7 +133,7 @@ impl ExecutionEngine for DefaultExecutionEngine { // the query plan created by the scheduler always starts with a shuffle writer // (either ShuffleWriterExec or SortShuffleWriterExec) - if let Some(shuffle_writer) = plan.as_any().downcast_ref::() { + if let Some(shuffle_writer) = plan.downcast_ref::() { // recreate the shuffle writer with the correct working directory let exec = ShuffleWriterExec::try_new( job_id, @@ -146,7 +146,7 @@ impl ExecutionEngine for DefaultExecutionEngine { ShuffleWriterVariant::Hash(exec), ))) } else if let Some(sort_shuffle_writer) = - plan.as_any().downcast_ref::() + plan.downcast_ref::() { // recreate the sort shuffle writer with the correct working directory let exec = SortShuffleWriterExec::try_new( diff --git a/ballista/executor/src/execution_loop.rs b/ballista/executor/src/execution_loop.rs index 6cc8c1192f..1d22b3e5ae 100644 --- a/ballista/executor/src/execution_loop.rs +++ b/ballista/executor/src/execution_loop.rs @@ -40,6 +40,7 @@ use datafusion_proto::physical_plan::AsExecutionPlan; use futures::FutureExt; use log::{debug, error, info, trace, warn}; use std::any::Any; +use std::collections::HashMap; use std::convert::TryInto; use std::error::Error; use std::sync::mpsc::{Receiver, Sender, TryRecvError}; @@ -264,6 +265,7 @@ async fn run_received_task &dyn Any { - self - } - fn schema(&self) -> SchemaRef { Arc::new(Schema::empty()) } diff --git a/ballista/executor/src/executor_server.rs b/ballista/executor/src/executor_server.rs index c87c1322b5..ef257bba55 100644 --- a/ballista/executor/src/executor_server.rs +++ b/ballista/executor/src/executor_server.rs @@ -398,6 +398,7 @@ impl ExecutorServer Result>> { - let transformed = 
- if let Some(hash_join) = plan.as_any().downcast_ref::() { - match hash_join.partition_mode() { - PartitionMode::Auto => try_collect_left( - hash_join, - false, - collect_threshold_byte_size, - collect_threshold_num_rows, - )? + let transformed = if let Some(hash_join) = plan.downcast_ref::() { + match hash_join.partition_mode() { + PartitionMode::Auto => try_collect_left( + hash_join, + false, + collect_threshold_byte_size, + collect_threshold_num_rows, + )? + .map_or_else( + || partitioned_hash_join(hash_join).map(Some), + |v| Ok(Some(v)), + )?, + PartitionMode::CollectLeft => try_collect_left(hash_join, true, 0, 0)? .map_or_else( || partitioned_hash_join(hash_join).map(Some), |v| Ok(Some(v)), )?, - PartitionMode::CollectLeft => try_collect_left(hash_join, true, 0, 0)? - .map_or_else( - || partitioned_hash_join(hash_join).map(Some), - |v| Ok(Some(v)), - )?, - PartitionMode::Partitioned => { - let left = hash_join.left(); - let right = hash_join.right(); - if hash_join.join_type().supports_swap() - && should_swap_join_order(&**left, &**right)? - { - hash_join - .swap_inputs(PartitionMode::Partitioned) - .map(Some)? - } else { - None - } + PartitionMode::Partitioned => { + let left = hash_join.left(); + let right = hash_join.right(); + if hash_join.join_type().supports_swap() + && should_swap_join_order(&**left, &**right)? + { + hash_join + .swap_inputs(PartitionMode::Partitioned) + .map(Some)? + } else { + None } } - } else if let Some(cross_join) = plan.as_any().downcast_ref::() { - let left = cross_join.left(); - let right = cross_join.right(); - if right.properties().output_partitioning().partition_count() > 1 { - None - } else if should_swap_join_order(&**left, &**right)? { - cross_join.swap_inputs().map(Some)? 
- } else { - None - } - } else if let Some(nl_join) = plan.as_any().downcast_ref::() { - let left = nl_join.left(); - let right = nl_join.right(); - // next few lines are different from original datafusion rule - // partition count of right side has to be equal one to be - // able to swap inputs - if right.properties().output_partitioning().partition_count() > 1 { - None - } else if nl_join.join_type().supports_swap() - && should_swap_join_order(&**left, &**right)? - { - nl_join.swap_inputs().map(Some)? - } else { - None - } + } + } else if let Some(cross_join) = plan.downcast_ref::() { + let left = cross_join.left(); + let right = cross_join.right(); + if right.properties().output_partitioning().partition_count() > 1 { + None + } else if should_swap_join_order(&**left, &**right)? { + cross_join.swap_inputs().map(Some)? } else { None - }; + } + } else if let Some(nl_join) = plan.downcast_ref::() { + let left = nl_join.left(); + let right = nl_join.right(); + // next few lines are different from original datafusion rule + // partition count of right side has to be equal one to be + // able to swap inputs + if right.properties().output_partitioning().partition_count() > 1 { + None + } else if nl_join.join_type().supports_swap() + && should_swap_join_order(&**left, &**right)? + { + nl_join.swap_inputs().map(Some)? + } else { + None + } + } else { + None + }; Ok(if let Some(transformed) = transformed { Transformed::yes(transformed) @@ -416,7 +415,7 @@ fn hash_join_convert_symmetric_subrule( config_options: &ConfigOptions, ) -> Result> { // Check if the current plan node is a HashJoinExec. 
- if let Some(hash_join) = input.as_any().downcast_ref::() { + if let Some(hash_join) = input.downcast_ref::() { let left_unbounded = hash_join.left.boundedness().is_unbounded(); let left_incremental = matches!( hash_join.left.pipeline_behavior(), @@ -556,7 +555,7 @@ pub fn hash_join_swap_subrule( mut input: Arc, _config_options: &ConfigOptions, ) -> Result> { - if let Some(hash_join) = input.as_any().downcast_ref::() + if let Some(hash_join) = input.downcast_ref::() && hash_join.left.boundedness().is_unbounded() && !hash_join.right.boundedness().is_unbounded() && matches!( @@ -830,7 +829,7 @@ mod test { // `swap_inputs` for Inner wraps the join in a ProjectionExec to // restore the output column order. Walk the tree to find the join. fn find_hash_join(plan: &Arc) -> Option<&HashJoinExec> { - if let Some(hj) = plan.as_any().downcast_ref::() { + if let Some(hj) = plan.downcast_ref::() { return Some(hj); } for child in plan.children() { diff --git a/ballista/scheduler/src/planner.rs b/ballista/scheduler/src/planner.rs index 04405d9fd2..cdf81e1bf6 100644 --- a/ballista/scheduler/src/planner.rs +++ b/ballista/scheduler/src/planner.rs @@ -139,15 +139,13 @@ impl DefaultDistributedPlanner { // Broadcast-join lowering: HashJoinExec(CollectLeft) gets its own // controlled recursion so the build side is written as a broadcast stage. - if let Some(hash_join) = execution_plan.as_any().downcast_ref::() + if let Some(hash_join) = execution_plan.downcast_ref::() && *hash_join.partition_mode() == PartitionMode::CollectLeft { // Build subtree: peel CoalescePartitionsExec if present, then // recurse to lower its internal stages. 
let mut build = hash_join.left().clone(); - if let Some(coalesce) = - build.as_any().downcast_ref::() - { + if let Some(coalesce) = build.downcast_ref::() { build = coalesce.children()[0].clone(); } let (build, mut stages) = @@ -191,10 +189,7 @@ impl DefaultDistributedPlanner { stages.append(&mut child_stages); } - if let Some(_coalesce) = execution_plan - .as_any() - .downcast_ref::() - { + if let Some(_coalesce) = execution_plan.downcast_ref::() { let input = children[0].clone(); let input = self.optimizer_enforce_sorting.optimize(input, config)?; let shuffle_writer = create_shuffle_writer_with_config( @@ -211,10 +206,9 @@ impl DefaultDistributedPlanner { with_new_children_if_necessary(execution_plan, vec![unresolved_shuffle])?, stages, )) - } else if let Some(_sort_preserving_merge) = execution_plan - .as_any() - .downcast_ref::( - ) { + } else if let Some(_sort_preserving_merge) = + execution_plan.downcast_ref::() + { let shuffle_writer = create_shuffle_writer_with_config( job_id, self.next_stage_id(), @@ -228,9 +222,7 @@ impl DefaultDistributedPlanner { with_new_children_if_necessary(execution_plan, vec![unresolved_shuffle])?, stages, )) - } else if let Some(repart) = - execution_plan.as_any().downcast_ref::() - { + } else if let Some(repart) = execution_plan.downcast_ref::() { match repart.properties().output_partitioning() { Partitioning::Hash(_, _) => { let input = children[0].clone(); @@ -288,7 +280,7 @@ impl DefaultDistributedPlanner { debug!("broadcast check: threshold is 0, broadcast disabled"); return Ok(plan); } - let Some(hash_join) = plan.as_any().downcast_ref::() else { + let Some(hash_join) = plan.downcast_ref::() else { return Ok(plan); }; debug!( @@ -391,7 +383,6 @@ impl DefaultDistributedPlanner { }; let promoted_join = promoted - .as_any() .downcast_ref::() .expect("promoted plan must still be a HashJoinExec"); let new_left: Arc = if promoted_join @@ -429,9 +420,7 @@ fn create_unresolved_shuffle( pub fn find_unresolved_shuffles( plan: &Arc, ) 
-> Result> { - if let Some(unresolved_shuffle) = - plan.as_any().downcast_ref::() - { + if let Some(unresolved_shuffle) = plan.downcast_ref::() { Ok(vec![unresolved_shuffle.clone()]) } else { Ok(plan @@ -454,9 +443,7 @@ pub fn remove_unresolved_shuffles( ) -> Result> { let mut new_children: Vec> = vec![]; for child in stage.children() { - if let Some(unresolved_shuffle) = - child.as_any().downcast_ref::() - { + if let Some(unresolved_shuffle) = child.downcast_ref::() { let p = partition_locations .get(&unresolved_shuffle.stage_id) .ok_or_else(|| { @@ -517,7 +504,7 @@ pub fn rollback_resolved_shuffles( ) -> Result> { let mut new_children: Vec> = vec![]; for child in stage.children() { - if let Some(shuffle_reader) = child.as_any().downcast_ref::() { + if let Some(shuffle_reader) = child.downcast_ref::() { let stage_id = shuffle_reader.stage_id; let unresolved = if shuffle_reader.broadcast { Arc::new(UnresolvedShuffleExec::new_broadcast( @@ -614,7 +601,7 @@ mod test { macro_rules! downcast_exec { ($exec: expr, $ty: ty) => { - $exec.as_any().downcast_ref::<$ty>().expect(&format!( + $exec.downcast_ref::<$ty>().expect(&format!( "Downcast to {} failed. 
Got {:?}", stringify!($ty), $exec @@ -915,11 +902,10 @@ order by let mut walker: Vec> = vec![stage.clone() as Arc]; while let Some(node) = walker.pop() { - if let Some(hj) = node.as_any().downcast_ref::() { + if let Some(hj) = node.downcast_ref::() { assert_eq!(*hj.partition_mode(), PartitionMode::CollectLeft); let left = hj.children()[0].clone(); let unresolved = left - .as_any() .downcast_ref::() .expect("left input should be UnresolvedShuffleExec"); assert!(unresolved.broadcast, "left input should be broadcast"); @@ -959,15 +945,13 @@ order by let mut walker: Vec> = vec![stage.clone() as Arc]; while let Some(node) = walker.pop() { - if let Some(unresolved) = - node.as_any().downcast_ref::() - { + if let Some(unresolved) = node.downcast_ref::() { assert!( !unresolved.broadcast, "no broadcast reader expected with threshold=0" ); } - if let Some(hj) = node.as_any().downcast_ref::() { + if let Some(hj) = node.downcast_ref::() { assert_ne!( *hj.partition_mode(), PartitionMode::CollectLeft, @@ -1009,8 +993,7 @@ order by let mut walker: Vec> = vec![stage.clone() as Arc]; while let Some(node) = walker.pop() { - if let Some(unresolved) = - node.as_any().downcast_ref::() + if let Some(unresolved) = node.downcast_ref::() && unresolved.broadcast { max_upstream = max_upstream.max(unresolved.upstream_partition_count); @@ -1080,7 +1063,6 @@ order by let resolved_child = resolved.children()[0].clone(); let reader = resolved_child - .as_any() .downcast_ref::() .expect("expected resolved ShuffleReaderExec"); assert!(reader.broadcast); @@ -1110,7 +1092,6 @@ order by let rolled_back = crate::planner::rollback_resolved_shuffles(parent)?; let child = rolled_back.children()[0].clone(); let unresolved = child - .as_any() .downcast_ref::() .expect("expected rolled-back UnresolvedShuffleExec"); assert!(unresolved.broadcast); @@ -1180,13 +1161,13 @@ order by assert_eq!(3, stages.len()); // stage0 - let stage0 = stages[0].clone(); + let stage0 = stages[0].as_ref() as &dyn ExecutionPlan; 
let shuffle_write = downcast_exec!(stage0, SortShuffleWriterExec); let partitioning = shuffle_write.shuffle_output_partitioning(); assert_eq!(2, partitioning.partition_count()); let partition_col = match partitioning { Partitioning::Hash(exprs, 2) => match exprs.as_slice() { - [col] => col.as_any().downcast_ref::(), + [col] => col.downcast_ref::(), _ => None, }, _ => None, @@ -1200,7 +1181,7 @@ order by let window = downcast_exec!(filter.children()[0], BoundedWindowAggExec); let partition_by = window.partition_keys(); let partition_by = match partition_by[..] { - [ref col] => col.as_any().downcast_ref::(), + [ref col] => col.downcast_ref::(), _ => None, }; assert_eq!(Some(&Column::new("l_shipmode", 1)), partition_by); @@ -1217,7 +1198,7 @@ order by ); assert_eq!( Some(&Column::new("l_shipmode", 1)), - expr1.expr.as_any().downcast_ref() + expr1.expr.downcast_ref() ); assert_eq!( SortOptions { @@ -1228,7 +1209,7 @@ order by ); assert_eq!( Some(&Column::new("l_shipdate", 0)), - expr2.expr.as_any().downcast_ref() + expr2.expr.downcast_ref() ); } _ => panic!("invalid sort {sort:?}"), diff --git a/ballista/scheduler/src/state/aqe/adapter.rs b/ballista/scheduler/src/state/aqe/adapter.rs index dae2a2944e..02edef0079 100644 --- a/ballista/scheduler/src/state/aqe/adapter.rs +++ b/ballista/scheduler/src/state/aqe/adapter.rs @@ -44,7 +44,7 @@ impl BallistaAdapter { &mut self, plan: Arc, ) -> datafusion::error::Result>> { - if let Some(exchange) = plan.as_any().downcast_ref::() { + if let Some(exchange) = plan.downcast_ref::() { let schema = exchange.schema().clone(); let partitions = exchange.shuffle_partitions().ok_or_else(|| { DataFusionError::Execution( @@ -111,7 +111,7 @@ impl BallistaAdapter { job_id: &str, config: &ConfigOptions, ) -> datafusion::error::Result { - if let Some(root) = plan.as_any().downcast_ref::() { + if let Some(root) = plan.downcast_ref::() { let mut adapter = BallistaAdapter::default(); let plan = root .input() @@ -138,8 +138,7 @@ impl BallistaAdapter 
{ plan: writer, inputs: adapter.inputs, }) - } else if let Some(root) = plan.as_any().downcast_ref::() - { + } else if let Some(root) = plan.downcast_ref::() { let mut adapter = BallistaAdapter::default(); let plan = root .input() diff --git a/ballista/scheduler/src/state/aqe/execution_plan/adaptive.rs b/ballista/scheduler/src/state/aqe/execution_plan/adaptive.rs index 1a1c3962c4..9bc7451ae7 100644 --- a/ballista/scheduler/src/state/aqe/execution_plan/adaptive.rs +++ b/ballista/scheduler/src/state/aqe/execution_plan/adaptive.rs @@ -22,7 +22,6 @@ use datafusion::{ physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}, }; use parking_lot::Mutex; -use std::any::Any; use std::fmt::Formatter; use std::sync::atomic::AtomicBool; use std::sync::{Arc, atomic::AtomicI64}; @@ -126,10 +125,6 @@ impl ExecutionPlan for AdaptiveDatafusionExec { "AdaptiveDatafusionExec" } - fn as_any(&self) -> &dyn Any { - self - } - fn properties(&self) -> &Arc { self.input.properties() } diff --git a/ballista/scheduler/src/state/aqe/execution_plan/exchange.rs b/ballista/scheduler/src/state/aqe/execution_plan/exchange.rs index 7577878cd9..41c25b25f4 100644 --- a/ballista/scheduler/src/state/aqe/execution_plan/exchange.rs +++ b/ballista/scheduler/src/state/aqe/execution_plan/exchange.rs @@ -29,7 +29,6 @@ use datafusion::{ }; use log::trace; use parking_lot::Mutex; -use std::any::Any; use std::ops::Deref; use std::sync::{Arc, atomic::AtomicI64}; @@ -267,10 +266,6 @@ impl ExecutionPlan for ExchangeExec { "ExchangeExec" } - fn as_any(&self) -> &dyn Any { - self - } - fn properties(&self) -> &Arc { &self.properties } @@ -321,7 +316,7 @@ impl ExecutionPlan for ExchangeExec { )) } - fn partition_statistics(&self, partition: Option) -> Result { + fn partition_statistics(&self, partition: Option) -> Result> { let schema = self.input.schema(); match self.shuffle_partitions.lock().deref() { // @@ -346,7 +341,7 @@ impl ExecutionPlan for ExchangeExec { "shuffle reader at stage: {:?} and 
partition {} returned statistics: {:?}", self.stage_id, idx, stat_for_partition ); - stat_for_partition + stat_for_partition.map(Arc::new) } else { let stats_for_partitions = stats_for_partitions( schema.fields().len(), @@ -359,10 +354,10 @@ impl ExecutionPlan for ExchangeExec { "shuffle reader at stage: {:?} returned statistics for all partitions: {:?}", self.stage_id, stats_for_partitions ); - Ok(stats_for_partitions) + Ok(Arc::new(stats_for_partitions)) } } - None => Ok(Statistics::new_unknown(&schema)), + None => Ok(Arc::new(Statistics::new_unknown(&schema))), } } } diff --git a/ballista/scheduler/src/state/aqe/optimizer_rule/coalesce_partitions.rs b/ballista/scheduler/src/state/aqe/optimizer_rule/coalesce_partitions.rs index 6efd6b332a..b4b25346bb 100644 --- a/ballista/scheduler/src/state/aqe/optimizer_rule/coalesce_partitions.rs +++ b/ballista/scheduler/src/state/aqe/optimizer_rule/coalesce_partitions.rs @@ -155,7 +155,7 @@ impl PhysicalOptimizerRule for CoalescePartitionsRule { ); // Get the subtree below the root. Two root kinds, same outcome. - let input = if let Some(ex) = plan.as_any().downcast_ref::() { + let input = if let Some(ex) = plan.downcast_ref::() { debug!( "[coalesce-rule] root=ExchangeExec plan_id={} stage_id={:?} stage_resolved={}", ex.plan_id, @@ -163,7 +163,7 @@ impl PhysicalOptimizerRule for CoalescePartitionsRule { ex.shuffle_partitions().is_some(), ); ex.input().clone() - } else if let Some(adp) = plan.as_any().downcast_ref::() { + } else if let Some(adp) = plan.downcast_ref::() { debug!( "[coalesce-rule] root=AdaptiveDatafusionExec stage_id={:?}", adp.stage_id(), @@ -182,7 +182,7 @@ impl PhysicalOptimizerRule for CoalescePartitionsRule { // *this* stage's group, they belong to whatever stage wrote them. 
let mut leaves: Vec> = Vec::new(); input.apply(|node| { - if node.as_any().is::() { + if node.is::() { leaves.push(node.clone()); Ok(TreeNodeRecursion::Jump) } else { @@ -192,8 +192,7 @@ impl PhysicalOptimizerRule for CoalescePartitionsRule { // Helper: downcast each Arc back to &ExchangeExec. fn as_exchange(arc: &Arc) -> &ExchangeExec { - arc.as_any() - .downcast_ref::() + arc.downcast_ref::() .expect("filtered to ExchangeExec above") } diff --git a/ballista/scheduler/src/state/aqe/optimizer_rule/distributed_exchange.rs b/ballista/scheduler/src/state/aqe/optimizer_rule/distributed_exchange.rs index 739cf460d8..f7e6a6812a 100644 --- a/ballista/scheduler/src/state/aqe/optimizer_rule/distributed_exchange.rs +++ b/ballista/scheduler/src/state/aqe/optimizer_rule/distributed_exchange.rs @@ -41,12 +41,9 @@ impl DistributedExchangeRule { &self, execution_plan: Arc, ) -> datafusion::error::Result>> { - if let Some(coalesce) = execution_plan - .as_any() - .downcast_ref::() - { + if let Some(coalesce) = execution_plan.downcast_ref::() { let input = coalesce.input(); - if input.as_any().downcast_ref::().is_none() + if input.downcast_ref::().is_none() && !matches!(nearest_exchange_status(input), ExchangeStatus::Unresolved) { let exchange_exec = ExchangeExec::new( @@ -59,12 +56,11 @@ impl DistributedExchangeRule { execution_plan.with_new_children(vec![Arc::new(exchange_exec)])?, )); } - } else if let Some(sort_preserving_merge) = execution_plan - .as_any() - .downcast_ref::( - ) { + } else if let Some(sort_preserving_merge) = + execution_plan.downcast_ref::() + { let input = sort_preserving_merge.input(); - if input.as_any().downcast_ref::().is_none() + if input.downcast_ref::().is_none() && !matches!(nearest_exchange_status(input), ExchangeStatus::Unresolved) { let exchange_exec = ExchangeExec::new( @@ -77,8 +73,7 @@ impl DistributedExchangeRule { execution_plan.with_new_children(vec![Arc::new(exchange_exec)])?, )); } - } else if let Some(repartition) = - 
execution_plan.as_any().downcast_ref::() + } else if let Some(repartition) = execution_plan.downcast_ref::() && let execution_plan::Partitioning::Hash(_, _) = repartition.partitioning() { let input = repartition.input(); @@ -108,7 +103,6 @@ impl PhysicalOptimizerRule for DistributedExchangeRule { if result .data - .as_any() .downcast_ref::() .is_some() { @@ -139,7 +133,7 @@ impl PhysicalOptimizerRule for DistributedExchangeRule { /// (short-circuits), `Resolved` if every branch that has an exchange has a resolved /// one, and `None` if no exchange is found anywhere. fn nearest_exchange_status(plan: &Arc) -> ExchangeStatus { - if let Some(exchange) = plan.as_any().downcast_ref::() { + if let Some(exchange) = plan.downcast_ref::() { if exchange.shuffle_created() && !exchange.inactive_stage { ExchangeStatus::Resolved } else { @@ -237,25 +231,18 @@ mod tests { let result = rule.optimize(coalesce, &config()).unwrap(); - let adaptive = result - .as_any() - .downcast_ref::() - .unwrap(); + let adaptive = result.downcast_ref::().unwrap(); let coalesce_out = adaptive .input() - .as_any() .downcast_ref::() .unwrap(); let child = coalesce_out.children()[0]; assert!( - child.as_any().downcast_ref::().is_some(), + child.downcast_ref::().is_some(), "direct child should remain ExchangeExec" ); assert!( - child.children()[0] - .as_any() - .downcast_ref::() - .is_none(), + child.children()[0].downcast_ref::().is_none(), "ExchangeExec should not wrap another ExchangeExec" ); } @@ -273,18 +260,13 @@ mod tests { let result = rule.optimize(outer, &config()).unwrap(); - let adaptive = result - .as_any() - .downcast_ref::() - .unwrap(); + let adaptive = result.downcast_ref::().unwrap(); let outer_coalesce = adaptive .input() - .as_any() .downcast_ref::() .unwrap(); assert!( outer_coalesce.children()[0] - .as_any() .downcast_ref::() .is_none(), "should not inject ExchangeExec when unresolved exchange is in subtree" @@ -303,18 +285,13 @@ mod tests { let result = rule.optimize(outer, 
&config()).unwrap(); - let adaptive = result - .as_any() - .downcast_ref::() - .unwrap(); + let adaptive = result.downcast_ref::().unwrap(); let outer_coalesce = adaptive .input() - .as_any() .downcast_ref::() .unwrap(); assert!( outer_coalesce.children()[0] - .as_any() .downcast_ref::() .is_some(), "should inject ExchangeExec when subtree only has resolved exchanges" @@ -330,20 +307,13 @@ mod tests { let result = rule.optimize(input, &config()).unwrap(); - let adaptive = result - .as_any() - .downcast_ref::() - .unwrap(); + let adaptive = result.downcast_ref::().unwrap(); let spm = adaptive .input() - .as_any() .downcast_ref::() .expect("child should be SortPreservingMergeExec"); assert!( - spm.children()[0] - .as_any() - .downcast_ref::() - .is_some(), + spm.children()[0].downcast_ref::().is_some(), "SortPreservingMergeExec should have ExchangeExec injected as its child" ); } @@ -355,23 +325,14 @@ mod tests { let result = rule.optimize(input, &config()).unwrap(); - let adaptive = result - .as_any() - .downcast_ref::() - .unwrap(); + let adaptive = result.downcast_ref::().unwrap(); let spm = adaptive .input() - .as_any() .downcast_ref::() .unwrap(); let child = spm.children()[0]; - assert!(child.as_any().downcast_ref::().is_some()); - assert!( - child.children()[0] - .as_any() - .downcast_ref::() - .is_none() - ); + assert!(child.downcast_ref::().is_some()); + assert!(child.children()[0].downcast_ref::().is_none()); } #[test] @@ -384,20 +345,13 @@ mod tests { let result = rule.optimize(input, &config()).unwrap(); - let adaptive = result - .as_any() - .downcast_ref::() - .unwrap(); + let adaptive = result.downcast_ref::().unwrap(); let spm = adaptive .input() - .as_any() .downcast_ref::() .unwrap(); assert!( - spm.children()[0] - .as_any() - .downcast_ref::() - .is_none(), + spm.children()[0].downcast_ref::().is_none(), "should not inject ExchangeExec when unresolved exchange is in subtree" ); } @@ -415,13 +369,9 @@ mod tests { let result = 
rule.optimize(repartition, &config()).unwrap(); - let adaptive = result - .as_any() - .downcast_ref::() - .unwrap(); + let adaptive = result.downcast_ref::().unwrap(); let exchange = adaptive .input() - .as_any() .downcast_ref::() .expect("Hash RepartitionExec should be replaced with ExchangeExec"); assert!( @@ -440,16 +390,9 @@ mod tests { let result = rule.optimize(repartition, &config()).unwrap(); - let adaptive = result - .as_any() - .downcast_ref::() - .unwrap(); + let adaptive = result.downcast_ref::().unwrap(); assert!( - adaptive - .input() - .as_any() - .downcast_ref::() - .is_some(), + adaptive.input().downcast_ref::().is_some(), "RoundRobin repartition should be kept as-is (not replaced)" ); } @@ -468,16 +411,9 @@ mod tests { let result = rule.optimize(repartition, &config()).unwrap(); - let adaptive = result - .as_any() - .downcast_ref::() - .unwrap(); + let adaptive = result.downcast_ref::().unwrap(); assert!( - adaptive - .input() - .as_any() - .downcast_ref::() - .is_some(), + adaptive.input().downcast_ref::().is_some(), "Hash repartition should be kept when input has an unresolved exchange" ); } @@ -489,10 +425,7 @@ mod tests { let rule = DistributedExchangeRule::default(); let result = rule.optimize(leaf_exec(), &config()).unwrap(); assert!( - result - .as_any() - .downcast_ref::() - .is_some(), + result.downcast_ref::().is_some(), "optimize should always wrap the result in AdaptiveDatafusionExec" ); } @@ -506,13 +439,11 @@ mod tests { let result = rule.optimize(adaptive, &config()).unwrap(); let outer = result - .as_any() .downcast_ref::() .expect("result should be AdaptiveDatafusionExec"); assert!( outer .input() - .as_any() .downcast_ref::() .is_none(), "existing AdaptiveDatafusionExec should not be wrapped in another one" @@ -605,15 +536,12 @@ mod tests { ) .unwrap(); let exchange1 = result1 - .as_any() .downcast_ref::() .unwrap() .input() - .as_any() .downcast_ref::() .unwrap() .children()[0] - .as_any() .downcast_ref::() .unwrap(); 
assert_eq!( @@ -628,15 +556,12 @@ mod tests { ) .unwrap(); let exchange2 = result2 - .as_any() .downcast_ref::() .unwrap() .input() - .as_any() .downcast_ref::() .unwrap() .children()[0] - .as_any() .downcast_ref::() .unwrap(); assert_eq!( diff --git a/ballista/scheduler/src/state/aqe/optimizer_rule/propagate_empty.rs b/ballista/scheduler/src/state/aqe/optimizer_rule/propagate_empty.rs index e888976673..de5b9e5c50 100644 --- a/ballista/scheduler/src/state/aqe/optimizer_rule/propagate_empty.rs +++ b/ballista/scheduler/src/state/aqe/optimizer_rule/propagate_empty.rs @@ -34,7 +34,7 @@ use std::sync::Arc; macro_rules! is_empty_exec { ($e:expr) => { - $e.as_any().downcast_ref::().is_some() + $e.downcast_ref::().is_some() }; } @@ -57,41 +57,41 @@ impl PropagateEmptyExecRule { fn transform( plan: Arc, ) -> datafusion::error::Result>> { - if let Some(filter) = plan.as_any().downcast_ref::() + if let Some(filter) = plan.downcast_ref::() && is_empty_exec!(filter.input()) { Ok(Transformed::yes(filter.input().clone())) - } else if let Some(coalesce) = plan.as_any().downcast_ref::() + } else if let Some(coalesce) = plan.downcast_ref::() && is_empty_exec!(coalesce.input()) { Ok(Transformed::yes(coalesce.input().clone())) - } else if let Some(exchange) = plan.as_any().downcast_ref::() + } else if let Some(exchange) = plan.downcast_ref::() && is_empty_exec!(exchange.input()) { Ok(Transformed::yes(exchange.input().clone())) - } else if let Some(projection) = plan.as_any().downcast_ref::() + } else if let Some(projection) = plan.downcast_ref::() && is_empty_exec!(projection.input()) { empty_exec!(projection) - } else if let Some(limit) = plan.as_any().downcast_ref::() + } else if let Some(limit) = plan.downcast_ref::() && is_empty_exec!(limit.input()) { Ok(Transformed::yes(limit.input().clone())) - } else if let Some(limit) = plan.as_any().downcast_ref::() + } else if let Some(limit) = plan.downcast_ref::() && is_empty_exec!(limit.input()) { 
Ok(Transformed::yes(limit.input().clone())) - } else if let Some(aggregation) = plan.as_any().downcast_ref::() + } else if let Some(aggregation) = plan.downcast_ref::() && is_empty_exec!(aggregation.input()) { empty_exec!(aggregation) - } else if let Some(hash_join) = plan.as_any().downcast_ref::() + } else if let Some(hash_join) = plan.downcast_ref::() // TODO: - we need other joins, this one is used for testing cancellation && hash_join.join_type == Inner && (is_empty_exec!(hash_join.left) || is_empty_exec!(hash_join.right)) { empty_exec!(hash_join) - } else if let Some(exchange) = plan.as_any().downcast_ref::() { + } else if let Some(exchange) = plan.downcast_ref::() { let stats = exchange.partition_statistics(None)?; match stats.num_rows { Precision::Exact(0) => empty_exec!(plan), diff --git a/ballista/scheduler/src/state/aqe/planner.rs b/ballista/scheduler/src/state/aqe/planner.rs index 03361dc3c5..0107a0cc6b 100644 --- a/ballista/scheduler/src/state/aqe/planner.rs +++ b/ballista/scheduler/src/state/aqe/planner.rs @@ -166,8 +166,8 @@ impl AdaptivePlanner { .as_ref() .map(|stage| { ( - stage.as_any().downcast_ref::(), - stage.as_any().downcast_ref::(), + stage.downcast_ref::(), + stage.downcast_ref::(), ) }) { Some((Some(stage), None)) => { @@ -338,7 +338,7 @@ impl AdaptivePlanner { if !runnable_stages.is_empty() { let mut runnable = Vec::new(); for exec in runnable_stages.into_iter() { - match exec.as_any().downcast_ref::() { + match exec.downcast_ref::() { Some(exchange) if exchange.inactive_stage => continue, Some(exchange) if exchange.stage_id().is_none() => { exchange.set_stage_id(self.stage_id_generator); @@ -361,9 +361,7 @@ impl AdaptivePlanner { } Ok(Some(runnable)) - } else if let Some(root) = - self.plan.as_any().downcast_ref::() - { + } else if let Some(root) = self.plan.downcast_ref::() { // shuffle writer has finished // there is no more runnable stages if root.shuffle_created() { @@ -405,8 +403,7 @@ impl AdaptivePlanner { runnable_stages 
.into_iter() .map(|exec| { - exec.as_any() - .downcast_ref::() + exec.downcast_ref::() .ok_or_else(|| { datafusion::common::DataFusionError::Plan( "ExchangeExec expected".into(), @@ -473,7 +470,7 @@ impl AdaptivePlanner { node: &Arc, runnable_stages: &mut Vec>, ) -> bool { - if let Some(exchange) = node.as_any().downcast_ref::() + if let Some(exchange) = node.downcast_ref::() && exchange.shuffle_created() { // we found exchange which has partitions resolved or this stage is not @@ -481,7 +478,7 @@ impl AdaptivePlanner { // all runnable children has been run false - } else if let Some(exchange) = node.as_any().downcast_ref::() + } else if let Some(exchange) = node.downcast_ref::() && !exchange.shuffle_created() { // we found exchange which has not been resolved (run) diff --git a/ballista/scheduler/src/state/aqe/test/alter_stages.rs b/ballista/scheduler/src/state/aqe/test/alter_stages.rs index 419a780696..1a51c91845 100644 --- a/ballista/scheduler/src/state/aqe/test/alter_stages.rs +++ b/ballista/scheduler/src/state/aqe/test/alter_stages.rs @@ -37,7 +37,6 @@ use datafusion::physical_plan::test::exec::StatisticsExec; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, PhysicalExpr, PlanProperties, }; -use std::any::Any; use std::collections::HashSet; use std::fmt::Formatter; use std::sync::Arc; @@ -649,10 +648,6 @@ impl ExecutionPlan for MockPartitionedScan { "MockPartitionedScan" } - fn as_any(&self) -> &dyn Any { - self - } - fn properties(&self) -> &Arc { &self.plan_properties } @@ -679,7 +674,7 @@ impl ExecutionPlan for MockPartitionedScan { fn partition_statistics( &self, _partition: Option, - ) -> datafusion::common::Result { - Ok(self.statistics.clone()) + ) -> datafusion::common::Result> { + Ok(Arc::new(self.statistics.clone())) } } diff --git a/ballista/scheduler/src/state/aqe/test/plan_to_stages.rs b/ballista/scheduler/src/state/aqe/test/plan_to_stages.rs index 9e1313d3cf..a9dc152738 100644 --- 
a/ballista/scheduler/src/state/aqe/test/plan_to_stages.rs +++ b/ballista/scheduler/src/state/aqe/test/plan_to_stages.rs @@ -25,8 +25,8 @@ use crate::state::aqe::test::{ use ballista_core::execution_plans::SortShuffleWriterExec; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::common::ColumnStatistics; -use datafusion::physical_plan::Statistics; use datafusion::physical_plan::test::exec::StatisticsExec; +use datafusion::physical_plan::{ExecutionPlan, Statistics}; use std::collections::HashSet; use std::sync::Arc; @@ -451,7 +451,7 @@ async fn should_use_sort_shuffle_when_enabled() -> datafusion::error::Result<()> let plan = stages.first().unwrap().plan.as_ref(); assert!( - plan.as_any() + (plan as &dyn ExecutionPlan) .downcast_ref::() .is_some(), "expected SortShuffleWriterExec when sort shuffle is enabled, got plan: {plan:?}" @@ -478,7 +478,7 @@ async fn should_use_sort_shuffle_by_default() -> datafusion::error::Result<()> { let plan = stages.first().unwrap().plan.as_ref(); assert!( - plan.as_any() + (plan as &dyn ExecutionPlan) .downcast_ref::() .is_some(), "expected SortShuffleWriterExec by default, got plan: {plan:?}" diff --git a/ballista/scheduler/src/state/distributed_explain.rs b/ballista/scheduler/src/state/distributed_explain.rs index fc31580329..b42a930889 100644 --- a/ballista/scheduler/src/state/distributed_explain.rs +++ b/ballista/scheduler/src/state/distributed_explain.rs @@ -194,7 +194,7 @@ pub(crate) async fn handle_explain_plan( plan: Arc, ) -> ballista_core::error::Result> { if let LogicalPlan::Explain(explain_plan) = &logical_plan - && let Some(explain) = plan.as_any().downcast_ref::() + && let Some(explain) = plan.downcast_ref::() { let inner_plan = explain_plan.plan.clone(); let plans = explain.stringified_plans(); diff --git a/ballista/scheduler/src/state/execution_graph.rs b/ballista/scheduler/src/state/execution_graph.rs index 8f0c30cfa5..4101c3debf 100644 --- a/ballista/scheduler/src/state/execution_graph.rs +++ 
b/ballista/scheduler/src/state/execution_graph.rs @@ -1625,14 +1625,12 @@ impl ExecutionPlanVisitor for ExecutionStageBuilder { plan: &dyn ExecutionPlan, ) -> std::result::Result { // Handle both ShuffleWriterExec and SortShuffleWriterExec - if let Some(shuffle_write) = plan.as_any().downcast_ref::() { + if let Some(shuffle_write) = plan.downcast_ref::() { self.current_stage_id = shuffle_write.stage_id(); - } else if let Some(shuffle_write) = - plan.as_any().downcast_ref::() - { + } else if let Some(shuffle_write) = plan.downcast_ref::() { self.current_stage_id = shuffle_write.stage_id(); } else if let Some(unresolved_shuffle) = - plan.as_any().downcast_ref::() + plan.downcast_ref::() { if let Some(output_links) = self.output_links.get_mut(&unresolved_shuffle.stage_id) @@ -1702,18 +1700,14 @@ impl TaskDescription { /// Returns the number of output partitions this task will produce. pub fn get_output_partition_number(&self) -> usize { // Try ShuffleWriterExec first - if let Some(shuffle_writer) = - self.plan.as_any().downcast_ref::() - { + if let Some(shuffle_writer) = self.plan.downcast_ref::() { return shuffle_writer .shuffle_output_partitioning() .map(|partitioning| partitioning.partition_count()) .unwrap_or(1); } // Try SortShuffleWriterExec - if let Some(shuffle_writer) = - self.plan.as_any().downcast_ref::() - { + if let Some(shuffle_writer) = self.plan.downcast_ref::() { return shuffle_writer .shuffle_output_partitioning() .partition_count(); diff --git a/ballista/scheduler/src/state/execution_graph_dot.rs b/ballista/scheduler/src/state/execution_graph_dot.rs index e1a24c4da5..13095b9ac0 100644 --- a/ballista/scheduler/src/state/execution_graph_dot.rs +++ b/ballista/scheduler/src/state/execution_graph_dot.rs @@ -148,7 +148,7 @@ fn write_plan_recursive( let node_name = format!("{prefix}_{i}"); let display_name = get_operator_name(plan); - if let Some(reader) = plan.as_any().downcast_ref::() { + if let Some(reader) = plan.downcast_ref::() { for part in 
&reader.partition { for loc in part { state @@ -156,7 +156,7 @@ fn write_plan_recursive( .insert(node_name.clone(), loc.partition_id.stage_id); } } - } else if let Some(reader) = plan.as_any().downcast_ref::() { + } else if let Some(reader) = plan.downcast_ref::() { state.readers.insert(node_name.clone(), reader.stage_id); } @@ -230,9 +230,9 @@ fn sanitize(str: &str, max_len: Option) -> String { } #[allow(deprecated)] fn get_operator_name(plan: &dyn ExecutionPlan) -> String { - if let Some(exec) = plan.as_any().downcast_ref::() { + if let Some(exec) = plan.downcast_ref::() { format!("Filter: {}", exec.predicate()) - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { let expr = exec .expr() .iter() @@ -241,7 +241,7 @@ fn get_operator_name(plan: &dyn ExecutionPlan) -> String { .collect::>() .join(", "); format!("Projection: {}", sanitize_dot_label(&expr)) - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { let sort_expr = exec .expr() .iter() @@ -257,7 +257,7 @@ fn get_operator_name(plan: &dyn ExecutionPlan) -> String { .collect::>() .join(", "); format!("Sort: {}", sanitize_dot_label(&sort_expr)) - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { let group_exprs_with_alias = exec.group_expr().expr(); let group_expr = group_exprs_with_alias .iter() @@ -277,19 +277,19 @@ aggr=[{}]", sanitize_dot_label(&group_expr), sanitize_dot_label(&aggr_expr) ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { format!("CoalesceBatches [batchSize={}]", exec.target_batch_size()) - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { format!( "CoalescePartitions [{}]", format_partitioning(exec.properties().output_partitioning().clone()) ) - } else if let Some(exec) = 
plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { format!( "RepartitionExec [{}]", format_partitioning(exec.properties().output_partitioning().clone()) ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { let join_expr = exec .on() .iter() @@ -308,49 +308,46 @@ filter_expr={}", sanitize_dot_label(&join_expr), sanitize_dot_label(&filter_expr) ) - } else if plan.as_any().downcast_ref::().is_some() { + } else if plan.downcast_ref::().is_some() { "CrossJoin".to_string() - } else if plan.as_any().downcast_ref::().is_some() { + } else if plan.downcast_ref::().is_some() { "Union".to_string() - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { format!("UnresolvedShuffleExec [stage_id={}]", exec.stage_id) - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { format!("ShuffleReader [{} partitions]", exec.partition.len()) - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { format!( "ShuffleWriter [{} partitions]", exec.input_partition_count() ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { format!( "SortShuffleWriter [{} partitions]", exec.input_partition_count() ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { - let config = if let Some(config) = - exec.data_source().as_any().downcast_ref::() - { - get_file_scan(config) - } else if let Some(_config) = exec - .data_source() - .as_any() - .downcast_ref::() - { - "Memory".to_string() - } else { - "Unknown".to_string() - }; + } else if let Some(exec) = plan.downcast_ref::() { + let config = + if let Some(config) = exec.data_source().downcast_ref::() { + get_file_scan(config) + } else if let Some(_config) = + exec.data_source().downcast_ref::() + { + 
"Memory".to_string() + } else { + "Unknown".to_string() + }; let parts = exec.properties().output_partitioning().partition_count(); format!("DataSourceExec: ({config}) [{parts} partitions]") - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { format!( "GlobalLimit(skip={}, fetch={:?})", exec.skip(), exec.fetch() ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { format!("LocalLimit({})", exec.fetch()) } else { debug!("Unknown physical operator when producing DOT graph: {plan:?}"); diff --git a/ballista/scheduler/src/state/execution_stage.rs b/ballista/scheduler/src/state/execution_stage.rs index bb702c2898..ea2c6b6c07 100644 --- a/ballista/scheduler/src/state/execution_stage.rs +++ b/ballista/scheduler/src/state/execution_stage.rs @@ -1005,11 +1005,11 @@ impl Debug for FailedStage { /// will be different. Here, we should use the input partition count. fn get_stage_partitions(plan: Arc) -> usize { // Try ShuffleWriterExec first - if let Some(shuffle_writer) = plan.as_any().downcast_ref::() { + if let Some(shuffle_writer) = plan.downcast_ref::() { return shuffle_writer.input_partition_count(); } // Try SortShuffleWriterExec - if let Some(shuffle_writer) = plan.as_any().downcast_ref::() { + if let Some(shuffle_writer) = plan.downcast_ref::() { return shuffle_writer.input_partition_count(); } // Fallback to output partitioning diff --git a/ballista/scheduler/src/test_utils.rs b/ballista/scheduler/src/test_utils.rs index 93ca29d752..2e900f4d71 100644 --- a/ballista/scheduler/src/test_utils.rs +++ b/ballista/scheduler/src/test_utils.rs @@ -19,7 +19,6 @@ use ballista_core::JobStatusSubscriber; use ballista_core::error::{BallistaError, Result}; use ballista_core::extension::SessionConfigExt; use datafusion::catalog::Session; -use std::any::Any; use std::collections::HashMap; use std::future::Future; use std::sync::Arc; @@ -82,10 +81,6 @@ pub 
struct ExplodingTableProvider; #[async_trait] impl TableProvider for ExplodingTableProvider { - fn as_any(&self) -> &dyn Any { - self - } - fn schema(&self) -> SchemaRef { Arc::new(Schema::empty()) } From 25dec12903938930730fd2bcfaef2401a5fac1c6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 26 May 2026 08:05:33 -0600 Subject: [PATCH 2/6] fix(test): pick input values that split across hash partitions DataFusion 54 changed the deterministic seed used by the repartition hash partitioner (REPARTITION_RANDOM_STATE). The values 1 and 3 that the shuffle_writer tests fed in now hash to the same bucket under that seed, so the writer produced a single partition file instead of two and the assertions on per-partition row counts failed on every platform. Switch the test input to 0 and 2, which split cleanly under the new seed, and leave a comment noting the dependency. --- ballista/core/src/execution_plans/shuffle_writer.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ballista/core/src/execution_plans/shuffle_writer.rs b/ballista/core/src/execution_plans/shuffle_writer.rs index 0e68c42393..00088fd8a0 100644 --- a/ballista/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/core/src/execution_plans/shuffle_writer.rs @@ -782,7 +782,10 @@ mod tests { let batch = RecordBatch::try_new( schema.clone(), vec![ - Arc::new(UInt32Array::from(vec![Some(1), Some(3)])), + // 0 and 2 deliberately hash to different partitions under + // DataFusion 54's repartition hash seed; bumping the seed + // again may require picking new values here. 
+ Arc::new(UInt32Array::from(vec![Some(0), Some(2)])), Arc::new(StringArray::from(vec![Some("hello"), Some("world")])), ], )?; From b68f73e3515face5eb03cf2038ca4e305240ab25 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 26 May 2026 09:28:06 -0600 Subject: [PATCH 3/6] test: re-baseline scheduler tests for DF 54 plan shapes and add work-stealing regression coverage DataFusion 54 ships a smarter planner: a 3-table join now uses a broadcast inner join that scans one table directly inside the join stage instead of giving it its own shuffle stage, so the dot snapshots shrink to 3 stages, and HashJoinExec fuses the trailing ProjectionExec into its own projection field. Update the dot snapshot tests, the executor-loss recovery assertions in execution_graph::test, and the AQE insta snapshot for should_support_join_re_ordering to match these new plan shapes. Also add ballista/client/tests/multi_file_scan.rs as a follow-up regression suite. DataFusion 54's FileScanConfig now populates a shared work source over every file in the scan, and each Ballista task ends up draining that queue locally, so a 6-file table scanned with 6 tasks reads 36 files instead of 6. The two tests document the failure and are #[ignore]d for now; they should turn green once the deserialised plan is pre-split per task (the approach datafusion-distributed used in PR #467). --- ballista/client/tests/multi_file_scan.rs | 202 ++++++++++++++++++ .../src/state/aqe/test/alter_stages.rs | 48 ++--- .../scheduler/src/state/execution_graph.rs | 49 +++-- .../src/state/execution_graph_dot.rs | 125 +++++------ 4 files changed, 310 insertions(+), 114 deletions(-) create mode 100644 ballista/client/tests/multi_file_scan.rs diff --git a/ballista/client/tests/multi_file_scan.rs b/ballista/client/tests/multi_file_scan.rs new file mode 100644 index 0000000000..4c121ab89f --- /dev/null +++ b/ballista/client/tests/multi_file_scan.rs @@ -0,0 +1,202 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod common; + +// Regression coverage for the DataFusion 54 upgrade tracked in +// https://github.com/apache/datafusion-ballista/issues/1776 (see also the +// linked datafusion-distributed issue #460 and PR #467 about FileScanConfig +// work stealing). +// +// DataFusion 54's `FileScanConfig::create_sibling_state` returns a +// `SharedWorkSource` populated with every file in the scan. When any +// partition of that DataSourceExec opens its stream, it pulls files off the +// shared queue until empty. In a single-process DataFusion run this is +// harmless because all partitions of the same DataSourceExec instance share +// the queue and the queue is drained exactly once across them. +// +// Ballista breaks that invariant: each task deserialises its *own* copy of +// the plan and executes a single partition. Each task therefore has its own +// shared queue containing every file, and the partition it runs drains the +// whole queue. The result is that every file is scanned once per task, so a +// 6-file table with 6 tasks reads 36 files and returns 6x the correct row +// count. +// +// These tests are deliberately left enabled but #[ignore]d so they document +// the failure mode without blocking CI. 
They should turn green once Ballista +// either pre-splits FileScanConfig file_groups per task before serialisation +// (the approach datafusion-distributed took in PR #467) or otherwise stops +// each task from inheriting the full shared work queue. +#[cfg(test)] +#[cfg(feature = "standalone")] +mod work_stealing { + use ballista::prelude::SessionContextExt; + use datafusion::arrow::array::{Int64Array, RecordBatch}; + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::common::Result; + use datafusion::config::TableParquetOptions; + use datafusion::dataframe::DataFrameWriteOptions; + use datafusion::prelude::{ParquetReadOptions, SessionContext}; + use std::sync::Arc; + use tempfile::TempDir; + + /// Writes `num_files` parquet files into `dir`, each holding the rows + /// `[file_idx * rows_per_file .. (file_idx + 1) * rows_per_file)`. + /// Returns the total number of rows written and the expected sum across + /// the `value` column, which the tests use to detect duplicated or missing + /// reads. + async fn write_parquet_dataset( + dir: &std::path::Path, + num_files: usize, + rows_per_file: usize, + ) -> Result<(usize, i64)> { + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + DataType::Int64, + false, + )])); + + // DataFusion-only context for writing the fixture so we don't depend + // on the cluster being healthy for setup. 
+ let writer_ctx = SessionContext::new(); + for file_idx in 0..num_files { + let start = (file_idx * rows_per_file) as i64; + let values: Vec = (start..start + rows_per_file as i64).collect(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int64Array::from(values))], + )?; + let df = writer_ctx.read_batch(batch)?; + let path = dir.join(format!("part-{file_idx:04}.parquet")); + df.write_parquet( + path.to_str().unwrap(), + DataFrameWriteOptions::default(), + Some(TableParquetOptions::default()), + ) + .await?; + } + + let total_rows = num_files * rows_per_file; + let total_sum = (0..total_rows as i64).sum(); + Ok((total_rows, total_sum)) + } + + // Each Ballista task currently drains the full shared work queue, so the + // returned row count is `num_files * tasks` instead of `num_files * + // rows_per_file`. Re-enable once the upstream issue is addressed. + #[ignore = "FileScanConfig shared work queue causes per-task over-reads under DF 54"] + #[tokio::test] + async fn multi_file_parquet_scan_counts_every_row_exactly_once() -> Result<()> { + let tmp_dir = TempDir::new().unwrap(); + let (expected_rows, expected_sum) = + write_parquet_dataset(tmp_dir.path(), 6, 7).await?; + + let ctx = SessionContext::standalone().await?; + ctx.register_parquet( + "t", + tmp_dir.path().to_str().unwrap(), + ParquetReadOptions::default(), + ) + .await?; + + let batches = ctx + .sql("SELECT COUNT(*) AS row_count, SUM(value) AS value_sum FROM t") + .await? 
+ .collect() + .await?; + + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + let row_count = batch + .column_by_name("row_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + let value_sum = batch + .column_by_name("value_sum") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + + assert_eq!( + row_count, expected_rows as i64, + "Ballista returned the wrong row count; work stealing causes \ + duplicated rows here" + ); + assert_eq!( + value_sum, expected_sum, + "Ballista returned the wrong column sum; duplicated reads inflate \ + this" + ); + + Ok(()) + } + + #[ignore = "FileScanConfig shared work queue causes per-task over-reads under DF 54"] + #[tokio::test] + async fn multi_file_parquet_group_by_returns_each_value_once() -> Result<()> { + let tmp_dir = TempDir::new().unwrap(); + let (expected_rows, _) = write_parquet_dataset(tmp_dir.path(), 4, 5).await?; + + let ctx = SessionContext::standalone().await?; + ctx.register_parquet( + "t", + tmp_dir.path().to_str().unwrap(), + ParquetReadOptions::default(), + ) + .await?; + + // GROUP BY across the whole dataset exercises a shuffle on top of the + // multi-file scan. If the scan double-counts, the per-key counts + // become 2 or higher. + let batches = ctx + .sql("SELECT value, COUNT(*) AS c FROM t GROUP BY value") + .await? 
+ .collect() + .await?; + + let mut total_keys = 0usize; + for batch in &batches { + let counts = batch + .column_by_name("c") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..counts.len() { + assert_eq!( + counts.value(i), + 1, + "value at row {i} of batch was read {} times instead of \ + once; work stealing surfaces as a count > 1 here", + counts.value(i) + ); + total_keys += 1; + } + } + assert_eq!( + total_keys, expected_rows, + "expected every distinct value to be present exactly once" + ); + + Ok(()) + } +} diff --git a/ballista/scheduler/src/state/aqe/test/alter_stages.rs b/ballista/scheduler/src/state/aqe/test/alter_stages.rs index 1a51c91845..7661eacc0e 100644 --- a/ballista/scheduler/src/state/aqe/test/alter_stages.rs +++ b/ballista/scheduler/src/state/aqe/test/alter_stages.rs @@ -209,17 +209,17 @@ async fn should_support_join_re_ordering() -> datafusion::error::Result<()> { planner.finalise_stage_internal(1, small_statistics_exchange())?; // join ordering changes as build side is bigger than probe side - // after exchange statistic updated. + // after exchange statistic updated. DataFusion 54 fuses the trailing + // ProjectionExec into the HashJoinExec via the `projection` field. 
assert_plan!(planner.current_plan(), @ r" AdaptiveDatafusionExec: is_final=false, plan_id=2, stage_id=pending, stage_resolved=false - ProjectionExec: expr=[big_col@1 as big_col, big_col@0 as big_col] - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(big_col@0, big_col@0)] - ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=1, stage_id=1, stage_resolved=true - CooperativeExec - MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] - ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=0, stage_id=0, stage_resolved=true - CooperativeExec - MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(big_col@0, big_col@0)], projection=[big_col@1, big_col@0] + ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=1, stage_id=1, stage_resolved=true + CooperativeExec + MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] + ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=0, stage_id=0, stage_resolved=true + CooperativeExec + MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] "); let stages = planner.runnable_stages()?.unwrap(); @@ -227,28 +227,26 @@ async fn should_support_join_re_ordering() -> datafusion::error::Result<()> { assert_plan!(planner.current_plan(), @ r" AdaptiveDatafusionExec: is_final=true, plan_id=2, stage_id=2, stage_resolved=false - ProjectionExec: expr=[big_col@1 as big_col, big_col@0 as big_col] - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(big_col@0, big_col@0)] - ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=1, stage_id=1, stage_resolved=true - CooperativeExec - MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] - ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=0, stage_id=0, 
stage_resolved=true - CooperativeExec - MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(big_col@0, big_col@0)], projection=[big_col@1, big_col@0] + ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=1, stage_id=1, stage_resolved=true + CooperativeExec + MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] + ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=0, stage_id=0, stage_resolved=true + CooperativeExec + MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] "); planner.finalise_stage_internal(2, small_statistics_exchange())?; assert_plan!(planner.current_plan(), @ r" AdaptiveDatafusionExec: is_final=true, plan_id=2, stage_id=2, stage_resolved=true - ProjectionExec: expr=[big_col@1 as big_col, big_col@0 as big_col] - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(big_col@0, big_col@0)] - ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=1, stage_id=1, stage_resolved=true - CooperativeExec - MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] - ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=0, stage_id=0, stage_resolved=true - CooperativeExec - MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(big_col@0, big_col@0)], projection=[big_col@1, big_col@0] + ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=1, stage_id=1, stage_resolved=true + CooperativeExec + MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), Bytes=Exact(2097152), [(Col[0]:)]] + ExchangeExec: partitioning=Hash([big_col@0], 2), plan_id=0, stage_id=0, stage_resolved=true + CooperativeExec + MockPartitionedScan: num_partitions=2, statistics=[Rows=Exact(262144), 
Bytes=Exact(2097152), [(Col[0]:)]] "); Ok(()) diff --git a/ballista/scheduler/src/state/execution_graph.rs b/ballista/scheduler/src/state/execution_graph.rs index 4101c3debf..86222476ac 100644 --- a/ballista/scheduler/src/state/execution_graph.rs +++ b/ballista/scheduler/src/state/execution_graph.rs @@ -1881,15 +1881,18 @@ mod test { let mut join_graph = test_join_plan(4).await; // With the improvement of https://github.com/apache/arrow-datafusion/pull/4122, - // unnecessary RepartitionExec can be removed + // unnecessary RepartitionExec can be removed. DataFusion 54 took this + // a step further and now broadcasts one side of the join, so the + // graph has a single leaf stage (the scan of "left") with 2 tasks + // instead of the previous Y-shape with two 2-task leaves. assert_eq!(join_graph.stage_count(), 4); assert_eq!(join_graph.available_tasks(), 0); - // Call revive to move the two leaf Resolved stages to Running + // Call revive to move the leaf Resolved stage to Running join_graph.revive(); assert_eq!(join_graph.stage_count(), 4); - assert_eq!(join_graph.available_tasks(), 4); + assert_eq!(join_graph.available_tasks(), 2); // Complete the first stage revive_graph_and_complete_next_stage_with_executor(&mut join_graph, &executor1)?; @@ -1911,9 +1914,14 @@ mod test { let reset = join_graph.reset_stages_on_lost_executor(&executor1.id)?; - // Two stages were reset, 1 Running stage rollback to Unresolved and 1 Completed stage move to Running - assert_eq!(reset.0.len(), 2); - assert_eq!(join_graph.available_tasks(), 2); + // With the new linear plan, the running stage (stage 3) reads from + // stage 2 which was completed by executor2, so losing executor1 only + // resets the tasks that executor1 itself ran (1 completed + 1 + // in-flight on stage 3). No upstream stages are rolled back because + // their outputs are already consumed. After the reset, stage 3 has + // all 4 tasks pending again (2 reset + 2 that hadn't been popped). 
+ assert_eq!(reset.0.len(), 1); + assert_eq!(join_graph.available_tasks(), 4); drain_tasks(&mut join_graph)?; assert!(join_graph.is_successful(), "Failed to complete join plan"); @@ -1924,36 +1932,31 @@ mod test { #[tokio::test] async fn test_reset_resolved_stage_executor_lost() -> Result<()> { let executor1 = mock_executor("executor-id1".to_string()); - let executor2 = mock_executor("executor-id2".to_string()); + let _executor2 = mock_executor("executor-id2".to_string()); let mut join_graph = test_join_plan(4).await; assert_eq!(join_graph.stage_count(), 4); assert_eq!(join_graph.available_tasks(), 0); - // Call revive to move the two leaf Resolved stages to Running + // Call revive to move the leaf Resolved stage to Running. See + // test_reset_completed_stage_executor_lost for why DataFusion 54 + // produces a single leaf instead of two. join_graph.revive(); assert_eq!(join_graph.stage_count(), 4); - assert_eq!(join_graph.available_tasks(), 4); + assert_eq!(join_graph.available_tasks(), 2); - // Complete the first stage + // Complete the first stage with executor1. Do NOT complete the second + // stage: this leaves stage 2 in Resolved state holding inputs that + // live on executor1, which is exactly the scenario this test wants + // to exercise. assert_eq!(revive_graph_and_complete_next_stage(&mut join_graph)?, 2); - // Complete the second stage - assert_eq!( - revive_graph_and_complete_next_stage_with_executor( - &mut join_graph, - &executor2 - )?, - 2 - ); - - // There are 0 tasks pending schedule now - assert_eq!(join_graph.available_tasks(), 0); - let reset = join_graph.reset_stages_on_lost_executor(&executor1.id)?; - // Two stages were reset, 1 Resolved stage rollback to Unresolved and 1 Completed stage move to Running + // Stage 2 (Resolved) rolls back to Unresolved because its input came + // from executor1, and stage 1 (Successful) is resubmitted so its + // output can be recomputed. 
assert_eq!(reset.0.len(), 2); assert_eq!(join_graph.available_tasks(), 2); diff --git a/ballista/scheduler/src/state/execution_graph_dot.rs b/ballista/scheduler/src/state/execution_graph_dot.rs index 13095b9ac0..9f48160ddb 100644 --- a/ballista/scheduler/src/state/execution_graph_dot.rs +++ b/ballista/scheduler/src/state/execution_graph_dot.rs @@ -423,53 +423,43 @@ mod tests { let dot = ExecutionGraphDot::generate(&graph) .map_err(|e| BallistaError::Internal(format!("{e:?}")))?; + // DataFusion 54's physical planner picks a more efficient join plan + // for this query: `baz` is scanned inside the join stage instead of + // being given its own shuffle stage. That collapses the previous + // 5-stage plan into 3 stages. let expected = r#"digraph G { subgraph cluster0 { label = "Stage 1 [Resolved]"; - stage_1_0 [shape=box, label="SortShuffleWriter [2 partitions]"] + stage_1_0 [shape=box, label="ShuffleWriter [2 partitions]"] stage_1_0_0 [shape=box, label="DataSourceExec: (Memory) [2 partitions]"] stage_1_0_0 -> stage_1_0 } subgraph cluster1 { label = "Stage 2 [Resolved]"; - stage_2_0 [shape=box, label="SortShuffleWriter [2 partitions]"] + stage_2_0 [shape=box, label="ShuffleWriter [2 partitions]"] stage_2_0_0 [shape=box, label="DataSourceExec: (Memory) [2 partitions]"] stage_2_0_0 -> stage_2_0 } subgraph cluster2 { label = "Stage 3 [Unresolved]"; - stage_3_0 [shape=box, label="SortShuffleWriter [48 partitions]"] + stage_3_0 [shape=box, label="ShuffleWriter [2 partitions]"] stage_3_0_0 [shape=box, label="HashJoin -join_expr=a@0 = a@0 +join_expr=b@1 = b@3 filter_expr="] stage_3_0_0_0 [shape=box, label="UnresolvedShuffleExec [stage_id=1]"] stage_3_0_0_0 -> stage_3_0_0 - stage_3_0_0_1 [shape=box, label="UnresolvedShuffleExec [stage_id=2]"] + stage_3_0_0_1 [shape=box, label="HashJoin +join_expr=a@0 = a@0 +filter_expr="] + stage_3_0_0_1_0 [shape=box, label="UnresolvedShuffleExec [stage_id=2]"] + stage_3_0_0_1_0 -> stage_3_0_0_1 + stage_3_0_0_1_1 [shape=box, 
label="DataSourceExec: (Memory) [2 partitions]"] + stage_3_0_0_1_1 -> stage_3_0_0_1 stage_3_0_0_1 -> stage_3_0_0 stage_3_0_0 -> stage_3_0 } - subgraph cluster3 { - label = "Stage 4 [Resolved]"; - stage_4_0 [shape=box, label="SortShuffleWriter [2 partitions]"] - stage_4_0_0 [shape=box, label="DataSourceExec: (Memory) [2 partitions]"] - stage_4_0_0 -> stage_4_0 - } - subgraph cluster4 { - label = "Stage 5 [Unresolved]"; - stage_5_0 [shape=box, label="ShuffleWriter [48 partitions]"] - stage_5_0_0 [shape=box, label="HashJoin -join_expr=b@3 = b@1 -filter_expr="] - stage_5_0_0_0 [shape=box, label="UnresolvedShuffleExec [stage_id=3]"] - stage_5_0_0_0 -> stage_5_0_0 - stage_5_0_0_1 [shape=box, label="UnresolvedShuffleExec [stage_id=4]"] - stage_5_0_0_1 -> stage_5_0_0 - stage_5_0_0 -> stage_5_0 - } stage_1_0 -> stage_3_0_0_0 - stage_2_0 -> stage_3_0_0_1 - stage_3_0 -> stage_5_0_0_0 - stage_4_0 -> stage_5_0_0_1 + stage_2_0 -> stage_3_0_0_1_0 } "#; assert_eq!(expected, &dot); @@ -483,13 +473,19 @@ filter_expr="] .map_err(|e| BallistaError::Internal(format!("{e:?}")))?; let expected = r#"digraph G { - stage_3_0 [shape=box, label="SortShuffleWriter [48 partitions]"] + stage_3_0 [shape=box, label="ShuffleWriter [2 partitions]"] stage_3_0_0 [shape=box, label="HashJoin -join_expr=a@0 = a@0 +join_expr=b@1 = b@3 filter_expr="] stage_3_0_0_0 [shape=box, label="UnresolvedShuffleExec [stage_id=1]"] stage_3_0_0_0 -> stage_3_0_0 - stage_3_0_0_1 [shape=box, label="UnresolvedShuffleExec [stage_id=2]"] + stage_3_0_0_1 [shape=box, label="HashJoin +join_expr=a@0 = a@0 +filter_expr="] + stage_3_0_0_1_0 [shape=box, label="UnresolvedShuffleExec [stage_id=2]"] + stage_3_0_0_1_0 -> stage_3_0_0_1 + stage_3_0_0_1_1 [shape=box, label="DataSourceExec: (Memory) [2 partitions]"] + stage_3_0_0_1_1 -> stage_3_0_0_1 stage_3_0_0_1 -> stage_3_0_0 stage_3_0_0 -> stage_3_0 } @@ -504,46 +500,43 @@ filter_expr="] let dot = ExecutionGraphDot::generate(&graph) .map_err(|e| 
BallistaError::Internal(format!("{e:?}")))?; + // DataFusion 54 collapses the join graph for this query into a single + // distributed stage that absorbs the third scan as the inner side of a + // broadcast hash join. The previous 4-stage shape was a planning + // artifact, not a Ballista requirement. let expected = r#"digraph G { subgraph cluster0 { label = "Stage 1 [Resolved]"; - stage_1_0 [shape=box, label="SortShuffleWriter [2 partitions]"] + stage_1_0 [shape=box, label="ShuffleWriter [2 partitions]"] stage_1_0_0 [shape=box, label="DataSourceExec: (Memory) [2 partitions]"] stage_1_0_0 -> stage_1_0 } subgraph cluster1 { label = "Stage 2 [Resolved]"; - stage_2_0 [shape=box, label="SortShuffleWriter [2 partitions]"] + stage_2_0 [shape=box, label="ShuffleWriter [2 partitions]"] stage_2_0_0 [shape=box, label="DataSourceExec: (Memory) [2 partitions]"] stage_2_0_0 -> stage_2_0 } subgraph cluster2 { - label = "Stage 3 [Resolved]"; - stage_3_0 [shape=box, label="SortShuffleWriter [2 partitions]"] - stage_3_0_0 [shape=box, label="DataSourceExec: (Memory) [2 partitions]"] - stage_3_0_0 -> stage_3_0 - } - subgraph cluster3 { - label = "Stage 4 [Unresolved]"; - stage_4_0 [shape=box, label="ShuffleWriter [48 partitions]"] - stage_4_0_0 [shape=box, label="HashJoin -join_expr=a@1 = a@0 + label = "Stage 3 [Unresolved]"; + stage_3_0 [shape=box, label="ShuffleWriter [2 partitions]"] + stage_3_0_0 [shape=box, label="HashJoin +join_expr=a@0 = a@1 filter_expr="] - stage_4_0_0_0 [shape=box, label="HashJoin + stage_3_0_0_0 [shape=box, label="UnresolvedShuffleExec [stage_id=1]"] + stage_3_0_0_0 -> stage_3_0_0 + stage_3_0_0_1 [shape=box, label="HashJoin join_expr=a@0 = a@0 filter_expr="] - stage_4_0_0_0_0 [shape=box, label="UnresolvedShuffleExec [stage_id=1]"] - stage_4_0_0_0_0 -> stage_4_0_0_0 - stage_4_0_0_0_1 [shape=box, label="UnresolvedShuffleExec [stage_id=2]"] - stage_4_0_0_0_1 -> stage_4_0_0_0 - stage_4_0_0_0 -> stage_4_0_0 - stage_4_0_0_1 [shape=box, 
label="UnresolvedShuffleExec [stage_id=3]"] - stage_4_0_0_1 -> stage_4_0_0 - stage_4_0_0 -> stage_4_0 + stage_3_0_0_1_0 [shape=box, label="UnresolvedShuffleExec [stage_id=2]"] + stage_3_0_0_1_0 -> stage_3_0_0_1 + stage_3_0_0_1_1 [shape=box, label="DataSourceExec: (Memory) [2 partitions]"] + stage_3_0_0_1_1 -> stage_3_0_0_1 + stage_3_0_0_1 -> stage_3_0_0 + stage_3_0_0 -> stage_3_0 } - stage_1_0 -> stage_4_0_0_0_0 - stage_2_0 -> stage_4_0_0_0_1 - stage_3_0 -> stage_4_0_0_1 + stage_1_0 -> stage_3_0_0_0 + stage_2_0 -> stage_3_0_0_1_0 } "#; assert_eq!(expected, &dot); @@ -553,25 +546,25 @@ filter_expr="] #[tokio::test] async fn query_stage_optimized() -> Result<()> { let graph = test_graph_optimized().await?; - let dot = ExecutionGraphDot::generate_for_query_stage(&graph, 4) + let dot = ExecutionGraphDot::generate_for_query_stage(&graph, 3) .map_err(|e| BallistaError::Internal(format!("{e:?}")))?; let expected = r#"digraph G { - stage_4_0 [shape=box, label="ShuffleWriter [48 partitions]"] - stage_4_0_0 [shape=box, label="HashJoin -join_expr=a@1 = a@0 + stage_3_0 [shape=box, label="ShuffleWriter [2 partitions]"] + stage_3_0_0 [shape=box, label="HashJoin +join_expr=a@0 = a@1 filter_expr="] - stage_4_0_0_0 [shape=box, label="HashJoin + stage_3_0_0_0 [shape=box, label="UnresolvedShuffleExec [stage_id=1]"] + stage_3_0_0_0 -> stage_3_0_0 + stage_3_0_0_1 [shape=box, label="HashJoin join_expr=a@0 = a@0 filter_expr="] - stage_4_0_0_0_0 [shape=box, label="UnresolvedShuffleExec [stage_id=1]"] - stage_4_0_0_0_0 -> stage_4_0_0_0 - stage_4_0_0_0_1 [shape=box, label="UnresolvedShuffleExec [stage_id=2]"] - stage_4_0_0_0_1 -> stage_4_0_0_0 - stage_4_0_0_0 -> stage_4_0_0 - stage_4_0_0_1 [shape=box, label="UnresolvedShuffleExec [stage_id=3]"] - stage_4_0_0_1 -> stage_4_0_0 - stage_4_0_0 -> stage_4_0 + stage_3_0_0_1_0 [shape=box, label="UnresolvedShuffleExec [stage_id=2]"] + stage_3_0_0_1_0 -> stage_3_0_0_1 + stage_3_0_0_1_1 [shape=box, label="DataSourceExec: (Memory) [2 partitions]"] + 
stage_3_0_0_1_1 -> stage_3_0_0_1 + stage_3_0_0_1 -> stage_3_0_0 + stage_3_0_0 -> stage_3_0 } "#; assert_eq!(expected, &dot); From 2c1e962e3964a99781b63aff4700fc5618ca21f8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 26 May 2026 09:47:31 -0600 Subject: [PATCH 4/6] fix: narrow FileScanConfig file_groups per task to defeat DF 54 work stealing DataFusion 54's FileScanConfig publishes a SharedWorkSource populated with every file in the scan, and each partition's stream drains that shared queue. In a single-process DataFusion run that's fine because all partitions share one queue and cooperatively drain it; in Ballista each task deserialises its own DataSourceExec and runs a single partition, so the partition that does run drains the whole queue and ends up reading every file in the scan. A 6-file scan dispatched as 6 tasks therefore returns 6x the rows. Introduce restrict_file_scan_to_partition, a TreeNode transform that walks the plan tree just before execution and rebuilds every FileScanConfig so only the target partition's file group keeps its files. The other slots become empty FileGroups so file_groups.len() (and therefore the advertised partition count) stays the same, leaving partition routing through the rest of the plan untouched. Wire the transform into ShuffleWriterExec::execute_shuffle_write and SortShuffleWriterExec::execute_shuffle_write so every task scans its assigned slice and only its slice. Drops the #[ignore] from the multi_file_scan integration tests, which now exercise this fix end-to-end through a standalone Ballista cluster. 
--- ballista/client/tests/multi_file_scan.rs | 34 ++-- ballista/core/src/execution_plans/mod.rs | 2 + .../src/execution_plans/restrict_file_scan.rs | 188 ++++++++++++++++++ .../src/execution_plans/shuffle_writer.rs | 8 +- .../execution_plans/sort_shuffle/writer.rs | 8 +- 5 files changed, 214 insertions(+), 26 deletions(-) create mode 100644 ballista/core/src/execution_plans/restrict_file_scan.rs diff --git a/ballista/client/tests/multi_file_scan.rs b/ballista/client/tests/multi_file_scan.rs index 4c121ab89f..6ccd283fb6 100644 --- a/ballista/client/tests/multi_file_scan.rs +++ b/ballista/client/tests/multi_file_scan.rs @@ -23,24 +23,19 @@ mod common; // work stealing). // // DataFusion 54's `FileScanConfig::create_sibling_state` returns a -// `SharedWorkSource` populated with every file in the scan. When any -// partition of that DataSourceExec opens its stream, it pulls files off the -// shared queue until empty. In a single-process DataFusion run this is -// harmless because all partitions of the same DataSourceExec instance share -// the queue and the queue is drained exactly once across them. +// `SharedWorkSource` populated with every file in the scan, and each +// partition's stream drains files from that queue. In a single-process +// DataFusion run that's fine because all partitions of the same +// DataSourceExec instance cooperatively drain one queue, but Ballista +// deserialises a fresh DataSourceExec for every task and runs a single +// partition against it. Without intervention the partition that does run +// drains the whole queue and reads every file, so a 6-file table executed +// by 6 tasks returns 6x the data. // -// Ballista breaks that invariant: each task deserialises its *own* copy of -// the plan and executes a single partition. Each task therefore has its own -// shared queue containing every file, and the partition it runs drains the -// whole queue. 
The result is that every file is scanned once per task, so a -// 6-file table with 6 tasks reads 36 files and returns 6x the correct row -// count. -// -// These tests are deliberately left enabled but #[ignore]d so they document -// the failure mode without blocking CI. They should turn green once Ballista -// either pre-splits FileScanConfig file_groups per task before serialisation -// (the approach datafusion-distributed took in PR #467) or otherwise stops -// each task from inheriting the full shared work queue. +// `restrict_file_scan_to_partition` in ballista-core narrows the +// FileScanConfig to the running partition's files just before execution so +// the SharedWorkSource only contains the slice this task is supposed to +// process. These tests would fail without that helper. #[cfg(test)] #[cfg(feature = "standalone")] mod work_stealing { @@ -95,10 +90,6 @@ mod work_stealing { Ok((total_rows, total_sum)) } - // Each Ballista task currently drains the full shared work queue, so the - // returned row count is `num_files * tasks` instead of `num_files * - // rows_per_file`. Re-enable once the upstream issue is addressed. 
- #[ignore = "FileScanConfig shared work queue causes per-task over-reads under DF 54"] #[tokio::test] async fn multi_file_parquet_scan_counts_every_row_exactly_once() -> Result<()> { let tmp_dir = TempDir::new().unwrap(); @@ -150,7 +141,6 @@ mod work_stealing { Ok(()) } - #[ignore = "FileScanConfig shared work queue causes per-task over-reads under DF 54"] #[tokio::test] async fn multi_file_parquet_group_by_returns_each_value_once() -> Result<()> { let tmp_dir = TempDir::new().unwrap(); diff --git a/ballista/core/src/execution_plans/mod.rs b/ballista/core/src/execution_plans/mod.rs index ae46fad687..d97964ef03 100644 --- a/ballista/core/src/execution_plans/mod.rs +++ b/ballista/core/src/execution_plans/mod.rs @@ -20,6 +20,7 @@ mod distributed_explain_analyze; mod distributed_query; +mod restrict_file_scan; mod shuffle_reader; mod shuffle_writer; mod shuffle_writer_trait; @@ -31,6 +32,7 @@ use std::path::{Path, PathBuf}; use datafusion::common::exec_err; pub use distributed_explain_analyze::DistributedExplainAnalyzeExec; pub use distributed_query::DistributedQueryExec; +pub use restrict_file_scan::restrict_file_scan_to_partition; pub use shuffle_reader::{CoalescePlan, PartitionGroup, ShuffleReaderExec}; pub use shuffle_reader::{stats_for_partition, stats_for_partitions}; pub use shuffle_writer::DEFAULT_SHUFFLE_CHANNEL_CAPACITY; diff --git a/ballista/core/src/execution_plans/restrict_file_scan.rs b/ballista/core/src/execution_plans/restrict_file_scan.rs new file mode 100644 index 0000000000..4c7f61a4f1 --- /dev/null +++ b/ballista/core/src/execution_plans/restrict_file_scan.rs @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Per-task `FileScanConfig` narrowing. +//! +//! DataFusion 54 added a shared work queue (`SharedWorkSource`) to +//! `FileScanConfig`: when any partition opens its stream, it pulls files from +//! a queue populated with every file in the scan. That model assumes every +//! partition runs together inside the same DataSourceExec instance so they +//! cooperatively drain the queue exactly once. Ballista breaks that +//! assumption — each task deserialises its own copy of the plan and runs a +//! single partition — so the partition that does run drains the whole queue +//! and ends up scanning every file. Concretely, a 6-file scan executed by 6 +//! tasks reads 36 files and returns six copies of the data. +//! +//! [`restrict_file_scan_to_partition`] rewrites the plan tree just before +//! execution so that every `FileScanConfig` only contains files for the +//! partition being executed. The file group count (and therefore the +//! advertised partitioning) is preserved by replacing the other slots with +//! empty groups, so partition routing through the rest of the plan is +//! unaffected. +//! +//! See `ballista/client/tests/multi_file_scan.rs` for the end-to-end +//! regression that motivated this helper. 
+ +use std::sync::Arc; + +use datafusion::common::Result; +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::datasource::physical_plan::{ + FileGroup, FileScanConfig, FileScanConfigBuilder, +}; +use datafusion::datasource::source::DataSourceExec; +use datafusion::physical_plan::ExecutionPlan; + +/// Narrow every `FileScanConfig` in `plan` so that only `partition`'s file +/// group has files; all other slots become empty. +/// +/// This keeps `file_groups.len()` (and therefore the advertised partition +/// count) unchanged, which means the rest of the plan can still route +/// `execute(partition)` calls through unmodified operators. The +/// `SharedWorkSource` that the file scan builds from `file_groups` ends up +/// containing only the relevant partition's files, so the active partition +/// reads exactly its assigned slice instead of draining the full set. +/// +/// If the leaf is something other than a `FileScanConfig`-backed +/// `DataSourceExec`, the node is left alone — there's nothing for the shared +/// queue to mishandle. +pub fn restrict_file_scan_to_partition( + plan: Arc, + partition: usize, +) -> Result> { + plan.transform_down(|node| { + let Some(data_source_exec) = node.downcast_ref::() else { + return Ok(Transformed::no(node)); + }; + let Some(file_scan) = data_source_exec + .data_source() + .downcast_ref::() + else { + return Ok(Transformed::no(node)); + }; + + // Nothing to do for single-partition scans: the shared queue still + // matches what `execute(0)` would consume, so this is already + // correct. 
+ if file_scan.file_groups.len() <= 1 { + return Ok(Transformed::no(node)); + } + + let mut new_groups: Vec = + Vec::with_capacity(file_scan.file_groups.len()); + for (idx, group) in file_scan.file_groups.iter().enumerate() { + if idx == partition { + new_groups.push(group.clone()); + } else { + new_groups.push(FileGroup::new(Vec::new())); + } + } + + let new_config = FileScanConfigBuilder::from(file_scan.clone()) + .with_file_groups(new_groups) + .build(); + let new_exec = + DataSourceExec::from_data_source(new_config) as Arc; + Ok(Transformed::yes(new_exec)) + }) + .map(|t| t.data) +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::datasource::listing::PartitionedFile; + use datafusion::datasource::physical_plan::ParquetSource; + use datafusion::execution::object_store::ObjectStoreUrl; + use std::sync::Arc; + + fn dummy_file(name: &str) -> PartitionedFile { + PartitionedFile::new(name.to_string(), 0) + } + + fn build_plan(num_groups: usize) -> Arc { + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + DataType::Int64, + false, + )])); + let groups: Vec = (0..num_groups) + .map(|i| FileGroup::new(vec![dummy_file(&format!("f{i}.parquet"))])) + .collect(); + let file_source = Arc::new(ParquetSource::new(schema.clone())); + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) + .with_file_groups(groups) + .build(); + DataSourceExec::from_data_source(config) as Arc + } + + fn file_groups_of(plan: &Arc) -> Vec> { + let exec = plan + .downcast_ref::() + .expect("DataSourceExec"); + let scan = exec + .data_source() + .downcast_ref::() + .expect("FileScanConfig"); + scan.file_groups + .iter() + .map(|g| g.iter().map(|f| f.path().to_string()).collect()) + .collect() + } + + #[test] + fn keeps_only_target_partition_files() { + let plan = build_plan(4); + let restricted = restrict_file_scan_to_partition(plan, 2).unwrap(); + let groups = 
file_groups_of(&restricted); + assert_eq!( + groups, + vec![ + vec![] as Vec, + vec![], + vec!["f2.parquet".to_string()], + vec![], + ], + "only file_groups[2] should keep its file; the others must be empty so \ + the SharedWorkSource contains only this task's slice" + ); + } + + #[test] + fn preserves_partition_count() { + let plan = build_plan(3); + let restricted = restrict_file_scan_to_partition(plan, 1).unwrap(); + let groups = file_groups_of(&restricted); + assert_eq!( + groups.len(), + 3, + "file_groups length must be preserved so DataSourceExec keeps its \ + advertised partition count" + ); + } + + #[test] + fn single_partition_scan_is_left_alone() { + let plan = build_plan(1); + let restricted = restrict_file_scan_to_partition(Arc::clone(&plan), 0).unwrap(); + // The transform should detect there's nothing to narrow and return + // the original Arc untouched (it's a no-op in that case). + assert!(Arc::ptr_eq(&plan, &restricted)); + } +} diff --git a/ballista/core/src/execution_plans/shuffle_writer.rs b/ballista/core/src/execution_plans/shuffle_writer.rs index 00088fd8a0..c52c9a0c4c 100644 --- a/ballista/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/core/src/execution_plans/shuffle_writer.rs @@ -34,7 +34,7 @@ use std::path::PathBuf; use std::sync::Arc; use std::time::Instant; -use crate::execution_plans::create_shuffle_path; +use crate::execution_plans::{create_shuffle_path, restrict_file_scan_to_partition}; use crate::extension::SessionConfigExt; use crate::utils; @@ -206,9 +206,13 @@ impl ShuffleWriterExec { ) -> impl Future>> { let write_metrics = ShuffleWriteMetrics::new(input_partition, &self.metrics); let output_partitioning = self.shuffle_output_partitioning.clone(); - let plan = self.plan.clone(); + // Restrict file scans to this task's partition so DataFusion 54's + // shared work queue can't pull files from sibling partitions; see + // [`restrict_file_scan_to_partition`] for the full story. 
+ let plan = restrict_file_scan_to_partition(self.plan.clone(), input_partition); async move { + let plan = plan?; let now = Instant::now(); let channel_capacity = context .session_config() diff --git a/ballista/core/src/execution_plans/sort_shuffle/writer.rs b/ballista/core/src/execution_plans/sort_shuffle/writer.rs index 7990d642cd..7e0029ca67 100644 --- a/ballista/core/src/execution_plans/sort_shuffle/writer.rs +++ b/ballista/core/src/execution_plans/sort_shuffle/writer.rs @@ -34,7 +34,7 @@ use super::config::SortShuffleConfig; use super::index::ShuffleIndex; use super::partitioned_batch_iterator::PartitionedBatchIterator; use super::spill::SpillManager; -use crate::execution_plans::create_shuffle_path; +use crate::execution_plans::{create_shuffle_path, restrict_file_scan_to_partition}; use crate::serde::protobuf::ShuffleWritePartition; use datafusion::arrow::array::{ @@ -202,13 +202,17 @@ impl SortShuffleWriterExec { ) -> impl Future>> { let metrics = SortShuffleWriteMetrics::new(input_partition, &self.metrics); let config = self.config.clone(); - let plan = self.plan.clone(); + // Restrict file scans to this task's partition so DataFusion 54's + // shared work queue can't pull files from sibling partitions; see + // [`restrict_file_scan_to_partition`] for the full story. 
+ let plan = restrict_file_scan_to_partition(self.plan.clone(), input_partition); let work_dir = self.work_dir.clone(); let job_id = self.job_id.clone(); let stage_id = self.stage_id; let partitioning = self.shuffle_output_partitioning.clone(); async move { + let plan = plan?; let now = Instant::now(); let mut stream = plan.execute(input_partition, context.clone())?; let schema = stream.schema(); From 1f7721d192b1d6bc9fd4ac9a487a4de57396b5f9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 26 May 2026 16:28:17 -0600 Subject: [PATCH 5/6] fix: disable shared work source via preserve_order instead of emptying file_groups The previous work-stealing fix narrowed each FileScanConfig's file_groups so only the running partition's slot kept its files. That works for the multi-file scan smoke tests, but it broke broadcast hash joins. In a CollectLeft-style HashJoinExec the join collects its build-side DataSourceExec by calling execute(0..K) on it from inside the join, and emptying out every slot except the running task's left the hash table starved. TPC-H Q11 hangs in that configuration: queries 1-10 finished in under 25s, then Q11 sat with no progress until GitHub Actions killed the job at the 6h limit. Switch to setting preserve_order=true on every FileScanConfig instead. That short-circuits FileScanConfig::create_sibling_state to None, which disables the SharedWorkSource entirely. Each partition then falls back to WorkSource::Local(file_groups[partition]) and scans exactly the files the planner assigned to it. File group membership is left untouched, so broadcast joins can still iterate the full set on the build side. preserve_order itself only suppresses scan-time file reordering; it's already implicitly true whenever the config has an output ordering, so the code path is well exercised upstream. 
Adds a multi_file_parquet_broadcast_hash_join_returns_full_result test that joins two multi-file parquet tables and checks the row count, as a smaller-than-TPC-H regression for the build-side-starvation failure mode. --- ballista/client/tests/multi_file_scan.rs | 69 +++++++- .../src/execution_plans/restrict_file_scan.rs | 159 ++++++++++-------- 2 files changed, 150 insertions(+), 78 deletions(-) diff --git a/ballista/client/tests/multi_file_scan.rs b/ballista/client/tests/multi_file_scan.rs index 6ccd283fb6..8ed65c498c 100644 --- a/ballista/client/tests/multi_file_scan.rs +++ b/ballista/client/tests/multi_file_scan.rs @@ -32,10 +32,13 @@ mod common; // drains the whole queue and reads every file, so a 6-file table executed // by 6 tasks returns 6x the data. // -// `restrict_file_scan_to_partition` in ballista-core narrows the -// FileScanConfig to the running partition's files just before execution so -// the SharedWorkSource only contains the slice this task is supposed to -// process. These tests would fail without that helper. +// `restrict_file_scan_to_partition` in ballista-core sets +// `preserve_order = true` on every FileScanConfig before execution, which +// short-circuits `FileScanConfig::create_sibling_state` to `None`. Each +// partition then falls back to `WorkSource::Local(file_groups[partition])` +// and scans exactly the files the planner assigned to it, so a 6-file scan +// dispatched as 6 tasks reads 6 files instead of 36. These tests would fail +// without that helper. #[cfg(test)] #[cfg(feature = "standalone")] mod work_stealing { @@ -141,6 +144,64 @@ mod work_stealing { Ok(()) } + // Regression for an earlier version of the work-stealing fix that emptied + // out file_groups for all partition slots except the running task's. That + // broke TPC-H Q11: in a broadcast hash join the build-side + // DataSourceExec is read with execute(0..K) by the join itself, so + // emptying the other slots starved the hash table and the join hung. 
+ // This test joins two multi-file parquet tables under a configuration + // that strongly biases the planner toward broadcast hash join, and + // checks the join still returns every matched row. + #[tokio::test] + async fn multi_file_parquet_broadcast_hash_join_returns_full_result() -> Result<()> { + let left_dir = TempDir::new().unwrap(); + let right_dir = TempDir::new().unwrap(); + // Left side is intentionally larger so the planner picks the small + // right side as the broadcast build input. + let (left_rows, _) = write_parquet_dataset(left_dir.path(), 5, 8).await?; + let (right_rows, _) = write_parquet_dataset(right_dir.path(), 4, 4).await?; + + let ctx = SessionContext::standalone().await?; + ctx.register_parquet( + "l", + left_dir.path().to_str().unwrap(), + ParquetReadOptions::default(), + ) + .await?; + ctx.register_parquet( + "r", + right_dir.path().to_str().unwrap(), + ParquetReadOptions::default(), + ) + .await?; + + let batches = ctx + .sql("SELECT COUNT(*) AS matched FROM l JOIN r ON l.value = r.value") + .await? + .collect() + .await?; + + let matched = batches[0] + .column_by_name("matched") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + // Right is a subset of left (left = 0..40, right = 0..16), so + // the join must match exactly `right_rows` rows. Anything less means + // the build-side scan lost data; anything more would mean the probe + // side double-read. 
+ assert_eq!( + matched, right_rows as i64, + "broadcast hash join over multi-file scans must see every \ + build-side row exactly once; left had {left_rows} rows, right \ + had {right_rows}" + ); + + Ok(()) + } + #[tokio::test] async fn multi_file_parquet_group_by_returns_each_value_once() -> Result<()> { let tmp_dir = TempDir::new().unwrap(); diff --git a/ballista/core/src/execution_plans/restrict_file_scan.rs b/ballista/core/src/execution_plans/restrict_file_scan.rs index 4c7f61a4f1..8d79ded090 100644 --- a/ballista/core/src/execution_plans/restrict_file_scan.rs +++ b/ballista/core/src/execution_plans/restrict_file_scan.rs @@ -15,54 +15,59 @@ // specific language governing permissions and limitations // under the License. -//! Per-task `FileScanConfig` narrowing. +//! Disable DataFusion 54's cross-partition file work stealing on every +//! `FileScanConfig` in a plan tree. //! -//! DataFusion 54 added a shared work queue (`SharedWorkSource`) to -//! `FileScanConfig`: when any partition opens its stream, it pulls files from -//! a queue populated with every file in the scan. That model assumes every -//! partition runs together inside the same DataSourceExec instance so they -//! cooperatively drain the queue exactly once. Ballista breaks that -//! assumption — each task deserialises its own copy of the plan and runs a -//! single partition — so the partition that does run drains the whole queue -//! and ends up scanning every file. Concretely, a 6-file scan executed by 6 -//! tasks reads 36 files and returns six copies of the data. +//! DataFusion 54 added a `SharedWorkSource` to `FileScanConfig`: when any +//! partition opens its stream, it pulls files from a queue populated with +//! every file in the scan. That model assumes every partition of the same +//! `DataSourceExec` instance runs together and cooperatively drains the +//! queue exactly once. Ballista breaks the assumption — each task +//! 
deserialises its own copy of the plan and runs a single partition — so +//! the partition that does run drains the whole queue and ends up scanning +//! every file. A 6-file scan executed by 6 tasks reads 36 files and returns +//! six copies of the data. //! -//! [`restrict_file_scan_to_partition`] rewrites the plan tree just before -//! execution so that every `FileScanConfig` only contains files for the -//! partition being executed. The file group count (and therefore the -//! advertised partitioning) is preserved by replacing the other slots with -//! empty groups, so partition routing through the rest of the plan is -//! unaffected. +//! The fix is to pin every `FileScanConfig` to `preserve_order = true` +//! before execution. DataFusion's `FileScanConfig::create_sibling_state` +//! short-circuits to `None` when that flag is set, so no shared queue is +//! ever installed. Each partition then falls back to its own +//! `WorkSource::Local(file_groups[partition])` and scans exactly the files +//! the planner assigned to it. //! -//! See `ballista/client/tests/multi_file_scan.rs` for the end-to-end -//! regression that motivated this helper. +//! Notes: +//! * We can't just narrow `file_groups` per task, because broadcast hash +//! joins call `execute(0..K)` on the build-side `DataSourceExec` from +//! inside the join, so every partition slot must keep its files. TPC-H +//! Q11 hangs if you empty out the build-side slots — see +//! `ballista/client/tests/multi_file_scan.rs` for the simpler regression. +//! * `preserve_order = true` only disables file reordering at scan time; +//! it's already implicitly true whenever the config has an output +//! ordering, so the runtime path is well-exercised upstream. 
use std::sync::Arc; use datafusion::common::Result; use datafusion::common::tree_node::{Transformed, TreeNode}; -use datafusion::datasource::physical_plan::{ - FileGroup, FileScanConfig, FileScanConfigBuilder, -}; +use datafusion::datasource::physical_plan::{FileScanConfig, FileScanConfigBuilder}; use datafusion::datasource::source::DataSourceExec; use datafusion::physical_plan::ExecutionPlan; -/// Narrow every `FileScanConfig` in `plan` so that only `partition`'s file -/// group has files; all other slots become empty. +/// Rewrite every `FileScanConfig` in `plan` so its sibling work source is +/// suppressed, forcing each partition to scan only its own file group. /// -/// This keeps `file_groups.len()` (and therefore the advertised partition -/// count) unchanged, which means the rest of the plan can still route -/// `execute(partition)` calls through unmodified operators. The -/// `SharedWorkSource` that the file scan builds from `file_groups` ends up -/// containing only the relevant partition's files, so the active partition -/// reads exactly its assigned slice instead of draining the full set. +/// The `partition` argument is the index of the partition this task will +/// execute. It is currently unused — pinning `preserve_order = true` is +/// enough to disable work stealing for any partition — but kept in the +/// signature so callers can stay symmetric across writer types and so a +/// future per-task narrowing scheme can drop in without touching them. /// /// If the leaf is something other than a `FileScanConfig`-backed -/// `DataSourceExec`, the node is left alone — there's nothing for the shared -/// queue to mishandle. +/// `DataSourceExec`, or the config is single-partition or already has +/// `preserve_order` set, the node is returned unchanged. 
pub fn restrict_file_scan_to_partition( plan: Arc, - partition: usize, + _partition: usize, ) -> Result> { plan.transform_down(|node| { let Some(data_source_exec) = node.downcast_ref::() else { @@ -75,25 +80,15 @@ pub fn restrict_file_scan_to_partition( return Ok(Transformed::no(node)); }; - // Nothing to do for single-partition scans: the shared queue still - // matches what `execute(0)` would consume, so this is already - // correct. - if file_scan.file_groups.len() <= 1 { + // Single-partition scans don't trigger the work-stealing bug + // (there's nothing to steal from), and the flag is already set if + // the user opted into ordering preservation. + if file_scan.file_groups.len() <= 1 || file_scan.preserve_order { return Ok(Transformed::no(node)); } - let mut new_groups: Vec = - Vec::with_capacity(file_scan.file_groups.len()); - for (idx, group) in file_scan.file_groups.iter().enumerate() { - if idx == partition { - new_groups.push(group.clone()); - } else { - new_groups.push(FileGroup::new(Vec::new())); - } - } - let new_config = FileScanConfigBuilder::from(file_scan.clone()) - .with_file_groups(new_groups) + .with_preserve_order(true) .build(); let new_exec = DataSourceExec::from_data_source(new_config) as Arc; @@ -107,7 +102,7 @@ mod tests { use super::*; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::datasource::listing::PartitionedFile; - use datafusion::datasource::physical_plan::ParquetSource; + use datafusion::datasource::physical_plan::{FileGroup, ParquetSource}; use datafusion::execution::object_store::ObjectStoreUrl; use std::sync::Arc; @@ -132,57 +127,73 @@ mod tests { DataSourceExec::from_data_source(config) as Arc } - fn file_groups_of(plan: &Arc) -> Vec> { + fn file_scan(plan: &Arc) -> FileScanConfig { let exec = plan .downcast_ref::() .expect("DataSourceExec"); - let scan = exec - .data_source() + exec.data_source() .downcast_ref::() - .expect("FileScanConfig"); - scan.file_groups - .iter() - .map(|g| 
g.iter().map(|f| f.path().to_string()).collect()) - .collect() + .expect("FileScanConfig") + .clone() } #[test] - fn keeps_only_target_partition_files() { + fn sets_preserve_order_to_disable_work_stealing() { let plan = build_plan(4); + assert!( + !file_scan(&plan).preserve_order, + "test fixture should start with default preserve_order=false" + ); let restricted = restrict_file_scan_to_partition(plan, 2).unwrap(); - let groups = file_groups_of(&restricted); + let scan = file_scan(&restricted); + assert!( + scan.preserve_order, + "preserve_order must be set so create_sibling_state returns None and \ + the SharedWorkSource is never installed" + ); + } + + #[test] + fn keeps_all_files_in_their_original_groups() { + let plan = build_plan(3); + let restricted = restrict_file_scan_to_partition(plan, 1).unwrap(); + let scan = file_scan(&restricted); + let groups: Vec> = scan + .file_groups + .iter() + .map(|g| g.iter().map(|f| f.path().to_string()).collect()) + .collect(); assert_eq!( groups, vec![ - vec![] as Vec, - vec![], + vec!["f0.parquet".to_string()], + vec!["f1.parquet".to_string()], vec!["f2.parquet".to_string()], - vec![], ], - "only file_groups[2] should keep its file; the others must be empty so \ - the SharedWorkSource contains only this task's slice" + "every file_groups slot must keep its files so broadcast hash joins \ + can still iterate the full set on the build side" ); } + #[test] + fn single_partition_scan_is_left_alone() { + let plan = build_plan(1); + let restricted = restrict_file_scan_to_partition(Arc::clone(&plan), 0).unwrap(); + // Single-partition scans have nothing to steal; the transform skips + // them and returns the original Arc untouched. 
+ assert!(Arc::ptr_eq(&plan, &restricted)); + } + #[test] fn preserves_partition_count() { let plan = build_plan(3); let restricted = restrict_file_scan_to_partition(plan, 1).unwrap(); - let groups = file_groups_of(&restricted); + let scan = file_scan(&restricted); assert_eq!( - groups.len(), + scan.file_groups.len(), 3, "file_groups length must be preserved so DataSourceExec keeps its \ advertised partition count" ); } - - #[test] - fn single_partition_scan_is_left_alone() { - let plan = build_plan(1); - let restricted = restrict_file_scan_to_partition(Arc::clone(&plan), 0).unwrap(); - // The transform should detect there's nothing to narrow and return - // the original Arc untouched (it's a no-op in that case). - assert!(Arc::ptr_eq(&plan, &restricted)); - } } From d07b0858268051eb3c80b7c14f070e29815cf10c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 27 May 2026 07:15:21 -0600 Subject: [PATCH 6/6] fix(scheduler): treat ScalarSubqueryExec as an opaque stage barrier TPC-H Q11 hung in CI for the full 6h timeout. The scheduler log showed the executor rejecting the stage with DataFusion error: Internal error: ScalarSubqueryExpr can only be deserialized as part of a surrounding ScalarSubqueryExec. DataFusion 54 introduced ScalarSubqueryExec to hold the ScalarSubqueryResults that ScalarSubqueryExpr nodes reference. The proto codec only installs that results context while it decodes the input of a ScalarSubqueryExec, so any stage whose plan contains a bare ScalarSubqueryExpr without the surrounding wrapper fails to deserialise. Ballista's planner was descending into ScalarSubqueryExec children and splitting them at the usual RepartitionExec / SortPreservingMergeExec / CoalescePartitionsExec boundaries, which left the FilterExec carrying the ScalarSubqueryExpr in its own stage and the ScalarSubqueryExec wrapper somewhere else. 
The executor then rejected the bad plan with InvalidArgument; the scheduler marked the executor as lost and rebroadcast the stage forever, producing the 6h hang. Treat ScalarSubqueryExec as an opaque subtree in plan_query_stages_internal. The wrapping operator stays in the same stage as everything it owns (main input + each subquery branch); any RepartitionExec or SortPreservingMergeExec inside runs in-process under that single distributed stage. Loses pipeline parallelism for subquery-bearing queries, but is correct and fixes the hang. Verified locally against TPC-H SF1 (q1-q22, skipping q16) through a standalone cluster; all 21 queries pass in under a second each. --- ballista/scheduler/src/planner.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ballista/scheduler/src/planner.rs b/ballista/scheduler/src/planner.rs index cdf81e1bf6..182c90c60f 100644 --- a/ballista/scheduler/src/planner.rs +++ b/ballista/scheduler/src/planner.rs @@ -38,6 +38,7 @@ use datafusion::physical_optimizer::enforce_sorting::EnforceSorting; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode}; use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::scalar_subquery::ScalarSubqueryExec; use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion::physical_plan::{ ExecutionPlan, Partitioning, with_new_children_if_necessary, @@ -132,6 +133,23 @@ impl DefaultDistributedPlanner { // Apply broadcast-join promotion before recursing. 
let execution_plan = Self::maybe_promote_to_broadcast(execution_plan, config)?; + // ScalarSubqueryExec must travel with its embedded ScalarSubqueryExpr + // nodes in the same serialized plan, otherwise the executor cannot + // deserialise them: the proto codec only installs the + // ScalarSubqueryResults context while it decodes the input under a + // surrounding ScalarSubqueryExec, and a bare ScalarSubqueryExpr + // returns "ScalarSubqueryExpr can only be deserialized as part of a + // surrounding ScalarSubqueryExec". Treat the whole subtree (main + // input + subqueries) as opaque so it stays inside one Ballista + // stage; any internal RepartitionExec / SortPreservingMergeExec runs + // in-process under the wrapping ScalarSubqueryExec instead of being + // hoisted into separate distributed stages. TPC-H Q11 hits this and + // would otherwise hang forever as the executor rejects the bad plan + // and the scheduler keeps retrying. + if execution_plan.is::() { + return Ok((execution_plan, vec![])); + } + // recurse down and replace children if execution_plan.children().is_empty() { return Ok((execution_plan, vec![]));